/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009 Josh Coalson
 * Copyright (C) 2011-2023 Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if defined FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN
#include "private/lpc.h"
#include "FLAC/assert.h"
#include "FLAC/format.h"
#include "private/macros.h"
#include <arm_neon.h>

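/*
 * Autocorrelation, AArch64 only: each wrapper below fixes MAX_LAG and then
 * includes the shared NEON loop body from deduplication/, so the three
 * functions differ only in how many lags they compute (14, 10 or 8).
 */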
#if FLAC__HAS_A64NEONINTRIN
void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
#undef MAX_LAG
#define MAX_LAG 14
#include "deduplication/lpc_compute_autocorrelation_intrin_neon.c"
}

void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
#undef MAX_LAG
#define MAX_LAG 10
#include "deduplication/lpc_compute_autocorrelation_intrin_neon.c"
}

void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
#undef MAX_LAG
#define MAX_LAG 8
#include "deduplication/lpc_compute_autocorrelation_intrin_neon.c"
}

#endif /* ifdef FLAC__HAS_A64NEONINTRIN */


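/*
 * Helper macros for the 32-bit residual kernel below. Each loop iteration
 * produces 12 residuals in three int32x4_t accumulators (summ_0..summ_2).
 * MUL_32_BIT_LOOP_UNROOL_3 initializes the three accumulators from one
 * coefficient lane; MACC_32BIT_LOOP_UNROOL_3 multiply-accumulates the next
 * history vectors (tmp_vec[ind], tmp_vec[ind+4], tmp_vec[ind+8]) by another
 * coefficient lane.
 */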
#define MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_vec, lane) \
	summ_0 = vmulq_laneq_s32(tmp_vec[0], qlp_coeff_vec, lane); \
	summ_1 = vmulq_laneq_s32(tmp_vec[4], qlp_coeff_vec, lane); \
	summ_2 = vmulq_laneq_s32(tmp_vec[8], qlp_coeff_vec, lane);


#define MACC_32BIT_LOOP_UNROOL_3(tmp_vec_ind, qlp_coeff_vec, lane) \
	summ_0 = vmlaq_laneq_s32(summ_0, tmp_vec[tmp_vec_ind], qlp_coeff_vec, lane); \
	summ_1 = vmlaq_laneq_s32(summ_1, tmp_vec[tmp_vec_ind+4], qlp_coeff_vec, lane); \
	summ_2 = vmlaq_laneq_s32(summ_2, tmp_vec[tmp_vec_ind+8], qlp_coeff_vec, lane);

void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	int32x4_t tmp_vec[20];

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

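	/*
	 * Strategy: for order <= 12 the prediction is vectorized, computing 12
	 * residuals per iteration with the macros above and leaving any remaining
	 * samples (and the whole job when data_len < 12) to the scalar loop at the
	 * end. Orders above 12 use the plain scalar filter throughout.
	 */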
	// Reading before data[0] (prologue reads) is valid: the encoder calls this as
	// encoder->private_->local_lpc_compute_residual_from_qlp_coefficients(signal+order,....),
	// so at least `order` samples of history precede data[0].
	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if (order == 12) {
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
					int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
					int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], qlp_coeff[11]};

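					/*
					 * tmp_vec[k] caches data[i+k-12 .. i+k-9]. After all twelve
					 * multiply-accumulates, lane l of summ_0 holds the prediction
					 * for data[i+l], summ_1 for data[i+4+l] and summ_2 for data[i+8+l].
					 */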
					tmp_vec[0] = vld1q_s32(data - 12);
					tmp_vec[1] = vld1q_s32(data - 11);
					tmp_vec[2] = vld1q_s32(data - 10);
					tmp_vec[3] = vld1q_s32(data - 9);
					tmp_vec[4] = vld1q_s32(data - 8);
					tmp_vec[5] = vld1q_s32(data - 7);
					tmp_vec[6] = vld1q_s32(data - 6);
					tmp_vec[7] = vld1q_s32(data - 5);

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int32x4_t summ_0, summ_1, summ_2;

						tmp_vec[8] = vld1q_s32(data + i - 4);
						tmp_vec[9] = vld1q_s32(data + i - 3);
						tmp_vec[10] = vld1q_s32(data + i - 2);
						tmp_vec[11] = vld1q_s32(data + i - 1);
						tmp_vec[12] = vld1q_s32(data + i);
						tmp_vec[13] = vld1q_s32(data + i + 1);
						tmp_vec[14] = vld1q_s32(data + i + 2);
						tmp_vec[15] = vld1q_s32(data + i + 3);
						tmp_vec[16] = vld1q_s32(data + i + 4);
						tmp_vec[17] = vld1q_s32(data + i + 5);
						tmp_vec[18] = vld1q_s32(data + i + 6);
						tmp_vec[19] = vld1q_s32(data + i + 7);

						MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_2, 3)
						MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 2)
						MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_2, 1)
						MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_2, 0)
						MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 3)
						MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 2)
						MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_1, 1)
						MACC_32BIT_LOOP_UNROOL_3(7, qlp_coeff_1, 0)
						MACC_32BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 3)
						MACC_32BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 2)
						MACC_32BIT_LOOP_UNROOL_3(10, qlp_coeff_0, 1)
						MACC_32BIT_LOOP_UNROOL_3(11, qlp_coeff_0, 0)

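						/* residual = data - (prediction >> lp_quantization); vshlq_s32 with a
						 * negative shift count performs the arithmetic right shift. */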
						vst1q_s32(residual + i + 0, vsubq_s32(vld1q_s32(data + i + 0), vshlq_s32(summ_0, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 4, vsubq_s32(vld1q_s32(data + i + 4), vshlq_s32(summ_1, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 8, vsubq_s32(vld1q_s32(data + i + 8), vshlq_s32(summ_2, vdupq_n_s32(-lp_quantization))));

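						/* Slide the history window: the last eight vectors loaded this
						 * iteration are the first eight of the next one. */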
						tmp_vec[0] = tmp_vec[12];
						tmp_vec[1] = tmp_vec[13];
						tmp_vec[2] = tmp_vec[14];
						tmp_vec[3] = tmp_vec[15];
						tmp_vec[4] = tmp_vec[16];
						tmp_vec[5] = tmp_vec[17];
						tmp_vec[6] = tmp_vec[18];
						tmp_vec[7] = tmp_vec[19];
					}
				}

				else { /* order == 11 */
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
					int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
					int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], 0};

					tmp_vec[0] = vld1q_s32(data - 11);
					tmp_vec[1] = vld1q_s32(data - 10);
					tmp_vec[2] = vld1q_s32(data - 9);
					tmp_vec[3] = vld1q_s32(data - 8);
					tmp_vec[4] = vld1q_s32(data - 7);
					tmp_vec[5] = vld1q_s32(data - 6);
					tmp_vec[6] = vld1q_s32(data - 5);

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int32x4_t summ_0, summ_1, summ_2;
						tmp_vec[7] = vld1q_s32(data + i - 4);
						tmp_vec[8] = vld1q_s32(data + i - 3);
						tmp_vec[9] = vld1q_s32(data + i - 2);
						tmp_vec[10] = vld1q_s32(data + i - 1);
						tmp_vec[11] = vld1q_s32(data + i - 0);
						tmp_vec[12] = vld1q_s32(data + i + 1);
						tmp_vec[13] = vld1q_s32(data + i + 2);
						tmp_vec[14] = vld1q_s32(data + i + 3);
						tmp_vec[15] = vld1q_s32(data + i + 4);
						tmp_vec[16] = vld1q_s32(data + i + 5);
						tmp_vec[17] = vld1q_s32(data + i + 6);
						tmp_vec[18] = vld1q_s32(data + i + 7);


						MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_2, 2)
						MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 1)
						MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_2, 0)
						MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 3)
						MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 2)
						MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 1)
						MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_1, 0)
						MACC_32BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 3)
						MACC_32BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 2)
						MACC_32BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 1)
						MACC_32BIT_LOOP_UNROOL_3(10, qlp_coeff_0, 0)

						vst1q_s32(residual + i + 0, vsubq_s32(vld1q_s32(data + i + 0), vshlq_s32(summ_0, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 4, vsubq_s32(vld1q_s32(data + i + 4), vshlq_s32(summ_1, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 8, vsubq_s32(vld1q_s32(data + i + 8), vshlq_s32(summ_2, vdupq_n_s32(-lp_quantization))));


						tmp_vec[0] = tmp_vec[12];
						tmp_vec[1] = tmp_vec[13];
						tmp_vec[2] = tmp_vec[14];
						tmp_vec[3] = tmp_vec[15];
						tmp_vec[4] = tmp_vec[16];
						tmp_vec[5] = tmp_vec[17];
						tmp_vec[6] = tmp_vec[18];
					}
				}
			}
			else {
				if(order == 10) {
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
					int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
					int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], 0, 0};

					tmp_vec[0] = vld1q_s32(data - 10);
					tmp_vec[1] = vld1q_s32(data - 9);
					tmp_vec[2] = vld1q_s32(data - 8);
					tmp_vec[3] = vld1q_s32(data - 7);
					tmp_vec[4] = vld1q_s32(data - 6);
					tmp_vec[5] = vld1q_s32(data - 5);

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int32x4_t summ_0, summ_1, summ_2;
						tmp_vec[6] = vld1q_s32(data + i - 4);
						tmp_vec[7] = vld1q_s32(data + i - 3);
						tmp_vec[8] = vld1q_s32(data + i - 2);
						tmp_vec[9] = vld1q_s32(data + i - 1);
						tmp_vec[10] = vld1q_s32(data + i - 0);
						tmp_vec[11] = vld1q_s32(data + i + 1);
						tmp_vec[12] = vld1q_s32(data + i + 2);
						tmp_vec[13] = vld1q_s32(data + i + 3);
						tmp_vec[14] = vld1q_s32(data + i + 4);
						tmp_vec[15] = vld1q_s32(data + i + 5);
						tmp_vec[16] = vld1q_s32(data + i + 6);
						tmp_vec[17] = vld1q_s32(data + i + 7);


						MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_2, 1)
						MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 0)
						MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 3)
						MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 2)
						MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 1)
						MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 0)
						MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 3)
						MACC_32BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 2)
						MACC_32BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 1)
						MACC_32BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 0)

						vst1q_s32(residual + i + 0, vsubq_s32(vld1q_s32(data + i + 0), vshlq_s32(summ_0, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 4, vsubq_s32(vld1q_s32(data + i + 4), vshlq_s32(summ_1, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 8, vsubq_s32(vld1q_s32(data + i + 8), vshlq_s32(summ_2, vdupq_n_s32(-lp_quantization))));


						tmp_vec[0] = tmp_vec[12];
						tmp_vec[1] = tmp_vec[13];
						tmp_vec[2] = tmp_vec[14];
						tmp_vec[3] = tmp_vec[15];
						tmp_vec[4] = tmp_vec[16];
						tmp_vec[5] = tmp_vec[17];
					}
				}
				else { /* order == 9 */
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
					int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
					int32x4_t qlp_coeff_2 = {qlp_coeff[8], 0, 0, 0};

					tmp_vec[0] = vld1q_s32(data - 9);
					tmp_vec[1] = vld1q_s32(data - 8);
					tmp_vec[2] = vld1q_s32(data - 7);
					tmp_vec[3] = vld1q_s32(data - 6);
					tmp_vec[4] = vld1q_s32(data - 5);

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int32x4_t summ_0, summ_1, summ_2;
						tmp_vec[5] = vld1q_s32(data + i - 4);
						tmp_vec[6] = vld1q_s32(data + i - 3);
						tmp_vec[7] = vld1q_s32(data + i - 2);
						tmp_vec[8] = vld1q_s32(data + i - 1);
						tmp_vec[9] = vld1q_s32(data + i - 0);
						tmp_vec[10] = vld1q_s32(data + i + 1);
						tmp_vec[11] = vld1q_s32(data + i + 2);
						tmp_vec[12] = vld1q_s32(data + i + 3);
						tmp_vec[13] = vld1q_s32(data + i + 4);
						tmp_vec[14] = vld1q_s32(data + i + 5);
						tmp_vec[15] = vld1q_s32(data + i + 6);
						tmp_vec[16] = vld1q_s32(data + i + 7);

						MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_2, 0)
						MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 3)
						MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 2)
						MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 1)
						MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 0)
						MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 3)
						MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 2)
						MACC_32BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 1)
						MACC_32BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 0)

						vst1q_s32(residual + i + 0, vsubq_s32(vld1q_s32(data + i + 0), vshlq_s32(summ_0, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 4, vsubq_s32(vld1q_s32(data + i + 4), vshlq_s32(summ_1, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 8, vsubq_s32(vld1q_s32(data + i + 8), vshlq_s32(summ_2, vdupq_n_s32(-lp_quantization))));

						tmp_vec[0] = tmp_vec[12];
						tmp_vec[1] = tmp_vec[13];
						tmp_vec[2] = tmp_vec[14];
						tmp_vec[3] = tmp_vec[15];
						tmp_vec[4] = tmp_vec[16];
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
					int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};

					tmp_vec[0] = vld1q_s32(data - 8);
					tmp_vec[1] = vld1q_s32(data - 7);
					tmp_vec[2] = vld1q_s32(data - 6);
					tmp_vec[3] = vld1q_s32(data - 5);

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int32x4_t summ_0, summ_1, summ_2;
						tmp_vec[4] = vld1q_s32(data + i - 4);
						tmp_vec[5] = vld1q_s32(data + i - 3);
						tmp_vec[6] = vld1q_s32(data + i - 2);
						tmp_vec[7] = vld1q_s32(data + i - 1);
						tmp_vec[8] = vld1q_s32(data + i - 0);
						tmp_vec[9] = vld1q_s32(data + i + 1);
						tmp_vec[10] = vld1q_s32(data + i + 2);
						tmp_vec[11] = vld1q_s32(data + i + 3);
						tmp_vec[12] = vld1q_s32(data + i + 4);
						tmp_vec[13] = vld1q_s32(data + i + 5);
						tmp_vec[14] = vld1q_s32(data + i + 6);
						tmp_vec[15] = vld1q_s32(data + i + 7);

						MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_1, 3)
						MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 2)
						MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 1)
						MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 0)
						MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 3)
						MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 2)
						MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 1)
						MACC_32BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 0)

						vst1q_s32(residual + i + 0, vsubq_s32(vld1q_s32(data + i + 0), vshlq_s32(summ_0, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 4, vsubq_s32(vld1q_s32(data + i + 4), vshlq_s32(summ_1, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 8, vsubq_s32(vld1q_s32(data + i + 8), vshlq_s32(summ_2, vdupq_n_s32(-lp_quantization))));

						tmp_vec[0] = tmp_vec[12];
						tmp_vec[1] = tmp_vec[13];
						tmp_vec[2] = tmp_vec[14];
						tmp_vec[3] = tmp_vec[15];
					}
				}
				else { /* order == 7 */
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
					int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0};

					tmp_vec[0] = vld1q_s32(data - 7);
					tmp_vec[1] = vld1q_s32(data - 6);
					tmp_vec[2] = vld1q_s32(data - 5);

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int32x4_t summ_0, summ_1, summ_2;
						tmp_vec[3] = vld1q_s32(data + i - 4);
						tmp_vec[4] = vld1q_s32(data + i - 3);
						tmp_vec[5] = vld1q_s32(data + i - 2);
						tmp_vec[6] = vld1q_s32(data + i - 1);
						tmp_vec[7] = vld1q_s32(data + i - 0);
						tmp_vec[8] = vld1q_s32(data + i + 1);
						tmp_vec[9] = vld1q_s32(data + i + 2);
						tmp_vec[10] = vld1q_s32(data + i + 3);
						tmp_vec[11] = vld1q_s32(data + i + 4);
						tmp_vec[12] = vld1q_s32(data + i + 5);
						tmp_vec[13] = vld1q_s32(data + i + 6);
						tmp_vec[14] = vld1q_s32(data + i + 7);

						MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_1, 2)
						MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 1)
						MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 0)
						MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 3)
						MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 2)
						MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 1)
						MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 0)

						vst1q_s32(residual + i + 0, vsubq_s32(vld1q_s32(data + i + 0), vshlq_s32(summ_0, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 4, vsubq_s32(vld1q_s32(data + i + 4), vshlq_s32(summ_1, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 8, vsubq_s32(vld1q_s32(data + i + 8), vshlq_s32(summ_2, vdupq_n_s32(-lp_quantization))));

						tmp_vec[0] = tmp_vec[12];
						tmp_vec[1] = tmp_vec[13];
						tmp_vec[2] = tmp_vec[14];
					}
				}
			}
			else {
				if(order == 6) {
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
					int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], 0, 0};

					tmp_vec[0] = vld1q_s32(data - 6);
					tmp_vec[1] = vld1q_s32(data - 5);

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int32x4_t summ_0, summ_1, summ_2;
						tmp_vec[2] = vld1q_s32(data + i - 4);
						tmp_vec[3] = vld1q_s32(data + i - 3);
						tmp_vec[4] = vld1q_s32(data + i - 2);
						tmp_vec[5] = vld1q_s32(data + i - 1);
						tmp_vec[6] = vld1q_s32(data + i - 0);
						tmp_vec[7] = vld1q_s32(data + i + 1);
						tmp_vec[8] = vld1q_s32(data + i + 2);
						tmp_vec[9] = vld1q_s32(data + i + 3);
						tmp_vec[10] = vld1q_s32(data + i + 4);
						tmp_vec[11] = vld1q_s32(data + i + 5);
						tmp_vec[12] = vld1q_s32(data + i + 6);
						tmp_vec[13] = vld1q_s32(data + i + 7);

						MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_1, 1)
						MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 0)
						MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 3)
						MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 2)
						MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 1)
						MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 0)

						vst1q_s32(residual + i + 0, vsubq_s32(vld1q_s32(data + i + 0), vshlq_s32(summ_0, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 4, vsubq_s32(vld1q_s32(data + i + 4), vshlq_s32(summ_1, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 8, vsubq_s32(vld1q_s32(data + i + 8), vshlq_s32(summ_2, vdupq_n_s32(-lp_quantization))));

						tmp_vec[0] = tmp_vec[12];
						tmp_vec[1] = tmp_vec[13];
					}
				}
				else { /* order == 5 */
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
					int32x4_t qlp_coeff_1 = {qlp_coeff[4], 0, 0, 0};

					tmp_vec[0] = vld1q_s32(data - 5);

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int32x4_t summ_0, summ_1, summ_2;

						tmp_vec[1] = vld1q_s32(data + i - 4);
						tmp_vec[2] = vld1q_s32(data + i - 3);
						tmp_vec[3] = vld1q_s32(data + i - 2);
						tmp_vec[4] = vld1q_s32(data + i - 1);
						tmp_vec[5] = vld1q_s32(data + i - 0);
						tmp_vec[6] = vld1q_s32(data + i + 1);
						tmp_vec[7] = vld1q_s32(data + i + 2);
						tmp_vec[8] = vld1q_s32(data + i + 3);
						tmp_vec[9] = vld1q_s32(data + i + 4);
						tmp_vec[10] = vld1q_s32(data + i + 5);
						tmp_vec[11] = vld1q_s32(data + i + 6);
						tmp_vec[12] = vld1q_s32(data + i + 7);

						MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_1, 0)
						MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 3)
						MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 2)
						MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 1)
						MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 0)

						vst1q_s32(residual + i + 0, vsubq_s32(vld1q_s32(data + i + 0), vshlq_s32(summ_0, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 4, vsubq_s32(vld1q_s32(data + i + 4), vshlq_s32(summ_1, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 8, vsubq_s32(vld1q_s32(data + i + 8), vshlq_s32(summ_2, vdupq_n_s32(-lp_quantization))));

						tmp_vec[0] = tmp_vec[12];
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int32x4_t summ_0, summ_1, summ_2;
						tmp_vec[0] = vld1q_s32(data + i - 4);
						tmp_vec[1] = vld1q_s32(data + i - 3);
						tmp_vec[2] = vld1q_s32(data + i - 2);
						tmp_vec[3] = vld1q_s32(data + i - 1);
						tmp_vec[4] = vld1q_s32(data + i - 0);
						tmp_vec[5] = vld1q_s32(data + i + 1);
						tmp_vec[6] = vld1q_s32(data + i + 2);
						tmp_vec[7] = vld1q_s32(data + i + 3);
						tmp_vec[8] = vld1q_s32(data + i + 4);
						tmp_vec[9] = vld1q_s32(data + i + 5);
						tmp_vec[10] = vld1q_s32(data + i + 6);
						tmp_vec[11] = vld1q_s32(data + i + 7);

						MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_0, 3)
						MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 2)
						MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 1)
						MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 0)

						vst1q_s32(residual + i + 0, vsubq_s32(vld1q_s32(data + i + 0), vshlq_s32(summ_0, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 4, vsubq_s32(vld1q_s32(data + i + 4), vshlq_s32(summ_1, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 8, vsubq_s32(vld1q_s32(data + i + 8), vshlq_s32(summ_2, vdupq_n_s32(-lp_quantization))));
					}
				}
				else { /* order == 3 */
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0};

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int32x4_t summ_0, summ_1, summ_2;
						tmp_vec[0] = vld1q_s32(data + i - 3);
						tmp_vec[1] = vld1q_s32(data + i - 2);
						tmp_vec[2] = vld1q_s32(data + i - 1);
						tmp_vec[4] = vld1q_s32(data + i + 1);
						tmp_vec[5] = vld1q_s32(data + i + 2);
						tmp_vec[6] = vld1q_s32(data + i + 3);
						tmp_vec[8] = vld1q_s32(data + i + 5);
						tmp_vec[9] = vld1q_s32(data + i + 6);
						tmp_vec[10] = vld1q_s32(data + i + 7);

						MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_0, 2)
						MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 1)
						MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 0)

						vst1q_s32(residual + i + 0, vsubq_s32(vld1q_s32(data + i + 0), vshlq_s32(summ_0, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 4, vsubq_s32(vld1q_s32(data + i + 4), vshlq_s32(summ_1, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 8, vsubq_s32(vld1q_s32(data + i + 8), vshlq_s32(summ_2, vdupq_n_s32(-lp_quantization))));
					}
				}
			}
			else {
				if(order == 2) {
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], 0, 0};

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int32x4_t summ_0, summ_1, summ_2;
						tmp_vec[0] = vld1q_s32(data + i - 2);
						tmp_vec[1] = vld1q_s32(data + i - 1);
						tmp_vec[4] = vld1q_s32(data + i + 2);
						tmp_vec[5] = vld1q_s32(data + i + 3);
						tmp_vec[8] = vld1q_s32(data + i + 6);
						tmp_vec[9] = vld1q_s32(data + i + 7);

						MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_0, 1)
						MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 0)

						vst1q_s32(residual + i + 0, vsubq_s32(vld1q_s32(data + i + 0), vshlq_s32(summ_0, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 4, vsubq_s32(vld1q_s32(data + i + 4), vshlq_s32(summ_1, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 8, vsubq_s32(vld1q_s32(data + i + 8), vshlq_s32(summ_2, vdupq_n_s32(-lp_quantization))));
					}
				}
				else { /* order == 1 */
					int32x4_t qlp_coeff_0 = vdupq_n_s32(qlp_coeff[0]);

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int32x4_t summ_0, summ_1, summ_2;
						tmp_vec[0] = vld1q_s32(data + i - 1);
						tmp_vec[4] = vld1q_s32(data + i + 3);
						tmp_vec[8] = vld1q_s32(data + i + 7);

						summ_0 = vmulq_s32(tmp_vec[0], qlp_coeff_0);
						summ_1 = vmulq_s32(tmp_vec[4], qlp_coeff_0);
						summ_2 = vmulq_s32(tmp_vec[8], qlp_coeff_0);

						vst1q_s32(residual + i + 0, vsubq_s32(vld1q_s32(data + i + 0), vshlq_s32(summ_0, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 4, vsubq_s32(vld1q_s32(data + i + 4), vshlq_s32(summ_1, vdupq_n_s32(-lp_quantization))));
						vst1q_s32(residual + i + 8, vsubq_s32(vld1q_s32(data + i + 8), vshlq_s32(summ_2, vdupq_n_s32(-lp_quantization))));
					}
				}
			}
		}
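		/* Handle the remaining data_len % 12 samples (and everything when
		 * data_len < 12) with the scalar filter. */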
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12]; /* Falls through. */
				case 11: sum += qlp_coeff[10] * data[i-11]; /* Falls through. */
				case 10: sum += qlp_coeff[ 9] * data[i-10]; /* Falls through. */
				case 9:  sum += qlp_coeff[ 8] * data[i- 9]; /* Falls through. */
				case 8:  sum += qlp_coeff[ 7] * data[i- 8]; /* Falls through. */
				case 7:  sum += qlp_coeff[ 6] * data[i- 7]; /* Falls through. */
				case 6:  sum += qlp_coeff[ 5] * data[i- 6]; /* Falls through. */
				case 5:  sum += qlp_coeff[ 4] * data[i- 5]; /* Falls through. */
				case 4:  sum += qlp_coeff[ 3] * data[i- 4]; /* Falls through. */
				case 3:  sum += qlp_coeff[ 2] * data[i- 3]; /* Falls through. */
				case 2:  sum += qlp_coeff[ 1] * data[i- 2]; /* Falls through. */
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
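		/* Orders above 12 are not vectorized in this routine; the whole block
		 * uses the generic scalar filter. */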
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
				case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
				case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
				case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
				case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
				case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
				case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
				case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
				case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
				case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
				case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
				case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
				case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
				case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
				case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
				case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
				case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
				case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
				case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}



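/*
 * 64-bit variants of the helper macros, used when the prediction can overflow
 * 32 bits. Each group of four samples is accumulated in a low/high int64x2_t
 * pair; vmull_laneq_s32 / vmlal_laneq_s32 widen the 32-bit samples while
 * multiplying by one coefficient lane.
 */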
#define MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_vec, lane) \
	summ_l_0 = vmull_laneq_s32(vget_low_s32(tmp_vec[0]), qlp_coeff_vec, lane); \
	summ_h_0 = vmull_high_laneq_s32(tmp_vec[0], qlp_coeff_vec, lane); \
	summ_l_1 = vmull_laneq_s32(vget_low_s32(tmp_vec[4]), qlp_coeff_vec, lane); \
	summ_h_1 = vmull_high_laneq_s32(tmp_vec[4], qlp_coeff_vec, lane); \
	summ_l_2 = vmull_laneq_s32(vget_low_s32(tmp_vec[8]), qlp_coeff_vec, lane); \
	summ_h_2 = vmull_high_laneq_s32(tmp_vec[8], qlp_coeff_vec, lane);


#define MACC_64_BIT_LOOP_UNROOL_3(tmp_vec_ind, qlp_coeff_vec, lane) \
	summ_l_0 = vmlal_laneq_s32(summ_l_0, vget_low_s32(tmp_vec[tmp_vec_ind]), qlp_coeff_vec, lane); \
	summ_h_0 = vmlal_high_laneq_s32(summ_h_0, tmp_vec[tmp_vec_ind], qlp_coeff_vec, lane); \
	summ_l_1 = vmlal_laneq_s32(summ_l_1, vget_low_s32(tmp_vec[tmp_vec_ind+4]), qlp_coeff_vec, lane); \
	summ_h_1 = vmlal_high_laneq_s32(summ_h_1, tmp_vec[tmp_vec_ind+4], qlp_coeff_vec, lane); \
	summ_l_2 = vmlal_laneq_s32(summ_l_2, vget_low_s32(tmp_vec[tmp_vec_ind+8]), qlp_coeff_vec, lane); \
	summ_h_2 = vmlal_high_laneq_s32(summ_h_2, tmp_vec[tmp_vec_ind+8], qlp_coeff_vec, lane);

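/*
 * Shift the six 64-bit partial sums right by lp_quantization (vshlq_s64 with a
 * negative count), keep the low 32 bits of each lane via vuzp1q_s32, and store
 * residual[i..i+11] = data[i..i+11] - prediction.
 */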
#define SHIFT_SUMS_64BITS_AND_STORE_SUB() \
	res0 = vuzp1q_s32(vreinterpretq_s32_s64(vshlq_s64(summ_l_0, lp_quantization_vec)), vreinterpretq_s32_s64(vshlq_s64(summ_h_0, lp_quantization_vec))); \
	res1 = vuzp1q_s32(vreinterpretq_s32_s64(vshlq_s64(summ_l_1, lp_quantization_vec)), vreinterpretq_s32_s64(vshlq_s64(summ_h_1, lp_quantization_vec))); \
	res2 = vuzp1q_s32(vreinterpretq_s32_s64(vshlq_s64(summ_l_2, lp_quantization_vec)), vreinterpretq_s32_s64(vshlq_s64(summ_h_2, lp_quantization_vec))); \
	vst1q_s32(residual+i+0, vsubq_s32(vld1q_s32(data+i+0), res0)); \
	vst1q_s32(residual+i+4, vsubq_s32(vld1q_s32(data+i+4), res1)); \
	vst1q_s32(residual+i+8, vsubq_s32(vld1q_s32(data+i+8), res2));

void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]) {
	int i;
	FLAC__int64 sum;

	int32x4_t tmp_vec[20];
	int32x4_t res0, res1, res2;
	int64x2_t lp_quantization_vec = vdupq_n_s64(-lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

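	/*
	 * Same structure as the 32-bit routine above, but the per-sample sums are
	 * accumulated in 64 bits before being shifted and narrowed, which keeps
	 * the prediction exact when it can exceed 32 bits.
	 */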
	// Reading before data[0] (prologue reads) is valid: the encoder calls this as
	// encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit(signal+order,....),
	// so at least `order` samples of history precede data[0].
	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
					int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
					int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], qlp_coeff[11]};

					tmp_vec[0] = vld1q_s32(data - 12);
					tmp_vec[1] = vld1q_s32(data - 11);
					tmp_vec[2] = vld1q_s32(data - 10);
					tmp_vec[3] = vld1q_s32(data - 9);
					tmp_vec[4] = vld1q_s32(data - 8);
					tmp_vec[5] = vld1q_s32(data - 7);
					tmp_vec[6] = vld1q_s32(data - 6);
					tmp_vec[7] = vld1q_s32(data - 5);

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;

						tmp_vec[8] = vld1q_s32(data + i - 4);
						tmp_vec[9] = vld1q_s32(data + i - 3);
						tmp_vec[10] = vld1q_s32(data + i - 2);
						tmp_vec[11] = vld1q_s32(data + i - 1);
						tmp_vec[12] = vld1q_s32(data + i);
						tmp_vec[13] = vld1q_s32(data + i + 1);
						tmp_vec[14] = vld1q_s32(data + i + 2);
						tmp_vec[15] = vld1q_s32(data + i + 3);
						tmp_vec[16] = vld1q_s32(data + i + 4);
						tmp_vec[17] = vld1q_s32(data + i + 5);
						tmp_vec[18] = vld1q_s32(data + i + 6);
						tmp_vec[19] = vld1q_s32(data + i + 7);

						MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_2, 3)
						MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 2)
						MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_2, 1)
						MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_2, 0)
						MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 3)
						MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 2)
						MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_1, 1)
						MACC_64_BIT_LOOP_UNROOL_3(7, qlp_coeff_1, 0)
						MACC_64_BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 3)
						MACC_64_BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 2)
						MACC_64_BIT_LOOP_UNROOL_3(10, qlp_coeff_0, 1)
						MACC_64_BIT_LOOP_UNROOL_3(11, qlp_coeff_0, 0)

						SHIFT_SUMS_64BITS_AND_STORE_SUB()

						tmp_vec[0] = tmp_vec[12];
						tmp_vec[1] = tmp_vec[13];
						tmp_vec[2] = tmp_vec[14];
						tmp_vec[3] = tmp_vec[15];
						tmp_vec[4] = tmp_vec[16];
						tmp_vec[5] = tmp_vec[17];
						tmp_vec[6] = tmp_vec[18];
						tmp_vec[7] = tmp_vec[19];
					}
				}
				else { /* order == 11 */
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
					int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
					int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], 0};

					tmp_vec[0] = vld1q_s32(data - 11);
					tmp_vec[1] = vld1q_s32(data - 10);
					tmp_vec[2] = vld1q_s32(data - 9);
					tmp_vec[3] = vld1q_s32(data - 8);
					tmp_vec[4] = vld1q_s32(data - 7);
					tmp_vec[5] = vld1q_s32(data - 6);
					tmp_vec[6] = vld1q_s32(data - 5);

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;

						tmp_vec[7] = vld1q_s32(data + i - 4);
						tmp_vec[8] = vld1q_s32(data + i - 3);
						tmp_vec[9] = vld1q_s32(data + i - 2);
						tmp_vec[10] = vld1q_s32(data + i - 1);
						tmp_vec[11] = vld1q_s32(data + i);
						tmp_vec[12] = vld1q_s32(data + i + 1);
						tmp_vec[13] = vld1q_s32(data + i + 2);
						tmp_vec[14] = vld1q_s32(data + i + 3);
						tmp_vec[15] = vld1q_s32(data + i + 4);
						tmp_vec[16] = vld1q_s32(data + i + 5);
						tmp_vec[17] = vld1q_s32(data + i + 6);
						tmp_vec[18] = vld1q_s32(data + i + 7);

						MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_2, 2)
						MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 1)
						MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_2, 0)
						MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 3)
						MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 2)
						MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 1)
						MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_1, 0)
						MACC_64_BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 3)
						MACC_64_BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 2)
						MACC_64_BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 1)
						MACC_64_BIT_LOOP_UNROOL_3(10, qlp_coeff_0, 0)

						SHIFT_SUMS_64BITS_AND_STORE_SUB()

						tmp_vec[0] = tmp_vec[12];
						tmp_vec[1] = tmp_vec[13];
						tmp_vec[2] = tmp_vec[14];
						tmp_vec[3] = tmp_vec[15];
						tmp_vec[4] = tmp_vec[16];
						tmp_vec[5] = tmp_vec[17];
						tmp_vec[6] = tmp_vec[18];
					}
				}
			}
			else
			{
				if (order == 10) {
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
					int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
					int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], 0, 0};

					tmp_vec[0] = vld1q_s32(data - 10);
					tmp_vec[1] = vld1q_s32(data - 9);
					tmp_vec[2] = vld1q_s32(data - 8);
					tmp_vec[3] = vld1q_s32(data - 7);
					tmp_vec[4] = vld1q_s32(data - 6);
					tmp_vec[5] = vld1q_s32(data - 5);


					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;

						tmp_vec[6] = vld1q_s32(data + i - 4);
						tmp_vec[7] = vld1q_s32(data + i - 3);
						tmp_vec[8] = vld1q_s32(data + i - 2);
						tmp_vec[9] = vld1q_s32(data + i - 1);
						tmp_vec[10] = vld1q_s32(data + i - 0);
						tmp_vec[11] = vld1q_s32(data + i + 1);
						tmp_vec[12] = vld1q_s32(data + i + 2);
						tmp_vec[13] = vld1q_s32(data + i + 3);
						tmp_vec[14] = vld1q_s32(data + i + 4);
						tmp_vec[15] = vld1q_s32(data + i + 5);
						tmp_vec[16] = vld1q_s32(data + i + 6);
						tmp_vec[17] = vld1q_s32(data + i + 7);

						MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_2, 1)
						MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 0)
						MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 3)
						MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 2)
						MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 1)
						MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 0)
						MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 3)
						MACC_64_BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 2)
						MACC_64_BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 1)
						MACC_64_BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 0)

						SHIFT_SUMS_64BITS_AND_STORE_SUB()

						tmp_vec[0] = tmp_vec[12];
						tmp_vec[1] = tmp_vec[13];
						tmp_vec[2] = tmp_vec[14];
						tmp_vec[3] = tmp_vec[15];
						tmp_vec[4] = tmp_vec[16];
						tmp_vec[5] = tmp_vec[17];
					}
				}

				else /* order == 9 */ {
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
					int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
					int32x4_t qlp_coeff_2 = {qlp_coeff[8], 0, 0, 0};

					tmp_vec[0] = vld1q_s32(data - 9);
					tmp_vec[1] = vld1q_s32(data - 8);
					tmp_vec[2] = vld1q_s32(data - 7);
					tmp_vec[3] = vld1q_s32(data - 6);
					tmp_vec[4] = vld1q_s32(data - 5);

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;

						tmp_vec[5] = vld1q_s32(data + i - 4);
						tmp_vec[6] = vld1q_s32(data + i - 3);
						tmp_vec[7] = vld1q_s32(data + i - 2);
						tmp_vec[8] = vld1q_s32(data + i - 1);
						tmp_vec[9] = vld1q_s32(data + i - 0);
						tmp_vec[10] = vld1q_s32(data + i + 1);
						tmp_vec[11] = vld1q_s32(data + i + 2);
						tmp_vec[12] = vld1q_s32(data + i + 3);
						tmp_vec[13] = vld1q_s32(data + i + 4);
						tmp_vec[14] = vld1q_s32(data + i + 5);
						tmp_vec[15] = vld1q_s32(data + i + 6);
						tmp_vec[16] = vld1q_s32(data + i + 7);

						MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_2, 0)
						MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 3)
						MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 2)
						MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 1)
						MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 0)
						MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 3)
						MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 2)
						MACC_64_BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 1)
						MACC_64_BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 0)

						SHIFT_SUMS_64BITS_AND_STORE_SUB()

						tmp_vec[0] = tmp_vec[12];
						tmp_vec[1] = tmp_vec[13];
						tmp_vec[2] = tmp_vec[14];
						tmp_vec[3] = tmp_vec[15];
						tmp_vec[4] = tmp_vec[16];
					}
				}
			}
		}
		else if (order > 4)
		{
			if (order > 6)
			{
				if (order == 8)
				{
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
					int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};

					tmp_vec[0] = vld1q_s32(data - 8);
					tmp_vec[1] = vld1q_s32(data - 7);
					tmp_vec[2] = vld1q_s32(data - 6);
					tmp_vec[3] = vld1q_s32(data - 5);

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;

						tmp_vec[4] = vld1q_s32(data + i - 4);
						tmp_vec[5] = vld1q_s32(data + i - 3);
						tmp_vec[6] = vld1q_s32(data + i - 2);
						tmp_vec[7] = vld1q_s32(data + i - 1);
						tmp_vec[8] = vld1q_s32(data + i - 0);
						tmp_vec[9] = vld1q_s32(data + i + 1);
						tmp_vec[10] = vld1q_s32(data + i + 2);
						tmp_vec[11] = vld1q_s32(data + i + 3);
						tmp_vec[12] = vld1q_s32(data + i + 4);
						tmp_vec[13] = vld1q_s32(data + i + 5);
						tmp_vec[14] = vld1q_s32(data + i + 6);
						tmp_vec[15] = vld1q_s32(data + i + 7);


						MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_1, 3)
						MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 2)
						MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 1)
						MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 0)
						MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 3)
						MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 2)
						MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 1)
						MACC_64_BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 0)

						SHIFT_SUMS_64BITS_AND_STORE_SUB()

						tmp_vec[0] = tmp_vec[12];
						tmp_vec[1] = tmp_vec[13];
						tmp_vec[2] = tmp_vec[14];
						tmp_vec[3] = tmp_vec[15];
					}
				}
				else /* order == 7 */
				{
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
					int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0};

					tmp_vec[0] = vld1q_s32(data - 7);
					tmp_vec[1] = vld1q_s32(data - 6);
					tmp_vec[2] = vld1q_s32(data - 5);


					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
						tmp_vec[3] = vld1q_s32(data + i - 4);
						tmp_vec[4] = vld1q_s32(data + i - 3);
						tmp_vec[5] = vld1q_s32(data + i - 2);
						tmp_vec[6] = vld1q_s32(data + i - 1);
						tmp_vec[7] = vld1q_s32(data + i - 0);
						tmp_vec[8] = vld1q_s32(data + i + 1);
						tmp_vec[9] = vld1q_s32(data + i + 2);
						tmp_vec[10] = vld1q_s32(data + i + 3);
						tmp_vec[11] = vld1q_s32(data + i + 4);
						tmp_vec[12] = vld1q_s32(data + i + 5);
						tmp_vec[13] = vld1q_s32(data + i + 6);
						tmp_vec[14] = vld1q_s32(data + i + 7);


						MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_1, 2)
						MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 1)
						MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 0)
						MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 3)
						MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 2)
						MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 1)
						MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 0)

						SHIFT_SUMS_64BITS_AND_STORE_SUB()

						tmp_vec[0] = tmp_vec[12];
						tmp_vec[1] = tmp_vec[13];
						tmp_vec[2] = tmp_vec[14];
					}
				}
			}
			else
			{
				if (order == 6) {
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
					int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], 0, 0};

					tmp_vec[0] = vld1q_s32(data - 6);
					tmp_vec[1] = vld1q_s32(data - 5);

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;

						tmp_vec[2] = vld1q_s32(data + i - 4);
						tmp_vec[3] = vld1q_s32(data + i - 3);
						tmp_vec[4] = vld1q_s32(data + i - 2);
						tmp_vec[5] = vld1q_s32(data + i - 1);
						tmp_vec[6] = vld1q_s32(data + i - 0);
						tmp_vec[7] = vld1q_s32(data + i + 1);
						tmp_vec[8] = vld1q_s32(data + i + 2);
						tmp_vec[9] = vld1q_s32(data + i + 3);
						tmp_vec[10] = vld1q_s32(data + i + 4);
						tmp_vec[11] = vld1q_s32(data + i + 5);
						tmp_vec[12] = vld1q_s32(data + i + 6);
						tmp_vec[13] = vld1q_s32(data + i + 7);


						MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_1, 1)
						MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 0)
						MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 3)
						MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 2)
						MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 1)
						MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 0)

						SHIFT_SUMS_64BITS_AND_STORE_SUB()

						tmp_vec[0] = tmp_vec[12];
						tmp_vec[1] = tmp_vec[13];
					}
				}

				else
				{ /* order == 5 */
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
					int32x4_t qlp_coeff_1 = {qlp_coeff[4], 0, 0, 0};

					tmp_vec[0] = vld1q_s32(data - 5);

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
						tmp_vec[1] = vld1q_s32(data + i - 4);
						tmp_vec[2] = vld1q_s32(data + i - 3);
						tmp_vec[3] = vld1q_s32(data + i - 2);
						tmp_vec[4] = vld1q_s32(data + i - 1);
						tmp_vec[5] = vld1q_s32(data + i - 0);
						tmp_vec[6] = vld1q_s32(data + i + 1);
						tmp_vec[7] = vld1q_s32(data + i + 2);
						tmp_vec[8] = vld1q_s32(data + i + 3);
						tmp_vec[9] = vld1q_s32(data + i + 4);
						tmp_vec[10] = vld1q_s32(data + i + 5);
						tmp_vec[11] = vld1q_s32(data + i + 6);
						tmp_vec[12] = vld1q_s32(data + i + 7);

						MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_1, 0)
						MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 3)
						MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 2)
						MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 1)
						MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 0)

						SHIFT_SUMS_64BITS_AND_STORE_SUB()

						tmp_vec[0] = tmp_vec[12];
					}
				}
			}
		}
		else
		{
			if (order > 2)
			{
				if (order == 4)
				{
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
						tmp_vec[0] = vld1q_s32(data + i - 4);
						tmp_vec[1] = vld1q_s32(data + i - 3);
						tmp_vec[2] = vld1q_s32(data + i - 2);
						tmp_vec[3] = vld1q_s32(data + i - 1);
						tmp_vec[4] = vld1q_s32(data + i - 0);
						tmp_vec[5] = vld1q_s32(data + i + 1);
						tmp_vec[6] = vld1q_s32(data + i + 2);
						tmp_vec[7] = vld1q_s32(data + i + 3);
						tmp_vec[8] = vld1q_s32(data + i + 4);
						tmp_vec[9] = vld1q_s32(data + i + 5);
						tmp_vec[10] = vld1q_s32(data + i + 6);
						tmp_vec[11] = vld1q_s32(data + i + 7);

						MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_0, 3)
						MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 2)
						MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 1)
						MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 0)

						SHIFT_SUMS_64BITS_AND_STORE_SUB()
					}
				}
				else
				{ /* order == 3 */

					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0};

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
						tmp_vec[0] = vld1q_s32(data + i - 3);
						tmp_vec[1] = vld1q_s32(data + i - 2);
						tmp_vec[2] = vld1q_s32(data + i - 1);
						tmp_vec[4] = vld1q_s32(data + i + 1);
						tmp_vec[5] = vld1q_s32(data + i + 2);
						tmp_vec[6] = vld1q_s32(data + i + 3);
						tmp_vec[8] = vld1q_s32(data + i + 5);
						tmp_vec[9] = vld1q_s32(data + i + 6);
						tmp_vec[10] = vld1q_s32(data + i + 7);

						MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_0, 2)
						MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 1)
						MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 0)

						SHIFT_SUMS_64BITS_AND_STORE_SUB()
					}
				}
			}
			else
			{
				if (order == 2)
				{
					int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], 0, 0};

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
						tmp_vec[0] = vld1q_s32(data + i - 2);
						tmp_vec[1] = vld1q_s32(data + i - 1);
						tmp_vec[4] = vld1q_s32(data + i + 2);
						tmp_vec[5] = vld1q_s32(data + i + 3);
						tmp_vec[8] = vld1q_s32(data + i + 6);
						tmp_vec[9] = vld1q_s32(data + i + 7);

						MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_0, 1)
						MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 0)

						SHIFT_SUMS_64BITS_AND_STORE_SUB()
					}
				}

				else
				{ /* order == 1 */

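					/* vmull_s32 takes the low half (int32x2_t) and vmull_high_s32 the full
					 * int32x4_t, so the single coefficient is duplicated into both widths. */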
					int32x2_t qlp_coeff_0_2 = vdup_n_s32(qlp_coeff[0]);
					int32x4_t qlp_coeff_0_4 = vdupq_n_s32(qlp_coeff[0]);

					for (i = 0; i < (int)data_len - 11; i += 12)
					{
						int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
						tmp_vec[0] = vld1q_s32(data + i - 1);
						tmp_vec[4] = vld1q_s32(data + i + 3);
						tmp_vec[8] = vld1q_s32(data + i + 7);

						summ_l_0 = vmull_s32(vget_low_s32(tmp_vec[0]), qlp_coeff_0_2);
						summ_h_0 = vmull_high_s32(tmp_vec[0], qlp_coeff_0_4);

						summ_l_1 = vmull_s32(vget_low_s32(tmp_vec[4]), qlp_coeff_0_2);
						summ_h_1 = vmull_high_s32(tmp_vec[4], qlp_coeff_0_4);

						summ_l_2 = vmull_s32(vget_low_s32(tmp_vec[8]), qlp_coeff_0_2);
						summ_h_2 = vmull_high_s32(tmp_vec[8], qlp_coeff_0_4);

						SHIFT_SUMS_64BITS_AND_STORE_SUB()
					}
				}
			}
		}
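		/* Scalar tail: finish the remaining samples (or the whole block when
		 * data_len < 12) with 64-bit sums. */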
		for (; i < (int)data_len; i++)
		{
			sum = 0;
			switch (order)
			{
				case 12:
					sum += qlp_coeff[11] * (FLAC__int64)data[i - 12]; /* Falls through. */
				case 11:
					sum += qlp_coeff[10] * (FLAC__int64)data[i - 11]; /* Falls through. */
				case 10:
					sum += qlp_coeff[9] * (FLAC__int64)data[i - 10]; /* Falls through. */
				case 9:
					sum += qlp_coeff[8] * (FLAC__int64)data[i - 9]; /* Falls through. */
				case 8:
					sum += qlp_coeff[7] * (FLAC__int64)data[i - 8]; /* Falls through. */
				case 7:
					sum += qlp_coeff[6] * (FLAC__int64)data[i - 7]; /* Falls through. */
				case 6:
					sum += qlp_coeff[5] * (FLAC__int64)data[i - 6]; /* Falls through. */
				case 5:
					sum += qlp_coeff[4] * (FLAC__int64)data[i - 5]; /* Falls through. */
				case 4:
					sum += qlp_coeff[3] * (FLAC__int64)data[i - 4]; /* Falls through. */
				case 3:
					sum += qlp_coeff[2] * (FLAC__int64)data[i - 3]; /* Falls through. */
				case 2:
					sum += qlp_coeff[1] * (FLAC__int64)data[i - 2]; /* Falls through. */
				case 1:
					sum += qlp_coeff[0] * (FLAC__int64)data[i - 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else
	{ /* order > 12 */
		for (i = 0; i < (int)data_len; i++)
		{
			sum = 0;
			switch (order)
			{
				case 32:
					sum += qlp_coeff[31] * (FLAC__int64)data[i - 32]; /* Falls through. */
				case 31:
					sum += qlp_coeff[30] * (FLAC__int64)data[i - 31]; /* Falls through. */
				case 30:
					sum += qlp_coeff[29] * (FLAC__int64)data[i - 30]; /* Falls through. */
				case 29:
					sum += qlp_coeff[28] * (FLAC__int64)data[i - 29]; /* Falls through. */
				case 28:
					sum += qlp_coeff[27] * (FLAC__int64)data[i - 28]; /* Falls through. */
				case 27:
					sum += qlp_coeff[26] * (FLAC__int64)data[i - 27]; /* Falls through. */
				case 26:
					sum += qlp_coeff[25] * (FLAC__int64)data[i - 26]; /* Falls through. */
				case 25:
					sum += qlp_coeff[24] * (FLAC__int64)data[i - 25]; /* Falls through. */
				case 24:
					sum += qlp_coeff[23] * (FLAC__int64)data[i - 24]; /* Falls through. */
				case 23:
					sum += qlp_coeff[22] * (FLAC__int64)data[i - 23]; /* Falls through. */
				case 22:
					sum += qlp_coeff[21] * (FLAC__int64)data[i - 22]; /* Falls through. */
				case 21:
					sum += qlp_coeff[20] * (FLAC__int64)data[i - 21]; /* Falls through. */
				case 20:
					sum += qlp_coeff[19] * (FLAC__int64)data[i - 20]; /* Falls through. */
				case 19:
					sum += qlp_coeff[18] * (FLAC__int64)data[i - 19]; /* Falls through. */
				case 18:
					sum += qlp_coeff[17] * (FLAC__int64)data[i - 18]; /* Falls through. */
				case 17:
					sum += qlp_coeff[16] * (FLAC__int64)data[i - 17]; /* Falls through. */
				case 16:
					sum += qlp_coeff[15] * (FLAC__int64)data[i - 16]; /* Falls through. */
				case 15:
					sum += qlp_coeff[14] * (FLAC__int64)data[i - 15]; /* Falls through. */
				case 14:
					sum += qlp_coeff[13] * (FLAC__int64)data[i - 14]; /* Falls through. */
				case 13:
					sum += qlp_coeff[12] * (FLAC__int64)data[i - 13];
					sum += qlp_coeff[11] * (FLAC__int64)data[i - 12];
					sum += qlp_coeff[10] * (FLAC__int64)data[i - 11];
					sum += qlp_coeff[9] * (FLAC__int64)data[i - 10];
					sum += qlp_coeff[8] * (FLAC__int64)data[i - 9];
					sum += qlp_coeff[7] * (FLAC__int64)data[i - 8];
					sum += qlp_coeff[6] * (FLAC__int64)data[i - 7];
					sum += qlp_coeff[5] * (FLAC__int64)data[i - 6];
					sum += qlp_coeff[4] * (FLAC__int64)data[i - 5];
					sum += qlp_coeff[3] * (FLAC__int64)data[i - 4];
					sum += qlp_coeff[2] * (FLAC__int64)data[i - 3];
					sum += qlp_coeff[1] * (FLAC__int64)data[i - 2];
					sum += qlp_coeff[0] * (FLAC__int64)data[i - 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}

	return;
}

#endif /* FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */
