1*600f14f4SXin Li /* libFLAC - Free Lossless Audio Codec library
2*600f14f4SXin Li * Copyright (C) 2000-2009 Josh Coalson
3*600f14f4SXin Li * Copyright (C) 2011-2023 Xiph.Org Foundation
4*600f14f4SXin Li *
5*600f14f4SXin Li * Redistribution and use in source and binary forms, with or without
6*600f14f4SXin Li * modification, are permitted provided that the following conditions
7*600f14f4SXin Li * are met:
8*600f14f4SXin Li *
9*600f14f4SXin Li * - Redistributions of source code must retain the above copyright
10*600f14f4SXin Li * notice, this list of conditions and the following disclaimer.
11*600f14f4SXin Li *
12*600f14f4SXin Li * - Redistributions in binary form must reproduce the above copyright
13*600f14f4SXin Li * notice, this list of conditions and the following disclaimer in the
14*600f14f4SXin Li * documentation and/or other materials provided with the distribution.
15*600f14f4SXin Li *
16*600f14f4SXin Li * - Neither the name of the Xiph.org Foundation nor the names of its
17*600f14f4SXin Li * contributors may be used to endorse or promote products derived from
18*600f14f4SXin Li * this software without specific prior written permission.
19*600f14f4SXin Li *
20*600f14f4SXin Li * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21*600f14f4SXin Li * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22*600f14f4SXin Li * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23*600f14f4SXin Li * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
24*600f14f4SXin Li * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25*600f14f4SXin Li * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26*600f14f4SXin Li * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27*600f14f4SXin Li * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28*600f14f4SXin Li * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29*600f14f4SXin Li * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30*600f14f4SXin Li * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31*600f14f4SXin Li */
32*600f14f4SXin Li
33*600f14f4SXin Li #include "private/cpu.h"
34*600f14f4SXin Li
35*600f14f4SXin Li #ifndef FLAC__INTEGER_ONLY_LIBRARY
36*600f14f4SXin Li #ifndef FLAC__NO_ASM
37*600f14f4SXin Li #if defined FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN
38*600f14f4SXin Li #include "private/lpc.h"
39*600f14f4SXin Li #include "FLAC/assert.h"
40*600f14f4SXin Li #include "FLAC/format.h"
41*600f14f4SXin Li #include "private/macros.h"
42*600f14f4SXin Li #include <arm_neon.h>
43*600f14f4SXin Li
44*600f14f4SXin Li #if FLAC__HAS_A64NEONINTRIN
FLAC__lpc_compute_autocorrelation_intrin_neon_lag_14(const FLAC__real data[],uint32_t data_len,uint32_t lag,double autoc[])45*600f14f4SXin Li void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
46*600f14f4SXin Li {
47*600f14f4SXin Li #undef MAX_LAG
48*600f14f4SXin Li #define MAX_LAG 14
49*600f14f4SXin Li #include "deduplication/lpc_compute_autocorrelation_intrin_neon.c"
50*600f14f4SXin Li }
51*600f14f4SXin Li
FLAC__lpc_compute_autocorrelation_intrin_neon_lag_10(const FLAC__real data[],uint32_t data_len,uint32_t lag,double autoc[])52*600f14f4SXin Li void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
53*600f14f4SXin Li {
54*600f14f4SXin Li #undef MAX_LAG
55*600f14f4SXin Li #define MAX_LAG 10
56*600f14f4SXin Li #include "deduplication/lpc_compute_autocorrelation_intrin_neon.c"
57*600f14f4SXin Li }
58*600f14f4SXin Li
FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[],uint32_t data_len,uint32_t lag,double autoc[])59*600f14f4SXin Li void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
60*600f14f4SXin Li {
61*600f14f4SXin Li #undef MAX_LAG
62*600f14f4SXin Li #define MAX_LAG 8
63*600f14f4SXin Li #include "deduplication/lpc_compute_autocorrelation_intrin_neon.c"
64*600f14f4SXin Li }
65*600f14f4SXin Li
66*600f14f4SXin Li #endif /* ifdef FLAC__HAS_A64NEONINTRIN */
67*600f14f4SXin Li
68*600f14f4SXin Li
69*600f14f4SXin Li #define MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_vec, lane) \
70*600f14f4SXin Li summ_0 = vmulq_laneq_s32(tmp_vec[0], qlp_coeff_vec, lane); \
71*600f14f4SXin Li summ_1 = vmulq_laneq_s32(tmp_vec[4], qlp_coeff_vec, lane); \
72*600f14f4SXin Li summ_2 = vmulq_laneq_s32(tmp_vec[8], qlp_coeff_vec, lane);
73*600f14f4SXin Li
74*600f14f4SXin Li
75*600f14f4SXin Li #define MACC_32BIT_LOOP_UNROOL_3(tmp_vec_ind, qlp_coeff_vec, lane) \
76*600f14f4SXin Li summ_0 = vmlaq_laneq_s32(summ_0,tmp_vec[tmp_vec_ind] ,qlp_coeff_vec, lane); \
77*600f14f4SXin Li summ_1 = vmlaq_laneq_s32(summ_1,tmp_vec[tmp_vec_ind+4] ,qlp_coeff_vec, lane); \
78*600f14f4SXin Li summ_2 = vmlaq_laneq_s32(summ_2,tmp_vec[tmp_vec_ind+8] ,qlp_coeff_vec, lane);
79*600f14f4SXin Li
FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__int32 * data,uint32_t data_len,const FLAC__int32 qlp_coeff[],uint32_t order,int lp_quantization,FLAC__int32 residual[])80*600f14f4SXin Li void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
81*600f14f4SXin Li {
82*600f14f4SXin Li int i;
83*600f14f4SXin Li FLAC__int32 sum;
84*600f14f4SXin Li int32x4_t tmp_vec[20];
85*600f14f4SXin Li
86*600f14f4SXin Li FLAC__ASSERT(order > 0);
87*600f14f4SXin Li FLAC__ASSERT(order <= 32);
88*600f14f4SXin Li
89*600f14f4SXin Li // Using prologue reads is valid as encoder->private_->local_lpc_compute_residual_from_qlp_coefficients(signal+order,....)
90*600f14f4SXin Li if(order <= 12) {
91*600f14f4SXin Li if(order > 8) {
92*600f14f4SXin Li if(order > 10) {
93*600f14f4SXin Li if (order == 12) {
94*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
95*600f14f4SXin Li int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
96*600f14f4SXin Li int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], qlp_coeff[11]};
97*600f14f4SXin Li
98*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data - 12);
99*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data - 11);
100*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data - 10);
101*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data - 9);
102*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data - 8);
103*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data - 7);
104*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data - 6);
105*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data - 5);
106*600f14f4SXin Li
107*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
108*600f14f4SXin Li {
109*600f14f4SXin Li int32x4_t summ_0, summ_1, summ_2;
110*600f14f4SXin Li
111*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i - 4);
112*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data+i-3);
113*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data+i-2);
114*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data+i-1);
115*600f14f4SXin Li tmp_vec[12] = vld1q_s32(data+i);
116*600f14f4SXin Li tmp_vec[13] = vld1q_s32(data+i+1);
117*600f14f4SXin Li tmp_vec[14] = vld1q_s32(data+i+2);
118*600f14f4SXin Li tmp_vec[15] = vld1q_s32(data+i+3);
119*600f14f4SXin Li tmp_vec[16] = vld1q_s32(data + i + 4);
120*600f14f4SXin Li tmp_vec[17] = vld1q_s32(data + i + 5);
121*600f14f4SXin Li tmp_vec[18] = vld1q_s32(data + i + 6);
122*600f14f4SXin Li tmp_vec[19] = vld1q_s32(data + i + 7);
123*600f14f4SXin Li
124*600f14f4SXin Li MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_2, 3)
125*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 2)
126*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_2, 1)
127*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_2, 0)
128*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 3)
129*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 2)
130*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_1, 1)
131*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(7, qlp_coeff_1, 0)
132*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 3)
133*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 2)
134*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(10, qlp_coeff_0, 1)
135*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(11, qlp_coeff_0, 0)
136*600f14f4SXin Li
137*600f14f4SXin Li vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
138*600f14f4SXin Li vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
139*600f14f4SXin Li vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));
140*600f14f4SXin Li
141*600f14f4SXin Li tmp_vec[0] = tmp_vec[12];
142*600f14f4SXin Li tmp_vec[1] = tmp_vec[13];
143*600f14f4SXin Li tmp_vec[2] = tmp_vec[14];
144*600f14f4SXin Li tmp_vec[3] = tmp_vec[15];
145*600f14f4SXin Li tmp_vec[4] = tmp_vec[16];
146*600f14f4SXin Li tmp_vec[5] = tmp_vec[17];
147*600f14f4SXin Li tmp_vec[6] = tmp_vec[18];
148*600f14f4SXin Li tmp_vec[7] = tmp_vec[19];
149*600f14f4SXin Li }
150*600f14f4SXin Li }
151*600f14f4SXin Li
152*600f14f4SXin Li else { /* order == 11 */
153*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
154*600f14f4SXin Li int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
155*600f14f4SXin Li int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], 0};
156*600f14f4SXin Li
157*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data - 11);
158*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data - 10);
159*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data - 9);
160*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data - 8);
161*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data - 7);
162*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data - 6);
163*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data - 5);
164*600f14f4SXin Li
165*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
166*600f14f4SXin Li {
167*600f14f4SXin Li int32x4_t summ_0, summ_1, summ_2;
168*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data + i - 4);
169*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i - 3);
170*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i - 2);
171*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i - 1);
172*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data + i - 0);
173*600f14f4SXin Li tmp_vec[12] = vld1q_s32(data + i + 1);
174*600f14f4SXin Li tmp_vec[13] = vld1q_s32(data + i + 2);
175*600f14f4SXin Li tmp_vec[14] = vld1q_s32(data + i + 3);
176*600f14f4SXin Li tmp_vec[15] = vld1q_s32(data + i + 4);
177*600f14f4SXin Li tmp_vec[16] = vld1q_s32(data + i + 5);
178*600f14f4SXin Li tmp_vec[17] = vld1q_s32(data + i + 6);
179*600f14f4SXin Li tmp_vec[18] = vld1q_s32(data + i + 7);
180*600f14f4SXin Li
181*600f14f4SXin Li
182*600f14f4SXin Li MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_2, 2)
183*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 1)
184*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_2, 0)
185*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 3)
186*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 2)
187*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 1)
188*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_1, 0)
189*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 3)
190*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 2)
191*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 1)
192*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(10, qlp_coeff_0, 0)
193*600f14f4SXin Li
194*600f14f4SXin Li vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
195*600f14f4SXin Li vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
196*600f14f4SXin Li vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));
197*600f14f4SXin Li
198*600f14f4SXin Li
199*600f14f4SXin Li tmp_vec[0] = tmp_vec[12];
200*600f14f4SXin Li tmp_vec[1] = tmp_vec[13];
201*600f14f4SXin Li tmp_vec[2] = tmp_vec[14];
202*600f14f4SXin Li tmp_vec[3] = tmp_vec[15];
203*600f14f4SXin Li tmp_vec[4] = tmp_vec[16];
204*600f14f4SXin Li tmp_vec[5] = tmp_vec[17];
205*600f14f4SXin Li tmp_vec[6] = tmp_vec[18];
206*600f14f4SXin Li }
207*600f14f4SXin Li }
208*600f14f4SXin Li }
209*600f14f4SXin Li else {
210*600f14f4SXin Li if(order == 10) {
211*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
212*600f14f4SXin Li int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
213*600f14f4SXin Li int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], 0, 0};
214*600f14f4SXin Li
215*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data - 10);
216*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data - 9);
217*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data - 8);
218*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data - 7);
219*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data - 6);
220*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data - 5);
221*600f14f4SXin Li
222*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
223*600f14f4SXin Li {
224*600f14f4SXin Li int32x4_t summ_0, summ_1, summ_2;
225*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data + i - 4);
226*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data + i - 3);
227*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i - 2);
228*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i - 1);
229*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i - 0);
230*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data + i + 1);
231*600f14f4SXin Li tmp_vec[12] = vld1q_s32(data + i + 2);
232*600f14f4SXin Li tmp_vec[13] = vld1q_s32(data + i + 3);
233*600f14f4SXin Li tmp_vec[14] = vld1q_s32(data + i + 4);
234*600f14f4SXin Li tmp_vec[15] = vld1q_s32(data + i + 5);
235*600f14f4SXin Li tmp_vec[16] = vld1q_s32(data + i + 6);
236*600f14f4SXin Li tmp_vec[17] = vld1q_s32(data + i + 7);
237*600f14f4SXin Li
238*600f14f4SXin Li
239*600f14f4SXin Li MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_2, 1)
240*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 0)
241*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 3)
242*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 2)
243*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 1)
244*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 0)
245*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 3)
246*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 2)
247*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 1)
248*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 0)
249*600f14f4SXin Li
250*600f14f4SXin Li vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
251*600f14f4SXin Li vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
252*600f14f4SXin Li vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));
253*600f14f4SXin Li
254*600f14f4SXin Li
255*600f14f4SXin Li tmp_vec[0] = tmp_vec[12];
256*600f14f4SXin Li tmp_vec[1] = tmp_vec[13];
257*600f14f4SXin Li tmp_vec[2] = tmp_vec[14];
258*600f14f4SXin Li tmp_vec[3] = tmp_vec[15];
259*600f14f4SXin Li tmp_vec[4] = tmp_vec[16];
260*600f14f4SXin Li tmp_vec[5] = tmp_vec[17];
261*600f14f4SXin Li }
262*600f14f4SXin Li }
263*600f14f4SXin Li else { /* order == 9 */
264*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
265*600f14f4SXin Li int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
266*600f14f4SXin Li int32x4_t qlp_coeff_2 = {qlp_coeff[8], 0, 0, 0};
267*600f14f4SXin Li
268*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data - 9);
269*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data - 8);
270*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data - 7);
271*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data - 6);
272*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data - 5);
273*600f14f4SXin Li
274*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
275*600f14f4SXin Li {
276*600f14f4SXin Li int32x4_t summ_0, summ_1, summ_2;
277*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data + i - 4);
278*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data + i - 3);
279*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data + i - 2);
280*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i - 1);
281*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i - 0);
282*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i + 1);
283*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data + i + 2);
284*600f14f4SXin Li tmp_vec[12] = vld1q_s32(data + i + 3);
285*600f14f4SXin Li tmp_vec[13] = vld1q_s32(data + i + 4);
286*600f14f4SXin Li tmp_vec[14] = vld1q_s32(data + i + 5);
287*600f14f4SXin Li tmp_vec[15] = vld1q_s32(data + i + 6);
288*600f14f4SXin Li tmp_vec[16] = vld1q_s32(data + i + 7);
289*600f14f4SXin Li
290*600f14f4SXin Li MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_2, 0)
291*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 3)
292*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 2)
293*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 1)
294*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 0)
295*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 3)
296*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 2)
297*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 1)
298*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 0)
299*600f14f4SXin Li
300*600f14f4SXin Li vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
301*600f14f4SXin Li vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
302*600f14f4SXin Li vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));
303*600f14f4SXin Li
304*600f14f4SXin Li tmp_vec[0] = tmp_vec[12];
305*600f14f4SXin Li tmp_vec[1] = tmp_vec[13];
306*600f14f4SXin Li tmp_vec[2] = tmp_vec[14];
307*600f14f4SXin Li tmp_vec[3] = tmp_vec[15];
308*600f14f4SXin Li tmp_vec[4] = tmp_vec[16];
309*600f14f4SXin Li }
310*600f14f4SXin Li }
311*600f14f4SXin Li }
312*600f14f4SXin Li }
313*600f14f4SXin Li else if(order > 4) {
314*600f14f4SXin Li if(order > 6) {
315*600f14f4SXin Li if(order == 8) {
316*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
317*600f14f4SXin Li int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
318*600f14f4SXin Li
319*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data - 8);
320*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data - 7);
321*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data - 6);
322*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data - 5);
323*600f14f4SXin Li
324*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
325*600f14f4SXin Li {
326*600f14f4SXin Li int32x4_t summ_0, summ_1, summ_2;
327*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data + i - 4);
328*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data + i - 3);
329*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data + i - 2);
330*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data + i - 1);
331*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i - 0);
332*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i + 1);
333*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i + 2);
334*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data + i + 3);
335*600f14f4SXin Li tmp_vec[12] = vld1q_s32(data + i + 4);
336*600f14f4SXin Li tmp_vec[13] = vld1q_s32(data + i + 5);
337*600f14f4SXin Li tmp_vec[14] = vld1q_s32(data + i + 6);
338*600f14f4SXin Li tmp_vec[15] = vld1q_s32(data + i + 7);
339*600f14f4SXin Li
340*600f14f4SXin Li MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_1, 3)
341*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 2)
342*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 1)
343*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 0)
344*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 3)
345*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 2)
346*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 1)
347*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 0)
348*600f14f4SXin Li
349*600f14f4SXin Li vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
350*600f14f4SXin Li vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
351*600f14f4SXin Li vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));
352*600f14f4SXin Li
353*600f14f4SXin Li tmp_vec[0] = tmp_vec[12];
354*600f14f4SXin Li tmp_vec[1] = tmp_vec[13];
355*600f14f4SXin Li tmp_vec[2] = tmp_vec[14];
356*600f14f4SXin Li tmp_vec[3] = tmp_vec[15];
357*600f14f4SXin Li }
358*600f14f4SXin Li }
359*600f14f4SXin Li else { /* order == 7 */
360*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
361*600f14f4SXin Li int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0};
362*600f14f4SXin Li
363*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data - 7);
364*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data - 6);
365*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data - 5);
366*600f14f4SXin Li
367*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
368*600f14f4SXin Li {
369*600f14f4SXin Li int32x4_t summ_0, summ_1, summ_2;
370*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data + i - 4);
371*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data + i - 3);
372*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data + i - 2);
373*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data + i - 1);
374*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data + i - 0);
375*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i + 1);
376*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i + 2);
377*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i + 3);
378*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data + i + 4);
379*600f14f4SXin Li tmp_vec[12] = vld1q_s32(data + i + 5);
380*600f14f4SXin Li tmp_vec[13] = vld1q_s32(data + i + 6);
381*600f14f4SXin Li tmp_vec[14] = vld1q_s32(data + i + 7);
382*600f14f4SXin Li
383*600f14f4SXin Li MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_1, 2)
384*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 1)
385*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 0)
386*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 3)
387*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 2)
388*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 1)
389*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 0)
390*600f14f4SXin Li
391*600f14f4SXin Li vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
392*600f14f4SXin Li vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
393*600f14f4SXin Li vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));
394*600f14f4SXin Li
395*600f14f4SXin Li tmp_vec[0] = tmp_vec[12];
396*600f14f4SXin Li tmp_vec[1] = tmp_vec[13];
397*600f14f4SXin Li tmp_vec[2] = tmp_vec[14];
398*600f14f4SXin Li }
399*600f14f4SXin Li }
400*600f14f4SXin Li }
401*600f14f4SXin Li else {
402*600f14f4SXin Li if(order == 6) {
403*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
404*600f14f4SXin Li int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], 0, 0};
405*600f14f4SXin Li
406*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data - 6);
407*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data - 5);
408*600f14f4SXin Li
409*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
410*600f14f4SXin Li {
411*600f14f4SXin Li int32x4_t summ_0, summ_1, summ_2;
412*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data + i - 4);
413*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data + i - 3);
414*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data + i - 2);
415*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data + i - 1);
416*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data + i - 0);
417*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data + i + 1);
418*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i + 2);
419*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i + 3);
420*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i + 4);
421*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data + i + 5);
422*600f14f4SXin Li tmp_vec[12] = vld1q_s32(data + i + 6);
423*600f14f4SXin Li tmp_vec[13] = vld1q_s32(data + i + 7);
424*600f14f4SXin Li
425*600f14f4SXin Li MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_1, 1)
426*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 0)
427*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 3)
428*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 2)
429*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 1)
430*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 0)
431*600f14f4SXin Li
432*600f14f4SXin Li vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
433*600f14f4SXin Li vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
434*600f14f4SXin Li vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));
435*600f14f4SXin Li
436*600f14f4SXin Li tmp_vec[0] = tmp_vec[12];
437*600f14f4SXin Li tmp_vec[1] = tmp_vec[13];
438*600f14f4SXin Li }
439*600f14f4SXin Li }
440*600f14f4SXin Li else { /* order == 5 */
441*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
442*600f14f4SXin Li int32x4_t qlp_coeff_1 = {qlp_coeff[4], 0, 0, 0};
443*600f14f4SXin Li
444*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data - 5);
445*600f14f4SXin Li
446*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
447*600f14f4SXin Li {
448*600f14f4SXin Li int32x4_t summ_0, summ_1, summ_2;
449*600f14f4SXin Li
450*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data + i - 4);
451*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data + i - 3);
452*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data + i - 2);
453*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data + i - 1);
454*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data + i - 0);
455*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data + i + 1);
456*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data + i + 2);
457*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i + 3);
458*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i + 4);
459*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i + 5);
460*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data + i + 6);
461*600f14f4SXin Li tmp_vec[12] = vld1q_s32(data + i + 7);
462*600f14f4SXin Li
463*600f14f4SXin Li MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_1, 0)
464*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 3)
465*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 2)
466*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 1)
467*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 0)
468*600f14f4SXin Li
469*600f14f4SXin Li vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
470*600f14f4SXin Li vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
471*600f14f4SXin Li vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));
472*600f14f4SXin Li
473*600f14f4SXin Li tmp_vec[0] = tmp_vec[12];
474*600f14f4SXin Li }
475*600f14f4SXin Li }
476*600f14f4SXin Li }
477*600f14f4SXin Li }
478*600f14f4SXin Li else {
479*600f14f4SXin Li if(order > 2) {
480*600f14f4SXin Li if(order == 4) {
481*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
482*600f14f4SXin Li
483*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
484*600f14f4SXin Li {
485*600f14f4SXin Li int32x4_t summ_0, summ_1, summ_2;
486*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data + i - 4);
487*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data + i - 3);
488*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data + i - 2);
489*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data + i - 1);
490*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data + i - 0);
491*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data + i + 1);
492*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data + i + 2);
493*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data + i + 3);
494*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i + 4);
495*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i + 5);
496*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i + 6);
497*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data + i + 7);
498*600f14f4SXin Li
499*600f14f4SXin Li MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_0, 3)
500*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 2)
501*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 1)
502*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 0)
503*600f14f4SXin Li
504*600f14f4SXin Li vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
505*600f14f4SXin Li vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
506*600f14f4SXin Li vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));
507*600f14f4SXin Li }
508*600f14f4SXin Li }
509*600f14f4SXin Li else { /* order == 3 */
510*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0};
511*600f14f4SXin Li
512*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
513*600f14f4SXin Li {
514*600f14f4SXin Li int32x4_t summ_0, summ_1, summ_2;
515*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data + i - 3);
516*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data + i - 2);
517*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data + i - 1);
518*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data + i + 1);
519*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data + i + 2);
520*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data + i + 3);
521*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i + 5);
522*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i + 6);
523*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i + 7);
524*600f14f4SXin Li
525*600f14f4SXin Li MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_0, 2)
526*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 1)
527*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 0)
528*600f14f4SXin Li
529*600f14f4SXin Li vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
530*600f14f4SXin Li vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
531*600f14f4SXin Li vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));
532*600f14f4SXin Li }
533*600f14f4SXin Li }
534*600f14f4SXin Li }
535*600f14f4SXin Li else {
536*600f14f4SXin Li if(order == 2) {
537*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], 0, 0};
538*600f14f4SXin Li
539*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
540*600f14f4SXin Li {
541*600f14f4SXin Li int32x4_t summ_0, summ_1, summ_2;
542*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data + i - 2);
543*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data + i - 1);
544*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data + i + 2);
545*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data + i + 3);
546*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i + 6);
547*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i + 7);
548*600f14f4SXin Li
549*600f14f4SXin Li MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_0, 1)
550*600f14f4SXin Li MACC_32BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 0)
551*600f14f4SXin Li
552*600f14f4SXin Li vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
553*600f14f4SXin Li vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
554*600f14f4SXin Li vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));
555*600f14f4SXin Li }
556*600f14f4SXin Li }
557*600f14f4SXin Li else { /* order == 1 */
558*600f14f4SXin Li int32x4_t qlp_coeff_0 = vdupq_n_s32(qlp_coeff[0]);
559*600f14f4SXin Li
560*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
561*600f14f4SXin Li {
562*600f14f4SXin Li int32x4_t summ_0, summ_1, summ_2;
563*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data + i - 1);
564*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data + i + 3);
565*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i + 7);
566*600f14f4SXin Li
567*600f14f4SXin Li summ_0 = vmulq_s32(tmp_vec[0], qlp_coeff_0);
568*600f14f4SXin Li summ_1 = vmulq_s32(tmp_vec[4], qlp_coeff_0);
569*600f14f4SXin Li summ_2 = vmulq_s32(tmp_vec[8], qlp_coeff_0);
570*600f14f4SXin Li
571*600f14f4SXin Li vst1q_s32(residual+i + 0, vsubq_s32(vld1q_s32(data+i + 0) , vshlq_s32(summ_0,vdupq_n_s32(-lp_quantization))));
572*600f14f4SXin Li vst1q_s32(residual+i + 4, vsubq_s32(vld1q_s32(data+i + 4) , vshlq_s32(summ_1,vdupq_n_s32(-lp_quantization))));
573*600f14f4SXin Li vst1q_s32(residual+i + 8, vsubq_s32(vld1q_s32(data+i + 8) , vshlq_s32(summ_2,vdupq_n_s32(-lp_quantization))));
574*600f14f4SXin Li }
575*600f14f4SXin Li }
576*600f14f4SXin Li }
577*600f14f4SXin Li }
578*600f14f4SXin Li for(; i < (int)data_len; i++) {
579*600f14f4SXin Li sum = 0;
580*600f14f4SXin Li switch(order) {
581*600f14f4SXin Li case 12: sum += qlp_coeff[11] * data[i-12]; /* Falls through. */
582*600f14f4SXin Li case 11: sum += qlp_coeff[10] * data[i-11]; /* Falls through. */
583*600f14f4SXin Li case 10: sum += qlp_coeff[ 9] * data[i-10]; /* Falls through. */
584*600f14f4SXin Li case 9: sum += qlp_coeff[ 8] * data[i- 9]; /* Falls through. */
585*600f14f4SXin Li case 8: sum += qlp_coeff[ 7] * data[i- 8]; /* Falls through. */
586*600f14f4SXin Li case 7: sum += qlp_coeff[ 6] * data[i- 7]; /* Falls through. */
587*600f14f4SXin Li case 6: sum += qlp_coeff[ 5] * data[i- 6]; /* Falls through. */
588*600f14f4SXin Li case 5: sum += qlp_coeff[ 4] * data[i- 5]; /* Falls through. */
589*600f14f4SXin Li case 4: sum += qlp_coeff[ 3] * data[i- 4]; /* Falls through. */
590*600f14f4SXin Li case 3: sum += qlp_coeff[ 2] * data[i- 3]; /* Falls through. */
591*600f14f4SXin Li case 2: sum += qlp_coeff[ 1] * data[i- 2]; /* Falls through. */
592*600f14f4SXin Li case 1: sum += qlp_coeff[ 0] * data[i- 1];
593*600f14f4SXin Li }
594*600f14f4SXin Li residual[i] = data[i] - (sum >> lp_quantization);
595*600f14f4SXin Li }
596*600f14f4SXin Li }
597*600f14f4SXin Li else { /* order > 12 */
598*600f14f4SXin Li for(i = 0; i < (int)data_len; i++) {
599*600f14f4SXin Li sum = 0;
600*600f14f4SXin Li switch(order) {
601*600f14f4SXin Li case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
602*600f14f4SXin Li case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
603*600f14f4SXin Li case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
604*600f14f4SXin Li case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
605*600f14f4SXin Li case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
606*600f14f4SXin Li case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
607*600f14f4SXin Li case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
608*600f14f4SXin Li case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
609*600f14f4SXin Li case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
610*600f14f4SXin Li case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
611*600f14f4SXin Li case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
612*600f14f4SXin Li case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
613*600f14f4SXin Li case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
614*600f14f4SXin Li case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
615*600f14f4SXin Li case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
616*600f14f4SXin Li case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
617*600f14f4SXin Li case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
618*600f14f4SXin Li case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
619*600f14f4SXin Li case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
620*600f14f4SXin Li case 13: sum += qlp_coeff[12] * data[i-13];
621*600f14f4SXin Li sum += qlp_coeff[11] * data[i-12];
622*600f14f4SXin Li sum += qlp_coeff[10] * data[i-11];
623*600f14f4SXin Li sum += qlp_coeff[ 9] * data[i-10];
624*600f14f4SXin Li sum += qlp_coeff[ 8] * data[i- 9];
625*600f14f4SXin Li sum += qlp_coeff[ 7] * data[i- 8];
626*600f14f4SXin Li sum += qlp_coeff[ 6] * data[i- 7];
627*600f14f4SXin Li sum += qlp_coeff[ 5] * data[i- 6];
628*600f14f4SXin Li sum += qlp_coeff[ 4] * data[i- 5];
629*600f14f4SXin Li sum += qlp_coeff[ 3] * data[i- 4];
630*600f14f4SXin Li sum += qlp_coeff[ 2] * data[i- 3];
631*600f14f4SXin Li sum += qlp_coeff[ 1] * data[i- 2];
632*600f14f4SXin Li sum += qlp_coeff[ 0] * data[i- 1];
633*600f14f4SXin Li }
634*600f14f4SXin Li residual[i] = data[i] - (sum >> lp_quantization);
635*600f14f4SXin Li }
636*600f14f4SXin Li }
637*600f14f4SXin Li }
638*600f14f4SXin Li
639*600f14f4SXin Li
640*600f14f4SXin Li
641*600f14f4SXin Li #define MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_vec, lane) \
642*600f14f4SXin Li summ_l_0 = vmull_laneq_s32(vget_low_s32(tmp_vec[0]),qlp_coeff_vec, lane); \
643*600f14f4SXin Li summ_h_0 = vmull_high_laneq_s32(tmp_vec[0], qlp_coeff_vec, lane);\
644*600f14f4SXin Li summ_l_1 = vmull_laneq_s32(vget_low_s32(tmp_vec[4]),qlp_coeff_vec, lane); \
645*600f14f4SXin Li summ_h_1 = vmull_high_laneq_s32(tmp_vec[4], qlp_coeff_vec, lane);\
646*600f14f4SXin Li summ_l_2 = vmull_laneq_s32(vget_low_s32(tmp_vec[8]),qlp_coeff_vec, lane);\
647*600f14f4SXin Li summ_h_2 = vmull_high_laneq_s32(tmp_vec[8], qlp_coeff_vec, lane);
648*600f14f4SXin Li
649*600f14f4SXin Li
650*600f14f4SXin Li #define MACC_64_BIT_LOOP_UNROOL_3(tmp_vec_ind, qlp_coeff_vec, lane) \
651*600f14f4SXin Li summ_l_0 = vmlal_laneq_s32(summ_l_0,vget_low_s32(tmp_vec[tmp_vec_ind]),qlp_coeff_vec, lane); \
652*600f14f4SXin Li summ_h_0 = vmlal_high_laneq_s32(summ_h_0, tmp_vec[tmp_vec_ind], qlp_coeff_vec, lane); \
653*600f14f4SXin Li summ_l_1 = vmlal_laneq_s32(summ_l_1, vget_low_s32(tmp_vec[tmp_vec_ind+4]),qlp_coeff_vec, lane); \
654*600f14f4SXin Li summ_h_1 = vmlal_high_laneq_s32(summ_h_1, tmp_vec[tmp_vec_ind+4], qlp_coeff_vec, lane); \
655*600f14f4SXin Li summ_l_2 = vmlal_laneq_s32(summ_l_2, vget_low_s32(tmp_vec[tmp_vec_ind+8]),qlp_coeff_vec, lane);\
656*600f14f4SXin Li summ_h_2 = vmlal_high_laneq_s32(summ_h_2,tmp_vec[tmp_vec_ind+8], qlp_coeff_vec, lane);
657*600f14f4SXin Li
658*600f14f4SXin Li #define SHIFT_SUMS_64BITS_AND_STORE_SUB() \
659*600f14f4SXin Li res0 = vuzp1q_s32(vreinterpretq_s32_s64(vshlq_s64(summ_l_0,lp_quantization_vec)), vreinterpretq_s32_s64(vshlq_s64(summ_h_0,lp_quantization_vec))); \
660*600f14f4SXin Li res1 = vuzp1q_s32(vreinterpretq_s32_s64(vshlq_s64(summ_l_1,lp_quantization_vec)), vreinterpretq_s32_s64(vshlq_s64(summ_h_1,lp_quantization_vec))); \
661*600f14f4SXin Li res2 = vuzp1q_s32(vreinterpretq_s32_s64(vshlq_s64(summ_l_2,lp_quantization_vec)), vreinterpretq_s32_s64(vshlq_s64(summ_h_2,lp_quantization_vec))); \
662*600f14f4SXin Li vst1q_s32(residual+i+0, vsubq_s32(vld1q_s32(data+i+0), res0));\
663*600f14f4SXin Li vst1q_s32(residual+i+4, vsubq_s32(vld1q_s32(data+i+4), res1));\
664*600f14f4SXin Li vst1q_s32(residual+i+8, vsubq_s32(vld1q_s32(data+i+8), res2));
665*600f14f4SXin Li
FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLAC__int32 * data,uint32_t data_len,const FLAC__int32 qlp_coeff[],uint32_t order,int lp_quantization,FLAC__int32 residual[])666*600f14f4SXin Li void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]) {
667*600f14f4SXin Li int i;
668*600f14f4SXin Li FLAC__int64 sum;
669*600f14f4SXin Li
670*600f14f4SXin Li int32x4_t tmp_vec[20];
671*600f14f4SXin Li int32x4_t res0, res1, res2;
672*600f14f4SXin Li int64x2_t lp_quantization_vec = vdupq_n_s64(-lp_quantization);
673*600f14f4SXin Li
674*600f14f4SXin Li FLAC__ASSERT(order > 0);
675*600f14f4SXin Li FLAC__ASSERT(order <= 32);
676*600f14f4SXin Li
677*600f14f4SXin Li // Using prologue reads is valid as encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit(signal+order,....)
678*600f14f4SXin Li if(order <= 12) {
679*600f14f4SXin Li if(order > 8) {
680*600f14f4SXin Li if(order > 10) {
681*600f14f4SXin Li if(order == 12) {
682*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
683*600f14f4SXin Li int32x4_t qlp_coeff_1 = {qlp_coeff[4],qlp_coeff[5],qlp_coeff[6],qlp_coeff[7]};
684*600f14f4SXin Li int32x4_t qlp_coeff_2 = {qlp_coeff[8],qlp_coeff[9],qlp_coeff[10],qlp_coeff[11]};
685*600f14f4SXin Li
686*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data - 12);
687*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data - 11);
688*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data - 10);
689*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data - 9);
690*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data - 8);
691*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data - 7);
692*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data - 6);
693*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data - 5);
694*600f14f4SXin Li
695*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
696*600f14f4SXin Li {
697*600f14f4SXin Li int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
698*600f14f4SXin Li
699*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data+i-4);
700*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data+i-3);
701*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data+i-2);
702*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data+i-1);
703*600f14f4SXin Li tmp_vec[12] = vld1q_s32(data+i);
704*600f14f4SXin Li tmp_vec[13] = vld1q_s32(data+i+1);
705*600f14f4SXin Li tmp_vec[14] = vld1q_s32(data+i+2);
706*600f14f4SXin Li tmp_vec[15] = vld1q_s32(data+i+3);
707*600f14f4SXin Li tmp_vec[16] = vld1q_s32(data + i + 4);
708*600f14f4SXin Li tmp_vec[17] = vld1q_s32(data + i + 5);
709*600f14f4SXin Li tmp_vec[18] = vld1q_s32(data + i + 6);
710*600f14f4SXin Li tmp_vec[19] = vld1q_s32(data + i + 7);
711*600f14f4SXin Li
712*600f14f4SXin Li MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_2, 3)
713*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 2)
714*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_2, 1)
715*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_2, 0)
716*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 3)
717*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 2)
718*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_1, 1)
719*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(7, qlp_coeff_1, 0)
720*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 3)
721*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 2)
722*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(10,qlp_coeff_0, 1)
723*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(11,qlp_coeff_0, 0)
724*600f14f4SXin Li
725*600f14f4SXin Li SHIFT_SUMS_64BITS_AND_STORE_SUB()
726*600f14f4SXin Li
727*600f14f4SXin Li tmp_vec[0] = tmp_vec[12];
728*600f14f4SXin Li tmp_vec[1] = tmp_vec[13];
729*600f14f4SXin Li tmp_vec[2] = tmp_vec[14];
730*600f14f4SXin Li tmp_vec[3] = tmp_vec[15];
731*600f14f4SXin Li tmp_vec[4] = tmp_vec[16];
732*600f14f4SXin Li tmp_vec[5] = tmp_vec[17];
733*600f14f4SXin Li tmp_vec[6] = tmp_vec[18];
734*600f14f4SXin Li tmp_vec[7] = tmp_vec[19];
735*600f14f4SXin Li }
736*600f14f4SXin Li }
737*600f14f4SXin Li else { /* order == 11 */
738*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
739*600f14f4SXin Li int32x4_t qlp_coeff_1 = {qlp_coeff[4],qlp_coeff[5],qlp_coeff[6],qlp_coeff[7]};
740*600f14f4SXin Li int32x4_t qlp_coeff_2 = {qlp_coeff[8],qlp_coeff[9],qlp_coeff[10],0};
741*600f14f4SXin Li
742*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data - 11);
743*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data - 10);
744*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data - 9);
745*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data - 8);
746*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data - 7);
747*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data - 6);
748*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data - 5);
749*600f14f4SXin Li
750*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
751*600f14f4SXin Li {
752*600f14f4SXin Li int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
753*600f14f4SXin Li
754*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data+i-4);
755*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data+i-3);
756*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data+i-2);
757*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data+i-1);
758*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data+i);
759*600f14f4SXin Li tmp_vec[12] = vld1q_s32(data+i+1);
760*600f14f4SXin Li tmp_vec[13] = vld1q_s32(data+i+2);
761*600f14f4SXin Li tmp_vec[14] = vld1q_s32(data+i+3);
762*600f14f4SXin Li tmp_vec[15] = vld1q_s32(data + i + 4);
763*600f14f4SXin Li tmp_vec[16] = vld1q_s32(data + i + 5);
764*600f14f4SXin Li tmp_vec[17] = vld1q_s32(data + i + 6);
765*600f14f4SXin Li tmp_vec[18] = vld1q_s32(data + i + 7);
766*600f14f4SXin Li
767*600f14f4SXin Li MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_2, 2)
768*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 1)
769*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_2, 0)
770*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 3)
771*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 2)
772*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 1)
773*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_1, 0)
774*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 3)
775*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 2)
776*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 1)
777*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(10,qlp_coeff_0, 0)
778*600f14f4SXin Li
779*600f14f4SXin Li SHIFT_SUMS_64BITS_AND_STORE_SUB()
780*600f14f4SXin Li
781*600f14f4SXin Li tmp_vec[0] = tmp_vec[12];
782*600f14f4SXin Li tmp_vec[1] = tmp_vec[13];
783*600f14f4SXin Li tmp_vec[2] = tmp_vec[14];
784*600f14f4SXin Li tmp_vec[3] = tmp_vec[15];
785*600f14f4SXin Li tmp_vec[4] = tmp_vec[16];
786*600f14f4SXin Li tmp_vec[5] = tmp_vec[17];
787*600f14f4SXin Li tmp_vec[6] = tmp_vec[18];
788*600f14f4SXin Li }
789*600f14f4SXin Li }
790*600f14f4SXin Li }
791*600f14f4SXin Li else
792*600f14f4SXin Li {
793*600f14f4SXin Li if (order == 10) {
794*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
795*600f14f4SXin Li int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
796*600f14f4SXin Li int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], 0, 0};
797*600f14f4SXin Li
798*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data - 10);
799*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data - 9);
800*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data - 8);
801*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data - 7);
802*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data - 6);
803*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data - 5);
804*600f14f4SXin Li
805*600f14f4SXin Li
806*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
807*600f14f4SXin Li {
808*600f14f4SXin Li int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
809*600f14f4SXin Li
810*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data + i - 4);
811*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data + i - 3);
812*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i - 2);
813*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i - 1);
814*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i - 0);
815*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data + i + 1);
816*600f14f4SXin Li tmp_vec[12] = vld1q_s32(data + i + 2);
817*600f14f4SXin Li tmp_vec[13] = vld1q_s32(data + i + 3);
818*600f14f4SXin Li tmp_vec[14] = vld1q_s32(data + i + 4);
819*600f14f4SXin Li tmp_vec[15] = vld1q_s32(data + i + 5);
820*600f14f4SXin Li tmp_vec[16] = vld1q_s32(data + i + 6);
821*600f14f4SXin Li tmp_vec[17] = vld1q_s32(data + i + 7);
822*600f14f4SXin Li
823*600f14f4SXin Li MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_2, 1)
824*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_2, 0)
825*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 3)
826*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 2)
827*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 1)
828*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_1, 0)
829*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 3)
830*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 2)
831*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 1)
832*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(9, qlp_coeff_0, 0)
833*600f14f4SXin Li
834*600f14f4SXin Li SHIFT_SUMS_64BITS_AND_STORE_SUB()
835*600f14f4SXin Li
836*600f14f4SXin Li tmp_vec[0] = tmp_vec[12];
837*600f14f4SXin Li tmp_vec[1] = tmp_vec[13];
838*600f14f4SXin Li tmp_vec[2] = tmp_vec[14];
839*600f14f4SXin Li tmp_vec[3] = tmp_vec[15];
840*600f14f4SXin Li tmp_vec[4] = tmp_vec[16];
841*600f14f4SXin Li tmp_vec[5] = tmp_vec[17];
842*600f14f4SXin Li }
843*600f14f4SXin Li }
844*600f14f4SXin Li
845*600f14f4SXin Li else /* order == 9 */ {
846*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
847*600f14f4SXin Li int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
848*600f14f4SXin Li int32x4_t qlp_coeff_2 = {qlp_coeff[8], 0, 0, 0};
849*600f14f4SXin Li
850*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data - 9);
851*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data - 8);
852*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data - 7);
853*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data - 6);
854*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data - 5);
855*600f14f4SXin Li
856*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
857*600f14f4SXin Li {
858*600f14f4SXin Li int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
859*600f14f4SXin Li
860*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data + i - 4);
861*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data + i - 3);
862*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data + i - 2);
863*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i - 1);
864*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i - 0);
865*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i + 1);
866*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data + i + 2);
867*600f14f4SXin Li tmp_vec[12] = vld1q_s32(data + i + 3);
868*600f14f4SXin Li tmp_vec[13] = vld1q_s32(data + i + 4);
869*600f14f4SXin Li tmp_vec[14] = vld1q_s32(data + i + 5);
870*600f14f4SXin Li tmp_vec[15] = vld1q_s32(data + i + 6);
871*600f14f4SXin Li tmp_vec[16] = vld1q_s32(data + i + 7);
872*600f14f4SXin Li
873*600f14f4SXin Li MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_2, 0)
874*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 3)
875*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 2)
876*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 1)
877*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_1, 0)
878*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 3)
879*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 2)
880*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 1)
881*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(8, qlp_coeff_0, 0)
882*600f14f4SXin Li
883*600f14f4SXin Li SHIFT_SUMS_64BITS_AND_STORE_SUB()
884*600f14f4SXin Li
885*600f14f4SXin Li tmp_vec[0] = tmp_vec[12];
886*600f14f4SXin Li tmp_vec[1] = tmp_vec[13];
887*600f14f4SXin Li tmp_vec[2] = tmp_vec[14];
888*600f14f4SXin Li tmp_vec[3] = tmp_vec[15];
889*600f14f4SXin Li tmp_vec[4] = tmp_vec[16];
890*600f14f4SXin Li }
891*600f14f4SXin Li }
892*600f14f4SXin Li }
893*600f14f4SXin Li }
894*600f14f4SXin Li else if (order > 4)
895*600f14f4SXin Li {
896*600f14f4SXin Li if (order > 6)
897*600f14f4SXin Li {
898*600f14f4SXin Li if (order == 8)
899*600f14f4SXin Li {
900*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
901*600f14f4SXin Li int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
902*600f14f4SXin Li
903*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data - 8);
904*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data - 7);
905*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data - 6);
906*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data - 5);
907*600f14f4SXin Li
908*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
909*600f14f4SXin Li {
910*600f14f4SXin Li int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
911*600f14f4SXin Li
912*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data + i - 4);
913*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data + i - 3);
914*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data + i - 2);
915*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data + i - 1);
916*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i - 0);
917*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i + 1);
918*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i + 2);
919*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data + i + 3);
920*600f14f4SXin Li tmp_vec[12] = vld1q_s32(data + i + 4);
921*600f14f4SXin Li tmp_vec[13] = vld1q_s32(data + i + 5);
922*600f14f4SXin Li tmp_vec[14] = vld1q_s32(data + i + 6);
923*600f14f4SXin Li tmp_vec[15] = vld1q_s32(data + i + 7);
924*600f14f4SXin Li
925*600f14f4SXin Li
926*600f14f4SXin Li MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_1, 3)
927*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 2)
928*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 1)
929*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_1, 0)
930*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 3)
931*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 2)
932*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 1)
933*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(7, qlp_coeff_0, 0)
934*600f14f4SXin Li
935*600f14f4SXin Li SHIFT_SUMS_64BITS_AND_STORE_SUB()
936*600f14f4SXin Li
937*600f14f4SXin Li tmp_vec[0] = tmp_vec[12];
938*600f14f4SXin Li tmp_vec[1] = tmp_vec[13];
939*600f14f4SXin Li tmp_vec[2] = tmp_vec[14];
940*600f14f4SXin Li tmp_vec[3] = tmp_vec[15];
941*600f14f4SXin Li }
942*600f14f4SXin Li }
943*600f14f4SXin Li else /* order == 7 */
944*600f14f4SXin Li {
945*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
946*600f14f4SXin Li int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0};
947*600f14f4SXin Li
948*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data - 7);
949*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data - 6);
950*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data - 5);
951*600f14f4SXin Li
952*600f14f4SXin Li
953*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
954*600f14f4SXin Li {
955*600f14f4SXin Li int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
956*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data +i - 4);
957*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data + i - 3);
958*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data + i - 2);
959*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data + i - 1);
960*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data + i - 0);
961*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i + 1);
962*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i + 2);
963*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i + 3);
964*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data + i + 4);
965*600f14f4SXin Li tmp_vec[12] = vld1q_s32(data + i + 5);
966*600f14f4SXin Li tmp_vec[13] = vld1q_s32(data + i + 6);
967*600f14f4SXin Li tmp_vec[14] = vld1q_s32(data + i + 7);
968*600f14f4SXin Li
969*600f14f4SXin Li
970*600f14f4SXin Li MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_1, 2)
971*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 1)
972*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_1, 0)
973*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 3)
974*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 2)
975*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 1)
976*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(6, qlp_coeff_0, 0)
977*600f14f4SXin Li
978*600f14f4SXin Li SHIFT_SUMS_64BITS_AND_STORE_SUB()
979*600f14f4SXin Li
980*600f14f4SXin Li tmp_vec[0] = tmp_vec[12];
981*600f14f4SXin Li tmp_vec[1] = tmp_vec[13];
982*600f14f4SXin Li tmp_vec[2] = tmp_vec[14];
983*600f14f4SXin Li }
984*600f14f4SXin Li }
985*600f14f4SXin Li }
986*600f14f4SXin Li else
987*600f14f4SXin Li {
988*600f14f4SXin Li if (order == 6) {
989*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
990*600f14f4SXin Li int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], 0, 0};
991*600f14f4SXin Li
992*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data - 6);
993*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data - 5);
994*600f14f4SXin Li
995*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
996*600f14f4SXin Li {
997*600f14f4SXin Li int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
998*600f14f4SXin Li
999*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data + i - 4);
1000*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data + i - 3);
1001*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data + i - 2);
1002*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data + i - 1);
1003*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data + i - 0);
1004*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data + i + 1);
1005*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i + 2);
1006*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i + 3);
1007*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i + 4);
1008*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data + i + 5);
1009*600f14f4SXin Li tmp_vec[12] = vld1q_s32(data + i + 6);
1010*600f14f4SXin Li tmp_vec[13] = vld1q_s32(data + i + 7);
1011*600f14f4SXin Li
1012*600f14f4SXin Li
1013*600f14f4SXin Li MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_1, 1)
1014*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_1, 0)
1015*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 3)
1016*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 2)
1017*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 1)
1018*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(5, qlp_coeff_0, 0)
1019*600f14f4SXin Li
1020*600f14f4SXin Li SHIFT_SUMS_64BITS_AND_STORE_SUB()
1021*600f14f4SXin Li
1022*600f14f4SXin Li tmp_vec[0] = tmp_vec[12];
1023*600f14f4SXin Li tmp_vec[1] = tmp_vec[13];
1024*600f14f4SXin Li }
1025*600f14f4SXin Li }
1026*600f14f4SXin Li
1027*600f14f4SXin Li else
1028*600f14f4SXin Li { /* order == 5 */
1029*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
1030*600f14f4SXin Li int32x4_t qlp_coeff_1 = {qlp_coeff[4], 0, 0, 0};
1031*600f14f4SXin Li
1032*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data - 5);
1033*600f14f4SXin Li
1034*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
1035*600f14f4SXin Li {
1036*600f14f4SXin Li int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
1037*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data + i - 4);
1038*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data + i - 3);
1039*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data + i - 2);
1040*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data + i - 1);
1041*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data + i - 0);
1042*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data + i + 1);
1043*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data + i + 2);
1044*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i + 3);
1045*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i + 4);
1046*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i + 5);
1047*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data + i + 6);
1048*600f14f4SXin Li tmp_vec[12] = vld1q_s32(data + i + 7);
1049*600f14f4SXin Li
1050*600f14f4SXin Li MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_1, 0)
1051*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 3)
1052*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 2)
1053*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 1)
1054*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(4, qlp_coeff_0, 0)
1055*600f14f4SXin Li
1056*600f14f4SXin Li SHIFT_SUMS_64BITS_AND_STORE_SUB()
1057*600f14f4SXin Li
1058*600f14f4SXin Li tmp_vec[0] = tmp_vec[12];
1059*600f14f4SXin Li }
1060*600f14f4SXin Li }
1061*600f14f4SXin Li }
1062*600f14f4SXin Li }
1063*600f14f4SXin Li else
1064*600f14f4SXin Li {
1065*600f14f4SXin Li if (order > 2)
1066*600f14f4SXin Li {
1067*600f14f4SXin Li if (order == 4)
1068*600f14f4SXin Li {
1069*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
1070*600f14f4SXin Li
1071*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
1072*600f14f4SXin Li {
1073*600f14f4SXin Li int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
1074*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data + i - 4);
1075*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data + i - 3);
1076*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data + i - 2);
1077*600f14f4SXin Li tmp_vec[3] = vld1q_s32(data + i - 1);
1078*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data + i - 0);
1079*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data + i + 1);
1080*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data + i + 2);
1081*600f14f4SXin Li tmp_vec[7] = vld1q_s32(data + i + 3);
1082*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i + 4);
1083*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i + 5);
1084*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i + 6);
1085*600f14f4SXin Li tmp_vec[11] = vld1q_s32(data + i + 7);
1086*600f14f4SXin Li
1087*600f14f4SXin Li MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_0, 3)
1088*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 2)
1089*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 1)
1090*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(3, qlp_coeff_0, 0)
1091*600f14f4SXin Li
1092*600f14f4SXin Li SHIFT_SUMS_64BITS_AND_STORE_SUB()
1093*600f14f4SXin Li }
1094*600f14f4SXin Li }
1095*600f14f4SXin Li else
1096*600f14f4SXin Li { /* order == 3 */
1097*600f14f4SXin Li
1098*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0};
1099*600f14f4SXin Li
1100*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
1101*600f14f4SXin Li {
1102*600f14f4SXin Li int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
1103*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data + i - 3);
1104*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data + i - 2);
1105*600f14f4SXin Li tmp_vec[2] = vld1q_s32(data + i - 1);
1106*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data + i + 1);
1107*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data + i + 2);
1108*600f14f4SXin Li tmp_vec[6] = vld1q_s32(data + i + 3);
1109*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i + 5);
1110*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i + 6);
1111*600f14f4SXin Li tmp_vec[10] = vld1q_s32(data + i + 7);
1112*600f14f4SXin Li
1113*600f14f4SXin Li MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_0, 2)
1114*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 1)
1115*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(2, qlp_coeff_0, 0)
1116*600f14f4SXin Li
1117*600f14f4SXin Li SHIFT_SUMS_64BITS_AND_STORE_SUB()
1118*600f14f4SXin Li }
1119*600f14f4SXin Li }
1120*600f14f4SXin Li }
1121*600f14f4SXin Li else
1122*600f14f4SXin Li {
1123*600f14f4SXin Li if (order == 2)
1124*600f14f4SXin Li {
1125*600f14f4SXin Li int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], 0, 0};
1126*600f14f4SXin Li
1127*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
1128*600f14f4SXin Li {
1129*600f14f4SXin Li int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
1130*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data + i - 2);
1131*600f14f4SXin Li tmp_vec[1] = vld1q_s32(data + i - 1);
1132*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data + i + 2);
1133*600f14f4SXin Li tmp_vec[5] = vld1q_s32(data + i + 3);
1134*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i + 6);
1135*600f14f4SXin Li tmp_vec[9] = vld1q_s32(data + i + 7);
1136*600f14f4SXin Li
1137*600f14f4SXin Li MUL_64_BIT_LOOP_UNROOL_3(qlp_coeff_0, 1)
1138*600f14f4SXin Li MACC_64_BIT_LOOP_UNROOL_3(1, qlp_coeff_0, 0)
1139*600f14f4SXin Li
1140*600f14f4SXin Li SHIFT_SUMS_64BITS_AND_STORE_SUB()
1141*600f14f4SXin Li }
1142*600f14f4SXin Li }
1143*600f14f4SXin Li
1144*600f14f4SXin Li else
1145*600f14f4SXin Li { /* order == 1 */
1146*600f14f4SXin Li
1147*600f14f4SXin Li int32x2_t qlp_coeff_0_2 = vdup_n_s32(qlp_coeff[0]);
1148*600f14f4SXin Li int32x4_t qlp_coeff_0_4 = vdupq_n_s32(qlp_coeff[0]);
1149*600f14f4SXin Li
1150*600f14f4SXin Li for (i = 0; i < (int)data_len - 11; i += 12)
1151*600f14f4SXin Li {
1152*600f14f4SXin Li int64x2_t summ_l_0, summ_h_0, summ_l_1, summ_h_1, summ_l_2, summ_h_2;
1153*600f14f4SXin Li tmp_vec[0] = vld1q_s32(data + i - 1);
1154*600f14f4SXin Li tmp_vec[4] = vld1q_s32(data + i + 3);
1155*600f14f4SXin Li tmp_vec[8] = vld1q_s32(data + i + 7);
1156*600f14f4SXin Li
1157*600f14f4SXin Li summ_l_0 = vmull_s32(vget_low_s32(tmp_vec[0]), qlp_coeff_0_2);
1158*600f14f4SXin Li summ_h_0 = vmull_high_s32(tmp_vec[0], qlp_coeff_0_4);
1159*600f14f4SXin Li
1160*600f14f4SXin Li summ_l_1 = vmull_s32(vget_low_s32(tmp_vec[4]), qlp_coeff_0_2);
1161*600f14f4SXin Li summ_h_1 = vmull_high_s32(tmp_vec[4], qlp_coeff_0_4);
1162*600f14f4SXin Li
1163*600f14f4SXin Li summ_l_2 = vmull_s32(vget_low_s32(tmp_vec[8]), qlp_coeff_0_2);
1164*600f14f4SXin Li summ_h_2 = vmull_high_s32(tmp_vec[8], qlp_coeff_0_4);
1165*600f14f4SXin Li
1166*600f14f4SXin Li SHIFT_SUMS_64BITS_AND_STORE_SUB()
1167*600f14f4SXin Li }
1168*600f14f4SXin Li }
1169*600f14f4SXin Li }
1170*600f14f4SXin Li }
1171*600f14f4SXin Li for (; i < (int)data_len; i++)
1172*600f14f4SXin Li {
1173*600f14f4SXin Li sum = 0;
1174*600f14f4SXin Li switch (order)
1175*600f14f4SXin Li {
1176*600f14f4SXin Li case 12:
1177*600f14f4SXin Li sum += qlp_coeff[11] * (FLAC__int64)data[i - 12]; /* Falls through. */
1178*600f14f4SXin Li case 11:
1179*600f14f4SXin Li sum += qlp_coeff[10] * (FLAC__int64)data[i - 11]; /* Falls through. */
1180*600f14f4SXin Li case 10:
1181*600f14f4SXin Li sum += qlp_coeff[9] * (FLAC__int64)data[i - 10]; /* Falls through. */
1182*600f14f4SXin Li case 9:
1183*600f14f4SXin Li sum += qlp_coeff[8] * (FLAC__int64)data[i - 9]; /* Falls through. */
1184*600f14f4SXin Li case 8:
1185*600f14f4SXin Li sum += qlp_coeff[7] * (FLAC__int64)data[i - 8]; /* Falls through. */
1186*600f14f4SXin Li case 7:
1187*600f14f4SXin Li sum += qlp_coeff[6] * (FLAC__int64)data[i - 7]; /* Falls through. */
1188*600f14f4SXin Li case 6:
1189*600f14f4SXin Li sum += qlp_coeff[5] * (FLAC__int64)data[i - 6]; /* Falls through. */
1190*600f14f4SXin Li case 5:
1191*600f14f4SXin Li sum += qlp_coeff[4] * (FLAC__int64)data[i - 5]; /* Falls through. */
1192*600f14f4SXin Li case 4:
1193*600f14f4SXin Li sum += qlp_coeff[3] * (FLAC__int64)data[i - 4]; /* Falls through. */
1194*600f14f4SXin Li case 3:
1195*600f14f4SXin Li sum += qlp_coeff[2] * (FLAC__int64)data[i - 3]; /* Falls through. */
1196*600f14f4SXin Li case 2:
1197*600f14f4SXin Li sum += qlp_coeff[1] * (FLAC__int64)data[i - 2]; /* Falls through. */
1198*600f14f4SXin Li case 1:
1199*600f14f4SXin Li sum += qlp_coeff[0] * (FLAC__int64)data[i - 1];
1200*600f14f4SXin Li }
1201*600f14f4SXin Li residual[i] = data[i] - (sum >> lp_quantization);
1202*600f14f4SXin Li }
1203*600f14f4SXin Li }
1204*600f14f4SXin Li else
1205*600f14f4SXin Li { /* order > 12 */
1206*600f14f4SXin Li for (i = 0; i < (int)data_len; i++)
1207*600f14f4SXin Li {
1208*600f14f4SXin Li sum = 0;
1209*600f14f4SXin Li switch (order)
1210*600f14f4SXin Li {
1211*600f14f4SXin Li case 32:
1212*600f14f4SXin Li sum += qlp_coeff[31] * (FLAC__int64)data[i - 32]; /* Falls through. */
1213*600f14f4SXin Li case 31:
1214*600f14f4SXin Li sum += qlp_coeff[30] * (FLAC__int64)data[i - 31]; /* Falls through. */
1215*600f14f4SXin Li case 30:
1216*600f14f4SXin Li sum += qlp_coeff[29] * (FLAC__int64)data[i - 30]; /* Falls through. */
1217*600f14f4SXin Li case 29:
1218*600f14f4SXin Li sum += qlp_coeff[28] * (FLAC__int64)data[i - 29]; /* Falls through. */
1219*600f14f4SXin Li case 28:
1220*600f14f4SXin Li sum += qlp_coeff[27] * (FLAC__int64)data[i - 28]; /* Falls through. */
1221*600f14f4SXin Li case 27:
1222*600f14f4SXin Li sum += qlp_coeff[26] * (FLAC__int64)data[i - 27]; /* Falls through. */
1223*600f14f4SXin Li case 26:
1224*600f14f4SXin Li sum += qlp_coeff[25] * (FLAC__int64)data[i - 26]; /* Falls through. */
1225*600f14f4SXin Li case 25:
1226*600f14f4SXin Li sum += qlp_coeff[24] * (FLAC__int64)data[i - 25]; /* Falls through. */
1227*600f14f4SXin Li case 24:
1228*600f14f4SXin Li sum += qlp_coeff[23] * (FLAC__int64)data[i - 24]; /* Falls through. */
1229*600f14f4SXin Li case 23:
1230*600f14f4SXin Li sum += qlp_coeff[22] * (FLAC__int64)data[i - 23]; /* Falls through. */
1231*600f14f4SXin Li case 22:
1232*600f14f4SXin Li sum += qlp_coeff[21] * (FLAC__int64)data[i - 22]; /* Falls through. */
1233*600f14f4SXin Li case 21:
1234*600f14f4SXin Li sum += qlp_coeff[20] * (FLAC__int64)data[i - 21]; /* Falls through. */
1235*600f14f4SXin Li case 20:
1236*600f14f4SXin Li sum += qlp_coeff[19] * (FLAC__int64)data[i - 20]; /* Falls through. */
1237*600f14f4SXin Li case 19:
1238*600f14f4SXin Li sum += qlp_coeff[18] * (FLAC__int64)data[i - 19]; /* Falls through. */
1239*600f14f4SXin Li case 18:
1240*600f14f4SXin Li sum += qlp_coeff[17] * (FLAC__int64)data[i - 18]; /* Falls through. */
1241*600f14f4SXin Li case 17:
1242*600f14f4SXin Li sum += qlp_coeff[16] * (FLAC__int64)data[i - 17]; /* Falls through. */
1243*600f14f4SXin Li case 16:
1244*600f14f4SXin Li sum += qlp_coeff[15] * (FLAC__int64)data[i - 16]; /* Falls through. */
1245*600f14f4SXin Li case 15:
1246*600f14f4SXin Li sum += qlp_coeff[14] * (FLAC__int64)data[i - 15]; /* Falls through. */
1247*600f14f4SXin Li case 14:
1248*600f14f4SXin Li sum += qlp_coeff[13] * (FLAC__int64)data[i - 14]; /* Falls through. */
1249*600f14f4SXin Li case 13:
1250*600f14f4SXin Li sum += qlp_coeff[12] * (FLAC__int64)data[i - 13];
1251*600f14f4SXin Li sum += qlp_coeff[11] * (FLAC__int64)data[i - 12];
1252*600f14f4SXin Li sum += qlp_coeff[10] * (FLAC__int64)data[i - 11];
1253*600f14f4SXin Li sum += qlp_coeff[9] * (FLAC__int64)data[i - 10];
1254*600f14f4SXin Li sum += qlp_coeff[8] * (FLAC__int64)data[i - 9];
1255*600f14f4SXin Li sum += qlp_coeff[7] * (FLAC__int64)data[i - 8];
1256*600f14f4SXin Li sum += qlp_coeff[6] * (FLAC__int64)data[i - 7];
1257*600f14f4SXin Li sum += qlp_coeff[5] * (FLAC__int64)data[i - 6];
1258*600f14f4SXin Li sum += qlp_coeff[4] * (FLAC__int64)data[i - 5];
1259*600f14f4SXin Li sum += qlp_coeff[3] * (FLAC__int64)data[i - 4];
1260*600f14f4SXin Li sum += qlp_coeff[2] * (FLAC__int64)data[i - 3];
1261*600f14f4SXin Li sum += qlp_coeff[1] * (FLAC__int64)data[i - 2];
1262*600f14f4SXin Li sum += qlp_coeff[0] * (FLAC__int64)data[i - 1];
1263*600f14f4SXin Li }
1264*600f14f4SXin Li residual[i] = data[i] - (sum >> lp_quantization);
1265*600f14f4SXin Li }
1266*600f14f4SXin Li }
1267*600f14f4SXin Li
1268*600f14f4SXin Li return;
1269*600f14f4SXin Li }
1270*600f14f4SXin Li
1271*600f14f4SXin Li #endif /* FLAC__CPU_ARM64 && FLAC__HAS_ARCH64INTRIN */
1272*600f14f4SXin Li #endif /* FLAC__NO_ASM */
1273*600f14f4SXin Li #endif /* FLAC__INTEGER_ONLY_LIBRARY */
1274