/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2023  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE4_1_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <smmintrin.h> /* SSE4.1 */

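/*
 * The wide (64-bit accumulator) routine below is compiled only for 32-bit
 * x86; as the guard below notes, it is unused for the x64 build.
 * RESIDUAL64_RESULT applies the quantization shift with the run-time count
 * in `cnt`, RESIDUAL64_RESULT1 with the immediate count `lp_quantization`;
 * both take the low 32 bits of the shifted 64-bit prediction and subtract
 * them from data[i] to form the residual.
 */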
#if defined FLAC__CPU_IA32 /* unused for x64 */

#define RESIDUAL64_RESULT(xmmN)  residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srl_epi64(xmmN, cnt))
#define RESIDUAL64_RESULT1(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srli_epi64(xmmN, lp_quantization))

FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */

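	/* Orders 1..12 each get a dedicated, fully unrolled SSE4.1 path;
	   orders above 12 fall back to the scalar switch at the end. */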
65 	if(order <= 12) {
66 		if(order > 8) { /* order == 9, 10, 11, 12 */
67 			if(order > 10) { /* order == 11, 12 */
68 				if(order == 12) {
69 					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
70 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
71 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
72 					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
73 					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
74 					xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
75 					xmm5 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+10)); // 0  0  q[11] q[10]
76 
77 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
78 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
79 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
80 					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
81 					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
82 					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
83 
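					/* Each _mm_mul_epi32 yields two signed 32x32->64-bit products
					   (one coefficient/sample pair per 64-bit lane); the pairs are
					   accumulated as two 64-bit partial sums, folded into one lane
					   with _mm_srli_si128, then shifted by lp_quantization and
					   subtracted from data[i] by RESIDUAL64_RESULT1. */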
84 					for(i = 0; i < (int)data_len; i++) {
85 						//sum = 0;
86 						//sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
87 						//sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
88 						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
89 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
90 						xmm7 = _mm_mul_epi32(xmm7, xmm5);
91 
92 						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
93 						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
94 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
95 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
96 						xmm6 = _mm_mul_epi32(xmm6, xmm4);
97 						xmm7 = _mm_add_epi64(xmm7, xmm6);
98 
99 						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
100 						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
101 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
102 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
103 						xmm6 = _mm_mul_epi32(xmm6, xmm3);
104 						xmm7 = _mm_add_epi64(xmm7, xmm6);
105 
106 						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
107 						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
108 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
109 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
110 						xmm6 = _mm_mul_epi32(xmm6, xmm2);
111 						xmm7 = _mm_add_epi64(xmm7, xmm6);
112 
113 						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
114 						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
115 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
116 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
117 						xmm6 = _mm_mul_epi32(xmm6, xmm1);
118 						xmm7 = _mm_add_epi64(xmm7, xmm6);
119 
120 						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
121 						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
122 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
123 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
124 						xmm6 = _mm_mul_epi32(xmm6, xmm0);
125 						xmm7 = _mm_add_epi64(xmm7, xmm6);
126 
127 						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
128 						RESIDUAL64_RESULT1(xmm7);
129 					}
130 				}
131 				else { /* order == 11 */
132 					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
133 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
134 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
135 					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
136 					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
137 					xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));
138 					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
139 
140 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
141 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
142 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
143 					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
144 					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
145 
146 					for(i = 0; i < (int)data_len; i++) {
147 						//sum = 0;
148 						//sum  = qlp_coeff[10] * (FLAC__int64)data[i-11];
149 						xmm7 = _mm_cvtsi32_si128(data[i-11]);
150 						xmm7 = _mm_mul_epi32(xmm7, xmm5);
151 
152 						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
153 						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
154 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
155 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
156 						xmm6 = _mm_mul_epi32(xmm6, xmm4);
157 						xmm7 = _mm_add_epi64(xmm7, xmm6);
158 
159 						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
160 						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
161 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
162 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
163 						xmm6 = _mm_mul_epi32(xmm6, xmm3);
164 						xmm7 = _mm_add_epi64(xmm7, xmm6);
165 
166 						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
167 						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
168 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
169 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
170 						xmm6 = _mm_mul_epi32(xmm6, xmm2);
171 						xmm7 = _mm_add_epi64(xmm7, xmm6);
172 
173 						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
174 						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
175 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
176 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
177 						xmm6 = _mm_mul_epi32(xmm6, xmm1);
178 						xmm7 = _mm_add_epi64(xmm7, xmm6);
179 
180 						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
181 						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
182 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
183 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
184 						xmm6 = _mm_mul_epi32(xmm6, xmm0);
185 						xmm7 = _mm_add_epi64(xmm7, xmm6);
186 
187 						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
188 						RESIDUAL64_RESULT1(xmm7);
189 					}
190 				}
191 			}
192 			else { /* order == 9, 10 */
193 				if(order == 10) {
194 					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
195 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
196 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
197 					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
198 					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
199 					xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));
200 
201 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
202 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
203 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
204 					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
205 					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
206 
207 					for(i = 0; i < (int)data_len; i++) {
208 						//sum = 0;
209 						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
210 						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
211 						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
212 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
213 						xmm7 = _mm_mul_epi32(xmm7, xmm4);
214 
215 						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
216 						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
217 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
218 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
219 						xmm6 = _mm_mul_epi32(xmm6, xmm3);
220 						xmm7 = _mm_add_epi64(xmm7, xmm6);
221 
222 						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
223 						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
224 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
225 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
226 						xmm6 = _mm_mul_epi32(xmm6, xmm2);
227 						xmm7 = _mm_add_epi64(xmm7, xmm6);
228 
229 						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
230 						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
231 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
232 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
233 						xmm6 = _mm_mul_epi32(xmm6, xmm1);
234 						xmm7 = _mm_add_epi64(xmm7, xmm6);
235 
236 						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
237 						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
238 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
239 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
240 						xmm6 = _mm_mul_epi32(xmm6, xmm0);
241 						xmm7 = _mm_add_epi64(xmm7, xmm6);
242 
243 						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
244 						RESIDUAL64_RESULT(xmm7);
245 					}
246 				}
247 				else { /* order == 9 */
248 					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
249 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
250 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
251 					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
252 					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
253 					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
254 
255 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
256 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
257 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
258 					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
259 
260 					for(i = 0; i < (int)data_len; i++) {
261 						//sum = 0;
262 						//sum  = qlp_coeff[8] * (FLAC__int64)data[i-9];
263 						xmm7 = _mm_cvtsi32_si128(data[i-9]);
264 						xmm7 = _mm_mul_epi32(xmm7, xmm4);
265 
266 						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
267 						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
268 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
269 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
270 						xmm6 = _mm_mul_epi32(xmm6, xmm3);
271 						xmm7 = _mm_add_epi64(xmm7, xmm6);
272 
273 						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
274 						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
275 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
276 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
277 						xmm6 = _mm_mul_epi32(xmm6, xmm2);
278 						xmm7 = _mm_add_epi64(xmm7, xmm6);
279 
280 						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
281 						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
282 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
283 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
284 						xmm6 = _mm_mul_epi32(xmm6, xmm1);
285 						xmm7 = _mm_add_epi64(xmm7, xmm6);
286 
287 						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
288 						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
289 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
290 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
291 						xmm6 = _mm_mul_epi32(xmm6, xmm0);
292 						xmm7 = _mm_add_epi64(xmm7, xmm6);
293 
294 						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
295 						RESIDUAL64_RESULT(xmm7);
296 					}
297 				}
298 			}
299 		}
300 		else if(order > 4) { /* order == 5, 6, 7, 8 */
301 			if(order > 6) { /* order == 7, 8 */
302 				if(order == 8) {
303 					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
304 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
305 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
306 					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
307 					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
308 
309 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
310 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
311 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
312 					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
313 
314 					for(i = 0; i < (int)data_len; i++) {
315 						//sum = 0;
316 						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
317 						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
318 						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
319 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
320 						xmm7 = _mm_mul_epi32(xmm7, xmm3);
321 
322 						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
323 						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
324 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
325 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
326 						xmm6 = _mm_mul_epi32(xmm6, xmm2);
327 						xmm7 = _mm_add_epi64(xmm7, xmm6);
328 
329 						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
330 						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
331 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
332 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
333 						xmm6 = _mm_mul_epi32(xmm6, xmm1);
334 						xmm7 = _mm_add_epi64(xmm7, xmm6);
335 
336 						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
337 						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
338 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
339 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
340 						xmm6 = _mm_mul_epi32(xmm6, xmm0);
341 						xmm7 = _mm_add_epi64(xmm7, xmm6);
342 
343 						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
344 						RESIDUAL64_RESULT(xmm7);
345 					}
346 				}
347 				else { /* order == 7 */
348 					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
349 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
350 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
351 					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
352 					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
353 
354 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
355 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
356 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
357 
358 					for(i = 0; i < (int)data_len; i++) {
359 						//sum = 0;
360 						//sum  = qlp_coeff[6] * (FLAC__int64)data[i-7];
361 						xmm7 = _mm_cvtsi32_si128(data[i-7]);
362 						xmm7 = _mm_mul_epi32(xmm7, xmm3);
363 
364 						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
365 						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
366 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
367 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
368 						xmm6 = _mm_mul_epi32(xmm6, xmm2);
369 						xmm7 = _mm_add_epi64(xmm7, xmm6);
370 
371 						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
372 						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
373 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
374 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
375 						xmm6 = _mm_mul_epi32(xmm6, xmm1);
376 						xmm7 = _mm_add_epi64(xmm7, xmm6);
377 
378 						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
379 						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
380 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
381 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
382 						xmm6 = _mm_mul_epi32(xmm6, xmm0);
383 						xmm7 = _mm_add_epi64(xmm7, xmm6);
384 
385 						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
386 						RESIDUAL64_RESULT(xmm7);
387 					}
388 				}
389 			}
390 			else { /* order == 5, 6 */
391 				if(order == 6) {
392 					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
393 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
394 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
395 					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
396 
397 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
398 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
399 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
400 
401 					for(i = 0; i < (int)data_len; i++) {
402 						//sum = 0;
403 						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
404 						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
405 						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
406 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
407 						xmm7 = _mm_mul_epi32(xmm7, xmm2);
408 
409 						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
410 						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
411 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
412 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
413 						xmm6 = _mm_mul_epi32(xmm6, xmm1);
414 						xmm7 = _mm_add_epi64(xmm7, xmm6);
415 
416 						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
417 						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
418 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
419 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
420 						xmm6 = _mm_mul_epi32(xmm6, xmm0);
421 						xmm7 = _mm_add_epi64(xmm7, xmm6);
422 
423 						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
424 						RESIDUAL64_RESULT(xmm7);
425 					}
426 				}
427 				else { /* order == 5 */
428 					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
429 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
430 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
431 					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
432 
433 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
434 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
435 
436 					for(i = 0; i < (int)data_len; i++) {
437 						//sum = 0;
438 						//sum  = qlp_coeff[4] * (FLAC__int64)data[i-5];
439 						xmm7 = _mm_cvtsi32_si128(data[i-5]);
440 						xmm7 = _mm_mul_epi32(xmm7, xmm2);
441 
442 						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
443 						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
444 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
445 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
446 						xmm6 = _mm_mul_epi32(xmm6, xmm1);
447 						xmm7 = _mm_add_epi64(xmm7, xmm6);
448 
449 						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
450 						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
451 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
452 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
453 						xmm6 = _mm_mul_epi32(xmm6, xmm0);
454 						xmm7 = _mm_add_epi64(xmm7, xmm6);
455 
456 						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
457 						RESIDUAL64_RESULT(xmm7);
458 					}
459 				}
460 			}
461 		}
462 		else { /* order == 1, 2, 3, 4 */
463 			if(order > 2) { /* order == 3, 4 */
464 				if(order == 4) {
465 					__m128i xmm0, xmm1, xmm6, xmm7;
466 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
467 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
468 
469 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
470 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
471 
472 					for(i = 0; i < (int)data_len; i++) {
473 						//sum = 0;
474 						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
475 						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
476 						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
477 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
478 						xmm7 = _mm_mul_epi32(xmm7, xmm1);
479 
480 						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
481 						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
482 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
483 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
484 						xmm6 = _mm_mul_epi32(xmm6, xmm0);
485 						xmm7 = _mm_add_epi64(xmm7, xmm6);
486 
487 						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
488 						RESIDUAL64_RESULT(xmm7);
489 					}
490 				}
491 				else { /* order == 3 */
492 					__m128i xmm0, xmm1, xmm6, xmm7;
493 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
494 					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
495 
496 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
497 
498 					for(i = 0; i < (int)data_len; i++) {
499 						//sum = 0;
500 						//sum  = qlp_coeff[2] * (FLAC__int64)data[i-3];
501 						xmm7 = _mm_cvtsi32_si128(data[i-3]);
502 						xmm7 = _mm_mul_epi32(xmm7, xmm1);
503 
504 						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
505 						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
506 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
507 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
508 						xmm6 = _mm_mul_epi32(xmm6, xmm0);
509 						xmm7 = _mm_add_epi64(xmm7, xmm6);
510 
511 						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
512 						RESIDUAL64_RESULT(xmm7);
513 					}
514 				}
515 			}
516 			else { /* order == 1, 2 */
517 				if(order == 2) {
518 					__m128i xmm0, xmm7;
519 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
520 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
521 
522 					for(i = 0; i < (int)data_len; i++) {
523 						//sum = 0;
524 						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
525 						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
526 						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
527 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
528 						xmm7 = _mm_mul_epi32(xmm7, xmm0);
529 
530 						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
531 						RESIDUAL64_RESULT(xmm7);
532 					}
533 				}
534 				else { /* order == 1 */
535 					__m128i xmm0, xmm7;
536 					xmm0 = _mm_cvtsi32_si128(qlp_coeff[0]);
537 
538 					for(i = 0; i < (int)data_len; i++) {
539 						//sum = qlp_coeff[0] * (FLAC__int64)data[i-1];
540 						xmm7 = _mm_cvtsi32_si128(data[i-1]);
541 						xmm7 = _mm_mul_epi32(xmm7, xmm0);
542 						RESIDUAL64_RESULT(xmm7);
543 					}
544 				}
545 			}
546 		}
547 	}
548 	else { /* order > 12 */
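		/* Scalar fallback: the switch falls through from the highest tap in
		   use down to 13, then taps 12..1 are summed unconditionally before
		   the shift and subtraction. */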
549 		FLAC__int64 sum;
550 		for(i = 0; i < (int)data_len; i++) {
551 			sum = 0;
552 			switch(order) {
553 				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32]; /* Falls through. */
554 				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31]; /* Falls through. */
555 				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30]; /* Falls through. */
556 				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29]; /* Falls through. */
557 				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28]; /* Falls through. */
558 				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27]; /* Falls through. */
559 				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26]; /* Falls through. */
560 				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25]; /* Falls through. */
561 				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24]; /* Falls through. */
562 				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23]; /* Falls through. */
563 				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22]; /* Falls through. */
564 				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21]; /* Falls through. */
565 				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20]; /* Falls through. */
566 				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19]; /* Falls through. */
567 				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18]; /* Falls through. */
568 				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17]; /* Falls through. */
569 				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16]; /* Falls through. */
570 				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15]; /* Falls through. */
571 				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14]; /* Falls through. */
572 				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
573 				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
574 				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
575 				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
576 				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
577 				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
578 				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
579 				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
580 				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
581 				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
582 				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
583 				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
584 				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
585 			}
586 			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
587 		}
588 	}
}

#endif /* defined FLAC__CPU_IA32 */

FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

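	/* This version keeps 32-bit partial sums and computes four residuals per
	   iteration: each coefficient is broadcast to all four lanes, multiplied
	   against four consecutive history samples with _mm_mullo_epi32, the
	   products are summed, arithmetically shifted right by lp_quantization
	   (_mm_sra_epi32) and subtracted from data[i..i+3].  Samples left over
	   are handled by the scalar tail loop below. */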
603 	if(order <= 12) {
604 		if(order > 8) {
605 			if(order > 10) {
606 				if(order == 12) {
607 					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
608 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
609 					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
610 					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
611 					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
612 					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
613 					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
614 					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
615 					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
616 					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
617 					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
618 					q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
619 					q11 = _mm_cvtsi32_si128(qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));
620 
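					/* Four residuals per iteration: lane j accumulates
					   the sum over k of qlp_coeff[k] * data[i+j-k-1], using an
					   unaligned load of four consecutive history samples per tap. */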
621 					for(i = 0; i < (int)data_len-3; i+=4) {
622 						__m128i summ, mull;
623 						summ = _mm_mullo_epi32(q11, _mm_loadu_si128((const __m128i*)(const void*)(data+i-12)));
624 						mull = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
625 						mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
626 						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
627 						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
628 						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
629 						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
630 						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
631 						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
632 						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
633 						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
634 						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
635 						summ = _mm_sra_epi32(summ, cnt);
636 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
637 					}
638 				}
639 				else { /* order == 11 */
640 					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
641 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
642 					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
643 					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
644 					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
645 					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
646 					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
647 					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
648 					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
649 					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
650 					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
651 					q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
652 
653 					for(i = 0; i < (int)data_len-3; i+=4) {
654 						__m128i summ, mull;
655 						summ = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11)));
656 						mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
657 						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
658 						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
659 						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
660 						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
661 						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
662 						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
663 						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
664 						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
665 						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
666 						summ = _mm_sra_epi32(summ, cnt);
667 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
668 					}
669 				}
670 			}
671 			else {
672 				if(order == 10) {
673 					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
674 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
675 					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
676 					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
677 					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
678 					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
679 					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
680 					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
681 					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
682 					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
683 					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
684 
685 					for(i = 0; i < (int)data_len-3; i+=4) {
686 						__m128i summ, mull;
687 						summ = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10)));
688 						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
689 						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
690 						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
691 						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
692 						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
693 						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
694 						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
695 						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
696 						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
697 						summ = _mm_sra_epi32(summ, cnt);
698 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
699 					}
700 				}
701 				else { /* order == 9 */
702 					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
703 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
704 					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
705 					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
706 					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
707 					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
708 					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
709 					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
710 					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
711 					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
712 
713 					for(i = 0; i < (int)data_len-3; i+=4) {
714 						__m128i summ, mull;
715 						summ = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9)));
716 						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
717 						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
718 						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
719 						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
720 						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
721 						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
722 						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
723 						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
724 						summ = _mm_sra_epi32(summ, cnt);
725 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
726 					}
727 				}
728 			}
729 		}
730 		else if(order > 4) {
731 			if(order > 6) {
732 				if(order == 8) {
733 					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
734 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
735 					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
736 					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
737 					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
738 					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
739 					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
740 					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
741 					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
742 
743 					for(i = 0; i < (int)data_len-3; i+=4) {
744 						__m128i summ, mull;
745 						summ = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8)));
746 						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
747 						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
748 						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
749 						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
750 						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
751 						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
752 						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
753 						summ = _mm_sra_epi32(summ, cnt);
754 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
755 					}
756 				}
757 				else { /* order == 7 */
758 					__m128i q0, q1, q2, q3, q4, q5, q6;
759 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
760 					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
761 					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
762 					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
763 					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
764 					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
765 					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
766 
767 					for(i = 0; i < (int)data_len-3; i+=4) {
768 						__m128i summ, mull;
769 						summ = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7)));
770 						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
771 						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
772 						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
773 						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
774 						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
775 						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
776 						summ = _mm_sra_epi32(summ, cnt);
777 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
778 					}
779 				}
780 			}
781 			else {
782 				if(order == 6) {
783 					__m128i q0, q1, q2, q3, q4, q5;
784 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
785 					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
786 					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
787 					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
788 					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
789 					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
790 
791 					for(i = 0; i < (int)data_len-3; i+=4) {
792 						__m128i summ, mull;
793 						summ = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6)));
794 						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
795 						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
796 						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
797 						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
798 						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
799 						summ = _mm_sra_epi32(summ, cnt);
800 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
801 					}
802 				}
803 				else { /* order == 5 */
804 					__m128i q0, q1, q2, q3, q4;
805 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
806 					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
807 					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
808 					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
809 					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
810 
811 					for(i = 0; i < (int)data_len-3; i+=4) {
812 						__m128i summ, mull;
813 						summ = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5)));
814 						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
815 						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
816 						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
817 						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
818 						summ = _mm_sra_epi32(summ, cnt);
819 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
820 					}
821 				}
822 			}
823 		}
824 		else {
825 			if(order > 2) {
826 				if(order == 4) {
827 					__m128i q0, q1, q2, q3;
828 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
829 					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
830 					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
831 					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
832 
833 					for(i = 0; i < (int)data_len-3; i+=4) {
834 						__m128i summ, mull;
835 						summ = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4)));
836 						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
837 						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
838 						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
839 						summ = _mm_sra_epi32(summ, cnt);
840 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
841 					}
842 				}
843 				else { /* order == 3 */
844 					__m128i q0, q1, q2;
845 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
846 					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
847 					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
848 
849 					for(i = 0; i < (int)data_len-3; i+=4) {
850 						__m128i summ, mull;
851 						summ = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3)));
852 						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
853 						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
854 						summ = _mm_sra_epi32(summ, cnt);
855 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
856 					}
857 				}
858 			}
859 			else {
860 				if(order == 2) {
861 					__m128i q0, q1;
862 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
863 					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
864 
865 					for(i = 0; i < (int)data_len-3; i+=4) {
866 						__m128i summ, mull;
867 						summ = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2)));
868 						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
869 						summ = _mm_sra_epi32(summ, cnt);
870 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
871 					}
872 				}
873 				else { /* order == 1 */
874 					__m128i q0;
875 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
876 
877 					for(i = 0; i < (int)data_len-3; i+=4) {
878 						__m128i summ;
879 						summ = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1)));
880 						summ = _mm_sra_epi32(summ, cnt);
881 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
882 					}
883 				}
884 			}
885 		}
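		/* Scalar tail: finish the remaining samples (fewer than 4) that the
		   vectorized loops above did not produce. */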
886 		for(; i < (int)data_len; i++) {
887 			sum = 0;
888 			switch(order) {
889 				case 12: sum += qlp_coeff[11] * data[i-12]; /* Falls through. */
890 				case 11: sum += qlp_coeff[10] * data[i-11]; /* Falls through. */
891 				case 10: sum += qlp_coeff[ 9] * data[i-10]; /* Falls through. */
892 				case 9:  sum += qlp_coeff[ 8] * data[i- 9]; /* Falls through. */
893 				case 8:  sum += qlp_coeff[ 7] * data[i- 8]; /* Falls through. */
894 				case 7:  sum += qlp_coeff[ 6] * data[i- 7]; /* Falls through. */
895 				case 6:  sum += qlp_coeff[ 5] * data[i- 6]; /* Falls through. */
896 				case 5:  sum += qlp_coeff[ 4] * data[i- 5]; /* Falls through. */
897 				case 4:  sum += qlp_coeff[ 3] * data[i- 4]; /* Falls through. */
898 				case 3:  sum += qlp_coeff[ 2] * data[i- 3]; /* Falls through. */
899 				case 2:  sum += qlp_coeff[ 1] * data[i- 2]; /* Falls through. */
900 				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
901 			}
902 			residual[i] = data[i] - (sum >> lp_quantization);
903 		}
904 	}
905 	else { /* order > 12 */
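		/* Orders 13..32: plain scalar code with the same fall-through switch
		   as above, but with 32-bit accumulation. */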
906 		for(i = 0; i < (int)data_len; i++) {
907 			sum = 0;
908 			switch(order) {
909 				case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
910 				case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
911 				case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
912 				case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
913 				case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
914 				case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
915 				case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
916 				case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
917 				case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
918 				case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
919 				case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
920 				case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
921 				case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
922 				case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
923 				case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
924 				case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
925 				case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
926 				case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
927 				case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
928 				case 13: sum += qlp_coeff[12] * data[i-13];
929 				         sum += qlp_coeff[11] * data[i-12];
930 				         sum += qlp_coeff[10] * data[i-11];
931 				         sum += qlp_coeff[ 9] * data[i-10];
932 				         sum += qlp_coeff[ 8] * data[i- 9];
933 				         sum += qlp_coeff[ 7] * data[i- 8];
934 				         sum += qlp_coeff[ 6] * data[i- 7];
935 				         sum += qlp_coeff[ 5] * data[i- 6];
936 				         sum += qlp_coeff[ 4] * data[i- 5];
937 				         sum += qlp_coeff[ 3] * data[i- 4];
938 				         sum += qlp_coeff[ 2] * data[i- 3];
939 				         sum += qlp_coeff[ 1] * data[i- 2];
940 				         sum += qlp_coeff[ 0] * data[i- 1];
941 			}
942 			residual[i] = data[i] - (sum >> lp_quantization);
943 		}
944 	}
}

#endif /* FLAC__SSE4_1_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */