/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2023  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE4_1_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <smmintrin.h> /* SSE4.1 */

#if defined FLAC__CPU_IA32 /* unused for x64 */

#define RESIDUAL64_RESULT(xmmN)  residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srl_epi64(xmmN, cnt))
#define RESIDUAL64_RESULT1(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srli_epi64(xmmN, lp_quantization))
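
/* Both macros take the accumulated 64-bit prediction in the low lane of
 * xmmN, shift it right by lp_quantization and subtract the low 32 bits
 * from data[i] to form the residual. The shift is *logical* (SSE4.1 has
 * no _mm_sra_epi64), which is still safe here: only the low 32 bits of
 * the shifted value are consumed, and for shift counts <= 32 those bits
 * are identical under logical and arithmetic right shifts. A scalar
 * sketch, with `sum` standing in for the low 64-bit lane of xmmN:
 *
 *   residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
 */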

FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+10)); // 0  0  q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
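					// The shuffles above place each coefficient pair in the even
					// 32-bit lanes (0 and 2). _mm_mul_epi32 below multiplies exactly
					// those lanes, sign-extended, against the matching data samples,
					// yielding two signed 64-bit products per register that the
					// _mm_add_epi64 chain accumulates.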

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
						//sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

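						// Horizontal reduction: shift the upper 64-bit partial sum
						// into the low lane and add, leaving the complete 64-bit sum
						// in lane 0 for the RESIDUAL64 macro to shift and extract.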
						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT1(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT1(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm0);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					__m128i xmm0, xmm7;
					xmm0 = _mm_cvtsi32_si128(qlp_coeff[0]);

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm7 = _mm_cvtsi32_si128(data[i-1]);
						xmm7 = _mm_mul_epi32(xmm7, xmm0);
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int64 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
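			// Scalar fallback, unrolled via deliberate case fall-through:
			// entry at `case order` accumulates taps qlp_coeff[order-1] down
			// to qlp_coeff[12], then the statements after `case 13` add the
			// final twelve taps unconditionally.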
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32]; /* Falls through. */
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31]; /* Falls through. */
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30]; /* Falls through. */
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29]; /* Falls through. */
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28]; /* Falls through. */
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27]; /* Falls through. */
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26]; /* Falls through. */
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25]; /* Falls through. */
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24]; /* Falls through. */
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23]; /* Falls through. */
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22]; /* Falls through. */
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21]; /* Falls through. */
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20]; /* Falls through. */
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19]; /* Falls through. */
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18]; /* Falls through. */
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17]; /* Falls through. */
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16]; /* Falls through. */
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15]; /* Falls through. */
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14]; /* Falls through. */
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
}

#endif /* defined FLAC__CPU_IA32 */

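/* 32-bit variant: presumably selected by the caller over the _wide
 * version when the predictor sums are known to fit in 32 bits. That
 * allows four residuals per iteration via packed 32-bit multiplies
 * (_mm_mullo_epi32) and a packed arithmetic shift (_mm_sra_epi32),
 * instead of widening each product to a 64-bit lane as above. */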
FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
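					// Broadcast each quantized coefficient across all four 32-bit
					// lanes (the _mm_cvtsi32_si128 + _mm_shuffle_epi32(0,0,0,0)
					// pair is equivalent to _mm_set1_epi32), so a single multiply
					// applies one coefficient to four consecutive samples.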
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
					q11 = _mm_cvtsi32_si128(qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
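						// Each iteration produces residual[i..i+3] at once: every
						// unaligned load fetches four consecutive history samples at
						// one lag, the per-lane products are summed, arithmetic-
						// shifted by lp_quantization, and subtracted from data[i..i+3].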
622*600f14f4SXin Li 						__m128i summ, mull;
623*600f14f4SXin Li 						summ = _mm_mullo_epi32(q11, _mm_loadu_si128((const __m128i*)(const void*)(data+i-12)));
624*600f14f4SXin Li 						mull = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
625*600f14f4SXin Li 						mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
626*600f14f4SXin Li 						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
627*600f14f4SXin Li 						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
628*600f14f4SXin Li 						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
629*600f14f4SXin Li 						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
630*600f14f4SXin Li 						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
631*600f14f4SXin Li 						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
632*600f14f4SXin Li 						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
633*600f14f4SXin Li 						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
634*600f14f4SXin Li 						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
635*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
636*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
637*600f14f4SXin Li 					}
638*600f14f4SXin Li 				}
639*600f14f4SXin Li 				else { /* order == 11 */
640*600f14f4SXin Li 					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
641*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
642*600f14f4SXin Li 					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
643*600f14f4SXin Li 					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
644*600f14f4SXin Li 					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
645*600f14f4SXin Li 					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
646*600f14f4SXin Li 					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
647*600f14f4SXin Li 					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
648*600f14f4SXin Li 					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
649*600f14f4SXin Li 					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
650*600f14f4SXin Li 					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
651*600f14f4SXin Li 					q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
652*600f14f4SXin Li 
653*600f14f4SXin Li 					for(i = 0; i < (int)data_len-3; i+=4) {
654*600f14f4SXin Li 						__m128i summ, mull;
655*600f14f4SXin Li 						summ = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11)));
656*600f14f4SXin Li 						mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
657*600f14f4SXin Li 						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
658*600f14f4SXin Li 						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
659*600f14f4SXin Li 						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
660*600f14f4SXin Li 						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
661*600f14f4SXin Li 						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
662*600f14f4SXin Li 						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
663*600f14f4SXin Li 						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
664*600f14f4SXin Li 						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
665*600f14f4SXin Li 						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
666*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
667*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
668*600f14f4SXin Li 					}
669*600f14f4SXin Li 				}
670*600f14f4SXin Li 			}
671*600f14f4SXin Li 			else {
672*600f14f4SXin Li 				if(order == 10) {
673*600f14f4SXin Li 					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
674*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
675*600f14f4SXin Li 					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
676*600f14f4SXin Li 					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
677*600f14f4SXin Li 					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
678*600f14f4SXin Li 					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
679*600f14f4SXin Li 					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
680*600f14f4SXin Li 					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
681*600f14f4SXin Li 					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
682*600f14f4SXin Li 					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
683*600f14f4SXin Li 					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
684*600f14f4SXin Li 
685*600f14f4SXin Li 					for(i = 0; i < (int)data_len-3; i+=4) {
686*600f14f4SXin Li 						__m128i summ, mull;
687*600f14f4SXin Li 						summ = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10)));
688*600f14f4SXin Li 						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
689*600f14f4SXin Li 						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
690*600f14f4SXin Li 						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
691*600f14f4SXin Li 						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
692*600f14f4SXin Li 						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
693*600f14f4SXin Li 						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
694*600f14f4SXin Li 						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
695*600f14f4SXin Li 						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
696*600f14f4SXin Li 						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
697*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
698*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
699*600f14f4SXin Li 					}
700*600f14f4SXin Li 				}
701*600f14f4SXin Li 				else { /* order == 9 */
702*600f14f4SXin Li 					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
703*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
704*600f14f4SXin Li 					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
705*600f14f4SXin Li 					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
706*600f14f4SXin Li 					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
707*600f14f4SXin Li 					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
708*600f14f4SXin Li 					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
709*600f14f4SXin Li 					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
710*600f14f4SXin Li 					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
711*600f14f4SXin Li 					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
712*600f14f4SXin Li 
713*600f14f4SXin Li 					for(i = 0; i < (int)data_len-3; i+=4) {
714*600f14f4SXin Li 						__m128i summ, mull;
715*600f14f4SXin Li 						summ = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9)));
716*600f14f4SXin Li 						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
717*600f14f4SXin Li 						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
718*600f14f4SXin Li 						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
719*600f14f4SXin Li 						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
720*600f14f4SXin Li 						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
721*600f14f4SXin Li 						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
722*600f14f4SXin Li 						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
723*600f14f4SXin Li 						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
724*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
725*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
726*600f14f4SXin Li 					}
727*600f14f4SXin Li 				}
728*600f14f4SXin Li 			}
729*600f14f4SXin Li 		}
730*600f14f4SXin Li 		else if(order > 4) {
731*600f14f4SXin Li 			if(order > 6) {
732*600f14f4SXin Li 				if(order == 8) {
733*600f14f4SXin Li 					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
734*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
735*600f14f4SXin Li 					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
736*600f14f4SXin Li 					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
737*600f14f4SXin Li 					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
738*600f14f4SXin Li 					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
739*600f14f4SXin Li 					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
740*600f14f4SXin Li 					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
741*600f14f4SXin Li 					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
742*600f14f4SXin Li 
743*600f14f4SXin Li 					for(i = 0; i < (int)data_len-3; i+=4) {
744*600f14f4SXin Li 						__m128i summ, mull;
745*600f14f4SXin Li 						summ = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8)));
746*600f14f4SXin Li 						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
747*600f14f4SXin Li 						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
748*600f14f4SXin Li 						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
749*600f14f4SXin Li 						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
750*600f14f4SXin Li 						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
751*600f14f4SXin Li 						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
752*600f14f4SXin Li 						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
753*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
754*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
755*600f14f4SXin Li 					}
756*600f14f4SXin Li 				}
757*600f14f4SXin Li 				else { /* order == 7 */
758*600f14f4SXin Li 					__m128i q0, q1, q2, q3, q4, q5, q6;
759*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
760*600f14f4SXin Li 					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
761*600f14f4SXin Li 					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
762*600f14f4SXin Li 					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
763*600f14f4SXin Li 					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
764*600f14f4SXin Li 					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
765*600f14f4SXin Li 					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
766*600f14f4SXin Li 
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7)));
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m128i q0, q1, q2, q3, q4, q5;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6)));
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m128i q0, q1, q2, q3, q4;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5)));
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m128i q0, q1, q2, q3;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4)));
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m128i q0, q1, q2;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3)));
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m128i q0, q1;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2)));
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m128i q0;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ;
						summ = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1)));
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
			}
		}
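		/* Scalar epilogue: i carries over from the four-wide loop above, so
		 * this handles the final data_len % 4 samples. The switch falls
		 * through so that exactly order taps contribute to each prediction. */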
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12]; /* Falls through. */
				case 11: sum += qlp_coeff[10] * data[i-11]; /* Falls through. */
				case 10: sum += qlp_coeff[ 9] * data[i-10]; /* Falls through. */
				case 9:  sum += qlp_coeff[ 8] * data[i- 9]; /* Falls through. */
				case 8:  sum += qlp_coeff[ 7] * data[i- 8]; /* Falls through. */
				case 7:  sum += qlp_coeff[ 6] * data[i- 7]; /* Falls through. */
				case 6:  sum += qlp_coeff[ 5] * data[i- 6]; /* Falls through. */
				case 5:  sum += qlp_coeff[ 4] * data[i- 5]; /* Falls through. */
				case 4:  sum += qlp_coeff[ 3] * data[i- 4]; /* Falls through. */
				case 3:  sum += qlp_coeff[ 2] * data[i- 3]; /* Falls through. */
				case 2:  sum += qlp_coeff[ 1] * data[i- 2]; /* Falls through. */
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
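		/* Orders 13..32 stay fully scalar: the fallthrough cases sum the taps
		 * above 12, then case 13 runs into the unconditional sums covering
		 * the remaining twelve taps. */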
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
				case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
				case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
				case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
				case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
				case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
				case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
				case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
				case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
				case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
				case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
				case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
				case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
				case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
				case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
				case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
				case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
				case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
				case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

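/*
 * Usage sketch (hypothetical caller, for illustration only; in libFLAC the
 * encoder selects a residual routine at run time based on detected CPU
 * features). Assuming the function above is the 32-bit
 * FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41 -- its
 * signature is not visible in this excerpt -- a call could look like:
 *
 *	FLAC__int32 signal[MAX_ORDER + BLOCKSIZE]; // warm-up samples first
 *	FLAC__int32 residual[BLOCKSIZE];
 *	FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(
 *		signal + order, BLOCKSIZE, qlp_coeff, order, lp_quantization, residual);
 *
 * Each output sample reads data[i-order..i-1], so the caller must provide
 * order valid history samples immediately before data[0].
 */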
#endif /* FLAC__SSE4_1_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */