xref: /aosp_15_r20/external/flac/src/libFLAC/lpc_intrin_sse2.c (revision 600f14f40d737144c998e2ec7a483122d3776fbc)
1*600f14f4SXin Li /* libFLAC - Free Lossless Audio Codec library
2*600f14f4SXin Li  * Copyright (C) 2000-2009  Josh Coalson
3*600f14f4SXin Li  * Copyright (C) 2011-2023  Xiph.Org Foundation
4*600f14f4SXin Li  *
5*600f14f4SXin Li  * Redistribution and use in source and binary forms, with or without
6*600f14f4SXin Li  * modification, are permitted provided that the following conditions
7*600f14f4SXin Li  * are met:
8*600f14f4SXin Li  *
9*600f14f4SXin Li  * - Redistributions of source code must retain the above copyright
10*600f14f4SXin Li  * notice, this list of conditions and the following disclaimer.
11*600f14f4SXin Li  *
12*600f14f4SXin Li  * - Redistributions in binary form must reproduce the above copyright
13*600f14f4SXin Li  * notice, this list of conditions and the following disclaimer in the
14*600f14f4SXin Li  * documentation and/or other materials provided with the distribution.
15*600f14f4SXin Li  *
16*600f14f4SXin Li  * - Neither the name of the Xiph.org Foundation nor the names of its
17*600f14f4SXin Li  * contributors may be used to endorse or promote products derived from
18*600f14f4SXin Li  * this software without specific prior written permission.
19*600f14f4SXin Li  *
20*600f14f4SXin Li  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21*600f14f4SXin Li  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22*600f14f4SXin Li  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23*600f14f4SXin Li  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
24*600f14f4SXin Li  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25*600f14f4SXin Li  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26*600f14f4SXin Li  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27*600f14f4SXin Li  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28*600f14f4SXin Li  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29*600f14f4SXin Li  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30*600f14f4SXin Li  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31*600f14f4SXin Li  */
32*600f14f4SXin Li 
33*600f14f4SXin Li #ifdef HAVE_CONFIG_H
34*600f14f4SXin Li #  include <config.h>
35*600f14f4SXin Li #endif
36*600f14f4SXin Li 
37*600f14f4SXin Li #include "private/cpu.h"
38*600f14f4SXin Li 
39*600f14f4SXin Li #ifndef FLAC__INTEGER_ONLY_LIBRARY
40*600f14f4SXin Li #ifndef FLAC__NO_ASM
41*600f14f4SXin Li #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
42*600f14f4SXin Li #include "private/lpc.h"
43*600f14f4SXin Li #ifdef FLAC__SSE2_SUPPORTED
44*600f14f4SXin Li 
45*600f14f4SXin Li #include "FLAC/assert.h"
46*600f14f4SXin Li #include "FLAC/format.h"
47*600f14f4SXin Li 
48*600f14f4SXin Li #include <emmintrin.h> /* SSE2 */
49*600f14f4SXin Li 
/* Scalar tail helpers: extract the low 32-bit lane of an SSE accumulator,
 * apply the quantization shift, and produce one residual / reconstructed
 * sample.  Used by the per-sample (non-vector) remainder loops below. */
#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define     DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
52*600f14f4SXin Li 
53*600f14f4SXin Li 
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
	/* SSE2 autocorrelation specialized for lag <= 8.  The actual
	 * implementation lives in a shared fragment that is textually
	 * included here, parameterized by MAX_LAG; the #undef guards
	 * against a MAX_LAG leaking in from a previous inclusion. */
#undef MAX_LAG
#define MAX_LAG 8
#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c"
}
61*600f14f4SXin Li 
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
	/* SSE2 autocorrelation specialized for lag <= 10; see the lag_8
	 * variant above — same shared fragment, different MAX_LAG. */
#undef MAX_LAG
#define MAX_LAG 10
#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c"
}
69*600f14f4SXin Li 
70*600f14f4SXin Li 
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
	/* SSE2 autocorrelation specialized for lag <= 14; see the lag_8
	 * variant above — same shared fragment, different MAX_LAG. */
#undef MAX_LAG
#define MAX_LAG 14
#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c"
}
78*600f14f4SXin Li 
79*600f14f4SXin Li FLAC__SSE_TARGET("sse2")
FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 * data,uint32_t data_len,const FLAC__int32 qlp_coeff[],uint32_t order,int lp_quantization,FLAC__int32 residual[])80*600f14f4SXin Li void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
81*600f14f4SXin Li {
82*600f14f4SXin Li 	int i;
83*600f14f4SXin Li 	FLAC__int32 sum;
84*600f14f4SXin Li 	const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
85*600f14f4SXin Li 
86*600f14f4SXin Li 	FLAC__ASSERT(order > 0);
87*600f14f4SXin Li 	FLAC__ASSERT(order <= 32);
88*600f14f4SXin Li 
89*600f14f4SXin Li 	if(order <= 12) {
90*600f14f4SXin Li 		if(order > 8) {
91*600f14f4SXin Li 			if(order > 10) {
92*600f14f4SXin Li 				if(order == 12) {
93*600f14f4SXin Li 					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
94*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
95*600f14f4SXin Li 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
96*600f14f4SXin Li 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
97*600f14f4SXin Li 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
98*600f14f4SXin Li 					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
99*600f14f4SXin Li 					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
100*600f14f4SXin Li 					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
101*600f14f4SXin Li 					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
102*600f14f4SXin Li 					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
103*600f14f4SXin Li 					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
104*600f14f4SXin Li 					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
105*600f14f4SXin Li 					q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));
106*600f14f4SXin Li 
107*600f14f4SXin Li 					for(i = 0; i < (int)data_len-3; i+=4) {
108*600f14f4SXin Li 						__m128i summ, mull;
109*600f14f4SXin Li 						summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(const void*)(data+i-12)));
110*600f14f4SXin Li 						mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
111*600f14f4SXin Li 						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
112*600f14f4SXin Li 						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
113*600f14f4SXin Li 						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
114*600f14f4SXin Li 						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
115*600f14f4SXin Li 						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
116*600f14f4SXin Li 						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
117*600f14f4SXin Li 						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
118*600f14f4SXin Li 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
119*600f14f4SXin Li 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
120*600f14f4SXin Li 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
121*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
122*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
123*600f14f4SXin Li 					}
124*600f14f4SXin Li 				}
125*600f14f4SXin Li 				else { /* order == 11 */
126*600f14f4SXin Li 					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
127*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
128*600f14f4SXin Li 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
129*600f14f4SXin Li 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
130*600f14f4SXin Li 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
131*600f14f4SXin Li 					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
132*600f14f4SXin Li 					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
133*600f14f4SXin Li 					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
134*600f14f4SXin Li 					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
135*600f14f4SXin Li 					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
136*600f14f4SXin Li 					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
137*600f14f4SXin Li 					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
138*600f14f4SXin Li 
139*600f14f4SXin Li 					for(i = 0; i < (int)data_len-3; i+=4) {
140*600f14f4SXin Li 						__m128i summ, mull;
141*600f14f4SXin Li 						summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11)));
142*600f14f4SXin Li 						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
143*600f14f4SXin Li 						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
144*600f14f4SXin Li 						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
145*600f14f4SXin Li 						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
146*600f14f4SXin Li 						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
147*600f14f4SXin Li 						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
148*600f14f4SXin Li 						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
149*600f14f4SXin Li 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
150*600f14f4SXin Li 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
151*600f14f4SXin Li 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
152*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
153*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
154*600f14f4SXin Li 					}
155*600f14f4SXin Li 				}
156*600f14f4SXin Li 			}
157*600f14f4SXin Li 			else {
158*600f14f4SXin Li 				if(order == 10) {
159*600f14f4SXin Li 					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
160*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
161*600f14f4SXin Li 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
162*600f14f4SXin Li 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
163*600f14f4SXin Li 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
164*600f14f4SXin Li 					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
165*600f14f4SXin Li 					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
166*600f14f4SXin Li 					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
167*600f14f4SXin Li 					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
168*600f14f4SXin Li 					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
169*600f14f4SXin Li 					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
170*600f14f4SXin Li 
171*600f14f4SXin Li 					for(i = 0; i < (int)data_len-3; i+=4) {
172*600f14f4SXin Li 						__m128i summ, mull;
173*600f14f4SXin Li 						summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10)));
174*600f14f4SXin Li 						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
175*600f14f4SXin Li 						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
176*600f14f4SXin Li 						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
177*600f14f4SXin Li 						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
178*600f14f4SXin Li 						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
179*600f14f4SXin Li 						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
180*600f14f4SXin Li 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
181*600f14f4SXin Li 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
182*600f14f4SXin Li 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
183*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
184*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
185*600f14f4SXin Li 					}
186*600f14f4SXin Li 				}
187*600f14f4SXin Li 				else { /* order == 9 */
188*600f14f4SXin Li 					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
189*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
190*600f14f4SXin Li 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
191*600f14f4SXin Li 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
192*600f14f4SXin Li 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
193*600f14f4SXin Li 					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
194*600f14f4SXin Li 					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
195*600f14f4SXin Li 					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
196*600f14f4SXin Li 					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
197*600f14f4SXin Li 					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
198*600f14f4SXin Li 
199*600f14f4SXin Li 					for(i = 0; i < (int)data_len-3; i+=4) {
200*600f14f4SXin Li 						__m128i summ, mull;
201*600f14f4SXin Li 						summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9)));
202*600f14f4SXin Li 						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
203*600f14f4SXin Li 						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
204*600f14f4SXin Li 						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
205*600f14f4SXin Li 						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
206*600f14f4SXin Li 						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
207*600f14f4SXin Li 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
208*600f14f4SXin Li 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
209*600f14f4SXin Li 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
210*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
211*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
212*600f14f4SXin Li 					}
213*600f14f4SXin Li 				}
214*600f14f4SXin Li 			}
215*600f14f4SXin Li 		}
216*600f14f4SXin Li 		else if(order > 4) {
217*600f14f4SXin Li 			if(order > 6) {
218*600f14f4SXin Li 				if(order == 8) {
219*600f14f4SXin Li 					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
220*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
221*600f14f4SXin Li 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
222*600f14f4SXin Li 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
223*600f14f4SXin Li 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
224*600f14f4SXin Li 					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
225*600f14f4SXin Li 					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
226*600f14f4SXin Li 					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
227*600f14f4SXin Li 					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
228*600f14f4SXin Li 
229*600f14f4SXin Li 					for(i = 0; i < (int)data_len-3; i+=4) {
230*600f14f4SXin Li 						__m128i summ, mull;
231*600f14f4SXin Li 						summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8)));
232*600f14f4SXin Li 						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
233*600f14f4SXin Li 						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
234*600f14f4SXin Li 						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
235*600f14f4SXin Li 						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
236*600f14f4SXin Li 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
237*600f14f4SXin Li 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
238*600f14f4SXin Li 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
239*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
240*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
241*600f14f4SXin Li 					}
242*600f14f4SXin Li 				}
243*600f14f4SXin Li 				else { /* order == 7 */
244*600f14f4SXin Li 					__m128i q0, q1, q2, q3, q4, q5, q6;
245*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
246*600f14f4SXin Li 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
247*600f14f4SXin Li 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
248*600f14f4SXin Li 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
249*600f14f4SXin Li 					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
250*600f14f4SXin Li 					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
251*600f14f4SXin Li 					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
252*600f14f4SXin Li 
253*600f14f4SXin Li 					for(i = 0; i < (int)data_len-3; i+=4) {
254*600f14f4SXin Li 						__m128i summ, mull;
255*600f14f4SXin Li 						summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7)));
256*600f14f4SXin Li 						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
257*600f14f4SXin Li 						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
258*600f14f4SXin Li 						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
259*600f14f4SXin Li 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
260*600f14f4SXin Li 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
261*600f14f4SXin Li 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
262*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
263*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
264*600f14f4SXin Li 					}
265*600f14f4SXin Li 				}
266*600f14f4SXin Li 			}
267*600f14f4SXin Li 			else {
268*600f14f4SXin Li 				if(order == 6) {
269*600f14f4SXin Li 					__m128i q0, q1, q2, q3, q4, q5;
270*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
271*600f14f4SXin Li 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
272*600f14f4SXin Li 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
273*600f14f4SXin Li 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
274*600f14f4SXin Li 					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
275*600f14f4SXin Li 					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
276*600f14f4SXin Li 
277*600f14f4SXin Li 					for(i = 0; i < (int)data_len-3; i+=4) {
278*600f14f4SXin Li 						__m128i summ, mull;
279*600f14f4SXin Li 						summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6)));
280*600f14f4SXin Li 						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
281*600f14f4SXin Li 						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
282*600f14f4SXin Li 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
283*600f14f4SXin Li 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
284*600f14f4SXin Li 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
285*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
286*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
287*600f14f4SXin Li 					}
288*600f14f4SXin Li 				}
289*600f14f4SXin Li 				else { /* order == 5 */
290*600f14f4SXin Li 					__m128i q0, q1, q2, q3, q4;
291*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
292*600f14f4SXin Li 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
293*600f14f4SXin Li 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
294*600f14f4SXin Li 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
295*600f14f4SXin Li 					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
296*600f14f4SXin Li 
297*600f14f4SXin Li 					for(i = 0; i < (int)data_len-3; i+=4) {
298*600f14f4SXin Li 						__m128i summ, mull;
299*600f14f4SXin Li 						summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5)));
300*600f14f4SXin Li 						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
301*600f14f4SXin Li 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
302*600f14f4SXin Li 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
303*600f14f4SXin Li 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
304*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
305*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
306*600f14f4SXin Li 					}
307*600f14f4SXin Li 				}
308*600f14f4SXin Li 			}
309*600f14f4SXin Li 		}
310*600f14f4SXin Li 		else {
311*600f14f4SXin Li 			if(order > 2) {
312*600f14f4SXin Li 				if(order == 4) {
313*600f14f4SXin Li 					__m128i q0, q1, q2, q3;
314*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
315*600f14f4SXin Li 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
316*600f14f4SXin Li 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
317*600f14f4SXin Li 					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
318*600f14f4SXin Li 
319*600f14f4SXin Li 					for(i = 0; i < (int)data_len-3; i+=4) {
320*600f14f4SXin Li 						__m128i summ, mull;
321*600f14f4SXin Li 						summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4)));
322*600f14f4SXin Li 						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
323*600f14f4SXin Li 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
324*600f14f4SXin Li 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
325*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
326*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
327*600f14f4SXin Li 					}
328*600f14f4SXin Li 				}
329*600f14f4SXin Li 				else { /* order == 3 */
330*600f14f4SXin Li 					__m128i q0, q1, q2;
331*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
332*600f14f4SXin Li 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
333*600f14f4SXin Li 					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
334*600f14f4SXin Li 
335*600f14f4SXin Li 					for(i = 0; i < (int)data_len-3; i+=4) {
336*600f14f4SXin Li 						__m128i summ, mull;
337*600f14f4SXin Li 						summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3)));
338*600f14f4SXin Li 						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
339*600f14f4SXin Li 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
340*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
341*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
342*600f14f4SXin Li 					}
343*600f14f4SXin Li 				}
344*600f14f4SXin Li 			}
345*600f14f4SXin Li 			else {
346*600f14f4SXin Li 				if(order == 2) {
347*600f14f4SXin Li 					__m128i q0, q1;
348*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
349*600f14f4SXin Li 					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
350*600f14f4SXin Li 
351*600f14f4SXin Li 					for(i = 0; i < (int)data_len-3; i+=4) {
352*600f14f4SXin Li 						__m128i summ, mull;
353*600f14f4SXin Li 						summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2)));
354*600f14f4SXin Li 						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
355*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
356*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
357*600f14f4SXin Li 					}
358*600f14f4SXin Li 				}
359*600f14f4SXin Li 				else { /* order == 1 */
360*600f14f4SXin Li 					__m128i q0;
361*600f14f4SXin Li 					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
362*600f14f4SXin Li 
363*600f14f4SXin Li 					for(i = 0; i < (int)data_len-3; i+=4) {
364*600f14f4SXin Li 						__m128i summ;
365*600f14f4SXin Li 						summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1)));
366*600f14f4SXin Li 						summ = _mm_sra_epi32(summ, cnt);
367*600f14f4SXin Li 						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
368*600f14f4SXin Li 					}
369*600f14f4SXin Li 				}
370*600f14f4SXin Li 			}
371*600f14f4SXin Li 		}
372*600f14f4SXin Li 		for(; i < (int)data_len; i++) {
373*600f14f4SXin Li 			sum = 0;
374*600f14f4SXin Li 			switch(order) {
375*600f14f4SXin Li 				case 12: sum += qlp_coeff[11] * data[i-12]; /* Falls through. */
376*600f14f4SXin Li 				case 11: sum += qlp_coeff[10] * data[i-11]; /* Falls through. */
377*600f14f4SXin Li 				case 10: sum += qlp_coeff[ 9] * data[i-10]; /* Falls through. */
378*600f14f4SXin Li 				case 9:  sum += qlp_coeff[ 8] * data[i- 9]; /* Falls through. */
379*600f14f4SXin Li 				case 8:  sum += qlp_coeff[ 7] * data[i- 8]; /* Falls through. */
380*600f14f4SXin Li 				case 7:  sum += qlp_coeff[ 6] * data[i- 7]; /* Falls through. */
381*600f14f4SXin Li 				case 6:  sum += qlp_coeff[ 5] * data[i- 6]; /* Falls through. */
382*600f14f4SXin Li 				case 5:  sum += qlp_coeff[ 4] * data[i- 5]; /* Falls through. */
383*600f14f4SXin Li 				case 4:  sum += qlp_coeff[ 3] * data[i- 4]; /* Falls through. */
384*600f14f4SXin Li 				case 3:  sum += qlp_coeff[ 2] * data[i- 3]; /* Falls through. */
385*600f14f4SXin Li 				case 2:  sum += qlp_coeff[ 1] * data[i- 2]; /* Falls through. */
386*600f14f4SXin Li 				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
387*600f14f4SXin Li 			}
388*600f14f4SXin Li 			residual[i] = data[i] - (sum >> lp_quantization);
389*600f14f4SXin Li 		}
390*600f14f4SXin Li 	}
391*600f14f4SXin Li 	else { /* order > 12 */
392*600f14f4SXin Li 		for(i = 0; i < (int)data_len; i++) {
393*600f14f4SXin Li 			sum = 0;
394*600f14f4SXin Li 			switch(order) {
395*600f14f4SXin Li 				case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
396*600f14f4SXin Li 				case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
397*600f14f4SXin Li 				case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
398*600f14f4SXin Li 				case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
399*600f14f4SXin Li 				case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
400*600f14f4SXin Li 				case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
401*600f14f4SXin Li 				case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
402*600f14f4SXin Li 				case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
403*600f14f4SXin Li 				case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
404*600f14f4SXin Li 				case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
405*600f14f4SXin Li 				case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
406*600f14f4SXin Li 				case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
407*600f14f4SXin Li 				case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
408*600f14f4SXin Li 				case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
409*600f14f4SXin Li 				case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
410*600f14f4SXin Li 				case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
411*600f14f4SXin Li 				case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
412*600f14f4SXin Li 				case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
413*600f14f4SXin Li 				case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
414*600f14f4SXin Li 				case 13: sum += qlp_coeff[12] * data[i-13];
415*600f14f4SXin Li 				         sum += qlp_coeff[11] * data[i-12];
416*600f14f4SXin Li 				         sum += qlp_coeff[10] * data[i-11];
417*600f14f4SXin Li 				         sum += qlp_coeff[ 9] * data[i-10];
418*600f14f4SXin Li 				         sum += qlp_coeff[ 8] * data[i- 9];
419*600f14f4SXin Li 				         sum += qlp_coeff[ 7] * data[i- 8];
420*600f14f4SXin Li 				         sum += qlp_coeff[ 6] * data[i- 7];
421*600f14f4SXin Li 				         sum += qlp_coeff[ 5] * data[i- 6];
422*600f14f4SXin Li 				         sum += qlp_coeff[ 4] * data[i- 5];
423*600f14f4SXin Li 				         sum += qlp_coeff[ 3] * data[i- 4];
424*600f14f4SXin Li 				         sum += qlp_coeff[ 2] * data[i- 3];
425*600f14f4SXin Li 				         sum += qlp_coeff[ 1] * data[i- 2];
426*600f14f4SXin Li 				         sum += qlp_coeff[ 0] * data[i- 1];
427*600f14f4SXin Li 			}
428*600f14f4SXin Li 			residual[i] = data[i] - (sum >> lp_quantization);
429*600f14f4SXin Li 		}
430*600f14f4SXin Li 	}
431*600f14f4SXin Li }
432*600f14f4SXin Li 
433*600f14f4SXin Li #if defined FLAC__CPU_IA32 /* unused for x86_64 */
434*600f14f4SXin Li 
435*600f14f4SXin Li FLAC__SSE_TARGET("sse2")
FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 * data,uint32_t data_len,const FLAC__int32 qlp_coeff[],uint32_t order,int lp_quantization,FLAC__int32 residual[])436*600f14f4SXin Li void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
437*600f14f4SXin Li {
438*600f14f4SXin Li 	int i;
439*600f14f4SXin Li 
440*600f14f4SXin Li 	FLAC__ASSERT(order > 0);
441*600f14f4SXin Li 	FLAC__ASSERT(order <= 32);
442*600f14f4SXin Li 
443*600f14f4SXin Li 	if(order <= 12) {
444*600f14f4SXin Li 		if(order > 8) { /* order == 9, 10, 11, 12 */
445*600f14f4SXin Li 			if(order > 10) { /* order == 11, 12 */
446*600f14f4SXin Li 				if(order == 12) {
447*600f14f4SXin Li 					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
448*600f14f4SXin Li 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
449*600f14f4SXin Li 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
450*600f14f4SXin Li 					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
451*600f14f4SXin Li 					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
452*600f14f4SXin Li 					xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
453*600f14f4SXin Li 					xmm5 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+10)); // 0  0  q[11] q[10]
454*600f14f4SXin Li 
455*600f14f4SXin Li 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
456*600f14f4SXin Li 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
457*600f14f4SXin Li 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
458*600f14f4SXin Li 					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
459*600f14f4SXin Li 					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
460*600f14f4SXin Li 					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
461*600f14f4SXin Li 
462*600f14f4SXin Li 					for(i = 0; i < (int)data_len; i++) {
463*600f14f4SXin Li 						//sum = 0;
464*600f14f4SXin Li 						//sum += qlp_coeff[11] * data[i-12];
465*600f14f4SXin Li 						//sum += qlp_coeff[10] * data[i-11];
466*600f14f4SXin Li 						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
467*600f14f4SXin Li 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
468*600f14f4SXin Li 						xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */
469*600f14f4SXin Li 
470*600f14f4SXin Li 						//sum += qlp_coeff[9] * data[i-10];
471*600f14f4SXin Li 						//sum += qlp_coeff[8] * data[i-9];
472*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
473*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
474*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm4);
475*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
476*600f14f4SXin Li 
477*600f14f4SXin Li 						//sum += qlp_coeff[7] * data[i-8];
478*600f14f4SXin Li 						//sum += qlp_coeff[6] * data[i-7];
479*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
480*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
481*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm3);
482*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
483*600f14f4SXin Li 
484*600f14f4SXin Li 						//sum += qlp_coeff[5] * data[i-6];
485*600f14f4SXin Li 						//sum += qlp_coeff[4] * data[i-5];
486*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
487*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
488*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm2);
489*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
490*600f14f4SXin Li 
491*600f14f4SXin Li 						//sum += qlp_coeff[3] * data[i-4];
492*600f14f4SXin Li 						//sum += qlp_coeff[2] * data[i-3];
493*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
494*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
495*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm1);
496*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
497*600f14f4SXin Li 
498*600f14f4SXin Li 						//sum += qlp_coeff[1] * data[i-2];
499*600f14f4SXin Li 						//sum += qlp_coeff[0] * data[i-1];
500*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
501*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
502*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
503*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
504*600f14f4SXin Li 
505*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
506*600f14f4SXin Li 						RESIDUAL32_RESULT(xmm7);
507*600f14f4SXin Li 					}
508*600f14f4SXin Li 				}
509*600f14f4SXin Li 				else { /* order == 11 */
510*600f14f4SXin Li 					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
511*600f14f4SXin Li 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
512*600f14f4SXin Li 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
513*600f14f4SXin Li 					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
514*600f14f4SXin Li 					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
515*600f14f4SXin Li 					xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));
516*600f14f4SXin Li 					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
517*600f14f4SXin Li 
518*600f14f4SXin Li 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
519*600f14f4SXin Li 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
520*600f14f4SXin Li 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
521*600f14f4SXin Li 					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
522*600f14f4SXin Li 					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
523*600f14f4SXin Li 
524*600f14f4SXin Li 					for(i = 0; i < (int)data_len; i++) {
525*600f14f4SXin Li 						//sum = 0;
526*600f14f4SXin Li 						//sum  = qlp_coeff[10] * data[i-11];
527*600f14f4SXin Li 						xmm7 = _mm_cvtsi32_si128(data[i-11]);
528*600f14f4SXin Li 						xmm7 = _mm_mul_epu32(xmm7, xmm5);
529*600f14f4SXin Li 
530*600f14f4SXin Li 						//sum += qlp_coeff[9] * data[i-10];
531*600f14f4SXin Li 						//sum += qlp_coeff[8] * data[i-9];
532*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
533*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
534*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm4);
535*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
536*600f14f4SXin Li 
537*600f14f4SXin Li 						//sum += qlp_coeff[7] * data[i-8];
538*600f14f4SXin Li 						//sum += qlp_coeff[6] * data[i-7];
539*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
540*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
541*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm3);
542*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
543*600f14f4SXin Li 
544*600f14f4SXin Li 						//sum += qlp_coeff[5] * data[i-6];
545*600f14f4SXin Li 						//sum += qlp_coeff[4] * data[i-5];
546*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
547*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
548*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm2);
549*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
550*600f14f4SXin Li 
551*600f14f4SXin Li 						//sum += qlp_coeff[3] * data[i-4];
552*600f14f4SXin Li 						//sum += qlp_coeff[2] * data[i-3];
553*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
554*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
555*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm1);
556*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
557*600f14f4SXin Li 
558*600f14f4SXin Li 						//sum += qlp_coeff[1] * data[i-2];
559*600f14f4SXin Li 						//sum += qlp_coeff[0] * data[i-1];
560*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
561*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
562*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
563*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
564*600f14f4SXin Li 
565*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
566*600f14f4SXin Li 						RESIDUAL32_RESULT(xmm7);
567*600f14f4SXin Li 					}
568*600f14f4SXin Li 				}
569*600f14f4SXin Li 			}
570*600f14f4SXin Li 			else { /* order == 9, 10 */
571*600f14f4SXin Li 				if(order == 10) {
572*600f14f4SXin Li 					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
573*600f14f4SXin Li 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
574*600f14f4SXin Li 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
575*600f14f4SXin Li 					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
576*600f14f4SXin Li 					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
577*600f14f4SXin Li 					xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));
578*600f14f4SXin Li 
579*600f14f4SXin Li 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
580*600f14f4SXin Li 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
581*600f14f4SXin Li 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
582*600f14f4SXin Li 					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
583*600f14f4SXin Li 					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
584*600f14f4SXin Li 
585*600f14f4SXin Li 					for(i = 0; i < (int)data_len; i++) {
586*600f14f4SXin Li 						//sum = 0;
587*600f14f4SXin Li 						//sum += qlp_coeff[9] * data[i-10];
588*600f14f4SXin Li 						//sum += qlp_coeff[8] * data[i-9];
589*600f14f4SXin Li 						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
590*600f14f4SXin Li 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
591*600f14f4SXin Li 						xmm7 = _mm_mul_epu32(xmm7, xmm4);
592*600f14f4SXin Li 
593*600f14f4SXin Li 						//sum += qlp_coeff[7] * data[i-8];
594*600f14f4SXin Li 						//sum += qlp_coeff[6] * data[i-7];
595*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
596*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
597*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm3);
598*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
599*600f14f4SXin Li 
600*600f14f4SXin Li 						//sum += qlp_coeff[5] * data[i-6];
601*600f14f4SXin Li 						//sum += qlp_coeff[4] * data[i-5];
602*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
603*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
604*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm2);
605*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
606*600f14f4SXin Li 
607*600f14f4SXin Li 						//sum += qlp_coeff[3] * data[i-4];
608*600f14f4SXin Li 						//sum += qlp_coeff[2] * data[i-3];
609*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
610*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
611*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm1);
612*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
613*600f14f4SXin Li 
614*600f14f4SXin Li 						//sum += qlp_coeff[1] * data[i-2];
615*600f14f4SXin Li 						//sum += qlp_coeff[0] * data[i-1];
616*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
617*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
618*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
619*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
620*600f14f4SXin Li 
621*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
622*600f14f4SXin Li 						RESIDUAL32_RESULT(xmm7);
623*600f14f4SXin Li 					}
624*600f14f4SXin Li 				}
625*600f14f4SXin Li 				else { /* order == 9 */
626*600f14f4SXin Li 					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
627*600f14f4SXin Li 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
628*600f14f4SXin Li 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
629*600f14f4SXin Li 					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
630*600f14f4SXin Li 					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
631*600f14f4SXin Li 					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
632*600f14f4SXin Li 
633*600f14f4SXin Li 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
634*600f14f4SXin Li 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
635*600f14f4SXin Li 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
636*600f14f4SXin Li 					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
637*600f14f4SXin Li 
638*600f14f4SXin Li 					for(i = 0; i < (int)data_len; i++) {
639*600f14f4SXin Li 						//sum = 0;
640*600f14f4SXin Li 						//sum  = qlp_coeff[8] * data[i-9];
641*600f14f4SXin Li 						xmm7 = _mm_cvtsi32_si128(data[i-9]);
642*600f14f4SXin Li 						xmm7 = _mm_mul_epu32(xmm7, xmm4);
643*600f14f4SXin Li 
644*600f14f4SXin Li 						//sum += qlp_coeff[7] * data[i-8];
645*600f14f4SXin Li 						//sum += qlp_coeff[6] * data[i-7];
646*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
647*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
648*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm3);
649*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
650*600f14f4SXin Li 
651*600f14f4SXin Li 						//sum += qlp_coeff[5] * data[i-6];
652*600f14f4SXin Li 						//sum += qlp_coeff[4] * data[i-5];
653*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
654*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
655*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm2);
656*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
657*600f14f4SXin Li 
658*600f14f4SXin Li 						//sum += qlp_coeff[3] * data[i-4];
659*600f14f4SXin Li 						//sum += qlp_coeff[2] * data[i-3];
660*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
661*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
662*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm1);
663*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
664*600f14f4SXin Li 
665*600f14f4SXin Li 						//sum += qlp_coeff[1] * data[i-2];
666*600f14f4SXin Li 						//sum += qlp_coeff[0] * data[i-1];
667*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
668*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
669*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
670*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
671*600f14f4SXin Li 
672*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
673*600f14f4SXin Li 						RESIDUAL32_RESULT(xmm7);
674*600f14f4SXin Li 					}
675*600f14f4SXin Li 				}
676*600f14f4SXin Li 			}
677*600f14f4SXin Li 		}
678*600f14f4SXin Li 		else if(order > 4) { /* order == 5, 6, 7, 8 */
679*600f14f4SXin Li 			if(order > 6) { /* order == 7, 8 */
680*600f14f4SXin Li 				if(order == 8) {
681*600f14f4SXin Li 					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
682*600f14f4SXin Li 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
683*600f14f4SXin Li 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
684*600f14f4SXin Li 					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
685*600f14f4SXin Li 					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
686*600f14f4SXin Li 
687*600f14f4SXin Li 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
688*600f14f4SXin Li 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
689*600f14f4SXin Li 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
690*600f14f4SXin Li 					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
691*600f14f4SXin Li 
692*600f14f4SXin Li 					for(i = 0; i < (int)data_len; i++) {
693*600f14f4SXin Li 						//sum = 0;
694*600f14f4SXin Li 						//sum += qlp_coeff[7] * data[i-8];
695*600f14f4SXin Li 						//sum += qlp_coeff[6] * data[i-7];
696*600f14f4SXin Li 						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
697*600f14f4SXin Li 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
698*600f14f4SXin Li 						xmm7 = _mm_mul_epu32(xmm7, xmm3);
699*600f14f4SXin Li 
700*600f14f4SXin Li 						//sum += qlp_coeff[5] * data[i-6];
701*600f14f4SXin Li 						//sum += qlp_coeff[4] * data[i-5];
702*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
703*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
704*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm2);
705*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
706*600f14f4SXin Li 
707*600f14f4SXin Li 						//sum += qlp_coeff[3] * data[i-4];
708*600f14f4SXin Li 						//sum += qlp_coeff[2] * data[i-3];
709*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
710*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
711*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm1);
712*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
713*600f14f4SXin Li 
714*600f14f4SXin Li 						//sum += qlp_coeff[1] * data[i-2];
715*600f14f4SXin Li 						//sum += qlp_coeff[0] * data[i-1];
716*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
717*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
718*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
719*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
720*600f14f4SXin Li 
721*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
722*600f14f4SXin Li 						RESIDUAL32_RESULT(xmm7);
723*600f14f4SXin Li 					}
724*600f14f4SXin Li 				}
725*600f14f4SXin Li 				else { /* order == 7 */
726*600f14f4SXin Li 					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
727*600f14f4SXin Li 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
728*600f14f4SXin Li 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
729*600f14f4SXin Li 					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
730*600f14f4SXin Li 					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
731*600f14f4SXin Li 
732*600f14f4SXin Li 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
733*600f14f4SXin Li 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
734*600f14f4SXin Li 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
735*600f14f4SXin Li 
736*600f14f4SXin Li 					for(i = 0; i < (int)data_len; i++) {
737*600f14f4SXin Li 						//sum = 0;
738*600f14f4SXin Li 						//sum  = qlp_coeff[6] * data[i-7];
739*600f14f4SXin Li 						xmm7 = _mm_cvtsi32_si128(data[i-7]);
740*600f14f4SXin Li 						xmm7 = _mm_mul_epu32(xmm7, xmm3);
741*600f14f4SXin Li 
742*600f14f4SXin Li 						//sum += qlp_coeff[5] * data[i-6];
743*600f14f4SXin Li 						//sum += qlp_coeff[4] * data[i-5];
744*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
745*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
746*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm2);
747*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
748*600f14f4SXin Li 
749*600f14f4SXin Li 						//sum += qlp_coeff[3] * data[i-4];
750*600f14f4SXin Li 						//sum += qlp_coeff[2] * data[i-3];
751*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
752*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
753*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm1);
754*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
755*600f14f4SXin Li 
756*600f14f4SXin Li 						//sum += qlp_coeff[1] * data[i-2];
757*600f14f4SXin Li 						//sum += qlp_coeff[0] * data[i-1];
758*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
759*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
760*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
761*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
762*600f14f4SXin Li 
763*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
764*600f14f4SXin Li 						RESIDUAL32_RESULT(xmm7);
765*600f14f4SXin Li 					}
766*600f14f4SXin Li 				}
767*600f14f4SXin Li 			}
768*600f14f4SXin Li 			else { /* order == 5, 6 */
769*600f14f4SXin Li 				if(order == 6) {
770*600f14f4SXin Li 					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
771*600f14f4SXin Li 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
772*600f14f4SXin Li 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
773*600f14f4SXin Li 					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
774*600f14f4SXin Li 
775*600f14f4SXin Li 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
776*600f14f4SXin Li 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
777*600f14f4SXin Li 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
778*600f14f4SXin Li 
779*600f14f4SXin Li 					for(i = 0; i < (int)data_len; i++) {
780*600f14f4SXin Li 						//sum = 0;
781*600f14f4SXin Li 						//sum += qlp_coeff[5] * data[i-6];
782*600f14f4SXin Li 						//sum += qlp_coeff[4] * data[i-5];
783*600f14f4SXin Li 						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
784*600f14f4SXin Li 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
785*600f14f4SXin Li 						xmm7 = _mm_mul_epu32(xmm7, xmm2);
786*600f14f4SXin Li 
787*600f14f4SXin Li 						//sum += qlp_coeff[3] * data[i-4];
788*600f14f4SXin Li 						//sum += qlp_coeff[2] * data[i-3];
789*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
790*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
791*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm1);
792*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
793*600f14f4SXin Li 
794*600f14f4SXin Li 						//sum += qlp_coeff[1] * data[i-2];
795*600f14f4SXin Li 						//sum += qlp_coeff[0] * data[i-1];
796*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
797*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
798*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
799*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
800*600f14f4SXin Li 
801*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
802*600f14f4SXin Li 						RESIDUAL32_RESULT(xmm7);
803*600f14f4SXin Li 					}
804*600f14f4SXin Li 				}
805*600f14f4SXin Li 				else { /* order == 5 */
806*600f14f4SXin Li 					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
807*600f14f4SXin Li 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
808*600f14f4SXin Li 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
809*600f14f4SXin Li 					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
810*600f14f4SXin Li 
811*600f14f4SXin Li 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
812*600f14f4SXin Li 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
813*600f14f4SXin Li 
814*600f14f4SXin Li 					for(i = 0; i < (int)data_len; i++) {
815*600f14f4SXin Li 						//sum = 0;
816*600f14f4SXin Li 						//sum  = qlp_coeff[4] * data[i-5];
817*600f14f4SXin Li 						xmm7 = _mm_cvtsi32_si128(data[i-5]);
818*600f14f4SXin Li 						xmm7 = _mm_mul_epu32(xmm7, xmm2);
819*600f14f4SXin Li 
820*600f14f4SXin Li 						//sum += qlp_coeff[3] * data[i-4];
821*600f14f4SXin Li 						//sum += qlp_coeff[2] * data[i-3];
822*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
823*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
824*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm1);
825*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
826*600f14f4SXin Li 
827*600f14f4SXin Li 						//sum += qlp_coeff[1] * data[i-2];
828*600f14f4SXin Li 						//sum += qlp_coeff[0] * data[i-1];
829*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
830*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
831*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
832*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
833*600f14f4SXin Li 
834*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
835*600f14f4SXin Li 						RESIDUAL32_RESULT(xmm7);
836*600f14f4SXin Li 					}
837*600f14f4SXin Li 				}
838*600f14f4SXin Li 			}
839*600f14f4SXin Li 		}
840*600f14f4SXin Li 		else { /* order == 1, 2, 3, 4 */
841*600f14f4SXin Li 			if(order > 2) { /* order == 3, 4 */
842*600f14f4SXin Li 				if(order == 4) {
843*600f14f4SXin Li 					__m128i xmm0, xmm1, xmm6, xmm7;
844*600f14f4SXin Li 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
845*600f14f4SXin Li 					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
846*600f14f4SXin Li 
847*600f14f4SXin Li 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
848*600f14f4SXin Li 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
849*600f14f4SXin Li 
850*600f14f4SXin Li 					for(i = 0; i < (int)data_len; i++) {
851*600f14f4SXin Li 						//sum = 0;
852*600f14f4SXin Li 						//sum += qlp_coeff[3] * data[i-4];
853*600f14f4SXin Li 						//sum += qlp_coeff[2] * data[i-3];
854*600f14f4SXin Li 						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
855*600f14f4SXin Li 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
856*600f14f4SXin Li 						xmm7 = _mm_mul_epu32(xmm7, xmm1);
857*600f14f4SXin Li 
858*600f14f4SXin Li 						//sum += qlp_coeff[1] * data[i-2];
859*600f14f4SXin Li 						//sum += qlp_coeff[0] * data[i-1];
860*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
861*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
862*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
863*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
864*600f14f4SXin Li 
865*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
866*600f14f4SXin Li 						RESIDUAL32_RESULT(xmm7);
867*600f14f4SXin Li 					}
868*600f14f4SXin Li 				}
869*600f14f4SXin Li 				else { /* order == 3 */
870*600f14f4SXin Li 					__m128i xmm0, xmm1, xmm6, xmm7;
871*600f14f4SXin Li 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
872*600f14f4SXin Li 					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
873*600f14f4SXin Li 
874*600f14f4SXin Li 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
875*600f14f4SXin Li 
876*600f14f4SXin Li 					for(i = 0; i < (int)data_len; i++) {
877*600f14f4SXin Li 						//sum = 0;
878*600f14f4SXin Li 						//sum  = qlp_coeff[2] * data[i-3];
879*600f14f4SXin Li 						xmm7 = _mm_cvtsi32_si128(data[i-3]);
880*600f14f4SXin Li 						xmm7 = _mm_mul_epu32(xmm7, xmm1);
881*600f14f4SXin Li 
882*600f14f4SXin Li 						//sum += qlp_coeff[1] * data[i-2];
883*600f14f4SXin Li 						//sum += qlp_coeff[0] * data[i-1];
884*600f14f4SXin Li 						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
885*600f14f4SXin Li 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
886*600f14f4SXin Li 						xmm6 = _mm_mul_epu32(xmm6, xmm0);
887*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, xmm6);
888*600f14f4SXin Li 
889*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
890*600f14f4SXin Li 						RESIDUAL32_RESULT(xmm7);
891*600f14f4SXin Li 					}
892*600f14f4SXin Li 				}
893*600f14f4SXin Li 			}
894*600f14f4SXin Li 			else { /* order == 1, 2 */
895*600f14f4SXin Li 				if(order == 2) {
896*600f14f4SXin Li 					__m128i xmm0, xmm7;
897*600f14f4SXin Li 					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
898*600f14f4SXin Li 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
899*600f14f4SXin Li 
900*600f14f4SXin Li 					for(i = 0; i < (int)data_len; i++) {
901*600f14f4SXin Li 						//sum = 0;
902*600f14f4SXin Li 						//sum += qlp_coeff[1] * data[i-2];
903*600f14f4SXin Li 						//sum += qlp_coeff[0] * data[i-1];
904*600f14f4SXin Li 						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
905*600f14f4SXin Li 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
906*600f14f4SXin Li 						xmm7 = _mm_mul_epu32(xmm7, xmm0);
907*600f14f4SXin Li 
908*600f14f4SXin Li 						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
909*600f14f4SXin Li 						RESIDUAL32_RESULT(xmm7);
910*600f14f4SXin Li 					}
911*600f14f4SXin Li 				}
912*600f14f4SXin Li 				else { /* order == 1 */
913*600f14f4SXin Li 					for(i = 0; i < (int)data_len; i++)
914*600f14f4SXin Li 						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
915*600f14f4SXin Li 				}
916*600f14f4SXin Li 			}
917*600f14f4SXin Li 		}
918*600f14f4SXin Li 	}
919*600f14f4SXin Li 	else { /* order > 12 */
920*600f14f4SXin Li 		FLAC__int32 sum;
921*600f14f4SXin Li 		for(i = 0; i < (int)data_len; i++) {
922*600f14f4SXin Li 			sum = 0;
923*600f14f4SXin Li 			switch(order) {
924*600f14f4SXin Li 				case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
925*600f14f4SXin Li 				case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
926*600f14f4SXin Li 				case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
927*600f14f4SXin Li 				case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
928*600f14f4SXin Li 				case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
929*600f14f4SXin Li 				case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
930*600f14f4SXin Li 				case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
931*600f14f4SXin Li 				case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
932*600f14f4SXin Li 				case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
933*600f14f4SXin Li 				case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
934*600f14f4SXin Li 				case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
935*600f14f4SXin Li 				case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
936*600f14f4SXin Li 				case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
937*600f14f4SXin Li 				case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
938*600f14f4SXin Li 				case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
939*600f14f4SXin Li 				case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
940*600f14f4SXin Li 				case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
941*600f14f4SXin Li 				case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
942*600f14f4SXin Li 				case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
943*600f14f4SXin Li 				case 13: sum += qlp_coeff[12] * data[i-13];
944*600f14f4SXin Li 				         sum += qlp_coeff[11] * data[i-12];
945*600f14f4SXin Li 				         sum += qlp_coeff[10] * data[i-11];
946*600f14f4SXin Li 				         sum += qlp_coeff[ 9] * data[i-10];
947*600f14f4SXin Li 				         sum += qlp_coeff[ 8] * data[i- 9];
948*600f14f4SXin Li 				         sum += qlp_coeff[ 7] * data[i- 8];
949*600f14f4SXin Li 				         sum += qlp_coeff[ 6] * data[i- 7];
950*600f14f4SXin Li 				         sum += qlp_coeff[ 5] * data[i- 6];
951*600f14f4SXin Li 				         sum += qlp_coeff[ 4] * data[i- 5];
952*600f14f4SXin Li 				         sum += qlp_coeff[ 3] * data[i- 4];
953*600f14f4SXin Li 				         sum += qlp_coeff[ 2] * data[i- 3];
954*600f14f4SXin Li 				         sum += qlp_coeff[ 1] * data[i- 2];
955*600f14f4SXin Li 				         sum += qlp_coeff[ 0] * data[i- 1];
956*600f14f4SXin Li 			}
957*600f14f4SXin Li 			residual[i] = data[i] - (sum >> lp_quantization);
958*600f14f4SXin Li 		}
959*600f14f4SXin Li 	}
960*600f14f4SXin Li }
961*600f14f4SXin Li 
962*600f14f4SXin Li #endif /* FLAC__CPU_IA32 */
963*600f14f4SXin Li #endif /* FLAC__SSE2_SUPPORTED */
964*600f14f4SXin Li #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
965*600f14f4SXin Li #endif /* FLAC__NO_ASM */
966*600f14f4SXin Li #endif /* FLAC__INTEGER_ONLY_LIBRARY */
967