/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2023  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <emmintrin.h> /* SSE2 */

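/* Descriptive note (not in the upstream sources): these helpers take the
 * 32-bit prediction accumulated in the low lane of xmmN, apply the
 * quantization shift, and either form the residual (RESIDUAL32_RESULT) or
 * reconstruct the sample (DATA32_RESULT). */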
#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define     DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);

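/* The three autocorrelation variants below differ only in how far the shared
 * loop is unrolled: each one sets MAX_LAG and then pulls the common body in
 * from deduplication/lpc_compute_autocorrelation_intrin_sse2.c. */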
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
#undef MAX_LAG
#define MAX_LAG 8
#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c"
}

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
#undef MAX_LAG
#define MAX_LAG 10
#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c"
}


FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
#undef MAX_LAG
#define MAX_LAG 14
#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c"
}

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

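	/* Descriptive note (not in the upstream sources): the vector paths below
	 * emit four residuals per iteration.  Each coefficient is masked to its
	 * low 16 bits and broadcast, and this _16 variant assumes samples and
	 * coefficients fit in 16 bits, so every 32-bit data lane is just a
	 * sign-extended 16-bit value; _mm_madd_epi16 therefore yields the exact
	 * 32-bit product per lane, because the partial product against the
	 * zeroed high half of the coefficient lane is zero. */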
	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
					q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(const void*)(data+i-12)));
						mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11)));
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10)));
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9)));
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8)));
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m128i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7)));
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m128i q0, q1, q2, q3, q4, q5;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6)));
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m128i q0, q1, q2, q3, q4;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5)));
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m128i q0, q1, q2, q3;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4)));
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m128i q0, q1, q2;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3)));
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m128i q0, q1;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2)));
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m128i q0;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ;
						summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1)));
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
			}
		}
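		/* Descriptive note (not in the upstream sources): the vector loops
		 * above step i by 4, so the scalar tail below finishes the remaining
		 * 0-3 samples (or the whole block when data_len < 4) using a
		 * fall-through switch over the order. */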
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12]; /* Falls through. */
				case 11: sum += qlp_coeff[10] * data[i-11]; /* Falls through. */
				case 10: sum += qlp_coeff[ 9] * data[i-10]; /* Falls through. */
				case 9:  sum += qlp_coeff[ 8] * data[i- 9]; /* Falls through. */
				case 8:  sum += qlp_coeff[ 7] * data[i- 8]; /* Falls through. */
				case 7:  sum += qlp_coeff[ 6] * data[i- 7]; /* Falls through. */
				case 6:  sum += qlp_coeff[ 5] * data[i- 6]; /* Falls through. */
				case 5:  sum += qlp_coeff[ 4] * data[i- 5]; /* Falls through. */
				case 4:  sum += qlp_coeff[ 3] * data[i- 4]; /* Falls through. */
				case 3:  sum += qlp_coeff[ 2] * data[i- 3]; /* Falls through. */
				case 2:  sum += qlp_coeff[ 1] * data[i- 2]; /* Falls through. */
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
				case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
				case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
				case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
				case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
				case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
				case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
				case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
				case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
				case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
				case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
				case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
				case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
				case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
				case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
				case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
				case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
				case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
				case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

#if defined FLAC__CPU_IA32 /* unused for x86_64 */

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
	int i;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

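	/* Descriptive note (not in the upstream sources): coefficients are loaded
	 * (mostly) in pairs and spread to the even 32-bit lanes; the matching data
	 * pairs are shuffled so that _mm_mul_epu32 forms both products at once.
	 * Only the low dword of each product is used (it equals the low dword of
	 * the signed product), and the final _mm_add_epi32/_mm_srli_si128 pair
	 * folds the two partial sums into the low lane before RESIDUAL32_RESULT
	 * applies the quantization shift. */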
	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+10)); // 0  0  q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * data[i-12];
						//sum += qlp_coeff[10] * data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
						xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[10] * data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epu32(xmm7, xmm5);

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[8] * data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[6] * data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[4] * data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[2] * data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm0);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					for(i = 0; i < (int)data_len; i++)
						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int32 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
				case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
				case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
				case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
				case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
				case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
				case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
				case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
				case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
				case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
				case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
				case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
				case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
				case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
				case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
				case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
				case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
				case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
				case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

#endif /* FLAC__CPU_IA32 */
#endif /* FLAC__SSE2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */