/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009 Josh Coalson
 * Copyright (C) 2011-2023 Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <emmintrin.h> /* SSE2 */

#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define DATA32_RESULT(xmmN)     data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
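
/*
 * Both helpers assume the 32-bit prediction sum has already been collapsed into
 * the lowest lane of xmmN: _mm_cvtsi128_si32() extracts that lane, the arithmetic
 * shift by lp_quantization scales the fixed-point sum back down, and the result
 * is used to form the residual (RESIDUAL32_RESULT) or to rebuild the sample from
 * a residual (DATA32_RESULT).
 */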


FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
#undef MAX_LAG
#define MAX_LAG 8
#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c"
}

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
#undef MAX_LAG
#define MAX_LAG 10
#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c"
}


FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
#undef MAX_LAG
#define MAX_LAG 14
#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c"
}

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

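	/*
	 * Strategy for order <= 12: each quantized coefficient is masked to its low
	 * 16 bits and broadcast across all four 32-bit lanes.  _mm_madd_epi16 then
	 * multiplies the low 16 bits of four consecutive samples by that coefficient
	 * while the upper half of each lane is multiplied by zero, so four residuals
	 * are produced per iteration.  This is only valid when both the samples and
	 * the coefficients fit in signed 16-bit range, which callers of this
	 * "..._16" variant are expected to guarantee.
	 */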
	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
					q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(const void*)(data+i-12)));
						mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11)));
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10)));
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9)));
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8)));
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m128i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7)));
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m128i q0, q1, q2, q3, q4, q5;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6)));
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m128i q0, q1, q2, q3, q4;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5)));
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m128i q0, q1, q2, q3;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4)));
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m128i q0, q1, q2;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3)));
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m128i q0, q1;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2)));
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m128i q0;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ;
						summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1)));
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
					}
				}
			}
		}
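		/* handle the 0..3 samples left over from the 4-wide loops above with plain C */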
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12]; /* Falls through. */
				case 11: sum += qlp_coeff[10] * data[i-11]; /* Falls through. */
				case 10: sum += qlp_coeff[ 9] * data[i-10]; /* Falls through. */
				case 9:  sum += qlp_coeff[ 8] * data[i- 9]; /* Falls through. */
				case 8:  sum += qlp_coeff[ 7] * data[i- 8]; /* Falls through. */
				case 7:  sum += qlp_coeff[ 6] * data[i- 7]; /* Falls through. */
				case 6:  sum += qlp_coeff[ 5] * data[i- 6]; /* Falls through. */
				case 5:  sum += qlp_coeff[ 4] * data[i- 5]; /* Falls through. */
				case 4:  sum += qlp_coeff[ 3] * data[i- 4]; /* Falls through. */
				case 3:  sum += qlp_coeff[ 2] * data[i- 3]; /* Falls through. */
				case 2:  sum += qlp_coeff[ 1] * data[i- 2]; /* Falls through. */
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
				case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
				case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
				case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
				case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
				case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
				case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
				case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
				case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
				case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
				case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
				case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
				case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
				case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
				case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
				case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
				case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
				case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
				case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

#if defined FLAC__CPU_IA32 /* unused for x86_64 */

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
	int i;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

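	/*
	 * Here the coefficients can use their full 32-bit range, so _mm_madd_epi16
	 * cannot be used.  Instead the coefficients are loaded two at a time, spread
	 * into the even 32-bit lanes, and multiplied against two samples per
	 * _mm_mul_epu32; only the low 32 bits of each 64-bit product are kept (the
	 * low half of a signed and an unsigned 32x32-bit multiply is identical).
	 * The two partial sums are folded together with the final
	 * _mm_srli_si128/_mm_add_epi32 before RESIDUAL32_RESULT extracts the scalar.
	 */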
	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+10)); // 0  0  q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * data[i-12];
						//sum += qlp_coeff[10] * data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-12)); // 0  0  d[i-11]  d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]  0  d[i-11]
						xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[10] * data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epu32(xmm7, xmm5);

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[8] * data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[6] * data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[4] * data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[2] * data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm0);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					for(i = 0; i < (int)data_len; i++)
						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int32 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32]; /* Falls through. */
				case 31: sum += qlp_coeff[30] * data[i-31]; /* Falls through. */
				case 30: sum += qlp_coeff[29] * data[i-30]; /* Falls through. */
				case 29: sum += qlp_coeff[28] * data[i-29]; /* Falls through. */
				case 28: sum += qlp_coeff[27] * data[i-28]; /* Falls through. */
				case 27: sum += qlp_coeff[26] * data[i-27]; /* Falls through. */
				case 26: sum += qlp_coeff[25] * data[i-26]; /* Falls through. */
				case 25: sum += qlp_coeff[24] * data[i-25]; /* Falls through. */
				case 24: sum += qlp_coeff[23] * data[i-24]; /* Falls through. */
				case 23: sum += qlp_coeff[22] * data[i-23]; /* Falls through. */
				case 22: sum += qlp_coeff[21] * data[i-22]; /* Falls through. */
				case 21: sum += qlp_coeff[20] * data[i-21]; /* Falls through. */
				case 20: sum += qlp_coeff[19] * data[i-20]; /* Falls through. */
				case 19: sum += qlp_coeff[18] * data[i-19]; /* Falls through. */
				case 18: sum += qlp_coeff[17] * data[i-18]; /* Falls through. */
				case 17: sum += qlp_coeff[16] * data[i-17]; /* Falls through. */
				case 16: sum += qlp_coeff[15] * data[i-16]; /* Falls through. */
				case 15: sum += qlp_coeff[14] * data[i-15]; /* Falls through. */
				case 14: sum += qlp_coeff[13] * data[i-14]; /* Falls through. */
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

#endif /* FLAC__CPU_IA32 */
#endif /* FLAC__SSE2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */