/* Copyright (c) 2014-2020, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include "main.h"
#include "celt/x86/x86cpu.h"
#include "stack_alloc.h"

static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
    const silk_encoder_state *psEncC, /* I Encoder State */
    silk_nsq_state *NSQ, /* I/O NSQ state */
    const opus_int16 x16[], /* I input */
    opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
    const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
    opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
    opus_int subfr, /* I subframe number */
    const opus_int LTP_scale_Q14, /* I */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
    const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
    const opus_int signal_type /* I Signal type */
);

static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
    silk_nsq_state *NSQ, /* I/O NSQ state */
    opus_int signalType, /* I Signal type */
    const opus_int32 x_sc_Q10[], /* I */
    opus_int8 pulses[], /* O */
    opus_int16 xq[], /* O */
    opus_int32 sLTP_Q15[], /* I/O LTP state */
    const opus_int16 a_Q12[], /* I Short term prediction coefs */
    const opus_int16 b_Q14[], /* I Long term prediction coefs */
    const opus_int16 AR_shp_Q13[], /* I Noise shaping AR coefs */
    opus_int lag, /* I Pitch lag */
    opus_int32 HarmShapeFIRPacked_Q14, /* I */
    opus_int Tilt_Q14, /* I Spectral tilt */
    opus_int32 LF_shp_Q14, /* I */
    opus_int32 Gain_Q16, /* I */
    opus_int Lambda_Q10, /* I */
    opus_int offset_Q10, /* I */
    opus_int length, /* I Input length */
    opus_int32 table[][4] /* I */
);

void silk_NSQ_sse4_1(
    const silk_encoder_state *psEncC, /* I Encoder State */
    silk_nsq_state *NSQ, /* I/O NSQ state */
    SideInfoIndices *psIndices, /* I/O Quantization Indices */
    const opus_int16 x16[], /* I Input */
    opus_int8 pulses[], /* O Quantized pulse signal */
    const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */
    const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
    const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
    const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
    const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
    const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
    const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
    const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
    const opus_int LTP_scale_Q14 /* I LTP state scaling */
)
{
    opus_int k, lag, start_idx, LSF_interpolation_flag;
    const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
    opus_int16 *pxq;
    VARDECL( opus_int32, sLTP_Q15 );
    VARDECL( opus_int16, sLTP );
    opus_int32 HarmShapeFIRPacked_Q14;
    opus_int offset_Q10;
    VARDECL( opus_int32, x_sc_Q10 );

    opus_int32 table[ 64 ][ 4 ];
    opus_int32 tmp1;
    opus_int32 q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;

#ifdef OPUS_CHECK_ASM
    silk_nsq_state NSQ_c;
    SideInfoIndices psIndices_c;
    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
    const opus_int8 *const pulses_a = pulses;
#endif

    SAVE_STACK;

#ifdef OPUS_CHECK_ASM
    ( void )pulses_a;
    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
    silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
    silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );

    silk_NSQ_c(
        psEncC,
        &NSQ_c,
        &psIndices_c,
        x16,
        pulses_c,
        PredCoef_Q12,
        LTPCoef_Q14,
        AR_Q13,
        HarmShapeGain_Q14,
        Tilt_Q14,
        LF_shp_Q14,
        Gains_Q16,
        pitchL,
        Lambda_Q10,
        LTP_scale_Q14
    );
#endif

    NSQ->rand_seed = psIndices->Seed;

    /* Set unvoiced lag to the previous one, overwrite later for voiced */
    lag = NSQ->lagPrev;

    silk_assert( NSQ->prev_gain_Q16 != 0 );

    offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];

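    /* Precompute a rate/distortion decision table. Entry [ 32 + k ] stores,
       for candidate level k: the two neighbouring quantized values q1_Q10 and
       q2_Q10, 2 * ( q1_Q10 - q2_Q10 ), and
       ( rd1_Q20 - rd2_Q20 ) + ( q1_Q10^2 - q2_Q10^2 ), where rd = Lambda * |q|
       is the rate cost. With the distortion ( r - q )^2 added in, picking the
       better of q1 and q2 then reduces to one multiply and one sign test per
       sample in silk_noise_shape_quantizer_10_16_sse4_1() below. */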
    /* 0 */
    q1_Q10 = offset_Q10;
    q2_Q10 = offset_Q10 + ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
    rd1_Q20 = q1_Q10 * Lambda_Q10;
    rd2_Q20 = q2_Q10 * Lambda_Q10;

    table[ 32 ][ 0 ] = q1_Q10;
    table[ 32 ][ 1 ] = q2_Q10;
    table[ 32 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
    table[ 32 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);

    /* -1 */
    q1_Q10 = offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
    q2_Q10 = offset_Q10;
    rd1_Q20 = - q1_Q10 * Lambda_Q10;
    rd2_Q20 = q2_Q10 * Lambda_Q10;

    table[ 31 ][ 0 ] = q1_Q10;
    table[ 31 ][ 1 ] = q2_Q10;
    table[ 31 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
    table[ 31 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);

    /* > 0 */
    for (k = 1; k <= 31; k++)
    {
        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );

        q1_Q10 = tmp1 - QUANT_LEVEL_ADJUST_Q10;
        q2_Q10 = tmp1 - QUANT_LEVEL_ADJUST_Q10 + 1024;
        rd1_Q20 = q1_Q10 * Lambda_Q10;
        rd2_Q20 = q2_Q10 * Lambda_Q10;

        table[ 32 + k ][ 0 ] = q1_Q10;
        table[ 32 + k ][ 1 ] = q2_Q10;
        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
    }

    /* < -1 */
    for (k = -32; k <= -2; k++)
    {
        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );

        q1_Q10 = tmp1 + QUANT_LEVEL_ADJUST_Q10;
        q2_Q10 = tmp1 + QUANT_LEVEL_ADJUST_Q10 + 1024;
        rd1_Q20 = - q1_Q10 * Lambda_Q10;
        rd2_Q20 = - q2_Q10 * Lambda_Q10;

        table[ 32 + k ][ 0 ] = q1_Q10;
        table[ 32 + k ][ 1 ] = q2_Q10;
        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
    }

    if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
        LSF_interpolation_flag = 0;
    } else {
        LSF_interpolation_flag = 1;
    }

    ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
    ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
    ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
    /* Set up pointers to start of sub frame */
    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
    NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
    pxq = &NSQ->xq[ psEncC->ltp_mem_length ];
    for( k = 0; k < psEncC->nb_subfr; k++ ) {
        A_Q12 = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
        B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
        AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];

        /* Noise shape parameters */
        silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
        HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
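        /* The packed word now holds HarmShapeGain / 4 in its low 16 bits and
           HarmShapeGain / 2 in its high 16 bits: the symmetric 3-tap harmonic
           shaping FIR { G/4, G/2, G/4 } in Q14, unpacked again by the
           silk_SMULWB / silk_SMLAWT pair in the quantizer. */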

        NSQ->rewhite_flag = 0;
        if( psIndices->signalType == TYPE_VOICED ) {
            /* Voiced */
            lag = pitchL[ k ];

            /* Re-whitening */
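            /* With NLSF interpolation ( flag == 1 ) the mask below is 1, so
               the LTP state is rewhitened at subframes 0 and 2, where the A
               coefs change; without interpolation the mask is 3 and only
               subframe 0 is rewhitened. */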
            if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
                /* Rewhiten with new A coefs */
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
                celt_assert( start_idx > 0 );

                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );

                NSQ->rewhite_flag = 1;
                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
            }
        }

        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x16, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );

        if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder ) ) )
        {
            silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
                offset_Q10, psEncC->subfr_length, &(table[32]) );
        }
        else
        {
            silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
                offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
        }

        x16 += psEncC->subfr_length;
        pulses += psEncC->subfr_length;
        pxq += psEncC->subfr_length;
    }

    /* Update lagPrev for next frame */
    NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];

    /* Save quantized speech and noise shaping signals */
    silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
    silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );

#ifdef OPUS_CHECK_ASM
    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
    silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
#endif

    RESTORE_STACK;
}

/************************************/
/* silk_noise_shape_quantizer_10_16 */
/************************************/
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
    silk_nsq_state *NSQ, /* I/O NSQ state */
    opus_int signalType, /* I Signal type */
    const opus_int32 x_sc_Q10[], /* I */
    opus_int8 pulses[], /* O */
    opus_int16 xq[], /* O */
    opus_int32 sLTP_Q15[], /* I/O LTP state */
    const opus_int16 a_Q12[], /* I Short term prediction coefs */
    const opus_int16 b_Q14[], /* I Long term prediction coefs */
    const opus_int16 AR_shp_Q13[], /* I Noise shaping AR coefs */
    opus_int lag, /* I Pitch lag */
    opus_int32 HarmShapeFIRPacked_Q14, /* I */
    opus_int Tilt_Q14, /* I Spectral tilt */
    opus_int32 LF_shp_Q14, /* I */
    opus_int32 Gain_Q16, /* I */
    opus_int Lambda_Q10, /* I */
    opus_int offset_Q10, /* I */
    opus_int length, /* I Input length */
    opus_int32 table[][4] /* I */
)
{
    opus_int i;
    opus_int32 LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
    opus_int32 n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
    opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10, sDiff_shp_Q14;
    opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
    opus_int32 *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;

    __m128i xmm_tempa, xmm_tempb;

    __m128i xmm_one;

    __m128i psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF;
    __m128i psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF;
    __m128i a_Q12_01234567, a_Q12_89ABCDEF;

    __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
    __m128i AR_shp_Q13_76543210;

    int rdo_offset = (Lambda_Q10 >> 1) - 512;
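    /* Bias ( in Q10 ) pulling the quantizer toward zero; only used when
       Lambda_Q10 > 2048, where the rate penalty amounts to more than one
       pulse and plain rounding is no longer rate/distortion optimal ( see
       the quantization loop below ). */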

    shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
    Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );

    /* Set up short term AR state */
    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ];

    sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
    xq_Q14 = psLPC_Q14[ 0 ];
    sDiff_shp_Q14 = NSQ->sDiff_shp_Q14;
    LTP_pred_Q13 = 0;

    /* load a_Q12 */
    xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 );
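    /* This pshufb mask reverses the order of the eight 16-bit words in a
       register ( bytes within each word stay intact ), so the reversed
       coefficient vectors line up with the time-ordered LPC state for
       _mm_madd_epi16 in the main loop. */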

    /* load a_Q12[0] - a_Q12[7] */
    a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(void*)(&a_Q12[ 0 ] ) );
    /* load a_Q12[ 8 ] - a_Q12[ 15 ] */
    a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(void*)(&a_Q12[ 8 ] ) );

    a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one );
    a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one );

    /* load AR_shp_Q13 */
    AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(void*)(&AR_shp_Q13[0] ) );

    /* load psLPC_Q14 */
    xmm_one = _mm_set_epi8( 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 );
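    /* This mask splits four 32-bit values into 16-bit halves: the low halves
       are gathered in the lower 64 bits of the register, the high halves in
       the upper 64 bits. Combined with unpacklo/unpackhi across two loads, it
       builds separate "hi" and "lo" registers of eight values each, the
       layout used by the 32 x 16 bit multiply emulation below. */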

    xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -16 ]) );
    xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -12 ]) );

    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );

    psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
    psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );

    xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -8 ]) );
    xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -4 ]) );

    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );

    psLPC_Q14_hi_01234567 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
    psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );

    /* load sAR2_Q14 */
    xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sAR2_Q14[ 0 ]) ) );
    xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sAR2_Q14[ 4 ]) ) );

    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );

    sAR2_Q14_hi_76543210 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
    sAR2_Q14_lo_76543210 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );

    /* prepare 1 in 8 * 16bit */
    xmm_one = _mm_set1_epi16(1);

    for( i = 0; i < length; i++ )
    {
        /* Short-term prediction */
        __m128i xmm_hi_07, xmm_hi_8F, xmm_lo_07, xmm_lo_8F;

        /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
        LPC_pred_Q10 = 8; /* silk_RSHIFT( predictLPCOrder, 1 ); */

        /* shift psLPC_Q14 */
        psLPC_Q14_hi_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF, 2 );
        psLPC_Q14_lo_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF, 2 );

        psLPC_Q14_hi_01234567 = _mm_srli_si128( psLPC_Q14_hi_01234567, 2 );
        psLPC_Q14_lo_01234567 = _mm_srli_si128( psLPC_Q14_lo_01234567, 2 );

        psLPC_Q14_hi_01234567 = _mm_insert_epi16( psLPC_Q14_hi_01234567, (xq_Q14 >> 16), 7 );
        psLPC_Q14_lo_01234567 = _mm_insert_epi16( psLPC_Q14_lo_01234567, (xq_Q14), 7 );

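        /* 16 MACs of 32-bit state by 16-bit coef, computed in halves:
           ( hi * 2^16 + lo ) * a >> 16 = hi * a + ( ( lo * a ) >> 16 ).
           pmaddwd covers the hi halves; for the lo halves pmulhw treats the
           unsigned lo words as signed, so a is added back wherever the lo
           word has its sign bit set, before the horizontal sum. */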
        /* high part, use pmaddwd, results in 4 32-bit */
        xmm_hi_07 = _mm_madd_epi16( psLPC_Q14_hi_01234567, a_Q12_01234567 );
        xmm_hi_8F = _mm_madd_epi16( psLPC_Q14_hi_89ABCDEF, a_Q12_89ABCDEF );

        /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed, _mm_srai_epi16(psLPC_Q14_lo_01234567, 15) */
        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_01234567 );
        xmm_tempb = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_89ABCDEF );

        xmm_tempa = _mm_and_si128( xmm_tempa, a_Q12_01234567 );
        xmm_tempb = _mm_and_si128( xmm_tempb, a_Q12_89ABCDEF );

        xmm_lo_07 = _mm_mulhi_epi16( psLPC_Q14_lo_01234567, a_Q12_01234567 );
        xmm_lo_8F = _mm_mulhi_epi16( psLPC_Q14_lo_89ABCDEF, a_Q12_89ABCDEF );

        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
        xmm_lo_8F = _mm_add_epi16( xmm_lo_8F, xmm_tempb );

        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
        xmm_lo_8F = _mm_madd_epi16( xmm_lo_8F, xmm_one );

        /* accumulate */
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_hi_8F );
        xmm_lo_07 = _mm_add_epi32( xmm_lo_07, xmm_lo_8F );

        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );

        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );

        LPC_pred_Q10 += _mm_cvtsi128_si32( xmm_hi_07 );

        /* Long-term prediction */
        if ( opus_likely( signalType == TYPE_VOICED ) ) {
            /* Unrolled loop */
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
            LTP_pred_Q13 = 2;
            {
                __m128i b_Q14_3210, b_Q14_0123, pred_lag_ptr_0123;

                b_Q14_3210 = OP_CVTEPI16_EPI32_M64( b_Q14 );
                b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B );

                /* loaded: [0] [-1] [-2] [-3] */
                pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(void*)(&pred_lag_ptr[ -3 ] ) );
                /* shuffle to [-3] [-2] [-1] [0] and to new xmm */
                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B );
                /* 64-bit multiply, a[2] * b[-2], a[0] * b[0] */
                xmm_tempa = _mm_mul_epi32( xmm_tempa, b_Q14_3210 );
                /* right shift 2 bytes (16 bits), zero extended */
                xmm_tempa = _mm_srli_si128( xmm_tempa, 2 );

                /* a[1] * b[-1], a[3] * b[-3] */
                pred_lag_ptr_0123 = _mm_mul_epi32( pred_lag_ptr_0123, b_Q14_0123 );
                pred_lag_ptr_0123 = _mm_srli_si128( pred_lag_ptr_0123, 2 );

                pred_lag_ptr_0123 = _mm_add_epi32( pred_lag_ptr_0123, xmm_tempa );
                /* equal shift right 8 bytes */
                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, _MM_SHUFFLE( 0, 0, 3, 2 ) );
                xmm_tempa = _mm_add_epi32( xmm_tempa, pred_lag_ptr_0123 );

                LTP_pred_Q13 += _mm_cvtsi128_si32( xmm_tempa );

                LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
                pred_lag_ptr++;
            }
        }

        /* Noise shape feedback */
        NSQ->sAR2_Q14[ 9 ] = NSQ->sAR2_Q14[ 8 ];
        NSQ->sAR2_Q14[ 8 ] = _mm_cvtsi128_si32( _mm_srli_si128(_mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ), 12 ) );

        sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
        sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );

        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (sDiff_shp_Q14 >> 16), 0 );
        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (sDiff_shp_Q14), 0 );
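        /* The first eight taps of the shaping AR state are kept in registers,
           split into hi/lo halves: each iteration spills the outgoing lane-7
           sample to NSQ->sAR2_Q14[ 8 ] ( above ), shifts the registers one
           lane, and inserts the new sDiff sample at lane 0; taps 8 and 9 are
           then applied with scalar silk_SMLAWB below. */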

        /* high part, use pmaddwd, results in 4 32-bit */
        xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );

        /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed, _mm_srai_epi16(sAR2_Q14_lo_76543210, 15) */
        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), sAR2_Q14_lo_76543210 );
        xmm_tempa = _mm_and_si128( xmm_tempa, AR_shp_Q13_76543210 );

        xmm_lo_07 = _mm_mulhi_epi16( sAR2_Q14_lo_76543210, AR_shp_Q13_76543210 );
        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );

        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );

        /* accumulate */
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );

        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );

        n_AR_Q12 = 5 + _mm_cvtsi128_si32( xmm_hi_07 );

        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 8 ], AR_shp_Q13[ 8 ] );
        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 9 ], AR_shp_Q13[ 9 ] );

        n_AR_Q12 = silk_LSHIFT32( n_AR_Q12, 1 ); /* Q11 -> Q12 */
        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, sLF_AR_shp_Q14, Tilt_Q14 );

        n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
        n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );

        celt_assert( lag > 0 || signalType != TYPE_VOICED );

        /* Combine prediction and noise shaping signals */
        tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 ); /* Q12 */
        tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */
        if( lag > 0 ) {
            /* Symmetric, packed FIR coefficients */
            n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
            n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
            n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
            shp_lag_ptr++;

            tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 ); /* Q13 */
            tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 ); /* Q13 */
            tmp1 = silk_RSHIFT_ROUND( tmp1, 3 ); /* Q10 */
        } else {
            tmp1 = silk_RSHIFT_ROUND( tmp1, 2 ); /* Q10 */
        }

        r_Q10 = silk_SUB32( x_sc_Q10[ i ], tmp1 ); /* residual error Q10 */

        /* Generate dither */
        NSQ->rand_seed = silk_RAND( NSQ->rand_seed );

        /* Flip sign depending on dither */
        tmp2 = -r_Q10;
        if ( NSQ->rand_seed < 0 ) r_Q10 = tmp2;

        r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );

        /* Find two quantization level candidates and measure their rate-distortion */
        q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
        q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
        if (Lambda_Q10 > 2048) {
            /* For aggressive RDO, the bias becomes more than one pulse. */
            if (q1_Q10 > rdo_offset) {
                q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
            } else if (q1_Q10 < -rdo_offset) {
                q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
            } else if (q1_Q10 < 0) {
                q1_Q0 = -1;
            } else {
                q1_Q0 = 0;
            }
        }

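        /* table was passed in pointing at its centre entry ( &table[ 32 ] in
           silk_NSQ_sse4_1 ), so the signed q1_Q0 indexes it directly. Columns
           2 and 3 fold the complete rate/distortion comparison between the
           two candidate levels into the single sign test below. */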
        q1_Q10 = table[q1_Q0][0];
        q2_Q10 = table[q1_Q0][1];

        if (r_Q10 * table[q1_Q0][2] - table[q1_Q0][3] < 0)
        {
            q1_Q10 = q2_Q10;
        }

        pulses[ i ] = (opus_int8)silk_RSHIFT_ROUND( q1_Q10, 10 );

        /* Excitation */
        exc_Q14 = silk_LSHIFT( q1_Q10, 4 );

        tmp2 = -exc_Q14;
        if ( NSQ->rand_seed < 0 ) exc_Q14 = tmp2;

        /* Add predictions */
        LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 );
        xq_Q14 = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 );

        /* Update states */
        psLPC_Q14++;
        *psLPC_Q14 = xq_Q14;
        NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 );
        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 );

        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
        sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
        NSQ->sLTP_shp_buf_idx++;
        NSQ->sLTP_buf_idx++;

        /* Make dither dependent on quantized signal */
        NSQ->rand_seed = silk_ADD32_ovflw( NSQ->rand_seed, pulses[ i ] );
    }

    NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14;

    /* Scale XQ back to normal level before saving */
    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH ];

    /* write back sAR2_Q14 */
    xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
    xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
    _mm_storeu_si128( (__m128i *)(void*)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa );
    _mm_storeu_si128( (__m128i *)(void*)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb );

    /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */
    {
        __m128i xmm_Gain_Q10;
        __m128i xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, xmm_xq_Q14_7654, xmm_xq_Q14_x7x5;

        /* prepare (1 << 7) in packed 4 32-bits */
        xmm_tempa = _mm_set1_epi32( (1 << 7) );

        /* prepare Gain_Q10 in packed 4 32-bits */
        xmm_Gain_Q10 = _mm_set1_epi32( Gain_Q10 );

        /* process xq */
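        /* Vectorized silk_SMULWW: _mm_mul_epi32 multiplies the two even
           32-bit lanes into 64-bit products; shuffling the odd lanes into
           even position gives the other two. Shifting the even products
           right and the odd ones left by 16, then blending, reassembles
           four ( a * b ) >> 16 results per pair of multiplies. */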
        for (i = 0; i < length - 7; i += 8)
        {
            xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(void*)(&(psLPC_Q14[ i + 0 ] ) ) );
            xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(void*)(&(psLPC_Q14[ i + 4 ] ) ) );

            /* equal shift right 4 bytes */
            xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
            /* equal shift right 4 bytes */
            xmm_xq_Q14_x7x5 = _mm_shuffle_epi32( xmm_xq_Q14_7654, _MM_SHUFFLE( 0, 3, 2, 1 ) );

            xmm_xq_Q14_3210 = _mm_mul_epi32( xmm_xq_Q14_3210, xmm_Gain_Q10 );
            xmm_xq_Q14_x3x1 = _mm_mul_epi32( xmm_xq_Q14_x3x1, xmm_Gain_Q10 );
            xmm_xq_Q14_7654 = _mm_mul_epi32( xmm_xq_Q14_7654, xmm_Gain_Q10 );
            xmm_xq_Q14_x7x5 = _mm_mul_epi32( xmm_xq_Q14_x7x5, xmm_Gain_Q10 );

            xmm_xq_Q14_3210 = _mm_srli_epi64( xmm_xq_Q14_3210, 16 );
            xmm_xq_Q14_x3x1 = _mm_slli_epi64( xmm_xq_Q14_x3x1, 16 );
            xmm_xq_Q14_7654 = _mm_srli_epi64( xmm_xq_Q14_7654, 16 );
            xmm_xq_Q14_x7x5 = _mm_slli_epi64( xmm_xq_Q14_x7x5, 16 );

            xmm_xq_Q14_3210 = _mm_blend_epi16( xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, 0xCC );
            xmm_xq_Q14_7654 = _mm_blend_epi16( xmm_xq_Q14_7654, xmm_xq_Q14_x7x5, 0xCC );

            /* silk_RSHIFT_ROUND(xq, 8) */
            xmm_xq_Q14_3210 = _mm_add_epi32( xmm_xq_Q14_3210, xmm_tempa );
            xmm_xq_Q14_7654 = _mm_add_epi32( xmm_xq_Q14_7654, xmm_tempa );

            xmm_xq_Q14_3210 = _mm_srai_epi32( xmm_xq_Q14_3210, 8 );
            xmm_xq_Q14_7654 = _mm_srai_epi32( xmm_xq_Q14_7654, 8 );

            /* silk_SAT16 */
            xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 );

            /* save to xq */
            _mm_storeu_si128( (__m128i *)(void*)(&xq[ i ] ), xmm_xq_Q14_3210 );
        }
    }
    for ( ; i < length; i++)
    {
        xq[i] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) );
    }

    /* Update LPC synth buffer */
    silk_memcpy( NSQ->sLPC_Q14, &NSQ->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
}

static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
    const silk_encoder_state *psEncC, /* I Encoder State */
    silk_nsq_state *NSQ, /* I/O NSQ state */
    const opus_int16 x16[], /* I input */
    opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
    const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
    opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
    opus_int subfr, /* I subframe number */
    const opus_int LTP_scale_Q14, /* I */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
    const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
    const opus_int signal_type /* I Signal type */
)
{
    opus_int i, lag;
    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
    __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;

    lag = pitchL[ subfr ];
    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
    silk_assert( inv_gain_Q31 != 0 );

    /* Scale input */
    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );

    /* prepare inv_gain_Q26 in packed 4 32-bits */
    xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);

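    /* Same even/odd-lane SMULWW vectorization as in the xq write-back loop
       of the quantizer: two _mm_mul_epi32 per four samples, then shift and
       blend to recover the >> 16 results. */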
    for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
        xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );

        /* equal shift right 4 bytes */
        xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

        xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
        xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );

        xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
        xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );

        xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );

        _mm_storeu_si128( (__m128i *)(void*)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
    }

    for( ; i < psEncC->subfr_length; i++ ) {
        x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
    }

    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    if( NSQ->rewhite_flag ) {
        if( subfr == 0 ) {
            /* Do LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
        }
        for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
            silk_assert( i < MAX_FRAME_LENGTH );
            sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
        }
    }

    /* Adjust for changing gain */
    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
        __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
        gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );

        /* Scale long-term shaping state */

        /* prepare gain_adj_Q16 in packed 4 32-bits */
        xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16);

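        /* The even/odd-lane SMULWW trick once more, applied in place to the
           long-term shaping state. */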
        for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
        {
            xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
            /* equal shift right 4 bytes */
            xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

            xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
            xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );

            xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
            xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );

            xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );

            _mm_storeu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
        }

        for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
            NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
        }

        /* Scale long-term prediction state */
        if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
            for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
                sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
            }
        }

        NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );
        NSQ->sDiff_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sDiff_shp_Q14 );

        /* Scale short-term prediction and shaping states */
        for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
            NSQ->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLPC_Q14[ i ] );
        }
        for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
            NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
        }

        /* Save inverse gain */
        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
    }
}