1 /* Copyright (c) 2014-2020, Cisco Systems, INC
2 Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include <xmmintrin.h>
33 #include <emmintrin.h>
34 #include <smmintrin.h>
35 #include "main.h"
36 #include "celt/x86/x86cpu.h"
37
38 #include "stack_alloc.h"
39
40 typedef struct {
41 opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ];
42 opus_int32 RandState[ DECISION_DELAY ];
43 opus_int32 Q_Q10[ DECISION_DELAY ];
44 opus_int32 Xq_Q14[ DECISION_DELAY ];
45 opus_int32 Pred_Q15[ DECISION_DELAY ];
46 opus_int32 Shape_Q14[ DECISION_DELAY ];
47 opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];
48 opus_int32 LF_AR_Q14;
49 opus_int32 Diff_Q14;
50 opus_int32 Seed;
51 opus_int32 SeedInit;
52 opus_int32 RD_Q10;
53 } NSQ_del_dec_struct;
54
55 typedef struct {
56 opus_int32 Q_Q10;
57 opus_int32 RD_Q10;
58 opus_int32 xq_Q14;
59 opus_int32 LF_AR_Q14;
60 opus_int32 Diff_Q14;
61 opus_int32 sLTP_shp_Q14;
62 opus_int32 LPC_exc_Q14;
63 } NSQ_sample_struct;
64
65 typedef NSQ_sample_struct NSQ_sample_pair[ 2 ];
66
67 static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
68 const silk_encoder_state *psEncC, /* I Encoder State */
69 silk_nsq_state *NSQ, /* I/O NSQ state */
70 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
71 const opus_int16 x16[], /* I Input */
72 opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
73 const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
74 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
75 opus_int subfr, /* I Subframe number */
76 opus_int nStatesDelayedDecision, /* I Number of del dec states */
77 const opus_int LTP_scale_Q14, /* I LTP state scaling */
78 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
79 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
80 const opus_int signal_type, /* I Signal type */
81 const opus_int decisionDelay /* I Decision delay */
82 );
83
84 /******************************************/
85 /* Noise shape quantizer for one subframe */
86 /******************************************/
87 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
88 silk_nsq_state *NSQ, /* I/O NSQ state */
89 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
90 opus_int signalType, /* I Signal type */
91 const opus_int32 x_Q10[], /* I */
92 opus_int8 pulses[], /* O */
93 opus_int16 xq[], /* O */
94 opus_int32 sLTP_Q15[], /* I/O LTP filter state */
95 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */
96 const opus_int16 a_Q12[], /* I Short term prediction coefs */
97 const opus_int16 b_Q14[], /* I Long term prediction coefs */
98 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */
99 opus_int lag, /* I Pitch lag */
100 opus_int32 HarmShapeFIRPacked_Q14, /* I */
101 opus_int Tilt_Q14, /* I Spectral tilt */
102 opus_int32 LF_shp_Q14, /* I */
103 opus_int32 Gain_Q16, /* I */
104 opus_int Lambda_Q10, /* I */
105 opus_int offset_Q10, /* I */
106 opus_int length, /* I Input length */
107 opus_int subfr, /* I Subframe number */
108 opus_int shapingLPCOrder, /* I Shaping LPC filter order */
109 opus_int predictLPCOrder, /* I Prediction filter order */
110 opus_int warping_Q16, /* I */
111 opus_int nStatesDelayedDecision, /* I Number of states in decision tree */
112 opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */
113 opus_int decisionDelay /* I */
114 );
115
silk_NSQ_del_dec_sse4_1(const silk_encoder_state * psEncC,silk_nsq_state * NSQ,SideInfoIndices * psIndices,const opus_int16 x16[],opus_int8 pulses[],const opus_int16 * PredCoef_Q12,const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER],const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],const opus_int Tilt_Q14[MAX_NB_SUBFR],const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],const opus_int32 Gains_Q16[MAX_NB_SUBFR],const opus_int pitchL[MAX_NB_SUBFR],const opus_int Lambda_Q10,const opus_int LTP_scale_Q14)116 void silk_NSQ_del_dec_sse4_1(
117 const silk_encoder_state *psEncC, /* I Encoder State */
118 silk_nsq_state *NSQ, /* I/O NSQ state */
119 SideInfoIndices *psIndices, /* I/O Quantization Indices */
120 const opus_int16 x16[], /* I Input */
121 opus_int8 pulses[], /* O Quantized pulse signal */
122 const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */
123 const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
124 const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
125 const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
126 const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
127 const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
128 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
129 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
130 const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
131 const opus_int LTP_scale_Q14 /* I LTP state scaling */
132 )
133 {
134 opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
135 opus_int last_smple_idx, smpl_buf_idx, decisionDelay;
136 const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
137 opus_int16 *pxq;
138 VARDECL( opus_int32, sLTP_Q15 );
139 VARDECL( opus_int16, sLTP );
140 opus_int32 HarmShapeFIRPacked_Q14;
141 opus_int offset_Q10;
142 opus_int32 RDmin_Q10, Gain_Q10;
143 VARDECL( opus_int32, x_sc_Q10 );
144 VARDECL( opus_int32, delayedGain_Q10 );
145 VARDECL( NSQ_del_dec_struct, psDelDec );
146 NSQ_del_dec_struct *psDD;
147 #ifdef OPUS_CHECK_ASM
148 silk_nsq_state NSQ_c;
149 SideInfoIndices psIndices_c;
150 opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
151 const opus_int8 *const pulses_a = pulses;
152 #endif
153 SAVE_STACK;
154
155 #ifdef OPUS_CHECK_ASM
156 ( void )pulses_a;
157 silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
158 silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
159 silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
160 silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
161 silk_NSQ_del_dec_c(
162 psEncC,
163 &NSQ_c,
164 &psIndices_c,
165 x16,
166 pulses_c,
167 PredCoef_Q12,
168 LTPCoef_Q14,
169 AR_Q13,
170 HarmShapeGain_Q14,
171 Tilt_Q14,
172 LF_shp_Q14,
173 Gains_Q16,
174 pitchL,
175 Lambda_Q10,
176 LTP_scale_Q14
177 );
178 #endif
179
180 /* Set unvoiced lag to the previous one, overwrite later for voiced */
181 lag = NSQ->lagPrev;
182
183 silk_assert( NSQ->prev_gain_Q16 != 0 );
184
185 /* Initialize delayed decision states */
186 ALLOC( psDelDec, psEncC->nStatesDelayedDecision, NSQ_del_dec_struct );
187 silk_memset( psDelDec, 0, psEncC->nStatesDelayedDecision * sizeof( NSQ_del_dec_struct ) );
188 for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
189 psDD = &psDelDec[ k ];
190 psDD->Seed = ( k + psIndices->Seed ) & 3;
191 psDD->SeedInit = psDD->Seed;
192 psDD->RD_Q10 = 0;
193 psDD->LF_AR_Q14 = NSQ->sLF_AR_shp_Q14;
194 psDD->Diff_Q14 = NSQ->sDiff_shp_Q14;
195 psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
196 silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
197 silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
198 }
199
200 offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
201 smpl_buf_idx = 0; /* index of oldest samples */
202
203 decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );
204
205 /* For voiced frames limit the decision delay to lower than the pitch lag */
206 if( psIndices->signalType == TYPE_VOICED ) {
207 for( k = 0; k < psEncC->nb_subfr; k++ ) {
208 decisionDelay = silk_min_int( decisionDelay, pitchL[ k ] - LTP_ORDER / 2 - 1 );
209 }
210 } else {
211 if( lag > 0 ) {
212 decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
213 }
214 }
215
216 if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
217 LSF_interpolation_flag = 0;
218 } else {
219 LSF_interpolation_flag = 1;
220 }
221
222 ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
223 ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
224 ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
225 ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
226 /* Set up pointers to start of sub frame */
227 pxq = &NSQ->xq[ psEncC->ltp_mem_length ];
228 NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
229 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
230 subfr = 0;
231 for( k = 0; k < psEncC->nb_subfr; k++ ) {
232 A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
233 B_Q14 = <PCoef_Q14[ k * LTP_ORDER ];
234 AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
235
236 /* Noise shape parameters */
237 silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
238 HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
239 HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
240
241 NSQ->rewhite_flag = 0;
242 if( psIndices->signalType == TYPE_VOICED ) {
243 /* Voiced */
244 lag = pitchL[ k ];
245
246 /* Re-whitening */
247 if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
248 if( k == 2 ) {
249 /* RESET DELAYED DECISIONS */
250 /* Find winner */
251 RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
252 Winner_ind = 0;
253 for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
254 if( psDelDec[ i ].RD_Q10 < RDmin_Q10 ) {
255 RDmin_Q10 = psDelDec[ i ].RD_Q10;
256 Winner_ind = i;
257 }
258 }
259 for( i = 0; i < psEncC->nStatesDelayedDecision; i++ ) {
260 if( i != Winner_ind ) {
261 psDelDec[ i ].RD_Q10 += ( silk_int32_MAX >> 4 );
262 silk_assert( psDelDec[ i ].RD_Q10 >= 0 );
263 }
264 }
265
266 /* Copy final part of signals from winner state to output and long-term filter states */
267 psDD = &psDelDec[ Winner_ind ];
268 last_smple_idx = smpl_buf_idx + decisionDelay;
269 for( i = 0; i < decisionDelay; i++ ) {
270 last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
271 if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
272 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
273 pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
274 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gains_Q16[ 1 ] ), 14 ) );
275 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
276 }
277
278 subfr = 0;
279 }
280
281 /* Rewhiten with new A coefs */
282 start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
283 celt_assert( start_idx > 0 );
284
285 silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
286 A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
287
288 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
289 NSQ->rewhite_flag = 1;
290 }
291 }
292
293 silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
294 psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
295
296 silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
297 delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
298 Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
299 psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
300
301 x16 += psEncC->subfr_length;
302 pulses += psEncC->subfr_length;
303 pxq += psEncC->subfr_length;
304 }
305
306 /* Find winner */
307 RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
308 Winner_ind = 0;
309 for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
310 if( psDelDec[ k ].RD_Q10 < RDmin_Q10 ) {
311 RDmin_Q10 = psDelDec[ k ].RD_Q10;
312 Winner_ind = k;
313 }
314 }
315
316 /* Copy final part of signals from winner state to output and long-term filter states */
317 psDD = &psDelDec[ Winner_ind ];
318 psIndices->Seed = psDD->SeedInit;
319 last_smple_idx = smpl_buf_idx + decisionDelay;
320 Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
321 for( i = 0; i < decisionDelay; i++ ) {
322 last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
323 if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
324
325 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
326 pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
327 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
328 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
329 }
330 silk_memcpy( NSQ->sLPC_Q14, &psDD->sLPC_Q14[ psEncC->subfr_length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
331 silk_memcpy( NSQ->sAR2_Q14, psDD->sAR2_Q14, sizeof( psDD->sAR2_Q14 ) );
332
333 /* Update states */
334 NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
335 NSQ->sDiff_shp_Q14 = psDD->Diff_Q14;
336 NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];
337
338 /* Save quantized speech signal */
339 silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
340 silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
341
342 #ifdef OPUS_CHECK_ASM
343 silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
344 silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
345 silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
346 #endif
347
348 RESTORE_STACK;
349 }
350
351 /******************************************/
352 /* Noise shape quantizer for one subframe */
353 /******************************************/
silk_noise_shape_quantizer_del_dec_sse4_1(silk_nsq_state * NSQ,NSQ_del_dec_struct psDelDec[],opus_int signalType,const opus_int32 x_Q10[],opus_int8 pulses[],opus_int16 xq[],opus_int32 sLTP_Q15[],opus_int32 delayedGain_Q10[],const opus_int16 a_Q12[],const opus_int16 b_Q14[],const opus_int16 AR_shp_Q13[],opus_int lag,opus_int32 HarmShapeFIRPacked_Q14,opus_int Tilt_Q14,opus_int32 LF_shp_Q14,opus_int32 Gain_Q16,opus_int Lambda_Q10,opus_int offset_Q10,opus_int length,opus_int subfr,opus_int shapingLPCOrder,opus_int predictLPCOrder,opus_int warping_Q16,opus_int nStatesDelayedDecision,opus_int * smpl_buf_idx,opus_int decisionDelay)354 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
355 silk_nsq_state *NSQ, /* I/O NSQ state */
356 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
357 opus_int signalType, /* I Signal type */
358 const opus_int32 x_Q10[], /* I */
359 opus_int8 pulses[], /* O */
360 opus_int16 xq[], /* O */
361 opus_int32 sLTP_Q15[], /* I/O LTP filter state */
362 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */
363 const opus_int16 a_Q12[], /* I Short term prediction coefs */
364 const opus_int16 b_Q14[], /* I Long term prediction coefs */
365 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */
366 opus_int lag, /* I Pitch lag */
367 opus_int32 HarmShapeFIRPacked_Q14, /* I */
368 opus_int Tilt_Q14, /* I Spectral tilt */
369 opus_int32 LF_shp_Q14, /* I */
370 opus_int32 Gain_Q16, /* I */
371 opus_int Lambda_Q10, /* I */
372 opus_int offset_Q10, /* I */
373 opus_int length, /* I Input length */
374 opus_int subfr, /* I Subframe number */
375 opus_int shapingLPCOrder, /* I Shaping LPC filter order */
376 opus_int predictLPCOrder, /* I Prediction filter order */
377 opus_int warping_Q16, /* I */
378 opus_int nStatesDelayedDecision, /* I Number of states in decision tree */
379 opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */
380 opus_int decisionDelay /* I */
381 )
382 {
383 opus_int i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
384 opus_int32 Winner_rand_state;
385 opus_int32 LTP_pred_Q14, LPC_pred_Q14, n_AR_Q14, n_LTP_Q14;
386 opus_int32 n_LF_Q14, r_Q10, rr_Q10, rd1_Q10, rd2_Q10, RDmin_Q10, RDmax_Q10;
387 opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
388 opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
389 opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
390 int rdo_offset;
391
392 VARDECL( NSQ_sample_pair, psSampleState );
393 NSQ_del_dec_struct *psDD;
394 NSQ_sample_struct *psSS;
395
396 __m128i a_Q12_0123, a_Q12_4567, a_Q12_89AB, a_Q12_CDEF;
397 __m128i b_Q12_0123, b_sr_Q12_0123;
398 SAVE_STACK;
399
400 celt_assert( nStatesDelayedDecision > 0 );
401 ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
402
403 rdo_offset = (Lambda_Q10 >> 1) - 512;
404
405 shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
406 pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
407 Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
408
409 a_Q12_0123 = OP_CVTEPI16_EPI32_M64( a_Q12 );
410 a_Q12_4567 = OP_CVTEPI16_EPI32_M64( a_Q12 + 4 );
411
412 if( opus_likely( predictLPCOrder == 16 ) ) {
413 a_Q12_89AB = OP_CVTEPI16_EPI32_M64( a_Q12 + 8 );
414 a_Q12_CDEF = OP_CVTEPI16_EPI32_M64( a_Q12 + 12 );
415 }
416
417 if( signalType == TYPE_VOICED ){
418 b_Q12_0123 = OP_CVTEPI16_EPI32_M64( b_Q14 );
419 b_sr_Q12_0123 = _mm_shuffle_epi32( b_Q12_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
420 }
421 for( i = 0; i < length; i++ ) {
422 /* Perform common calculations used in all states */
423
424 /* Long-term prediction */
425 if( signalType == TYPE_VOICED ) {
426 /* Unrolled loop */
427 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
428 LTP_pred_Q14 = 2;
429 {
430 __m128i tmpa, tmpb, pred_lag_ptr_tmp;
431 pred_lag_ptr_tmp = _mm_loadu_si128( (__m128i *)(void*)(&pred_lag_ptr[ -3 ] ) );
432 pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B );
433 tmpa = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_0123 );
434 tmpa = _mm_srli_si128( tmpa, 2 );
435
436 pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) );/* equal shift right 4 bytes */
437 pred_lag_ptr_tmp = _mm_mul_epi32( pred_lag_ptr_tmp, b_sr_Q12_0123 );
438 pred_lag_ptr_tmp = _mm_srli_si128( pred_lag_ptr_tmp, 2 );
439 pred_lag_ptr_tmp = _mm_add_epi32( pred_lag_ptr_tmp, tmpa );
440
441 tmpb = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 0, 3, 2 ) );/* equal shift right 8 bytes */
442 pred_lag_ptr_tmp = _mm_add_epi32( pred_lag_ptr_tmp, tmpb );
443 LTP_pred_Q14 += _mm_cvtsi128_si32( pred_lag_ptr_tmp );
444
445 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
446 LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 ); /* Q13 -> Q14 */
447 pred_lag_ptr++;
448 }
449 } else {
450 LTP_pred_Q14 = 0;
451 }
452
453 /* Long-term shaping */
454 if( lag > 0 ) {
455 /* Symmetric, packed FIR coefficients */
456 n_LTP_Q14 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
457 n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
458 n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */
459 shp_lag_ptr++;
460 } else {
461 n_LTP_Q14 = 0;
462 }
463 {
464 __m128i tmpa, tmpb, psLPC_Q14_tmp, a_Q12_tmp;
465
466 for( k = 0; k < nStatesDelayedDecision; k++ ) {
467 /* Delayed decision state */
468 psDD = &psDelDec[ k ];
469
470 /* Sample state */
471 psSS = psSampleState[ k ];
472
473 /* Generate dither */
474 psDD->Seed = silk_RAND( psDD->Seed );
475
476 /* Pointer used in short term prediction and shaping */
477 psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ];
478 /* Short-term prediction */
479 silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 );
480 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
481 LPC_pred_Q14 = silk_RSHIFT( predictLPCOrder, 1 );
482
483 tmpb = _mm_setzero_si128();
484
485 /* step 1 */
486 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */
487 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); /* 0, -1, -2, -3 */
488 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 ); /* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */
489
490 tmpa = _mm_srli_epi64( tmpa, 16 );
491 tmpb = _mm_add_epi32( tmpb, tmpa );
492
493 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
494 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_0123, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
495 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); /* 1*-1, 3*-3 */
496 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
497 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
498
499 /* step 2 */
500 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -7 ] ) );
501 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
502 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 );
503 tmpa = _mm_srli_epi64( tmpa, 16 );
504 tmpb = _mm_add_epi32( tmpb, tmpa );
505
506 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
507 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_4567, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
508 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
509 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
510 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
511
512 if ( opus_likely( predictLPCOrder == 16 ) )
513 {
514 /* step 3 */
515 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -11 ] ) );
516 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
517 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB );
518 tmpa = _mm_srli_epi64( tmpa, 16 );
519 tmpb = _mm_add_epi32( tmpb, tmpa );
520
521 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
522 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_89AB, _MM_SHUFFLE(0, 3, 2, 1 ) );/* equal shift right 4 bytes */
523 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
524 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
525 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
526
527 /* step 4 */
528 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -15 ] ) );
529 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
530 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
531 tmpa = _mm_srli_epi64( tmpa, 16 );
532 tmpb = _mm_add_epi32( tmpb, tmpa );
533
534 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
535 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_CDEF, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
536 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
537 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
538 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
539
540 /* add at last */
541 /* equal shift right 8 bytes*/
542 tmpa = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) );
543 tmpb = _mm_add_epi32( tmpb, tmpa );
544 LPC_pred_Q14 += _mm_cvtsi128_si32( tmpb );
545 }
546 else
547 {
548 /* add at last */
549 tmpa = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) ); /* equal shift right 8 bytes*/
550 tmpb = _mm_add_epi32( tmpb, tmpa );
551 LPC_pred_Q14 += _mm_cvtsi128_si32( tmpb );
552
553 LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -8 ], a_Q12[ 8 ] );
554 LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -9 ], a_Q12[ 9 ] );
555 }
556
557 LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
558
559 /* Noise shape feedback */
560 celt_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
561 /* Output of lowpass section */
562 tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 );
563 /* Output of allpass section */
564 tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ 1 ], tmp2), warping_Q16 );
565 psDD->sAR2_Q14[ 0 ] = tmp2;
566 n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 );
567 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] );
568 /* Loop over allpass sections */
569 for( j = 2; j < shapingLPCOrder; j += 2 ) {
570 /* Output of allpass section */
571 tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ j + 0 ], tmp1), warping_Q16 );
572 psDD->sAR2_Q14[ j - 1 ] = tmp1;
573 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] );
574 /* Output of allpass section */
575 tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ j + 1 ], tmp2), warping_Q16 );
576 psDD->sAR2_Q14[ j + 0 ] = tmp2;
577 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] );
578 }
579 psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1;
580 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ shapingLPCOrder - 1 ] );
581
582 n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 ); /* Q11 -> Q12 */
583 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 ); /* Q12 */
584 n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 2 ); /* Q12 -> Q14 */
585
586 n_LF_Q14 = silk_SMULWB( psDD->Shape_Q14[ *smpl_buf_idx ], LF_shp_Q14 ); /* Q12 */
587 n_LF_Q14 = silk_SMLAWT( n_LF_Q14, psDD->LF_AR_Q14, LF_shp_Q14 ); /* Q12 */
588 n_LF_Q14 = silk_LSHIFT( n_LF_Q14, 2 ); /* Q12 -> Q14 */
589
590 /* Input minus prediction plus noise feedback */
591 /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */
592 tmp1 = silk_ADD_SAT32( n_AR_Q14, n_LF_Q14 ); /* Q14 */
593 tmp2 = silk_ADD32_ovflw( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */
594 tmp1 = silk_SUB_SAT32( tmp2, tmp1 ); /* Q13 */
595 tmp1 = silk_RSHIFT_ROUND( tmp1, 4 ); /* Q10 */
596
597 r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 ); /* residual error Q10 */
598
599 /* Flip sign depending on dither */
600 if ( psDD->Seed < 0 ) {
601 r_Q10 = -r_Q10;
602 }
603 r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );
604
605 /* Find two quantization level candidates and measure their rate-distortion */
606 q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
607 q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
608 if (Lambda_Q10 > 2048) {
609 /* For aggressive RDO, the bias becomes more than one pulse. */
610 if (q1_Q10 > rdo_offset) {
611 q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
612 } else if (q1_Q10 < -rdo_offset) {
613 q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
614 } else if (q1_Q10 < 0) {
615 q1_Q0 = -1;
616 } else {
617 q1_Q0 = 0;
618 }
619 }
620 if( q1_Q0 > 0 ) {
621 q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
622 q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 );
623 q2_Q10 = silk_ADD32( q1_Q10, 1024 );
624 rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
625 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
626 } else if( q1_Q0 == 0 ) {
627 q1_Q10 = offset_Q10;
628 q2_Q10 = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
629 rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
630 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
631 } else if( q1_Q0 == -1 ) {
632 q2_Q10 = offset_Q10;
633 q1_Q10 = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
634 rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
635 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
636 } else { /* q1_Q0 < -1 */
637 q1_Q10 = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
638 q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 );
639 q2_Q10 = silk_ADD32( q1_Q10, 1024 );
640 rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
641 rd2_Q10 = silk_SMULBB( -q2_Q10, Lambda_Q10 );
642 }
643 rr_Q10 = silk_SUB32( r_Q10, q1_Q10 );
644 rd1_Q10 = silk_RSHIFT( silk_SMLABB( rd1_Q10, rr_Q10, rr_Q10 ), 10 );
645 rr_Q10 = silk_SUB32( r_Q10, q2_Q10 );
646 rd2_Q10 = silk_RSHIFT( silk_SMLABB( rd2_Q10, rr_Q10, rr_Q10 ), 10 );
647
648 if( rd1_Q10 < rd2_Q10 ) {
649 psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
650 psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
651 psSS[ 0 ].Q_Q10 = q1_Q10;
652 psSS[ 1 ].Q_Q10 = q2_Q10;
653 } else {
654 psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
655 psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
656 psSS[ 0 ].Q_Q10 = q2_Q10;
657 psSS[ 1 ].Q_Q10 = q1_Q10;
658 }
659
660 /* Update states for best quantization */
661
662 /* Quantized excitation */
663 exc_Q14 = silk_LSHIFT32( psSS[ 0 ].Q_Q10, 4 );
664 if ( psDD->Seed < 0 ) {
665 exc_Q14 = -exc_Q14;
666 }
667
668 /* Add predictions */
669 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
670 xq_Q14 = silk_ADD32_ovflw( LPC_exc_Q14, LPC_pred_Q14 );
671
672 /* Update states */
673 psSS[ 0 ].Diff_Q14 = silk_SUB32_ovflw( xq_Q14, silk_LSHIFT32( x_Q10[ i ], 4 ) );
674 sLF_AR_shp_Q14 = silk_SUB32_ovflw( psSS[ 0 ].Diff_Q14, n_AR_Q14 );
675 psSS[ 0 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
676 psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14;
677 psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14;
678 psSS[ 0 ].xq_Q14 = xq_Q14;
679
680 /* Update states for second best quantization */
681
682 /* Quantized excitation */
683 exc_Q14 = silk_LSHIFT32( psSS[ 1 ].Q_Q10, 4 );
684 if ( psDD->Seed < 0 ) {
685 exc_Q14 = -exc_Q14;
686 }
687
688 /* Add predictions */
689 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
690 xq_Q14 = silk_ADD32_ovflw( LPC_exc_Q14, LPC_pred_Q14 );
691
692 /* Update states */
693 psSS[ 1 ].Diff_Q14 = silk_SUB32_ovflw( xq_Q14, silk_LSHIFT32( x_Q10[ i ], 4 ) );
694 sLF_AR_shp_Q14 = silk_SUB32_ovflw( psSS[ 1 ].Diff_Q14, n_AR_Q14 );
695 psSS[ 1 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
696 psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14;
697 psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14;
698 psSS[ 1 ].xq_Q14 = xq_Q14;
699 }
700 }
701 *smpl_buf_idx = ( *smpl_buf_idx - 1 ) % DECISION_DELAY;
702 if( *smpl_buf_idx < 0 ) *smpl_buf_idx += DECISION_DELAY;
703 last_smple_idx = ( *smpl_buf_idx + decisionDelay ) % DECISION_DELAY;
704
705 /* Find winner */
706 RDmin_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10;
707 Winner_ind = 0;
708 for( k = 1; k < nStatesDelayedDecision; k++ ) {
709 if( psSampleState[ k ][ 0 ].RD_Q10 < RDmin_Q10 ) {
710 RDmin_Q10 = psSampleState[ k ][ 0 ].RD_Q10;
711 Winner_ind = k;
712 }
713 }
714
715 /* Increase RD values of expired states */
716 Winner_rand_state = psDelDec[ Winner_ind ].RandState[ last_smple_idx ];
717 for( k = 0; k < nStatesDelayedDecision; k++ ) {
718 if( psDelDec[ k ].RandState[ last_smple_idx ] != Winner_rand_state ) {
719 psSampleState[ k ][ 0 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 0 ].RD_Q10, silk_int32_MAX >> 4 );
720 psSampleState[ k ][ 1 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 1 ].RD_Q10, silk_int32_MAX >> 4 );
721 silk_assert( psSampleState[ k ][ 0 ].RD_Q10 >= 0 );
722 }
723 }
724
725 /* Find worst in first set and best in second set */
726 RDmax_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10;
727 RDmin_Q10 = psSampleState[ 0 ][ 1 ].RD_Q10;
728 RDmax_ind = 0;
729 RDmin_ind = 0;
730 for( k = 1; k < nStatesDelayedDecision; k++ ) {
731 /* find worst in first set */
732 if( psSampleState[ k ][ 0 ].RD_Q10 > RDmax_Q10 ) {
733 RDmax_Q10 = psSampleState[ k ][ 0 ].RD_Q10;
734 RDmax_ind = k;
735 }
736 /* find best in second set */
737 if( psSampleState[ k ][ 1 ].RD_Q10 < RDmin_Q10 ) {
738 RDmin_Q10 = psSampleState[ k ][ 1 ].RD_Q10;
739 RDmin_ind = k;
740 }
741 }
742
743 /* Replace a state if best from second set outperforms worst in first set */
744 if( RDmin_Q10 < RDmax_Q10 ) {
745 silk_memcpy( ( (opus_int32 *)&psDelDec[ RDmax_ind ] ) + i,
746 ( (opus_int32 *)&psDelDec[ RDmin_ind ] ) + i, sizeof( NSQ_del_dec_struct ) - i * sizeof( opus_int32) );
747 silk_memcpy( &psSampleState[ RDmax_ind ][ 0 ], &psSampleState[ RDmin_ind ][ 1 ], sizeof( NSQ_sample_struct ) );
748 }
749
750 /* Write samples from winner to output and long-term filter states */
751 psDD = &psDelDec[ Winner_ind ];
752 if( subfr > 0 || i >= decisionDelay ) {
753 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
754 xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
755 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
756 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDD->Shape_Q14[ last_smple_idx ];
757 sLTP_Q15[ NSQ->sLTP_buf_idx - decisionDelay ] = psDD->Pred_Q15[ last_smple_idx ];
758 }
759 NSQ->sLTP_shp_buf_idx++;
760 NSQ->sLTP_buf_idx++;
761
762 /* Update states */
763 for( k = 0; k < nStatesDelayedDecision; k++ ) {
764 psDD = &psDelDec[ k ];
765 psSS = &psSampleState[ k ][ 0 ];
766 psDD->LF_AR_Q14 = psSS->LF_AR_Q14;
767 psDD->Diff_Q14 = psSS->Diff_Q14;
768 psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
769 psDD->Xq_Q14[ *smpl_buf_idx ] = psSS->xq_Q14;
770 psDD->Q_Q10[ *smpl_buf_idx ] = psSS->Q_Q10;
771 psDD->Pred_Q15[ *smpl_buf_idx ] = silk_LSHIFT32( psSS->LPC_exc_Q14, 1 );
772 psDD->Shape_Q14[ *smpl_buf_idx ] = psSS->sLTP_shp_Q14;
773 psDD->Seed = silk_ADD32_ovflw( psDD->Seed, silk_RSHIFT_ROUND( psSS->Q_Q10, 10 ) );
774 psDD->RandState[ *smpl_buf_idx ] = psDD->Seed;
775 psDD->RD_Q10 = psSS->RD_Q10;
776 }
777 delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10;
778 }
779 /* Update LPC states */
780 for( k = 0; k < nStatesDelayedDecision; k++ ) {
781 psDD = &psDelDec[ k ];
782 silk_memcpy( psDD->sLPC_Q14, &psDD->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
783 }
784 RESTORE_STACK;
785 }
786
silk_nsq_del_dec_scale_states_sse4_1(const silk_encoder_state * psEncC,silk_nsq_state * NSQ,NSQ_del_dec_struct psDelDec[],const opus_int16 x16[],opus_int32 x_sc_Q10[],const opus_int16 sLTP[],opus_int32 sLTP_Q15[],opus_int subfr,opus_int nStatesDelayedDecision,const opus_int LTP_scale_Q14,const opus_int32 Gains_Q16[MAX_NB_SUBFR],const opus_int pitchL[MAX_NB_SUBFR],const opus_int signal_type,const opus_int decisionDelay)787 static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
788 const silk_encoder_state *psEncC, /* I Encoder State */
789 silk_nsq_state *NSQ, /* I/O NSQ state */
790 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
791 const opus_int16 x16[], /* I Input */
792 opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
793 const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
794 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
795 opus_int subfr, /* I Subframe number */
796 opus_int nStatesDelayedDecision, /* I Number of del dec states */
797 const opus_int LTP_scale_Q14, /* I LTP state scaling */
798 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
799 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
800 const opus_int signal_type, /* I Signal type */
801 const opus_int decisionDelay /* I Decision delay */
802 )
803 {
804 opus_int i, k, lag;
805 opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
806 NSQ_del_dec_struct *psDD;
807 __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
808
809 lag = pitchL[ subfr ];
810 inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
811 silk_assert( inv_gain_Q31 != 0 );
812
813 /* Scale input */
814 inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
815
816 /* prepare inv_gain_Q26 in packed 4 32-bits */
817 xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
818
819 for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
820 xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
821
822 /* equal shift right 4 bytes*/
823 xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
824
825 xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
826 xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
827
828 xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
829 xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
830
831 xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
832
833 _mm_storeu_si128( (__m128i *)(void*)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
834 }
835
836 for( ; i < psEncC->subfr_length; i++ ) {
837 x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
838 }
839
840 /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
841 if( NSQ->rewhite_flag ) {
842 if( subfr == 0 ) {
843 /* Do LTP downscaling */
844 inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
845 }
846 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
847 silk_assert( i < MAX_FRAME_LENGTH );
848 sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
849 }
850 }
851
852 /* Adjust for changing gain */
853 if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
854 gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
855
856 /* Scale long-term shaping state */
857 {
858 __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
859
860 /* prepare gain_adj_Q16 in packed 4 32-bits */
861 xmm_gain_adj_Q16 = _mm_set1_epi32( gain_adj_Q16 );
862
863 for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
864 {
865 xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
866 /* equal shift right 4 bytes*/
867 xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
868
869 xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
870 xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );
871
872 xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
873 xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );
874
875 xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );
876
877 _mm_storeu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
878 }
879
880 for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
881 NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
882 }
883
884 /* Scale long-term prediction state */
885 if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
886 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
887 sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
888 }
889 }
890
891 for( k = 0; k < nStatesDelayedDecision; k++ ) {
892 psDD = &psDelDec[ k ];
893
894 /* Scale scalar states */
895 psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );
896 psDD->Diff_Q14 = silk_SMULWW( gain_adj_Q16, psDD->Diff_Q14 );
897
898 /* Scale short-term prediction and shaping states */
899 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
900 psDD->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sLPC_Q14[ i ] );
901 }
902 for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
903 psDD->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sAR2_Q14[ i ] );
904 }
905 for( i = 0; i < DECISION_DELAY; i++ ) {
906 psDD->Pred_Q15[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Pred_Q15[ i ] );
907 psDD->Shape_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Shape_Q14[ i ] );
908 }
909 }
910 }
911
912 /* Save inverse gain */
913 NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
914 }
915 }
916