/***********************************************************************
Copyright (c) 2021 Google Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef OPUS_CHECK_ASM
#include <string.h>
#endif

#include "opus_defines.h"
#include <immintrin.h>

#include "main.h"
#include "stack_alloc.h"
#include "NSQ.h"
#include "celt/x86/x86cpu.h"

/* Returns TRUE if all assumptions are met */
static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC)
{
    /* This optimization is based on the assumptions below.   */
    /* They are fundamental, hence asserts are used. Should   */
    /* any assert trigger, all related code must be revisited */
    /* to make sure it still functions the same as the C      */
    /* implementation.                                        */
    silk_assert(MAX_DEL_DEC_STATES   <= 4     &&
                MAX_FRAME_LENGTH     % 4 == 0 &&
                MAX_SUB_FRAME_LENGTH % 4 == 0 &&
                LTP_MEM_LENGTH_MS    % 4 == 0);
    silk_assert(psEncC->fs_kHz == 8  ||
                psEncC->fs_kHz == 12 ||
                psEncC->fs_kHz == 16);
    silk_assert(psEncC->nb_subfr <= MAX_NB_SUBFR &&
                psEncC->nb_subfr > 0);
    silk_assert(psEncC->nStatesDelayedDecision <= MAX_DEL_DEC_STATES &&
                psEncC->nStatesDelayedDecision > 0);
    silk_assert(psEncC->ltp_mem_length == psEncC->fs_kHz * LTP_MEM_LENGTH_MS);

    /* Regressions were observed on certain AMD Zen CPUs when      */
    /* nStatesDelayedDecision is 1 or 2. Ideally we should detect  */
    /* these CPUs and enable this optimization on the others;      */
    /* however, there is no good way to do so under the current    */
    /* Opus framework.                                             */
    return psEncC->nStatesDelayedDecision == 3 ||
           psEncC->nStatesDelayedDecision == 4;
}

/* Intrinsics not defined on MSVC */
#ifdef _MSC_VER
#include <Intsafe.h>
#define __m128i_u __m128i
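/* Signed overflow occurred iff a and b have the same sign while the sum's
   sign differs; (sum ^ a) & (sum ^ b) then has its sign bit set, matching
   the semantics of GCC/Clang's __builtin_sadd_overflow(). */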
static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res)
{
    *res = a + b;
    return (*res ^ a) & (*res ^ b) & 0x80000000;
}
static inline int __builtin_ctz(unsigned int x)
{
    DWORD res = 0;
    return _BitScanForward(&res, x) ? res : 32;
}
#endif

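/* Pack the high 32 bits of each 64-bit lane into a __m128i (a per-lane
   arithmetic shift right by 32); the permute indices 1, 3, 5, 7 select the
   odd dwords. Used to finish the (widen, multiply, shift) sequences below
   that emulate silk_SMULWW()/silk_SMULWB() on four states at once. */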
static OPUS_INLINE __m128i silk_cvtepi64_epi32_high(__m256i num)
{
    return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(num, _mm256_set_epi32(0, 0, 0, 0, 7, 5, 3, 1)));
}

static OPUS_INLINE opus_int16 silk_sat16(opus_int32 num)
{
    num = num > silk_int16_MAX ? silk_int16_MAX : num;
    num = num < silk_int16_MIN ? silk_int16_MIN : num;
    return num;
}

static OPUS_INLINE opus_int32 silk_sar_round_32(opus_int32 a, int bits)
{
    silk_assert(bits > 0 && bits < 31);
    a += 1 << (bits - 1);
    return a >> bits;
}

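/* Rounded (a * b) >> (16 + bits), i.e. silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits)
   fused into one 64-bit multiply so no intermediate truncation occurs. */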
static OPUS_INLINE opus_int64 silk_sar_round_smulww(opus_int32 a, opus_int32 b, int bits)
{
    silk_assert(bits > 0 && bits < 63);
#ifdef OPUS_CHECK_ASM
    return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits);
#else
    /* More correct than the C code above: in some rare cases it does not overflow where silk_SMULWW() would. */
    opus_int64 t = ((opus_int64)a) * ((opus_int64)b);
    bits += 16;
    t += 1ull << (bits - 1);
    return t >> bits;
#endif
}


static OPUS_INLINE opus_int32 silk_add_sat32(opus_int32 a, opus_int32 b)
{
    opus_int32 sum;
    if (__builtin_sadd_overflow(a, b, &sum))
    {
        return a >= 0 ? silk_int32_MAX : silk_int32_MIN;
    }
    return sum;
}

static OPUS_INLINE __m128i silk_mm_srai_round_epi32(__m128i a, int bits)
{
    silk_assert(bits > 0 && bits < 31);
    return _mm_srai_epi32(_mm_add_epi32(a, _mm_set1_epi32(1 << (bits - 1))), bits);
}

/* add/subtract with output saturated */
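/* OF has its sign bit set in lanes that overflowed; for those lanes the
   result is replaced by SAT, which is INT32_MAX when a >= 0 and INT32_MIN
   when a < 0 (the only directions in which the sum can overflow). */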
static OPUS_INLINE __m128i silk_mm_add_sat_epi32(__m128i a, __m128i b)
{
    __m128i r = _mm_add_epi32(a, b);
    __m128i OF = _mm_and_si128(_mm_xor_si128(a, r), _mm_xor_si128(b, r));           /* OF = (sum ^ a) & (sum ^ b) */
    __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */
    return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31));
}
static OPUS_INLINE __m128i silk_mm_sub_sat_epi32(__m128i a, __m128i b)
{
    __m128i r = _mm_sub_epi32(a, b);
    __m128i OF = _mm_andnot_si128(_mm_xor_si128(b, r), _mm_xor_si128(a, r));        /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */
    __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */
    return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31));
}
static OPUS_INLINE __m256i silk_mm256_sub_sat_epi32(__m256i a, __m256i b)
{
    __m256i r = _mm256_sub_epi32(a, b);
    __m256i OF = _mm256_andnot_si256(_mm256_xor_si256(b, r), _mm256_xor_si256(a, r));     /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */
    __m256i SAT = _mm256_add_epi32(_mm256_srli_epi32(a, 31), _mm256_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */
    return _mm256_blendv_epi8(r, SAT, _mm256_srai_epi32(OF, 31));
}

static OPUS_INLINE __m128i silk_mm_limit_epi32(__m128i num, opus_int32 limit1, opus_int32 limit2)
{
    opus_int32 lo = limit1 < limit2 ? limit1 : limit2;
    opus_int32 hi = limit1 > limit2 ? limit1 : limit2;

    num = _mm_min_epi32(num, _mm_set1_epi32(hi));
    num = _mm_max_epi32(num, _mm_set1_epi32(lo));
    return num;
}

/* cond < 0 ? -num : num */
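/* Note: _mm_sign_epi32() zeroes lanes where its second operand is 0, so
   cond is OR'ed with 1 to make a zero cond behave like a positive one. */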
static OPUS_INLINE __m128i silk_mm_sign_epi32(__m128i num, __m128i cond)
{
    return _mm_sign_epi32(num, _mm_or_si128(cond, _mm_set1_epi32(1)));
}
static OPUS_INLINE __m256i silk_mm256_sign_epi32(__m256i num, __m256i cond)
{
    return _mm256_sign_epi32(num, _mm256_or_si256(cond, _mm256_set1_epi32(1)));
}

/* (a32 * b32) >> 16 */
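/* Each lane is widened to 64 bits, multiplied, shifted left by 16, and the
   high dword is kept: (a * b << 16) >> 32 == (a * b) >> 16, assuming the
   result fits in 32 bits. */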
static OPUS_INLINE __m128i silk_mm_smulww_epi32(__m128i a, opus_int32 b)
{
    return silk_cvtepi64_epi32_high(_mm256_slli_epi64(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(b)), 16));
}

/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16; the result must fit in 32 bits */
static OPUS_INLINE __m128i silk_mm_smulwb_epi32(__m128i a, opus_int32 b)
{
    return silk_cvtepi64_epi32_high(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(silk_LSHIFT(b, 16))));
}

/* (opus_int32)((opus_int16)(a32)) * (opus_int32)((opus_int16)(b32)); the result must fit in 32 bits */
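/* _mm256_mullo_epi16()/_mm256_mulhi_epi16() produce the low/high 16 bits of
   each 16-bit product; the shuffle keeps the results for the low 16-bit half
   of every 32-bit lane and the unpack recombines them into 32-bit products. */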
static OPUS_INLINE __m256i silk_mm256_smulbb_epi32(__m256i a, __m256i b)
{
    const char FF = (char)0xFF;
    __m256i msk = _mm256_set_epi8(
        FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0,
        FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0);
    __m256i lo = _mm256_mullo_epi16(a, b);
    __m256i hi = _mm256_mulhi_epi16(a, b);
    lo = _mm256_shuffle_epi8(lo, msk);
    hi = _mm256_shuffle_epi8(hi, msk);
    return _mm256_unpacklo_epi16(lo, hi);
}

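/* Reverse the eight 32-bit lanes: the shuffle reverses within each 128-bit
   half, then the permute swaps the two halves. */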
static OPUS_INLINE __m256i silk_mm256_reverse_epi32(__m256i v)
{
    v = _mm256_shuffle_epi32(v, 0x1B);
    v = _mm256_permute4x64_epi64(v, 0x4E);
    return v;
}

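/* Horizontal sum of the eight 32-bit lanes: fold the two 128-bit halves
   together, then reduce the remaining four lanes with two swap-and-add steps. */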
static OPUS_INLINE opus_int32 silk_mm256_hsum_epi32(__m256i v)
{
    __m128i sum = _mm_add_epi32(_mm256_extracti128_si256(v, 1), _mm256_extracti128_si256(v, 0));
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E));
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1));
    return _mm_cvtsi128_si32(sum);
}

static OPUS_INLINE __m128i silk_mm_hmin_epi32(__m128i num)
{
    num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2301 */
    num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */
    return num;
}

static OPUS_INLINE __m128i silk_mm_hmax_epi32(__m128i num)
{
    num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2301 */
    num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */
    return num;
}

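/* Horizontal min/max over the active lanes only: lanes flagged by mask are
   first forced to INT32_MAX (resp. INT32_MIN) so they can never win. */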
static OPUS_INLINE __m128i silk_mm_mask_hmin_epi32(__m128i num, __m128i mask)
{
    num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MAX), mask);
    return silk_mm_hmin_epi32(num);
}

static OPUS_INLINE __m128i silk_mm_mask_hmax_epi32(__m128i num, __m128i mask)
{
    num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MIN), mask);
    return silk_mm_hmax_epi32(num);
}

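/* Vectorized silk_RAND(): advances four LCG states at once,
   seed = seed * RAND_MULTIPLIER + RAND_INCREMENT. */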
static OPUS_INLINE __m128i silk_mm256_rand_epi32(__m128i seed)
{
    seed = _mm_mullo_epi32(seed, _mm_set1_epi32(RAND_MULTIPLIER));
    seed = _mm_add_epi32(seed, _mm_set1_epi32(RAND_INCREMENT));
    return seed;
}

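/* _mm_movemask_epi8() yields one bit per byte; masking with 0x1111 keeps a
   single bit per 32-bit lane, so the count of trailing zeros divided by 4
   is the index of the first lane where a == b. */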
static OPUS_INLINE opus_int32 silk_index_of_first_equal_epi32(__m128i a, __m128i b)
{
    unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) & 0x1111;
    silk_assert(mask != 0);
    return __builtin_ctz(mask) >> 2;
}

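/* Build a _mm_shuffle_epi8() selector that replicates the four bytes of
   lane `index` into every lane; silk_select_winner() then uses it to
   broadcast that state's 32-bit value and extract it from lane 0. */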
static __m128i silk_index_to_selector(opus_int32 index)
{
    silk_assert(index < 4);
    index <<= 2;
    return _mm_set_epi8(
        index + 3, index + 2, index + 1, index + 0,
        index + 3, index + 2, index + 1, index + 0,
        index + 3, index + 2, index + 1, index + 0,
        index + 3, index + 2, index + 1, index + 0);
}

static opus_int32 silk_select_winner(__m128i num, __m128i selector)
{
    return _mm_cvtsi128_si32(_mm_shuffle_epi8(num, selector));
}

typedef struct
{
    __m128i RandState;
    __m128i Q_Q10;
    __m128i Xq_Q14;
    __m128i Pred_Q15;
    __m128i Shape_Q14;
} NSQ_del_dec_sample_struct;

typedef struct
{
    __m128i sLPC_Q14[MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH];
    __m128i LF_AR_Q14;
    __m128i Seed;
    __m128i SeedInit;
    __m128i RD_Q10;
    __m128i Diff_Q14;
    __m128i sAR2_Q14[MAX_SHAPE_LPC_ORDER];
    NSQ_del_dec_sample_struct Samples[DECISION_DELAY];
} NSQ_del_dec_struct;

static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
    const silk_encoder_state *psEncC,               /* I    Encoder State                           */
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                               */
    NSQ_del_dec_struct *psDelDec,                   /* I/O  Delayed decision states                 */
    const opus_int16 x16[],                         /* I    Input                                   */
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH],      /* O    Input scaled with 1/Gain in Q10         */
    const opus_int16 sLTP[],                        /* I    Re-whitened LTP state in Q0             */
    opus_int32 sLTP_Q15[],                          /* O    LTP state matching scaled input         */
    opus_int subfr,                                 /* I    Subframe number                         */
    const opus_int LTP_scale_Q14,                   /* I    LTP state scaling                       */
    const opus_int32 Gains_Q16[MAX_NB_SUBFR],       /* I                                            */
    const opus_int pitchL[MAX_NB_SUBFR],            /* I    Pitch lag                               */
    const opus_int signal_type,                     /* I    Signal type                             */
    const opus_int decisionDelay                    /* I    Decision delay                          */
);

/*******************************************/
/* LPC analysis filter                     */
/* NB! State is kept internally and the    */
/* filter always starts with zero state    */
/* first `order` output samples are zeroed */
/*******************************************/
static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
    opus_int16 *out,                                /* O    Output signal                           */
    const opus_int16 *in,                           /* I    Input signal                            */
    const opus_int16 *B,                            /* I    MA prediction coefficients, Q12 [order] */
    const opus_int32 len,                           /* I    Signal length                           */
    const opus_int32 order                          /* I    Filter order                            */
);

/******************************************/
/* Noise shape quantizer for one subframe */
/******************************************/
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                               */
    NSQ_del_dec_struct psDelDec[],                  /* I/O  Delayed decision states                 */
    opus_int signalType,                            /* I    Signal type                             */
    const opus_int32 x_Q10[],                       /* I                                            */
    opus_int8 pulses[],                             /* O                                            */
    opus_int16 xq[],                                /* O                                            */
    opus_int32 sLTP_Q15[],                          /* I/O  LTP filter state                        */
    opus_int32 delayedGain_Q10[DECISION_DELAY],     /* I/O  Gain delay buffer                       */
    const opus_int16 a_Q12[],                       /* I    Short term prediction coefs             */
    const opus_int16 b_Q14[],                       /* I    Long term prediction coefs              */
    const opus_int16 AR_shp_Q13[],                  /* I    Noise shaping coefs                     */
    opus_int lag,                                   /* I    Pitch lag                               */
    opus_int32 HarmShapeFIRPacked_Q14,              /* I                                            */
    opus_int Tilt_Q14,                              /* I    Spectral tilt                           */
    opus_int32 LF_shp_Q14,                          /* I                                            */
    opus_int32 Gain_Q16,                            /* I                                            */
    opus_int Lambda_Q10,                            /* I                                            */
    opus_int offset_Q10,                            /* I                                            */
    opus_int length,                                /* I    Input length                            */
    opus_int subfr,                                 /* I    Subframe number                         */
    opus_int shapingLPCOrder,                       /* I    Shaping LPC filter order                */
    opus_int predictLPCOrder,                       /* I    Prediction filter order                 */
    opus_int warping_Q16,                           /* I                                            */
    __m128i MaskDelDec,                             /* I    Mask of states in decision tree         */
    opus_int *smpl_buf_idx,                         /* I/O  Index to newest samples in buffers      */
    opus_int decisionDelay                          /* I                                            */
);

void silk_NSQ_del_dec_avx2(
    const silk_encoder_state *psEncC,               /* I    Encoder State                           */
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                               */
    SideInfoIndices *psIndices,                     /* I/O  Quantization Indices                    */
    const opus_int16 x16[],                         /* I    Input                                   */
    opus_int8 pulses[],                             /* O    Quantized pulse signal                  */
    const opus_int16 *PredCoef_Q12,                 /* I    Short term prediction coefs             */
    const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],      /* I   Long term prediction coefs  */
    const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I   Noise shaping coefs         */
    const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR], /* I    Long term shaping coefs                 */
    const opus_int Tilt_Q14[MAX_NB_SUBFR],          /* I    Spectral tilt                           */
    const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],      /* I    Low frequency shaping coefs             */
    const opus_int32 Gains_Q16[MAX_NB_SUBFR],       /* I    Quantization step sizes                 */
    const opus_int32 pitchL[MAX_NB_SUBFR],          /* I    Pitch lags                              */
    const opus_int Lambda_Q10,                      /* I    Rate/distortion tradeoff                */
    const opus_int LTP_scale_Q14                    /* I    LTP state scaling                       */
)
{
#ifdef OPUS_CHECK_ASM
    silk_nsq_state NSQ_c;
    SideInfoIndices psIndices_c;
    opus_int8 pulses_c[MAX_FRAME_LENGTH];
    const opus_int8 *const pulses_a = pulses;

    silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c));
    silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c));
    silk_memcpy(pulses_c, pulses, sizeof(pulses_c));
    silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
                       pitchL, Lambda_Q10, LTP_scale_Q14);
#endif

    if (!verify_assumptions(psEncC))
    {
        silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14);
        return;
    }

    opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
    opus_int last_smple_idx, smpl_buf_idx, decisionDelay;
    const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
    opus_int16 *pxq;
    VARDECL(opus_int32, sLTP_Q15);
    VARDECL(opus_int16, sLTP);
    opus_int32 HarmShapeFIRPacked_Q14;
    opus_int offset_Q10;
    opus_int32 Gain_Q10;
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH];
    opus_int32 delayedGain_Q10[DECISION_DELAY];
    NSQ_del_dec_struct psDelDec = {0};
    NSQ_del_dec_sample_struct *psSample;
    __m128i RDmin_Q10, MaskDelDec, Winner_selector;
    SAVE_STACK;

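    /* Build the lane mask of unused delayed-decision states: for n active
       states, 0xFFFFFF00ul << ((n - 1) * 8) leaves the low n bytes at zero;
       sign-extending each byte to 32 bits then yields all-ones lanes exactly
       for the inactive states, which the masked min/max helpers ignore. */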
    MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3)));

    /* Set unvoiced lag to the previous one, overwrite later for voiced */
    lag = NSQ->lagPrev;

    silk_assert(NSQ->prev_gain_Q16 != 0);
    psDelDec.Seed = _mm_and_si128(
        _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)),
        _mm_set1_epi32(3));
    psDelDec.SeedInit = psDelDec.Seed;
    psDelDec.RD_Q10 = _mm_setzero_si128();
    psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14);
    psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14);
    psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]);
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
    {
        psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]);
    }
    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
    {
        psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]);
    }

    offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType];
    smpl_buf_idx = 0; /* index of oldest samples */

    decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length);

    /* For voiced frames limit the decision delay to lower than the pitch lag */
    if (psIndices->signalType == TYPE_VOICED)
    {
        for (k = 0; k < psEncC->nb_subfr; k++)
        {
            decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1);
        }
    }
    else
    {
        if (lag > 0)
        {
            decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1);
        }
    }

    if (psIndices->NLSFInterpCoef_Q2 == 4)
    {
        LSF_interpolation_flag = 0;
    }
    else
    {
        LSF_interpolation_flag = 1;
    }

    ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32);
    ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16);
    /* Set up pointers to start of sub frame */
    pxq = &NSQ->xq[psEncC->ltp_mem_length];
    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
    NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
    subfr = 0;
    for (k = 0; k < psEncC->nb_subfr; k++)
    {
        A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER];
        B_Q14 = &LTPCoef_Q14[k * LTP_ORDER];
        AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER];

        /* Noise shape parameters */
        silk_assert(HarmShapeGain_Q14[k] >= 0);
        HarmShapeFIRPacked_Q14 = silk_RSHIFT(HarmShapeGain_Q14[k], 2);
        HarmShapeFIRPacked_Q14 |= silk_LSHIFT((opus_int32)silk_RSHIFT(HarmShapeGain_Q14[k], 1), 16);

        NSQ->rewhite_flag = 0;
        if (psIndices->signalType == TYPE_VOICED)
        {
            /* Voiced */
            lag = pitchL[k];

            /* Re-whitening */
            if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0)
            {
                if (k == 2)
                {
                    /* RESET DELAYED DECISIONS */
                    /* Find winner */
                    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
                    Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10);
                    Winner_selector = silk_index_to_selector(Winner_ind);
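                    /* Collapse the decision tree onto the winner: every state
                       except Winner_ind gets a large RD penalty, so subsequent
                       searches effectively restart from the winning state. */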
                    psDelDec.RD_Q10 = _mm_add_epi32(
                        psDelDec.RD_Q10,
                        _mm_blendv_epi8(
                            _mm_set1_epi32(silk_int32_MAX >> 4),
                            _mm_setzero_si128(),
                            _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3)))));

                    /* Copy final part of signals from winner state to output and long-term filter states */
                    last_smple_idx = smpl_buf_idx + decisionDelay;
                    for (i = 0; i < decisionDelay; i++)
                    {
                        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
                        psSample = &psDelDec.Samples[last_smple_idx];
                        pulses[i - decisionDelay] =
                            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
                        pxq[i - decisionDelay] =
                            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14));
                        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
                            silk_select_winner(psSample->Shape_Q14, Winner_selector);
                    }

                    subfr = 0;
                }

                /* Rewhiten with new A coefs */
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
                silk_assert(start_idx > 0);

                silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length],
                                              A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder);

                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
                NSQ->rewhite_flag = 1;
            }
        }

        silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
                                           LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay);

        silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
                                                delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k],
                                                Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
                                                psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay);

        x16 += psEncC->subfr_length;
        pulses += psEncC->subfr_length;
        pxq += psEncC->subfr_length;
    }

    /* Find winner */
    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
    Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10));

    /* Copy final part of signals from winner state to output and long-term filter states */
    psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector);
    last_smple_idx = smpl_buf_idx + decisionDelay;
    Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6;
    for (i = 0; i < decisionDelay; i++)
    {
        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
        psSample = &psDelDec.Samples[last_smple_idx];

        pulses[i - decisionDelay] =
            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
        pxq[i - decisionDelay] =
            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8));
        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
            silk_select_winner(psSample->Shape_Q14, Winner_selector);
    }
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
    {
        NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector);
    }
    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
    {
        NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector);
    }

    /* Update states */
    NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector);
    NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector);
    NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1];

    /* Save quantized speech signal */
    silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16));
    silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32));

#ifdef OPUS_CHECK_ASM
    silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c)));
    silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c)));
    silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c)));
#endif

    RESTORE_STACK;
}

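/* Short-term LPC prediction for all four states at once. Each coefficient is
   pre-shifted left by 16 so the widening multiply leaves (buf * coef) >> 16
   in the high dword of each 64-bit lane; accumulating with 32-bit adds keeps
   the per-term truncation identical to the scalar silk_SMLAWB() chain. */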
static OPUS_INLINE __m128i silk_noise_shape_quantizer_short_prediction_x4(const __m128i *buf32, const opus_int16 *coef16, opus_int order)
{
    __m256i out;
    silk_assert(order == 10 || order == 16);

    /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    out = _mm256_set1_epi32(order >> 1);
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-0]), _mm256_set1_epi32(silk_LSHIFT(coef16[0], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-1]), _mm256_set1_epi32(silk_LSHIFT(coef16[1], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-2]), _mm256_set1_epi32(silk_LSHIFT(coef16[2], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-3]), _mm256_set1_epi32(silk_LSHIFT(coef16[3], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-4]), _mm256_set1_epi32(silk_LSHIFT(coef16[4], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-5]), _mm256_set1_epi32(silk_LSHIFT(coef16[5], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-6]), _mm256_set1_epi32(silk_LSHIFT(coef16[6], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-7]), _mm256_set1_epi32(silk_LSHIFT(coef16[7], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-8]), _mm256_set1_epi32(silk_LSHIFT(coef16[8], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-9]), _mm256_set1_epi32(silk_LSHIFT(coef16[9], 16)))); /* High DWORD */

    if (order == 16)
    {
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-10]), _mm256_set1_epi32(silk_LSHIFT(coef16[10], 16)))); /* High DWORD */
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-11]), _mm256_set1_epi32(silk_LSHIFT(coef16[11], 16)))); /* High DWORD */
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-12]), _mm256_set1_epi32(silk_LSHIFT(coef16[12], 16)))); /* High DWORD */
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-13]), _mm256_set1_epi32(silk_LSHIFT(coef16[13], 16)))); /* High DWORD */
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-14]), _mm256_set1_epi32(silk_LSHIFT(coef16[14], 16)))); /* High DWORD */
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-15]), _mm256_set1_epi32(silk_LSHIFT(coef16[15], 16)))); /* High DWORD */
    }
    return silk_cvtepi64_epi32_high(out);
}

/******************************************/
/* Noise shape quantizer for one subframe */
/******************************************/
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                               */
    NSQ_del_dec_struct *psDelDec,                   /* I/O  Delayed decision states                 */
    opus_int signalType,                            /* I    Signal type                             */
    const opus_int32 x_Q10[],                       /* I                                            */
    opus_int8 pulses[],                             /* O                                            */
    opus_int16 xq[],                                /* O                                            */
    opus_int32 sLTP_Q15[],                          /* I/O  LTP filter state                        */
    opus_int32 delayedGain_Q10[DECISION_DELAY],     /* I/O  Gain delay buffer                       */
    const opus_int16 a_Q12[],                       /* I    Short term prediction coefs             */
    const opus_int16 b_Q14[],                       /* I    Long term prediction coefs              */
    const opus_int16 AR_shp_Q13[],                  /* I    Noise shaping coefs                     */
    opus_int lag,                                   /* I    Pitch lag                               */
    opus_int32 HarmShapeFIRPacked_Q14,              /* I                                            */
    opus_int Tilt_Q14,                              /* I    Spectral tilt                           */
    opus_int32 LF_shp_Q14,                          /* I                                            */
    opus_int32 Gain_Q16,                            /* I                                            */
    opus_int Lambda_Q10,                            /* I                                            */
    opus_int offset_Q10,                            /* I                                            */
    opus_int length,                                /* I    Input length                            */
    opus_int subfr,                                 /* I    Subframe number                         */
    opus_int shapingLPCOrder,                       /* I    Shaping LPC filter order                */
    opus_int predictLPCOrder,                       /* I    Prediction filter order                 */
    opus_int warping_Q16,                           /* I                                            */
    __m128i MaskDelDec,                             /* I    Mask of states in decision tree         */
    opus_int *smpl_buf_idx,                         /* I/O  Index to newest samples in buffers      */
    opus_int decisionDelay                          /* I                                            */
)
{
    int i;
    opus_int32 *shp_lag_ptr = &NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2];
    opus_int32 *pred_lag_ptr = &sLTP_Q15[NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2];
    opus_int32 Gain_Q10 = Gain_Q16 >> 6;

    for (i = 0; i < length; i++)
    {
        /* Perform common calculations used in all states */
        /* NSQ_sample_struct           */
        /* Low  128 bits => 1st set    */
        /* High 128 bits => 2nd set    */
        int j;
        __m256i SS_Q_Q10;
        __m256i SS_RD_Q10;
        __m256i SS_xq_Q14;
        __m256i SS_LF_AR_Q14;
        __m256i SS_Diff_Q14;
        __m256i SS_sLTP_shp_Q14;
        __m256i SS_LPC_exc_Q14;
        __m256i exc_Q14;
        __m256i q_Q10, rr_Q10, rd_Q10;
        __m256i mask;
        __m128i LPC_pred_Q14, n_AR_Q14;
        __m128i RDmin_Q10, RDmax_Q10;
        __m128i n_LF_Q14;
        __m128i r_Q10, q1_Q0, q1_Q10, q2_Q10;
        __m128i Winner_rand_state, Winner_selector;
        __m128i tmp0, tmp1;
        NSQ_del_dec_sample_struct *psLastSample, *psSample;
        opus_int32 RDmin_ind, RDmax_ind, last_smple_idx;
        opus_int32 LTP_pred_Q14, n_LTP_Q14;

        /* Long-term prediction */
        if (signalType == TYPE_VOICED)
        {
            /* Unrolled loop */
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
            LTP_pred_Q14 = 2;
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-0], b_Q14[0]);
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-1], b_Q14[1]);
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-2], b_Q14[2]);
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-3], b_Q14[3]);
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-4], b_Q14[4]);
            LTP_pred_Q14 = silk_LSHIFT(LTP_pred_Q14, 1); /* Q13 -> Q14 */
            pred_lag_ptr++;
        }
        else
        {
            LTP_pred_Q14 = 0;
        }

        /* Long-term shaping */
        if (lag > 0)
        {
            /* Symmetric, packed FIR coefficients */
            n_LTP_Q14 = silk_add_sat32(shp_lag_ptr[0], shp_lag_ptr[-2]);
            n_LTP_Q14 = silk_SMULWB(n_LTP_Q14, HarmShapeFIRPacked_Q14);
            n_LTP_Q14 = n_LTP_Q14 + silk_SMULWT(shp_lag_ptr[-1], HarmShapeFIRPacked_Q14);
            n_LTP_Q14 = LTP_pred_Q14 - (silk_LSHIFT(n_LTP_Q14, 2)); /* Q12 -> Q14 */
            shp_lag_ptr++;
        }
        else
        {
            n_LTP_Q14 = 0;
        }

        /* BEGIN Updating Delayed Decision States */

        /* Generate dither */
        psDelDec->Seed = silk_mm256_rand_epi32(psDelDec->Seed);

        /* Short-term prediction */
        LPC_pred_Q14 = silk_noise_shape_quantizer_short_prediction_x4(&psDelDec->sLPC_Q14[NSQ_LPC_BUF_LENGTH - 1 + i], a_Q12, predictLPCOrder);
        LPC_pred_Q14 = _mm_slli_epi32(LPC_pred_Q14, 4); /* Q10 -> Q14 */

        /* Noise shape feedback */
        silk_assert(shapingLPCOrder > 0);
        silk_assert((shapingLPCOrder & 1) == 0); /* check that order is even */
        /* Output of lowpass section */
        tmp0 = _mm_add_epi32(psDelDec->Diff_Q14, silk_mm_smulwb_epi32(psDelDec->sAR2_Q14[0], warping_Q16));
        n_AR_Q14 = _mm_set1_epi32(shapingLPCOrder >> 1);
        for (j = 0; j < shapingLPCOrder - 1; j++)
        {
            /* Output of allpass section */
            tmp1 = psDelDec->sAR2_Q14[j];
            psDelDec->sAR2_Q14[j] = tmp0;
            n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[j]));
            tmp0 = _mm_add_epi32(tmp1, silk_mm_smulwb_epi32(_mm_sub_epi32(psDelDec->sAR2_Q14[j + 1], tmp0), warping_Q16));
        }
        psDelDec->sAR2_Q14[shapingLPCOrder - 1] = tmp0;
        n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[shapingLPCOrder - 1]));

        n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 1);                                                  /* Q11 -> Q12 */
        n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, Tilt_Q14)); /* Q12 */
        n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 2);                                                  /* Q12 -> Q14 */

        tmp0 = silk_mm_smulwb_epi32(psDelDec->Samples[*smpl_buf_idx].Shape_Q14, LF_shp_Q14); /* Q12 */
        tmp1 = silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, LF_shp_Q14 >> 16);                  /* Q12 */
        n_LF_Q14 = _mm_add_epi32(tmp0, tmp1);                                                /* Q12 */
        n_LF_Q14 = _mm_slli_epi32(n_LF_Q14, 2);                                              /* Q12 -> Q14 */

        /* Input minus prediction plus noise feedback                       */
        /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
        tmp0 = silk_mm_add_sat_epi32(n_AR_Q14, n_LF_Q14);              /* Q14 */
        tmp1 = _mm_add_epi32(_mm_set1_epi32(n_LTP_Q14), LPC_pred_Q14); /* Q13 */
        tmp0 = silk_mm_sub_sat_epi32(tmp1, tmp0);                      /* Q13 */
        tmp0 = silk_mm_srai_round_epi32(tmp0, 4);                      /* Q10 */

        r_Q10 = _mm_sub_epi32(_mm_set1_epi32(x_Q10[i]), tmp0); /* residual error Q10 */

        /* Flip sign depending on dither */
        r_Q10 = silk_mm_sign_epi32(r_Q10, psDelDec->Seed);
        r_Q10 = silk_mm_limit_epi32(r_Q10, -(31 << 10), 30 << 10);

        /* Find two quantization level candidates and measure their rate-distortion */
        q1_Q10 = _mm_sub_epi32(r_Q10, _mm_set1_epi32(offset_Q10));
        q1_Q0 = _mm_srai_epi32(q1_Q10, 10);
        if (Lambda_Q10 > 2048)
        {
            /* For aggressive RDO, the bias becomes more than one pulse. */
            tmp0 = _mm_sub_epi32(_mm_abs_epi32(q1_Q10), _mm_set1_epi32(Lambda_Q10 / 2 - 512)); /* rdo_offset */
            q1_Q0 = _mm_srai_epi32(q1_Q10, 31);
            tmp1 = _mm_cmpgt_epi32(tmp0, _mm_setzero_si128());
            tmp0 = _mm_srai_epi32(silk_mm_sign_epi32(tmp0, q1_Q10), 10);
            q1_Q0 = _mm_blendv_epi8(q1_Q0, tmp0, tmp1);
        }

        tmp0 = _mm_sign_epi32(_mm_set1_epi32(QUANT_LEVEL_ADJUST_Q10), q1_Q0);
        q1_Q10 = _mm_sub_epi32(_mm_slli_epi32(q1_Q0, 10), tmp0);
        q1_Q10 = _mm_add_epi32(q1_Q10, _mm_set1_epi32(offset_Q10));

        /* check if q1_Q0 is 0 or -1 */
        tmp0 = _mm_add_epi32(_mm_srli_epi32(q1_Q0, 31), q1_Q0);
        tmp1 = _mm_cmpeq_epi32(tmp0, _mm_setzero_si128());
        tmp0 = _mm_blendv_epi8(_mm_set1_epi32(1024), _mm_set1_epi32(1024 - QUANT_LEVEL_ADJUST_Q10), tmp1);
        q2_Q10 = _mm_add_epi32(q1_Q10, tmp0);
        q_Q10 = _mm256_set_m128i(q2_Q10, q1_Q10);

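        /* Rate-distortion for both candidates at once: the low 128 bits hold
           q1, the high 128 bits hold q2; rd = (|q| * Lambda + rr * rr) >> 10.
           The mask below then orders each lane pair as (best, second best). */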
        rr_Q10 = _mm256_sub_epi32(_mm256_broadcastsi128_si256(r_Q10), q_Q10);
        rd_Q10 = _mm256_abs_epi32(q_Q10);
        rr_Q10 = silk_mm256_smulbb_epi32(rr_Q10, rr_Q10);
        rd_Q10 = silk_mm256_smulbb_epi32(rd_Q10, _mm256_set1_epi32(Lambda_Q10));
        rd_Q10 = _mm256_add_epi32(rd_Q10, rr_Q10);
        rd_Q10 = _mm256_srai_epi32(rd_Q10, 10);

        mask = _mm256_broadcastsi128_si256(_mm_cmplt_epi32(_mm256_extracti128_si256(rd_Q10, 0), _mm256_extracti128_si256(rd_Q10, 1)));
        SS_RD_Q10 = _mm256_add_epi32(
            _mm256_broadcastsi128_si256(psDelDec->RD_Q10),
            _mm256_blendv_epi8(
                _mm256_permute2x128_si256(rd_Q10, rd_Q10, 0x1),
                rd_Q10,
                mask));
        SS_Q_Q10 = _mm256_blendv_epi8(
            _mm256_permute2x128_si256(q_Q10, q_Q10, 0x1),
            q_Q10,
            mask);

        /* Update states for best and second best quantization */

        /* Quantized excitation */
        exc_Q14 = silk_mm256_sign_epi32(_mm256_slli_epi32(SS_Q_Q10, 4), _mm256_broadcastsi128_si256(psDelDec->Seed));

        /* Add predictions */
        exc_Q14 = _mm256_add_epi32(exc_Q14, _mm256_set1_epi32(LTP_pred_Q14));
        SS_LPC_exc_Q14 = _mm256_slli_epi32(exc_Q14, 1);
        SS_xq_Q14 = _mm256_add_epi32(exc_Q14, _mm256_broadcastsi128_si256(LPC_pred_Q14));

        /* Update states */
        SS_Diff_Q14 = _mm256_sub_epi32(SS_xq_Q14, _mm256_set1_epi32(silk_LSHIFT(x_Q10[i], 4)));
        SS_LF_AR_Q14 = _mm256_sub_epi32(SS_Diff_Q14, _mm256_broadcastsi128_si256(n_AR_Q14));
        SS_sLTP_shp_Q14 = silk_mm256_sub_sat_epi32(SS_LF_AR_Q14, _mm256_broadcastsi128_si256(n_LF_Q14));

        /* END Updating Delayed Decision States */

        *smpl_buf_idx = (*smpl_buf_idx + DECISION_DELAY - 1) % DECISION_DELAY;
        last_smple_idx = (*smpl_buf_idx + decisionDelay) % DECISION_DELAY;
        psLastSample = &psDelDec->Samples[last_smple_idx];

        /* Find winner */
        RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_castsi256_si128(SS_RD_Q10), MaskDelDec);
        Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_castsi256_si128(SS_RD_Q10)));

        /* Increase RD values of expired states */
        Winner_rand_state = _mm_shuffle_epi8(psLastSample->RandState, Winner_selector);

        SS_RD_Q10 = _mm256_blendv_epi8(
            _mm256_add_epi32(SS_RD_Q10, _mm256_set1_epi32(silk_int32_MAX >> 4)),
            SS_RD_Q10,
            _mm256_broadcastsi128_si256(_mm_cmpeq_epi32(psLastSample->RandState, Winner_rand_state)));

        /* find worst in first set */
        RDmax_Q10 = silk_mm_mask_hmax_epi32(_mm256_extracti128_si256(SS_RD_Q10, 0), MaskDelDec);
        /* find best in second set */
        RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_extracti128_si256(SS_RD_Q10, 1), MaskDelDec);

        /* Replace a state if best from second set outperforms worst in first set */
        tmp0 = _mm_cmplt_epi32(RDmin_Q10, RDmax_Q10);
        if (!_mm_test_all_zeros(tmp0, tmp0))
        {
            int t;
            RDmax_ind = silk_index_of_first_equal_epi32(RDmax_Q10, _mm256_extracti128_si256(SS_RD_Q10, 0));
            RDmin_ind = silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_extracti128_si256(SS_RD_Q10, 1));
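            /* Build a byte selector that is the identity except for the four
               bytes of lane RDmax_ind, which are redirected to lane RDmin_ind:
               the shuffles below overwrite the worst first-set state with the
               best second-set state. The 8-lane `mask` variant further down
               does the same for the 256-bit SS_* vectors, pulling from the
               second-best half (lane index + 4). */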
            tmp1 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(RDmax_ind << 3)));
            tmp0 = _mm_blendv_epi8(
                _mm_set_epi8(0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0),
                silk_index_to_selector(RDmin_ind),
                tmp1);
            for (t = i; t < MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH; t++)
            {
                psDelDec->sLPC_Q14[t] = _mm_shuffle_epi8(psDelDec->sLPC_Q14[t], tmp0);
            }
            psDelDec->Seed = _mm_shuffle_epi8(psDelDec->Seed, tmp0);
            psDelDec->SeedInit = _mm_shuffle_epi8(psDelDec->SeedInit, tmp0);
            for (t = 0; t < MAX_SHAPE_LPC_ORDER; t++)
            {
                psDelDec->sAR2_Q14[t] = _mm_shuffle_epi8(psDelDec->sAR2_Q14[t], tmp0);
            }
            for (t = 0; t < DECISION_DELAY; t++)
            {
                psDelDec->Samples[t].RandState = _mm_shuffle_epi8(psDelDec->Samples[t].RandState, tmp0);
                psDelDec->Samples[t].Q_Q10 = _mm_shuffle_epi8(psDelDec->Samples[t].Q_Q10, tmp0);
                psDelDec->Samples[t].Xq_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Xq_Q14, tmp0);
                psDelDec->Samples[t].Pred_Q15 = _mm_shuffle_epi8(psDelDec->Samples[t].Pred_Q15, tmp0);
                psDelDec->Samples[t].Shape_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Shape_Q14, tmp0);
            }
            mask = _mm256_castsi128_si256(_mm_blendv_epi8(_mm_set_epi32(0x3, 0x2, 0x1, 0x0), _mm_set1_epi32(RDmin_ind + 4), tmp1));
            SS_Q_Q10 = _mm256_permutevar8x32_epi32(SS_Q_Q10, mask);
            SS_RD_Q10 = _mm256_permutevar8x32_epi32(SS_RD_Q10, mask);
            SS_xq_Q14 = _mm256_permutevar8x32_epi32(SS_xq_Q14, mask);
            SS_LF_AR_Q14 = _mm256_permutevar8x32_epi32(SS_LF_AR_Q14, mask);
            SS_Diff_Q14 = _mm256_permutevar8x32_epi32(SS_Diff_Q14, mask);
            SS_sLTP_shp_Q14 = _mm256_permutevar8x32_epi32(SS_sLTP_shp_Q14, mask);
            SS_LPC_exc_Q14 = _mm256_permutevar8x32_epi32(SS_LPC_exc_Q14, mask);
        }

        /* Write samples from winner to output and long-term filter states */
        if (subfr > 0 || i >= decisionDelay)
        {
            pulses[i - decisionDelay] =
                (opus_int8)silk_sar_round_32(silk_select_winner(psLastSample->Q_Q10, Winner_selector), 10);
            xq[i - decisionDelay] =
                silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psLastSample->Xq_Q14, Winner_selector), delayedGain_Q10[last_smple_idx], 8));
            NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay] =
                silk_select_winner(psLastSample->Shape_Q14, Winner_selector);
            sLTP_Q15[NSQ->sLTP_buf_idx - decisionDelay] =
                silk_select_winner(psLastSample->Pred_Q15, Winner_selector);
        }
        NSQ->sLTP_shp_buf_idx++;
        NSQ->sLTP_buf_idx++;

        /* Update states */
        psSample = &psDelDec->Samples[*smpl_buf_idx];
        psDelDec->Seed = _mm_add_epi32(psDelDec->Seed, silk_mm_srai_round_epi32(_mm256_castsi256_si128(SS_Q_Q10), 10));
        psDelDec->LF_AR_Q14 = _mm256_castsi256_si128(SS_LF_AR_Q14);
        psDelDec->Diff_Q14 = _mm256_castsi256_si128(SS_Diff_Q14);
        psDelDec->sLPC_Q14[i + NSQ_LPC_BUF_LENGTH] = _mm256_castsi256_si128(SS_xq_Q14);
        psDelDec->RD_Q10 = _mm256_castsi256_si128(SS_RD_Q10);
        psSample->Xq_Q14 = _mm256_castsi256_si128(SS_xq_Q14);
        psSample->Q_Q10 = _mm256_castsi256_si128(SS_Q_Q10);
        psSample->Pred_Q15 = _mm256_castsi256_si128(SS_LPC_exc_Q14);
        psSample->Shape_Q14 = _mm256_castsi256_si128(SS_sLTP_shp_Q14);
        psSample->RandState = psDelDec->Seed;
        delayedGain_Q10[*smpl_buf_idx] = Gain_Q10;
    }
    /* Update LPC states */
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
    {
        psDelDec->sLPC_Q14[i] = (&psDelDec->sLPC_Q14[length])[i];
    }
}

static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
    const silk_encoder_state *psEncC,               /* I    Encoder State                           */
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                               */
    NSQ_del_dec_struct *psDelDec,                   /* I/O  Delayed decision states                 */
    const opus_int16 x16[],                         /* I    Input                                   */
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH],      /* O    Input scaled with 1/Gain in Q10         */
    const opus_int16 sLTP[],                        /* I    Re-whitened LTP state in Q0             */
    opus_int32 sLTP_Q15[],                          /* O    LTP state matching scaled input         */
    opus_int subfr,                                 /* I    Subframe number                         */
    const opus_int LTP_scale_Q14,                   /* I    LTP state scaling                       */
    const opus_int32 Gains_Q16[MAX_NB_SUBFR],       /* I                                            */
    const opus_int pitchL[MAX_NB_SUBFR],            /* I    Pitch lag                               */
    const opus_int signal_type,                     /* I    Signal type                             */
    const opus_int decisionDelay                    /* I    Decision delay                          */
)
{
    int i;
    opus_int lag;
    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
    NSQ_del_dec_sample_struct *psSample;

    lag = pitchL[subfr];
    inv_gain_Q31 = silk_INVERSE32_varQ(silk_max(Gains_Q16[subfr], 1), 47);
    silk_assert(inv_gain_Q31 != 0);

    /* Scale input */
    inv_gain_Q26 = silk_sar_round_32(inv_gain_Q31, 5);
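    /* Four samples per iteration: widen to 64 bits, multiply by the Q26
       inverse gain, and keep (x * inv_gain_Q26) >> 16, giving Q10 output. */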
    for (i = 0; i < psEncC->subfr_length; i += 4)
    {
        __m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i]));
        x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16);
        _mm_storeu_si128((__m128i_u*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x));
    }

    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    if (NSQ->rewhite_flag)
    {
        if (subfr == 0)
        {
            /* Do LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT(silk_SMULWB(inv_gain_Q31, LTP_scale_Q14), 2);
        }
        for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++)
        {
            silk_assert(i < MAX_FRAME_LENGTH);
            sLTP_Q15[i] = silk_SMULWB(inv_gain_Q31, sLTP[i]);
        }
    }

    /* Adjust for changing gain */
    if (Gains_Q16[subfr] != NSQ->prev_gain_Q16)
    {
        gain_adj_Q16 = silk_DIV32_varQ(NSQ->prev_gain_Q16, Gains_Q16[subfr], 16);

        /* Scale long-term shaping state */
        for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i += 4)
        {
            __m128i_u* p = (__m128i_u*)&NSQ->sLTP_shp_Q14[i];
            *p = silk_mm_smulww_epi32(*p, gain_adj_Q16);
        }

        /* Scale long-term prediction state */
        if (signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0)
        {
            for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++)
            {
                sLTP_Q15[i] = ((opus_int64)sLTP_Q15[i]) * ((opus_int64)gain_adj_Q16) >> 16;
            }
        }

        /* Scale scalar states */
        psDelDec->LF_AR_Q14 = silk_mm_smulww_epi32(psDelDec->LF_AR_Q14, gain_adj_Q16);
        psDelDec->Diff_Q14 = silk_mm_smulww_epi32(psDelDec->Diff_Q14, gain_adj_Q16);

        /* Scale short-term prediction and shaping states */
        for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
        {
            psDelDec->sLPC_Q14[i] = silk_mm_smulww_epi32(psDelDec->sLPC_Q14[i], gain_adj_Q16);
        }
        for (i = 0; i < DECISION_DELAY; i++)
        {
            psSample = &psDelDec->Samples[i];
            psSample->Pred_Q15 = silk_mm_smulww_epi32(psSample->Pred_Q15, gain_adj_Q16);
            psSample->Shape_Q14 = silk_mm_smulww_epi32(psSample->Shape_Q14, gain_adj_Q16);
        }
        for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
        {
            psDelDec->sAR2_Q14[i] = silk_mm_smulww_epi32(psDelDec->sAR2_Q14[i], gain_adj_Q16);
        }

        /* Save inverse gain */
        NSQ->prev_gain_Q16 = Gains_Q16[subfr];
    }
}

static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
    opus_int16 *out,                                /* O    Output signal                           */
    const opus_int16 *in,                           /* I    Input signal                            */
    const opus_int16 *B,                            /* I    MA prediction coefficients, Q12 [order] */
    const opus_int32 len,                           /* I    Signal length                           */
    const opus_int32 order                          /* I    Filter order                            */
)
{
    int i;
    opus_int32 out32_Q12, out32;
    silk_assert(order == 10 || order == 16);

    for (i = order; i < len; i++)
    {
        const opus_int16 *in_ptr = &in[i];
        /* Allowing wrap around so that two wraps can cancel each other. The rare
           cases where the result wraps around can only be triggered by invalid streams */

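        /* The 8 newest samples in[i-8..i-1] are multiplied by B[7..0]
           (coefficients reversed to match memory order); the remaining taps
           are 8 more for order 16, or just B[8] and B[9] for order 10. */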
        __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-8]));
        __m256i B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&B[0]));
        __m256i sum = _mm256_mullo_epi32(in_v, silk_mm256_reverse_epi32(B_v));
        if (order > 10)
        {
            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-16]));
            B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&B[8]));
            B_v = silk_mm256_reverse_epi32(B_v);
        }
        else
        {
            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&in_ptr[-10]));
            B_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&B[8]));
            B_v = _mm256_shuffle_epi32(B_v, 0x01);
        }
        sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(in_v, B_v));

        out32_Q12 = silk_mm256_hsum_epi32(sum);

        /* Subtract prediction */
        out32_Q12 = silk_SUB32_ovflw(silk_LSHIFT((opus_int32)*in_ptr, 12), out32_Q12);

        /* Scale to Q0 */
        out32 = silk_sar_round_32(out32_Q12, 12);

        /* Saturate output */
        out[i] = silk_sat16(out32);
    }

    /* Set first `order` output samples to zero */
    silk_memset(out, 0, order * sizeof(opus_int16));
}
