/***********************************************************************
Copyright (c) 2021 Google Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef OPUS_CHECK_ASM
#include <string.h>
#endif

#include "opus_defines.h"
#include <immintrin.h>

#include "main.h"
#include "stack_alloc.h"
#include "NSQ.h"
#include "celt/x86/x86cpu.h"

/* Returns TRUE if all assumptions are met */
static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC)
{
    /* This optimization is based on these assumptions.        */
    /* These assumptions are fundamental and hence asserts are */
    /* used. Should any assert trigger, we have to revisit all */
    /* related code to make sure it still functions the same   */
    /* as the C implementation.                                 */
    silk_assert(MAX_DEL_DEC_STATES  <= 4      &&
                MAX_FRAME_LENGTH     % 4 == 0 &&
                MAX_SUB_FRAME_LENGTH % 4 == 0 &&
                LTP_MEM_LENGTH_MS    % 4 == 0 );
    silk_assert(psEncC->fs_kHz ==  8 ||
                psEncC->fs_kHz == 12 ||
                psEncC->fs_kHz == 16 );
    silk_assert(psEncC->nb_subfr <= MAX_NB_SUBFR &&
                psEncC->nb_subfr > 0             );
    silk_assert(psEncC->nStatesDelayedDecision <= MAX_DEL_DEC_STATES &&
                psEncC->nStatesDelayedDecision > 0                   );
    silk_assert(psEncC->ltp_mem_length == psEncC->fs_kHz * LTP_MEM_LENGTH_MS);

    /* Regressions were observed on certain AMD Zen CPUs when      */
    /* nStatesDelayedDecision is 1 or 2. Ideally we should detect  */
    /* these CPUs and enable this optimization on others; however, */
    /* there is no good way to do so under the current Opus        */
    /* framework.                                                   */
    return psEncC->nStatesDelayedDecision == 3 ||
           psEncC->nStatesDelayedDecision == 4;
}

/* Intrinsics not defined on MSVC */
#ifdef _MSC_VER
#include <Intsafe.h>
#define __m128i_u __m128i
static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res)
{
    *res = a+b;
    return (*res ^ a) & (*res ^ b) & 0x80000000;
}
static inline int __builtin_ctz(unsigned int x)
{
    DWORD res = 0;
    return _BitScanForward(&res, x) ? res : 32;
}
#endif

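/* Gather the high 32-bit half of each of the four 64-bit lanes of num into one __m128i */
/* (i.e. a per-lane >> 32 with narrowing).                                               */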
static OPUS_INLINE __m128i silk_cvtepi64_epi32_high(__m256i num)
{
    return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(num, _mm256_set_epi32(0, 0, 0, 0, 7, 5, 3, 1)));
}

static OPUS_INLINE opus_int16 silk_sat16(opus_int32 num)
{
    num = num > silk_int16_MAX ? silk_int16_MAX : num;
    num = num < silk_int16_MIN ? silk_int16_MIN : num;
    return num;
}

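/* Arithmetic shift right by 'bits' with rounding to nearest (adds half an LSB first), */
/* matching the rounding of silk_RSHIFT_ROUND().                                       */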
static OPUS_INLINE opus_int32 silk_sar_round_32(opus_int32 a, int bits)
{
    silk_assert(bits > 0 && bits < 31);
    a += 1 << (bits-1);
    return a >> bits;
}

static OPUS_INLINE opus_int64 silk_sar_round_smulww(opus_int32 a, opus_int32 b, int bits)
{
    silk_assert(bits > 0 && bits < 63);
#ifdef OPUS_CHECK_ASM
    return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits);
#else
    /* This code is more correct: unlike the C code, it does not overflow in some rare cases. */
    opus_int64 t = ((opus_int64)a) * ((opus_int64)b);
    bits += 16;
    t += 1ull << (bits-1);
    return t >> bits;
#endif
}

static OPUS_INLINE opus_int32 silk_add_sat32(opus_int32 a, opus_int32 b)
{
    opus_int32 sum;
    if (__builtin_sadd_overflow(a, b, &sum))
    {
        return a >= 0 ? silk_int32_MAX : silk_int32_MIN;
    }
    return sum;
}

static OPUS_INLINE __m128i silk_mm_srai_round_epi32(__m128i a, int bits)
{
    silk_assert(bits > 0 && bits < 31);
    return _mm_srai_epi32(_mm_add_epi32(a, _mm_set1_epi32(1 << (bits - 1))), bits);
}

/* add/subtract with output saturated */
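/* Signed overflow is detected from the sign bits (see the per-line formulas below); the saturation */
/* value SAT is INT32_MAX when a >= 0 and INT32_MIN when a < 0, selected by the final blendv.       */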
static OPUS_INLINE __m128i silk_mm_add_sat_epi32(__m128i a, __m128i b)
{
    __m128i r = _mm_add_epi32(a, b);
    __m128i OF = _mm_and_si128(_mm_xor_si128(a, r), _mm_xor_si128(b, r));           /* OF = (sum ^ a) & (sum ^ b)   */
    __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */
    return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31));
}
static OPUS_INLINE __m128i silk_mm_sub_sat_epi32(__m128i a, __m128i b)
{
    __m128i r = _mm_sub_epi32(a, b);
    __m128i OF = _mm_andnot_si128(_mm_xor_si128(b, r), _mm_xor_si128(a, r));        /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */
    __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF                         */
    return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31));
}
static OPUS_INLINE __m256i silk_mm256_sub_sat_epi32(__m256i a, __m256i b)
{
    __m256i r = _mm256_sub_epi32(a, b);
    __m256i OF = _mm256_andnot_si256(_mm256_xor_si256(b, r), _mm256_xor_si256(a, r));        /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */
    __m256i SAT = _mm256_add_epi32(_mm256_srli_epi32(a, 31), _mm256_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF                         */
    return _mm256_blendv_epi8(r, SAT, _mm256_srai_epi32(OF, 31));
}

static OPUS_INLINE __m128i silk_mm_limit_epi32(__m128i num, opus_int32 limit1, opus_int32 limit2)
{
    opus_int32 lo = limit1 < limit2 ? limit1 : limit2;
    opus_int32 hi = limit1 > limit2 ? limit1 : limit2;

    num = _mm_min_epi32(num, _mm_set1_epi32(hi));
    num = _mm_max_epi32(num, _mm_set1_epi32(lo));
    return num;
}

/* cond < 0 ? -num : num */
static OPUS_INLINE __m128i silk_mm_sign_epi32(__m128i num, __m128i cond)
{
    return _mm_sign_epi32(num, _mm_or_si128(cond, _mm_set1_epi32(1)));
}
static OPUS_INLINE __m256i silk_mm256_sign_epi32(__m256i num, __m256i cond)
{
    return _mm256_sign_epi32(num, _mm256_or_si256(cond, _mm256_set1_epi32(1)));
}

/* (a32 * b32) >> 16 */
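/* Computed via a full 64-bit product: widen a to 64-bit lanes, multiply by b, */
/* shift left by 16 and keep the high DWORD of each 64-bit lane.               */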
static OPUS_INLINE __m128i silk_mm_smulww_epi32(__m128i a, opus_int32 b)
{
    return silk_cvtepi64_epi32_high(_mm256_slli_epi64(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(b)), 16));
}

/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16; output has to be a 32-bit int */
static OPUS_INLINE __m128i silk_mm_smulwb_epi32(__m128i a, opus_int32 b)
{
    return silk_cvtepi64_epi32_high(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(silk_LSHIFT(b, 16))));
}

/* (opus_int32)((opus_int16)(a32)) * (opus_int32)((opus_int16)(b32)); output has to be a 32-bit int */
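/* mullo/mulhi give the low and high 16 bits of every 16x16 product; the shuffle keeps only the */
/* products of the low 16-bit half of each 32-bit lane and the unpack recombines them into the  */
/* full 32-bit results.                                                                          */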
static OPUS_INLINE __m256i silk_mm256_smulbb_epi32(__m256i a, __m256i b)
{
    const char FF = (char)0xFF;
    __m256i msk = _mm256_set_epi8(
        FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0,
        FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0);
    __m256i lo = _mm256_mullo_epi16(a, b);
    __m256i hi = _mm256_mulhi_epi16(a, b);
    lo = _mm256_shuffle_epi8(lo, msk);
    hi = _mm256_shuffle_epi8(hi, msk);
    return _mm256_unpacklo_epi16(lo, hi);
}

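/* Reverse the order of the eight 32-bit elements: 0x1B reverses within each 128-bit lane, */
/* then 0x4E swaps the two lanes.                                                          */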
static OPUS_INLINE __m256i silk_mm256_reverse_epi32(__m256i v)
{
    v = _mm256_shuffle_epi32(v, 0x1B);
    v = _mm256_permute4x64_epi64(v, 0x4E);
    return v;
}

static OPUS_INLINE opus_int32 silk_mm256_hsum_epi32(__m256i v)
{
    __m128i sum = _mm_add_epi32(_mm256_extracti128_si256(v, 1), _mm256_extracti128_si256(v, 0));
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E));
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1));
    return _mm_cvtsi128_si32(sum);
}

static OPUS_INLINE __m128i silk_mm_hmin_epi32(__m128i num)
{
    num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2301 */
    num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */
    return num;
}

static OPUS_INLINE __m128i silk_mm_hmax_epi32(__m128i num)
{
    num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2301 */
    num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */
    return num;
}

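/* Masked horizontal min/max: lanes flagged in 'mask' are first overwritten with INT32_MAX      */
/* (for min) or INT32_MIN (for max), so disabled delayed-decision states never win the search.  */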
static OPUS_INLINE __m128i silk_mm_mask_hmin_epi32(__m128i num, __m128i mask)
{
    num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MAX), mask);
    return silk_mm_hmin_epi32(num);
}

static OPUS_INLINE __m128i silk_mm_mask_hmax_epi32(__m128i num, __m128i mask)
{
    num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MIN), mask);
    return silk_mm_hmax_epi32(num);
}

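/* Per-lane dither update, same linear congruential recurrence as silk_RAND():       */
/* seed = RAND_MULTIPLIER * seed + RAND_INCREMENT, computed independently per state. */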
static OPUS_INLINE __m128i silk_mm256_rand_epi32(__m128i seed)
{
    seed = _mm_mullo_epi32(seed, _mm_set1_epi32(RAND_MULTIPLIER));
    seed = _mm_add_epi32(seed, _mm_set1_epi32(RAND_INCREMENT));
    return seed;
}

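/* _mm_movemask_epi8 yields one bit per byte; masking with 0x1111 keeps one bit per 32-bit lane, */
/* so the trailing-zero count divided by 4 is the index of the first lane where a == b.          */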
static OPUS_INLINE opus_int32 silk_index_of_first_equal_epi32(__m128i a, __m128i b)
{
    unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) & 0x1111;
    silk_assert(mask != 0);
    return __builtin_ctz(mask) >> 2;
}

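/* Build a byte-shuffle selector that replicates 32-bit lane 'index' into all four lanes; */
/* used with _mm_shuffle_epi8 by silk_select_winner() to extract the winning state.       */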
static __m128i silk_index_to_selector(opus_int32 index)
{
    silk_assert(index < 4);
    index <<= 2;
    return _mm_set_epi8(
        index + 3, index + 2, index + 1, index + 0,
        index + 3, index + 2, index + 1, index + 0,
        index + 3, index + 2, index + 1, index + 0,
        index + 3, index + 2, index + 1, index + 0);
}

static opus_int32 silk_select_winner(__m128i num, __m128i selector)
{
    return _mm_cvtsi128_si32(_mm_shuffle_epi8(num, selector));
}

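/* Each __m128i field below packs one value per delayed-decision state, one state per 32-bit lane */
/* (up to MAX_DEL_DEC_STATES == 4 states).                                                        */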
typedef struct
{
    __m128i RandState;
    __m128i Q_Q10;
    __m128i Xq_Q14;
    __m128i Pred_Q15;
    __m128i Shape_Q14;
} NSQ_del_dec_sample_struct;

typedef struct
{
    __m128i sLPC_Q14[MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH];
    __m128i LF_AR_Q14;
    __m128i Seed;
    __m128i SeedInit;
    __m128i RD_Q10;
    __m128i Diff_Q14;
    __m128i sAR2_Q14[MAX_SHAPE_LPC_ORDER];
    NSQ_del_dec_sample_struct Samples[DECISION_DELAY];
} NSQ_del_dec_struct;

static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
    const silk_encoder_state *psEncC,          /* I    Encoder State                   */
    silk_nsq_state *NSQ,                       /* I/O  NSQ state                       */
    NSQ_del_dec_struct *psDelDec,              /* I/O  Delayed decision states         */
    const opus_int16 x16[],                    /* I    Input                           */
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O    Input scaled with 1/Gain in Q10 */
    const opus_int16 sLTP[],                   /* I    Re-whitened LTP state in Q0     */
    opus_int32 sLTP_Q15[],                     /* O    LTP state matching scaled input */
    opus_int subfr,                            /* I    Subframe number                 */
    const opus_int LTP_scale_Q14,              /* I    LTP state scaling               */
    const opus_int32 Gains_Q16[MAX_NB_SUBFR],  /* I                                    */
    const opus_int pitchL[MAX_NB_SUBFR],       /* I    Pitch lag                       */
    const opus_int signal_type,                /* I    Signal type                     */
    const opus_int decisionDelay               /* I    Decision delay                  */
);

/*******************************************/
/* LPC analysis filter                     */
/* NB! State is kept internally and the    */
/* filter always starts with zero state    */
/* first d output samples are set to zero  */
/*******************************************/
static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
    opus_int16                  *out,               /* O    Output signal                           */
    const opus_int16            *in,                /* I    Input signal                            */
    const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order] */
    const opus_int32            len,                /* I    Signal length                           */
    const opus_int32            order               /* I    Filter order                            */
);

/******************************************/
/* Noise shape quantizer for one subframe */
/******************************************/
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(
    silk_nsq_state *NSQ,                        /* I/O  NSQ state                          */
    NSQ_del_dec_struct psDelDec[],              /* I/O  Delayed decision states            */
    opus_int signalType,                        /* I    Signal type                        */
    const opus_int32 x_Q10[],                   /* I                                       */
    opus_int8 pulses[],                         /* O                                       */
    opus_int16 xq[],                            /* O                                       */
    opus_int32 sLTP_Q15[],                      /* I/O  LTP filter state                   */
    opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O  Gain delay buffer                  */
    const opus_int16 a_Q12[],                   /* I    Short term prediction coefs        */
    const opus_int16 b_Q14[],                   /* I    Long term prediction coefs         */
    const opus_int16 AR_shp_Q13[],              /* I    Noise shaping coefs                */
    opus_int lag,                               /* I    Pitch lag                          */
    opus_int32 HarmShapeFIRPacked_Q14,          /* I                                       */
    opus_int Tilt_Q14,                          /* I    Spectral tilt                      */
    opus_int32 LF_shp_Q14,                      /* I                                       */
    opus_int32 Gain_Q16,                        /* I                                       */
    opus_int Lambda_Q10,                        /* I                                       */
    opus_int offset_Q10,                        /* I                                       */
    opus_int length,                            /* I    Input length                       */
    opus_int subfr,                             /* I    Subframe number                    */
    opus_int shapingLPCOrder,                   /* I    Shaping LPC filter order           */
    opus_int predictLPCOrder,                   /* I    Prediction filter order            */
    opus_int warping_Q16,                       /* I                                       */
    __m128i MaskDelDec,                         /* I    Mask of states in decision tree    */
    opus_int *smpl_buf_idx,                     /* I/O  Index to newest samples in buffers */
    opus_int decisionDelay                      /* I                                       */
);

void silk_NSQ_del_dec_avx2(
    const silk_encoder_state *psEncC,                            /* I    Encoder State               */
    silk_nsq_state *NSQ,                                         /* I/O  NSQ state                   */
    SideInfoIndices *psIndices,                                  /* I/O  Quantization Indices        */
    const opus_int16 x16[],                                      /* I    Input                       */
    opus_int8 pulses[],                                          /* O    Quantized pulse signal      */
    const opus_int16 *PredCoef_Q12,                              /* I    Short term prediction coefs */
    const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],      /* I    Long term prediction coefs  */
    const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I    Noise shaping coefs         */
    const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],              /* I    Long term shaping coefs     */
    const opus_int Tilt_Q14[MAX_NB_SUBFR],                       /* I    Spectral tilt               */
    const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],                   /* I    Low frequency shaping coefs */
    const opus_int32 Gains_Q16[MAX_NB_SUBFR],                    /* I    Quantization step sizes     */
    const opus_int32 pitchL[MAX_NB_SUBFR],                       /* I    Pitch lags                  */
    const opus_int Lambda_Q10,                                   /* I    Rate/distortion tradeoff    */
    const opus_int LTP_scale_Q14                                 /* I    LTP state scaling           */
)
{
#ifdef OPUS_CHECK_ASM
    silk_nsq_state NSQ_c;
    SideInfoIndices psIndices_c;
    opus_int8 pulses_c[MAX_FRAME_LENGTH];
    const opus_int8 *const pulses_a = pulses;

    silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c));
    silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c));
    silk_memcpy(pulses_c, pulses, sizeof(pulses_c));
    silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
                       pitchL, Lambda_Q10, LTP_scale_Q14);
#endif

    if (!verify_assumptions(psEncC))
    {
        silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14);
        return;
    }

    opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
    opus_int last_smple_idx, smpl_buf_idx, decisionDelay;
    const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
    opus_int16 *pxq;
    VARDECL(opus_int32, sLTP_Q15);
    VARDECL(opus_int16, sLTP);
    opus_int32 HarmShapeFIRPacked_Q14;
    opus_int offset_Q10;
    opus_int32 Gain_Q10;
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH];
    opus_int32 delayedGain_Q10[DECISION_DELAY];
    NSQ_del_dec_struct psDelDec = {0};
    NSQ_del_dec_sample_struct *psSample;
    __m128i RDmin_Q10, MaskDelDec, Winner_selector;
    SAVE_STACK;

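    /* Lanes at index >= nStatesDelayedDecision get all-ones bytes here; after sign extension they   */
    /* become all-ones 32-bit lanes and are excluded by the masked horizontal min/max helpers above. */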
    MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3)));

    /* Set unvoiced lag to the previous one, overwrite later for voiced */
    lag = NSQ->lagPrev;

    silk_assert(NSQ->prev_gain_Q16 != 0);
    psDelDec.Seed = _mm_and_si128(
        _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)),
        _mm_set1_epi32(3));
    psDelDec.SeedInit = psDelDec.Seed;
    psDelDec.RD_Q10 = _mm_setzero_si128();
    psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14);
    psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14);
    psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]);
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
    {
        psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]);
    }
    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
    {
        psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]);
    }

    offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType];
    smpl_buf_idx = 0; /* index of oldest samples */

    decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length);

    /* For voiced frames limit the decision delay to lower than the pitch lag */
    if (psIndices->signalType == TYPE_VOICED)
    {
        for (k = 0; k < psEncC->nb_subfr; k++)
        {
            decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1);
        }
    }
    else
    {
        if (lag > 0)
        {
            decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1);
        }
    }

    if (psIndices->NLSFInterpCoef_Q2 == 4)
    {
        LSF_interpolation_flag = 0;
    }
    else
    {
        LSF_interpolation_flag = 1;
    }

    ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32);
    ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16);
    /* Set up pointers to start of sub frame */
    pxq = &NSQ->xq[psEncC->ltp_mem_length];
    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
    NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
    subfr = 0;
    for (k = 0; k < psEncC->nb_subfr; k++)
    {
        A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER];
        B_Q14 = &LTPCoef_Q14[k * LTP_ORDER];
        AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER];

        /* Noise shape parameters */
        silk_assert(HarmShapeGain_Q14[k] >= 0);
        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );

        NSQ->rewhite_flag = 0;
        if (psIndices->signalType == TYPE_VOICED)
        {
            /* Voiced */
            lag = pitchL[k];

            /* Re-whitening */
            if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0)
            {
                if (k == 2)
                {
                    /* RESET DELAYED DECISIONS */
                    /* Find winner */
                    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
                    Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10);
                    Winner_selector = silk_index_to_selector(Winner_ind);
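                    /* Penalize every state except the winner so the delayed-decision search effectively restarts from it */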
                    psDelDec.RD_Q10 = _mm_add_epi32(
                        psDelDec.RD_Q10,
                        _mm_blendv_epi8(
                            _mm_set1_epi32(silk_int32_MAX >> 4),
                            _mm_setzero_si128(),
                            _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3)))));

                    /* Copy final part of signals from winner state to output and long-term filter states */
                    last_smple_idx = smpl_buf_idx + decisionDelay;
                    for (i = 0; i < decisionDelay; i++)
                    {
                        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
                        psSample = &psDelDec.Samples[last_smple_idx];
                        pulses[i - decisionDelay] =
                            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
                        pxq[i - decisionDelay] =
                            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14));
                        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
                            silk_select_winner(psSample->Shape_Q14, Winner_selector);
                    }

                    subfr = 0;
                }

                /* Rewhiten with new A coefs */
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
                silk_assert(start_idx > 0);

                silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length],
                                              A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder);

                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
                NSQ->rewhite_flag = 1;
            }
        }

        silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
                                           LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay);

        silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
                                                delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k],
                                                Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
                                                psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay);

        x16 += psEncC->subfr_length;
        pulses += psEncC->subfr_length;
        pxq += psEncC->subfr_length;
    }

    /* Find winner */
    RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
    Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10));

    /* Copy final part of signals from winner state to output and long-term filter states */
    psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector);
    last_smple_idx = smpl_buf_idx + decisionDelay;
    Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6;
    for (i = 0; i < decisionDelay; i++)
    {
        last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
        psSample = &psDelDec.Samples[last_smple_idx];

        pulses[i - decisionDelay] =
            (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
        pxq[i - decisionDelay] =
            silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8));
        NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
            silk_select_winner(psSample->Shape_Q14, Winner_selector);
    }
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
    {
        NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector);
    }
    for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
    {
        NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector);
    }

    /* Update states */
    NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector);
    NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector);
    NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1];

    /* Save quantized speech signal */
    silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16));
    silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32));

#ifdef OPUS_CHECK_ASM
    silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c)));
    silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c)));
    silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c)));
#endif

    RESTORE_STACK;
}

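/* Short-term LPC prediction for the four delayed-decision states in parallel. Only the high DWORD */
/* of each 64-bit product buf32[-j] * (coef16[j] << 16) is kept, which equals silk_SMULWB().       */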
static OPUS_INLINE __m128i silk_noise_shape_quantizer_short_prediction_x4(const __m128i *buf32, const opus_int16 *coef16, opus_int order)
{
    __m256i out;
    silk_assert(order == 10 || order == 16);

    /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    out = _mm256_set1_epi32(order >> 1);
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-0]), _mm256_set1_epi32(silk_LSHIFT(coef16[0], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-1]), _mm256_set1_epi32(silk_LSHIFT(coef16[1], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-2]), _mm256_set1_epi32(silk_LSHIFT(coef16[2], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-3]), _mm256_set1_epi32(silk_LSHIFT(coef16[3], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-4]), _mm256_set1_epi32(silk_LSHIFT(coef16[4], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-5]), _mm256_set1_epi32(silk_LSHIFT(coef16[5], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-6]), _mm256_set1_epi32(silk_LSHIFT(coef16[6], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-7]), _mm256_set1_epi32(silk_LSHIFT(coef16[7], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-8]), _mm256_set1_epi32(silk_LSHIFT(coef16[8], 16)))); /* High DWORD */
    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-9]), _mm256_set1_epi32(silk_LSHIFT(coef16[9], 16)))); /* High DWORD */

    if (order == 16)
    {
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-10]), _mm256_set1_epi32(silk_LSHIFT(coef16[10], 16)))); /* High DWORD */
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-11]), _mm256_set1_epi32(silk_LSHIFT(coef16[11], 16)))); /* High DWORD */
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-12]), _mm256_set1_epi32(silk_LSHIFT(coef16[12], 16)))); /* High DWORD */
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-13]), _mm256_set1_epi32(silk_LSHIFT(coef16[13], 16)))); /* High DWORD */
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-14]), _mm256_set1_epi32(silk_LSHIFT(coef16[14], 16)))); /* High DWORD */
        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-15]), _mm256_set1_epi32(silk_LSHIFT(coef16[15], 16)))); /* High DWORD */
    }
    return silk_cvtepi64_epi32_high(out);
}

/******************************************/
/* Noise shape quantizer for one subframe */
/******************************************/
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(
    silk_nsq_state *NSQ,                        /* I/O  NSQ state                          */
    NSQ_del_dec_struct *psDelDec,               /* I/O  Delayed decision states            */
    opus_int signalType,                        /* I    Signal type                        */
    const opus_int32 x_Q10[],                   /* I                                       */
    opus_int8 pulses[],                         /* O                                       */
    opus_int16 xq[],                            /* O                                       */
    opus_int32 sLTP_Q15[],                      /* I/O  LTP filter state                   */
    opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O  Gain delay buffer                  */
    const opus_int16 a_Q12[],                   /* I    Short term prediction coefs        */
    const opus_int16 b_Q14[],                   /* I    Long term prediction coefs         */
    const opus_int16 AR_shp_Q13[],              /* I    Noise shaping coefs                */
    opus_int lag,                               /* I    Pitch lag                          */
    opus_int32 HarmShapeFIRPacked_Q14,          /* I                                       */
    opus_int Tilt_Q14,                          /* I    Spectral tilt                      */
    opus_int32 LF_shp_Q14,                      /* I                                       */
    opus_int32 Gain_Q16,                        /* I                                       */
    opus_int Lambda_Q10,                        /* I                                       */
    opus_int offset_Q10,                        /* I                                       */
    opus_int length,                            /* I    Input length                       */
    opus_int subfr,                             /* I    Subframe number                    */
    opus_int shapingLPCOrder,                   /* I    Shaping LPC filter order           */
    opus_int predictLPCOrder,                   /* I    Prediction filter order            */
    opus_int warping_Q16,                       /* I                                       */
    __m128i MaskDelDec,                         /* I    Mask of states in decision tree    */
    opus_int *smpl_buf_idx,                     /* I/O  Index to newest samples in buffers */
    opus_int decisionDelay                      /* I                                       */
)
{
    int i;
    opus_int32 *shp_lag_ptr = &NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2];
    opus_int32 *pred_lag_ptr = &sLTP_Q15[NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2];
    opus_int32 Gain_Q10 = Gain_Q16 >> 6;

    for (i = 0; i < length; i++)
    {
        /* Perform common calculations used in all states */
        /* NSQ_sample_struct */
        /* Low  128 bits => 1st set */
        /* High 128 bits => 2nd set */
        int j;
        __m256i SS_Q_Q10;
        __m256i SS_RD_Q10;
        __m256i SS_xq_Q14;
        __m256i SS_LF_AR_Q14;
        __m256i SS_Diff_Q14;
        __m256i SS_sLTP_shp_Q14;
        __m256i SS_LPC_exc_Q14;
        __m256i exc_Q14;
        __m256i q_Q10, rr_Q10, rd_Q10;
        __m256i mask;
        __m128i LPC_pred_Q14, n_AR_Q14;
        __m128i RDmin_Q10, RDmax_Q10;
        __m128i n_LF_Q14;
        __m128i r_Q10, q1_Q0, q1_Q10, q2_Q10;
        __m128i Winner_rand_state, Winner_selector;
        __m128i tmp0, tmp1;
        NSQ_del_dec_sample_struct *psLastSample, *psSample;
        opus_int32 RDmin_ind, RDmax_ind, last_smple_idx;
        opus_int32 LTP_pred_Q14, n_LTP_Q14;

        /* Long-term prediction */
        if (signalType == TYPE_VOICED)
        {
            /* Unrolled loop */
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
            LTP_pred_Q14 = 2;
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-0], b_Q14[0]);
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-1], b_Q14[1]);
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-2], b_Q14[2]);
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-3], b_Q14[3]);
            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-4], b_Q14[4]);
            LTP_pred_Q14 = silk_LSHIFT(LTP_pred_Q14, 1); /* Q13 -> Q14 */
            pred_lag_ptr++;
        }
        else
        {
            LTP_pred_Q14 = 0;
        }

        /* Long-term shaping */
        if (lag > 0)
        {
            /* Symmetric, packed FIR coefficients */
            n_LTP_Q14 = silk_add_sat32(shp_lag_ptr[0], shp_lag_ptr[-2]);
            n_LTP_Q14 = silk_SMULWB(n_LTP_Q14, HarmShapeFIRPacked_Q14);
            n_LTP_Q14 = n_LTP_Q14 + silk_SMULWT(shp_lag_ptr[-1], HarmShapeFIRPacked_Q14);
            n_LTP_Q14 = LTP_pred_Q14 - (silk_LSHIFT(n_LTP_Q14, 2)); /* Q12 -> Q14 */
            shp_lag_ptr++;
        }
        else
        {
            n_LTP_Q14 = 0;
        }

        /* BEGIN Updating Delayed Decision States */

        /* Generate dither */
        psDelDec->Seed = silk_mm256_rand_epi32(psDelDec->Seed);

        /* Short-term prediction */
        LPC_pred_Q14 = silk_noise_shape_quantizer_short_prediction_x4(&psDelDec->sLPC_Q14[NSQ_LPC_BUF_LENGTH - 1 + i], a_Q12, predictLPCOrder);
        LPC_pred_Q14 = _mm_slli_epi32(LPC_pred_Q14, 4); /* Q10 -> Q14 */

        /* Noise shape feedback */
        silk_assert(shapingLPCOrder > 0);
        silk_assert((shapingLPCOrder & 1) == 0); /* check that order is even */
        /* Output of lowpass section */
        tmp0 = _mm_add_epi32(psDelDec->Diff_Q14, silk_mm_smulwb_epi32(psDelDec->sAR2_Q14[0], warping_Q16));
        n_AR_Q14 = _mm_set1_epi32(shapingLPCOrder >> 1);
        for (j = 0; j < shapingLPCOrder - 1; j++)
        {
            /* Output of allpass section */
            tmp1 = psDelDec->sAR2_Q14[j];
            psDelDec->sAR2_Q14[j] = tmp0;
            n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[j]));
            tmp0 = _mm_add_epi32(tmp1, silk_mm_smulwb_epi32(_mm_sub_epi32(psDelDec->sAR2_Q14[j + 1], tmp0), warping_Q16));
        }
        psDelDec->sAR2_Q14[shapingLPCOrder - 1] = tmp0;
        n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[shapingLPCOrder - 1]));

        n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 1);                                                  /* Q11 -> Q12 */
        n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, Tilt_Q14)); /* Q12 */
        n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 2);                                                  /* Q12 -> Q14 */

        tmp0 = silk_mm_smulwb_epi32(psDelDec->Samples[*smpl_buf_idx].Shape_Q14, LF_shp_Q14); /* Q12 */
        tmp1 = silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, LF_shp_Q14 >> 16);                  /* Q12 */
        n_LF_Q14 = _mm_add_epi32(tmp0, tmp1);                                                /* Q12 */
        n_LF_Q14 = _mm_slli_epi32(n_LF_Q14, 2);                                              /* Q12 -> Q14 */

        /* Input minus prediction plus noise feedback                       */
        /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
        tmp0 = silk_mm_add_sat_epi32(n_AR_Q14, n_LF_Q14);              /* Q14 */
        tmp1 = _mm_add_epi32(_mm_set1_epi32(n_LTP_Q14), LPC_pred_Q14); /* Q13 */
        tmp0 = silk_mm_sub_sat_epi32(tmp1, tmp0);                      /* Q13 */
        tmp0 = silk_mm_srai_round_epi32(tmp0, 4);                      /* Q10 */

        r_Q10 = _mm_sub_epi32(_mm_set1_epi32(x_Q10[i]), tmp0); /* residual error Q10 */

        /* Flip sign depending on dither */
        r_Q10 = silk_mm_sign_epi32(r_Q10, psDelDec->Seed);
        r_Q10 = silk_mm_limit_epi32(r_Q10, -(31 << 10), 30 << 10);

        /* Find two quantization level candidates and measure their rate-distortion */
        q1_Q10 = _mm_sub_epi32(r_Q10, _mm_set1_epi32(offset_Q10));
        q1_Q0 = _mm_srai_epi32(q1_Q10, 10);
        if (Lambda_Q10 > 2048)
        {
            /* For aggressive RDO, the bias becomes more than one pulse. */
            tmp0 = _mm_sub_epi32(_mm_abs_epi32(q1_Q10), _mm_set1_epi32(Lambda_Q10 / 2 - 512)); /* rdo_offset */
            q1_Q0 = _mm_srai_epi32(q1_Q10, 31);
            tmp1 = _mm_cmpgt_epi32(tmp0, _mm_setzero_si128());
            tmp0 = _mm_srai_epi32(silk_mm_sign_epi32(tmp0, q1_Q10), 10);
            q1_Q0 = _mm_blendv_epi8(q1_Q0, tmp0, tmp1);
        }

        tmp0 = _mm_sign_epi32(_mm_set1_epi32(QUANT_LEVEL_ADJUST_Q10), q1_Q0);
        q1_Q10 = _mm_sub_epi32(_mm_slli_epi32(q1_Q0, 10), tmp0);
        q1_Q10 = _mm_add_epi32(q1_Q10, _mm_set1_epi32(offset_Q10));

        /* check if q1_Q0 is 0 or -1 */
        tmp0 = _mm_add_epi32(_mm_srli_epi32(q1_Q0, 31), q1_Q0);
        tmp1 = _mm_cmpeq_epi32(tmp0, _mm_setzero_si128());
        tmp0 = _mm_blendv_epi8(_mm_set1_epi32(1024), _mm_set1_epi32(1024 - QUANT_LEVEL_ADJUST_Q10), tmp1);
        q2_Q10 = _mm_add_epi32(q1_Q10, tmp0);
        q_Q10 = _mm256_set_m128i(q2_Q10, q1_Q10);

        rr_Q10 = _mm256_sub_epi32(_mm256_broadcastsi128_si256(r_Q10), q_Q10);
        rd_Q10 = _mm256_abs_epi32(q_Q10);
        rr_Q10 = silk_mm256_smulbb_epi32(rr_Q10, rr_Q10);
        rd_Q10 = silk_mm256_smulbb_epi32(rd_Q10, _mm256_set1_epi32(Lambda_Q10));
        rd_Q10 = _mm256_add_epi32(rd_Q10, rr_Q10);
        rd_Q10 = _mm256_srai_epi32(rd_Q10, 10);
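        /* Per state, sort the two candidates: the lower-RD one goes into the low 128 bits (best), */
        /* the other into the high 128 bits (second best).                                         */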
        mask = _mm256_broadcastsi128_si256(_mm_cmplt_epi32(_mm256_extracti128_si256(rd_Q10, 0), _mm256_extracti128_si256(rd_Q10, 1)));
        SS_RD_Q10 = _mm256_add_epi32(
            _mm256_broadcastsi128_si256(psDelDec->RD_Q10),
            _mm256_blendv_epi8(
                _mm256_permute2x128_si256(rd_Q10, rd_Q10, 0x1),
                rd_Q10,
                mask));
        SS_Q_Q10 = _mm256_blendv_epi8(
            _mm256_permute2x128_si256(q_Q10, q_Q10, 0x1),
            q_Q10,
            mask);

        /* Update states for best and second best quantization */

        /* Quantized excitation */
        exc_Q14 = silk_mm256_sign_epi32(_mm256_slli_epi32(SS_Q_Q10, 4), _mm256_broadcastsi128_si256(psDelDec->Seed));

        /* Add predictions */
        exc_Q14 = _mm256_add_epi32(exc_Q14, _mm256_set1_epi32(LTP_pred_Q14));
        SS_LPC_exc_Q14 = _mm256_slli_epi32(exc_Q14, 1);
        SS_xq_Q14 = _mm256_add_epi32(exc_Q14, _mm256_broadcastsi128_si256(LPC_pred_Q14));

        /* Update states */
        SS_Diff_Q14 = _mm256_sub_epi32(SS_xq_Q14, _mm256_set1_epi32(silk_LSHIFT(x_Q10[i], 4)));
        SS_LF_AR_Q14 = _mm256_sub_epi32(SS_Diff_Q14, _mm256_broadcastsi128_si256(n_AR_Q14));
        SS_sLTP_shp_Q14 = silk_mm256_sub_sat_epi32(SS_LF_AR_Q14, _mm256_broadcastsi128_si256(n_LF_Q14));

        /* END Updating Delayed Decision States */

        *smpl_buf_idx = (*smpl_buf_idx + DECISION_DELAY - 1) % DECISION_DELAY;
        last_smple_idx = (*smpl_buf_idx + decisionDelay) % DECISION_DELAY;
        psLastSample = &psDelDec->Samples[last_smple_idx];

        /* Find winner */
        RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_castsi256_si128(SS_RD_Q10), MaskDelDec);
        Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_castsi256_si128(SS_RD_Q10)));

        /* Increase RD values of expired states */
        Winner_rand_state = _mm_shuffle_epi8(psLastSample->RandState, Winner_selector);
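        /* States whose dither seed at the sample leaving the delay buffer differs from the winner's   */
        /* are no longer consistent with the output that will be written; penalize their RD so they    */
        /* get replaced by the state-swap below.                                                        */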
        SS_RD_Q10 = _mm256_blendv_epi8(
            _mm256_add_epi32(SS_RD_Q10, _mm256_set1_epi32(silk_int32_MAX >> 4)),
            SS_RD_Q10,
            _mm256_broadcastsi128_si256(_mm_cmpeq_epi32(psLastSample->RandState, Winner_rand_state)));

        /* find worst in first set */
        RDmax_Q10 = silk_mm_mask_hmax_epi32(_mm256_extracti128_si256(SS_RD_Q10, 0), MaskDelDec);
        /* find best in second set */
        RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_extracti128_si256(SS_RD_Q10, 1), MaskDelDec);

        /* Replace a state if best from second set outperforms worst in first set */
        tmp0 = _mm_cmplt_epi32(RDmin_Q10, RDmax_Q10);
        if (!_mm_test_all_zeros(tmp0, tmp0))
        {
            int t;
            RDmax_ind = silk_index_of_first_equal_epi32(RDmax_Q10, _mm256_extracti128_si256(SS_RD_Q10, 0));
            RDmin_ind = silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_extracti128_si256(SS_RD_Q10, 1));
            tmp1 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(RDmax_ind << 3)));
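            /* Build a byte shuffle that keeps every lane in place except lane RDmax_ind, which is */
            /* overwritten with lane RDmin_ind (the best state from the second candidate set).     */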
            tmp0 = _mm_blendv_epi8(
                _mm_set_epi8(0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0),
                silk_index_to_selector(RDmin_ind),
                tmp1);
            for (t = i; t < MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH; t++)
            {
                psDelDec->sLPC_Q14[t] = _mm_shuffle_epi8(psDelDec->sLPC_Q14[t], tmp0);
            }
            psDelDec->Seed = _mm_shuffle_epi8(psDelDec->Seed, tmp0);
            psDelDec->SeedInit = _mm_shuffle_epi8(psDelDec->SeedInit, tmp0);
            for (t = 0; t < MAX_SHAPE_LPC_ORDER; t++)
            {
                psDelDec->sAR2_Q14[t] = _mm_shuffle_epi8(psDelDec->sAR2_Q14[t], tmp0);
            }
            for (t = 0; t < DECISION_DELAY; t++)
            {
                psDelDec->Samples[t].RandState = _mm_shuffle_epi8(psDelDec->Samples[t].RandState, tmp0);
                psDelDec->Samples[t].Q_Q10 = _mm_shuffle_epi8(psDelDec->Samples[t].Q_Q10, tmp0);
                psDelDec->Samples[t].Xq_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Xq_Q14, tmp0);
                psDelDec->Samples[t].Pred_Q15 = _mm_shuffle_epi8(psDelDec->Samples[t].Pred_Q15, tmp0);
                psDelDec->Samples[t].Shape_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Shape_Q14, tmp0);
            }
            mask = _mm256_castsi128_si256(_mm_blendv_epi8(_mm_set_epi32(0x3, 0x2, 0x1, 0x0), _mm_set1_epi32(RDmin_ind + 4), tmp1));
            SS_Q_Q10 = _mm256_permutevar8x32_epi32(SS_Q_Q10, mask);
            SS_RD_Q10 = _mm256_permutevar8x32_epi32(SS_RD_Q10, mask);
            SS_xq_Q14 = _mm256_permutevar8x32_epi32(SS_xq_Q14, mask);
            SS_LF_AR_Q14 = _mm256_permutevar8x32_epi32(SS_LF_AR_Q14, mask);
            SS_Diff_Q14 = _mm256_permutevar8x32_epi32(SS_Diff_Q14, mask);
            SS_sLTP_shp_Q14 = _mm256_permutevar8x32_epi32(SS_sLTP_shp_Q14, mask);
            SS_LPC_exc_Q14 = _mm256_permutevar8x32_epi32(SS_LPC_exc_Q14, mask);
        }

        /* Write samples from winner to output and long-term filter states */
        if (subfr > 0 || i >= decisionDelay)
        {
            pulses[i - decisionDelay] =
                (opus_int8)silk_sar_round_32(silk_select_winner(psLastSample->Q_Q10, Winner_selector), 10);
            xq[i - decisionDelay] =
                silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psLastSample->Xq_Q14, Winner_selector), delayedGain_Q10[last_smple_idx], 8));
            NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay] =
                silk_select_winner(psLastSample->Shape_Q14, Winner_selector);
            sLTP_Q15[NSQ->sLTP_buf_idx - decisionDelay] =
                silk_select_winner(psLastSample->Pred_Q15, Winner_selector);
        }
        NSQ->sLTP_shp_buf_idx++;
        NSQ->sLTP_buf_idx++;

        /* Update states */
        psSample = &psDelDec->Samples[*smpl_buf_idx];
        psDelDec->Seed = _mm_add_epi32(psDelDec->Seed, silk_mm_srai_round_epi32(_mm256_castsi256_si128(SS_Q_Q10), 10));
        psDelDec->LF_AR_Q14 = _mm256_castsi256_si128(SS_LF_AR_Q14);
        psDelDec->Diff_Q14 = _mm256_castsi256_si128(SS_Diff_Q14);
        psDelDec->sLPC_Q14[i + NSQ_LPC_BUF_LENGTH] = _mm256_castsi256_si128(SS_xq_Q14);
        psDelDec->RD_Q10 = _mm256_castsi256_si128(SS_RD_Q10);
        psSample->Xq_Q14 = _mm256_castsi256_si128(SS_xq_Q14);
        psSample->Q_Q10 = _mm256_castsi256_si128(SS_Q_Q10);
        psSample->Pred_Q15 = _mm256_castsi256_si128(SS_LPC_exc_Q14);
        psSample->Shape_Q14 = _mm256_castsi256_si128(SS_sLTP_shp_Q14);
        psSample->RandState = psDelDec->Seed;
        delayedGain_Q10[*smpl_buf_idx] = Gain_Q10;
    }
    /* Update LPC states */
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
    {
        psDelDec->sLPC_Q14[i] = (&psDelDec->sLPC_Q14[length])[i];
    }
}

static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
    const silk_encoder_state *psEncC,          /* I    Encoder State                   */
    silk_nsq_state *NSQ,                       /* I/O  NSQ state                       */
    NSQ_del_dec_struct *psDelDec,              /* I/O  Delayed decision states         */
    const opus_int16 x16[],                    /* I    Input                           */
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O    Input scaled with 1/Gain in Q10 */
    const opus_int16 sLTP[],                   /* I    Re-whitened LTP state in Q0     */
    opus_int32 sLTP_Q15[],                     /* O    LTP state matching scaled input */
    opus_int subfr,                            /* I    Subframe number                 */
    const opus_int LTP_scale_Q14,              /* I    LTP state scaling               */
    const opus_int32 Gains_Q16[MAX_NB_SUBFR],  /* I                                    */
    const opus_int pitchL[MAX_NB_SUBFR],       /* I    Pitch lag                       */
    const opus_int signal_type,                /* I    Signal type                     */
    const opus_int decisionDelay               /* I    Decision delay                  */
)
{
    int i;
    opus_int lag;
    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
    NSQ_del_dec_sample_struct *psSample;

    lag = pitchL[subfr];
    inv_gain_Q31 = silk_INVERSE32_varQ(silk_max(Gains_Q16[subfr], 1), 47);
    silk_assert(inv_gain_Q31 != 0);

    /* Scale input */
    inv_gain_Q26 = silk_sar_round_32(inv_gain_Q31, 5);
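    /* 4 samples per iteration: widen to 64-bit lanes, multiply by inv_gain_Q26 and keep the high */
    /* DWORD of (product << 16), i.e. silk_SMULWW(x16[i], inv_gain_Q26).                          */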
    for (i = 0; i < psEncC->subfr_length; i+=4)
    {
        __m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i]));
        x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16);
        _mm_storeu_si128((__m128i_u*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x));
    }

    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    if (NSQ->rewhite_flag)
    {
        if (subfr == 0)
        {
            /* Do LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT(silk_SMULWB(inv_gain_Q31, LTP_scale_Q14), 2);
        }
        for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++)
        {
            silk_assert(i < MAX_FRAME_LENGTH);
            sLTP_Q15[i] = silk_SMULWB(inv_gain_Q31, sLTP[i]);
        }
    }

    /* Adjust for changing gain */
    if (Gains_Q16[subfr] != NSQ->prev_gain_Q16)
    {
        gain_adj_Q16 = silk_DIV32_varQ(NSQ->prev_gain_Q16, Gains_Q16[subfr], 16);

        /* Scale long-term shaping state */
        for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i+=4)
        {
            __m128i_u* p = (__m128i_u*)&NSQ->sLTP_shp_Q14[i];
            *p = silk_mm_smulww_epi32(*p, gain_adj_Q16);
        }

        /* Scale long-term prediction state */
        if (signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0)
        {
            for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++)
            {
                sLTP_Q15[i] = ((opus_int64)sLTP_Q15[i]) * ((opus_int64)gain_adj_Q16) >> 16;
            }
        }

        /* Scale scalar states */
        psDelDec->LF_AR_Q14 = silk_mm_smulww_epi32(psDelDec->LF_AR_Q14, gain_adj_Q16);
        psDelDec->Diff_Q14 = silk_mm_smulww_epi32(psDelDec->Diff_Q14, gain_adj_Q16);

        /* Scale short-term prediction and shaping states */
        for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
        {
            psDelDec->sLPC_Q14[i] = silk_mm_smulww_epi32(psDelDec->sLPC_Q14[i], gain_adj_Q16);
        }
        for (i = 0; i < DECISION_DELAY; i++)
        {
            psSample = &psDelDec->Samples[i];
            psSample->Pred_Q15 = silk_mm_smulww_epi32(psSample->Pred_Q15, gain_adj_Q16);
            psSample->Shape_Q14 = silk_mm_smulww_epi32(psSample->Shape_Q14, gain_adj_Q16);
        }
        for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
        {
            psDelDec->sAR2_Q14[i] = silk_mm_smulww_epi32(psDelDec->sAR2_Q14[i], gain_adj_Q16);
        }

        /* Save inverse gain */
        NSQ->prev_gain_Q16 = Gains_Q16[subfr];
    }
}

static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
    opus_int16                  *out,               /* O    Output signal                           */
    const opus_int16            *in,                /* I    Input signal                            */
    const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order] */
    const opus_int32            len,                /* I    Signal length                           */
    const opus_int32            order               /* I    Filter order                            */
)
{
    int i;
    opus_int32       out32_Q12, out32;
    silk_assert(order == 10 || order == 16);

    for(i = order; i < len; i++ )
    {
        const opus_int16 *in_ptr = &in[ i ];
        /* Allowing wrap around so that two wraps can cancel each other. The rare
           cases where the result wraps around can only be triggered by invalid streams */

        __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-8]));
        __m256i B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&      B[0]));
        __m256i sum = _mm256_mullo_epi32(in_v, silk_mm256_reverse_epi32(B_v));
        if (order > 10)
        {
            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-16]));
            B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&B       [8]));
            B_v  = silk_mm256_reverse_epi32(B_v);
        }
        else
        {
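            /* Order 10: only two taps remain, in[i-10]*B[9] and in[i-9]*B[8]; the upper lanes are zero */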
            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&in_ptr[-10]));
            B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si32(&B       [8]));
            B_v  = _mm256_shuffle_epi32(B_v, 0x01);
        }
        sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(in_v, B_v));

        out32_Q12 = silk_mm256_hsum_epi32(sum);

        /* Subtract prediction */
        out32_Q12 = silk_SUB32_ovflw( silk_LSHIFT( (opus_int32)*in_ptr, 12 ), out32_Q12 );

        /* Scale to Q0 */
        out32 = silk_sar_round_32(out32_Q12, 12);

        /* Saturate output */
        out[ i ] = silk_sat16(out32);
    }

    /* Set first d output samples to zero */
    silk_memset( out, 0, order * sizeof( opus_int16 ) );
}
1076