/* xref: /aosp_15_r20/external/libopus/silk/arm/NSQ_del_dec_neon_intr.c (revision a58d3d2adb790c104798cd88c8a3aff4fa8b82cc) */
1 /***********************************************************************
2 Copyright (c) 2017 Google Inc.
3 Redistribution and use in source and binary forms, with or without
4 modification, are permitted provided that the following conditions
5 are met:
6 - Redistributions of source code must retain the above copyright notice,
7 this list of conditions and the following disclaimer.
8 - Redistributions in binary form must reproduce the above copyright
9 notice, this list of conditions and the following disclaimer in the
10 documentation and/or other materials provided with the distribution.
11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
12 names of specific contributors, may be used to endorse or promote
13 products derived from this software without specific prior written
14 permission.
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 POSSIBILITY OF SUCH DAMAGE.
26 ***********************************************************************/
27 
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31 
32 #include <arm_neon.h>
33 #ifdef OPUS_CHECK_ASM
34 # include <string.h>
35 #endif
36 #include "main.h"
37 #include "stack_alloc.h"
38 #include "os_support.h"
39 
40 /* NEON intrinsics optimization now can only parallelize up to 4 delay decision states.    */
41 /* If there are more states, C function is called, and this optimization must be expanded. */
42 #define NEON_MAX_DEL_DEC_STATES 4
43 
/* Delayed-decision states, stored transposed relative to the scalar code:   */
/* one row per sample/coefficient and one column per delayed-decision state, */
/* so each row can be loaded/stored as a single 4-lane NEON vector.          */
typedef struct {
    opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ][ NEON_MAX_DEL_DEC_STATES ]; /* Short-term prediction filter state, Q14 */
    opus_int32 RandState[ DECISION_DELAY ][     NEON_MAX_DEL_DEC_STATES ]; /* Pseudo-random generator state history         */
    opus_int32 Q_Q10[     DECISION_DELAY ][     NEON_MAX_DEL_DEC_STATES ]; /* Quantized excitation history, Q10             */
    opus_int32 Xq_Q14[    DECISION_DELAY ][     NEON_MAX_DEL_DEC_STATES ]; /* Quantized output signal history, Q14          */
    opus_int32 Pred_Q15[  DECISION_DELAY ][     NEON_MAX_DEL_DEC_STATES ]; /* Prediction history, Q15                       */
    opus_int32 Shape_Q14[ DECISION_DELAY ][     NEON_MAX_DEL_DEC_STATES ]; /* Noise-shaping signal history, Q14             */
    opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ][ NEON_MAX_DEL_DEC_STATES ]; /* Noise-shaping AR filter state, Q14            */
    opus_int32 LF_AR_Q14[ NEON_MAX_DEL_DEC_STATES ]; /* Low-frequency AR state, Q14 (copied back to NSQ->sLF_AR_shp_Q14)    */
    opus_int32 Diff_Q14[  NEON_MAX_DEL_DEC_STATES ]; /* Difference state, Q14 (copied back to NSQ->sDiff_shp_Q14)           */
    opus_int32 Seed[      NEON_MAX_DEL_DEC_STATES ]; /* Current pseudo-random seed per state                                */
    opus_int32 SeedInit[  NEON_MAX_DEL_DEC_STATES ]; /* Seed at start of frame; winner's value is written to psIndices      */
    opus_int32 RD_Q10[    NEON_MAX_DEL_DEC_STATES ]; /* Accumulated rate-distortion cost, Q10; minimum selects the winner   */
} NSQ_del_decs_struct;
58 
/* Per-sample candidate values for the delayed-decision search, one entry per */
/* parallel state. Presumably used by the quantizer loop defined later in     */
/* this file (not visible here) — same transposed lane layout as above.       */
typedef struct {
    opus_int32 Q_Q10[        NEON_MAX_DEL_DEC_STATES ]; /* Quantized excitation, Q10          */
    opus_int32 RD_Q10[       NEON_MAX_DEL_DEC_STATES ]; /* Rate-distortion cost, Q10          */
    opus_int32 xq_Q14[       NEON_MAX_DEL_DEC_STATES ]; /* Quantized output sample, Q14       */
    opus_int32 LF_AR_Q14[    NEON_MAX_DEL_DEC_STATES ]; /* Low-frequency AR state, Q14        */
    opus_int32 Diff_Q14[     NEON_MAX_DEL_DEC_STATES ]; /* Difference state, Q14              */
    opus_int32 sLTP_shp_Q14[ NEON_MAX_DEL_DEC_STATES ]; /* LTP shaping state sample, Q14      */
    opus_int32 LPC_exc_Q14[  NEON_MAX_DEL_DEC_STATES ]; /* LPC excitation sample, Q14         */
} NSQ_samples_struct;
68 
69 static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
70     const silk_encoder_state *psEncC,               /* I    Encoder State                       */
71     silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
72     NSQ_del_decs_struct psDelDec[],                 /* I/O  Delayed decision states             */
73     const opus_int16    x16[],                      /* I    Input                               */
74     opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
75     const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
76     opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
77     opus_int            subfr,                      /* I    Subframe number                     */
78     const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
79     const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I                                        */
80     const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
81     const opus_int      signal_type,                /* I    Signal type                         */
82     const opus_int      decisionDelay               /* I    Decision delay                      */
83 );
84 
85 /******************************************/
86 /* Noise shape quantizer for one subframe */
87 /******************************************/
88 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
89     silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
90     NSQ_del_decs_struct psDelDec[],             /* I/O  Delayed decision states             */
91     opus_int            signalType,             /* I    Signal type                         */
92     const opus_int32    x_Q10[],                /* I                                        */
93     opus_int8           pulses[],               /* O                                        */
94     opus_int16          xq[],                   /* O                                        */
95     opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
96     opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
97     const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
98     const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
99     const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
100     opus_int            lag,                    /* I    Pitch lag                           */
101     opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
102     opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
103     opus_int32          LF_shp_Q14,             /* I                                        */
104     opus_int32          Gain_Q16,               /* I                                        */
105     opus_int            Lambda_Q10,             /* I                                        */
106     opus_int            offset_Q10,             /* I                                        */
107     opus_int            length,                 /* I    Input length                        */
108     opus_int            subfr,                  /* I    Subframe number                     */
109     opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
110     opus_int            predictLPCOrder,        /* I    Prediction filter order             */
111     opus_int            warping_Q16,            /* I                                        */
112     opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
113     opus_int            *smpl_buf_idx,          /* I/O  Index to newest samples in buffers  */
114     opus_int            decisionDelay           /* I                                        */
115 );
116 
/* Copy 8 samples (newest to oldest) of the winner state's signals to the     */
/* output buffers and the long-term shaping filter state:                     */
/*   pulses[]            <- Q_Q10 rounded from Q10 to Q0, narrowed to 8 bit   */
/*   pxq[]               <- Xq_Q14 scaled by the gain, rounded and saturated  */
/*   NSQ->sLTP_shp_Q14[] <- Shape_Q14, copied unchanged                       */
static OPUS_INLINE void copy_winner_state_kernel(
    const NSQ_del_decs_struct *psDelDec,      /* I    Delayed decision states                            */
    const opus_int            offset,         /* I    Write offset into pulses/pxq (negative)            */
    const opus_int            last_smple_idx, /* I    Index of the newest of the 8 samples; must be >= 7 */
    const opus_int            Winner_ind,     /* I    Winner state (lane/column) index                   */
    const int32x2_t           gain_lo_s32x2,  /* I    Low 16 bits of gain, shifted to Q15, duplicated    */
    const int32x2_t           gain_hi_s32x2,  /* I    High 16 bits of gain, duplicated                   */
    const int32x4_t           shift_s32x4,    /* I    Negated right shift for vrshlq_s32(), duplicated   */
    int32x4_t                 t0_s32x4,       /* scratch vector                                          */
    int32x4_t                 t1_s32x4,       /* scratch vector                                          */
    opus_int8 *const          pulses,         /* O    Quantized pulse signal                             */
    opus_int16                *pxq,           /* O    Quantized output signal                            */
    silk_nsq_state            *NSQ            /* I/O  NSQ state                                          */
)
{
    int16x8_t t_s16x8;
    int32x4_t o0_s32x4, o1_s32x4;

    /* Gather 8 Q_Q10 samples of the winner lane (newest first), round from Q10 */
    /* to Q0 and narrow to 8-bit pulses.                                        */
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    t_s16x8 = vcombine_s16( vrshrn_n_s32( t0_s32x4, 10 ), vrshrn_n_s32( t1_s32x4, 10 ) );
    vst1_s8( &pulses[ offset ], vmovn_s16( t_s16x8 ) );

    /* Gather 8 Xq_Q14 samples and apply the gain. vqdmulhq_lane_s32() with the */
    /* Q15 low half plus vmlaq_lane_s32() with the high half together emulate   */
    /* silk_SMULWW( Xq, gain ) per lane; then apply the rounding right shift    */
    /* (negative shift in vrshlq_s32) and saturate-narrow to 16 bit.            */
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    o0_s32x4 = vqdmulhq_lane_s32( t0_s32x4, gain_lo_s32x2, 0 );
    o1_s32x4 = vqdmulhq_lane_s32( t1_s32x4, gain_lo_s32x2, 0 );
    o0_s32x4 = vmlaq_lane_s32( o0_s32x4, t0_s32x4, gain_hi_s32x2, 0 );
    o1_s32x4 = vmlaq_lane_s32( o1_s32x4, t1_s32x4, gain_hi_s32x2, 0 );
    o0_s32x4 = vrshlq_s32( o0_s32x4, shift_s32x4 );
    o1_s32x4 = vrshlq_s32( o1_s32x4, shift_s32x4 );
    vst1_s16( &pxq[ offset + 0 ], vqmovn_s32( o0_s32x4 ) );
    vst1_s16( &pxq[ offset + 4 ], vqmovn_s32( o1_s32x4 ) );

    /* Gather 8 Shape_Q14 samples and store them into the LTP shaping state. */
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 0 ], t0_s32x4 );
    vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 4 ], t1_s32x4 );
}
174 
/* Copy decisionDelay samples of the winner state's signals to the output     */
/* buffers and the long-term shaping filter state, walking the circular       */
/* DECISION_DELAY sample buffers backwards from the newest sample. Uses the   */
/* 8-sample NEON kernel where possible and scalar code for the tails.         */
static OPUS_INLINE void copy_winner_state(
    const NSQ_del_decs_struct *psDelDec,     /* I    Delayed decision states              */
    const opus_int            decisionDelay, /* I    Number of samples to copy            */
    const opus_int            smpl_buf_idx,  /* I    Index of oldest samples in buffers   */
    const opus_int            Winner_ind,    /* I    Winner state (lane/column) index     */
    const opus_int32          gain,          /* I    Gain applied to Xq_Q14, Q16          */
    const opus_int32          shift,         /* I    Right shift applied after the gain   */
    opus_int8 *const          pulses,        /* O    Quantized pulse signal               */
    opus_int16                *pxq,          /* O    Quantized output signal              */
    silk_nsq_state            *NSQ           /* I/O  NSQ state                            */
)
{
    opus_int        i, last_smple_idx;
    /* Split the Q16 gain so the kernel can emulate silk_SMULWW(): the low 16  */
    /* bits pre-shifted to Q15 go through vqdmulhq (doubling high multiply),   */
    /* the high 16 bits through a plain multiply-accumulate.                   */
    const int32x2_t gain_lo_s32x2 = vdup_n_s32( silk_LSHIFT32( gain & 0x0000FFFF, 15 ) );
    const int32x2_t gain_hi_s32x2 = vdup_n_s32( gain >> 16 );
    const int32x4_t shift_s32x4 = vdupq_n_s32( -shift ); /* negative shift: vrshlq_s32 shifts right with rounding */
    int32x4_t       t0_s32x4, t1_s32x4;

    t0_s32x4 = t1_s32x4 = vdupq_n_s32( 0 ); /* initialization */
    /* Index of the newest sample in the circular buffers; the sum can exceed */
    /* the buffer length by up to twice, hence the two wrap checks.           */
    last_smple_idx = smpl_buf_idx + decisionDelay - 1 + DECISION_DELAY;
    if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
    if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;

    /* Vectorized copy, 8 samples per iteration, until the circular index would underflow. */
    for( i = 0; ( i < ( decisionDelay - 7 ) ) && ( last_smple_idx >= 7 ); i += 8, last_smple_idx -= 8 ) {
        copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
    }
    /* Scalar tail down to circular index 0. */
    for( ; ( i < decisionDelay ) && ( last_smple_idx >= 0 ); i++, last_smple_idx-- ) {
        pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
    }

    /* Wrap the circular index and copy the remaining samples. */
    last_smple_idx += DECISION_DELAY;
    /* Bug fix: the kernel copies 8 samples per call, so advance by 8 as in the */
    /* first vectorized loop. Stepping by 1 performed 8x redundant overlapping  */
    /* stores per sample. (No >= 7 guard is needed here: after the wrap,        */
    /* last_smple_idx stays >= 7 for all i < decisionDelay - 7 because          */
    /* decisionDelay <= DECISION_DELAY.)                                        */
    for( ; i < ( decisionDelay - 7 ); i += 8, last_smple_idx -= 8 ) {
        copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
    }
    for( ; i < decisionDelay; i++, last_smple_idx-- ) {
        pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
    }
}
217 
/* Delayed-decision noise shaping quantizer for one frame, NEON version.      */
/* Parallelizes the delayed-decision states across the four lanes of NEON     */
/* vectors; falls back to the reference C implementation when the state       */
/* count does not fit this scheme.                                            */
void silk_NSQ_del_dec_neon(
    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
    const opus_int16            x16[],                                      /* I    Input                           */
    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
    const opus_int16            *PredCoef_Q12,                              /* I    Short term prediction coefs     */
    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs              */
    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
)
{
#ifdef OPUS_CHECK_ASM
    /* Debug builds also run the reference C implementation on copies of the  */
    /* state, and compare states and outputs at the end of this function.     */
    silk_nsq_state NSQ_c;
    SideInfoIndices psIndices_c;
    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
    const opus_int8 *const pulses_a = pulses;

    ( void )pulses_a;
    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
    silk_memcpy( pulses_c, pulses, sizeof( pulses_c ) );
    silk_NSQ_del_dec_c( psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
                       pitchL, Lambda_Q10, LTP_scale_Q14 );
#endif

    /* The optimization parallelizes the different delay decision states. */
    if(( psEncC->nStatesDelayedDecision > NEON_MAX_DEL_DEC_STATES ) || ( psEncC->nStatesDelayedDecision <= 2 )) {
        /* NEON intrinsics optimization now can only parallelize up to 4 delay decision states.    */
        /* If there are more states, C function is called, and this optimization must be expanded. */
        /* When the number of delay decision states is less than 3, there are penalties using this */
        /* optimization, and C function is called.                                                 */
        /* When the number of delay decision states is 2, it's better to specialize another        */
        /* structure NSQ_del_dec2_struct and optimize with shorter NEON registers. (Low priority)  */
        silk_NSQ_del_dec_c( psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14,
            Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14 );
    } else {
        opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
        opus_int            smpl_buf_idx, decisionDelay;
        const opus_int16    *A_Q12, *B_Q14, *AR_shp_Q13;
        opus_int16          *pxq;
        VARDECL( opus_int32, sLTP_Q15 );
        VARDECL( opus_int16, sLTP );
        opus_int32          HarmShapeFIRPacked_Q14;
        opus_int            offset_Q10;
        opus_int32          RDmin_Q10, Gain_Q10;
        VARDECL( opus_int32, x_sc_Q10 );
        VARDECL( opus_int32, delayedGain_Q10 );
        VARDECL( NSQ_del_decs_struct, psDelDec );
        int32x4_t           t_s32x4;
        SAVE_STACK;

        /* Set unvoiced lag to the previous one, overwrite later for voiced */
        lag = NSQ->lagPrev;

        silk_assert( NSQ->prev_gain_Q16 != 0 );

        /* Initialize delayed decision states */
        ALLOC( psDelDec, 1, NSQ_del_decs_struct );
        /* Clear the whole struct up front (keeps uninitialized reads away),  */
        /* then explicitly (re)initialize the fields the algorithm relies on. */
        OPUS_CLEAR(psDelDec, 1);
        /* Only RandState and RD_Q10 need to be initialized to 0. */
        silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) );
        vst1q_s32( psDelDec->RD_Q10, vdupq_n_s32( 0 ) );

        for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
            psDelDec->SeedInit[ k ] = psDelDec->Seed[ k ] = ( k + psIndices->Seed ) & 3;
        }
        /* Broadcast the shared NSQ filter states into all delayed-decision lanes. */
        vst1q_s32( psDelDec->LF_AR_Q14, vld1q_dup_s32( &NSQ->sLF_AR_shp_Q14 ) );
        vst1q_s32( psDelDec->Diff_Q14, vld1q_dup_s32( &NSQ->sDiff_shp_Q14 ) );
        vst1q_s32( psDelDec->Shape_Q14[ 0 ], vld1q_dup_s32( &NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ] ) );
        for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
            vst1q_s32( psDelDec->sLPC_Q14[ i ], vld1q_dup_s32( &NSQ->sLPC_Q14[ i ] ) );
        }
        for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
            vst1q_s32( psDelDec->sAR2_Q14[ i ], vld1q_dup_s32( &NSQ->sAR2_Q14[ i ] ) );
        }

        offset_Q10   = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
        smpl_buf_idx = 0; /* index of oldest samples */

        decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );

        /* For voiced frames limit the decision delay to lower than the pitch lag */
        if( psIndices->signalType == TYPE_VOICED ) {
            opus_int pitch_min = pitchL[ 0 ];
            for( k = 1; k < psEncC->nb_subfr; k++ ) {
                pitch_min = silk_min_int( pitch_min, pitchL[ k ] );
            }
            decisionDelay = silk_min_int( decisionDelay, pitch_min - LTP_ORDER / 2 - 1 );
        } else {
            if( lag > 0 ) {
                decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
            }
        }

        if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
            LSF_interpolation_flag = 0;
        } else {
            LSF_interpolation_flag = 1;
        }

        ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
        ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
        ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
        ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
        /* Set up pointers to start of sub frame */
        pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
        NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
        NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
        subfr = 0;
        for( k = 0; k < psEncC->nb_subfr; k++ ) {
            A_Q12      = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
            B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER           ];
            AR_shp_Q13 = &AR_Q13[     k * MAX_SHAPE_LPC_ORDER ];

            /* Noise shape parameters */
            silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
            /* Pack the 3-tap harmonic shaping FIR: center tap in the low 16   */
            /* bits, the two (equal) outer taps in the high 16 bits.           */
            HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
            HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );

            NSQ->rewhite_flag = 0;
            if( psIndices->signalType == TYPE_VOICED ) {
                /* Voiced */
                lag = pitchL[ k ];

                /* Re-whitening */
                if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
                    if( k == 2 ) {
                        /* RESET DELAYED DECISIONS */
                        /* Find winner */
                        int32x4_t RD_Q10_s32x4;
                        RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
                        Winner_ind = 0;
                        for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
                            if( psDelDec->RD_Q10[ i ] < RDmin_Q10 ) {
                                RDmin_Q10 = psDelDec->RD_Q10[ i ];
                                Winner_ind = i;
                            }
                        }
                        /* Penalize every state except the winner: subtract the  */
                        /* offset from the winner only, then add it to all lanes.*/
                        psDelDec->RD_Q10[ Winner_ind ] -= ( silk_int32_MAX >> 4 );
                        RD_Q10_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
                        RD_Q10_s32x4 = vaddq_s32( RD_Q10_s32x4, vdupq_n_s32( silk_int32_MAX >> 4 ) );
                        vst1q_s32( psDelDec->RD_Q10, RD_Q10_s32x4 );

                        /* Copy final part of signals from winner state to output and long-term filter states */
                        copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gains_Q16[ 1 ], 14, pulses, pxq, NSQ );

                        subfr = 0;
                    }

                    /* Rewhiten with new A coefs */
                    start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
                    silk_assert( start_idx > 0 );

                    silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
                        A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );

                    NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
                    NSQ->rewhite_flag = 1;
                }
            }

            silk_nsq_del_dec_scale_states_neon( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
                LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );

            silk_noise_shape_quantizer_del_dec_neon( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
                delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
                Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
                psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );

            x16    += psEncC->subfr_length;
            pulses += psEncC->subfr_length;
            pxq    += psEncC->subfr_length;
        }

        /* Find winner */
        RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
        Winner_ind = 0;
        for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
            if( psDelDec->RD_Q10[ k ] < RDmin_Q10 ) {
                RDmin_Q10 = psDelDec->RD_Q10[ k ];
                Winner_ind = k;
            }
        }

        /* Copy final part of signals from winner state to output and long-term filter states */
        psIndices->Seed = psDelDec->SeedInit[ Winner_ind ];
        Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
        copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gain_Q10, 8, pulses, pxq, NSQ );

        /* Copy the winner lane of the LPC state back to the shared NSQ state, */
        /* gathering 4 elements per vector store, with a scalar tail.          */
        t_s32x4 = vdupq_n_s32( 0 ); /* initialization */
        for( i = 0; i < ( NSQ_LPC_BUF_LENGTH - 3 ); i += 4 ) {
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 );
            vst1q_s32( &NSQ->sLPC_Q14[ i ], t_s32x4 );
        }

        for( ; i < NSQ_LPC_BUF_LENGTH; i++ ) {
          NSQ->sLPC_Q14[ i ] = psDelDec->sLPC_Q14[ i ][ Winner_ind ];
        }

        /* Same for the winner lane of the noise-shaping AR filter state. */
        for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) - 3 ); i += 4 ) {
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 );
            vst1q_s32( &NSQ->sAR2_Q14[ i ], t_s32x4 );
        }

        for( ; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
          NSQ->sAR2_Q14[ i ] = psDelDec->sAR2_Q14[ i ][ Winner_ind ];
        }

        /* Update states */
        NSQ->sLF_AR_shp_Q14 = psDelDec->LF_AR_Q14[ Winner_ind ];
        NSQ->sDiff_shp_Q14  = psDelDec->Diff_Q14[ Winner_ind ];
        NSQ->lagPrev        = pitchL[ psEncC->nb_subfr - 1 ];

        /* Save quantized speech signal */
        silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
        silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
        RESTORE_STACK;
    }

#ifdef OPUS_CHECK_ASM
    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
    silk_assert( !memcmp( pulses_c, pulses_a, sizeof( pulses_c ) ) );
#endif
}
456 
457 /******************************************/
458 /* Noise shape quantizer for one subframe */
459 /******************************************/
460 /* Note: Function silk_short_prediction_create_arch_coef_neon() defined in NSQ_neon.h is actually a hacking C function. */
461 /*       Therefore here we append "_local" to the NEON function name to avoid confusion.                                */
/* Note: Function silk_short_prediction_create_arch_coef_neon() defined in NSQ_neon.h is actually a hacking C function. */
/*       Therefore here we append "_local" to the NEON function name to avoid confusion.                                */
/* Expand 16-bit Q12 prediction coefficients into 16 reversed 32-bit entries  */
/* (each shifted left by 15) so that vqdmulhq_lane_s32() later computes the   */
/* equivalent of silk_SMLAWB(). For order 10, the 6 unused leading slots are  */
/* zero-filled. Comments show lane contents from highest to lowest lane.      */
static OPUS_INLINE void silk_short_prediction_create_arch_coef_neon_local(opus_int32 *out, const opus_int16 *in, opus_int order)
{
  int16x8_t t_s16x8;
  int32x4_t t0_s32x4, t1_s32x4, t2_s32x4, t3_s32x4;
  silk_assert( order == 10 || order == 16 );

  t_s16x8 = vld1q_s16( in + 0 );                                                   /* 7 6 5 4  3 2 1 0    */
  t_s16x8 = vrev64q_s16( t_s16x8 );                                                /* 4 5 6 7  0 1 2 3    */
  t2_s32x4 = vshll_n_s16( vget_high_s16( t_s16x8 ), 15 );                          /* 4 5 6 7             */
  t3_s32x4 = vshll_n_s16( vget_low_s16(  t_s16x8 ), 15 );                          /* 0 1 2 3             */

  if( order == 16 ) {
      t_s16x8 = vld1q_s16( in + 8 );                                               /* F E D C  B A 9 8    */
      t_s16x8 = vrev64q_s16( t_s16x8 );                                            /* C D E F  8 9 A B    */
      t0_s32x4 = vshll_n_s16( vget_high_s16( t_s16x8 ), 15 );                      /* C D E F             */
      t1_s32x4 = vshll_n_s16( vget_low_s16(  t_s16x8 ), 15 );                      /* 8 9 A B             */
  } else {
      int16x4_t t_s16x4;

      /* Order 10: coefs 10..15 do not exist; pad the top slots with zeros. */
      t0_s32x4 = vdupq_n_s32( 0 );                                                 /* zero zero zero zero */
      t_s16x4 = vld1_s16( in + 6 );                                                /* 9    8    7    6    */
      t_s16x4 = vrev64_s16( t_s16x4 );                                             /* 6    7    8    9    */
      t1_s32x4 = vshll_n_s16( t_s16x4, 15 );
      t1_s32x4 = vcombine_s32( vget_low_s32(t0_s32x4), vget_low_s32( t1_s32x4 ) ); /* 8    9    zero zero */
  }
  vst1q_s32( out +  0, t0_s32x4 );
  vst1q_s32( out +  4, t1_s32x4 );
  vst1q_s32( out +  8, t2_s32x4 );
  vst1q_s32( out + 12, t3_s32x4 );
}
492 
/* 4-way multiply-accumulate: out[k] + ( ( in[k] * b0 ) >> 16 ), where b0 is  */
/* lane 0 of coef_s32x2 holding a coefficient pre-shifted left by 15 so that  */
/* vqdmulh ( ( x * y ) >> 31 ) yields the >> 16 of silk_SMLAWB().             */
static OPUS_INLINE int32x4_t silk_SMLAWB_lane0_neon(
    const int32x4_t out_s32x4,
    const int32x4_t in_s32x4,
    const int32x2_t coef_s32x2
)
{
    const int32x4_t prod_s32x4 = vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 0 );
    return vaddq_s32( out_s32x4, prod_s32x4 );
}
501 
/* Same as silk_SMLAWB_lane0_neon() but multiplies by lane 1 of coef_s32x2.   */
static OPUS_INLINE int32x4_t silk_SMLAWB_lane1_neon(
    const int32x4_t out_s32x4,
    const int32x4_t in_s32x4,
    const int32x2_t coef_s32x2
)
{
    const int32x4_t prod_s32x4 = vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 1 );
    return vaddq_s32( out_s32x4, prod_s32x4 );
}
510 
/* Note: This function has different return value than silk_noise_shape_quantizer_short_prediction_neon(). */
/*       Therefore here we append "_local" to the function name to avoid confusion.                        */
/* Computes the short-term LPC prediction (Q14 before the caller's << 4) for  */
/* all 4 delayed-decision states at once. buf32 is the interleaved history    */
/* ( NEON_MAX_DEL_DEC_STATES values per tap ), a_Q12_arch is the 16-entry     */
/* coefficient layout built by                                                */
/* silk_short_prediction_create_arch_coef_neon_local(); for order == 10 the   */
/* first 6 taps are zero there, so the fixed 16-tap unrolled kernel below is  */
/* correct for both orders.                                                   */
static OPUS_INLINE int32x4_t silk_noise_shape_quantizer_short_prediction_neon_local(const opus_int32 *buf32, const opus_int32 *a_Q12_arch, opus_int order)
{
    const int32x4_t a_Q12_arch0_s32x4 = vld1q_s32( a_Q12_arch + 0 );
    const int32x4_t a_Q12_arch1_s32x4 = vld1q_s32( a_Q12_arch + 4 );
    const int32x4_t a_Q12_arch2_s32x4 = vld1q_s32( a_Q12_arch + 8 );
    const int32x4_t a_Q12_arch3_s32x4 = vld1q_s32( a_Q12_arch + 12 );
    int32x4_t LPC_pred_Q14_s32x4;

    silk_assert( order == 10 || order == 16 );
    /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    LPC_pred_Q14_s32x4 = vdupq_n_s32( silk_RSHIFT( order, 1 ) );
    /* One silk_SMLAWB per tap; each coefficient vector supplies 4 taps via its low/high lane pairs. */
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  0 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32(  a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  1 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32(  a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  2 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  3 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  4 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32(  a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  5 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32(  a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  6 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  7 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  8 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32(  a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  9 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32(  a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 10 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 11 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 12 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32(  a_Q12_arch3_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 13 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32(  a_Q12_arch3_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 14 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 15 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) );

    return LPC_pred_Q14_s32x4;
}
543 
/* Delayed-decision noise shaping quantizer for one subframe. The             */
/* NEON_MAX_DEL_DEC_STATES (4) delayed-decision states are processed in       */
/* parallel, one per 32-bit SIMD lane of each vector; per-state arrays in     */
/* psDelDec are laid out with the state index as the innermost dimension so   */
/* a single vld1q_s32/vst1q_s32 touches all states of one element.            */
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
    NSQ_del_decs_struct psDelDec[],             /* I/O  Delayed decision states             */
    opus_int            signalType,             /* I    Signal type                         */
    const opus_int32    x_Q10[],                /* I                                        */
    opus_int8           pulses[],               /* O                                        */
    opus_int16          xq[],                   /* O                                        */
    opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
    opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
    const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
    const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
    opus_int            lag,                    /* I    Pitch lag                           */
    opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
    opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
    opus_int32          LF_shp_Q14,             /* I                                        */
    opus_int32          Gain_Q16,               /* I                                        */
    opus_int            Lambda_Q10,             /* I                                        */
    opus_int            offset_Q10,             /* I                                        */
    opus_int            length,                 /* I    Input length                        */
    opus_int            subfr,                  /* I    Subframe number                     */
    opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
    opus_int            predictLPCOrder,        /* I    Prediction filter order             */
    opus_int            warping_Q16,            /* I                                        */
    opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
    opus_int            *smpl_buf_idx,          /* I/O  Index to newest samples in buffers  */
    opus_int            decisionDelay           /* I                                        */
)
{
    opus_int     i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
    opus_int32   Winner_rand_state;
    opus_int32   LTP_pred_Q14, n_LTP_Q14;
    opus_int32   RDmin_Q10, RDmax_Q10;
    opus_int32   Gain_Q10;
    opus_int32   *pred_lag_ptr, *shp_lag_ptr;
    opus_int32   a_Q12_arch[MAX_LPC_ORDER];
    /* Coefficients pre-shifted left by 15 so vqdmulh ( ( x * y ) >> 31 ) realizes the >> 16 of silk_SMLAWB/silk_SMULWB. */
    const int32x2_t warping_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( warping_Q16, 16 ) >> 1 );
    const opus_int32 LF_shp_Q29 = silk_LSHIFT32( LF_shp_Q14, 16 ) >> 1;
    opus_int32 AR_shp_Q28[ MAX_SHAPE_LPC_ORDER ];
    const uint32x4_t rand_multiplier_u32x4 = vdupq_n_u32( RAND_MULTIPLIER );
    const uint32x4_t rand_increment_u32x4 = vdupq_n_u32( RAND_INCREMENT );

    VARDECL( NSQ_samples_struct, psSampleState );
    SAVE_STACK;

    silk_assert( nStatesDelayedDecision > 0 );
    silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
    /* psSampleState[ 0 ] holds the better of the two quantization candidates per state, psSampleState[ 1 ] the other. */
    ALLOC( psSampleState, 2, NSQ_samples_struct );
    OPUS_CLEAR(psSampleState, 2);

    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );

    /* Pre-shift the shaping coefficients to Q28 ( << 15 ) for the vqdmulh-based SMLAWB below. */
    for( i = 0; i < ( MAX_SHAPE_LPC_ORDER - 7 ); i += 8 ) {
      const int16x8_t t_s16x8 = vld1q_s16( AR_shp_Q13 +  i );
      vst1q_s32( AR_shp_Q28 + i + 0, vshll_n_s16( vget_low_s16(  t_s16x8 ), 15 ) );
      vst1q_s32( AR_shp_Q28 + i + 4, vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ) );
    }

    for( ; i < MAX_SHAPE_LPC_ORDER; i++ ) {
      AR_shp_Q28[i] = silk_LSHIFT32( AR_shp_Q13[i], 15 );
    }

    silk_short_prediction_create_arch_coef_neon_local( a_Q12_arch, a_Q12, predictLPCOrder );

    for( i = 0; i < length; i++ ) {
        int32x4_t Seed_s32x4, LPC_pred_Q14_s32x4;
        int32x4_t sign_s32x4, tmp1_s32x4, tmp2_s32x4;
        int32x4_t n_AR_Q14_s32x4, n_LF_Q14_s32x4;
        int32x2_t AR_shp_Q28_s32x2;
        int16x4_t r_Q10_s16x4, rr_Q10_s16x4;

        /* Perform common calculations used in all states */

        /* Long-term prediction */
        if( signalType == TYPE_VOICED ) {
            /* Unrolled loop */
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
            LTP_pred_Q14 = 2;
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[  0 ], b_Q14[ 0 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -1 ], b_Q14[ 1 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -2 ], b_Q14[ 2 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -3 ], b_Q14[ 3 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
            LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 );                          /* Q13 -> Q14 */
            pred_lag_ptr++;
        } else {
            LTP_pred_Q14 = 0;
        }

        /* Long-term shaping */
        if( lag > 0 ) {
            /* Symmetric, packed FIR coefficients */
            n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
            n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 );            /* Q12 -> Q14 */
            shp_lag_ptr++;
        } else {
            n_LTP_Q14 = 0;
        }

        /* Generate dither: per-state LCG, all 4 seeds advanced at once */
        Seed_s32x4 = vld1q_s32( psDelDec->Seed );
        Seed_s32x4 = vreinterpretq_s32_u32( vmlaq_u32( rand_increment_u32x4, vreinterpretq_u32_s32( Seed_s32x4 ), rand_multiplier_u32x4 ) );
        vst1q_s32( psDelDec->Seed, Seed_s32x4 );

        /* Short-term prediction */
        LPC_pred_Q14_s32x4 = silk_noise_shape_quantizer_short_prediction_neon_local(psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 16 + i ], a_Q12_arch, predictLPCOrder);
        LPC_pred_Q14_s32x4 = vshlq_n_s32( LPC_pred_Q14_s32x4, 4 ); /* Q10 -> Q14 */

        /* Noise shape feedback */
        /* Output of lowpass section */
        tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->Diff_Q14 ), vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), warping_Q16_s32x2 );
        /* Output of allpass section */
        tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ 1 ] ), tmp2_s32x4 );
        tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
        vst1q_s32( psDelDec->sAR2_Q14[ 0 ], tmp2_s32x4 );
        AR_shp_Q28_s32x2 = vld1_s32( AR_shp_Q28 );
        n_AR_Q14_s32x4 = vaddq_s32( vdupq_n_s32( silk_RSHIFT( shapingLPCOrder, 1 ) ), vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );

        /* Loop over allpass sections; two sections per iteration, hence the even-order assertion above */
        for( j = 2; j < shapingLPCOrder; j += 2 ) {
            /* Output of allpass section */
            tmp2_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4 );
            tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j - 1 ] ), tmp2_s32x4, warping_Q16_s32x2 );
            vst1q_s32( psDelDec->sAR2_Q14[ j - 1 ], tmp1_s32x4 );
            n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
            /* Output of allpass section */
            tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 1 ] ), tmp2_s32x4 );
            tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
            vst1q_s32( psDelDec->sAR2_Q14[ j + 0 ], tmp2_s32x4 );
            AR_shp_Q28_s32x2 = vld1_s32( &AR_shp_Q28[ j ] );
            n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );
        }
        vst1q_s32( psDelDec->sAR2_Q14[ shapingLPCOrder - 1 ], tmp1_s32x4 );
        n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
        n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 1 );                                                                                        /* Q11 -> Q12 */
        n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( Tilt_Q14, 16 ) >> 1 ) );     /* Q12 */
        n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 2 );                                                                                        /* Q12 -> Q14 */
        n_LF_Q14_s32x4 = vqdmulhq_n_s32( vld1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ] ), LF_shp_Q29 );                                         /* Q12 */
        n_LF_Q14_s32x4 = vaddq_s32( n_LF_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( LF_shp_Q14 >> 16 , 15 ) ) ); /* Q12 */
        n_LF_Q14_s32x4 = vshlq_n_s32( n_LF_Q14_s32x4, 2 );                                                                                        /* Q12 -> Q14 */

        /* Input minus prediction plus noise feedback                       */
        /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
        tmp1_s32x4 = vaddq_s32( n_AR_Q14_s32x4, n_LF_Q14_s32x4 );               /* Q14 */
        tmp2_s32x4 = vaddq_s32( vdupq_n_s32( n_LTP_Q14 ), LPC_pred_Q14_s32x4 ); /* Q13 */
        tmp1_s32x4 = vsubq_s32( tmp2_s32x4, tmp1_s32x4 );                       /* Q13 */
        tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 4 );                             /* Q10 */
        tmp1_s32x4 = vsubq_s32( vdupq_n_s32( x_Q10[ i ] ), tmp1_s32x4 );        /* residual error Q10 */

        /* Flip sign depending on dither */
        sign_s32x4 = vreinterpretq_s32_u32( vcltq_s32( Seed_s32x4, vdupq_n_s32( 0 ) ) );
        tmp1_s32x4 = veorq_s32( tmp1_s32x4, sign_s32x4 );
        tmp1_s32x4 = vsubq_s32( tmp1_s32x4, sign_s32x4 );
        /* Clamp so the residual fits in 16 bits for the narrowing below */
        tmp1_s32x4 = vmaxq_s32( tmp1_s32x4, vdupq_n_s32( -( 31 << 10 ) ) );
        tmp1_s32x4 = vminq_s32( tmp1_s32x4, vdupq_n_s32( 30 << 10 ) );
        r_Q10_s16x4 = vmovn_s32( tmp1_s32x4 );

        /* Find two quantization level candidates and measure their rate-distortion */
        {
            int16x4_t q1_Q10_s16x4 = vsub_s16( r_Q10_s16x4, vdup_n_s16( offset_Q10 ) );
            int16x4_t q1_Q0_s16x4 = vshr_n_s16( q1_Q10_s16x4, 10 );
            int16x4_t q2_Q10_s16x4;
            int32x4_t rd1_Q10_s32x4, rd2_Q10_s32x4;
            uint32x4_t t_u32x4;

            if( Lambda_Q10 > 2048 ) {
                /* For aggressive RDO, the bias becomes more than one pulse. */
                const int rdo_offset = Lambda_Q10/2 - 512;
                const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) );
                const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) );
                /* Pull large-magnitude residuals towards zero by rdo_offset before truncating */
                int16x4_t signed_offset = vbsl_s16( greaterThanRdo, vdup_n_s16( -rdo_offset ), vdup_n_s16( 0 ) );
                signed_offset = vbsl_s16( lessThanMinusRdo, vdup_n_s16( rdo_offset ), signed_offset );
                /* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. */
                silk_assert( Lambda_Q10 <= 32767 );

                /* In the middle band the candidate collapses to 0 or -1 ( all-ones mask ) depending on sign */
                q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) );
                q1_Q0_s16x4 = vbsl_s16(vorr_u16(greaterThanRdo, lessThanMinusRdo), vadd_s16( q1_Q10_s16x4 , signed_offset), q1_Q0_s16x4);
                q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 );
            }
            {
                const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) );
                const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
                const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
                int16x4_t tmp1_s16x4, tmp2_s16x4, tmp_summand_s16x4;

                /* Requantize with offset and level adjustment; special-case levels 0 and -1 */
                q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 );
                tmp_summand_s16x4 = vand_s16( vreinterpret_s16_u16(vcge_s16(q1_Q0_s16x4, vdup_n_s16(0))), vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
                tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4 );
                tmp_summand_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ), vdup_n_s16(0) );
                q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4,  tmp_summand_s16x4);
                q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 );
                q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 );
                q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 );
                /* Second candidate is one level up from the first */
                q2_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( 1024 ) );
                q2_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 + 1024 - QUANT_LEVEL_ADJUST_Q10 ), q2_Q10_s16x4 );
                q2_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 ), q2_Q10_s16x4 );
                /* Rate term: Lambda_Q10 * |q|, with sign folded in via conditional negation */
                tmp1_s16x4 = q1_Q10_s16x4;
                tmp2_s16x4 = q2_Q10_s16x4;
                tmp1_s16x4 = vbsl_s16( vorr_u16( equalMinus1_u16x4, lessThanMinus1_u16x4 ), vneg_s16( tmp1_s16x4 ), tmp1_s16x4 );
                tmp2_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vneg_s16( tmp2_s16x4 ), tmp2_s16x4 );
                rd1_Q10_s32x4 = vmull_s16( tmp1_s16x4, vdup_n_s16( Lambda_Q10 ) );
                rd2_Q10_s32x4 = vmull_s16( tmp2_s16x4, vdup_n_s16( Lambda_Q10 ) );
            }

            /* Distortion term: squared quantization error added to the rate term */
            rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q1_Q10_s16x4 );
            rd1_Q10_s32x4 = vmlal_s16( rd1_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 );
            rd1_Q10_s32x4 = vshrq_n_s32( rd1_Q10_s32x4, 10 );

            rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q2_Q10_s16x4 );
            rd2_Q10_s32x4 = vmlal_s16( rd2_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 );
            rd2_Q10_s32x4 = vshrq_n_s32( rd2_Q10_s32x4, 10 );

            /* Sort the two candidates per state: best into psSampleState[ 0 ], worst into [ 1 ] */
            tmp2_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
            tmp1_s32x4 = vaddq_s32( tmp2_s32x4, vminq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) );
            tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vmaxq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) );
            vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 );
            t_u32x4 = vcltq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 );
            tmp1_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q1_Q10_s16x4 ), vmovl_s16( q2_Q10_s16x4 ) );
            tmp2_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q2_Q10_s16x4 ), vmovl_s16( q1_Q10_s16x4 ) );
            vst1q_s32( psSampleState[ 0 ].Q_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].Q_Q10, tmp2_s32x4 );
        }

        {
            /* Update states for best quantization */
            int32x4_t exc_Q14_s32x4, LPC_exc_Q14_s32x4, xq_Q14_s32x4, sLF_AR_shp_Q14_s32x4;

            /* Quantized excitation */
            exc_Q14_s32x4 = vshlq_n_s32( tmp1_s32x4, 4 );
            exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 );
            exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 );

            /* Add predictions */
            LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) );
            xq_Q14_s32x4      = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 );

            /* Update states */
            tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) );
            vst1q_s32( psSampleState[ 0 ].Diff_Q14, tmp1_s32x4 );
            sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) );
            vst1q_s32( psSampleState[ 0 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].xq_Q14, xq_Q14_s32x4 );

            /* Same as above for the second-best candidate (tmp2_s32x4) */
            /* Quantized excitation */
            exc_Q14_s32x4 = vshlq_n_s32( tmp2_s32x4, 4 );
            exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 );
            exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 );

            /* Add predictions */
            LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) );
            xq_Q14_s32x4      = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 );

            /* Update states */
            tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) );
            vst1q_s32( psSampleState[ 1 ].Diff_Q14, tmp1_s32x4 );
            sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) );
            vst1q_s32( psSampleState[ 1 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].xq_Q14, xq_Q14_s32x4 );
        }

        /* Circular buffer index; the sum below can reach almost 3 * DECISION_DELAY, hence the double wrap */
        *smpl_buf_idx = *smpl_buf_idx ? ( *smpl_buf_idx - 1 ) : ( DECISION_DELAY - 1);
        last_smple_idx = *smpl_buf_idx + decisionDelay + DECISION_DELAY;
        if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
        if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;

        /* Find winner */
        RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ];
        Winner_ind = 0;
        for( k = 1; k < nStatesDelayedDecision; k++ ) {
            if( psSampleState[ 0 ].RD_Q10[ k ] < RDmin_Q10 ) {
                RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
                Winner_ind = k;
            }
        }

        /* clear unused part of RD_Q10 to avoid overflows */
        if( nStatesDelayedDecision < NEON_MAX_DEL_DEC_STATES )
        {
            OPUS_CLEAR(psSampleState[0].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision);
            OPUS_CLEAR(psSampleState[1].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision);
        }

        /* Increase RD values of expired states */
        {
            uint32x4_t t_u32x4;
            Winner_rand_state = psDelDec->RandState[ last_smple_idx ][ Winner_ind ];
            /* States whose delayed RandState differs from the winner's get a large penalty: ~0 >> 5 = 0x07FFFFFF */
            t_u32x4 = vceqq_s32( vld1q_s32( psDelDec->RandState[ last_smple_idx ] ), vdupq_n_s32( Winner_rand_state ) );
            t_u32x4 = vmvnq_u32( t_u32x4 );
            t_u32x4 = vshrq_n_u32( t_u32x4, 5 );
            tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].RD_Q10 );
            tmp2_s32x4 = vld1q_s32( psSampleState[ 1 ].RD_Q10 );
            tmp1_s32x4 = vaddq_s32( tmp1_s32x4, vreinterpretq_s32_u32( t_u32x4 ) );
            tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vreinterpretq_s32_u32( t_u32x4 ) );
            vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 );

            /* Find worst in first set and best in second set */
            RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ];
            RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ 0 ];
            RDmax_ind = 0;
            RDmin_ind = 0;
            for( k = 1; k < nStatesDelayedDecision; k++ ) {
                /* find worst in first set */
                if( psSampleState[ 0 ].RD_Q10[ k ] > RDmax_Q10 ) {
                    RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
                    RDmax_ind = k;
                }
                /* find best in second set */
                if( psSampleState[ 1 ].RD_Q10[ k ] < RDmin_Q10 ) {
                    RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ k ];
                    RDmin_ind = k;
                }
            }
        }

        /* Replace a state if best from second set outperforms worst in first set */
        if( RDmin_Q10 < RDmax_Q10 ) {
            /* NOTE(review): relies on RandState being the first member after sLPC_Q14 in NSQ_del_decs_struct,
               so the remaining per-state members can be copied as numOthers consecutive rows — confirm against the struct definition. */
            opus_int32 (*ptr)[NEON_MAX_DEL_DEC_STATES] = psDelDec->RandState;
            const int numOthers = (int)( ( sizeof( NSQ_del_decs_struct ) - sizeof( ( (NSQ_del_decs_struct *)0 )->sLPC_Q14 ) )
                / ( NEON_MAX_DEL_DEC_STATES * sizeof( opus_int32 ) ) );
            /* Only ( predictLPCOrder - 1 ) of sLPC_Q14 buffer need to be updated, though the first several     */
            /* useless sLPC_Q14[] will be different comparing with C when predictLPCOrder < NSQ_LPC_BUF_LENGTH. */
            /* Here just update constant ( NSQ_LPC_BUF_LENGTH - 1 ) for simplicity.                             */
            for( j = i + 1; j < i + NSQ_LPC_BUF_LENGTH; j++ ) {
                psDelDec->sLPC_Q14[ j ][ RDmax_ind ] = psDelDec->sLPC_Q14[ j ][ RDmin_ind ];
            }
            for( j = 0; j < numOthers; j++ ) {
                ptr[ j ][ RDmax_ind ] = ptr[ j ][ RDmin_ind ];
            }

            psSampleState[ 0 ].Q_Q10[ RDmax_ind ] = psSampleState[ 1 ].Q_Q10[ RDmin_ind ];
            psSampleState[ 0 ].RD_Q10[ RDmax_ind ] = psSampleState[ 1 ].RD_Q10[ RDmin_ind ];
            psSampleState[ 0 ].xq_Q14[ RDmax_ind ] = psSampleState[ 1 ].xq_Q14[ RDmin_ind ];
            psSampleState[ 0 ].LF_AR_Q14[ RDmax_ind ] = psSampleState[ 1 ].LF_AR_Q14[ RDmin_ind ];
            psSampleState[ 0 ].Diff_Q14[ RDmax_ind ] = psSampleState[ 1 ].Diff_Q14[ RDmin_ind ];
            psSampleState[ 0 ].sLTP_shp_Q14[ RDmax_ind ] = psSampleState[ 1 ].sLTP_shp_Q14[ RDmin_ind ];
            psSampleState[ 0 ].LPC_exc_Q14[ RDmax_ind ] = psSampleState[ 1 ].LPC_exc_Q14[ RDmin_ind ];
        }

        /* Write samples from winner to output and long-term filter states */
        if( subfr > 0 || i >= decisionDelay ) {
            pulses[  i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
            xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
                silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
            NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
            sLTP_Q15[          NSQ->sLTP_buf_idx     - decisionDelay ] = psDelDec->Pred_Q15[  last_smple_idx ][ Winner_ind ];
        }
        NSQ->sLTP_shp_buf_idx++;
        NSQ->sLTP_buf_idx++;

        /* Update states */
        vst1q_s32( psDelDec->LF_AR_Q14, vld1q_s32( psSampleState[ 0 ].LF_AR_Q14 ) );
        vst1q_s32( psDelDec->Diff_Q14, vld1q_s32( psSampleState[ 0 ].Diff_Q14 ) );
        vst1q_s32( psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) );
        vst1q_s32( psDelDec->Xq_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) );
        tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].Q_Q10 );
        vst1q_s32( psDelDec->Q_Q10[ *smpl_buf_idx ], tmp1_s32x4 );
        vst1q_s32( psDelDec->Pred_Q15[ *smpl_buf_idx ], vshlq_n_s32( vld1q_s32( psSampleState[ 0 ].LPC_exc_Q14 ), 1 ) );
        vst1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].sLTP_shp_Q14 ) );
        /* Mix the rounded pulse into the seed (unsigned add, matching the scalar code's wrap-around) */
        tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 10 );
        tmp1_s32x4 = vreinterpretq_s32_u32( vaddq_u32( vreinterpretq_u32_s32(
            vld1q_s32( psDelDec->Seed ) ), vreinterpretq_u32_s32( tmp1_s32x4 ) ) );
        vst1q_s32( psDelDec->Seed, tmp1_s32x4 );
        vst1q_s32( psDelDec->RandState[ *smpl_buf_idx ], tmp1_s32x4 );
        vst1q_s32( psDelDec->RD_Q10, vld1q_s32( psSampleState[ 0 ].RD_Q10 ) );
        delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10;
    }
    /* Update LPC states */
    silk_memcpy( psDelDec->sLPC_Q14[ 0 ], psDelDec->sLPC_Q14[ length ], NEON_MAX_DEL_DEC_STATES * NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );

    RESTORE_STACK;
}
924 
/* 8-way o[k] = ( a[k] * b0 ) >> 16 for int16 inputs, where b0 is lane 0 of   */
/* b, holding the second operand pre-shifted left by 15 (caller prepares it). */
static OPUS_INLINE void silk_SMULWB_8_neon(
    const opus_int16 *a,
    const int32x2_t  b,
    opus_int32       *o
)
{
    const int16x8_t in_s16x8 = vld1q_s16( a );
    /* Widen each half with a << 15 so that vqdmulh ( ( x * y ) >> 31 ) nets a >> 16. */
    const int32x4_t lo_s32x4 = vqdmulhq_lane_s32( vshll_n_s16( vget_low_s16(  in_s16x8 ), 15 ), b, 0 );
    const int32x4_t hi_s32x4 = vqdmulhq_lane_s32( vshll_n_s16( vget_high_s16( in_s16x8 ), 15 ), b, 0 );
    vst1q_s32( o,     lo_s32x4 );
    vst1q_s32( o + 4, hi_s32x4 );
}
941 
/* Only works when ( b >= -65536 ) && ( b < 65536 ). */
/* In-place a[k] = a[k] multiplied by lane 0 of b_s32x2 via vqdmulh           */
/* ( ( x * y ) >> 31 ); the caller pre-scales the coefficient accordingly.    */
static OPUS_INLINE void silk_SMULWW_small_b_4_neon(
    opus_int32       *a,
    const int32x2_t  b_s32x2)
{
    vst1q_s32( a, vqdmulhq_lane_s32( vld1q_s32( a ), b_s32x2, 0 ) );
}
953 
954 /* Only works when ( b >= -65536 ) && ( b < 65536 ). */
silk_SMULWW_small_b_8_neon(opus_int32 * a,const int32x2_t b_s32x2)955 static OPUS_INLINE void silk_SMULWW_small_b_8_neon(
956     opus_int32       *a,
957     const int32x2_t  b_s32x2
958 )
959 {
960     int32x4_t o0_s32x4, o1_s32x4;
961 
962     o0_s32x4 = vld1q_s32( a );
963     o1_s32x4 = vld1q_s32( a + 4 );
964     o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b_s32x2, 0 );
965     o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b_s32x2, 0 );
966     vst1q_s32( a, o0_s32x4 );
967     vst1q_s32( a + 4, o1_s32x4 );
968 }
969 
/* In-place full-range a[ i ] = silk_SMULWW( a[ i ], b ) for 4 elements.       */
/* The caller encodes b as lane 0 = ( b & 0xFFFF ) << 15 and lane 1 = b >> 16: */
/* vqdmulh gives ( x * ( b & 0xFFFF ) ) >> 16 and vmla adds x * ( b >> 16 ).   */
static OPUS_INLINE void silk_SMULWW_4_neon(
    opus_int32       *a,
    const int32x2_t  b_s32x2)
{
    int32x4_t in_s32x4, out_s32x4;

    in_s32x4  = vld1q_s32( a );
    out_s32x4 = vqdmulhq_lane_s32( in_s32x4, b_s32x2, 0 );
    out_s32x4 = vmlaq_lane_s32( out_s32x4, in_s32x4, b_s32x2, 1 );
    vst1q_s32( a, out_s32x4 );
}
981 
/* 8-element variant of silk_SMULWW_4_neon: full-range silk_SMULWW in place,   */
/* with b split across the two lanes of b_s32x2 ( low<<15 in lane 0, high in lane 1 ). */
static OPUS_INLINE void silk_SMULWW_8_neon(
    opus_int32       *a,
    const int32x2_t  b_s32x2
)
{
    int32x4_t in0_s32x4, in1_s32x4, out0_s32x4, out1_s32x4;

    in0_s32x4  = vld1q_s32( a );
    in1_s32x4  = vld1q_s32( a + 4 );
    out0_s32x4 = vqdmulhq_lane_s32( in0_s32x4, b_s32x2, 0 );
    out1_s32x4 = vqdmulhq_lane_s32( in1_s32x4, b_s32x2, 0 );
    out0_s32x4 = vmlaq_lane_s32( out0_s32x4, in0_s32x4, b_s32x2, 1 );
    out1_s32x4 = vmlaq_lane_s32( out1_s32x4, in1_s32x4, b_s32x2, 1 );
    vst1q_s32( a,     out0_s32x4 );
    vst1q_s32( a + 4, out1_s32x4 );
}
998 
/* o[ i ] = silk_SMULWW( a[ i ], b ) over loop_num 16-bit inputs:              */
/* vectorized in groups of 8, with a scalar tail for the remaining 0..7.       */
static OPUS_INLINE void silk_SMULWW_loop_neon(
    const opus_int16 *a,
    const opus_int32 b,
    opus_int32       *o,
    const opus_int   loop_num
)
{
    opus_int  k;
    int32x2_t b_s32x2;

    b_s32x2 = vdup_n_s32( b );
    for( k = 0; k + 8 <= loop_num; k += 8 ) {
        silk_SMULWB_8_neon( a + k, b_s32x2, o + k );
    }
    for( ; k < loop_num; k++ ) {
        o[ k ] = silk_SMULWW( a[ k ], b );
    }
}
1017 
/* Scale states and input for the current subframe: produce x_sc_Q10 = x16 / gain (Q10),  */
/* rescale the rewhitened LTP state with the inverse gain, and, when the gain changed     */
/* since the previous subframe, rescale all delayed-decision filter states by             */
/* prev_gain / current_gain so the quantizer states remain consistent across subframes.   */
static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
    silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
    NSQ_del_decs_struct psDelDec[],                 /* I/O  Delayed decision states             */
    const opus_int16    x16[],                      /* I    Input                               */
    opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
    const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
    opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
    opus_int            subfr,                      /* I    Subframe number                     */
    const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I                                        */
    const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
    const opus_int      signal_type,                /* I    Signal type                         */
    const opus_int      decisionDelay               /* I    Decision delay                      */
)
{
    opus_int            i, lag;
    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;

    lag          = pitchL[ subfr ];
    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
    silk_assert( inv_gain_Q31 != 0 );

    /* Scale input */
    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
    silk_SMULWW_loop_neon( x16, inv_gain_Q26, x_sc_Q10, psEncC->subfr_length );

    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    if( NSQ->rewhite_flag ) {
        if( subfr == 0 ) {
            /* Do LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
        }
        silk_SMULWW_loop_neon( sLTP + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, inv_gain_Q31, sLTP_Q15 + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, lag + LTP_ORDER / 2 );
    }

    /* Adjust for changing gain */
    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
        int32x2_t gain_adj_Q16_s32x2;
        /* Ratio of previous to current gain, Q16. */
        gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );

        /* Scale long-term shaping state */
        if( ( gain_adj_Q16 >= -65536 ) && ( gain_adj_Q16 < 65536 ) ) {
            /* gain_adj_Q16 fits in 16 bits: place ( gain_adj_Q16 << 15 ) in lane 0 so a  */
            /* single vqdmulh per vector computes silk_SMULWW (see the small_b helpers).  */
            gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16, 15 ) );
            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) {
                silk_SMULWW_small_b_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 );
            }
            /* Scalar tail for the remaining 0..7 entries. */
            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
            }

            /* Scale long-term prediction state */
            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) {
                    silk_SMULWW_small_b_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 );
                }
                for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
                }
            }

            /* Scale scalar states */
            silk_SMULWW_small_b_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 );
            silk_SMULWW_small_b_4_neon( psDelDec->Diff_Q14,  gain_adj_Q16_s32x2 );

            /* Scale short-term prediction and shaping states */
            for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < DECISION_DELAY; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->Pred_Q15[  i ], gain_adj_Q16_s32x2 );
                silk_SMULWW_small_b_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 );
            }
        } else {
            /* General case: lane 0 = ( low 16 bits of gain_adj_Q16 ) << 15, lane 1 = high  */
            /* 16 bits; the full SMULWW helpers combine vqdmulh( lane 0 ) + vmla( lane 1 ). */
            gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16 & 0x0000FFFF, 15 ) );
            gain_adj_Q16_s32x2 = vset_lane_s32( gain_adj_Q16 >> 16, gain_adj_Q16_s32x2, 1 );
            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) {
                silk_SMULWW_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 );
            }
            /* Scalar tail for the remaining 0..7 entries. */
            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
            }

            /* Scale long-term prediction state */
            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) {
                    silk_SMULWW_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 );
                }
                for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
                }
            }

            /* Scale scalar states */
            silk_SMULWW_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 );
            silk_SMULWW_4_neon( psDelDec->Diff_Q14,  gain_adj_Q16_s32x2 );

            /* Scale short-term prediction and shaping states */
            for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
                silk_SMULWW_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
                silk_SMULWW_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < DECISION_DELAY; i++ ) {
                silk_SMULWW_4_neon( psDelDec->Pred_Q15[  i ], gain_adj_Q16_s32x2 );
                silk_SMULWW_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 );
            }
        }

        /* Save inverse gain */
        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
    }
}
1139