/***********************************************************************
Copyright (c) 2017 Google Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#ifdef OPUS_CHECK_ASM
# include <string.h>
#endif
#include "main.h"
#include "stack_alloc.h"
#include "os_support.h"

/* The NEON intrinsics optimization currently parallelizes at most 4 delayed decision states. */
/* If there are more states, the C function is called, and this optimization must be expanded. */
#define NEON_MAX_DEL_DEC_STATES 4

typedef struct {
    opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 RandState[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Q_Q10[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Xq_Q14[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Pred_Q15[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Shape_Q14[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 LF_AR_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Diff_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Seed[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 SeedInit[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 RD_Q10[ NEON_MAX_DEL_DEC_STATES ];
} NSQ_del_decs_struct;

typedef struct {
    opus_int32 Q_Q10[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 RD_Q10[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 xq_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 LF_AR_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Diff_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 sLTP_shp_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 LPC_exc_Q14[ NEON_MAX_DEL_DEC_STATES ];
} NSQ_samples_struct;
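
/* Unlike the per-state delayed decision struct used by the C implementation              */
/* ( silk_NSQ_del_dec_c() ), the two structs above are transposed into struct-of-arrays   */
/* form: each row holds one field for all NEON_MAX_DEL_DEC_STATES states, so a single     */
/* int32x4 load or store touches the same sample of all four states at once.              */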

static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
    const silk_encoder_state *psEncC, /* I Encoder State */
    silk_nsq_state *NSQ, /* I/O NSQ state */
    NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */
    const opus_int16 x16[], /* I Input */
    opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
    const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
    opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
    opus_int subfr, /* I Subframe number */
    const opus_int LTP_scale_Q14, /* I LTP state scaling */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
    const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
    const opus_int signal_type, /* I Signal type */
    const opus_int decisionDelay /* I Decision delay */
);

/******************************************/
/* Noise shape quantizer for one subframe */
/******************************************/
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
    silk_nsq_state *NSQ, /* I/O NSQ state */
    NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */
    opus_int signalType, /* I Signal type */
    const opus_int32 x_Q10[], /* I */
    opus_int8 pulses[], /* O */
    opus_int16 xq[], /* O */
    opus_int32 sLTP_Q15[], /* I/O LTP filter state */
    opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */
    const opus_int16 a_Q12[], /* I Short term prediction coefs */
    const opus_int16 b_Q14[], /* I Long term prediction coefs */
    const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */
    opus_int lag, /* I Pitch lag */
    opus_int32 HarmShapeFIRPacked_Q14, /* I */
    opus_int Tilt_Q14, /* I Spectral tilt */
    opus_int32 LF_shp_Q14, /* I */
    opus_int32 Gain_Q16, /* I */
    opus_int Lambda_Q10, /* I */
    opus_int offset_Q10, /* I */
    opus_int length, /* I Input length */
    opus_int subfr, /* I Subframe number */
    opus_int shapingLPCOrder, /* I Shaping LPC filter order */
    opus_int predictLPCOrder, /* I Prediction filter order */
    opus_int warping_Q16, /* I */
    opus_int nStatesDelayedDecision, /* I Number of states in decision tree */
    opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */
    opus_int decisionDelay /* I */
);

static OPUS_INLINE void copy_winner_state_kernel(
    const NSQ_del_decs_struct *psDelDec,
    const opus_int offset,
    const opus_int last_smple_idx,
    const opus_int Winner_ind,
    const int32x2_t gain_lo_s32x2,
    const int32x2_t gain_hi_s32x2,
    const int32x4_t shift_s32x4,
    int32x4_t t0_s32x4,
    int32x4_t t1_s32x4,
    opus_int8 *const pulses,
    opus_int16 *pxq,
    silk_nsq_state *NSQ
)
{
    int16x8_t t_s16x8;
    int32x4_t o0_s32x4, o1_s32x4;

    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    t_s16x8 = vcombine_s16( vrshrn_n_s32( t0_s32x4, 10 ), vrshrn_n_s32( t1_s32x4, 10 ) );
    vst1_s8( &pulses[ offset ], vmovn_s16( t_s16x8 ) );

    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    o0_s32x4 = vqdmulhq_lane_s32( t0_s32x4, gain_lo_s32x2, 0 );
    o1_s32x4 = vqdmulhq_lane_s32( t1_s32x4, gain_lo_s32x2, 0 );
    o0_s32x4 = vmlaq_lane_s32( o0_s32x4, t0_s32x4, gain_hi_s32x2, 0 );
    o1_s32x4 = vmlaq_lane_s32( o1_s32x4, t1_s32x4, gain_hi_s32x2, 0 );
    o0_s32x4 = vrshlq_s32( o0_s32x4, shift_s32x4 );
    o1_s32x4 = vrshlq_s32( o1_s32x4, shift_s32x4 );
    vst1_s16( &pxq[ offset + 0 ], vqmovn_s32( o0_s32x4 ) );
    vst1_s16( &pxq[ offset + 4 ], vqmovn_s32( o1_s32x4 ) );

    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 0 ], t0_s32x4 );
    vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 4 ], t1_s32x4 );
}

static OPUS_INLINE void copy_winner_state(
    const NSQ_del_decs_struct *psDelDec,
    const opus_int decisionDelay,
    const opus_int smpl_buf_idx,
    const opus_int Winner_ind,
    const opus_int32 gain,
    const opus_int32 shift,
    opus_int8 *const pulses,
    opus_int16 *pxq,
    silk_nsq_state *NSQ
)
{
    opus_int i, last_smple_idx;
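    /* silk_SMULWW( x, gain ) in effect computes ( x * gain ) >> 16; it is emulated below by */
    /* splitting gain: vqdmulhq_lane_s32() with the low 16 bits pre-shifted left by 15       */
    /* yields ( x * ( gain & 0xFFFF ) ) >> 16, and vmlaq_lane_s32() with gain >> 16 adds     */
    /* the high-part product ( see the o0/o1 computations in copy_winner_state_kernel() ).   */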
    const int32x2_t gain_lo_s32x2 = vdup_n_s32( silk_LSHIFT32( gain & 0x0000FFFF, 15 ) );
    const int32x2_t gain_hi_s32x2 = vdup_n_s32( gain >> 16 );
    const int32x4_t shift_s32x4 = vdupq_n_s32( -shift );
    int32x4_t t0_s32x4, t1_s32x4;

    t0_s32x4 = t1_s32x4 = vdupq_n_s32( 0 ); /* initialization */
    last_smple_idx = smpl_buf_idx + decisionDelay - 1 + DECISION_DELAY;
    if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
    if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
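    /* The two conditional subtractions above implement the circular-buffer wrap without a  */
    /* division: smpl_buf_idx and decisionDelay never exceed DECISION_DELAY, so the sum     */
    /* exceeds the buffer length by less than 2 * DECISION_DELAY.                           */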

    for( i = 0; ( i < ( decisionDelay - 7 ) ) && ( last_smple_idx >= 7 ); i += 8, last_smple_idx -= 8 ) {
        copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
    }
    for( ; ( i < decisionDelay ) && ( last_smple_idx >= 0 ); i++, last_smple_idx-- ) {
        pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
    }

    last_smple_idx += DECISION_DELAY;
    for( ; i < ( decisionDelay - 7 ); i++, last_smple_idx-- ) {
        copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
    }
    for( ; i < decisionDelay; i++, last_smple_idx-- ) {
        pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
    }
}

void silk_NSQ_del_dec_neon(
    const silk_encoder_state *psEncC, /* I Encoder State */
    silk_nsq_state *NSQ, /* I/O NSQ state */
    SideInfoIndices *psIndices, /* I/O Quantization Indices */
    const opus_int16 x16[], /* I Input */
    opus_int8 pulses[], /* O Quantized pulse signal */
    const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */
    const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
    const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
    const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
    const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
    const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
    const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
    const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
    const opus_int LTP_scale_Q14 /* I LTP state scaling */
)
{
#ifdef OPUS_CHECK_ASM
    silk_nsq_state NSQ_c;
    SideInfoIndices psIndices_c;
    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
    const opus_int8 *const pulses_a = pulses;

    ( void )pulses_a;
    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
    silk_memcpy( pulses_c, pulses, sizeof( pulses_c ) );
    silk_NSQ_del_dec_c( psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
        pitchL, Lambda_Q10, LTP_scale_Q14 );
#endif

    /* The optimization parallelizes the different delayed decision states. */
    if( ( psEncC->nStatesDelayedDecision > NEON_MAX_DEL_DEC_STATES ) || ( psEncC->nStatesDelayedDecision <= 2 ) ) {
        /* The NEON intrinsics optimization currently parallelizes at most 4 delayed decision states. */
        /* If there are more states, the C function is called, and this optimization must be expanded. */
        /* When the number of delayed decision states is less than 3, this optimization carries a      */
        /* penalty, so the C function is called instead.                                               */
        /* When the number of delayed decision states is 2, it would be better to specialize another   */
        /* structure, NSQ_del_dec2_struct, and optimize with shorter NEON registers. (Low priority)    */
        silk_NSQ_del_dec_c( psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14,
            Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14 );
    } else {
        opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
        opus_int smpl_buf_idx, decisionDelay;
        const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
        opus_int16 *pxq;
        VARDECL( opus_int32, sLTP_Q15 );
        VARDECL( opus_int16, sLTP );
        opus_int32 HarmShapeFIRPacked_Q14;
        opus_int offset_Q10;
        opus_int32 RDmin_Q10, Gain_Q10;
        VARDECL( opus_int32, x_sc_Q10 );
        VARDECL( opus_int32, delayedGain_Q10 );
        VARDECL( NSQ_del_decs_struct, psDelDec );
        int32x4_t t_s32x4;
        SAVE_STACK;

        /* Set unvoiced lag to the previous one, overwrite later for voiced */
        lag = NSQ->lagPrev;

        silk_assert( NSQ->prev_gain_Q16 != 0 );

        /* Initialize delayed decision states */
        ALLOC( psDelDec, 1, NSQ_del_decs_struct );
        OPUS_CLEAR(psDelDec, 1);
        /* Only RandState and RD_Q10 need to be initialized to 0. */
        silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) );
        vst1q_s32( psDelDec->RD_Q10, vdupq_n_s32( 0 ) );

        for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
            psDelDec->SeedInit[ k ] = psDelDec->Seed[ k ] = ( k + psIndices->Seed ) & 3;
        }
        vst1q_s32( psDelDec->LF_AR_Q14, vld1q_dup_s32( &NSQ->sLF_AR_shp_Q14 ) );
        vst1q_s32( psDelDec->Diff_Q14, vld1q_dup_s32( &NSQ->sDiff_shp_Q14 ) );
        vst1q_s32( psDelDec->Shape_Q14[ 0 ], vld1q_dup_s32( &NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ] ) );
        for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
            vst1q_s32( psDelDec->sLPC_Q14[ i ], vld1q_dup_s32( &NSQ->sLPC_Q14[ i ] ) );
        }
        for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
            vst1q_s32( psDelDec->sAR2_Q14[ i ], vld1q_dup_s32( &NSQ->sAR2_Q14[ i ] ) );
        }

        offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
        smpl_buf_idx = 0; /* index of oldest samples */

        decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );

        /* For voiced frames limit the decision delay to lower than the pitch lag */
        if( psIndices->signalType == TYPE_VOICED ) {
            opus_int pitch_min = pitchL[ 0 ];
            for( k = 1; k < psEncC->nb_subfr; k++ ) {
                pitch_min = silk_min_int( pitch_min, pitchL[ k ] );
            }
            decisionDelay = silk_min_int( decisionDelay, pitch_min - LTP_ORDER / 2 - 1 );
        } else {
            if( lag > 0 ) {
                decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
            }
        }

        if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
            LSF_interpolation_flag = 0;
        } else {
            LSF_interpolation_flag = 1;
        }

        ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
        ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
        ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
        ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
        /* Set up pointers to start of sub frame */
        pxq = &NSQ->xq[ psEncC->ltp_mem_length ];
        NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
        NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
        subfr = 0;
        for( k = 0; k < psEncC->nb_subfr; k++ ) {
            A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
            B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
            AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];

            /* Noise shape parameters */
            silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
            HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
            HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );

            NSQ->rewhite_flag = 0;
            if( psIndices->signalType == TYPE_VOICED ) {
                /* Voiced */
                lag = pitchL[ k ];

                /* Re-whitening */
                if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
                    if( k == 2 ) {
                        /* RESET DELAYED DECISIONS */
                        /* Find winner */
                        int32x4_t RD_Q10_s32x4;
                        RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
                        Winner_ind = 0;
                        for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
                            if( psDelDec->RD_Q10[ i ] < RDmin_Q10 ) {
                                RDmin_Q10 = psDelDec->RD_Q10[ i ];
                                Winner_ind = i;
                            }
                        }
                        psDelDec->RD_Q10[ Winner_ind ] -= ( silk_int32_MAX >> 4 );
                        RD_Q10_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
                        RD_Q10_s32x4 = vaddq_s32( RD_Q10_s32x4, vdupq_n_s32( silk_int32_MAX >> 4 ) );
                        vst1q_s32( psDelDec->RD_Q10, RD_Q10_s32x4 );
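                        /* Subtracting silk_int32_MAX >> 4 from the winner only and then      */
                        /* adding it back to all lanes leaves the winner's RD unchanged while */
                        /* penalizing every other state by that amount.                       */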

                        /* Copy final part of signals from winner state to output and long-term filter states */
                        copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gains_Q16[ 1 ], 14, pulses, pxq, NSQ );

                        subfr = 0;
                    }

                    /* Rewhiten with new A coefs */
                    start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
                    silk_assert( start_idx > 0 );

                    silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
                        A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );

                    NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
                    NSQ->rewhite_flag = 1;
                }
            }

            silk_nsq_del_dec_scale_states_neon( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
                LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );

            silk_noise_shape_quantizer_del_dec_neon( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
                delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
                Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
                psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );

            x16 += psEncC->subfr_length;
            pulses += psEncC->subfr_length;
            pxq += psEncC->subfr_length;
        }

        /* Find winner */
        RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
        Winner_ind = 0;
        for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
            if( psDelDec->RD_Q10[ k ] < RDmin_Q10 ) {
                RDmin_Q10 = psDelDec->RD_Q10[ k ];
                Winner_ind = k;
            }
        }

        /* Copy final part of signals from winner state to output and long-term filter states */
        psIndices->Seed = psDelDec->SeedInit[ Winner_ind ];
        Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
        copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gain_Q10, 8, pulses, pxq, NSQ );

        t_s32x4 = vdupq_n_s32( 0 ); /* initialization */
        for( i = 0; i < ( NSQ_LPC_BUF_LENGTH - 3 ); i += 4 ) {
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 );
            vst1q_s32( &NSQ->sLPC_Q14[ i ], t_s32x4 );
        }

        for( ; i < NSQ_LPC_BUF_LENGTH; i++ ) {
            NSQ->sLPC_Q14[ i ] = psDelDec->sLPC_Q14[ i ][ Winner_ind ];
        }

        for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) - 3 ); i += 4 ) {
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 );
            vst1q_s32( &NSQ->sAR2_Q14[ i ], t_s32x4 );
        }

        for( ; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
            NSQ->sAR2_Q14[ i ] = psDelDec->sAR2_Q14[ i ][ Winner_ind ];
        }

        /* Update states */
        NSQ->sLF_AR_shp_Q14 = psDelDec->LF_AR_Q14[ Winner_ind ];
        NSQ->sDiff_shp_Q14 = psDelDec->Diff_Q14[ Winner_ind ];
        NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];

        /* Save quantized speech signal */
        silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
        silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
        RESTORE_STACK;
    }

#ifdef OPUS_CHECK_ASM
    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
    silk_assert( !memcmp( pulses_c, pulses_a, sizeof( pulses_c ) ) );
#endif
}

/******************************************/
/* Noise shape quantizer for one subframe */
/******************************************/
/* Note: the function silk_short_prediction_create_arch_coef_neon() defined in NSQ_neon.h is actually a plain C helper. */
/* Therefore we append "_local" to this NEON function's name to avoid confusion. */
static OPUS_INLINE void silk_short_prediction_create_arch_coef_neon_local( opus_int32 *out, const opus_int16 *in, opus_int order )
{
    int16x8_t t_s16x8;
    int32x4_t t0_s32x4, t1_s32x4, t2_s32x4, t3_s32x4;
    silk_assert( order == 10 || order == 16 );

    t_s16x8 = vld1q_s16( in + 0 ); /* 7 6 5 4 3 2 1 0 */
    t_s16x8 = vrev64q_s16( t_s16x8 ); /* 4 5 6 7 0 1 2 3 */
    t2_s32x4 = vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ); /* 4 5 6 7 */
    t3_s32x4 = vshll_n_s16( vget_low_s16( t_s16x8 ), 15 ); /* 0 1 2 3 */

    if( order == 16 ) {
        t_s16x8 = vld1q_s16( in + 8 ); /* F E D C B A 9 8 */
        t_s16x8 = vrev64q_s16( t_s16x8 ); /* C D E F 8 9 A B */
        t0_s32x4 = vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ); /* C D E F */
        t1_s32x4 = vshll_n_s16( vget_low_s16( t_s16x8 ), 15 ); /* 8 9 A B */
    } else {
        int16x4_t t_s16x4;

        t0_s32x4 = vdupq_n_s32( 0 ); /* zero zero zero zero */
        t_s16x4 = vld1_s16( in + 6 ); /* 9 8 7 6 */
        t_s16x4 = vrev64_s16( t_s16x4 ); /* 6 7 8 9 */
        t1_s32x4 = vshll_n_s16( t_s16x4, 15 );
        t1_s32x4 = vcombine_s32( vget_low_s32( t0_s32x4 ), vget_low_s32( t1_s32x4 ) ); /* 8 9 zero zero */
    }
    vst1q_s32( out + 0, t0_s32x4 );
    vst1q_s32( out + 4, t1_s32x4 );
    vst1q_s32( out + 8, t2_s32x4 );
    vst1q_s32( out + 12, t3_s32x4 );
}

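/* silk_SMLAWB( a, b, c ) adds ( b * (opus_int16)c ) >> 16 to a. With the coefficient        */
/* pre-shifted left by 15 ( as done by silk_short_prediction_create_arch_coef_neon_local()   */
/* above ), vqdmulhq_lane_s32() computes ( 2 * b * ( c << 15 ) ) >> 32 = ( b * c ) >> 16,    */
/* so one saturating doubling multiply emulates silk_SMLAWB() for four states in parallel.   */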
static OPUS_INLINE int32x4_t silk_SMLAWB_lane0_neon(
    const int32x4_t out_s32x4,
    const int32x4_t in_s32x4,
    const int32x2_t coef_s32x2
)
{
    return vaddq_s32( out_s32x4, vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 0 ) );
}

static OPUS_INLINE int32x4_t silk_SMLAWB_lane1_neon(
    const int32x4_t out_s32x4,
    const int32x4_t in_s32x4,
    const int32x2_t coef_s32x2
)
{
    return vaddq_s32( out_s32x4, vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 1 ) );
}

/* Note: this function has a different return value from silk_noise_shape_quantizer_short_prediction_neon(). */
/* Therefore we append "_local" to the function name to avoid confusion. */
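/* buf32 points into the transposed sLPC_Q14 buffer, so consecutive taps lie                 */
/* NEON_MAX_DEL_DEC_STATES words apart and each vld1q_s32() fetches one tap for all four     */
/* states. a_Q12_arch holds the coefficients reversed, zero-padded to 16 taps and            */
/* pre-shifted left by 15, letting order 10 and order 16 share the fully unrolled loop.      */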
static OPUS_INLINE int32x4_t silk_noise_shape_quantizer_short_prediction_neon_local( const opus_int32 *buf32, const opus_int32 *a_Q12_arch, opus_int order )
{
    const int32x4_t a_Q12_arch0_s32x4 = vld1q_s32( a_Q12_arch + 0 );
    const int32x4_t a_Q12_arch1_s32x4 = vld1q_s32( a_Q12_arch + 4 );
    const int32x4_t a_Q12_arch2_s32x4 = vld1q_s32( a_Q12_arch + 8 );
    const int32x4_t a_Q12_arch3_s32x4 = vld1q_s32( a_Q12_arch + 12 );
    int32x4_t LPC_pred_Q14_s32x4;

    silk_assert( order == 10 || order == 16 );
    /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    LPC_pred_Q14_s32x4 = vdupq_n_s32( silk_RSHIFT( order, 1 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 0 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 1 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 2 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 3 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 4 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 5 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 6 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 7 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 8 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 9 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 10 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 11 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 12 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch3_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 13 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch3_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 14 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 15 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) );

    return LPC_pred_Q14_s32x4;
}

static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
    silk_nsq_state *NSQ, /* I/O NSQ state */
    NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */
    opus_int signalType, /* I Signal type */
    const opus_int32 x_Q10[], /* I */
    opus_int8 pulses[], /* O */
    opus_int16 xq[], /* O */
    opus_int32 sLTP_Q15[], /* I/O LTP filter state */
    opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */
    const opus_int16 a_Q12[], /* I Short term prediction coefs */
    const opus_int16 b_Q14[], /* I Long term prediction coefs */
    const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */
    opus_int lag, /* I Pitch lag */
    opus_int32 HarmShapeFIRPacked_Q14, /* I */
    opus_int Tilt_Q14, /* I Spectral tilt */
    opus_int32 LF_shp_Q14, /* I */
    opus_int32 Gain_Q16, /* I */
    opus_int Lambda_Q10, /* I */
    opus_int offset_Q10, /* I */
    opus_int length, /* I Input length */
    opus_int subfr, /* I Subframe number */
    opus_int shapingLPCOrder, /* I Shaping LPC filter order */
    opus_int predictLPCOrder, /* I Prediction filter order */
    opus_int warping_Q16, /* I */
    opus_int nStatesDelayedDecision, /* I Number of states in decision tree */
    opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */
    opus_int decisionDelay /* I */
)
{
    opus_int i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
    opus_int32 Winner_rand_state;
    opus_int32 LTP_pred_Q14, n_LTP_Q14;
    opus_int32 RDmin_Q10, RDmax_Q10;
    opus_int32 Gain_Q10;
    opus_int32 *pred_lag_ptr, *shp_lag_ptr;
    opus_int32 a_Q12_arch[ MAX_LPC_ORDER ];
    const int32x2_t warping_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( warping_Q16, 16 ) >> 1 );
    const opus_int32 LF_shp_Q29 = silk_LSHIFT32( LF_shp_Q14, 16 ) >> 1;
    opus_int32 AR_shp_Q28[ MAX_SHAPE_LPC_ORDER ];
    const uint32x4_t rand_multiplier_u32x4 = vdupq_n_u32( RAND_MULTIPLIER );
    const uint32x4_t rand_increment_u32x4 = vdupq_n_u32( RAND_INCREMENT );

    VARDECL( NSQ_samples_struct, psSampleState );
    SAVE_STACK;

    silk_assert( nStatesDelayedDecision > 0 );
    silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
    ALLOC( psSampleState, 2, NSQ_samples_struct );
    OPUS_CLEAR(psSampleState, 2);

    shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
    Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );

    for( i = 0; i < ( MAX_SHAPE_LPC_ORDER - 7 ); i += 8 ) {
        const int16x8_t t_s16x8 = vld1q_s16( AR_shp_Q13 + i );
        vst1q_s32( AR_shp_Q28 + i + 0, vshll_n_s16( vget_low_s16( t_s16x8 ), 15 ) );
        vst1q_s32( AR_shp_Q28 + i + 4, vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ) );
    }

    for( ; i < MAX_SHAPE_LPC_ORDER; i++ ) {
        AR_shp_Q28[ i ] = silk_LSHIFT32( AR_shp_Q13[ i ], 15 );
    }

    silk_short_prediction_create_arch_coef_neon_local( a_Q12_arch, a_Q12, predictLPCOrder );

    for( i = 0; i < length; i++ ) {
        int32x4_t Seed_s32x4, LPC_pred_Q14_s32x4;
        int32x4_t sign_s32x4, tmp1_s32x4, tmp2_s32x4;
        int32x4_t n_AR_Q14_s32x4, n_LF_Q14_s32x4;
        int32x2_t AR_shp_Q28_s32x2;
        int16x4_t r_Q10_s16x4, rr_Q10_s16x4;

        /* Perform common calculations used in all states */

        /* Long-term prediction */
        if( signalType == TYPE_VOICED ) {
            /* Unrolled loop */
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
            LTP_pred_Q14 = 2;
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ 0 ], b_Q14[ 0 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -1 ], b_Q14[ 1 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -2 ], b_Q14[ 2 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -3 ], b_Q14[ 3 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
            LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 ); /* Q13 -> Q14 */
            pred_lag_ptr++;
        } else {
            LTP_pred_Q14 = 0;
        }

        /* Long-term shaping */
        if( lag > 0 ) {
            /* Symmetric, packed FIR coefficients */
            n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
            n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */
            shp_lag_ptr++;
        } else {
            n_LTP_Q14 = 0;
        }

        /* Generate dither */
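        /* The vmlaq_u32() below performs one step of the scalar LCG silk_RAND( seed ) =    */
        /* RAND_INCREMENT + RAND_MULTIPLIER * seed for all four states at once; unsigned    */
        /* arithmetic reproduces the C code's wrap-around behavior.                         */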
        Seed_s32x4 = vld1q_s32( psDelDec->Seed );
        Seed_s32x4 = vreinterpretq_s32_u32( vmlaq_u32( rand_increment_u32x4, vreinterpretq_u32_s32( Seed_s32x4 ), rand_multiplier_u32x4 ) );
        vst1q_s32( psDelDec->Seed, Seed_s32x4 );

        /* Short-term prediction */
        LPC_pred_Q14_s32x4 = silk_noise_shape_quantizer_short_prediction_neon_local( psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 16 + i ], a_Q12_arch, predictLPCOrder );
        LPC_pred_Q14_s32x4 = vshlq_n_s32( LPC_pred_Q14_s32x4, 4 ); /* Q10 -> Q14 */

        /* Noise shape feedback */
        /* Output of lowpass section */
        tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->Diff_Q14 ), vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), warping_Q16_s32x2 );
        /* Output of allpass section */
        tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ 1 ] ), tmp2_s32x4 );
        tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
        vst1q_s32( psDelDec->sAR2_Q14[ 0 ], tmp2_s32x4 );
        AR_shp_Q28_s32x2 = vld1_s32( AR_shp_Q28 );
        n_AR_Q14_s32x4 = vaddq_s32( vdupq_n_s32( silk_RSHIFT( shapingLPCOrder, 1 ) ), vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );

        /* Loop over allpass sections */
        for( j = 2; j < shapingLPCOrder; j += 2 ) {
            /* Output of allpass section */
            tmp2_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4 );
            tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j - 1 ] ), tmp2_s32x4, warping_Q16_s32x2 );
            vst1q_s32( psDelDec->sAR2_Q14[ j - 1 ], tmp1_s32x4 );
            n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
            /* Output of allpass section */
            tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 1 ] ), tmp2_s32x4 );
            tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
            vst1q_s32( psDelDec->sAR2_Q14[ j + 0 ], tmp2_s32x4 );
            AR_shp_Q28_s32x2 = vld1_s32( &AR_shp_Q28[ j ] );
            n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );
        }
        vst1q_s32( psDelDec->sAR2_Q14[ shapingLPCOrder - 1 ], tmp1_s32x4 );
        n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
        n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 1 ); /* Q11 -> Q12 */
        n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( Tilt_Q14, 16 ) >> 1 ) ); /* Q12 */
        n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 2 ); /* Q12 -> Q14 */
        n_LF_Q14_s32x4 = vqdmulhq_n_s32( vld1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ] ), LF_shp_Q29 ); /* Q12 */
        n_LF_Q14_s32x4 = vaddq_s32( n_LF_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( LF_shp_Q14 >> 16, 15 ) ) ); /* Q12 */
        n_LF_Q14_s32x4 = vshlq_n_s32( n_LF_Q14_s32x4, 2 ); /* Q12 -> Q14 */

        /* Input minus prediction plus noise feedback */
        /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */
        tmp1_s32x4 = vaddq_s32( n_AR_Q14_s32x4, n_LF_Q14_s32x4 ); /* Q14 */
        tmp2_s32x4 = vaddq_s32( vdupq_n_s32( n_LTP_Q14 ), LPC_pred_Q14_s32x4 ); /* Q13 */
        tmp1_s32x4 = vsubq_s32( tmp2_s32x4, tmp1_s32x4 ); /* Q13 */
        tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 4 ); /* Q10 */
        tmp1_s32x4 = vsubq_s32( vdupq_n_s32( x_Q10[ i ] ), tmp1_s32x4 ); /* residual error Q10 */

        /* Flip sign depending on dither */
        sign_s32x4 = vreinterpretq_s32_u32( vcltq_s32( Seed_s32x4, vdupq_n_s32( 0 ) ) );
        tmp1_s32x4 = veorq_s32( tmp1_s32x4, sign_s32x4 );
        tmp1_s32x4 = vsubq_s32( tmp1_s32x4, sign_s32x4 );
        tmp1_s32x4 = vmaxq_s32( tmp1_s32x4, vdupq_n_s32( -( 31 << 10 ) ) );
        tmp1_s32x4 = vminq_s32( tmp1_s32x4, vdupq_n_s32( 30 << 10 ) );
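        /* The clamp above keeps the residual within 16 bits, so the narrowing move below   */
        /* is lossless and the quantization search can run in 16-bit NEON arithmetic.       */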
        r_Q10_s16x4 = vmovn_s32( tmp1_s32x4 );

        /* Find two quantization level candidates and measure their rate-distortion */
        {
            int16x4_t q1_Q10_s16x4 = vsub_s16( r_Q10_s16x4, vdup_n_s16( offset_Q10 ) );
            int16x4_t q1_Q0_s16x4 = vshr_n_s16( q1_Q10_s16x4, 10 );
            int16x4_t q2_Q10_s16x4;
            int32x4_t rd1_Q10_s32x4, rd2_Q10_s32x4;
            uint32x4_t t_u32x4;

            if( Lambda_Q10 > 2048 ) {
                /* For aggressive RDO, the bias becomes more than one pulse. */
                const int rdo_offset = Lambda_Q10 / 2 - 512;
                const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) );
                const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) );
                int16x4_t signed_offset = vbsl_s16( greaterThanRdo, vdup_n_s16( -rdo_offset ), vdup_n_s16( 0 ) );
                signed_offset = vbsl_s16( lessThanMinusRdo, vdup_n_s16( rdo_offset ), signed_offset );
                /* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. */
                silk_assert( Lambda_Q10 <= 32767 );

                q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) );
                q1_Q0_s16x4 = vbsl_s16( vorr_u16( greaterThanRdo, lessThanMinusRdo ), vadd_s16( q1_Q10_s16x4, signed_offset ), q1_Q0_s16x4 );
                q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 );
            }
            {
                const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) );
                const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
                const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
                int16x4_t tmp1_s16x4, tmp2_s16x4, tmp_summand_s16x4;

                q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 );
                tmp_summand_s16x4 = vand_s16( vreinterpret_s16_u16( vcge_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) ) ), vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
                tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4 );
                tmp_summand_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ), vdup_n_s16( 0 ) );
                q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4 );
                q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 );
                q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 );
                q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 );
                q2_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( 1024 ) );
                q2_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 + 1024 - QUANT_LEVEL_ADJUST_Q10 ), q2_Q10_s16x4 );
                q2_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 ), q2_Q10_s16x4 );
                tmp1_s16x4 = q1_Q10_s16x4;
                tmp2_s16x4 = q2_Q10_s16x4;
                tmp1_s16x4 = vbsl_s16( vorr_u16( equalMinus1_u16x4, lessThanMinus1_u16x4 ), vneg_s16( tmp1_s16x4 ), tmp1_s16x4 );
                tmp2_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vneg_s16( tmp2_s16x4 ), tmp2_s16x4 );
                rd1_Q10_s32x4 = vmull_s16( tmp1_s16x4, vdup_n_s16( Lambda_Q10 ) );
                rd2_Q10_s32x4 = vmull_s16( tmp2_s16x4, vdup_n_s16( Lambda_Q10 ) );
            }

            rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q1_Q10_s16x4 );
            rd1_Q10_s32x4 = vmlal_s16( rd1_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 );
            rd1_Q10_s32x4 = vshrq_n_s32( rd1_Q10_s32x4, 10 );

            rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q2_Q10_s16x4 );
            rd2_Q10_s32x4 = vmlal_s16( rd2_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 );
            rd2_Q10_s32x4 = vshrq_n_s32( rd2_Q10_s32x4, 10 );

            tmp2_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
            tmp1_s32x4 = vaddq_s32( tmp2_s32x4, vminq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) );
            tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vmaxq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) );
            vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 );
            t_u32x4 = vcltq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 );
            tmp1_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q1_Q10_s16x4 ), vmovl_s16( q2_Q10_s16x4 ) );
            tmp2_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q2_Q10_s16x4 ), vmovl_s16( q1_Q10_s16x4 ) );
            vst1q_s32( psSampleState[ 0 ].Q_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].Q_Q10, tmp2_s32x4 );
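            /* After the two vbslq_s32() selects above, psSampleState[ 0 ] holds each       */
            /* state's lower-RD candidate and psSampleState[ 1 ] the higher-RD one.         */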
        }

        {
            /* Update states for best quantization */
            int32x4_t exc_Q14_s32x4, LPC_exc_Q14_s32x4, xq_Q14_s32x4, sLF_AR_shp_Q14_s32x4;

            /* Quantized excitation */
            exc_Q14_s32x4 = vshlq_n_s32( tmp1_s32x4, 4 );
            exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 );
            exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 );

            /* Add predictions */
            LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) );
            xq_Q14_s32x4 = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 );

            /* Update states */
            tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) );
            vst1q_s32( psSampleState[ 0 ].Diff_Q14, tmp1_s32x4 );
            sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) );
            vst1q_s32( psSampleState[ 0 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].xq_Q14, xq_Q14_s32x4 );

            /* Quantized excitation */
            exc_Q14_s32x4 = vshlq_n_s32( tmp2_s32x4, 4 );
            exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 );
            exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 );

            /* Add predictions */
            LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) );
            xq_Q14_s32x4 = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 );

            /* Update states */
            tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) );
            vst1q_s32( psSampleState[ 1 ].Diff_Q14, tmp1_s32x4 );
            sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) );
            vst1q_s32( psSampleState[ 1 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].xq_Q14, xq_Q14_s32x4 );
        }

        *smpl_buf_idx = *smpl_buf_idx ? ( *smpl_buf_idx - 1 ) : ( DECISION_DELAY - 1 );
        last_smple_idx = *smpl_buf_idx + decisionDelay + DECISION_DELAY;
        if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
        if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;

        /* Find winner */
        RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ];
        Winner_ind = 0;
        for( k = 1; k < nStatesDelayedDecision; k++ ) {
            if( psSampleState[ 0 ].RD_Q10[ k ] < RDmin_Q10 ) {
                RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
                Winner_ind = k;
            }
        }

        /* clear unused part of RD_Q10 to avoid overflows */
        if( nStatesDelayedDecision < NEON_MAX_DEL_DEC_STATES )
        {
            OPUS_CLEAR(psSampleState[0].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision);
            OPUS_CLEAR(psSampleState[1].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision);
        }

        /* Increase RD values of expired states */
        {
            uint32x4_t t_u32x4;
            Winner_rand_state = psDelDec->RandState[ last_smple_idx ][ Winner_ind ];
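            /* The compare/invert below flags states whose RandState differs from the       */
            /* winner's; the unsigned shift right by 5 turns each all-ones lane into a      */
            /* large positive penalty ( 0x07FFFFFF ) added to both candidate RD values.     */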
            t_u32x4 = vceqq_s32( vld1q_s32( psDelDec->RandState[ last_smple_idx ] ), vdupq_n_s32( Winner_rand_state ) );
            t_u32x4 = vmvnq_u32( t_u32x4 );
            t_u32x4 = vshrq_n_u32( t_u32x4, 5 );
            tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].RD_Q10 );
            tmp2_s32x4 = vld1q_s32( psSampleState[ 1 ].RD_Q10 );
            tmp1_s32x4 = vaddq_s32( tmp1_s32x4, vreinterpretq_s32_u32( t_u32x4 ) );
            tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vreinterpretq_s32_u32( t_u32x4 ) );
            vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 );

            /* Find worst in first set and best in second set */
            RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ];
            RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ 0 ];
            RDmax_ind = 0;
            RDmin_ind = 0;
            for( k = 1; k < nStatesDelayedDecision; k++ ) {
                /* find worst in first set */
                if( psSampleState[ 0 ].RD_Q10[ k ] > RDmax_Q10 ) {
                    RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
                    RDmax_ind = k;
                }
                /* find best in second set */
                if( psSampleState[ 1 ].RD_Q10[ k ] < RDmin_Q10 ) {
                    RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ k ];
                    RDmin_ind = k;
                }
            }
        }

        /* Replace a state if best from second set outperforms worst in first set */
        if( RDmin_Q10 < RDmax_Q10 ) {
            opus_int32 (*ptr)[ NEON_MAX_DEL_DEC_STATES ] = psDelDec->RandState;
            const int numOthers = (int)( ( sizeof( NSQ_del_decs_struct ) - sizeof( ( (NSQ_del_decs_struct *)0 )->sLPC_Q14 ) )
                / ( NEON_MAX_DEL_DEC_STATES * sizeof( opus_int32 ) ) );
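            /* numOthers counts the opus_int32 rows that follow sLPC_Q14 in                 */
            /* NSQ_del_decs_struct ( RandState through RD_Q10, all int32 arrays with no     */
            /* padding ), so one state's column can be copied into another with one flat    */
            /* loop over those rows.                                                        */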
            /* Only ( predictLPCOrder - 1 ) entries of the sLPC_Q14 buffer need to be updated, though the   */
            /* first few (unused) sLPC_Q14[] entries will differ from the C version when predictLPCOrder    */
            /* is smaller than NSQ_LPC_BUF_LENGTH. Here we simply update a constant ( NSQ_LPC_BUF_LENGTH    */
            /* - 1 ) entries for simplicity.                                                                */
            for( j = i + 1; j < i + NSQ_LPC_BUF_LENGTH; j++ ) {
                psDelDec->sLPC_Q14[ j ][ RDmax_ind ] = psDelDec->sLPC_Q14[ j ][ RDmin_ind ];
            }
            for( j = 0; j < numOthers; j++ ) {
                ptr[ j ][ RDmax_ind ] = ptr[ j ][ RDmin_ind ];
            }

            psSampleState[ 0 ].Q_Q10[ RDmax_ind ] = psSampleState[ 1 ].Q_Q10[ RDmin_ind ];
            psSampleState[ 0 ].RD_Q10[ RDmax_ind ] = psSampleState[ 1 ].RD_Q10[ RDmin_ind ];
            psSampleState[ 0 ].xq_Q14[ RDmax_ind ] = psSampleState[ 1 ].xq_Q14[ RDmin_ind ];
            psSampleState[ 0 ].LF_AR_Q14[ RDmax_ind ] = psSampleState[ 1 ].LF_AR_Q14[ RDmin_ind ];
            psSampleState[ 0 ].Diff_Q14[ RDmax_ind ] = psSampleState[ 1 ].Diff_Q14[ RDmin_ind ];
            psSampleState[ 0 ].sLTP_shp_Q14[ RDmax_ind ] = psSampleState[ 1 ].sLTP_shp_Q14[ RDmin_ind ];
            psSampleState[ 0 ].LPC_exc_Q14[ RDmax_ind ] = psSampleState[ 1 ].LPC_exc_Q14[ RDmin_ind ];
        }

        /* Write samples from winner to output and long-term filter states */
        if( subfr > 0 || i >= decisionDelay ) {
            pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
            xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
                silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
            NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
            sLTP_Q15[ NSQ->sLTP_buf_idx - decisionDelay ] = psDelDec->Pred_Q15[ last_smple_idx ][ Winner_ind ];
        }
        NSQ->sLTP_shp_buf_idx++;
        NSQ->sLTP_buf_idx++;

        /* Update states */
        vst1q_s32( psDelDec->LF_AR_Q14, vld1q_s32( psSampleState[ 0 ].LF_AR_Q14 ) );
        vst1q_s32( psDelDec->Diff_Q14, vld1q_s32( psSampleState[ 0 ].Diff_Q14 ) );
        vst1q_s32( psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) );
        vst1q_s32( psDelDec->Xq_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) );
        tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].Q_Q10 );
        vst1q_s32( psDelDec->Q_Q10[ *smpl_buf_idx ], tmp1_s32x4 );
        vst1q_s32( psDelDec->Pred_Q15[ *smpl_buf_idx ], vshlq_n_s32( vld1q_s32( psSampleState[ 0 ].LPC_exc_Q14 ), 1 ) );
        vst1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].sLTP_shp_Q14 ) );
        tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 10 );
        tmp1_s32x4 = vreinterpretq_s32_u32( vaddq_u32( vreinterpretq_u32_s32(
            vld1q_s32( psDelDec->Seed ) ), vreinterpretq_u32_s32( tmp1_s32x4 ) ) );
        vst1q_s32( psDelDec->Seed, tmp1_s32x4 );
        vst1q_s32( psDelDec->RandState[ *smpl_buf_idx ], tmp1_s32x4 );
        vst1q_s32( psDelDec->RD_Q10, vld1q_s32( psSampleState[ 0 ].RD_Q10 ) );
        delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10;
    }
    /* Update LPC states */
    silk_memcpy( psDelDec->sLPC_Q14[ 0 ], psDelDec->sLPC_Q14[ length ], NEON_MAX_DEL_DEC_STATES * NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );

    RESTORE_STACK;
}

static OPUS_INLINE void silk_SMULWB_8_neon(
    const opus_int16 *a,
    const int32x2_t b,
    opus_int32 *o
)
{
    const int16x8_t a_s16x8 = vld1q_s16( a );
    int32x4_t o0_s32x4, o1_s32x4;

    o0_s32x4 = vshll_n_s16( vget_low_s16( a_s16x8 ), 15 );
    o1_s32x4 = vshll_n_s16( vget_high_s16( a_s16x8 ), 15 );
    o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b, 0 );
    o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b, 0 );
    vst1q_s32( o, o0_s32x4 );
    vst1q_s32( o + 4, o1_s32x4 );
}

/* Only works when ( b >= -65536 ) && ( b < 65536 ). */
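/* ( The caller pre-shifts b left by 15; the shifted value fits in 32 bits only in this range. ) */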
static OPUS_INLINE void silk_SMULWW_small_b_4_neon(
    opus_int32 *a,
    const int32x2_t b_s32x2
)
{
    int32x4_t o_s32x4;

    o_s32x4 = vld1q_s32( a );
    o_s32x4 = vqdmulhq_lane_s32( o_s32x4, b_s32x2, 0 );
    vst1q_s32( a, o_s32x4 );
}

/* Only works when ( b >= -65536 ) && ( b < 65536 ). */
static OPUS_INLINE void silk_SMULWW_small_b_8_neon(
    opus_int32 *a,
    const int32x2_t b_s32x2
)
{
    int32x4_t o0_s32x4, o1_s32x4;

    o0_s32x4 = vld1q_s32( a );
    o1_s32x4 = vld1q_s32( a + 4 );
    o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b_s32x2, 0 );
    o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b_s32x2, 0 );
    vst1q_s32( a, o0_s32x4 );
    vst1q_s32( a + 4, o1_s32x4 );
}

static OPUS_INLINE void silk_SMULWW_4_neon(
    opus_int32 *a,
    const int32x2_t b_s32x2
)
{
    int32x4_t a_s32x4, o_s32x4;

    a_s32x4 = vld1q_s32( a );
    o_s32x4 = vqdmulhq_lane_s32( a_s32x4, b_s32x2, 0 );
    o_s32x4 = vmlaq_lane_s32( o_s32x4, a_s32x4, b_s32x2, 1 );
    vst1q_s32( a, o_s32x4 );
}

static OPUS_INLINE void silk_SMULWW_8_neon(
    opus_int32 *a,
    const int32x2_t b_s32x2
)
{
    int32x4_t a0_s32x4, a1_s32x4, o0_s32x4, o1_s32x4;

    a0_s32x4 = vld1q_s32( a );
    a1_s32x4 = vld1q_s32( a + 4 );
    o0_s32x4 = vqdmulhq_lane_s32( a0_s32x4, b_s32x2, 0 );
    o1_s32x4 = vqdmulhq_lane_s32( a1_s32x4, b_s32x2, 0 );
    o0_s32x4 = vmlaq_lane_s32( o0_s32x4, a0_s32x4, b_s32x2, 1 );
    o1_s32x4 = vmlaq_lane_s32( o1_s32x4, a1_s32x4, b_s32x2, 1 );
    vst1q_s32( a, o0_s32x4 );
    vst1q_s32( a + 4, o1_s32x4 );
}

static OPUS_INLINE void silk_SMULWW_loop_neon(
    const opus_int16 *a,
    const opus_int32 b,
    opus_int32 *o,
    const opus_int loop_num
)
{
    opus_int i;
    int32x2_t b_s32x2;

    b_s32x2 = vdup_n_s32( b );
    for( i = 0; i < loop_num - 7; i += 8 ) {
        silk_SMULWB_8_neon( a + i, b_s32x2, o + i );
    }
    for( ; i < loop_num; i++ ) {
        o[ i ] = silk_SMULWW( a[ i ], b );
    }
}

static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
    const silk_encoder_state *psEncC, /* I Encoder State */
    silk_nsq_state *NSQ, /* I/O NSQ state */
    NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */
    const opus_int16 x16[], /* I Input */
    opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
    const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
    opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
    opus_int subfr, /* I Subframe number */
    const opus_int LTP_scale_Q14, /* I LTP state scaling */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
    const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
    const opus_int signal_type, /* I Signal type */
    const opus_int decisionDelay /* I Decision delay */
)
{
    opus_int i, lag;
    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;

    lag = pitchL[ subfr ];
    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
    silk_assert( inv_gain_Q31 != 0 );

    /* Scale input */
    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
    silk_SMULWW_loop_neon( x16, inv_gain_Q26, x_sc_Q10, psEncC->subfr_length );

    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    if( NSQ->rewhite_flag ) {
        if( subfr == 0 ) {
            /* Do LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
        }
        silk_SMULWW_loop_neon( sLTP + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, inv_gain_Q31, sLTP_Q15 + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, lag + LTP_ORDER / 2 );
    }

    /* Adjust for changing gain */
    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
        int32x2_t gain_adj_Q16_s32x2;
        gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
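        /* A gain_adj_Q16 magnitude below 65536 means the Q16 adjustment factor is below    */
        /* 1.0, so the cheap single-multiply "small_b" helpers suffice; otherwise the       */
        /* factor is split into low and high 16-bit halves for a full silk_SMULWW()         */
        /* emulation.                                                                       */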

        /* Scale long-term shaping state */
        if( ( gain_adj_Q16 >= -65536 ) && ( gain_adj_Q16 < 65536 ) ) {
            gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16, 15 ) );
            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) {
                silk_SMULWW_small_b_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 );
            }
            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
            }

            /* Scale long-term prediction state */
            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) {
                    silk_SMULWW_small_b_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 );
                }
                for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
                }
            }

            /* Scale scalar states */
            silk_SMULWW_small_b_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 );
            silk_SMULWW_small_b_4_neon( psDelDec->Diff_Q14, gain_adj_Q16_s32x2 );

            /* Scale short-term prediction and shaping states */
            for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < DECISION_DELAY; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->Pred_Q15[ i ], gain_adj_Q16_s32x2 );
                silk_SMULWW_small_b_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 );
            }
        } else {
            gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16 & 0x0000FFFF, 15 ) );
            gain_adj_Q16_s32x2 = vset_lane_s32( gain_adj_Q16 >> 16, gain_adj_Q16_s32x2, 1 );
            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) {
                silk_SMULWW_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 );
            }
            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
            }

            /* Scale long-term prediction state */
            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) {
                    silk_SMULWW_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 );
                }
                for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
                }
            }

            /* Scale scalar states */
            silk_SMULWW_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 );
            silk_SMULWW_4_neon( psDelDec->Diff_Q14, gain_adj_Q16_s32x2 );

            /* Scale short-term prediction and shaping states */
            for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
                silk_SMULWW_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
                silk_SMULWW_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < DECISION_DELAY; i++ ) {
                silk_SMULWW_4_neon( psDelDec->Pred_Q15[ i ], gain_adj_Q16_s32x2 );
                silk_SMULWW_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 );
            }
        }

        /* Save inverse gain */
        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
    }
}