xref: /aosp_15_r20/external/libopus/silk/x86/main_sse.h (revision a58d3d2adb790c104798cd88c8a3aff4fa8b82cc)
1 /* Copyright (c) 2014, Cisco Systems, INC
2    Written by XiangMingZhu WeiZhou MinPeng YanWang
3 
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7 
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10 
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14 
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27 
28 #ifndef MAIN_SSE_H
29 # define MAIN_SSE_H
30 
31 # ifdef HAVE_CONFIG_H
32 #  include "config.h"
33 # endif
34 
35 # if defined(OPUS_X86_MAY_HAVE_SSE4_1)
36 
/*
 * SSE4.1 variant of the entropy-constrained matrix-weighted VQ that the
 * silk_VQ_WMat_EC() macro in this header can resolve to.
 *
 * The declaration is only visible when OPUS_X86_MAY_HAVE_SSE4_1 is defined
 * (enclosing guard). It takes no `arch` argument; the dispatch macro consumes
 * that. Nothing is returned: results come back through the four pointers
 * marked "O" below.
 */
37 void silk_VQ_WMat_EC_sse4_1(
38     opus_int8                   *ind,                           /* O    index of best codebook vector               */
39     opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */
40     opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */
41     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
42     const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */
43     const opus_int32            *xX_Q17,                        /* I    correlation vector                          */
44     const opus_int8             *cb_Q7,                         /* I    codebook                                    */
45     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
46     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
47     const opus_int              subfr_len,                      /* I    number of samples per subframe              */
48     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
49     const opus_int              L                               /* I    number of vectors in codebook               */
50 );
51 
/*
 * Binding of silk_VQ_WMat_EC(..., arch) to an x86 implementation.
 *
 *  - OPUS_X86_PRESUME_SSE4_1 defined: the macro calls silk_VQ_WMat_EC_sse4_1()
 *    directly; `arch` is evaluated once and discarded via (void).
 *  - else OPUS_HAVE_RTCD defined: the macro calls through the function-pointer
 *    table SILK_VQ_WMAT_EC_IMPL, indexed by (arch & OPUS_ARCHMASK). The table
 *    has OPUS_ARCHMASK + 1 entries and is only declared (extern) here.
 *  - neither defined: this section defines nothing and
 *    OVERRIDE_silk_VQ_WMat_EC stays undefined.
 *
 * OVERRIDE_silk_VQ_WMat_EC is defined together with the macro in both active
 * branches. NOTE(review): presumably the generic SILK header tests it to skip
 * its own definition - confirm against that header.
 */
52 #  if defined OPUS_X86_PRESUME_SSE4_1
53 
54 #   define OVERRIDE_silk_VQ_WMat_EC
55 #   define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
56                            subfr_len, max_gain_Q7, L, arch) \
57     ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
58                           subfr_len, max_gain_Q7, L))
59 
60 #  elif defined(OPUS_HAVE_RTCD)
61 
62 extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
63     opus_int8                   *ind,                           /* O    index of best codebook vector               */
64     opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */
65     opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */
66     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
67     const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */
68     const opus_int32            *xX_Q17,                        /* I    correlation vector                          */
69     const opus_int8             *cb_Q7,                         /* I    codebook                                    */
70     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
71     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
72     const opus_int              subfr_len,                      /* I    number of samples per subframe              */
73     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
74     const opus_int              L                               /* I    number of vectors in codebook               */
75 );
76 
77 #   define OVERRIDE_silk_VQ_WMat_EC
78 #   define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
79                            subfr_len, max_gain_Q7, L, arch) \
80     ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
81                           subfr_len, max_gain_Q7, L))
82 
83 #  endif
84 
/*
 * SSE4.1 variant of the noise-shaping quantizer that the silk_NSQ() macro in
 * this header can resolve to.
 *
 * Visible only when OPUS_X86_MAY_HAVE_SSE4_1 is defined (enclosing guard).
 * The parameter list is the silk_NSQ() macro's argument list without the
 * trailing `arch`. Returns nothing; NSQ and psIndices are updated in place
 * (marked I/O) and the quantized pulses are written to pulses[].
 */
85 void silk_NSQ_sse4_1(
86     const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
87     silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
88     SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
89     const opus_int16            x16[],                                        /* I    Input                           */
90     opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
91     const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
92     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
93     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
94     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
95     const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
96     const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
97     const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
98     const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
99     const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
100     const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
101 );
102 
/*
 * Binding of silk_NSQ(..., arch) to an x86 implementation.
 *
 *  - OPUS_X86_PRESUME_SSE4_1 defined: direct call to silk_NSQ_sse4_1(); `arch`
 *    is evaluated once and discarded via (void).
 *  - else OPUS_HAVE_RTCD defined: call through the extern table SILK_NSQ_IMPL
 *    (OPUS_ARCHMASK + 1 entries), indexed by (arch & OPUS_ARCHMASK).
 *  - neither defined: nothing is defined here and OVERRIDE_silk_NSQ stays
 *    undefined.
 *
 * The macro parameters are named x_Q3 and AR2_Q13 where the prototypes say
 * x16 and AR_Q13. These are only macro-local placeholder names; arguments are
 * forwarded positionally, so the mismatch has no effect on the call.
 */
103 #  if defined OPUS_X86_PRESUME_SSE4_1
104 
105 #   define OVERRIDE_silk_NSQ
106 #   define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
107                     HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
108     ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
109                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
110 
111 #  elif defined(OPUS_HAVE_RTCD)
112 
113 extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
114     const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
115     silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
116     SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
117     const opus_int16            x16[],                                        /* I    Input                           */
118     opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
119     const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
120     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
121     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
122     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
123     const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
124     const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
125     const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
126     const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
127     const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
128     const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
129 );
130 
131 #   define OVERRIDE_silk_NSQ
132 #   define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
133                     HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
134     ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
135                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
136 
137 #  endif
138 
/*
 * SSE4.1 variant of the delayed-decision noise-shaping quantizer that the
 * silk_NSQ_del_dec() macro in this header can resolve to.
 *
 * Visible only when OPUS_X86_MAY_HAVE_SSE4_1 is defined (enclosing guard).
 * The parameter list is identical to silk_NSQ_sse4_1() and to the entry type
 * of SILK_NSQ_DEL_DEC_IMPL. Returns nothing; NSQ and psIndices are updated in
 * place (marked I/O) and the quantized pulses are written to pulses[].
 */
139 void silk_NSQ_del_dec_sse4_1(
140     const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
141     silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
142     SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
143     const opus_int16            x16[],                                        /* I    Input                           */
144     opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
145     const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
146     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
147     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
148     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
149     const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
150     const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
151     const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
152     const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
153     const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
154     const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
155 );
156 
/*
 * AVX2 variant of the delayed-decision noise-shaping quantizer; the
 * silk_NSQ_del_dec() macro calls it directly when OPUS_X86_PRESUME_AVX2 is
 * defined.
 *
 * This prototype is not wrapped in an AVX2-specific guard: it is declared
 * whenever the enclosing OPUS_X86_MAY_HAVE_SSE4_1 guard is active.
 *
 * NOTE(review): pitchL is declared `const opus_int32[]` here, whereas
 * silk_NSQ_del_dec_sse4_1() and the SILK_NSQ_DEL_DEC_IMPL entry type use
 * `const opus_int[]`. The two function types are compatible only if opus_int
 * and opus_int32 name the same type on this target - confirm, and check that
 * this prototype matches the definition before changing either side.
 */
157 void silk_NSQ_del_dec_avx2(
158     const silk_encoder_state *psEncC,                            /* I    Encoder State               */
159     silk_nsq_state *NSQ,                                         /* I/O  NSQ state                   */
160     SideInfoIndices *psIndices,                                  /* I/O  Quantization Indices        */
161     const opus_int16 x16[],                                      /* I    Input                       */
162     opus_int8 pulses[],                                          /* O    Quantized pulse signal      */
163     const opus_int16 *PredCoef_Q12,                              /* I    Short term prediction coefs */
164     const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],      /* I    Long term prediction coefs  */
165     const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I    Noise shaping coefs         */
166     const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],              /* I    Long term shaping coefs     */
167     const opus_int Tilt_Q14[MAX_NB_SUBFR],                       /* I    Spectral tilt               */
168     const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],                   /* I    Low frequency shaping coefs */
169     const opus_int32 Gains_Q16[MAX_NB_SUBFR],                    /* I    Quantization step sizes     */
170     const opus_int32 pitchL[MAX_NB_SUBFR],                       /* I    Pitch lags                  */
171     const opus_int Lambda_Q10,                                   /* I    Rate/distortion tradeoff    */
172     const opus_int LTP_scale_Q14                                 /* I    LTP state scaling           */
173 );
174 
/*
 * Binding of silk_NSQ_del_dec(..., arch) to an x86 implementation. The
 * branches are tested in this order:
 *
 *  1. OPUS_X86_PRESUME_AVX2 defined: direct call to silk_NSQ_del_dec_avx2().
 *  2. OPUS_X86_PRESUME_SSE4_1 defined and OPUS_X86_MAY_HAVE_AVX2 NOT defined:
 *     direct call to silk_NSQ_del_dec_sse4_1(). The extra condition keeps a
 *     build that may have AVX2 from being hard-wired to the SSE4.1 routine;
 *     such a build falls through to branch 3.
 *  3. OPUS_HAVE_RTCD defined: call through the extern table
 *     SILK_NSQ_DEL_DEC_IMPL (OPUS_ARCHMASK + 1 entries), indexed by
 *     (arch & OPUS_ARCHMASK).
 *  4. none of the above: nothing is defined here and
 *     OVERRIDE_silk_NSQ_del_dec stays undefined.
 *
 * In the two direct-call branches `arch` is evaluated once and discarded via
 * (void); in the table branch it is evaluated once as the index.
 */
175 #  if defined (OPUS_X86_PRESUME_AVX2)
176 
177 #   define OVERRIDE_silk_NSQ_del_dec
178 #   define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
179                             HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
180     ((void)(arch),silk_NSQ_del_dec_avx2(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
181                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
182 
183 #  elif defined (OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
184 
185 #   define OVERRIDE_silk_NSQ_del_dec
186 #   define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
187                             HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
188     ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
189                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
190 
191 #  elif defined(OPUS_HAVE_RTCD)
192 
193 extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
194     const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
195     silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
196     SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
197     const opus_int16            x16[],                                        /* I    Input                           */
198     opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
199     const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
200     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
201     const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
202     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
203     const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
204     const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
205     const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
206     const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
207     const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
208     const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
209 );
210 
211 #   define OVERRIDE_silk_NSQ_del_dec
212 #   define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
213                             HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
214     ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
215                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
216 
217 #  endif
218 
/*
 * Prototype for silk_noise_shape_quantizer(). Unlike the other functions
 * declared in this header it has no _sse4_1/_avx2 suffix, no dispatch macro,
 * and takes `arch` as an explicit last parameter.
 *
 * NOTE(review): presumably this is the generic C routine, declared here so
 * the x86 NSQ code can call it - confirm where it is defined.
 *
 * Several parameters below carry only an I/O direction. Their names mirror
 * array arguments of silk_NSQ_sse4_1() (pulses, LF_shp_Q14, Gains_Q16,
 * Lambda_Q10) but here most are scalars - presumably the values for one
 * subframe; verify against the definition before relying on that.
 */
219 void silk_noise_shape_quantizer(
220     silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
221     opus_int            signalType,             /* I    Signal type                     */
222     const opus_int32    x_sc_Q10[],             /* I                                    */
223     opus_int8           pulses[],               /* O                                    */
224     opus_int16          xq[],                   /* O                                    */
225     opus_int32          sLTP_Q15[],             /* I/O  LTP state                       */
226     const opus_int16    a_Q12[],                /* I    Short term prediction coefs     */
227     const opus_int16    b_Q14[],                /* I    Long term prediction coefs      */
228     const opus_int16    AR_shp_Q13[],           /* I    Noise shaping AR coefs          */
229     opus_int            lag,                    /* I    Pitch lag                       */
230     opus_int32          HarmShapeFIRPacked_Q14, /* I                                    */
231     opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
232     opus_int32          LF_shp_Q14,             /* I                                    */
233     opus_int32          Gain_Q16,               /* I                                    */
234     opus_int            Lambda_Q10,             /* I                                    */
235     opus_int            offset_Q10,             /* I                                    */
236     opus_int            length,                 /* I    Input length                    */
237     opus_int            shapingLPCOrder,        /* I    Noise shaping AR filter order   */
238     opus_int            predictLPCOrder,        /* I    Prediction filter order         */
239     int                 arch                    /* I    Architecture                    */
240 );
241 
242 /**************************/
243 /* Noise level estimation */
244 /**************************/
/*
 * Takes the VAD_N_BANDS subband energies in pX[] (input) and updates the VAD
 * state pointed to by psSilk_VAD (I/O); returns nothing.
 *
 * The name has no _sse4_1 suffix and no dispatch macro is defined for it in
 * this header. NOTE(review): presumably the generic routine, declared here so
 * the SSE4.1 VAD code can call it - confirm where it is defined.
 */
245 void silk_VAD_GetNoiseLevels(
246     const opus_int32            pX[ VAD_N_BANDS ],  /* I    subband energies                            */
247     silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
248 );
249 
/*
 * SSE4.1 variant behind the silk_VAD_GetSA_Q8() macro in this header.
 *
 * Visible only when OPUS_X86_MAY_HAVE_SSE4_1 is defined (enclosing guard).
 * Takes a non-const encoder state and a read-only array of 16-bit samples,
 * and returns an opus_int. NOTE(review): the meaning of the return value is
 * not documented here - check the generic silk_VAD_GetSA_Q8 documentation.
 */
250 opus_int silk_VAD_GetSA_Q8_sse4_1(
251     silk_encoder_state *psEnC,
252     const opus_int16   pIn[]
253 );
254 
/*
 * Binding of silk_VAD_GetSA_Q8(psEnC, pIn, arch) to an x86 implementation.
 *
 *  - OPUS_X86_PRESUME_SSE4_1 defined: direct call to
 *    silk_VAD_GetSA_Q8_sse4_1(); `arch` is evaluated once and discarded.
 *  - else OPUS_HAVE_RTCD defined: call through the extern table
 *    SILK_VAD_GETSA_Q8_IMPL (OPUS_ARCHMASK + 1 entries), indexed by
 *    (arch & OPUS_ARCHMASK).
 *  - neither defined: nothing is defined here and OVERRIDE_silk_VAD_GetSA_Q8
 *    stays undefined.
 *
 * Either way the macro is an expression whose value is the opus_int returned
 * by the selected function.
 */
255 #  if defined(OPUS_X86_PRESUME_SSE4_1)
256 
257 #   define OVERRIDE_silk_VAD_GetSA_Q8
258 #   define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn))
259 
260 #  elif defined(OPUS_HAVE_RTCD)
261 
262 extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])(
263      silk_encoder_state *psEnC,
264      const opus_int16   pIn[]);
265 
266 #   define OVERRIDE_silk_VAD_GetSA_Q8
267 #   define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
268       ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))
269 
270 #  endif
271 
/*
 * Floating-point builds only (FIXED_POINT not defined): AVX2 inner product
 * and the binding of silk_inner_product_FLP(data1, data2, dataSize, arch).
 *
 *  - OPUS_X86_PRESUME_AVX2 defined: direct call to
 *    silk_inner_product_FLP_avx2(); `arch` is evaluated once and discarded.
 *  - else OPUS_HAVE_RTCD and OPUS_X86_MAY_HAVE_AVX2 both defined: call through
 *    the extern table SILK_INNER_PRODUCT_FLP_IMPL (OPUS_ARCHMASK + 1 entries),
 *    indexed by (arch & OPUS_ARCHMASK).
 *  - otherwise: nothing is defined here and OVERRIDE_inner_product_FLP stays
 *    undefined.
 *
 * The macro argument `arch` is parenthesised wherever it is expanded, so any
 * expression can be passed: a bare `(void)arch` would bind the cast to only
 * the first operand of e.g. `a & b`. In the table branch `arch` is expanded
 * exactly once (as the index), matching the other RTCD macros in this header
 * and avoiding a second evaluation of the argument.
 *
 * Note that the override flag is spelled OVERRIDE_inner_product_FLP (no
 * "silk_" prefix), unlike the other OVERRIDE_silk_* flags in this header; the
 * spelling is kept as is because code outside this file may test it.
 *
 * This whole section sits inside the enclosing OPUS_X86_MAY_HAVE_SSE4_1 guard,
 * and the _avx2 prototype is declared in every floating-point build that
 * reaches it, whether or not an AVX2 flag is set.
 */
272 #ifndef FIXED_POINT
273 double silk_inner_product_FLP_avx2(
274     const silk_float    *data1,
275     const silk_float    *data2,
276     opus_int            dataSize
277 );
278 
279 #if defined (OPUS_X86_PRESUME_AVX2)
280 
281 #define OVERRIDE_inner_product_FLP
282 #define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)(arch),silk_inner_product_FLP_avx2(data1, data2, dataSize))
283 
284 #elif defined(OPUS_HAVE_RTCD) && defined(OPUS_X86_MAY_HAVE_AVX2)
285 
286 #define OVERRIDE_inner_product_FLP
287 extern double (*const SILK_INNER_PRODUCT_FLP_IMPL[OPUS_ARCHMASK + 1])(
288     const silk_float    *data1,
289     const silk_float    *data2,
290     opus_int            dataSize
291 );
292 
293 #define silk_inner_product_FLP(data1, data2, dataSize, arch) ((*SILK_INNER_PRODUCT_FLP_IMPL[(arch) & OPUS_ARCHMASK])(data1, data2, dataSize))
294 
295 #endif
296 #endif
297 
298 # endif
299 #endif
300