xref: /aosp_15_r20/external/libopus/celt/x86/vq_sse2.c (revision a58d3d2adb790c104798cd88c8a3aff4fa8b82cc)
1*a58d3d2aSXin Li /* Copyright (c) 2007-2008 CSIRO
2*a58d3d2aSXin Li    Copyright (c) 2007-2009 Xiph.Org Foundation
3*a58d3d2aSXin Li    Copyright (c) 2007-2016 Jean-Marc Valin */
4*a58d3d2aSXin Li /*
5*a58d3d2aSXin Li    Redistribution and use in source and binary forms, with or without
6*a58d3d2aSXin Li    modification, are permitted provided that the following conditions
7*a58d3d2aSXin Li    are met:
8*a58d3d2aSXin Li 
9*a58d3d2aSXin Li    - Redistributions of source code must retain the above copyright
10*a58d3d2aSXin Li    notice, this list of conditions and the following disclaimer.
11*a58d3d2aSXin Li 
12*a58d3d2aSXin Li    - Redistributions in binary form must reproduce the above copyright
13*a58d3d2aSXin Li    notice, this list of conditions and the following disclaimer in the
14*a58d3d2aSXin Li    documentation and/or other materials provided with the distribution.
15*a58d3d2aSXin Li 
16*a58d3d2aSXin Li    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17*a58d3d2aSXin Li    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18*a58d3d2aSXin Li    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19*a58d3d2aSXin Li    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20*a58d3d2aSXin Li    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21*a58d3d2aSXin Li    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22*a58d3d2aSXin Li    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23*a58d3d2aSXin Li    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24*a58d3d2aSXin Li    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25*a58d3d2aSXin Li    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26*a58d3d2aSXin Li    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27*a58d3d2aSXin Li */
28*a58d3d2aSXin Li 
29*a58d3d2aSXin Li #ifdef HAVE_CONFIG_H
30*a58d3d2aSXin Li #include "config.h"
31*a58d3d2aSXin Li #endif
32*a58d3d2aSXin Li 
33*a58d3d2aSXin Li #include <xmmintrin.h>
34*a58d3d2aSXin Li #include <emmintrin.h>
35*a58d3d2aSXin Li #include "celt_lpc.h"
36*a58d3d2aSXin Li #include "stack_alloc.h"
37*a58d3d2aSXin Li #include "mathops.h"
38*a58d3d2aSXin Li #include "vq.h"
39*a58d3d2aSXin Li #include "x86cpu.h"
40*a58d3d2aSXin Li 
41*a58d3d2aSXin Li 
42*a58d3d2aSXin Li #ifndef FIXED_POINT
43*a58d3d2aSXin Li 
op_pvq_search_sse2(celt_norm * _X,int * iy,int K,int N,int arch)44*a58d3d2aSXin Li opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
45*a58d3d2aSXin Li {
46*a58d3d2aSXin Li    int i, j;
47*a58d3d2aSXin Li    int pulsesLeft;
48*a58d3d2aSXin Li    float xy, yy;
49*a58d3d2aSXin Li    VARDECL(celt_norm, y);
50*a58d3d2aSXin Li    VARDECL(celt_norm, X);
51*a58d3d2aSXin Li    VARDECL(float, signy);
52*a58d3d2aSXin Li    __m128 signmask;
53*a58d3d2aSXin Li    __m128 sums;
54*a58d3d2aSXin Li    __m128i fours;
55*a58d3d2aSXin Li    SAVE_STACK;
56*a58d3d2aSXin Li 
57*a58d3d2aSXin Li    (void)arch;
58*a58d3d2aSXin Li    /* All bits set to zero, except for the sign bit. */
59*a58d3d2aSXin Li    signmask = _mm_set_ps1(-0.f);
60*a58d3d2aSXin Li    fours = _mm_set_epi32(4, 4, 4, 4);
61*a58d3d2aSXin Li    ALLOC(y, N+3, celt_norm);
62*a58d3d2aSXin Li    ALLOC(X, N+3, celt_norm);
63*a58d3d2aSXin Li    ALLOC(signy, N+3, float);
64*a58d3d2aSXin Li 
65*a58d3d2aSXin Li    OPUS_COPY(X, _X, N);
66*a58d3d2aSXin Li    X[N] = X[N+1] = X[N+2] = 0;
67*a58d3d2aSXin Li    sums = _mm_setzero_ps();
68*a58d3d2aSXin Li    for (j=0;j<N;j+=4)
69*a58d3d2aSXin Li    {
70*a58d3d2aSXin Li       __m128 x4, s4;
71*a58d3d2aSXin Li       x4 = _mm_loadu_ps(&X[j]);
72*a58d3d2aSXin Li       s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
73*a58d3d2aSXin Li       /* Get rid of the sign */
74*a58d3d2aSXin Li       x4 = _mm_andnot_ps(signmask, x4);
75*a58d3d2aSXin Li       sums = _mm_add_ps(sums, x4);
76*a58d3d2aSXin Li       /* Clear y and iy in case we don't do the projection. */
77*a58d3d2aSXin Li       _mm_storeu_ps(&y[j], _mm_setzero_ps());
78*a58d3d2aSXin Li       _mm_storeu_si128((__m128i*)(void*)&iy[j], _mm_setzero_si128());
79*a58d3d2aSXin Li       _mm_storeu_ps(&X[j], x4);
80*a58d3d2aSXin Li       _mm_storeu_ps(&signy[j], s4);
81*a58d3d2aSXin Li    }
82*a58d3d2aSXin Li    sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)));
83*a58d3d2aSXin Li    sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1)));
84*a58d3d2aSXin Li 
85*a58d3d2aSXin Li    xy = yy = 0;
86*a58d3d2aSXin Li 
87*a58d3d2aSXin Li    pulsesLeft = K;
88*a58d3d2aSXin Li 
89*a58d3d2aSXin Li    /* Do a pre-search by projecting on the pyramid */
90*a58d3d2aSXin Li    if (K > (N>>1))
91*a58d3d2aSXin Li    {
92*a58d3d2aSXin Li       __m128i pulses_sum;
93*a58d3d2aSXin Li       __m128 yy4, xy4;
94*a58d3d2aSXin Li       __m128 rcp4;
95*a58d3d2aSXin Li       opus_val32 sum = _mm_cvtss_f32(sums);
96*a58d3d2aSXin Li       /* If X is too small, just replace it with a pulse at 0 */
97*a58d3d2aSXin Li       /* Prevents infinities and NaNs from causing too many pulses
98*a58d3d2aSXin Li          to be allocated. 64 is an approximation of infinity here. */
99*a58d3d2aSXin Li       if (!(sum > EPSILON && sum < 64))
100*a58d3d2aSXin Li       {
101*a58d3d2aSXin Li          X[0] = QCONST16(1.f,14);
102*a58d3d2aSXin Li          j=1; do
103*a58d3d2aSXin Li             X[j]=0;
104*a58d3d2aSXin Li          while (++j<N);
105*a58d3d2aSXin Li          sums = _mm_set_ps1(1.f);
106*a58d3d2aSXin Li       }
107*a58d3d2aSXin Li       /* Using K+e with e < 1 guarantees we cannot get more than K pulses. */
108*a58d3d2aSXin Li       rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K+.8)), _mm_rcp_ps(sums));
109*a58d3d2aSXin Li       xy4 = yy4 = _mm_setzero_ps();
110*a58d3d2aSXin Li       pulses_sum = _mm_setzero_si128();
111*a58d3d2aSXin Li       for (j=0;j<N;j+=4)
112*a58d3d2aSXin Li       {
113*a58d3d2aSXin Li          __m128 rx4, x4, y4;
114*a58d3d2aSXin Li          __m128i iy4;
115*a58d3d2aSXin Li          x4 = _mm_loadu_ps(&X[j]);
116*a58d3d2aSXin Li          rx4 = _mm_mul_ps(x4, rcp4);
117*a58d3d2aSXin Li          iy4 = _mm_cvttps_epi32(rx4);
118*a58d3d2aSXin Li          pulses_sum = _mm_add_epi32(pulses_sum, iy4);
119*a58d3d2aSXin Li          _mm_storeu_si128((__m128i*)(void*)&iy[j], iy4);
120*a58d3d2aSXin Li          y4 = _mm_cvtepi32_ps(iy4);
121*a58d3d2aSXin Li          xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
122*a58d3d2aSXin Li          yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
123*a58d3d2aSXin Li          /* double the y[] vector so we don't have to do it in the search loop. */
124*a58d3d2aSXin Li          _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
125*a58d3d2aSXin Li       }
126*a58d3d2aSXin Li       pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(1, 0, 3, 2)));
127*a58d3d2aSXin Li       pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(2, 3, 0, 1)));
128*a58d3d2aSXin Li       pulsesLeft -= _mm_cvtsi128_si32(pulses_sum);
129*a58d3d2aSXin Li       xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(1, 0, 3, 2)));
130*a58d3d2aSXin Li       xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(2, 3, 0, 1)));
131*a58d3d2aSXin Li       xy = _mm_cvtss_f32(xy4);
132*a58d3d2aSXin Li       yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(1, 0, 3, 2)));
133*a58d3d2aSXin Li       yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(2, 3, 0, 1)));
134*a58d3d2aSXin Li       yy = _mm_cvtss_f32(yy4);
135*a58d3d2aSXin Li    }
136*a58d3d2aSXin Li    X[N] = X[N+1] = X[N+2] = -100;
137*a58d3d2aSXin Li    y[N] = y[N+1] = y[N+2] = 100;
138*a58d3d2aSXin Li    celt_sig_assert(pulsesLeft>=0);
139*a58d3d2aSXin Li 
140*a58d3d2aSXin Li    /* This should never happen, but just in case it does (e.g. on silence)
141*a58d3d2aSXin Li       we fill the first bin with pulses. */
142*a58d3d2aSXin Li    if (pulsesLeft > N+3)
143*a58d3d2aSXin Li    {
144*a58d3d2aSXin Li       opus_val16 tmp = (opus_val16)pulsesLeft;
145*a58d3d2aSXin Li       yy = MAC16_16(yy, tmp, tmp);
146*a58d3d2aSXin Li       yy = MAC16_16(yy, tmp, y[0]);
147*a58d3d2aSXin Li       iy[0] += pulsesLeft;
148*a58d3d2aSXin Li       pulsesLeft=0;
149*a58d3d2aSXin Li    }
150*a58d3d2aSXin Li 
151*a58d3d2aSXin Li    for (i=0;i<pulsesLeft;i++)
152*a58d3d2aSXin Li    {
153*a58d3d2aSXin Li       int best_id;
154*a58d3d2aSXin Li       __m128 xy4, yy4;
155*a58d3d2aSXin Li       __m128 max, max2;
156*a58d3d2aSXin Li       __m128i count;
157*a58d3d2aSXin Li       __m128i pos;
158*a58d3d2aSXin Li       /* The squared magnitude term gets added anyway, so we might as well
159*a58d3d2aSXin Li          add it outside the loop */
160*a58d3d2aSXin Li       yy = ADD16(yy, 1);
161*a58d3d2aSXin Li       xy4 = _mm_load1_ps(&xy);
162*a58d3d2aSXin Li       yy4 = _mm_load1_ps(&yy);
163*a58d3d2aSXin Li       max = _mm_setzero_ps();
164*a58d3d2aSXin Li       pos = _mm_setzero_si128();
165*a58d3d2aSXin Li       count = _mm_set_epi32(3, 2, 1, 0);
166*a58d3d2aSXin Li       for (j=0;j<N;j+=4)
167*a58d3d2aSXin Li       {
168*a58d3d2aSXin Li          __m128 x4, y4, r4;
169*a58d3d2aSXin Li          x4 = _mm_loadu_ps(&X[j]);
170*a58d3d2aSXin Li          y4 = _mm_loadu_ps(&y[j]);
171*a58d3d2aSXin Li          x4 = _mm_add_ps(x4, xy4);
172*a58d3d2aSXin Li          y4 = _mm_add_ps(y4, yy4);
173*a58d3d2aSXin Li          y4 = _mm_rsqrt_ps(y4);
174*a58d3d2aSXin Li          r4 = _mm_mul_ps(x4, y4);
175*a58d3d2aSXin Li          /* Update the index of the max. */
176*a58d3d2aSXin Li          pos = _mm_max_epi16(pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));
177*a58d3d2aSXin Li          /* Update the max. */
178*a58d3d2aSXin Li          max = _mm_max_ps(max, r4);
179*a58d3d2aSXin Li          /* Update the indices (+4) */
180*a58d3d2aSXin Li          count = _mm_add_epi32(count, fours);
181*a58d3d2aSXin Li       }
182*a58d3d2aSXin Li       /* Horizontal max */
183*a58d3d2aSXin Li       max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
184*a58d3d2aSXin Li       max2 = _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
185*a58d3d2aSXin Li       /* Now that max2 contains the max at all positions, look at which value(s) of the
186*a58d3d2aSXin Li          partial max is equal to the global max. */
187*a58d3d2aSXin Li       pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
188*a58d3d2aSXin Li       pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));
189*a58d3d2aSXin Li       pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));
190*a58d3d2aSXin Li       best_id = _mm_cvtsi128_si32(pos);
191*a58d3d2aSXin Li 
192*a58d3d2aSXin Li       /* Updating the sums of the new pulse(s) */
193*a58d3d2aSXin Li       xy = ADD32(xy, EXTEND32(X[best_id]));
194*a58d3d2aSXin Li       /* We're multiplying y[j] by two so we don't have to do it here */
195*a58d3d2aSXin Li       yy = ADD16(yy, y[best_id]);
196*a58d3d2aSXin Li 
197*a58d3d2aSXin Li       /* Only now that we've made the final choice, update y/iy */
198*a58d3d2aSXin Li       /* Multiplying y[j] by 2 so we don't have to do it everywhere else */
199*a58d3d2aSXin Li       y[best_id] += 2;
200*a58d3d2aSXin Li       iy[best_id]++;
201*a58d3d2aSXin Li    }
202*a58d3d2aSXin Li 
203*a58d3d2aSXin Li    /* Put the original sign back */
204*a58d3d2aSXin Li    for (j=0;j<N;j+=4)
205*a58d3d2aSXin Li    {
206*a58d3d2aSXin Li       __m128i y4;
207*a58d3d2aSXin Li       __m128i s4;
208*a58d3d2aSXin Li       y4 = _mm_loadu_si128((__m128i*)(void*)&iy[j]);
209*a58d3d2aSXin Li       s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
210*a58d3d2aSXin Li       y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
211*a58d3d2aSXin Li       _mm_storeu_si128((__m128i*)(void*)&iy[j], y4);
212*a58d3d2aSXin Li    }
213*a58d3d2aSXin Li    RESTORE_STACK;
214*a58d3d2aSXin Li    return yy;
215*a58d3d2aSXin Li }
216*a58d3d2aSXin Li 
217*a58d3d2aSXin Li #endif
218