/* Copyright (c) 2014, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "macros.h"
#include "celt_lpc.h"
#include "stack_alloc.h"
#include "mathops.h"
#include "pitch.h"

#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)

#include <xmmintrin.h>
#include "arch.h"

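/* Compute four cross-correlation sums at once: on return,
   sum[k] += x[0]*y[k] + x[1]*y[k+1] + ... + x[len-1]*y[k+len-1] for k = 0..3.
   The main loop consumes four samples of x per iteration and alternates
   between two accumulators (xsum1/xsum2) to shorten the dependency chain. */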
void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
   int j;
   __m128 xsum1, xsum2;
   xsum1 = _mm_loadu_ps(sum);
   xsum2 = _mm_setzero_ps();

   for (j = 0; j < len-3; j += 4)
   {
      __m128 x0 = _mm_loadu_ps(x+j);
      __m128 yj = _mm_loadu_ps(y+j);
      __m128 y3 = _mm_loadu_ps(y+j+3);

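      /* Each x sample is broadcast with _mm_shuffle_ps(x0,x0,...). The y
         vectors at offsets j+1 and j+2 are built by blending yj ([y0..y3])
         and y3 ([y3..y6]) instead of issuing two more unaligned loads:
         mask 0x49 yields [y1 y2 y3 y4] and 0x9e yields [y2 y3 y4 y5]
         (indices relative to j). */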
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
                                          _mm_shuffle_ps(yj,y3,0x49)));
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
                                          _mm_shuffle_ps(yj,y3,0x9e)));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
   }
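   /* Handle the remaining 1 to 3 samples when len is not a multiple of 4. */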
   if (j < len)
   {
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
      if (++j < len)
      {
         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         if (++j < len)
         {
            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         }
      }
   }
   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
}


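/* Compute two inner products that share the same x: *xy1 = <x, y01> and
   *xy2 = <x, y02>, over N samples. Loading x once for both products halves
   the load traffic compared to two separate inner-product calls. */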
void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
      int N, opus_val32 *xy1, opus_val32 *xy2)
{
   int i;
   __m128 xsum1, xsum2;
   xsum1 = _mm_setzero_ps();
   xsum2 = _mm_setzero_ps();
   for (i=0;i<N-3;i+=4)
   {
      __m128 xi = _mm_loadu_ps(x+i);
      __m128 y1i = _mm_loadu_ps(y01+i);
      __m128 y2i = _mm_loadu_ps(y02+i);
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
   }
   /* Horizontal sum: fold the upper half onto the lower half, then add
      lane 1 into lane 0 so the total lands in the low lane. */
   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
   _mm_store_ss(xy1, xsum1);
   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
   _mm_store_ss(xy2, xsum2);
   for (;i<N;i++)
   {
      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
   }
}

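/* Return the inner product <x, y> over N samples, four samples at a time,
   with a scalar loop for the 1 to 3 samples left over. */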
opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
      int N)
{
   int i;
   float xy;
   __m128 sum;
   sum = _mm_setzero_ps();
   /* FIXME: We should probably go 8-way and use 2 sums. */
   for (i=0;i<N-3;i+=4)
   {
      __m128 xi = _mm_loadu_ps(x+i);
      __m128 yi = _mm_loadu_ps(y+i);
      sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
   }
   /* Horizontal sum */
   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
   _mm_store_ss(&xy, sum);
   for (;i<N;i++)
   {
      xy = MAC16_16(xy, x[i], y[i]);
   }
   return xy;
}

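/* Apply the constant-gain part of the comb filter, four samples at a time:
   y[i] = x[i] + g10*x[i-T] + g11*(x[i-T+1]+x[i-T-1]) + g12*(x[i-T+2]+x[i-T-2]). */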
void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N,
      opus_val16 g10, opus_val16 g11, opus_val16 g12)
{
   int i;
   __m128 x0v;
   __m128 g10v, g11v, g12v;
   g10v = _mm_load1_ps(&g10);
   g11v = _mm_load1_ps(&g11);
   g12v = _mm_load1_ps(&g12);
   x0v = _mm_loadu_ps(&x[-T-2]);
   for (i=0;i<N-3;i+=4)
   {
      __m128 yi, yi2, x1v, x2v, x3v, x4v;
      const opus_val32 *xp = &x[i-T-2];
      yi = _mm_loadu_ps(x+i);
      x4v = _mm_loadu_ps(xp+4);
#if 0
      /* Slower version with all loads */
      x1v = _mm_loadu_ps(xp+1);
      x2v = _mm_loadu_ps(xp+2);
      x3v = _mm_loadu_ps(xp+3);
#else
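      /* Derive the three intermediate offsets from the two vectors already
         in registers: mask 0x4e gives x2v = [xp2..xp5], then mask 0x99
         gives x1v = [xp1..xp4] and x3v = [xp3..xp6]. */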
      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
#endif

      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
#else
      /* Use partial sums: accumulating the g11 and g12 terms separately
         shortens the dependency chain, but the different addition order
         means the result is not bit-exact with the scalar code above. */
      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
      yi = _mm_add_ps(yi, yi2);
#endif
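      /* Reuse this iteration's upper load as the next iteration's x0v. */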
      x0v=x4v;
      _mm_storeu_ps(y+i, yi);
   }
#ifdef CUSTOM_MODES
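   /* Scalar tail for N not a multiple of 4. It is compiled only for custom
      modes; the #ifdef implies standard builds always call this with N a
      multiple of 4. */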
   for (;i<N;i++)
   {
      y[i] = x[i]
               + MULT16_32_Q15(g10,x[i-T])
               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
   }
#endif
}


#endif