xref: /aosp_15_r20/external/libopus/celt/x86/pitch_sse4_1.c (revision a58d3d2adb790c104798cd88c8a3aff4fa8b82cc)
1*a58d3d2aSXin Li /* Copyright (c) 2014, Cisco Systems, INC
2*a58d3d2aSXin Li    Written by XiangMingZhu WeiZhou MinPeng YanWang
3*a58d3d2aSXin Li 
4*a58d3d2aSXin Li    Redistribution and use in source and binary forms, with or without
5*a58d3d2aSXin Li    modification, are permitted provided that the following conditions
6*a58d3d2aSXin Li    are met:
7*a58d3d2aSXin Li 
8*a58d3d2aSXin Li    - Redistributions of source code must retain the above copyright
9*a58d3d2aSXin Li    notice, this list of conditions and the following disclaimer.
10*a58d3d2aSXin Li 
11*a58d3d2aSXin Li    - Redistributions in binary form must reproduce the above copyright
12*a58d3d2aSXin Li    notice, this list of conditions and the following disclaimer in the
13*a58d3d2aSXin Li    documentation and/or other materials provided with the distribution.
14*a58d3d2aSXin Li 
15*a58d3d2aSXin Li    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16*a58d3d2aSXin Li    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17*a58d3d2aSXin Li    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18*a58d3d2aSXin Li    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19*a58d3d2aSXin Li    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20*a58d3d2aSXin Li    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21*a58d3d2aSXin Li    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22*a58d3d2aSXin Li    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23*a58d3d2aSXin Li    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24*a58d3d2aSXin Li    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25*a58d3d2aSXin Li    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*a58d3d2aSXin Li */
27*a58d3d2aSXin Li 
28*a58d3d2aSXin Li #ifdef HAVE_CONFIG_H
29*a58d3d2aSXin Li #include "config.h"
30*a58d3d2aSXin Li #endif
31*a58d3d2aSXin Li 
32*a58d3d2aSXin Li #include <xmmintrin.h>
33*a58d3d2aSXin Li #include <emmintrin.h>
34*a58d3d2aSXin Li 
35*a58d3d2aSXin Li #include "macros.h"
36*a58d3d2aSXin Li #include "celt_lpc.h"
37*a58d3d2aSXin Li #include "stack_alloc.h"
38*a58d3d2aSXin Li #include "mathops.h"
39*a58d3d2aSXin Li #include "pitch.h"
40*a58d3d2aSXin Li 
41*a58d3d2aSXin Li #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
42*a58d3d2aSXin Li #include <smmintrin.h>
43*a58d3d2aSXin Li #include "x86cpu.h"
44*a58d3d2aSXin Li 
celt_inner_prod_sse4_1(const opus_val16 * x,const opus_val16 * y,int N)45*a58d3d2aSXin Li opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
46*a58d3d2aSXin Li       int N)
47*a58d3d2aSXin Li {
48*a58d3d2aSXin Li     opus_int  i, dataSize16;
49*a58d3d2aSXin Li     opus_int32 sum;
50*a58d3d2aSXin Li     __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
51*a58d3d2aSXin Li     __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
52*a58d3d2aSXin Li     __m128i inVec1_3210, inVec2_3210;
53*a58d3d2aSXin Li 
54*a58d3d2aSXin Li     sum = 0;
55*a58d3d2aSXin Li     dataSize16 = N & ~15;
56*a58d3d2aSXin Li 
57*a58d3d2aSXin Li     acc1 = _mm_setzero_si128();
58*a58d3d2aSXin Li     acc2 = _mm_setzero_si128();
59*a58d3d2aSXin Li 
60*a58d3d2aSXin Li     for (i=0;i<dataSize16;i+=16) {
61*a58d3d2aSXin Li         inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
62*a58d3d2aSXin Li         inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
63*a58d3d2aSXin Li 
64*a58d3d2aSXin Li         inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
65*a58d3d2aSXin Li         inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
66*a58d3d2aSXin Li 
67*a58d3d2aSXin Li         inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
68*a58d3d2aSXin Li         inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
69*a58d3d2aSXin Li 
70*a58d3d2aSXin Li         acc1 = _mm_add_epi32(acc1, inVec1_76543210);
71*a58d3d2aSXin Li         acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
72*a58d3d2aSXin Li     }
73*a58d3d2aSXin Li 
74*a58d3d2aSXin Li     acc1 = _mm_add_epi32(acc1, acc2);
75*a58d3d2aSXin Li 
76*a58d3d2aSXin Li     if (N - i >= 8)
77*a58d3d2aSXin Li     {
78*a58d3d2aSXin Li         inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
79*a58d3d2aSXin Li         inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
80*a58d3d2aSXin Li 
81*a58d3d2aSXin Li         inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
82*a58d3d2aSXin Li 
83*a58d3d2aSXin Li         acc1 = _mm_add_epi32(acc1, inVec1_76543210);
84*a58d3d2aSXin Li         i += 8;
85*a58d3d2aSXin Li     }
86*a58d3d2aSXin Li 
87*a58d3d2aSXin Li     if (N - i >= 4)
88*a58d3d2aSXin Li     {
89*a58d3d2aSXin Li         inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
90*a58d3d2aSXin Li         inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
91*a58d3d2aSXin Li 
92*a58d3d2aSXin Li         inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
93*a58d3d2aSXin Li 
94*a58d3d2aSXin Li         acc1 = _mm_add_epi32(acc1, inVec1_3210);
95*a58d3d2aSXin Li         i += 4;
96*a58d3d2aSXin Li     }
97*a58d3d2aSXin Li 
98*a58d3d2aSXin Li     acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
99*a58d3d2aSXin Li     acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
100*a58d3d2aSXin Li 
101*a58d3d2aSXin Li     sum += _mm_cvtsi128_si32(acc1);
102*a58d3d2aSXin Li 
103*a58d3d2aSXin Li     for (;i<N;i++)
104*a58d3d2aSXin Li     {
105*a58d3d2aSXin Li         sum = silk_SMLABB(sum, x[i], y[i]);
106*a58d3d2aSXin Li     }
107*a58d3d2aSXin Li 
108*a58d3d2aSXin Li     return sum;
109*a58d3d2aSXin Li }
110*a58d3d2aSXin Li 
/* SSE4.1 fixed-point correlation kernel: accumulates four correlations at
 * consecutive lags in one pass, i.e. for k = 0..3:
 *     sum[k] += sum over i in [0, len) of x[i] * y[i + k].
 *
 * x   : 16-bit input vector, len elements.
 * y   : 16-bit input vector; elements y[0] .. y[len + 2] are read.
 * sum : four 32-bit accumulators, updated in place.
 * len : number of samples; must be >= 3 (asserted below).
 *
 * Must produce bit-exact results against xcorr_kernel_c (checked when
 * OPUS_CHECK_ASM is defined).
 */
void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
{
    int j;

    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
    __m128i vecY0, vecY1, vecY2, vecY3;
    __m128i sum0, sum1, sum2, sum3, vecSum;
    __m128i initSum;

#ifdef OPUS_CHECK_ASM
    /* Run the scalar reference on a copy so the result can be compared. */
    opus_val32 sum_c[4];
    for (j=0;j<4;j++) {
      sum_c[j] = sum[j];
    }
    xcorr_kernel_c(x, y, sum_c, len);
#endif

    celt_assert(len >= 3);

    sum0 = _mm_setzero_si128();
    sum1 = _mm_setzero_si128();
    sum2 = _mm_setzero_si128();
    sum3 = _mm_setzero_si128();

    /* Main loop: 8 samples per iteration.  One x load is shared by four
     * overlapping y loads (offsets 0..3, one per lag); _mm_madd_epi16
     * (pmaddwd) yields four 32-bit partial sums per lag per iteration.
     * Casts keep the const qualifier: _mm_loadu_si128 takes __m128i const*. */
    for (j=0;j<(len-7);j+=8)
    {
        vecX = _mm_loadu_si128((const __m128i *)(&x[j + 0]));
        vecY0 = _mm_loadu_si128((const __m128i *)(&y[j + 0]));
        vecY1 = _mm_loadu_si128((const __m128i *)(&y[j + 1]));
        vecY2 = _mm_loadu_si128((const __m128i *)(&y[j + 2]));
        vecY3 = _mm_loadu_si128((const __m128i *)(&y[j + 3]));

        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
    }

    /* Horizontally reduce each lag's partial sums into lane 0 (fold high
     * 64 bits, then move lane 1 onto lane 0 via shufflelo 0x0E). */
    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));

    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));

    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));

    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));

    /* Pack the four scalar results into one register: lane k = lag k. */
    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
          _mm_unpacklo_epi32(sum2, sum3));

    /* 4-samples-at-a-time loop: broadcast each x sample across a register
     * (OP_CVTEPI16_EPI32_M64 sign-extends four 16-bit values to 32-bit
     * lanes) and multiply with 32-bit precision (SSE4.1 pmulld). */
    for (;j<(len-3);j+=4)
    {
        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
        vecX3 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        sum1 = _mm_mullo_epi32(vecX1, vecY1);
        sum2 = _mm_mullo_epi32(vecX2, vecY2);
        sum3 = _mm_mullo_epi32(vecX3, vecY3);

        sum0 = _mm_add_epi32(sum0, sum1);
        sum2 = _mm_add_epi32(sum2, sum3);
        vecSum = _mm_add_epi32(vecSum, sum0);
        vecSum = _mm_add_epi32(vecSum, sum2);
    }

    /* Remainder (len - j is 0..3): load the last four x samples once and
     * select the needed ones with shuffles; the branches below pick the
     * high 3, 2, or 1 lanes so lane 0 (= x[len - 4]) is never consumed
     * when it falls outside the remainder.
     * NOTE(review): when len == 3 this load starts at &x[-1]; the value is
     * unused, but the read assumes one element before x is accessible --
     * confirm callers guarantee this. */
    vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]);
    if (len - j == 3)
    {
        vecX0 = _mm_shuffle_epi32(vecX, 0x55);
        vecX1 = _mm_shuffle_epi32(vecX, 0xaa);
        vecX2 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        sum1 = _mm_mullo_epi32(vecX1, vecY1);
        sum2 = _mm_mullo_epi32(vecX2, vecY2);

        vecSum = _mm_add_epi32(vecSum, sum0);
        vecSum = _mm_add_epi32(vecSum, sum1);
        vecSum = _mm_add_epi32(vecSum, sum2);
    }
    else if (len - j == 2)
    {
        vecX0 = _mm_shuffle_epi32(vecX, 0xaa);
        vecX1 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        sum1 = _mm_mullo_epi32(vecX1, vecY1);

        vecSum = _mm_add_epi32(vecSum, sum0);
        vecSum = _mm_add_epi32(vecSum, sum1);
    }
    else if (len - j == 1)
    {
        vecX0 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);

        vecSum = _mm_add_epi32(vecSum, sum0);
    }

    /* Add the new correlations onto the caller's accumulators. */
    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
    initSum = _mm_add_epi32(initSum, vecSum);
    _mm_storeu_si128((__m128i *)sum, initSum);

#ifdef OPUS_CHECK_ASM
    celt_assert(!memcmp(sum_c, sum, sizeof(sum_c)));
#endif
}
240*a58d3d2aSXin Li #endif
241