1*a58d3d2aSXin Li /* Copyright (c) 2014, Cisco Systems, INC
2*a58d3d2aSXin Li Written by XiangMingZhu WeiZhou MinPeng YanWang
3*a58d3d2aSXin Li
4*a58d3d2aSXin Li Redistribution and use in source and binary forms, with or without
5*a58d3d2aSXin Li modification, are permitted provided that the following conditions
6*a58d3d2aSXin Li are met:
7*a58d3d2aSXin Li
8*a58d3d2aSXin Li - Redistributions of source code must retain the above copyright
9*a58d3d2aSXin Li notice, this list of conditions and the following disclaimer.
10*a58d3d2aSXin Li
11*a58d3d2aSXin Li - Redistributions in binary form must reproduce the above copyright
12*a58d3d2aSXin Li notice, this list of conditions and the following disclaimer in the
13*a58d3d2aSXin Li documentation and/or other materials provided with the distribution.
14*a58d3d2aSXin Li
15*a58d3d2aSXin Li THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16*a58d3d2aSXin Li ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17*a58d3d2aSXin Li LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18*a58d3d2aSXin Li A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19*a58d3d2aSXin Li OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20*a58d3d2aSXin Li EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21*a58d3d2aSXin Li PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22*a58d3d2aSXin Li PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23*a58d3d2aSXin Li LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24*a58d3d2aSXin Li NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25*a58d3d2aSXin Li SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*a58d3d2aSXin Li */
27*a58d3d2aSXin Li
28*a58d3d2aSXin Li #ifdef HAVE_CONFIG_H
29*a58d3d2aSXin Li #include "config.h"
30*a58d3d2aSXin Li #endif
31*a58d3d2aSXin Li
32*a58d3d2aSXin Li #include <xmmintrin.h>
33*a58d3d2aSXin Li #include <emmintrin.h>
34*a58d3d2aSXin Li
35*a58d3d2aSXin Li #include "macros.h"
36*a58d3d2aSXin Li #include "celt_lpc.h"
37*a58d3d2aSXin Li #include "stack_alloc.h"
38*a58d3d2aSXin Li #include "mathops.h"
39*a58d3d2aSXin Li #include "pitch.h"
40*a58d3d2aSXin Li
41*a58d3d2aSXin Li #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
42*a58d3d2aSXin Li #include <smmintrin.h>
43*a58d3d2aSXin Li #include "x86cpu.h"
44*a58d3d2aSXin Li
/* Fixed-point inner product, SSE4.1 version: returns the 32-bit sum of
 * x[i]*y[i] for i in [0, N), matching the scalar celt_inner_prod_c()
 * contract.  Inputs are 16-bit (opus_val16 in FIXED_POINT builds).
 * NOTE(review): like the C reference, the 32-bit accumulator is assumed
 * not to overflow for the signal ranges CELT feeds it -- no saturation. */
opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
      int N)
{
    opus_int i, dataSize16;
    opus_int32 sum;
    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
    __m128i inVec1_3210, inVec2_3210;

    sum = 0;
    /* Largest multiple of 16 not exceeding N; the main loop consumes
     * 16 samples per iteration. */
    dataSize16 = N & ~15;

    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

    /* Main loop: two unaligned 8x16-bit loads per operand.
     * _mm_madd_epi16 multiplies corresponding 16-bit lanes and adds
     * adjacent products, yielding four 32-bit partial sums per register;
     * these are accumulated into acc1/acc2 (two independent accumulators
     * to break the add dependency chain). */
    for (i=0;i<dataSize16;i+=16)
    {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
    }

    acc1 = _mm_add_epi32(acc1, acc2);

    /* Tail of 8..15 remaining samples: one more 8-wide madd step. */
    if (N - i >= 8)
    {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        i += 8;
    }

    /* Tail of 4..7 remaining samples: sign-extend 4x16-bit to 32-bit
     * (OP_CVTEPI16_EPI32_M64 wraps pmovsxwd on a 64-bit load) and use a
     * full 32-bit multiply. */
    if (N - i >= 4)
    {
        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);

        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);

        acc1 = _mm_add_epi32(acc1, inVec1_3210);
        i += 4;
    }

    /* Horizontal reduction of acc1's four 32-bit lanes: fold the high
     * 64 bits onto the low, then lane 1 onto lane 0.  The shufflelo
     * immediate 0x0E moves 16-bit lanes 2,3 into positions 0,1, i.e. it
     * shifts 32-bit lane 1 down into lane 0. */
    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));

    sum += _mm_cvtsi128_si32(acc1);

    /* Final 0..3 samples in scalar code (16x16+32 multiply-accumulate). */
    for (;i<N;i++)
    {
        sum = silk_SMLABB(sum, x[i], y[i]);
    }

    return sum;
}
110*a58d3d2aSXin Li
/* Four-lag cross-correlation kernel, SSE4.1 fixed-point version:
 * accumulates sum[k] += sum over j of x[j]*y[j+k] for k = 0..3, the same
 * contract as xcorr_kernel_c().  Requires len >= 3 (asserted below).
 * When OPUS_CHECK_ASM is defined, the scalar reference is run alongside
 * and the results compared. */
void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
{
    int j;

    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
    __m128i vecY0, vecY1, vecY2, vecY3;
    __m128i sum0, sum1, sum2, sum3, vecSum;
    __m128i initSum;

#ifdef OPUS_CHECK_ASM
    opus_val32 sum_c[4];
    for (j=0;j<4;j++) {
        sum_c[j] = sum[j];
    }
    xcorr_kernel_c(x, y, sum_c, len);
#endif

    celt_assert(len >= 3);

    sum0 = _mm_setzero_si128();
    sum1 = _mm_setzero_si128();
    sum2 = _mm_setzero_si128();
    sum3 = _mm_setzero_si128();

    /* Main loop, 8 samples of x per iteration: one x vector against four
     * shifted y vectors (offsets 0..3, one per lag).  _mm_madd_epi16
     * produces four 32-bit partial sums per lag, kept in sum0..sum3. */
    for (j=0;j<(len-7);j+=8)
    {
        vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
        vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
        vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
        vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
        vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));

        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
    }

    /* Horizontally reduce each lag's accumulator to its lane 0
     * (shufflelo 0x0E moves 32-bit lane 1 into lane 0; see inner_prod). */
    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));

    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));

    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));

    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));

    /* Pack the four scalar results into one vector:
     * vecSum = { sum0, sum1, sum2, sum3 } in 32-bit lanes. */
    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
          _mm_unpacklo_epi32(sum2, sum3));

    /* 4..7 samples left: broadcast each of the next four x values across
     * a register and multiply by the y vector at the matching lag offset,
     * using widened 32-bit math. */
    for (;j<(len-3);j+=4)
    {
        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
        vecX3 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        sum1 = _mm_mullo_epi32(vecX1, vecY1);
        sum2 = _mm_mullo_epi32(vecX2, vecY2);
        sum3 = _mm_mullo_epi32(vecX3, vecY3);

        sum0 = _mm_add_epi32(sum0, sum1);
        sum2 = _mm_add_epi32(sum2, sum3);
        vecSum = _mm_add_epi32(vecSum, sum0);
        vecSum = _mm_add_epi32(vecSum, sum2);
    }

    /* Final 0..3 samples.  Load the LAST four x values (len >= 3 plus the
     * loop structure guarantees &x[len-4] is in bounds, except that for
     * len==3 it reads x[-1], which is discarded by the shuffles below):
     * the remaining x[j..len-1] occupy the HIGH lanes of vecX, so the
     * shuffle immediates pick 0x55/0xaa/0xff accordingly. */
    vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]);
    if (len - j == 3)
    {
        vecX0 = _mm_shuffle_epi32(vecX, 0x55);
        vecX1 = _mm_shuffle_epi32(vecX, 0xaa);
        vecX2 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        sum1 = _mm_mullo_epi32(vecX1, vecY1);
        sum2 = _mm_mullo_epi32(vecX2, vecY2);

        vecSum = _mm_add_epi32(vecSum, sum0);
        vecSum = _mm_add_epi32(vecSum, sum1);
        vecSum = _mm_add_epi32(vecSum, sum2);
    }
    else if (len - j == 2)
    {
        vecX0 = _mm_shuffle_epi32(vecX, 0xaa);
        vecX1 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        sum1 = _mm_mullo_epi32(vecX1, vecY1);

        vecSum = _mm_add_epi32(vecSum, sum0);
        vecSum = _mm_add_epi32(vecSum, sum1);
    }
    else if (len - j == 1)
    {
        vecX0 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);

        vecSum = _mm_add_epi32(vecSum, sum0);
    }

    /* Add the newly computed correlations onto the caller's sum[0..3]. */
    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
    initSum = _mm_add_epi32(initSum, vecSum);
    _mm_storeu_si128((__m128i *)sum, initSum);

#ifdef OPUS_CHECK_ASM
    celt_assert(!memcmp(sum_c, sum, sizeof(sum_c)));
#endif
}
240*a58d3d2aSXin Li #endif
241