xref: /aosp_15_r20/external/libopus/celt/x86/pitch_sse2.c (revision a58d3d2adb790c104798cd88c8a3aff4fa8b82cc)
1*a58d3d2aSXin Li /* Copyright (c) 2014, Cisco Systems, INC
2*a58d3d2aSXin Li    Written by XiangMingZhu WeiZhou MinPeng YanWang
3*a58d3d2aSXin Li 
4*a58d3d2aSXin Li    Redistribution and use in source and binary forms, with or without
5*a58d3d2aSXin Li    modification, are permitted provided that the following conditions
6*a58d3d2aSXin Li    are met:
7*a58d3d2aSXin Li 
8*a58d3d2aSXin Li    - Redistributions of source code must retain the above copyright
9*a58d3d2aSXin Li    notice, this list of conditions and the following disclaimer.
10*a58d3d2aSXin Li 
11*a58d3d2aSXin Li    - Redistributions in binary form must reproduce the above copyright
12*a58d3d2aSXin Li    notice, this list of conditions and the following disclaimer in the
13*a58d3d2aSXin Li    documentation and/or other materials provided with the distribution.
14*a58d3d2aSXin Li 
15*a58d3d2aSXin Li    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16*a58d3d2aSXin Li    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17*a58d3d2aSXin Li    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18*a58d3d2aSXin Li    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19*a58d3d2aSXin Li    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20*a58d3d2aSXin Li    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21*a58d3d2aSXin Li    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22*a58d3d2aSXin Li    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23*a58d3d2aSXin Li    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24*a58d3d2aSXin Li    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25*a58d3d2aSXin Li    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*a58d3d2aSXin Li */
27*a58d3d2aSXin Li 
28*a58d3d2aSXin Li #ifdef HAVE_CONFIG_H
29*a58d3d2aSXin Li #include "config.h"
30*a58d3d2aSXin Li #endif
31*a58d3d2aSXin Li 
32*a58d3d2aSXin Li #include <xmmintrin.h>
33*a58d3d2aSXin Li #include <emmintrin.h>
34*a58d3d2aSXin Li 
35*a58d3d2aSXin Li #include "macros.h"
36*a58d3d2aSXin Li #include "celt_lpc.h"
37*a58d3d2aSXin Li #include "stack_alloc.h"
38*a58d3d2aSXin Li #include "mathops.h"
39*a58d3d2aSXin Li #include "pitch.h"
40*a58d3d2aSXin Li 
41*a58d3d2aSXin Li #if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)
celt_inner_prod_sse2(const opus_val16 * x,const opus_val16 * y,int N)42*a58d3d2aSXin Li opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
43*a58d3d2aSXin Li       int N)
44*a58d3d2aSXin Li {
45*a58d3d2aSXin Li     opus_int  i, dataSize16;
46*a58d3d2aSXin Li     opus_int32 sum;
47*a58d3d2aSXin Li 
48*a58d3d2aSXin Li     __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
49*a58d3d2aSXin Li     __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
50*a58d3d2aSXin Li 
51*a58d3d2aSXin Li     sum = 0;
52*a58d3d2aSXin Li     dataSize16 = N & ~15;
53*a58d3d2aSXin Li 
54*a58d3d2aSXin Li     acc1 = _mm_setzero_si128();
55*a58d3d2aSXin Li     acc2 = _mm_setzero_si128();
56*a58d3d2aSXin Li 
57*a58d3d2aSXin Li     for (i=0;i<dataSize16;i+=16)
58*a58d3d2aSXin Li     {
59*a58d3d2aSXin Li         inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
60*a58d3d2aSXin Li         inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
61*a58d3d2aSXin Li 
62*a58d3d2aSXin Li         inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
63*a58d3d2aSXin Li         inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
64*a58d3d2aSXin Li 
65*a58d3d2aSXin Li         inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
66*a58d3d2aSXin Li         inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
67*a58d3d2aSXin Li 
68*a58d3d2aSXin Li         acc1 = _mm_add_epi32(acc1, inVec1_76543210);
69*a58d3d2aSXin Li         acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
70*a58d3d2aSXin Li     }
71*a58d3d2aSXin Li 
72*a58d3d2aSXin Li     acc1 = _mm_add_epi32( acc1, acc2 );
73*a58d3d2aSXin Li 
74*a58d3d2aSXin Li     if (N - i >= 8)
75*a58d3d2aSXin Li     {
76*a58d3d2aSXin Li         inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
77*a58d3d2aSXin Li         inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
78*a58d3d2aSXin Li 
79*a58d3d2aSXin Li         inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
80*a58d3d2aSXin Li 
81*a58d3d2aSXin Li         acc1 = _mm_add_epi32(acc1, inVec1_76543210);
82*a58d3d2aSXin Li         i += 8;
83*a58d3d2aSXin Li     }
84*a58d3d2aSXin Li 
85*a58d3d2aSXin Li     acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1));
86*a58d3d2aSXin Li     acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E));
87*a58d3d2aSXin Li     sum += _mm_cvtsi128_si32(acc1);
88*a58d3d2aSXin Li 
89*a58d3d2aSXin Li     for (;i<N;i++) {
90*a58d3d2aSXin Li         sum = silk_SMLABB(sum, x[i], y[i]);
91*a58d3d2aSXin Li     }
92*a58d3d2aSXin Li 
93*a58d3d2aSXin Li     return sum;
94*a58d3d2aSXin Li }
95*a58d3d2aSXin Li #endif
96