xref: /aosp_15_r20/external/libopus/celt/x86/pitch_avx.c (revision a58d3d2adb790c104798cd88c8a3aff4fa8b82cc)
/* Copyright (c) 2023 Amazon */
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
26*a58d3d2aSXin Li 
27*a58d3d2aSXin Li #ifdef HAVE_CONFIG_H
28*a58d3d2aSXin Li #include "config.h"
29*a58d3d2aSXin Li #endif
30*a58d3d2aSXin Li 
31*a58d3d2aSXin Li 
32*a58d3d2aSXin Li #include <immintrin.h>
33*a58d3d2aSXin Li #include "x86cpu.h"
34*a58d3d2aSXin Li #include "pitch.h"
35*a58d3d2aSXin Li 
36*a58d3d2aSXin Li #if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(FIXED_POINT)
37*a58d3d2aSXin Li 
/* Eight-lag counterpart of the "regular" xcorr_kernel(): on return, sum[k]
   holds the sum over n = 0..len-1 of x[n]*y[n+k], for k = 0..7.
   Reads x[0..len-1] and y[0..len+6]; all eight entries of sum are written. */
static void xcorr_kernel_avx(const float *x, const float *y, float sum[8], int len)
{
    /* Seven all-ones lanes followed by eight cleared lanes. An 8-lane window
       starting at index 7-r enables exactly the first r lanes (r = 1..7). */
    static const int tail_mask[15] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
    __m256 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
    __m256 lag04, lag15, lag26, lag37;
    __m256 lag0145, lag2367;
    __m256 xv;
    const float *yp;
    int pos;
    int left;

    /* One accumulator per lag; each lane carries a partial sum. */
    acc0 = _mm256_setzero_ps();
    acc1 = acc0;
    acc2 = acc0;
    acc3 = acc0;
    acc4 = acc0;
    acc5 = acc0;
    acc6 = acc0;
    acc7 = acc0;

    /* Full vectors: 8 samples of x against 8 shifted windows of y. */
    pos = 0;
    while (len - pos >= 8)
    {
        xv = _mm256_loadu_ps(x + pos);
        yp = y + pos;
        acc0 = _mm256_fmadd_ps(xv, _mm256_loadu_ps(yp    ), acc0);
        acc1 = _mm256_fmadd_ps(xv, _mm256_loadu_ps(yp + 1), acc1);
        acc2 = _mm256_fmadd_ps(xv, _mm256_loadu_ps(yp + 2), acc2);
        acc3 = _mm256_fmadd_ps(xv, _mm256_loadu_ps(yp + 3), acc3);
        acc4 = _mm256_fmadd_ps(xv, _mm256_loadu_ps(yp + 4), acc4);
        acc5 = _mm256_fmadd_ps(xv, _mm256_loadu_ps(yp + 5), acc5);
        acc6 = _mm256_fmadd_ps(xv, _mm256_loadu_ps(yp + 6), acc6);
        acc7 = _mm256_fmadd_ps(xv, _mm256_loadu_ps(yp + 7), acc7);
        pos += 8;
    }

    /* Leftover samples: masked loads zero the lanes past the end, so those
       lanes contribute nothing to the accumulators. */
    left = len - pos;
    if (left != 0)
    {
        __m256i lanes;
        lanes = _mm256_loadu_si256((const __m256i *)(const void *)(tail_mask + 7 - left));
        xv = _mm256_maskload_ps(x + pos, lanes);
        yp = y + pos;
        acc0 = _mm256_fmadd_ps(xv, _mm256_maskload_ps(yp    , lanes), acc0);
        acc1 = _mm256_fmadd_ps(xv, _mm256_maskload_ps(yp + 1, lanes), acc1);
        acc2 = _mm256_fmadd_ps(xv, _mm256_maskload_ps(yp + 2, lanes), acc2);
        acc3 = _mm256_fmadd_ps(xv, _mm256_maskload_ps(yp + 3, lanes), acc3);
        acc4 = _mm256_fmadd_ps(xv, _mm256_maskload_ps(yp + 4, lanes), acc4);
        acc5 = _mm256_fmadd_ps(xv, _mm256_maskload_ps(yp + 5, lanes), acc5);
        acc6 = _mm256_fmadd_ps(xv, _mm256_maskload_ps(yp + 6, lanes), acc6);
        acc7 = _mm256_fmadd_ps(xv, _mm256_maskload_ps(yp + 7, lanes), acc7);
    }

    /* Reduce the eight accumulators to eight scalars.
       Step 1: add each accumulator's upper 128-bit half to its lower half,
       packing lag k into the low half and lag k+4 into the high half. */
    lag04 = _mm256_add_ps(_mm256_permute2f128_ps(acc0, acc4, 2<<4),
                          _mm256_permute2f128_ps(acc0, acc4, 1 | (3<<4)));
    lag15 = _mm256_add_ps(_mm256_permute2f128_ps(acc1, acc5, 2<<4),
                          _mm256_permute2f128_ps(acc1, acc5, 1 | (3<<4)));
    lag26 = _mm256_add_ps(_mm256_permute2f128_ps(acc2, acc6, 2<<4),
                          _mm256_permute2f128_ps(acc2, acc6, 1 | (3<<4)));
    lag37 = _mm256_add_ps(_mm256_permute2f128_ps(acc3, acc7, 2<<4),
                          _mm256_permute2f128_ps(acc3, acc7, 1 | (3<<4)));

    /* Step 2: pairwise adds give lags [0 1 | 4 5] and [2 3 | 6 7],
       two partial sums per lag. */
    lag0145 = _mm256_hadd_ps(lag04, lag15);
    lag2367 = _mm256_hadd_ps(lag26, lag37);

    /* Step 3: one more pairwise add leaves lags 0..7 in lane order. */
    _mm256_storeu_ps(sum, _mm256_hadd_ps(lag0145, lag2367));
}
85*a58d3d2aSXin Li 
/* Fills xcorr[0..max_pitch-1], where entry d is computed from _x and _y+d
   over len samples. Lags are handled eight at a time by xcorr_kernel_avx();
   any lags left over (fewer than eight) go through celt_inner_prod() one by
   one. max_pitch must be positive (asserted). */
void celt_pitch_xcorr_avx2(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch)
{
   int lag;
   celt_assert(max_pitch>0);
   /* Marks arch as used; presumably celt_inner_prod() can expand to something
      that ignores its arch argument -- confirm against pitch.h. */
   (void)arch;

   /* Groups of eight consecutive lags. */
   lag = 0;
   while (max_pitch - lag >= 8)
   {
      xcorr_kernel_avx(_x, _y+lag, &xcorr[lag], len);
      lag += 8;
   }

   /* Remaining lags, one at a time. */
   while (lag < max_pitch)
   {
      xcorr[lag] = celt_inner_prod(_x, _y+lag, len, arch);
      lag++;
   }
}
100*a58d3d2aSXin Li 
101*a58d3d2aSXin Li #endif
102