/* Copyright (c) 2014-2015 Xiph.Org Foundation
   Written by Viswanath Puttagunta */
/**
   @file celt_neon_intr.c
   @brief ARM Neon Intrinsic optimizations for celt
 */

/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#include "../pitch.h"

#if defined(FIXED_POINT)
#include <string.h>

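/* For orientation, a rough scalar sketch of what this kernel computes (an
   illustration only, assuming the same contract as the generic C kernel:
   len > 0, y[] holds at least len+3 readable samples, and the caller-provided
   sum[] is accumulated into rather than overwritten):

      int j, k;
      for (j = 0; j < len; j++)
         for (k = 0; k < 4; k++)
            sum[k] += (opus_val32)x[j] * y[j + k];

   The NEON version below computes all four lags of this inner loop at once,
   eight x samples per main-loop iteration. */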
void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
{
   int j;
   int32x4_t a = vld1q_s32(sum);
   /* Load y[0...3] */
   /* This requires len>0 to always be valid (which we assert in the C code). */
   int16x4_t y0 = vld1_s16(y);
   y += 4;

   /* This loop loads one y value more than we actually need.
      Therefore we have to stop as soon as there are 8 or fewer samples left
       (instead of 7), to avoid reading past the end of the array. */
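   /* (Concretely: iteration j works on y[j] up to y[j+11]; if, as assumed
       above, y[] holds len+3 readable samples, that needs j+11 <= len+2,
       i.e. j+8 < len.) */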
   for (j = 0; j + 8 < len; j += 8)
   {
      /* Load x[0...7] */
      int16x8_t xx = vld1q_s16(x);
      int16x4_t x0 = vget_low_s16(xx);
      int16x4_t x4 = vget_high_s16(xx);
      /* Load y[4...11] */
      int16x8_t yy = vld1q_s16(y);
      int16x4_t y4 = vget_low_s16(yy);
      int16x4_t y8 = vget_high_s16(yy);
      int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
      int32x4_t a1 = vmlal_lane_s16(a0, y4, x4, 0);

      int16x4_t y1 = vext_s16(y0, y4, 1);
      int16x4_t y5 = vext_s16(y4, y8, 1);
      int32x4_t a2 = vmlal_lane_s16(a1, y1, x0, 1);
      int32x4_t a3 = vmlal_lane_s16(a2, y5, x4, 1);

      int16x4_t y2 = vext_s16(y0, y4, 2);
      int16x4_t y6 = vext_s16(y4, y8, 2);
      int32x4_t a4 = vmlal_lane_s16(a3, y2, x0, 2);
      int32x4_t a5 = vmlal_lane_s16(a4, y6, x4, 2);

      int16x4_t y3 = vext_s16(y0, y4, 3);
      int16x4_t y7 = vext_s16(y4, y8, 3);
      int32x4_t a6 = vmlal_lane_s16(a5, y3, x0, 3);
      int32x4_t a7 = vmlal_lane_s16(a6, y7, x4, 3);

      y0 = y8;
      a = a7;
      x += 8;
      y += 8;
   }
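   /* Between 1 and 8 samples remain here; the blocks below peel them off in
      chunks of 4, 2 and 1, and the final single-sample MAC after the last
      block always runs. */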
   if (j + 4 < len) {
      /* Load x[0...3] */
      int16x4_t x0 = vld1_s16(x);
      /* Load y[4...7] */
      int16x4_t y4 = vld1_s16(y);
      int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
      int16x4_t y1 = vext_s16(y0, y4, 1);
      int32x4_t a1 = vmlal_lane_s16(a0, y1, x0, 1);
      int16x4_t y2 = vext_s16(y0, y4, 2);
      int32x4_t a2 = vmlal_lane_s16(a1, y2, x0, 2);
      int16x4_t y3 = vext_s16(y0, y4, 3);
      int32x4_t a3 = vmlal_lane_s16(a2, y3, x0, 3);
      y0 = y4;
      a = a3;
      x += 4;
      y += 4;
      j += 4;
   }
   if (j + 2 < len) {
      /* Load x[0...1] */
      int16x4x2_t xx = vld2_dup_s16(x);
      int16x4_t x0 = xx.val[0];
      int16x4_t x1 = xx.val[1];
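      /* vld2_dup_s16() de-interleaves and duplicates, so
         x0 = {x[0], x[0], x[0], x[0]} and x1 = {x[1], x[1], x[1], x[1]}. */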
      /* Load y[4...5].
         We would like to use vld1_dup_s32(), but casting the pointer would
          break strict aliasing rules and potentially have alignment issues.
         Fortunately the compiler seems capable of translating this memcpy()
          and vdup_n_s32() into the equivalent vld1_dup_s32().*/
      int32_t yy;
      memcpy(&yy, y, sizeof(yy));
      int16x4_t y4 = vreinterpret_s16_s32(vdup_n_s32(yy));
      int32x4_t a0 = vmlal_s16(a, y0, x0);
      int16x4_t y1 = vext_s16(y0, y4, 1);
      /* Replace bottom copy of {y[5], y[4]} in y4 with {y[3], y[2]} from y0,
          using VSRI instead of VEXT, since it's a data-processing
          instruction. */
      y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
       vreinterpret_s64_s16(y0), 32));
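      /* y0 now holds {y[2], y[3], y[4], y[5]} (relative indices), ready for
         the next tail step. */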
      int32x4_t a1 = vmlal_s16(a0, y1, x1);
      a = a1;
      x += 2;
      y += 2;
      j += 2;
   }
   if (j + 1 < len) {
      /* Load next x. */
      int16x4_t x0 = vld1_dup_s16(x);
      int32x4_t a0 = vmlal_s16(a, y0, x0);
      /* Load last y. */
      int16x4_t y4 = vld1_dup_s16(y);
      y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
       vreinterpret_s64_s16(y0), 16));
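      /* y0 now holds {y[1], y[2], y[3], y[4]} (relative indices): the window
         has advanced by one sample. */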
      a = a0;
      x++;
   }
   /* Load last x. */
   int16x4_t x0 = vld1_dup_s16(x);
   int32x4_t a0 = vmlal_s16(a, y0, x0);
   vst1q_s32(sum, a0);
}

#else

#if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64)
/* If we can, force the compiler to use an FMA instruction rather than break
 *    vmlaq_lane_f32() into fmul/fadd. */
#ifdef vmlaq_lane_f32
#undef vmlaq_lane_f32
#endif
#define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane)
#endif


/*
 * Function: xcorr_kernel_neon_float
 * ---------------------------------
 * Computes 4 correlation values and stores them in sum[4]
 */
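/* A rough scalar equivalent (an illustration only; note that, unlike the
 * fixed-point kernel, this version starts from zero and overwrites sum[]
 * rather than accumulating into it):
 *
 *    float sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
 *    for (j = 0; j < len; j++) {
 *       sum0 += x[j] * y[j + 0];
 *       sum1 += x[j] * y[j + 1];
 *       sum2 += x[j] * y[j + 2];
 *       sum3 += x[j] * y[j + 3];
 *    }
 *    sum[0] = sum0; sum[1] = sum1; sum[2] = sum2; sum[3] = sum3;
 */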
static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
      float32_t sum[4], int len) {
   float32x4_t YY[3];
   float32x4_t YEXT[3];
   float32x4_t XX[2];
   float32x2_t XX_2;
   float32x4_t SUMM;
   const float32_t *xi = x;
   const float32_t *yi = y;

   celt_assert(len>0);

   YY[0] = vld1q_f32(yi);
   SUMM = vdupq_n_f32(0);

   /* Each iteration consumes 8 x values and works on 12 y values, loading
    * 8 new ones.  The 12th value (y[11], relative to the iteration start) is
    * loaded but never used in the MACs.  If len == 8 we are only allowed to
    * access y[0] to y[10], so y[11] must not be read; hence the condition is
    * len > 8 and not len >= 8.
    */
   while (len > 8) {
      yi += 4;
      YY[1] = vld1q_f32(yi);
      yi += 4;
      YY[2] = vld1q_f32(yi);

      XX[0] = vld1q_f32(xi);
      xi += 4;
      XX[1] = vld1q_f32(xi);
      xi += 4;

      SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
      YEXT[0] = vextq_f32(YY[0], YY[1], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
      YEXT[1] = vextq_f32(YY[0], YY[1], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
      YEXT[2] = vextq_f32(YY[0], YY[1], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);

      SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0);
      YEXT[0] = vextq_f32(YY[1], YY[2], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1);
      YEXT[1] = vextq_f32(YY[1], YY[2], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0);
      YEXT[2] = vextq_f32(YY[1], YY[2], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1);

      YY[0] = YY[2];
      len -= 8;
   }

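   /* At this point between 1 and 8 samples remain. */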
   /* Consume 4 x values and work on 8 y values, loading 4 new ones.  The
    * 8th value (y[7], relative to this block's start) is loaded but never
    * used in the MACs.  If len == 4 we are only allowed to access y[0] to
    * y[6], so y[7] must not be read; hence the condition is len > 4 and not
    * len >= 4.
    */
   if (len > 4) {
      yi += 4;
      YY[1] = vld1q_f32(yi);

      XX[0] = vld1q_f32(xi);
      xi += 4;

      SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
      YEXT[0] = vextq_f32(YY[0], YY[1], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
      YEXT[1] = vextq_f32(YY[0], YY[1], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
      YEXT[2] = vextq_f32(YY[0], YY[1], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);

      YY[0] = YY[1];
      len -= 4;
   }

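   /* Between 1 and 4 samples remain.  The loop below consumes all but the
    * last one; the final MAC after it always handles the last sample. */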
   while (--len > 0) {
      XX_2 = vld1_dup_f32(xi++);
      SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
      YY[0] = vld1q_f32(++yi);
   }

   XX_2 = vld1_dup_f32(xi);
   SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);

   vst1q_f32(sum, SUMM);
}

void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
                        opus_val32 *xcorr, int len, int max_pitch, int arch) {
   int i;
   (void)arch;
   celt_assert(max_pitch > 0);
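   /* The casts below reinterpret opus_val16 (float in this build) as
      float32_t, so _x must be 32-bit aligned; this assert checks that. */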
   celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);

   for (i = 0; i < (max_pitch-3); i += 4) {
      xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,
            (float32_t *)xcorr+i, len);
   }

   /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
   for (; i < max_pitch; i++) {
      xcorr[i] = celt_inner_prod_neon(_x, _y+i, len);
   }
}
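
/* A minimal usage sketch (illustrative only; within libopus this function is
   normally reached through the ARM run-time dispatch rather than called
   directly).  LEN and MAX_PITCH are placeholder constants, and the y buffer
   size of LEN + MAX_PITCH - 1 is an assumption based on the access pattern
   above:

      opus_val16 x[LEN], y[LEN + MAX_PITCH - 1];
      opus_val32 xcorr[MAX_PITCH];
      ...fill x and y...
      celt_pitch_xcorr_float_neon(x, y, xcorr, LEN, MAX_PITCH, 0);

   The arch argument is unused here, hence the 0. */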
#endif