/***********************************************************************
Copyright (c) 2017 Google Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#include "pitch.h"

#ifdef FIXED_POINT

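/* Fixed-point inner product of x and y over N samples; the result matches
   celt_inner_prod_c() exactly (checked under OPUS_CHECK_ASM below). */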
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    opus_val32 xy;
    int16x8_t x_s16x8, y_s16x8;
    int32x4_t xy_s32x4 = vdupq_n_s32(0);
    int64x2_t xy_s64x2;
    int64x1_t xy_s64x1;

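    /* Main loop: 8 samples per iteration. vmlal_s16 multiplies 16-bit lanes
       and accumulates the widened 32-bit products. */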
    for (i = 0; i < N - 7; i += 8) {
        x_s16x8  = vld1q_s16(&x[i]);
        y_s16x8  = vld1q_s16(&y[i]);
        xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
        xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
    }

    if (N - i >= 4) {
        const int16x4_t x_s16x4 = vld1_s16(&x[i]);
        const int16x4_t y_s16x4 = vld1_s16(&y[i]);
        xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
        i += 4;
    }

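    /* Horizontal sum: widen lane pairs to 64 bits so no carries are lost,
       add the two halves, then keep the low 32 bits of the total, which
       agrees with the C reference's 32-bit accumulation regardless of
       summation order. */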
    xy_s64x2 = vpaddlq_s32(xy_s32x4);
    xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
    xy       = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);

    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    celt_assert(celt_inner_prod_c(x, y, N) == xy);
#endif

    return xy;
}

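/* Fixed-point dual inner product: computes x.y01 and x.y02 in a single pass
   so each load of x is shared by both accumulations; the results match
   dual_inner_prod_c() exactly (checked under OPUS_CHECK_ASM below). */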
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    opus_val32 xy01, xy02;
    int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
    int32x4_t xy01_s32x4 = vdupq_n_s32(0);
    int32x4_t xy02_s32x4 = vdupq_n_s32(0);
    int64x2_t xy01_s64x2, xy02_s64x2;
    int64x1_t xy01_s64x1, xy02_s64x1;

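    /* 8 samples per iteration; each vector of x is loaded once and reused
       for both products. */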
    for (i = 0; i < N - 7; i += 8) {
        x_s16x8    = vld1q_s16(&x[i]);
        y01_s16x8  = vld1q_s16(&y01[i]);
        y02_s16x8  = vld1q_s16(&y02[i]);
        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
    }

    if (N - i >= 4) {
        const int16x4_t x_s16x4   = vld1_s16(&x[i]);
        const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
        const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
        xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
        xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
        i += 4;
    }

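    /* Reduce both accumulators as in celt_inner_prod_neon() above. */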
    xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
    xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
    xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
    xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
    xy01       = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
    xy02       = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);

    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;

#ifdef OPUS_CHECK_ASM
    {
        opus_val32 xy1_c, xy2_c;
        dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
        celt_assert(xy1_c == *xy1);
        celt_assert(xy2_c == *xy2);
    }
#endif
}

#else /* !FIXED_POINT */

/* ========================================================================== */

#ifdef __ARM_FEATURE_FMA
/* If we can, force the compiler to use an FMA instruction rather than break
   vmlaq_f32() into fmul/fadd. */
#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
#endif
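/* With this override, each vmlaq_f32() below becomes a single fused
   multiply-add, which rounds once instead of twice and so may differ from
   the fmul/fadd sequence in the last bit. */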


#ifdef OPUS_CHECK_ASM

/* This code simulates the floating-point NEON operations in plain C. */

/* celt_inner_prod_neon_float_c_simulation() mirrors the arithmetic of       */
/* celt_inner_prod_neon() and returns a per-call error bound; the two        */
/* results are expected to agree to within that bound.                       */
static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, float *err, int N)
{
   int i;
   opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
   *err = 0;
   for (i = 0; i < N - 3; i += 4) {
      xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
      xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
      xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
      xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
      *err += ABS32(xy0)+ABS32(xy1)+ABS32(xy2)+ABS32(xy3);
   }
   xy0 += xy2;
   xy1 += xy3;
   xy = xy0 + xy1;
   *err += ABS32(xy1)+ABS32(xy0)+ABS32(xy);
   for (; i < N; i++) {
      xy = MAC16_16(xy, x[i], y[i]);
      *err += ABS32(xy);
   }
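   /* Tolerance: roughly 2^-22 (a few single-precision ulps) of the
      accumulated magnitude, plus a tiny absolute term so a near-zero
      result still has a nonzero bound. */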
   *err = *err*2e-7 + N*1e-37;
   return xy;
}

/* dual_inner_prod_neon_float_c_simulation() mirrors the arithmetic of       */
/* dual_inner_prod_neon() and returns per-call error bounds; each result is  */
/* expected to agree with its NEON counterpart to within its bound.          */
static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
      int N, opus_val32 *xy1, opus_val32 *xy2, float *err)
{
   *xy1 = celt_inner_prod_neon_float_c_simulation(x, y01, &err[0], N);
   *xy2 = celt_inner_prod_neon_float_c_simulation(x, y02, &err[1], N);
}

#endif /* OPUS_CHECK_ASM */

/* ========================================================================== */

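/* Floating-point inner product of x and y over N samples. The summation
   order differs from celt_inner_prod_c(), so under OPUS_CHECK_ASM the result
   is only required to fall within the simulation's rounding-error bound. */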
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    opus_val32 xy;
    float32x4_t xy_f32x4 = vdupq_n_f32(0);
    float32x2_t xy_f32x2;

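    /* Main loop: two 4-wide multiply-accumulates per iteration (8 samples). */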
    for (i = 0; i < N - 7; i += 8) {
        float32x4_t x_f32x4, y_f32x4;
        x_f32x4  = vld1q_f32(&x[i]);
        y_f32x4  = vld1q_f32(&y[i]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
        x_f32x4  = vld1q_f32(&x[i + 4]);
        y_f32x4  = vld1q_f32(&y[i + 4]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
    }

    if (N - i >= 4) {
        const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
        const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
        i += 4;
    }

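    /* Horizontal sum: add the high and low halves, fold the remaining pair
       with vpadd_f32(), then extract lane 0. */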
    xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
    xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
    xy       = vget_lane_f32(xy_f32x2, 0);

    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    {
        float err, res;
        res = celt_inner_prod_neon_float_c_simulation(x, y, &err, N);
        /*if (ABS32(res - xy) > err) fprintf(stderr, "%g %g %g\n", res, xy, err);*/
        celt_assert(ABS32(res - xy) <= err);
    }
#endif

    return xy;
}

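/* Floating-point dual inner product: computes x.y01 and x.y02 in a single
   pass over x. */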
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    opus_val32 xy01, xy02;
    float32x4_t xy01_f32x4 = vdupq_n_f32(0);
    float32x4_t xy02_f32x4 = vdupq_n_f32(0);
    float32x2_t xy01_f32x2, xy02_f32x2;

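    /* 8 samples per iteration; each vector of x is loaded once and feeds
       both accumulators. */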
    for (i = 0; i < N - 7; i += 8) {
        float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
        x_f32x4    = vld1q_f32(&x[i]);
        y01_f32x4  = vld1q_f32(&y01[i]);
        y02_f32x4  = vld1q_f32(&y02[i]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
        x_f32x4    = vld1q_f32(&x[i + 4]);
        y01_f32x4  = vld1q_f32(&y01[i + 4]);
        y02_f32x4  = vld1q_f32(&y02[i + 4]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
    }

    if (N - i >= 4) {
        const float32x4_t x_f32x4   = vld1q_f32(&x[i]);
        const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
        const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
        i += 4;
    }

    xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
    xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
    xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
    xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
    xy01       = vget_lane_f32(xy01_f32x2, 0);
    xy02       = vget_lane_f32(xy02_f32x2, 0);

    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;

#ifdef OPUS_CHECK_ASM
    {
        opus_val32 xy1_c, xy2_c;
        float err[2];
        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c, err);
        /*if (ABS32(xy1_c - *xy1) > err[0]) fprintf(stderr, "dual1 fail: %g %g %g\n", xy1_c, *xy1, err[0]);
        if (ABS32(xy2_c - *xy2) > err[1]) fprintf(stderr, "dual2 fail: %g %g %g\n", xy2_c, *xy2, err[1]);*/
        celt_assert(ABS32(xy1_c - *xy1) <= err[0]);
        celt_assert(ABS32(xy2_c - *xy2) <= err[1]);
    }
#endif
}

#endif /* FIXED_POINT */