xref: /aosp_15_r20/external/speex/libspeexdsp/resample_neon.h (revision 28e138c64d234588b5cd2a8a403b584bd3036e4e)
1*28e138c6SAndroid Build Coastguard Worker /* Copyright (C) 2007-2008 Jean-Marc Valin
2*28e138c6SAndroid Build Coastguard Worker  * Copyright (C) 2008 Thorvald Natvig
3*28e138c6SAndroid Build Coastguard Worker  * Copyright (C) 2011 Texas Instruments
4*28e138c6SAndroid Build Coastguard Worker  *               author Jyri Sarha
5*28e138c6SAndroid Build Coastguard Worker  */
6*28e138c6SAndroid Build Coastguard Worker /**
7*28e138c6SAndroid Build Coastguard Worker    @file resample_neon.h
8*28e138c6SAndroid Build Coastguard Worker    @brief Resampler functions (NEON version)
9*28e138c6SAndroid Build Coastguard Worker */
10*28e138c6SAndroid Build Coastguard Worker /*
11*28e138c6SAndroid Build Coastguard Worker    Redistribution and use in source and binary forms, with or without
12*28e138c6SAndroid Build Coastguard Worker    modification, are permitted provided that the following conditions
13*28e138c6SAndroid Build Coastguard Worker    are met:
14*28e138c6SAndroid Build Coastguard Worker 
15*28e138c6SAndroid Build Coastguard Worker    - Redistributions of source code must retain the above copyright
16*28e138c6SAndroid Build Coastguard Worker    notice, this list of conditions and the following disclaimer.
17*28e138c6SAndroid Build Coastguard Worker 
18*28e138c6SAndroid Build Coastguard Worker    - Redistributions in binary form must reproduce the above copyright
19*28e138c6SAndroid Build Coastguard Worker    notice, this list of conditions and the following disclaimer in the
20*28e138c6SAndroid Build Coastguard Worker    documentation and/or other materials provided with the distribution.
21*28e138c6SAndroid Build Coastguard Worker 
22*28e138c6SAndroid Build Coastguard Worker    - Neither the name of the Xiph.org Foundation nor the names of its
23*28e138c6SAndroid Build Coastguard Worker    contributors may be used to endorse or promote products derived from
24*28e138c6SAndroid Build Coastguard Worker    this software without specific prior written permission.
25*28e138c6SAndroid Build Coastguard Worker 
26*28e138c6SAndroid Build Coastguard Worker    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27*28e138c6SAndroid Build Coastguard Worker    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28*28e138c6SAndroid Build Coastguard Worker    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29*28e138c6SAndroid Build Coastguard Worker    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
30*28e138c6SAndroid Build Coastguard Worker    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31*28e138c6SAndroid Build Coastguard Worker    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32*28e138c6SAndroid Build Coastguard Worker    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33*28e138c6SAndroid Build Coastguard Worker    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34*28e138c6SAndroid Build Coastguard Worker    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35*28e138c6SAndroid Build Coastguard Worker    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36*28e138c6SAndroid Build Coastguard Worker    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37*28e138c6SAndroid Build Coastguard Worker */
38*28e138c6SAndroid Build Coastguard Worker 
39*28e138c6SAndroid Build Coastguard Worker #ifdef FIXED_POINT
40*28e138c6SAndroid Build Coastguard Worker #if defined(__aarch64__)
saturate_32bit_to_16bit(int32_t a)41*28e138c6SAndroid Build Coastguard Worker static inline int32_t saturate_32bit_to_16bit(int32_t a) {
42*28e138c6SAndroid Build Coastguard Worker     int32_t ret;
43*28e138c6SAndroid Build Coastguard Worker     asm ("fmov s0, %w[a]\n"
44*28e138c6SAndroid Build Coastguard Worker          "sqxtn h0, s0\n"
45*28e138c6SAndroid Build Coastguard Worker          "sxtl v0.4s, v0.4h\n"
46*28e138c6SAndroid Build Coastguard Worker          "fmov %w[ret], s0\n"
47*28e138c6SAndroid Build Coastguard Worker          : [ret] "=r" (ret)
48*28e138c6SAndroid Build Coastguard Worker          : [a] "r" (a)
49*28e138c6SAndroid Build Coastguard Worker          : "v0" );
50*28e138c6SAndroid Build Coastguard Worker     return ret;
51*28e138c6SAndroid Build Coastguard Worker }
52*28e138c6SAndroid Build Coastguard Worker #elif defined(__thumb2__)
saturate_32bit_to_16bit(int32_t a)53*28e138c6SAndroid Build Coastguard Worker static inline int32_t saturate_32bit_to_16bit(int32_t a) {
54*28e138c6SAndroid Build Coastguard Worker     int32_t ret;
55*28e138c6SAndroid Build Coastguard Worker     asm ("ssat %[ret], #16, %[a]"
56*28e138c6SAndroid Build Coastguard Worker          : [ret] "=r" (ret)
57*28e138c6SAndroid Build Coastguard Worker          : [a] "r" (a)
58*28e138c6SAndroid Build Coastguard Worker          : );
59*28e138c6SAndroid Build Coastguard Worker     return ret;
60*28e138c6SAndroid Build Coastguard Worker }
61*28e138c6SAndroid Build Coastguard Worker #else
saturate_32bit_to_16bit(int32_t a)62*28e138c6SAndroid Build Coastguard Worker static inline int32_t saturate_32bit_to_16bit(int32_t a) {
63*28e138c6SAndroid Build Coastguard Worker     int32_t ret;
64*28e138c6SAndroid Build Coastguard Worker     asm ("vmov.s32 d0[0], %[a]\n"
65*28e138c6SAndroid Build Coastguard Worker          "vqmovn.s32 d0, q0\n"
66*28e138c6SAndroid Build Coastguard Worker          "vmov.s16 %[ret], d0[0]\n"
67*28e138c6SAndroid Build Coastguard Worker          : [ret] "=r" (ret)
68*28e138c6SAndroid Build Coastguard Worker          : [a] "r" (a)
69*28e138c6SAndroid Build Coastguard Worker          : "q0");
70*28e138c6SAndroid Build Coastguard Worker     return ret;
71*28e138c6SAndroid Build Coastguard Worker }
72*28e138c6SAndroid Build Coastguard Worker #endif
73*28e138c6SAndroid Build Coastguard Worker #undef WORD2INT
74*28e138c6SAndroid Build Coastguard Worker #define WORD2INT(x) (saturate_32bit_to_16bit(x))
75*28e138c6SAndroid Build Coastguard Worker 
76*28e138c6SAndroid Build Coastguard Worker #define OVERRIDE_INNER_PRODUCT_SINGLE
77*28e138c6SAndroid Build Coastguard Worker /* Only works when len % 4 == 0 and len >= 4 */
78*28e138c6SAndroid Build Coastguard Worker #if defined(__aarch64__)
inner_product_single(const int16_t * a,const int16_t * b,unsigned int len)79*28e138c6SAndroid Build Coastguard Worker static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
80*28e138c6SAndroid Build Coastguard Worker {
81*28e138c6SAndroid Build Coastguard Worker     int32_t ret;
82*28e138c6SAndroid Build Coastguard Worker     uint32_t remainder = len % 16;
83*28e138c6SAndroid Build Coastguard Worker     len = len - remainder;
84*28e138c6SAndroid Build Coastguard Worker 
85*28e138c6SAndroid Build Coastguard Worker     asm volatile ("	 cmp %w[len], #0\n"
86*28e138c6SAndroid Build Coastguard Worker 		  "	 b.ne 1f\n"
87*28e138c6SAndroid Build Coastguard Worker 		  "	 ld1 {v16.4h}, [%[b]], #8\n"
88*28e138c6SAndroid Build Coastguard Worker 		  "	 ld1 {v20.4h}, [%[a]], #8\n"
89*28e138c6SAndroid Build Coastguard Worker 		  "	 subs %w[remainder], %w[remainder], #4\n"
90*28e138c6SAndroid Build Coastguard Worker 		  "	 smull v0.4s, v16.4h, v20.4h\n"
91*28e138c6SAndroid Build Coastguard Worker 		  "      b.ne 4f\n"
92*28e138c6SAndroid Build Coastguard Worker 		  "	 b 5f\n"
93*28e138c6SAndroid Build Coastguard Worker 		  "1:"
94*28e138c6SAndroid Build Coastguard Worker 		  "	 ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [%[b]], #32\n"
95*28e138c6SAndroid Build Coastguard Worker 		  "	 ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [%[a]], #32\n"
96*28e138c6SAndroid Build Coastguard Worker 		  "	 subs %w[len], %w[len], #16\n"
97*28e138c6SAndroid Build Coastguard Worker 		  "	 smull v0.4s, v16.4h, v20.4h\n"
98*28e138c6SAndroid Build Coastguard Worker 		  "	 smlal v0.4s, v17.4h, v21.4h\n"
99*28e138c6SAndroid Build Coastguard Worker 		  "	 smlal v0.4s, v18.4h, v22.4h\n"
100*28e138c6SAndroid Build Coastguard Worker 		  "	 smlal v0.4s, v19.4h, v23.4h\n"
101*28e138c6SAndroid Build Coastguard Worker 		  "	 b.eq 3f\n"
102*28e138c6SAndroid Build Coastguard Worker 		  "2:"
103*28e138c6SAndroid Build Coastguard Worker 		  "	 ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [%[b]], #32\n"
104*28e138c6SAndroid Build Coastguard Worker 		  "	 ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [%[a]], #32\n"
105*28e138c6SAndroid Build Coastguard Worker 		  "	 subs %w[len], %w[len], #16\n"
106*28e138c6SAndroid Build Coastguard Worker 		  "	 smlal v0.4s, v16.4h, v20.4h\n"
107*28e138c6SAndroid Build Coastguard Worker 		  "	 smlal v0.4s, v17.4h, v21.4h\n"
108*28e138c6SAndroid Build Coastguard Worker 		  "	 smlal v0.4s, v18.4h, v22.4h\n"
109*28e138c6SAndroid Build Coastguard Worker 		  "	 smlal v0.4s, v19.4h, v23.4h\n"
110*28e138c6SAndroid Build Coastguard Worker 		  "	 b.ne 2b\n"
111*28e138c6SAndroid Build Coastguard Worker 		  "3:"
112*28e138c6SAndroid Build Coastguard Worker 		  "	 cmp %w[remainder], #0\n"
113*28e138c6SAndroid Build Coastguard Worker 		  "	 b.eq 5f\n"
114*28e138c6SAndroid Build Coastguard Worker 		  "4:"
115*28e138c6SAndroid Build Coastguard Worker 		  "	 ld1 {v18.4h}, [%[b]], #8\n"
116*28e138c6SAndroid Build Coastguard Worker 		  "	 ld1 {v22.4h}, [%[a]], #8\n"
117*28e138c6SAndroid Build Coastguard Worker 		  "	 subs %w[remainder], %w[remainder], #4\n"
118*28e138c6SAndroid Build Coastguard Worker 		  "	 smlal v0.4s, v18.4h, v22.4h\n"
119*28e138c6SAndroid Build Coastguard Worker 		  "	 b.ne 4b\n"
120*28e138c6SAndroid Build Coastguard Worker 		  "5:"
121*28e138c6SAndroid Build Coastguard Worker 		  "	 saddlv d0, v0.4s\n"
122*28e138c6SAndroid Build Coastguard Worker 		  "	 sqxtn s0, d0\n"
123*28e138c6SAndroid Build Coastguard Worker 		  "	 sqrshrn h0, s0, #15\n"
124*28e138c6SAndroid Build Coastguard Worker 		  "	 sxtl v0.4s, v0.4h\n"
125*28e138c6SAndroid Build Coastguard Worker 		  "	 fmov %w[ret], s0\n"
126*28e138c6SAndroid Build Coastguard Worker 		  : [ret] "=r" (ret), [a] "+r" (a), [b] "+r" (b),
127*28e138c6SAndroid Build Coastguard Worker 		    [len] "+r" (len), [remainder] "+r" (remainder)
128*28e138c6SAndroid Build Coastguard Worker 		  :
129*28e138c6SAndroid Build Coastguard Worker 		  : "cc", "v0",
130*28e138c6SAndroid Build Coastguard Worker 		    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
131*28e138c6SAndroid Build Coastguard Worker     return ret;
132*28e138c6SAndroid Build Coastguard Worker }
133*28e138c6SAndroid Build Coastguard Worker #else
inner_product_single(const int16_t * a,const int16_t * b,unsigned int len)134*28e138c6SAndroid Build Coastguard Worker static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
135*28e138c6SAndroid Build Coastguard Worker {
136*28e138c6SAndroid Build Coastguard Worker     int32_t ret;
137*28e138c6SAndroid Build Coastguard Worker     uint32_t remainder = len % 16;
138*28e138c6SAndroid Build Coastguard Worker     len = len - remainder;
139*28e138c6SAndroid Build Coastguard Worker 
140*28e138c6SAndroid Build Coastguard Worker     asm volatile ("	 cmp %[len], #0\n"
141*28e138c6SAndroid Build Coastguard Worker 		  "	 bne 1f\n"
142*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.16 {d16}, [%[b]]!\n"
143*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.16 {d20}, [%[a]]!\n"
144*28e138c6SAndroid Build Coastguard Worker 		  "	 subs %[remainder], %[remainder], #4\n"
145*28e138c6SAndroid Build Coastguard Worker 		  "	 vmull.s16 q0, d16, d20\n"
146*28e138c6SAndroid Build Coastguard Worker 		  "      beq 5f\n"
147*28e138c6SAndroid Build Coastguard Worker 		  "	 b 4f\n"
148*28e138c6SAndroid Build Coastguard Worker 		  "1:"
149*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.16 {d16, d17, d18, d19}, [%[b]]!\n"
150*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.16 {d20, d21, d22, d23}, [%[a]]!\n"
151*28e138c6SAndroid Build Coastguard Worker 		  "	 subs %[len], %[len], #16\n"
152*28e138c6SAndroid Build Coastguard Worker 		  "	 vmull.s16 q0, d16, d20\n"
153*28e138c6SAndroid Build Coastguard Worker 		  "	 vmlal.s16 q0, d17, d21\n"
154*28e138c6SAndroid Build Coastguard Worker 		  "	 vmlal.s16 q0, d18, d22\n"
155*28e138c6SAndroid Build Coastguard Worker 		  "	 vmlal.s16 q0, d19, d23\n"
156*28e138c6SAndroid Build Coastguard Worker 		  "	 beq 3f\n"
157*28e138c6SAndroid Build Coastguard Worker 		  "2:"
158*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.16 {d16, d17, d18, d19}, [%[b]]!\n"
159*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.16 {d20, d21, d22, d23}, [%[a]]!\n"
160*28e138c6SAndroid Build Coastguard Worker 		  "	 subs %[len], %[len], #16\n"
161*28e138c6SAndroid Build Coastguard Worker 		  "	 vmlal.s16 q0, d16, d20\n"
162*28e138c6SAndroid Build Coastguard Worker 		  "	 vmlal.s16 q0, d17, d21\n"
163*28e138c6SAndroid Build Coastguard Worker 		  "	 vmlal.s16 q0, d18, d22\n"
164*28e138c6SAndroid Build Coastguard Worker 		  "	 vmlal.s16 q0, d19, d23\n"
165*28e138c6SAndroid Build Coastguard Worker 		  "	 bne 2b\n"
166*28e138c6SAndroid Build Coastguard Worker 		  "3:"
167*28e138c6SAndroid Build Coastguard Worker 		  "	 cmp %[remainder], #0\n"
168*28e138c6SAndroid Build Coastguard Worker 		  "	 beq 5f\n"
169*28e138c6SAndroid Build Coastguard Worker 		  "4:"
170*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.16 {d16}, [%[b]]!\n"
171*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.16 {d20}, [%[a]]!\n"
172*28e138c6SAndroid Build Coastguard Worker 		  "	 subs %[remainder], %[remainder], #4\n"
173*28e138c6SAndroid Build Coastguard Worker 		  "	 vmlal.s16 q0, d16, d20\n"
174*28e138c6SAndroid Build Coastguard Worker 		  "	 bne 4b\n"
175*28e138c6SAndroid Build Coastguard Worker 		  "5:"
176*28e138c6SAndroid Build Coastguard Worker 		  "	 vaddl.s32 q0, d0, d1\n"
177*28e138c6SAndroid Build Coastguard Worker 		  "	 vadd.s64 d0, d0, d1\n"
178*28e138c6SAndroid Build Coastguard Worker 		  "	 vqmovn.s64 d0, q0\n"
179*28e138c6SAndroid Build Coastguard Worker 		  "	 vqrshrn.s32 d0, q0, #15\n"
180*28e138c6SAndroid Build Coastguard Worker 		  "	 vmov.s16 %[ret], d0[0]\n"
181*28e138c6SAndroid Build Coastguard Worker 		  : [ret] "=r" (ret), [a] "+r" (a), [b] "+r" (b),
182*28e138c6SAndroid Build Coastguard Worker 		    [len] "+r" (len), [remainder] "+r" (remainder)
183*28e138c6SAndroid Build Coastguard Worker 		  :
184*28e138c6SAndroid Build Coastguard Worker 		  : "cc", "q0",
185*28e138c6SAndroid Build Coastguard Worker 		    "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23");
186*28e138c6SAndroid Build Coastguard Worker 
187*28e138c6SAndroid Build Coastguard Worker     return ret;
188*28e138c6SAndroid Build Coastguard Worker }
189*28e138c6SAndroid Build Coastguard Worker #endif  // !defined(__aarch64__)
190*28e138c6SAndroid Build Coastguard Worker 
191*28e138c6SAndroid Build Coastguard Worker #elif defined(FLOATING_POINT)
192*28e138c6SAndroid Build Coastguard Worker #if defined(__aarch64__)
saturate_float_to_16bit(float a)193*28e138c6SAndroid Build Coastguard Worker static inline int32_t saturate_float_to_16bit(float a) {
194*28e138c6SAndroid Build Coastguard Worker     int32_t ret;
195*28e138c6SAndroid Build Coastguard Worker     asm ("fcvtas s1, %s[a]\n"
196*28e138c6SAndroid Build Coastguard Worker          "sqxtn h1, s1\n"
197*28e138c6SAndroid Build Coastguard Worker          "sxtl v1.4s, v1.4h\n"
198*28e138c6SAndroid Build Coastguard Worker          "fmov %w[ret], s1\n"
199*28e138c6SAndroid Build Coastguard Worker          : [ret] "=r" (ret)
200*28e138c6SAndroid Build Coastguard Worker          : [a] "w" (a)
201*28e138c6SAndroid Build Coastguard Worker          : "v1");
202*28e138c6SAndroid Build Coastguard Worker     return ret;
203*28e138c6SAndroid Build Coastguard Worker }
204*28e138c6SAndroid Build Coastguard Worker #else
saturate_float_to_16bit(float a)205*28e138c6SAndroid Build Coastguard Worker static inline int32_t saturate_float_to_16bit(float a) {
206*28e138c6SAndroid Build Coastguard Worker     int32_t ret;
207*28e138c6SAndroid Build Coastguard Worker     asm ("vmov.f32 d0[0], %[a]\n"
208*28e138c6SAndroid Build Coastguard Worker          "vcvt.s32.f32 d0, d0, #15\n"
209*28e138c6SAndroid Build Coastguard Worker          "vqrshrn.s32 d0, q0, #15\n"
210*28e138c6SAndroid Build Coastguard Worker          "vmov.s16 %[ret], d0[0]\n"
211*28e138c6SAndroid Build Coastguard Worker          : [ret] "=r" (ret)
212*28e138c6SAndroid Build Coastguard Worker          : [a] "r" (a)
213*28e138c6SAndroid Build Coastguard Worker          : "q0");
214*28e138c6SAndroid Build Coastguard Worker     return ret;
215*28e138c6SAndroid Build Coastguard Worker }
216*28e138c6SAndroid Build Coastguard Worker #endif
217*28e138c6SAndroid Build Coastguard Worker 
218*28e138c6SAndroid Build Coastguard Worker #undef WORD2INT
219*28e138c6SAndroid Build Coastguard Worker #define WORD2INT(x) (saturate_float_to_16bit(x))
220*28e138c6SAndroid Build Coastguard Worker 
221*28e138c6SAndroid Build Coastguard Worker #define OVERRIDE_INNER_PRODUCT_SINGLE
222*28e138c6SAndroid Build Coastguard Worker /* Only works when len % 4 == 0 and len >= 4 */
223*28e138c6SAndroid Build Coastguard Worker #if defined(__aarch64__)
inner_product_single(const float * a,const float * b,unsigned int len)224*28e138c6SAndroid Build Coastguard Worker static inline float inner_product_single(const float *a, const float *b, unsigned int len)
225*28e138c6SAndroid Build Coastguard Worker {
226*28e138c6SAndroid Build Coastguard Worker     float ret;
227*28e138c6SAndroid Build Coastguard Worker     uint32_t remainder = len % 16;
228*28e138c6SAndroid Build Coastguard Worker     len = len - remainder;
229*28e138c6SAndroid Build Coastguard Worker 
230*28e138c6SAndroid Build Coastguard Worker     asm volatile ("	 cmp %w[len], #0\n"
231*28e138c6SAndroid Build Coastguard Worker 		  "	 b.ne 1f\n"
232*28e138c6SAndroid Build Coastguard Worker 		  "	 ld1 {v16.4s}, [%[b]], #16\n"
233*28e138c6SAndroid Build Coastguard Worker 		  "	 ld1 {v20.4s}, [%[a]], #16\n"
234*28e138c6SAndroid Build Coastguard Worker 		  "	 subs %w[remainder], %w[remainder], #4\n"
235*28e138c6SAndroid Build Coastguard Worker 		  "	 fmul v1.4s, v16.4s, v20.4s\n"
236*28e138c6SAndroid Build Coastguard Worker 		  "      b.ne 4f\n"
237*28e138c6SAndroid Build Coastguard Worker 		  "	 b 5f\n"
238*28e138c6SAndroid Build Coastguard Worker 		  "1:"
239*28e138c6SAndroid Build Coastguard Worker 		  "	 ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[b]], #64\n"
240*28e138c6SAndroid Build Coastguard Worker 		  "	 ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%[a]], #64\n"
241*28e138c6SAndroid Build Coastguard Worker 		  "	 subs %w[len], %w[len], #16\n"
242*28e138c6SAndroid Build Coastguard Worker 		  "	 fmul v1.4s, v16.4s, v20.4s\n"
243*28e138c6SAndroid Build Coastguard Worker 		  "	 fmul v2.4s, v17.4s, v21.4s\n"
244*28e138c6SAndroid Build Coastguard Worker 		  "	 fmul v3.4s, v18.4s, v22.4s\n"
245*28e138c6SAndroid Build Coastguard Worker 		  "	 fmul v4.4s, v19.4s, v23.4s\n"
246*28e138c6SAndroid Build Coastguard Worker 		  "	 b.eq 3f\n"
247*28e138c6SAndroid Build Coastguard Worker 		  "2:"
248*28e138c6SAndroid Build Coastguard Worker 		  "	 ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[b]], #64\n"
249*28e138c6SAndroid Build Coastguard Worker 		  "	 ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%[a]], #64\n"
250*28e138c6SAndroid Build Coastguard Worker 		  "	 subs %w[len], %w[len], #16\n"
251*28e138c6SAndroid Build Coastguard Worker 		  "	 fmla v1.4s, v16.4s, v20.4s\n"
252*28e138c6SAndroid Build Coastguard Worker 		  "	 fmla v2.4s, v17.4s, v21.4s\n"
253*28e138c6SAndroid Build Coastguard Worker 		  "	 fmla v3.4s, v18.4s, v22.4s\n"
254*28e138c6SAndroid Build Coastguard Worker 		  "	 fmla v4.4s, v19.4s, v23.4s\n"
255*28e138c6SAndroid Build Coastguard Worker 		  "	 b.ne 2b\n"
256*28e138c6SAndroid Build Coastguard Worker 		  "3:"
257*28e138c6SAndroid Build Coastguard Worker 		  "	 fadd v16.4s, v1.4s, v2.4s\n"
258*28e138c6SAndroid Build Coastguard Worker 		  "	 fadd v17.4s, v3.4s, v4.4s\n"
259*28e138c6SAndroid Build Coastguard Worker 		  "	 cmp %w[remainder], #0\n"
260*28e138c6SAndroid Build Coastguard Worker 		  "	 fadd v1.4s, v16.4s, v17.4s\n"
261*28e138c6SAndroid Build Coastguard Worker 		  "	 b.eq 5f\n"
262*28e138c6SAndroid Build Coastguard Worker 		  "4:"
263*28e138c6SAndroid Build Coastguard Worker 		  "	 ld1 {v18.4s}, [%[b]], #16\n"
264*28e138c6SAndroid Build Coastguard Worker 		  "	 ld1 {v22.4s}, [%[a]], #16\n"
265*28e138c6SAndroid Build Coastguard Worker 		  "	 subs %w[remainder], %w[remainder], #4\n"
266*28e138c6SAndroid Build Coastguard Worker 		  "	 fmla v1.4s, v18.4s, v22.4s\n"
267*28e138c6SAndroid Build Coastguard Worker 		  "	 b.ne 4b\n"
268*28e138c6SAndroid Build Coastguard Worker 		  "5:"
269*28e138c6SAndroid Build Coastguard Worker 		  "	 faddp v1.4s, v1.4s, v1.4s\n"
270*28e138c6SAndroid Build Coastguard Worker 		  "	 faddp %[ret].4s, v1.4s, v1.4s\n"
271*28e138c6SAndroid Build Coastguard Worker 		  : [ret] "=w" (ret), [a] "+r" (a), [b] "+r" (b),
272*28e138c6SAndroid Build Coastguard Worker 		    [len] "+r" (len), [remainder] "+r" (remainder)
273*28e138c6SAndroid Build Coastguard Worker 		  :
274*28e138c6SAndroid Build Coastguard Worker 		  : "cc", "v1", "v2", "v3", "v4",
275*28e138c6SAndroid Build Coastguard Worker 		    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
276*28e138c6SAndroid Build Coastguard Worker     return ret;
277*28e138c6SAndroid Build Coastguard Worker }
278*28e138c6SAndroid Build Coastguard Worker #else
inner_product_single(const float * a,const float * b,unsigned int len)279*28e138c6SAndroid Build Coastguard Worker static inline float inner_product_single(const float *a, const float *b, unsigned int len)
280*28e138c6SAndroid Build Coastguard Worker {
281*28e138c6SAndroid Build Coastguard Worker     float ret;
282*28e138c6SAndroid Build Coastguard Worker     uint32_t remainder = len % 16;
283*28e138c6SAndroid Build Coastguard Worker     len = len - remainder;
284*28e138c6SAndroid Build Coastguard Worker 
285*28e138c6SAndroid Build Coastguard Worker     asm volatile ("	 cmp %[len], #0\n"
286*28e138c6SAndroid Build Coastguard Worker 		  "	 bne 1f\n"
287*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.32 {q4}, [%[b]]!\n"
288*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.32 {q8}, [%[a]]!\n"
289*28e138c6SAndroid Build Coastguard Worker 		  "	 subs %[remainder], %[remainder], #4\n"
290*28e138c6SAndroid Build Coastguard Worker 		  "	 vmul.f32 q0, q4, q8\n"
291*28e138c6SAndroid Build Coastguard Worker 		  "      bne 4f\n"
292*28e138c6SAndroid Build Coastguard Worker 		  "	 b 5f\n"
293*28e138c6SAndroid Build Coastguard Worker 		  "1:"
294*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.32 {q4, q5}, [%[b]]!\n"
295*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.32 {q8, q9}, [%[a]]!\n"
296*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.32 {q6, q7}, [%[b]]!\n"
297*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.32 {q10, q11}, [%[a]]!\n"
298*28e138c6SAndroid Build Coastguard Worker 		  "	 subs %[len], %[len], #16\n"
299*28e138c6SAndroid Build Coastguard Worker 		  "	 vmul.f32 q0, q4, q8\n"
300*28e138c6SAndroid Build Coastguard Worker 		  "	 vmul.f32 q1, q5, q9\n"
301*28e138c6SAndroid Build Coastguard Worker 		  "	 vmul.f32 q2, q6, q10\n"
302*28e138c6SAndroid Build Coastguard Worker 		  "	 vmul.f32 q3, q7, q11\n"
303*28e138c6SAndroid Build Coastguard Worker 		  "	 beq 3f\n"
304*28e138c6SAndroid Build Coastguard Worker 		  "2:"
305*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.32 {q4, q5}, [%[b]]!\n"
306*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.32 {q8, q9}, [%[a]]!\n"
307*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.32 {q6, q7}, [%[b]]!\n"
308*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.32 {q10, q11}, [%[a]]!\n"
309*28e138c6SAndroid Build Coastguard Worker 		  "	 subs %[len], %[len], #16\n"
310*28e138c6SAndroid Build Coastguard Worker 		  "	 vmla.f32 q0, q4, q8\n"
311*28e138c6SAndroid Build Coastguard Worker 		  "	 vmla.f32 q1, q5, q9\n"
312*28e138c6SAndroid Build Coastguard Worker 		  "	 vmla.f32 q2, q6, q10\n"
313*28e138c6SAndroid Build Coastguard Worker 		  "	 vmla.f32 q3, q7, q11\n"
314*28e138c6SAndroid Build Coastguard Worker 		  "	 bne 2b\n"
315*28e138c6SAndroid Build Coastguard Worker 		  "3:"
316*28e138c6SAndroid Build Coastguard Worker 		  "	 vadd.f32 q4, q0, q1\n"
317*28e138c6SAndroid Build Coastguard Worker 		  "	 vadd.f32 q5, q2, q3\n"
318*28e138c6SAndroid Build Coastguard Worker 		  "	 cmp %[remainder], #0\n"
319*28e138c6SAndroid Build Coastguard Worker 		  "	 vadd.f32 q0, q4, q5\n"
320*28e138c6SAndroid Build Coastguard Worker 		  "	 beq 5f\n"
321*28e138c6SAndroid Build Coastguard Worker 		  "4:"
322*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.32 {q6}, [%[b]]!\n"
323*28e138c6SAndroid Build Coastguard Worker 		  "	 vld1.32 {q10}, [%[a]]!\n"
324*28e138c6SAndroid Build Coastguard Worker 		  "	 subs %[remainder], %[remainder], #4\n"
325*28e138c6SAndroid Build Coastguard Worker 		  "	 vmla.f32 q0, q6, q10\n"
326*28e138c6SAndroid Build Coastguard Worker 		  "	 bne 4b\n"
327*28e138c6SAndroid Build Coastguard Worker 		  "5:"
328*28e138c6SAndroid Build Coastguard Worker 		  "	 vadd.f32 d0, d0, d1\n"
329*28e138c6SAndroid Build Coastguard Worker 		  "	 vpadd.f32 d0, d0, d0\n"
330*28e138c6SAndroid Build Coastguard Worker 		  "	 vmov.f32 %[ret], d0[0]\n"
331*28e138c6SAndroid Build Coastguard Worker 		  : [ret] "=r" (ret), [a] "+r" (a), [b] "+r" (b),
332*28e138c6SAndroid Build Coastguard Worker 		    [len] "+l" (len), [remainder] "+l" (remainder)
333*28e138c6SAndroid Build Coastguard Worker 		  :
334*28e138c6SAndroid Build Coastguard Worker 		  : "cc", "q0", "q1", "q2", "q3",
335*28e138c6SAndroid Build Coastguard Worker 		    "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
336*28e138c6SAndroid Build Coastguard Worker     return ret;
337*28e138c6SAndroid Build Coastguard Worker }
338*28e138c6SAndroid Build Coastguard Worker #endif  // defined(__aarch64__)
339*28e138c6SAndroid Build Coastguard Worker #endif
340