xref: /aosp_15_r20/frameworks/av/media/libaudioprocessing/AudioResamplerFirProcessNeon.h (revision ec779b8e0859a360c3d303172224686826e6e0e1)
1*ec779b8eSAndroid Build Coastguard Worker /*
2*ec779b8eSAndroid Build Coastguard Worker  * Copyright (C) 2013 The Android Open Source Project
3*ec779b8eSAndroid Build Coastguard Worker  *
4*ec779b8eSAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*ec779b8eSAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*ec779b8eSAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*ec779b8eSAndroid Build Coastguard Worker  *
8*ec779b8eSAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*ec779b8eSAndroid Build Coastguard Worker  *
10*ec779b8eSAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*ec779b8eSAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*ec779b8eSAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*ec779b8eSAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*ec779b8eSAndroid Build Coastguard Worker  * limitations under the License.
15*ec779b8eSAndroid Build Coastguard Worker  */
16*ec779b8eSAndroid Build Coastguard Worker 
17*ec779b8eSAndroid Build Coastguard Worker #ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
18*ec779b8eSAndroid Build Coastguard Worker #define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
19*ec779b8eSAndroid Build Coastguard Worker 
20*ec779b8eSAndroid Build Coastguard Worker namespace android {
21*ec779b8eSAndroid Build Coastguard Worker 
22*ec779b8eSAndroid Build Coastguard Worker // depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
23*ec779b8eSAndroid Build Coastguard Worker 
24*ec779b8eSAndroid Build Coastguard Worker #if USE_NEON
25*ec779b8eSAndroid Build Coastguard Worker 
// Use intrinsics if inline arm32 assembly is not possible.
// NOTE(review): USE_INLINE_ASSEMBLY is presumably provided by one of the headers
// this file depends on; an undefined macro evaluates to 0 inside #if, which
// also selects the intrinsic path.
#if !USE_INLINE_ASSEMBLY
#define USE_INTRINSIC
#endif

// The following intrinsics are available only on ARM 64 bit ACLE.
// Removing the names on other targets makes the `#ifdef vld1q_*_x2` checks
// further down in this file take their two-single-load fallback.
#ifndef __aarch64__
#undef vld1q_f32_x2
#undef vld1q_s32_x2
#endif

// Two-level stringification: TO_STRING(x) macro-expands x before quoting it,
// whereas TO_STRING2(x) quotes its argument verbatim.
#define TO_STRING2(x) #x
#define TO_STRING(x) TO_STRING2(x)
// uncomment to print GCC version, may be relevant for intrinsic optimizations
/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
        "." TO_STRING(__GNUC_MINOR__) \
        "." TO_STRING(__GNUC_PATCHLEVEL__)) */
43*ec779b8eSAndroid Build Coastguard Worker 
44*ec779b8eSAndroid Build Coastguard Worker //
45*ec779b8eSAndroid Build Coastguard Worker // NEON specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
46*ec779b8eSAndroid Build Coastguard Worker //
47*ec779b8eSAndroid Build Coastguard Worker // Two variants are presented here:
48*ec779b8eSAndroid Build Coastguard Worker // ARM NEON inline assembly which appears up to 10-15% faster than intrinsics (gcc 4.9) for arm32.
49*ec779b8eSAndroid Build Coastguard Worker // ARM NEON intrinsics which can also be used by arm64 and x86/64 with NEON header.
50*ec779b8eSAndroid Build Coastguard Worker //
51*ec779b8eSAndroid Build Coastguard Worker 
// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
// These are only used for inline assembly.
//
// ASSEMBLY_ACCUMULATE_MONO expands to a sequence of string literals meant to be
// pasted at the end of an inline asm statement that declares the named operands
// %[vLR] (address of the two volumes, loaded with a 64-bit alignment hint) and
// %[out] (the output pair, loaded and stored without an alignment hint).
// On entry d0/d1 (= q0) hold four partial sums. The sequence reduces them to a
// single sum replicated into both lanes, scales by the volumes, saturating-adds
// the result to the existing output pair and writes it back.
// It overwrites d0, d2 and d3.
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\
        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */
62*ec779b8eSAndroid Build Coastguard Worker 
// Stereo variant of the accumulate-and-store tail for inline assembly.
// Expands to string literals meant to be pasted at the end of an inline asm
// statement that declares the named operands %[vLR] (address of the two
// volumes, loaded with a 64-bit alignment hint) and %[out] (the output pair).
// On entry d0/d1 (= q0) and d8/d9 (= q4) each hold four partial sums. Each
// register pair is reduced with pairwise adds and the two reductions are
// combined into one L/R pair, which is scaled by the volumes, saturating-added
// to the existing output pair and written back.
// It overwrites d0, d2, d3 and d8.
#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0 */\
        "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4 */\
        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */
72*ec779b8eSAndroid Build Coastguard Worker 
73*ec779b8eSAndroid Build Coastguard Worker template <int CHANNELS, int STRIDE, bool FIXED>
ProcessNeonIntrinsic(int32_t * out,int count,const int16_t * coefsP,const int16_t * coefsN,const int16_t * sP,const int16_t * sN,const int32_t * volumeLR,uint32_t lerpP,const int16_t * coefsP1,const int16_t * coefsN1)74*ec779b8eSAndroid Build Coastguard Worker static inline void ProcessNeonIntrinsic(int32_t* out,
75*ec779b8eSAndroid Build Coastguard Worker         int count,
76*ec779b8eSAndroid Build Coastguard Worker         const int16_t* coefsP,
77*ec779b8eSAndroid Build Coastguard Worker         const int16_t* coefsN,
78*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sP,
79*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sN,
80*ec779b8eSAndroid Build Coastguard Worker         const int32_t* volumeLR,
81*ec779b8eSAndroid Build Coastguard Worker         uint32_t lerpP,
82*ec779b8eSAndroid Build Coastguard Worker         const int16_t* coefsP1,
83*ec779b8eSAndroid Build Coastguard Worker         const int16_t* coefsN1)
84*ec779b8eSAndroid Build Coastguard Worker {
85*ec779b8eSAndroid Build Coastguard Worker     ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
86*ec779b8eSAndroid Build Coastguard Worker     static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");
87*ec779b8eSAndroid Build Coastguard Worker 
88*ec779b8eSAndroid Build Coastguard Worker     sP -= CHANNELS*((STRIDE>>1)-1);
89*ec779b8eSAndroid Build Coastguard Worker     coefsP = (const int16_t*)__builtin_assume_aligned(coefsP, 16);
90*ec779b8eSAndroid Build Coastguard Worker     coefsN = (const int16_t*)__builtin_assume_aligned(coefsN, 16);
91*ec779b8eSAndroid Build Coastguard Worker 
92*ec779b8eSAndroid Build Coastguard Worker     int16x4_t interp;
93*ec779b8eSAndroid Build Coastguard Worker     if (!FIXED) {
94*ec779b8eSAndroid Build Coastguard Worker         interp = vdup_n_s16(lerpP);
95*ec779b8eSAndroid Build Coastguard Worker         //interp = (int16x4_t)vset_lane_s32 ((int32x2_t)lerpP, interp, 0);
96*ec779b8eSAndroid Build Coastguard Worker         coefsP1 = (const int16_t*)__builtin_assume_aligned(coefsP1, 16);
97*ec779b8eSAndroid Build Coastguard Worker         coefsN1 = (const int16_t*)__builtin_assume_aligned(coefsN1, 16);
98*ec779b8eSAndroid Build Coastguard Worker     }
99*ec779b8eSAndroid Build Coastguard Worker     int32x4_t accum, accum2;
100*ec779b8eSAndroid Build Coastguard Worker     // warning uninitialized if we use veorq_s32
101*ec779b8eSAndroid Build Coastguard Worker     // (alternative to below) accum = veorq_s32(accum, accum);
102*ec779b8eSAndroid Build Coastguard Worker     accum = vdupq_n_s32(0);
103*ec779b8eSAndroid Build Coastguard Worker     if (CHANNELS == 2) {
104*ec779b8eSAndroid Build Coastguard Worker         // (alternative to below) accum2 = veorq_s32(accum2, accum2);
105*ec779b8eSAndroid Build Coastguard Worker         accum2 = vdupq_n_s32(0);
106*ec779b8eSAndroid Build Coastguard Worker     }
107*ec779b8eSAndroid Build Coastguard Worker     do {
108*ec779b8eSAndroid Build Coastguard Worker         int16x8_t posCoef = vld1q_s16(coefsP);
109*ec779b8eSAndroid Build Coastguard Worker         coefsP += 8;
110*ec779b8eSAndroid Build Coastguard Worker         int16x8_t negCoef = vld1q_s16(coefsN);
111*ec779b8eSAndroid Build Coastguard Worker         coefsN += 8;
112*ec779b8eSAndroid Build Coastguard Worker         if (!FIXED) { // interpolate
113*ec779b8eSAndroid Build Coastguard Worker             int16x8_t posCoef1 = vld1q_s16(coefsP1);
114*ec779b8eSAndroid Build Coastguard Worker             coefsP1 += 8;
115*ec779b8eSAndroid Build Coastguard Worker             int16x8_t negCoef1 = vld1q_s16(coefsN1);
116*ec779b8eSAndroid Build Coastguard Worker             coefsN1 += 8;
117*ec779b8eSAndroid Build Coastguard Worker 
118*ec779b8eSAndroid Build Coastguard Worker             posCoef1 = vsubq_s16(posCoef1, posCoef);
119*ec779b8eSAndroid Build Coastguard Worker             negCoef = vsubq_s16(negCoef, negCoef1);
120*ec779b8eSAndroid Build Coastguard Worker 
121*ec779b8eSAndroid Build Coastguard Worker             posCoef1 = vqrdmulhq_lane_s16(posCoef1, interp, 0);
122*ec779b8eSAndroid Build Coastguard Worker             negCoef = vqrdmulhq_lane_s16(negCoef, interp, 0);
123*ec779b8eSAndroid Build Coastguard Worker 
124*ec779b8eSAndroid Build Coastguard Worker             posCoef = vaddq_s16(posCoef, posCoef1);
125*ec779b8eSAndroid Build Coastguard Worker             negCoef = vaddq_s16(negCoef, negCoef1);
126*ec779b8eSAndroid Build Coastguard Worker         }
127*ec779b8eSAndroid Build Coastguard Worker         switch (CHANNELS) {
128*ec779b8eSAndroid Build Coastguard Worker         case 1: {
129*ec779b8eSAndroid Build Coastguard Worker             int16x8_t posSamp = vld1q_s16(sP);
130*ec779b8eSAndroid Build Coastguard Worker             int16x8_t negSamp = vld1q_s16(sN);
131*ec779b8eSAndroid Build Coastguard Worker             sN += 8;
132*ec779b8eSAndroid Build Coastguard Worker             posSamp = vrev64q_s16(posSamp);
133*ec779b8eSAndroid Build Coastguard Worker 
134*ec779b8eSAndroid Build Coastguard Worker             // dot product
135*ec779b8eSAndroid Build Coastguard Worker             accum = vmlal_s16(accum, vget_low_s16(posSamp), vget_high_s16(posCoef)); // reversed
136*ec779b8eSAndroid Build Coastguard Worker             accum = vmlal_s16(accum, vget_high_s16(posSamp), vget_low_s16(posCoef)); // reversed
137*ec779b8eSAndroid Build Coastguard Worker             accum = vmlal_s16(accum, vget_low_s16(negSamp), vget_low_s16(negCoef));
138*ec779b8eSAndroid Build Coastguard Worker             accum = vmlal_s16(accum, vget_high_s16(negSamp), vget_high_s16(negCoef));
139*ec779b8eSAndroid Build Coastguard Worker             sP -= 8;
140*ec779b8eSAndroid Build Coastguard Worker         } break;
141*ec779b8eSAndroid Build Coastguard Worker         case 2: {
142*ec779b8eSAndroid Build Coastguard Worker             int16x8x2_t posSamp = vld2q_s16(sP);
143*ec779b8eSAndroid Build Coastguard Worker             int16x8x2_t negSamp = vld2q_s16(sN);
144*ec779b8eSAndroid Build Coastguard Worker             sN += 16;
145*ec779b8eSAndroid Build Coastguard Worker             posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
146*ec779b8eSAndroid Build Coastguard Worker             posSamp.val[1] = vrev64q_s16(posSamp.val[1]);
147*ec779b8eSAndroid Build Coastguard Worker 
148*ec779b8eSAndroid Build Coastguard Worker             // dot product
149*ec779b8eSAndroid Build Coastguard Worker             accum = vmlal_s16(accum, vget_low_s16(posSamp.val[0]), vget_high_s16(posCoef)); // r
150*ec779b8eSAndroid Build Coastguard Worker             accum = vmlal_s16(accum, vget_high_s16(posSamp.val[0]), vget_low_s16(posCoef)); // r
151*ec779b8eSAndroid Build Coastguard Worker             accum2 = vmlal_s16(accum2, vget_low_s16(posSamp.val[1]), vget_high_s16(posCoef)); // r
152*ec779b8eSAndroid Build Coastguard Worker             accum2 = vmlal_s16(accum2, vget_high_s16(posSamp.val[1]), vget_low_s16(posCoef)); // r
153*ec779b8eSAndroid Build Coastguard Worker             accum = vmlal_s16(accum, vget_low_s16(negSamp.val[0]), vget_low_s16(negCoef));
154*ec779b8eSAndroid Build Coastguard Worker             accum = vmlal_s16(accum, vget_high_s16(negSamp.val[0]), vget_high_s16(negCoef));
155*ec779b8eSAndroid Build Coastguard Worker             accum2 = vmlal_s16(accum2, vget_low_s16(negSamp.val[1]), vget_low_s16(negCoef));
156*ec779b8eSAndroid Build Coastguard Worker             accum2 = vmlal_s16(accum2, vget_high_s16(negSamp.val[1]), vget_high_s16(negCoef));
157*ec779b8eSAndroid Build Coastguard Worker             sP -= 16;
158*ec779b8eSAndroid Build Coastguard Worker         } break;
159*ec779b8eSAndroid Build Coastguard Worker         }
160*ec779b8eSAndroid Build Coastguard Worker     } while (count -= 8);
161*ec779b8eSAndroid Build Coastguard Worker 
162*ec779b8eSAndroid Build Coastguard Worker     // multiply by volume and save
163*ec779b8eSAndroid Build Coastguard Worker     volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
164*ec779b8eSAndroid Build Coastguard Worker     int32x2_t vLR = vld1_s32(volumeLR);
165*ec779b8eSAndroid Build Coastguard Worker     int32x2_t outSamp = vld1_s32(out);
166*ec779b8eSAndroid Build Coastguard Worker     // combine and funnel down accumulator
167*ec779b8eSAndroid Build Coastguard Worker     int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
168*ec779b8eSAndroid Build Coastguard Worker     if (CHANNELS == 1) {
169*ec779b8eSAndroid Build Coastguard Worker         // duplicate accum to both L and R
170*ec779b8eSAndroid Build Coastguard Worker         outAccum = vpadd_s32(outAccum, outAccum);
171*ec779b8eSAndroid Build Coastguard Worker     } else if (CHANNELS == 2) {
172*ec779b8eSAndroid Build Coastguard Worker         // accum2 contains R, fold in
173*ec779b8eSAndroid Build Coastguard Worker         int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
174*ec779b8eSAndroid Build Coastguard Worker         outAccum = vpadd_s32(outAccum, outAccum2);
175*ec779b8eSAndroid Build Coastguard Worker     }
176*ec779b8eSAndroid Build Coastguard Worker     outAccum = vqrdmulh_s32(outAccum, vLR);
177*ec779b8eSAndroid Build Coastguard Worker     outSamp = vqadd_s32(outSamp, outAccum);
178*ec779b8eSAndroid Build Coastguard Worker     vst1_s32(out, outSamp);
179*ec779b8eSAndroid Build Coastguard Worker }
180*ec779b8eSAndroid Build Coastguard Worker 
181*ec779b8eSAndroid Build Coastguard Worker template <int CHANNELS, int STRIDE, bool FIXED>
ProcessNeonIntrinsic(int32_t * out,int count,const int32_t * coefsP,const int32_t * coefsN,const int16_t * sP,const int16_t * sN,const int32_t * volumeLR,uint32_t lerpP,const int32_t * coefsP1,const int32_t * coefsN1)182*ec779b8eSAndroid Build Coastguard Worker static inline void ProcessNeonIntrinsic(int32_t* out,
183*ec779b8eSAndroid Build Coastguard Worker         int count,
184*ec779b8eSAndroid Build Coastguard Worker         const int32_t* coefsP,
185*ec779b8eSAndroid Build Coastguard Worker         const int32_t* coefsN,
186*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sP,
187*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sN,
188*ec779b8eSAndroid Build Coastguard Worker         const int32_t* volumeLR,
189*ec779b8eSAndroid Build Coastguard Worker         uint32_t lerpP,
190*ec779b8eSAndroid Build Coastguard Worker         const int32_t* coefsP1,
191*ec779b8eSAndroid Build Coastguard Worker         const int32_t* coefsN1)
192*ec779b8eSAndroid Build Coastguard Worker {
193*ec779b8eSAndroid Build Coastguard Worker     ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
194*ec779b8eSAndroid Build Coastguard Worker     static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");
195*ec779b8eSAndroid Build Coastguard Worker 
196*ec779b8eSAndroid Build Coastguard Worker     sP -= CHANNELS*((STRIDE>>1)-1);
197*ec779b8eSAndroid Build Coastguard Worker     coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
198*ec779b8eSAndroid Build Coastguard Worker     coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);
199*ec779b8eSAndroid Build Coastguard Worker 
200*ec779b8eSAndroid Build Coastguard Worker     int32x2_t interp;
201*ec779b8eSAndroid Build Coastguard Worker     if (!FIXED) {
202*ec779b8eSAndroid Build Coastguard Worker         interp = vdup_n_s32(lerpP);
203*ec779b8eSAndroid Build Coastguard Worker         coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
204*ec779b8eSAndroid Build Coastguard Worker         coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
205*ec779b8eSAndroid Build Coastguard Worker     }
206*ec779b8eSAndroid Build Coastguard Worker     int32x4_t accum, accum2;
207*ec779b8eSAndroid Build Coastguard Worker     // warning uninitialized if we use veorq_s32
208*ec779b8eSAndroid Build Coastguard Worker     // (alternative to below) accum = veorq_s32(accum, accum);
209*ec779b8eSAndroid Build Coastguard Worker     accum = vdupq_n_s32(0);
210*ec779b8eSAndroid Build Coastguard Worker     if (CHANNELS == 2) {
211*ec779b8eSAndroid Build Coastguard Worker         // (alternative to below) accum2 = veorq_s32(accum2, accum2);
212*ec779b8eSAndroid Build Coastguard Worker         accum2 = vdupq_n_s32(0);
213*ec779b8eSAndroid Build Coastguard Worker     }
214*ec779b8eSAndroid Build Coastguard Worker     do {
215*ec779b8eSAndroid Build Coastguard Worker #ifdef vld1q_s32_x2
216*ec779b8eSAndroid Build Coastguard Worker         int32x4x2_t posCoef = vld1q_s32_x2(coefsP);
217*ec779b8eSAndroid Build Coastguard Worker         coefsP += 8;
218*ec779b8eSAndroid Build Coastguard Worker         int32x4x2_t negCoef = vld1q_s32_x2(coefsN);
219*ec779b8eSAndroid Build Coastguard Worker         coefsN += 8;
220*ec779b8eSAndroid Build Coastguard Worker #else
221*ec779b8eSAndroid Build Coastguard Worker         int32x4x2_t posCoef;
222*ec779b8eSAndroid Build Coastguard Worker         posCoef.val[0] = vld1q_s32(coefsP);
223*ec779b8eSAndroid Build Coastguard Worker         coefsP += 4;
224*ec779b8eSAndroid Build Coastguard Worker         posCoef.val[1] = vld1q_s32(coefsP);
225*ec779b8eSAndroid Build Coastguard Worker         coefsP += 4;
226*ec779b8eSAndroid Build Coastguard Worker         int32x4x2_t negCoef;
227*ec779b8eSAndroid Build Coastguard Worker         negCoef.val[0] = vld1q_s32(coefsN);
228*ec779b8eSAndroid Build Coastguard Worker         coefsN += 4;
229*ec779b8eSAndroid Build Coastguard Worker         negCoef.val[1] = vld1q_s32(coefsN);
230*ec779b8eSAndroid Build Coastguard Worker         coefsN += 4;
231*ec779b8eSAndroid Build Coastguard Worker #endif
232*ec779b8eSAndroid Build Coastguard Worker         if (!FIXED) { // interpolate
233*ec779b8eSAndroid Build Coastguard Worker #ifdef vld1q_s32_x2
234*ec779b8eSAndroid Build Coastguard Worker             int32x4x2_t posCoef1 = vld1q_s32_x2(coefsP1);
235*ec779b8eSAndroid Build Coastguard Worker             coefsP1 += 8;
236*ec779b8eSAndroid Build Coastguard Worker             int32x4x2_t negCoef1 = vld1q_s32_x2(coefsN1);
237*ec779b8eSAndroid Build Coastguard Worker             coefsN1 += 8;
238*ec779b8eSAndroid Build Coastguard Worker #else
239*ec779b8eSAndroid Build Coastguard Worker             int32x4x2_t posCoef1;
240*ec779b8eSAndroid Build Coastguard Worker             posCoef1.val[0] = vld1q_s32(coefsP1);
241*ec779b8eSAndroid Build Coastguard Worker             coefsP1 += 4;
242*ec779b8eSAndroid Build Coastguard Worker             posCoef1.val[1] = vld1q_s32(coefsP1);
243*ec779b8eSAndroid Build Coastguard Worker             coefsP1 += 4;
244*ec779b8eSAndroid Build Coastguard Worker             int32x4x2_t negCoef1;
245*ec779b8eSAndroid Build Coastguard Worker             negCoef1.val[0] = vld1q_s32(coefsN1);
246*ec779b8eSAndroid Build Coastguard Worker             coefsN1 += 4;
247*ec779b8eSAndroid Build Coastguard Worker             negCoef1.val[1] = vld1q_s32(coefsN1);
248*ec779b8eSAndroid Build Coastguard Worker             coefsN1 += 4;
249*ec779b8eSAndroid Build Coastguard Worker #endif
250*ec779b8eSAndroid Build Coastguard Worker 
251*ec779b8eSAndroid Build Coastguard Worker             posCoef1.val[0] = vsubq_s32(posCoef1.val[0], posCoef.val[0]);
252*ec779b8eSAndroid Build Coastguard Worker             posCoef1.val[1] = vsubq_s32(posCoef1.val[1], posCoef.val[1]);
253*ec779b8eSAndroid Build Coastguard Worker             negCoef.val[0] = vsubq_s32(negCoef.val[0], negCoef1.val[0]);
254*ec779b8eSAndroid Build Coastguard Worker             negCoef.val[1] = vsubq_s32(negCoef.val[1], negCoef1.val[1]);
255*ec779b8eSAndroid Build Coastguard Worker 
256*ec779b8eSAndroid Build Coastguard Worker             posCoef1.val[0] = vqrdmulhq_lane_s32(posCoef1.val[0], interp, 0);
257*ec779b8eSAndroid Build Coastguard Worker             posCoef1.val[1] = vqrdmulhq_lane_s32(posCoef1.val[1], interp, 0);
258*ec779b8eSAndroid Build Coastguard Worker             negCoef.val[0] = vqrdmulhq_lane_s32(negCoef.val[0], interp, 0);
259*ec779b8eSAndroid Build Coastguard Worker             negCoef.val[1] = vqrdmulhq_lane_s32(negCoef.val[1], interp, 0);
260*ec779b8eSAndroid Build Coastguard Worker 
261*ec779b8eSAndroid Build Coastguard Worker             posCoef.val[0] = vaddq_s32(posCoef.val[0], posCoef1.val[0]);
262*ec779b8eSAndroid Build Coastguard Worker             posCoef.val[1] = vaddq_s32(posCoef.val[1], posCoef1.val[1]);
263*ec779b8eSAndroid Build Coastguard Worker             negCoef.val[0] = vaddq_s32(negCoef.val[0], negCoef1.val[0]);
264*ec779b8eSAndroid Build Coastguard Worker             negCoef.val[1] = vaddq_s32(negCoef.val[1], negCoef1.val[1]);
265*ec779b8eSAndroid Build Coastguard Worker         }
266*ec779b8eSAndroid Build Coastguard Worker         switch (CHANNELS) {
267*ec779b8eSAndroid Build Coastguard Worker         case 1: {
268*ec779b8eSAndroid Build Coastguard Worker             int16x8_t posSamp = vld1q_s16(sP);
269*ec779b8eSAndroid Build Coastguard Worker             int16x8_t negSamp = vld1q_s16(sN);
270*ec779b8eSAndroid Build Coastguard Worker             sN += 8;
271*ec779b8eSAndroid Build Coastguard Worker             posSamp = vrev64q_s16(posSamp);
272*ec779b8eSAndroid Build Coastguard Worker 
273*ec779b8eSAndroid Build Coastguard Worker             int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp), 15);
274*ec779b8eSAndroid Build Coastguard Worker             int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp), 15);
275*ec779b8eSAndroid Build Coastguard Worker             int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp), 15);
276*ec779b8eSAndroid Build Coastguard Worker             int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp), 15);
277*ec779b8eSAndroid Build Coastguard Worker 
278*ec779b8eSAndroid Build Coastguard Worker             // dot product
279*ec779b8eSAndroid Build Coastguard Worker             posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
280*ec779b8eSAndroid Build Coastguard Worker             posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
281*ec779b8eSAndroid Build Coastguard Worker             negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
282*ec779b8eSAndroid Build Coastguard Worker             negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);
283*ec779b8eSAndroid Build Coastguard Worker 
284*ec779b8eSAndroid Build Coastguard Worker             accum = vaddq_s32(accum, posSamp0);
285*ec779b8eSAndroid Build Coastguard Worker             negSamp0 = vaddq_s32(negSamp0, negSamp1);
286*ec779b8eSAndroid Build Coastguard Worker             accum = vaddq_s32(accum, posSamp1);
287*ec779b8eSAndroid Build Coastguard Worker             accum = vaddq_s32(accum, negSamp0);
288*ec779b8eSAndroid Build Coastguard Worker 
289*ec779b8eSAndroid Build Coastguard Worker             sP -= 8;
290*ec779b8eSAndroid Build Coastguard Worker         } break;
291*ec779b8eSAndroid Build Coastguard Worker         case 2: {
292*ec779b8eSAndroid Build Coastguard Worker             int16x8x2_t posSamp = vld2q_s16(sP);
293*ec779b8eSAndroid Build Coastguard Worker             int16x8x2_t negSamp = vld2q_s16(sN);
294*ec779b8eSAndroid Build Coastguard Worker             sN += 16;
295*ec779b8eSAndroid Build Coastguard Worker             posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
296*ec779b8eSAndroid Build Coastguard Worker             posSamp.val[1] = vrev64q_s16(posSamp.val[1]);
297*ec779b8eSAndroid Build Coastguard Worker 
298*ec779b8eSAndroid Build Coastguard Worker             // left
299*ec779b8eSAndroid Build Coastguard Worker             int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[0]), 15);
300*ec779b8eSAndroid Build Coastguard Worker             int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[0]), 15);
301*ec779b8eSAndroid Build Coastguard Worker             int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[0]), 15);
302*ec779b8eSAndroid Build Coastguard Worker             int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[0]), 15);
303*ec779b8eSAndroid Build Coastguard Worker 
304*ec779b8eSAndroid Build Coastguard Worker             // dot product
305*ec779b8eSAndroid Build Coastguard Worker             posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
306*ec779b8eSAndroid Build Coastguard Worker             posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
307*ec779b8eSAndroid Build Coastguard Worker             negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
308*ec779b8eSAndroid Build Coastguard Worker             negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);
309*ec779b8eSAndroid Build Coastguard Worker 
310*ec779b8eSAndroid Build Coastguard Worker             accum = vaddq_s32(accum, posSamp0);
311*ec779b8eSAndroid Build Coastguard Worker             negSamp0 = vaddq_s32(negSamp0, negSamp1);
312*ec779b8eSAndroid Build Coastguard Worker             accum = vaddq_s32(accum, posSamp1);
313*ec779b8eSAndroid Build Coastguard Worker             accum = vaddq_s32(accum, negSamp0);
314*ec779b8eSAndroid Build Coastguard Worker 
315*ec779b8eSAndroid Build Coastguard Worker             // right
316*ec779b8eSAndroid Build Coastguard Worker             posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[1]), 15);
317*ec779b8eSAndroid Build Coastguard Worker             posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[1]), 15);
318*ec779b8eSAndroid Build Coastguard Worker             negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[1]), 15);
319*ec779b8eSAndroid Build Coastguard Worker             negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[1]), 15);
320*ec779b8eSAndroid Build Coastguard Worker 
321*ec779b8eSAndroid Build Coastguard Worker             // dot product
322*ec779b8eSAndroid Build Coastguard Worker             posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
323*ec779b8eSAndroid Build Coastguard Worker             posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
324*ec779b8eSAndroid Build Coastguard Worker             negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
325*ec779b8eSAndroid Build Coastguard Worker             negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);
326*ec779b8eSAndroid Build Coastguard Worker 
327*ec779b8eSAndroid Build Coastguard Worker             accum2 = vaddq_s32(accum2, posSamp0);
328*ec779b8eSAndroid Build Coastguard Worker             negSamp0 = vaddq_s32(negSamp0, negSamp1);
329*ec779b8eSAndroid Build Coastguard Worker             accum2 = vaddq_s32(accum2, posSamp1);
330*ec779b8eSAndroid Build Coastguard Worker             accum2 = vaddq_s32(accum2, negSamp0);
331*ec779b8eSAndroid Build Coastguard Worker 
332*ec779b8eSAndroid Build Coastguard Worker             sP -= 16;
333*ec779b8eSAndroid Build Coastguard Worker         } break;
334*ec779b8eSAndroid Build Coastguard Worker         }
335*ec779b8eSAndroid Build Coastguard Worker     } while (count -= 8);
336*ec779b8eSAndroid Build Coastguard Worker 
337*ec779b8eSAndroid Build Coastguard Worker     // multiply by volume and save
338*ec779b8eSAndroid Build Coastguard Worker     volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
339*ec779b8eSAndroid Build Coastguard Worker     int32x2_t vLR = vld1_s32(volumeLR);
340*ec779b8eSAndroid Build Coastguard Worker     int32x2_t outSamp = vld1_s32(out);
341*ec779b8eSAndroid Build Coastguard Worker     // combine and funnel down accumulator
342*ec779b8eSAndroid Build Coastguard Worker     int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
343*ec779b8eSAndroid Build Coastguard Worker     if (CHANNELS == 1) {
344*ec779b8eSAndroid Build Coastguard Worker         // duplicate accum to both L and R
345*ec779b8eSAndroid Build Coastguard Worker         outAccum = vpadd_s32(outAccum, outAccum);
346*ec779b8eSAndroid Build Coastguard Worker     } else if (CHANNELS == 2) {
347*ec779b8eSAndroid Build Coastguard Worker         // accum2 contains R, fold in
348*ec779b8eSAndroid Build Coastguard Worker         int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
349*ec779b8eSAndroid Build Coastguard Worker         outAccum = vpadd_s32(outAccum, outAccum2);
350*ec779b8eSAndroid Build Coastguard Worker     }
351*ec779b8eSAndroid Build Coastguard Worker     outAccum = vqrdmulh_s32(outAccum, vLR);
352*ec779b8eSAndroid Build Coastguard Worker     outSamp = vqadd_s32(outSamp, outAccum);
353*ec779b8eSAndroid Build Coastguard Worker     vst1_s32(out, outSamp);
354*ec779b8eSAndroid Build Coastguard Worker }
355*ec779b8eSAndroid Build Coastguard Worker 
// Float NEON FIR kernel: computes one filtered frame and accumulates it into out.
//
// Template parameters:
//   CHANNELS  1 (mono) or 2 (interleaved stereo); enforced by static_assert.
//   STRIDE    only used to derive the starting offset of sP (see below).
//   FIXED     true  -> coefsP/coefsN are used as-is (lerpP, coefsP1, coefsN1 are not read);
//             false -> coefficients are linearly interpolated with weight lerpP:
//                        positive side: coefsP  + (coefsP1 - coefsP ) * lerpP
//                        negative side: coefsN1 + (coefsN  - coefsN1) * lerpP
//
// Parameters:
//   out       two floats (L, R); read, accumulated into, and written back.
//   count     number of coefficients per side; must be a positive multiple of 8.
//   coefsP/N  coefficient arrays, treated as 16-byte aligned.
//   sP        "positive" sample pointer; it is first moved back by
//             CHANNELS*((STRIDE>>1)-1) and then walks backwards 8 frames per iteration.
//   sN        "negative" sample pointer; walks forwards 8 frames per iteration.
//   volumeLR  two floats, treated as 8-byte aligned; per-lane gain applied to the sum.
//   coefsP1/N1  second coefficient set for interpolation, treated as 16-byte aligned
//             (only dereferenced when !FIXED).
//
// Result: out[i] += sum_i * volumeLR[i]. For mono the same sum feeds both lanes;
// for stereo lane 0 comes from the first interleaved channel and lane 1 from the second.
template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(float* out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* volumeLR,
        float lerpP,
        const float* coefsP1,
        const float* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    // Position sP at the lowest address of the first 8-frame group it will load.
    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = static_cast<const float*>(__builtin_assume_aligned(coefsP, 16));
    coefsN = static_cast<const float*>(__builtin_assume_aligned(coefsN, 16));

    // Only consumed when !FIXED; initialised unconditionally so that no
    // instantiation carries an indeterminate vector value.
    float32x2_t interp = vdup_n_f32(lerpP);
    if (!FIXED) {
        coefsP1 = static_cast<const float*>(__builtin_assume_aligned(coefsP1, 16));
        coefsN1 = static_cast<const float*>(__builtin_assume_aligned(coefsN1, 16));
    }

    // Zeroed with vdupq_n_f32 rather than the self-XOR idiom (veorq_s32(x, x)),
    // which reads the variable before initialisation and triggers a warning.
    // accum2 is only accumulated into for CHANNELS == 2.
    float32x4_t accum = vdupq_n_f32(0);
    float32x4_t accum2 = vdupq_n_f32(0);

    do {
        // Load 8 coefficients for each side.
#ifdef vld1q_f32_x2
        float32x4x2_t posCoef = vld1q_f32_x2(coefsP);
        coefsP += 8;
        float32x4x2_t negCoef = vld1q_f32_x2(coefsN);
        coefsN += 8;
#else
        float32x4x2_t posCoef;
        posCoef.val[0] = vld1q_f32(coefsP);
        coefsP += 4;
        posCoef.val[1] = vld1q_f32(coefsP);
        coefsP += 4;
        float32x4x2_t negCoef;
        negCoef.val[0] = vld1q_f32(coefsN);
        coefsN += 4;
        negCoef.val[1] = vld1q_f32(coefsN);
        coefsN += 4;
#endif
        if (!FIXED) { // interpolate
#ifdef vld1q_f32_x2
            float32x4x2_t posCoef1 = vld1q_f32_x2(coefsP1);
            coefsP1 += 8;
            float32x4x2_t negCoef1 = vld1q_f32_x2(coefsN1);
            coefsN1 += 8;
#else
            float32x4x2_t posCoef1;
            posCoef1.val[0] = vld1q_f32(coefsP1);
            coefsP1 += 4;
            posCoef1.val[1] = vld1q_f32(coefsP1);
            coefsP1 += 4;
            float32x4x2_t negCoef1;
            negCoef1.val[0] = vld1q_f32(coefsN1);
            coefsN1 += 4;
            negCoef1.val[1] = vld1q_f32(coefsN1);
            coefsN1 += 4;
#endif
            // Deltas: positive side is (P1 - P), negative side is (N - N1).
            posCoef1.val[0] = vsubq_f32(posCoef1.val[0], posCoef.val[0]);
            posCoef1.val[1] = vsubq_f32(posCoef1.val[1], posCoef.val[1]);
            negCoef.val[0] = vsubq_f32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vsubq_f32(negCoef.val[1], negCoef1.val[1]);

            // base + delta * lerpP; the negative side uses N1 as its base ("rev").
            posCoef.val[0] = vmlaq_lane_f32(posCoef.val[0], posCoef1.val[0], interp, 0);
            posCoef.val[1] = vmlaq_lane_f32(posCoef.val[1], posCoef1.val[1], interp, 0);
            negCoef.val[0] = vmlaq_lane_f32(negCoef1.val[0], negCoef.val[0], interp, 0); // rev
            negCoef.val[1] = vmlaq_lane_f32(negCoef1.val[1], negCoef.val[1], interp, 0); // rev
        }
        switch (CHANNELS) {
        case 1: {
            // Load 8 samples per side; sP ends up 8 samples lower, sN 8 samples higher.
#ifdef vld1q_f32_x2
            float32x4x2_t posSamp = vld1q_f32_x2(sP);
            float32x4x2_t negSamp = vld1q_f32_x2(sN);
            sN += 8;
            sP -= 8;
#else
            float32x4x2_t posSamp;
            posSamp.val[0] = vld1q_f32(sP);
            sP += 4;
            posSamp.val[1] = vld1q_f32(sP);
            sP -= 12;
            float32x4x2_t negSamp;
            negSamp.val[0] = vld1q_f32(sN);
            sN += 4;
            negSamp.val[1] = vld1q_f32(sN);
            sN += 4;
#endif
            // effectively we want a vrev128q_f32():
            // reverse within each 64-bit half, then swap the halves.
            posSamp.val[0] = vrev64q_f32(posSamp.val[0]);
            posSamp.val[1] = vrev64q_f32(posSamp.val[1]);
            posSamp.val[0] = vcombine_f32(
                    vget_high_f32(posSamp.val[0]), vget_low_f32(posSamp.val[0]));
            posSamp.val[1] = vcombine_f32(
                    vget_high_f32(posSamp.val[1]), vget_low_f32(posSamp.val[1]));

            // Positive samples are reversed, so the two quads pair with the
            // opposite coefficient quads.
            accum = vmlaq_f32(accum, posSamp.val[0], posCoef.val[1]);
            accum = vmlaq_f32(accum, posSamp.val[1], posCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp.val[0], negCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp.val[1], negCoef.val[1]);
        } break;
        case 2: {
            // vld2q de-interleaves: val[0] holds the first channel, val[1] the second.
            float32x4x2_t posSamp0 = vld2q_f32(sP);
            sP += 8;
            float32x4x2_t negSamp0 = vld2q_f32(sN);
            sN += 8;
            posSamp0.val[0] = vrev64q_f32(posSamp0.val[0]);
            posSamp0.val[1] = vrev64q_f32(posSamp0.val[1]);
            posSamp0.val[0] = vcombine_f32(
                    vget_high_f32(posSamp0.val[0]), vget_low_f32(posSamp0.val[0]));
            posSamp0.val[1] = vcombine_f32(
                    vget_high_f32(posSamp0.val[1]), vget_low_f32(posSamp0.val[1]));

            float32x4x2_t posSamp1 = vld2q_f32(sP);
            sP -= 24; // net -16 floats per iteration: 8 stereo frames backwards
            float32x4x2_t negSamp1 = vld2q_f32(sN);
            sN += 8;
            posSamp1.val[0] = vrev64q_f32(posSamp1.val[0]);
            posSamp1.val[1] = vrev64q_f32(posSamp1.val[1]);
            posSamp1.val[0] = vcombine_f32(
                    vget_high_f32(posSamp1.val[0]), vget_low_f32(posSamp1.val[0]));
            posSamp1.val[1] = vcombine_f32(
                    vget_high_f32(posSamp1.val[1]), vget_low_f32(posSamp1.val[1]));

            // Note: speed is affected by accumulation order.
            // Also, speed appears slower using vmul/vadd instead of vmla for
            // stereo case, comparable for mono.

            accum = vmlaq_f32(accum, negSamp0.val[0], negCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp1.val[0], negCoef.val[1]);
            accum2 = vmlaq_f32(accum2, negSamp0.val[1], negCoef.val[0]);
            accum2 = vmlaq_f32(accum2, negSamp1.val[1], negCoef.val[1]);

            accum = vmlaq_f32(accum, posSamp0.val[0], posCoef.val[1]); // reversed
            accum = vmlaq_f32(accum, posSamp1.val[0], posCoef.val[0]); // reversed
            accum2 = vmlaq_f32(accum2, posSamp0.val[1], posCoef.val[1]); // reversed
            accum2 = vmlaq_f32(accum2, posSamp1.val[1], posCoef.val[0]); // reversed
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = static_cast<const float*>(__builtin_assume_aligned(volumeLR, 8));
    float32x2_t vLR = vld1_f32(volumeLR);
    float32x2_t outSamp = vld1_f32(out);
    // combine and funnel down accumulator
    float32x2_t outAccum = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_f32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        float32x2_t outAccum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
        outAccum = vpadd_f32(outAccum, outAccum2);
    }
    outSamp = vmla_f32(outSamp, outAccum, vLR);
    vst1_f32(out, outSamp);
}
524*ec779b8eSAndroid Build Coastguard Worker 
525*ec779b8eSAndroid Build Coastguard Worker template <>
526*ec779b8eSAndroid Build Coastguard Worker inline void ProcessL<1, 16>(int32_t* const out,
527*ec779b8eSAndroid Build Coastguard Worker         int count,
528*ec779b8eSAndroid Build Coastguard Worker         const int16_t* coefsP,
529*ec779b8eSAndroid Build Coastguard Worker         const int16_t* coefsN,
530*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sP,
531*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sN,
532*ec779b8eSAndroid Build Coastguard Worker         const int32_t* const volumeLR)
533*ec779b8eSAndroid Build Coastguard Worker {
534*ec779b8eSAndroid Build Coastguard Worker #ifdef USE_INTRINSIC
535*ec779b8eSAndroid Build Coastguard Worker     ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
536*ec779b8eSAndroid Build Coastguard Worker             0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
537*ec779b8eSAndroid Build Coastguard Worker #else
538*ec779b8eSAndroid Build Coastguard Worker     const int CHANNELS = 1; // template specialization does not preserve params
539*ec779b8eSAndroid Build Coastguard Worker     const int STRIDE = 16;
540*ec779b8eSAndroid Build Coastguard Worker     sP -= CHANNELS*((STRIDE>>1)-1);
541*ec779b8eSAndroid Build Coastguard Worker     asm (
542*ec779b8eSAndroid Build Coastguard Worker         "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
543*ec779b8eSAndroid Build Coastguard Worker 
544*ec779b8eSAndroid Build Coastguard Worker         "1:                                      \n"
545*ec779b8eSAndroid Build Coastguard Worker 
546*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
547*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
548*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
549*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
550*ec779b8eSAndroid Build Coastguard Worker 
551*ec779b8eSAndroid Build Coastguard Worker         "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
552*ec779b8eSAndroid Build Coastguard Worker 
553*ec779b8eSAndroid Build Coastguard Worker         // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
554*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed)samples by coef
555*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed)samples by coef
556*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
557*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples
558*ec779b8eSAndroid Build Coastguard Worker 
559*ec779b8eSAndroid Build Coastguard Worker         // moving these ARM instructions before neon above seems to be slower
560*ec779b8eSAndroid Build Coastguard Worker         "subs           %[count], %[count], #8   \n"// (1) update loop counter
561*ec779b8eSAndroid Build Coastguard Worker         "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples
562*ec779b8eSAndroid Build Coastguard Worker 
563*ec779b8eSAndroid Build Coastguard Worker         // sP used after branch (warning)
564*ec779b8eSAndroid Build Coastguard Worker         "bne            1b                       \n"// loop
565*ec779b8eSAndroid Build Coastguard Worker 
566*ec779b8eSAndroid Build Coastguard Worker          ASSEMBLY_ACCUMULATE_MONO
567*ec779b8eSAndroid Build Coastguard Worker 
568*ec779b8eSAndroid Build Coastguard Worker         : [out]     "=Uv" (out[0]),
569*ec779b8eSAndroid Build Coastguard Worker           [count]   "+r" (count),
570*ec779b8eSAndroid Build Coastguard Worker           [coefsP0] "+r" (coefsP),
571*ec779b8eSAndroid Build Coastguard Worker           [coefsN0] "+r" (coefsN),
572*ec779b8eSAndroid Build Coastguard Worker           [sP]      "+r" (sP),
573*ec779b8eSAndroid Build Coastguard Worker           [sN]      "+r" (sN)
574*ec779b8eSAndroid Build Coastguard Worker         : [vLR]     "r" (volumeLR)
575*ec779b8eSAndroid Build Coastguard Worker         : "cc", "memory",
576*ec779b8eSAndroid Build Coastguard Worker           "q0", "q1", "q2", "q3",
577*ec779b8eSAndroid Build Coastguard Worker           "q8", "q10"
578*ec779b8eSAndroid Build Coastguard Worker     );
579*ec779b8eSAndroid Build Coastguard Worker #endif
580*ec779b8eSAndroid Build Coastguard Worker }
581*ec779b8eSAndroid Build Coastguard Worker 
582*ec779b8eSAndroid Build Coastguard Worker template <>
583*ec779b8eSAndroid Build Coastguard Worker inline void ProcessL<2, 16>(int32_t* const out,
584*ec779b8eSAndroid Build Coastguard Worker         int count,
585*ec779b8eSAndroid Build Coastguard Worker         const int16_t* coefsP,
586*ec779b8eSAndroid Build Coastguard Worker         const int16_t* coefsN,
587*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sP,
588*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sN,
589*ec779b8eSAndroid Build Coastguard Worker         const int32_t* const volumeLR)
590*ec779b8eSAndroid Build Coastguard Worker {
591*ec779b8eSAndroid Build Coastguard Worker #ifdef USE_INTRINSIC
592*ec779b8eSAndroid Build Coastguard Worker     ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
593*ec779b8eSAndroid Build Coastguard Worker             0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
594*ec779b8eSAndroid Build Coastguard Worker #else
595*ec779b8eSAndroid Build Coastguard Worker     const int CHANNELS = 2; // template specialization does not preserve params
596*ec779b8eSAndroid Build Coastguard Worker     const int STRIDE = 16;
597*ec779b8eSAndroid Build Coastguard Worker     sP -= CHANNELS*((STRIDE>>1)-1);
598*ec779b8eSAndroid Build Coastguard Worker     asm (
599*ec779b8eSAndroid Build Coastguard Worker         "veor           q0, q0, q0               \n"// (1) acc_L = 0
600*ec779b8eSAndroid Build Coastguard Worker         "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
601*ec779b8eSAndroid Build Coastguard Worker 
602*ec779b8eSAndroid Build Coastguard Worker         "1:                                      \n"
603*ec779b8eSAndroid Build Coastguard Worker 
604*ec779b8eSAndroid Build Coastguard Worker         "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo frames
605*ec779b8eSAndroid Build Coastguard Worker         "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo frames
606*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
607*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
608*ec779b8eSAndroid Build Coastguard Worker 
609*ec779b8eSAndroid Build Coastguard Worker         "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
610*ec779b8eSAndroid Build Coastguard Worker         "vrev64.16      q3, q3                   \n"// (0 combines+) reverse positive right
611*ec779b8eSAndroid Build Coastguard Worker 
612*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
613*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
614*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
615*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
616*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
617*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
618*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
619*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right
620*ec779b8eSAndroid Build Coastguard Worker 
621*ec779b8eSAndroid Build Coastguard Worker         // moving these ARM before neon seems to be slower
622*ec779b8eSAndroid Build Coastguard Worker         "subs           %[count], %[count], #8   \n"// (1) update loop counter
623*ec779b8eSAndroid Build Coastguard Worker         "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples
624*ec779b8eSAndroid Build Coastguard Worker 
625*ec779b8eSAndroid Build Coastguard Worker         // sP used after branch (warning)
626*ec779b8eSAndroid Build Coastguard Worker         "bne            1b                       \n"// loop
627*ec779b8eSAndroid Build Coastguard Worker 
628*ec779b8eSAndroid Build Coastguard Worker         ASSEMBLY_ACCUMULATE_STEREO
629*ec779b8eSAndroid Build Coastguard Worker 
630*ec779b8eSAndroid Build Coastguard Worker         : [out] "=Uv" (out[0]),
631*ec779b8eSAndroid Build Coastguard Worker           [count] "+r" (count),
632*ec779b8eSAndroid Build Coastguard Worker           [coefsP0] "+r" (coefsP),
633*ec779b8eSAndroid Build Coastguard Worker           [coefsN0] "+r" (coefsN),
634*ec779b8eSAndroid Build Coastguard Worker           [sP] "+r" (sP),
635*ec779b8eSAndroid Build Coastguard Worker           [sN] "+r" (sN)
636*ec779b8eSAndroid Build Coastguard Worker         : [vLR] "r" (volumeLR)
637*ec779b8eSAndroid Build Coastguard Worker         : "cc", "memory",
638*ec779b8eSAndroid Build Coastguard Worker           "q0", "q1", "q2", "q3",
639*ec779b8eSAndroid Build Coastguard Worker           "q4", "q5", "q6",
640*ec779b8eSAndroid Build Coastguard Worker           "q8", "q10"
641*ec779b8eSAndroid Build Coastguard Worker      );
642*ec779b8eSAndroid Build Coastguard Worker #endif
643*ec779b8eSAndroid Build Coastguard Worker }
644*ec779b8eSAndroid Build Coastguard Worker 
645*ec779b8eSAndroid Build Coastguard Worker template <>
646*ec779b8eSAndroid Build Coastguard Worker inline void Process<1, 16>(int32_t* const out,
647*ec779b8eSAndroid Build Coastguard Worker         int count,
648*ec779b8eSAndroid Build Coastguard Worker         const int16_t* coefsP,
649*ec779b8eSAndroid Build Coastguard Worker         const int16_t* coefsN,
650*ec779b8eSAndroid Build Coastguard Worker         const int16_t* coefsP1,
651*ec779b8eSAndroid Build Coastguard Worker         const int16_t* coefsN1,
652*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sP,
653*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sN,
654*ec779b8eSAndroid Build Coastguard Worker         uint32_t lerpP,
655*ec779b8eSAndroid Build Coastguard Worker         const int32_t* const volumeLR)
656*ec779b8eSAndroid Build Coastguard Worker {
657*ec779b8eSAndroid Build Coastguard Worker #ifdef USE_INTRINSIC
658*ec779b8eSAndroid Build Coastguard Worker     ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
659*ec779b8eSAndroid Build Coastguard Worker             lerpP, coefsP1, coefsN1);
660*ec779b8eSAndroid Build Coastguard Worker #else
661*ec779b8eSAndroid Build Coastguard Worker 
662*ec779b8eSAndroid Build Coastguard Worker     const int CHANNELS = 1; // template specialization does not preserve params
663*ec779b8eSAndroid Build Coastguard Worker     const int STRIDE = 16;
664*ec779b8eSAndroid Build Coastguard Worker     sP -= CHANNELS*((STRIDE>>1)-1);
665*ec779b8eSAndroid Build Coastguard Worker     asm (
666*ec779b8eSAndroid Build Coastguard Worker         "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
667*ec779b8eSAndroid Build Coastguard Worker         "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
668*ec779b8eSAndroid Build Coastguard Worker 
669*ec779b8eSAndroid Build Coastguard Worker         "1:                                      \n"
670*ec779b8eSAndroid Build Coastguard Worker 
671*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
672*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
673*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
674*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
675*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
676*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
677*ec779b8eSAndroid Build Coastguard Worker 
678*ec779b8eSAndroid Build Coastguard Worker         "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
679*ec779b8eSAndroid Build Coastguard Worker         "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coets
680*ec779b8eSAndroid Build Coastguard Worker 
681*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
682*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
683*ec779b8eSAndroid Build Coastguard Worker 
684*ec779b8eSAndroid Build Coastguard Worker         "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
685*ec779b8eSAndroid Build Coastguard Worker 
686*ec779b8eSAndroid Build Coastguard Worker         "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
687*ec779b8eSAndroid Build Coastguard Worker         "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set
688*ec779b8eSAndroid Build Coastguard Worker 
689*ec779b8eSAndroid Build Coastguard Worker         // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
690*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
691*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
692*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
693*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples
694*ec779b8eSAndroid Build Coastguard Worker 
695*ec779b8eSAndroid Build Coastguard Worker         // moving these ARM instructions before neon above seems to be slower
696*ec779b8eSAndroid Build Coastguard Worker         "subs           %[count], %[count], #8   \n"// (1) update loop counter
697*ec779b8eSAndroid Build Coastguard Worker         "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples
698*ec779b8eSAndroid Build Coastguard Worker 
699*ec779b8eSAndroid Build Coastguard Worker         // sP used after branch (warning)
700*ec779b8eSAndroid Build Coastguard Worker         "bne            1b                       \n"// loop
701*ec779b8eSAndroid Build Coastguard Worker 
702*ec779b8eSAndroid Build Coastguard Worker         ASSEMBLY_ACCUMULATE_MONO
703*ec779b8eSAndroid Build Coastguard Worker 
704*ec779b8eSAndroid Build Coastguard Worker         : [out]     "=Uv" (out[0]),
705*ec779b8eSAndroid Build Coastguard Worker           [count]   "+r" (count),
706*ec779b8eSAndroid Build Coastguard Worker           [coefsP0] "+r" (coefsP),
707*ec779b8eSAndroid Build Coastguard Worker           [coefsN0] "+r" (coefsN),
708*ec779b8eSAndroid Build Coastguard Worker           [coefsP1] "+r" (coefsP1),
709*ec779b8eSAndroid Build Coastguard Worker           [coefsN1] "+r" (coefsN1),
710*ec779b8eSAndroid Build Coastguard Worker           [sP]      "+r" (sP),
711*ec779b8eSAndroid Build Coastguard Worker           [sN]      "+r" (sN)
712*ec779b8eSAndroid Build Coastguard Worker         : [lerpP]   "r" (lerpP),
713*ec779b8eSAndroid Build Coastguard Worker           [vLR]     "r" (volumeLR)
714*ec779b8eSAndroid Build Coastguard Worker         : "cc", "memory",
715*ec779b8eSAndroid Build Coastguard Worker           "q0", "q1", "q2", "q3",
716*ec779b8eSAndroid Build Coastguard Worker           "q8", "q9", "q10", "q11"
717*ec779b8eSAndroid Build Coastguard Worker     );
718*ec779b8eSAndroid Build Coastguard Worker #endif
719*ec779b8eSAndroid Build Coastguard Worker }
720*ec779b8eSAndroid Build Coastguard Worker 
// Process<2, 16> specialization for int16_t coefficients: stereo FIR pass that linearly
// interpolates between two coefficient phases and hands the two 32-bit accumulators to
// ASSEMBLY_ACCUMULATE_STEREO.
//
// With USE_INTRINSIC defined the call is forwarded to ProcessNeonIntrinsic<2, 16, false>
// (note the different argument order); otherwise the hand-written NEON loop below runs.
//
//   out      - bound to the asm as the memory operand out[0]; consumed by
//              ASSEMBLY_ACCUMULATE_STEREO (defined outside this block).
//   count    - coefficients to consume per coefficient pointer. Each loop pass consumes 8
//              and the loop only exits when the counter reaches exactly 0, so it has to be
//              a positive multiple of 8.
//   coefsP, coefsP1 - positive-side coefficient phases; the filter uses
//              coefsP + (coefsP1 - coefsP) * lerpP.
//   coefsN, coefsN1 - negative-side coefficient phases; the filter uses
//              coefsN1 + (coefsN - coefsN1) * lerpP.
//              All four pointers are loaded with a :128 alignment qualifier, i.e. they must
//              be 16-byte aligned.
//   sP       - interleaved stereo samples for the positive side. Walked backwards: 8 frames
//              are loaded per pass, reversed in-register, then the pointer steps back 8 frames.
//   sN       - interleaved stereo samples for the negative side, read forwards.
//   lerpP    - interpolation fraction. It is moved into d2[0] and used as the 16-bit scalar
//              operand of vqrdmulh.s16, so only its low 16 bits take part
//              (presumably a Q15 fraction - TODO confirm against the caller).
//   volumeLR - passed to the asm as vLR; presumably the left/right volume pair applied by
//              ASSEMBLY_ACCUMULATE_STEREO - confirm against the macro definition.
//
// NOTE(review): [out] is declared write-only ("=Uv") and covers only out[0]. If the
// accumulate macro reads the previous output and/or writes out[1] as well, the "memory"
// clobber is what keeps this correct - confirm against the macro definition.
721*ec779b8eSAndroid Build Coastguard Worker template <>
722*ec779b8eSAndroid Build Coastguard Worker inline void Process<2, 16>(int32_t* const out,
723*ec779b8eSAndroid Build Coastguard Worker         int count,
724*ec779b8eSAndroid Build Coastguard Worker         const int16_t* coefsP,
725*ec779b8eSAndroid Build Coastguard Worker         const int16_t* coefsN,
726*ec779b8eSAndroid Build Coastguard Worker         const int16_t* coefsP1,
727*ec779b8eSAndroid Build Coastguard Worker         const int16_t* coefsN1,
728*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sP,
729*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sN,
730*ec779b8eSAndroid Build Coastguard Worker         uint32_t lerpP,
731*ec779b8eSAndroid Build Coastguard Worker         const int32_t* const volumeLR)
732*ec779b8eSAndroid Build Coastguard Worker {
733*ec779b8eSAndroid Build Coastguard Worker #ifdef USE_INTRINSIC
734*ec779b8eSAndroid Build Coastguard Worker     ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
735*ec779b8eSAndroid Build Coastguard Worker             lerpP, coefsP1, coefsN1);
736*ec779b8eSAndroid Build Coastguard Worker #else
737*ec779b8eSAndroid Build Coastguard Worker     const int CHANNELS = 2; // template specialization does not preserve params
738*ec779b8eSAndroid Build Coastguard Worker     const int STRIDE = 16;
    // Rewind sP by 7 stereo frames (14 int16_t) so that the first 8-frame load in the loop
    // ends on the frame sP pointed at on entry.
739*ec779b8eSAndroid Build Coastguard Worker     sP -= CHANNELS*((STRIDE>>1)-1);
740*ec779b8eSAndroid Build Coastguard Worker     asm (
741*ec779b8eSAndroid Build Coastguard Worker         "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
742*ec779b8eSAndroid Build Coastguard Worker         "veor           q0, q0, q0               \n"// (1) acc_L = 0
743*ec779b8eSAndroid Build Coastguard Worker         "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
744*ec779b8eSAndroid Build Coastguard Worker 
        // Loop body: 8 stereo frames and 8 coefficients per side on every pass.
745*ec779b8eSAndroid Build Coastguard Worker         "1:                                      \n"
746*ec779b8eSAndroid Build Coastguard Worker 
        // vld2.16 de-interleaves: q2/q5 receive one channel, q3/q6 the other.
747*ec779b8eSAndroid Build Coastguard Worker         "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo frames
748*ec779b8eSAndroid Build Coastguard Worker         "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo frames
749*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
750*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
751*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
752*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
753*ec779b8eSAndroid Build Coastguard Worker 
        // q8  = P0 + (P1 - P0) * lerp ; q10 = N1 + (N0 - N1) * lerp
754*ec779b8eSAndroid Build Coastguard Worker         "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
755*ec779b8eSAndroid Build Coastguard Worker         "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs
756*ec779b8eSAndroid Build Coastguard Worker 
757*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
758*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
759*ec779b8eSAndroid Build Coastguard Worker 
        // vrev64 only reverses within each 64-bit half; the d4<->d17 / d5<->d16 pairing in
        // the multiplies below supplies the swap of the two halves.
760*ec779b8eSAndroid Build Coastguard Worker         "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
761*ec779b8eSAndroid Build Coastguard Worker         "vrev64.16      q3, q3                   \n"// (1) reverse 8 samples of positive right
762*ec779b8eSAndroid Build Coastguard Worker 
763*ec779b8eSAndroid Build Coastguard Worker         "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
764*ec779b8eSAndroid Build Coastguard Worker         "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set
765*ec779b8eSAndroid Build Coastguard Worker 
        // Widening multiply-accumulate: q0 collects the left channel, q4 the right channel.
766*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
767*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
768*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
769*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
770*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
771*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
772*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
773*ec779b8eSAndroid Build Coastguard Worker         "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right
774*ec779b8eSAndroid Build Coastguard Worker 
775*ec779b8eSAndroid Build Coastguard Worker         // moving these ARM before neon seems to be slower
776*ec779b8eSAndroid Build Coastguard Worker         "subs           %[count], %[count], #8   \n"// (1) update loop counter
777*ec779b8eSAndroid Build Coastguard Worker         "sub            %[sP], %[sP], #32        \n"// (0) step back 8 stereo frames (32 bytes)
778*ec779b8eSAndroid Build Coastguard Worker 
779*ec779b8eSAndroid Build Coastguard Worker         // sP used after branch (warning)
780*ec779b8eSAndroid Build Coastguard Worker         "bne            1b                       \n"// loop
781*ec779b8eSAndroid Build Coastguard Worker 
782*ec779b8eSAndroid Build Coastguard Worker         ASSEMBLY_ACCUMULATE_STEREO
783*ec779b8eSAndroid Build Coastguard Worker 
784*ec779b8eSAndroid Build Coastguard Worker         : [out] "=Uv" (out[0]),
785*ec779b8eSAndroid Build Coastguard Worker           [count] "+r" (count),
786*ec779b8eSAndroid Build Coastguard Worker           [coefsP0] "+r" (coefsP),
787*ec779b8eSAndroid Build Coastguard Worker           [coefsN0] "+r" (coefsN),
788*ec779b8eSAndroid Build Coastguard Worker           [coefsP1] "+r" (coefsP1),
789*ec779b8eSAndroid Build Coastguard Worker           [coefsN1] "+r" (coefsN1),
790*ec779b8eSAndroid Build Coastguard Worker           [sP] "+r" (sP),
791*ec779b8eSAndroid Build Coastguard Worker           [sN] "+r" (sN)
792*ec779b8eSAndroid Build Coastguard Worker         : [lerpP]   "r" (lerpP),
793*ec779b8eSAndroid Build Coastguard Worker           [vLR] "r" (volumeLR)
        // q1 is listed because d2 (lerp scalar) lives in it; q4-q6 hold the right
        // accumulator and the negative-side samples.
794*ec779b8eSAndroid Build Coastguard Worker         : "cc", "memory",
795*ec779b8eSAndroid Build Coastguard Worker           "q0", "q1", "q2", "q3",
796*ec779b8eSAndroid Build Coastguard Worker           "q4", "q5", "q6",
797*ec779b8eSAndroid Build Coastguard Worker           "q8", "q9", "q10", "q11"
798*ec779b8eSAndroid Build Coastguard Worker     );
799*ec779b8eSAndroid Build Coastguard Worker #endif
800*ec779b8eSAndroid Build Coastguard Worker }
801*ec779b8eSAndroid Build Coastguard Worker 
// ProcessL<1, 16> specialization for int32_t coefficients: mono FIR pass that uses a single
// coefficient phase (no interpolation) and hands the 32-bit accumulator q0 to
// ASSEMBLY_ACCUMULATE_MONO.
//
// With USE_INTRINSIC defined the call is forwarded to ProcessNeonIntrinsic<1, 16, true>
// with lerpP = 0 and null interpolation tables; otherwise the NEON loop below runs.
//
//   out      - bound to the asm as the memory operand out[0]; consumed by
//              ASSEMBLY_ACCUMULATE_MONO (defined outside this block).
//   count    - coefficients to consume per side. Each loop pass consumes 8 and the loop
//              only exits when the counter reaches exactly 0, so it has to be a positive
//              multiple of 8.
//   coefsP   - positive-side coefficients, read forwards, 8 per pass.
//   coefsN   - negative-side coefficients, read forwards, 8 per pass.
//              Both are loaded with a :128 alignment qualifier (16-byte aligned).
//   sP       - mono samples for the positive side. Walked backwards: 8 samples are loaded
//              per pass, reversed in-register, then the pointer steps back 8 samples.
//   sN       - mono samples for the negative side, read forwards.
//   volumeLR - passed to the asm as vLR; presumably the volume applied by
//              ASSEMBLY_ACCUMULATE_MONO - confirm against the macro definition.
//
// NOTE(review): [out] is declared write-only ("=Uv") and covers only out[0]. If the
// accumulate macro reads the previous output or writes more than one element, the "memory"
// clobber is what keeps this correct - confirm against the macro definition.
802*ec779b8eSAndroid Build Coastguard Worker template <>
803*ec779b8eSAndroid Build Coastguard Worker inline void ProcessL<1, 16>(int32_t* const out,
804*ec779b8eSAndroid Build Coastguard Worker         int count,
805*ec779b8eSAndroid Build Coastguard Worker         const int32_t* coefsP,
806*ec779b8eSAndroid Build Coastguard Worker         const int32_t* coefsN,
807*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sP,
808*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sN,
809*ec779b8eSAndroid Build Coastguard Worker         const int32_t* const volumeLR)
810*ec779b8eSAndroid Build Coastguard Worker {
811*ec779b8eSAndroid Build Coastguard Worker #ifdef USE_INTRINSIC
812*ec779b8eSAndroid Build Coastguard Worker     ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
813*ec779b8eSAndroid Build Coastguard Worker             0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
814*ec779b8eSAndroid Build Coastguard Worker #else
815*ec779b8eSAndroid Build Coastguard Worker     const int CHANNELS = 1; // template specialization does not preserve params
816*ec779b8eSAndroid Build Coastguard Worker     const int STRIDE = 16;
    // Rewind sP by 7 samples so that the first 8-sample load in the loop ends on the
    // sample sP pointed at on entry.
817*ec779b8eSAndroid Build Coastguard Worker     sP -= CHANNELS*((STRIDE>>1)-1);
818*ec779b8eSAndroid Build Coastguard Worker     asm (
819*ec779b8eSAndroid Build Coastguard Worker         "veor           q0, q0, q0                    \n"// result, initialize to 0
820*ec779b8eSAndroid Build Coastguard Worker 
        // Loop body: 8 samples and 8 coefficients per side on every pass.
821*ec779b8eSAndroid Build Coastguard Worker         "1:                                           \n"
822*ec779b8eSAndroid Build Coastguard Worker 
823*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
824*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
825*ec779b8eSAndroid Build Coastguard Worker         "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
826*ec779b8eSAndroid Build Coastguard Worker         "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
827*ec779b8eSAndroid Build Coastguard Worker 
        // vrev64 only reverses within each 64-bit half; multiplying the d4 half by q9 and
        // the d5 half by q8 below supplies the swap of the two halves.
828*ec779b8eSAndroid Build Coastguard Worker         "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side
829*ec779b8eSAndroid Build Coastguard Worker 
830*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
831*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits
832*ec779b8eSAndroid Build Coastguard Worker 
833*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
834*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits
835*ec779b8eSAndroid Build Coastguard Worker 
        // Saturating rounding doubling multiply, keeping the high 32 bits of each product.
836*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples
837*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples
838*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples
839*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples
840*ec779b8eSAndroid Build Coastguard Worker 
        // Net effect of the four adds: q0 += q12 + q13 + q14 + q15.
841*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q0, q0, q12                   \n"// accumulate result
842*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q13, q13, q14                 \n"// accumulate result
843*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q0, q0, q15                   \n"// accumulate result
844*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q0, q0, q13                   \n"// accumulate result
845*ec779b8eSAndroid Build Coastguard Worker 
846*ec779b8eSAndroid Build Coastguard Worker         "sub            %[sP], %[sP], #16             \n"// step back 8 mono samples (16 bytes)
847*ec779b8eSAndroid Build Coastguard Worker         "subs           %[count], %[count], #8        \n"// update loop counter
848*ec779b8eSAndroid Build Coastguard Worker 
849*ec779b8eSAndroid Build Coastguard Worker         "bne            1b                            \n"// loop
850*ec779b8eSAndroid Build Coastguard Worker 
851*ec779b8eSAndroid Build Coastguard Worker         ASSEMBLY_ACCUMULATE_MONO
852*ec779b8eSAndroid Build Coastguard Worker 
853*ec779b8eSAndroid Build Coastguard Worker         : [out]     "=Uv" (out[0]),
854*ec779b8eSAndroid Build Coastguard Worker           [count]   "+r" (count),
855*ec779b8eSAndroid Build Coastguard Worker           [coefsP0] "+r" (coefsP),
856*ec779b8eSAndroid Build Coastguard Worker           [coefsN0] "+r" (coefsN),
857*ec779b8eSAndroid Build Coastguard Worker           [sP]      "+r" (sP),
858*ec779b8eSAndroid Build Coastguard Worker           [sN]      "+r" (sN)
859*ec779b8eSAndroid Build Coastguard Worker         : [vLR]     "r" (volumeLR)
860*ec779b8eSAndroid Build Coastguard Worker         : "cc", "memory",
861*ec779b8eSAndroid Build Coastguard Worker           "q0", "q1", "q2", "q3",
862*ec779b8eSAndroid Build Coastguard Worker           "q8", "q9", "q10", "q11",
863*ec779b8eSAndroid Build Coastguard Worker           "q12", "q13", "q14", "q15"
864*ec779b8eSAndroid Build Coastguard Worker     );
865*ec779b8eSAndroid Build Coastguard Worker #endif
866*ec779b8eSAndroid Build Coastguard Worker }
867*ec779b8eSAndroid Build Coastguard Worker 
// ProcessL<2, 16> specialization for int32_t coefficients: stereo FIR pass that uses a
// single coefficient phase (no interpolation) and hands the two 32-bit accumulators
// (q0 and q4) to ASSEMBLY_ACCUMULATE_STEREO.
//
// With USE_INTRINSIC defined the call is forwarded to ProcessNeonIntrinsic<2, 16, true>
// with lerpP = 0 and null interpolation tables; otherwise the NEON loop below runs.
//
//   out      - bound to the asm as the memory operand out[0]; consumed by
//              ASSEMBLY_ACCUMULATE_STEREO (defined outside this block).
//   count    - coefficients to consume per side. Each loop pass consumes 8 and the loop
//              only exits when the counter reaches exactly 0, so it has to be a positive
//              multiple of 8.
//   coefsP   - positive-side coefficients, read forwards, 8 per pass.
//   coefsN   - negative-side coefficients, read forwards, 8 per pass.
//              Both are loaded with a :128 alignment qualifier (16-byte aligned).
//   sP       - interleaved stereo samples for the positive side. Walked backwards: 8 frames
//              are loaded per pass, reversed in-register, then the pointer steps back 8 frames.
//   sN       - interleaved stereo samples for the negative side, read forwards.
//   volumeLR - passed to the asm as vLR; presumably the left/right volume pair applied by
//              ASSEMBLY_ACCUMULATE_STEREO - confirm against the macro definition.
//
// NOTE(review): [out] is declared write-only ("=Uv") and covers only out[0]. If the
// accumulate macro reads the previous output and/or writes out[1] as well, the "memory"
// clobber is what keeps this correct - confirm against the macro definition.
868*ec779b8eSAndroid Build Coastguard Worker template <>
869*ec779b8eSAndroid Build Coastguard Worker inline void ProcessL<2, 16>(int32_t* const out,
870*ec779b8eSAndroid Build Coastguard Worker         int count,
871*ec779b8eSAndroid Build Coastguard Worker         const int32_t* coefsP,
872*ec779b8eSAndroid Build Coastguard Worker         const int32_t* coefsN,
873*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sP,
874*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sN,
875*ec779b8eSAndroid Build Coastguard Worker         const int32_t* const volumeLR)
876*ec779b8eSAndroid Build Coastguard Worker {
877*ec779b8eSAndroid Build Coastguard Worker #ifdef USE_INTRINSIC
878*ec779b8eSAndroid Build Coastguard Worker     ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
879*ec779b8eSAndroid Build Coastguard Worker             0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
880*ec779b8eSAndroid Build Coastguard Worker #else
881*ec779b8eSAndroid Build Coastguard Worker     const int CHANNELS = 2; // template specialization does not preserve params
882*ec779b8eSAndroid Build Coastguard Worker     const int STRIDE = 16;
    // Rewind sP by 7 stereo frames (14 int16_t) so that the first 8-frame load in the loop
    // ends on the frame sP pointed at on entry.
883*ec779b8eSAndroid Build Coastguard Worker     sP -= CHANNELS*((STRIDE>>1)-1);
884*ec779b8eSAndroid Build Coastguard Worker     asm (
885*ec779b8eSAndroid Build Coastguard Worker         "veor           q0, q0, q0                    \n"// left result, initialize to 0
886*ec779b8eSAndroid Build Coastguard Worker         "veor           q4, q4, q4                    \n"// right result, initialize to 0
887*ec779b8eSAndroid Build Coastguard Worker 
        // Loop body: 8 stereo frames and 8 coefficients per side on every pass.
888*ec779b8eSAndroid Build Coastguard Worker         "1:                                           \n"
889*ec779b8eSAndroid Build Coastguard Worker 
        // vld2.16 de-interleaves: q2/q5 receive one channel, q3/q6 the other.
890*ec779b8eSAndroid Build Coastguard Worker         "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo frames
891*ec779b8eSAndroid Build Coastguard Worker         "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo frames
892*ec779b8eSAndroid Build Coastguard Worker         "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
893*ec779b8eSAndroid Build Coastguard Worker         "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
894*ec779b8eSAndroid Build Coastguard Worker 
        // vrev64 only reverses within each 64-bit half; multiplying the low half by q9 and
        // the high half by q8 below supplies the swap of the two halves.
895*ec779b8eSAndroid Build Coastguard Worker         "vrev64.16      q2, q2                        \n"// reverse 8 samples of positive left
896*ec779b8eSAndroid Build Coastguard Worker         "vrev64.16      q3, q3                        \n"// reverse 8 samples of positive right
897*ec779b8eSAndroid Build Coastguard Worker 
        // ---- first pass: samples from q2 (positive) and q5 (negative) into q0 ----
898*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
899*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
900*ec779b8eSAndroid Build Coastguard Worker 
901*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
902*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits
903*ec779b8eSAndroid Build Coastguard Worker 
904*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
905*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
906*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
907*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef
908*ec779b8eSAndroid Build Coastguard Worker 
        // Net effect of the four adds: q0 += q12 + q13 + q14 + q15.
909*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q0, q0, q12                   \n"// accumulate result
910*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q13, q13, q14                 \n"// accumulate result
911*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q0, q0, q15                   \n"// accumulate result
912*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q0, q0, q13                   \n"// accumulate result
913*ec779b8eSAndroid Build Coastguard Worker 
        // ---- second pass: samples from q3 (positive) and q6 (negative) into q4,
        //      reusing the same coefficients and scratch registers ----
914*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
915*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits
916*ec779b8eSAndroid Build Coastguard Worker 
917*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
918*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits
919*ec779b8eSAndroid Build Coastguard Worker 
920*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
921*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
922*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
923*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef
924*ec779b8eSAndroid Build Coastguard Worker 
        // Net effect of the four adds: q4 += q12 + q13 + q14 + q15.
925*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q4, q4, q12                   \n"// accumulate result
926*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q13, q13, q14                 \n"// accumulate result
927*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q4, q4, q15                   \n"// accumulate result
928*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q4, q4, q13                   \n"// accumulate result
929*ec779b8eSAndroid Build Coastguard Worker 
930*ec779b8eSAndroid Build Coastguard Worker         "subs           %[count], %[count], #8        \n"// update loop counter
931*ec779b8eSAndroid Build Coastguard Worker         "sub            %[sP], %[sP], #32             \n"// step back 8 stereo frames (32 bytes)
932*ec779b8eSAndroid Build Coastguard Worker 
933*ec779b8eSAndroid Build Coastguard Worker         "bne            1b                            \n"// loop
934*ec779b8eSAndroid Build Coastguard Worker 
935*ec779b8eSAndroid Build Coastguard Worker         ASSEMBLY_ACCUMULATE_STEREO
936*ec779b8eSAndroid Build Coastguard Worker 
937*ec779b8eSAndroid Build Coastguard Worker         : [out]     "=Uv" (out[0]),
938*ec779b8eSAndroid Build Coastguard Worker           [count]   "+r" (count),
939*ec779b8eSAndroid Build Coastguard Worker           [coefsP0] "+r" (coefsP),
940*ec779b8eSAndroid Build Coastguard Worker           [coefsN0] "+r" (coefsN),
941*ec779b8eSAndroid Build Coastguard Worker           [sP]      "+r" (sP),
942*ec779b8eSAndroid Build Coastguard Worker           [sN]      "+r" (sN)
943*ec779b8eSAndroid Build Coastguard Worker         : [vLR]     "r" (volumeLR)
944*ec779b8eSAndroid Build Coastguard Worker         : "cc", "memory",
945*ec779b8eSAndroid Build Coastguard Worker           "q0", "q1", "q2", "q3",
946*ec779b8eSAndroid Build Coastguard Worker           "q4", "q5", "q6",
947*ec779b8eSAndroid Build Coastguard Worker           "q8", "q9", "q10", "q11",
948*ec779b8eSAndroid Build Coastguard Worker           "q12", "q13", "q14", "q15"
949*ec779b8eSAndroid Build Coastguard Worker     );
950*ec779b8eSAndroid Build Coastguard Worker #endif
951*ec779b8eSAndroid Build Coastguard Worker }
952*ec779b8eSAndroid Build Coastguard Worker 
// Process<1, 16> specialization for int32_t coefficients: mono FIR pass that linearly
// interpolates between two coefficient phases and hands the 32-bit accumulator q0 to
// ASSEMBLY_ACCUMULATE_MONO.
//
// With USE_INTRINSIC defined the call is forwarded to ProcessNeonIntrinsic<1, 16, false>
// (note the different argument order); otherwise the hand-written NEON loop below runs.
//
//   out      - bound to the asm as the memory operand out[0]; consumed by
//              ASSEMBLY_ACCUMULATE_MONO (defined outside this block).
//   count    - coefficients to consume per coefficient pointer. Each loop pass consumes 8
//              and the loop only exits when the counter reaches exactly 0, so it has to be
//              a positive multiple of 8.
//   coefsP, coefsP1 - positive-side coefficient phases; the filter uses
//              coefsP + (coefsP1 - coefsP) * lerpP.
//   coefsN, coefsN1 - negative-side coefficient phases; the filter uses
//              coefsN1 + (coefsN - coefsN1) * lerpP.
//              All four pointers are loaded with a :128 alignment qualifier, i.e. they must
//              be 16-byte aligned.
//   sP       - mono samples for the positive side. Walked backwards: 8 samples are loaded
//              per pass, reversed in-register, then the pointer steps back 8 samples.
//   sN       - mono samples for the negative side, read forwards.
//   lerpP    - interpolation fraction. It is moved into d2[0] and used as the 32-bit scalar
//              operand of vqrdmulh.s32 (presumably a Q31 fraction - TODO confirm against
//              the caller).
//   volumeLR - passed to the asm as vLR; presumably the volume applied by
//              ASSEMBLY_ACCUMULATE_MONO - confirm against the macro definition.
//
// NOTE(review): [out] is declared write-only ("=Uv") and covers only out[0]. If the
// accumulate macro reads the previous output or writes more than one element, the "memory"
// clobber is what keeps this correct - confirm against the macro definition.
953*ec779b8eSAndroid Build Coastguard Worker template <>
954*ec779b8eSAndroid Build Coastguard Worker inline void Process<1, 16>(int32_t* const out,
955*ec779b8eSAndroid Build Coastguard Worker         int count,
956*ec779b8eSAndroid Build Coastguard Worker         const int32_t* coefsP,
957*ec779b8eSAndroid Build Coastguard Worker         const int32_t* coefsN,
958*ec779b8eSAndroid Build Coastguard Worker         const int32_t* coefsP1,
959*ec779b8eSAndroid Build Coastguard Worker         const int32_t* coefsN1,
960*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sP,
961*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sN,
962*ec779b8eSAndroid Build Coastguard Worker         uint32_t lerpP,
963*ec779b8eSAndroid Build Coastguard Worker         const int32_t* const volumeLR)
964*ec779b8eSAndroid Build Coastguard Worker {
965*ec779b8eSAndroid Build Coastguard Worker #ifdef USE_INTRINSIC
966*ec779b8eSAndroid Build Coastguard Worker     ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
967*ec779b8eSAndroid Build Coastguard Worker             lerpP, coefsP1, coefsN1);
968*ec779b8eSAndroid Build Coastguard Worker #else
969*ec779b8eSAndroid Build Coastguard Worker     const int CHANNELS = 1; // template specialization does not preserve params
970*ec779b8eSAndroid Build Coastguard Worker     const int STRIDE = 16;
    // Rewind sP by 7 samples so that the first 8-sample load in the loop ends on the
    // sample sP pointed at on entry.
971*ec779b8eSAndroid Build Coastguard Worker     sP -= CHANNELS*((STRIDE>>1)-1);
972*ec779b8eSAndroid Build Coastguard Worker     asm (
973*ec779b8eSAndroid Build Coastguard Worker         "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
974*ec779b8eSAndroid Build Coastguard Worker         "veor           q0, q0, q0                    \n"// result, initialize to 0
975*ec779b8eSAndroid Build Coastguard Worker 
        // Loop body: 8 samples and 8 coefficients per coefficient pointer on every pass.
976*ec779b8eSAndroid Build Coastguard Worker         "1:                                           \n"
977*ec779b8eSAndroid Build Coastguard Worker 
978*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
979*ec779b8eSAndroid Build Coastguard Worker         "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
980*ec779b8eSAndroid Build Coastguard Worker         "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs (P0)
981*ec779b8eSAndroid Build Coastguard Worker         "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs (P1)
982*ec779b8eSAndroid Build Coastguard Worker         "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs (N1)
983*ec779b8eSAndroid Build Coastguard Worker         "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs (N0)
984*ec779b8eSAndroid Build Coastguard Worker 
        // q8:q9 = P0 + (P1 - P0) * lerp ; q10:q11 = N1 + (N0 - N1) * lerp
985*ec779b8eSAndroid Build Coastguard Worker         "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
986*ec779b8eSAndroid Build Coastguard Worker         "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
987*ec779b8eSAndroid Build Coastguard Worker         "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
988*ec779b8eSAndroid Build Coastguard Worker         "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)
989*ec779b8eSAndroid Build Coastguard Worker 
990*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
991*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
992*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
993*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)
994*ec779b8eSAndroid Build Coastguard Worker 
995*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
996*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
997*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
998*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)
999*ec779b8eSAndroid Build Coastguard Worker 
        // vrev64 only reverses within each 64-bit half; multiplying the d4 half by q9 and
        // the d5 half by q8 below supplies the swap of the two halves.
1000*ec779b8eSAndroid Build Coastguard Worker         "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side
1001*ec779b8eSAndroid Build Coastguard Worker 
        // q12-q15 are free again after step3 and are reused for the widened samples.
1002*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
1003*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
1004*ec779b8eSAndroid Build Coastguard Worker 
1005*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q14,  d6, #15                 \n"// extend samples to 31 bits
1006*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q15,  d7, #15                 \n"// extend samples to 31 bits
1007*ec779b8eSAndroid Build Coastguard Worker 
1008*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
1009*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
1010*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
1011*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
1012*ec779b8eSAndroid Build Coastguard Worker 
        // Net effect of the four adds: q0 += q12 + q13 + q14 + q15.
1013*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q0, q0, q12                   \n"// accumulate result
1014*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q13, q13, q14                 \n"// accumulate result
1015*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q0, q0, q15                   \n"// accumulate result
1016*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q0, q0, q13                   \n"// accumulate result
1017*ec779b8eSAndroid Build Coastguard Worker 
1018*ec779b8eSAndroid Build Coastguard Worker         "sub            %[sP], %[sP], #16             \n"// step back 8 mono samples (16 bytes)
1019*ec779b8eSAndroid Build Coastguard Worker         "subs           %[count], %[count], #8        \n"// update loop counter
1020*ec779b8eSAndroid Build Coastguard Worker 
1021*ec779b8eSAndroid Build Coastguard Worker         "bne            1b                            \n"// loop
1022*ec779b8eSAndroid Build Coastguard Worker 
1023*ec779b8eSAndroid Build Coastguard Worker         ASSEMBLY_ACCUMULATE_MONO
1024*ec779b8eSAndroid Build Coastguard Worker 
1025*ec779b8eSAndroid Build Coastguard Worker         : [out]     "=Uv" (out[0]),
1026*ec779b8eSAndroid Build Coastguard Worker           [count]   "+r" (count),
1027*ec779b8eSAndroid Build Coastguard Worker           [coefsP0] "+r" (coefsP),
1028*ec779b8eSAndroid Build Coastguard Worker           [coefsN0] "+r" (coefsN),
1029*ec779b8eSAndroid Build Coastguard Worker           [coefsP1] "+r" (coefsP1),
1030*ec779b8eSAndroid Build Coastguard Worker           [coefsN1] "+r" (coefsN1),
1031*ec779b8eSAndroid Build Coastguard Worker           [sP]      "+r" (sP),
1032*ec779b8eSAndroid Build Coastguard Worker           [sN]      "+r" (sN)
1033*ec779b8eSAndroid Build Coastguard Worker         : [lerpP]   "r" (lerpP),
1034*ec779b8eSAndroid Build Coastguard Worker           [vLR]     "r" (volumeLR)
        // q1 is listed because d2 (lerp scalar) lives in it.
1035*ec779b8eSAndroid Build Coastguard Worker         : "cc", "memory",
1036*ec779b8eSAndroid Build Coastguard Worker           "q0", "q1", "q2", "q3",
1037*ec779b8eSAndroid Build Coastguard Worker           "q8", "q9", "q10", "q11",
1038*ec779b8eSAndroid Build Coastguard Worker           "q12", "q13", "q14", "q15"
1039*ec779b8eSAndroid Build Coastguard Worker     );
1040*ec779b8eSAndroid Build Coastguard Worker #endif
1041*ec779b8eSAndroid Build Coastguard Worker }
1042*ec779b8eSAndroid Build Coastguard Worker 
1043*ec779b8eSAndroid Build Coastguard Worker template <>
1044*ec779b8eSAndroid Build Coastguard Worker inline void Process<2, 16>(int32_t* const out,
1045*ec779b8eSAndroid Build Coastguard Worker         int count,
1046*ec779b8eSAndroid Build Coastguard Worker         const int32_t* coefsP,
1047*ec779b8eSAndroid Build Coastguard Worker         const int32_t* coefsN,
1048*ec779b8eSAndroid Build Coastguard Worker         const int32_t* coefsP1,
1049*ec779b8eSAndroid Build Coastguard Worker         const int32_t* coefsN1,
1050*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sP,
1051*ec779b8eSAndroid Build Coastguard Worker         const int16_t* sN,
1052*ec779b8eSAndroid Build Coastguard Worker         uint32_t lerpP,
1053*ec779b8eSAndroid Build Coastguard Worker         const int32_t* const volumeLR)
1054*ec779b8eSAndroid Build Coastguard Worker {
1055*ec779b8eSAndroid Build Coastguard Worker #ifdef USE_INTRINSIC
1056*ec779b8eSAndroid Build Coastguard Worker     ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
1057*ec779b8eSAndroid Build Coastguard Worker             lerpP, coefsP1, coefsN1);
1058*ec779b8eSAndroid Build Coastguard Worker #else
1059*ec779b8eSAndroid Build Coastguard Worker     const int CHANNELS = 2; // template specialization does not preserve params
1060*ec779b8eSAndroid Build Coastguard Worker     const int STRIDE = 16;
1061*ec779b8eSAndroid Build Coastguard Worker     sP -= CHANNELS*((STRIDE>>1)-1);
1062*ec779b8eSAndroid Build Coastguard Worker     asm (
1063*ec779b8eSAndroid Build Coastguard Worker         "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
1064*ec779b8eSAndroid Build Coastguard Worker         "veor           q0, q0, q0                    \n"// result, initialize to 0
1065*ec779b8eSAndroid Build Coastguard Worker         "veor           q4, q4, q4                    \n"// result, initialize to 0
1066*ec779b8eSAndroid Build Coastguard Worker 
1067*ec779b8eSAndroid Build Coastguard Worker         "1:                                           \n"
1068*ec779b8eSAndroid Build Coastguard Worker 
1069*ec779b8eSAndroid Build Coastguard Worker         "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo frames
1070*ec779b8eSAndroid Build Coastguard Worker         "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo frames
1071*ec779b8eSAndroid Build Coastguard Worker         "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
1072*ec779b8eSAndroid Build Coastguard Worker         "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
1073*ec779b8eSAndroid Build Coastguard Worker         "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
1074*ec779b8eSAndroid Build Coastguard Worker         "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
1075*ec779b8eSAndroid Build Coastguard Worker 
1076*ec779b8eSAndroid Build Coastguard Worker         "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
1077*ec779b8eSAndroid Build Coastguard Worker         "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
1078*ec779b8eSAndroid Build Coastguard Worker         "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
1079*ec779b8eSAndroid Build Coastguard Worker         "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)
1080*ec779b8eSAndroid Build Coastguard Worker 
1081*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
1082*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
1083*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
1084*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)
1085*ec779b8eSAndroid Build Coastguard Worker 
1086*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
1087*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
1088*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
1089*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)
1090*ec779b8eSAndroid Build Coastguard Worker 
1091*ec779b8eSAndroid Build Coastguard Worker         "vrev64.16      q2, q2                        \n"// reverse 8 samples of positive left
1092*ec779b8eSAndroid Build Coastguard Worker         "vrev64.16      q3, q3                        \n"// reverse 8 samples of positive right
1093*ec779b8eSAndroid Build Coastguard Worker 
1094*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
1095*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
1096*ec779b8eSAndroid Build Coastguard Worker 
1097*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
1098*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits
1099*ec779b8eSAndroid Build Coastguard Worker 
1100*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
1101*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
1102*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
1103*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
1104*ec779b8eSAndroid Build Coastguard Worker 
1105*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q0, q0, q12                   \n"// accumulate result
1106*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q13, q13, q14                 \n"// accumulate result
1107*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q0, q0, q15                   \n"// accumulate result
1108*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q0, q0, q13                   \n"// accumulate result
1109*ec779b8eSAndroid Build Coastguard Worker 
1110*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
1111*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits
1112*ec779b8eSAndroid Build Coastguard Worker 
1113*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
1114*ec779b8eSAndroid Build Coastguard Worker         "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits
1115*ec779b8eSAndroid Build Coastguard Worker 
1116*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
1117*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
1118*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
1119*ec779b8eSAndroid Build Coastguard Worker         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
1120*ec779b8eSAndroid Build Coastguard Worker 
1121*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q4, q4, q12                   \n"// accumulate result
1122*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q13, q13, q14                 \n"// accumulate result
1123*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q4, q4, q15                   \n"// accumulate result
1124*ec779b8eSAndroid Build Coastguard Worker         "vadd.s32       q4, q4, q13                   \n"// accumulate result
1125*ec779b8eSAndroid Build Coastguard Worker 
1126*ec779b8eSAndroid Build Coastguard Worker         "subs           %[count], %[count], #8        \n"// update loop counter
1127*ec779b8eSAndroid Build Coastguard Worker         "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples
1128*ec779b8eSAndroid Build Coastguard Worker 
1129*ec779b8eSAndroid Build Coastguard Worker         "bne            1b                            \n"// loop
1130*ec779b8eSAndroid Build Coastguard Worker 
1131*ec779b8eSAndroid Build Coastguard Worker         ASSEMBLY_ACCUMULATE_STEREO
1132*ec779b8eSAndroid Build Coastguard Worker 
1133*ec779b8eSAndroid Build Coastguard Worker         : [out]     "=Uv" (out[0]),
1134*ec779b8eSAndroid Build Coastguard Worker           [count]   "+r" (count),
1135*ec779b8eSAndroid Build Coastguard Worker           [coefsP0] "+r" (coefsP),
1136*ec779b8eSAndroid Build Coastguard Worker           [coefsN0] "+r" (coefsN),
1137*ec779b8eSAndroid Build Coastguard Worker           [coefsP1] "+r" (coefsP1),
1138*ec779b8eSAndroid Build Coastguard Worker           [coefsN1] "+r" (coefsN1),
1139*ec779b8eSAndroid Build Coastguard Worker           [sP]      "+r" (sP),
1140*ec779b8eSAndroid Build Coastguard Worker           [sN]      "+r" (sN)
1141*ec779b8eSAndroid Build Coastguard Worker         : [lerpP]   "r" (lerpP),
1142*ec779b8eSAndroid Build Coastguard Worker           [vLR]     "r" (volumeLR)
1143*ec779b8eSAndroid Build Coastguard Worker         : "cc", "memory",
1144*ec779b8eSAndroid Build Coastguard Worker           "q0", "q1", "q2", "q3",
1145*ec779b8eSAndroid Build Coastguard Worker           "q4", "q5", "q6",
1146*ec779b8eSAndroid Build Coastguard Worker           "q8", "q9", "q10", "q11",
1147*ec779b8eSAndroid Build Coastguard Worker           "q12", "q13", "q14", "q15"
1148*ec779b8eSAndroid Build Coastguard Worker     );
1149*ec779b8eSAndroid Build Coastguard Worker #endif
1150*ec779b8eSAndroid Build Coastguard Worker }
1151*ec779b8eSAndroid Build Coastguard Worker 
1152*ec779b8eSAndroid Build Coastguard Worker template<>
1153*ec779b8eSAndroid Build Coastguard Worker inline void ProcessL<1, 16>(float* const out,
1154*ec779b8eSAndroid Build Coastguard Worker         int count,
1155*ec779b8eSAndroid Build Coastguard Worker         const float* coefsP,
1156*ec779b8eSAndroid Build Coastguard Worker         const float* coefsN,
1157*ec779b8eSAndroid Build Coastguard Worker         const float* sP,
1158*ec779b8eSAndroid Build Coastguard Worker         const float* sN,
1159*ec779b8eSAndroid Build Coastguard Worker         const float* const volumeLR)
1160*ec779b8eSAndroid Build Coastguard Worker {
1161*ec779b8eSAndroid Build Coastguard Worker     ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
1162*ec779b8eSAndroid Build Coastguard Worker             0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
1163*ec779b8eSAndroid Build Coastguard Worker }
1164*ec779b8eSAndroid Build Coastguard Worker 
1165*ec779b8eSAndroid Build Coastguard Worker template<>
1166*ec779b8eSAndroid Build Coastguard Worker inline void ProcessL<2, 16>(float* const out,
1167*ec779b8eSAndroid Build Coastguard Worker         int count,
1168*ec779b8eSAndroid Build Coastguard Worker         const float* coefsP,
1169*ec779b8eSAndroid Build Coastguard Worker         const float* coefsN,
1170*ec779b8eSAndroid Build Coastguard Worker         const float* sP,
1171*ec779b8eSAndroid Build Coastguard Worker         const float* sN,
1172*ec779b8eSAndroid Build Coastguard Worker         const float* const volumeLR)
1173*ec779b8eSAndroid Build Coastguard Worker {
1174*ec779b8eSAndroid Build Coastguard Worker     ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
1175*ec779b8eSAndroid Build Coastguard Worker             0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
1176*ec779b8eSAndroid Build Coastguard Worker }
1177*ec779b8eSAndroid Build Coastguard Worker 
1178*ec779b8eSAndroid Build Coastguard Worker template<>
1179*ec779b8eSAndroid Build Coastguard Worker inline void Process<1, 16>(float* const out,
1180*ec779b8eSAndroid Build Coastguard Worker         int count,
1181*ec779b8eSAndroid Build Coastguard Worker         const float* coefsP,
1182*ec779b8eSAndroid Build Coastguard Worker         const float* coefsN,
1183*ec779b8eSAndroid Build Coastguard Worker         const float* coefsP1,
1184*ec779b8eSAndroid Build Coastguard Worker         const float* coefsN1,
1185*ec779b8eSAndroid Build Coastguard Worker         const float* sP,
1186*ec779b8eSAndroid Build Coastguard Worker         const float* sN,
1187*ec779b8eSAndroid Build Coastguard Worker         float lerpP,
1188*ec779b8eSAndroid Build Coastguard Worker         const float* const volumeLR)
1189*ec779b8eSAndroid Build Coastguard Worker {
1190*ec779b8eSAndroid Build Coastguard Worker     ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
1191*ec779b8eSAndroid Build Coastguard Worker             lerpP, coefsP1, coefsN1);
1192*ec779b8eSAndroid Build Coastguard Worker }
1193*ec779b8eSAndroid Build Coastguard Worker 
1194*ec779b8eSAndroid Build Coastguard Worker template<>
1195*ec779b8eSAndroid Build Coastguard Worker inline void Process<2, 16>(float* const out,
1196*ec779b8eSAndroid Build Coastguard Worker         int count,
1197*ec779b8eSAndroid Build Coastguard Worker         const float* coefsP,
1198*ec779b8eSAndroid Build Coastguard Worker         const float* coefsN,
1199*ec779b8eSAndroid Build Coastguard Worker         const float* coefsP1,
1200*ec779b8eSAndroid Build Coastguard Worker         const float* coefsN1,
1201*ec779b8eSAndroid Build Coastguard Worker         const float* sP,
1202*ec779b8eSAndroid Build Coastguard Worker         const float* sN,
1203*ec779b8eSAndroid Build Coastguard Worker         float lerpP,
1204*ec779b8eSAndroid Build Coastguard Worker         const float* const volumeLR)
1205*ec779b8eSAndroid Build Coastguard Worker {
1206*ec779b8eSAndroid Build Coastguard Worker     ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
1207*ec779b8eSAndroid Build Coastguard Worker             lerpP, coefsP1, coefsN1);
1208*ec779b8eSAndroid Build Coastguard Worker }
1209*ec779b8eSAndroid Build Coastguard Worker 
1210*ec779b8eSAndroid Build Coastguard Worker #endif //USE_NEON
1211*ec779b8eSAndroid Build Coastguard Worker 
1212*ec779b8eSAndroid Build Coastguard Worker } // namespace android
1213*ec779b8eSAndroid Build Coastguard Worker 
1214*ec779b8eSAndroid Build Coastguard Worker #endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/
1215