/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16*ec779b8eSAndroid Build Coastguard Worker
17*ec779b8eSAndroid Build Coastguard Worker #ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
18*ec779b8eSAndroid Build Coastguard Worker #define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
19*ec779b8eSAndroid Build Coastguard Worker
20*ec779b8eSAndroid Build Coastguard Worker namespace android {
21*ec779b8eSAndroid Build Coastguard Worker
22*ec779b8eSAndroid Build Coastguard Worker // depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
23*ec779b8eSAndroid Build Coastguard Worker
24*ec779b8eSAndroid Build Coastguard Worker #if USE_NEON
25*ec779b8eSAndroid Build Coastguard Worker
// Select the intrinsic implementation whenever arm32 inline assembly is not
// available (USE_INLINE_ASSEMBLY is zero or undefined).
#if !USE_INLINE_ASSEMBLY
#define USE_INTRINSIC
#endif
30*ec779b8eSAndroid Build Coastguard Worker
// The paired-register loads below are only available in the 64-bit ARM ACLE.
// On every other target, drop any macro of that name so the
// "#ifdef vld1q_s32_x2" / "#ifdef vld1q_f32_x2" checks further down select
// the fallback that issues two single-register loads instead.
#ifndef __aarch64__
#undef vld1q_f32_x2
#undef vld1q_s32_x2
#endif
36*ec779b8eSAndroid Build Coastguard Worker
// Two-level stringification: TO_STRING2 quotes its argument verbatim, while
// TO_STRING lets the argument be macro-expanded first and then quotes it.
#define TO_STRING2(x) #x
#define TO_STRING(x) TO_STRING2(x)
// uncomment to print GCC version, may be relevant for intrinsic optimizations
/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
        "." TO_STRING(__GNUC_MINOR__) \
        "." TO_STRING(__GNUC_PATCHLEVEL__)) */
43*ec779b8eSAndroid Build Coastguard Worker
//
// NEON specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
//
// Two variants are presented here:
// ARM NEON inline assembly which appears up to 10-15% faster than intrinsics (gcc 4.9) for arm32.
// ARM NEON intrinsics which can also be used by arm64 and x86/64 with NEON header.
//
51*ec779b8eSAndroid Build Coastguard Worker
// Inline-assembly epilogue for a mono filter: folds the four partial sums held
// in q0 (d0/d1) into a single value, duplicates it into both lanes, applies the
// L/R volumes and saturating-adds the result into the stereo output frame.
// Only used by the inline assembly implementations.
//
// The enclosing asm statement must provide the named operands
//   %[vLR] - address of the two 32-bit volumes (accessed with a :64 alignment hint)
//   %[out] - memory operand for the two 32-bit output samples
// The sequence writes d0, d2 and d3.
// Parenthesized figures in the per-line comments are the original author's
// timing annotations (presumably cycles / stall delays), carried over unchanged.
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes */\
        "vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output */\
        "vpadd.s32 d0, d0, d1 \n"/* (1) add all 4 partial sums */\
        "vpadd.s32 d0, d0, d0 \n"/* (1+4d) and replicate L/R */\
        "vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume */\
        "vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32 {d3}, %[out] \n"/* (2+2d) store result */
62*ec779b8eSAndroid Build Coastguard Worker
// Inline-assembly epilogue for a stereo filter: folds the four partial sums in
// q0 (d0/d1) and the four in q4 (d8/d9) into one value each, packs them into
// the two lanes of d0, applies the L/R volumes and saturating-adds the result
// into the stereo output frame.
//
// The enclosing asm statement must provide the named operands
//   %[vLR] - address of the two 32-bit volumes (accessed with a :64 alignment hint)
//   %[out] - memory operand for the two 32-bit output samples
// The sequence writes d0, d2, d3 and d8.
// Parenthesized figures in the per-line comments are the original author's
// timing annotations (presumably cycles / stall delays), carried over unchanged.
#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes*/\
        "vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output*/\
        "vpadd.s32 d0, d0, d1 \n"/* (1) add all 4 partial sums from q0*/\
        "vpadd.s32 d8, d8, d9 \n"/* (1) add all 4 partial sums from q4*/\
        "vpadd.s32 d0, d0, d8 \n"/* (1+4d) combine into L/R*/\
        "vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume*/\
        "vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating)*/\
        "vst1.s32 {d3}, %[out] \n"/* (2+2d)store result*/
72*ec779b8eSAndroid Build Coastguard Worker
73*ec779b8eSAndroid Build Coastguard Worker template <int CHANNELS, int STRIDE, bool FIXED>
ProcessNeonIntrinsic(int32_t * out,int count,const int16_t * coefsP,const int16_t * coefsN,const int16_t * sP,const int16_t * sN,const int32_t * volumeLR,uint32_t lerpP,const int16_t * coefsP1,const int16_t * coefsN1)74*ec779b8eSAndroid Build Coastguard Worker static inline void ProcessNeonIntrinsic(int32_t* out,
75*ec779b8eSAndroid Build Coastguard Worker int count,
76*ec779b8eSAndroid Build Coastguard Worker const int16_t* coefsP,
77*ec779b8eSAndroid Build Coastguard Worker const int16_t* coefsN,
78*ec779b8eSAndroid Build Coastguard Worker const int16_t* sP,
79*ec779b8eSAndroid Build Coastguard Worker const int16_t* sN,
80*ec779b8eSAndroid Build Coastguard Worker const int32_t* volumeLR,
81*ec779b8eSAndroid Build Coastguard Worker uint32_t lerpP,
82*ec779b8eSAndroid Build Coastguard Worker const int16_t* coefsP1,
83*ec779b8eSAndroid Build Coastguard Worker const int16_t* coefsN1)
84*ec779b8eSAndroid Build Coastguard Worker {
85*ec779b8eSAndroid Build Coastguard Worker ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
86*ec779b8eSAndroid Build Coastguard Worker static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");
87*ec779b8eSAndroid Build Coastguard Worker
88*ec779b8eSAndroid Build Coastguard Worker sP -= CHANNELS*((STRIDE>>1)-1);
89*ec779b8eSAndroid Build Coastguard Worker coefsP = (const int16_t*)__builtin_assume_aligned(coefsP, 16);
90*ec779b8eSAndroid Build Coastguard Worker coefsN = (const int16_t*)__builtin_assume_aligned(coefsN, 16);
91*ec779b8eSAndroid Build Coastguard Worker
92*ec779b8eSAndroid Build Coastguard Worker int16x4_t interp;
93*ec779b8eSAndroid Build Coastguard Worker if (!FIXED) {
94*ec779b8eSAndroid Build Coastguard Worker interp = vdup_n_s16(lerpP);
95*ec779b8eSAndroid Build Coastguard Worker //interp = (int16x4_t)vset_lane_s32 ((int32x2_t)lerpP, interp, 0);
96*ec779b8eSAndroid Build Coastguard Worker coefsP1 = (const int16_t*)__builtin_assume_aligned(coefsP1, 16);
97*ec779b8eSAndroid Build Coastguard Worker coefsN1 = (const int16_t*)__builtin_assume_aligned(coefsN1, 16);
98*ec779b8eSAndroid Build Coastguard Worker }
99*ec779b8eSAndroid Build Coastguard Worker int32x4_t accum, accum2;
100*ec779b8eSAndroid Build Coastguard Worker // warning uninitialized if we use veorq_s32
101*ec779b8eSAndroid Build Coastguard Worker // (alternative to below) accum = veorq_s32(accum, accum);
102*ec779b8eSAndroid Build Coastguard Worker accum = vdupq_n_s32(0);
103*ec779b8eSAndroid Build Coastguard Worker if (CHANNELS == 2) {
104*ec779b8eSAndroid Build Coastguard Worker // (alternative to below) accum2 = veorq_s32(accum2, accum2);
105*ec779b8eSAndroid Build Coastguard Worker accum2 = vdupq_n_s32(0);
106*ec779b8eSAndroid Build Coastguard Worker }
107*ec779b8eSAndroid Build Coastguard Worker do {
108*ec779b8eSAndroid Build Coastguard Worker int16x8_t posCoef = vld1q_s16(coefsP);
109*ec779b8eSAndroid Build Coastguard Worker coefsP += 8;
110*ec779b8eSAndroid Build Coastguard Worker int16x8_t negCoef = vld1q_s16(coefsN);
111*ec779b8eSAndroid Build Coastguard Worker coefsN += 8;
112*ec779b8eSAndroid Build Coastguard Worker if (!FIXED) { // interpolate
113*ec779b8eSAndroid Build Coastguard Worker int16x8_t posCoef1 = vld1q_s16(coefsP1);
114*ec779b8eSAndroid Build Coastguard Worker coefsP1 += 8;
115*ec779b8eSAndroid Build Coastguard Worker int16x8_t negCoef1 = vld1q_s16(coefsN1);
116*ec779b8eSAndroid Build Coastguard Worker coefsN1 += 8;
117*ec779b8eSAndroid Build Coastguard Worker
118*ec779b8eSAndroid Build Coastguard Worker posCoef1 = vsubq_s16(posCoef1, posCoef);
119*ec779b8eSAndroid Build Coastguard Worker negCoef = vsubq_s16(negCoef, negCoef1);
120*ec779b8eSAndroid Build Coastguard Worker
121*ec779b8eSAndroid Build Coastguard Worker posCoef1 = vqrdmulhq_lane_s16(posCoef1, interp, 0);
122*ec779b8eSAndroid Build Coastguard Worker negCoef = vqrdmulhq_lane_s16(negCoef, interp, 0);
123*ec779b8eSAndroid Build Coastguard Worker
124*ec779b8eSAndroid Build Coastguard Worker posCoef = vaddq_s16(posCoef, posCoef1);
125*ec779b8eSAndroid Build Coastguard Worker negCoef = vaddq_s16(negCoef, negCoef1);
126*ec779b8eSAndroid Build Coastguard Worker }
127*ec779b8eSAndroid Build Coastguard Worker switch (CHANNELS) {
128*ec779b8eSAndroid Build Coastguard Worker case 1: {
129*ec779b8eSAndroid Build Coastguard Worker int16x8_t posSamp = vld1q_s16(sP);
130*ec779b8eSAndroid Build Coastguard Worker int16x8_t negSamp = vld1q_s16(sN);
131*ec779b8eSAndroid Build Coastguard Worker sN += 8;
132*ec779b8eSAndroid Build Coastguard Worker posSamp = vrev64q_s16(posSamp);
133*ec779b8eSAndroid Build Coastguard Worker
134*ec779b8eSAndroid Build Coastguard Worker // dot product
135*ec779b8eSAndroid Build Coastguard Worker accum = vmlal_s16(accum, vget_low_s16(posSamp), vget_high_s16(posCoef)); // reversed
136*ec779b8eSAndroid Build Coastguard Worker accum = vmlal_s16(accum, vget_high_s16(posSamp), vget_low_s16(posCoef)); // reversed
137*ec779b8eSAndroid Build Coastguard Worker accum = vmlal_s16(accum, vget_low_s16(negSamp), vget_low_s16(negCoef));
138*ec779b8eSAndroid Build Coastguard Worker accum = vmlal_s16(accum, vget_high_s16(negSamp), vget_high_s16(negCoef));
139*ec779b8eSAndroid Build Coastguard Worker sP -= 8;
140*ec779b8eSAndroid Build Coastguard Worker } break;
141*ec779b8eSAndroid Build Coastguard Worker case 2: {
142*ec779b8eSAndroid Build Coastguard Worker int16x8x2_t posSamp = vld2q_s16(sP);
143*ec779b8eSAndroid Build Coastguard Worker int16x8x2_t negSamp = vld2q_s16(sN);
144*ec779b8eSAndroid Build Coastguard Worker sN += 16;
145*ec779b8eSAndroid Build Coastguard Worker posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
146*ec779b8eSAndroid Build Coastguard Worker posSamp.val[1] = vrev64q_s16(posSamp.val[1]);
147*ec779b8eSAndroid Build Coastguard Worker
148*ec779b8eSAndroid Build Coastguard Worker // dot product
149*ec779b8eSAndroid Build Coastguard Worker accum = vmlal_s16(accum, vget_low_s16(posSamp.val[0]), vget_high_s16(posCoef)); // r
150*ec779b8eSAndroid Build Coastguard Worker accum = vmlal_s16(accum, vget_high_s16(posSamp.val[0]), vget_low_s16(posCoef)); // r
151*ec779b8eSAndroid Build Coastguard Worker accum2 = vmlal_s16(accum2, vget_low_s16(posSamp.val[1]), vget_high_s16(posCoef)); // r
152*ec779b8eSAndroid Build Coastguard Worker accum2 = vmlal_s16(accum2, vget_high_s16(posSamp.val[1]), vget_low_s16(posCoef)); // r
153*ec779b8eSAndroid Build Coastguard Worker accum = vmlal_s16(accum, vget_low_s16(negSamp.val[0]), vget_low_s16(negCoef));
154*ec779b8eSAndroid Build Coastguard Worker accum = vmlal_s16(accum, vget_high_s16(negSamp.val[0]), vget_high_s16(negCoef));
155*ec779b8eSAndroid Build Coastguard Worker accum2 = vmlal_s16(accum2, vget_low_s16(negSamp.val[1]), vget_low_s16(negCoef));
156*ec779b8eSAndroid Build Coastguard Worker accum2 = vmlal_s16(accum2, vget_high_s16(negSamp.val[1]), vget_high_s16(negCoef));
157*ec779b8eSAndroid Build Coastguard Worker sP -= 16;
158*ec779b8eSAndroid Build Coastguard Worker } break;
159*ec779b8eSAndroid Build Coastguard Worker }
160*ec779b8eSAndroid Build Coastguard Worker } while (count -= 8);
161*ec779b8eSAndroid Build Coastguard Worker
162*ec779b8eSAndroid Build Coastguard Worker // multiply by volume and save
163*ec779b8eSAndroid Build Coastguard Worker volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
164*ec779b8eSAndroid Build Coastguard Worker int32x2_t vLR = vld1_s32(volumeLR);
165*ec779b8eSAndroid Build Coastguard Worker int32x2_t outSamp = vld1_s32(out);
166*ec779b8eSAndroid Build Coastguard Worker // combine and funnel down accumulator
167*ec779b8eSAndroid Build Coastguard Worker int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
168*ec779b8eSAndroid Build Coastguard Worker if (CHANNELS == 1) {
169*ec779b8eSAndroid Build Coastguard Worker // duplicate accum to both L and R
170*ec779b8eSAndroid Build Coastguard Worker outAccum = vpadd_s32(outAccum, outAccum);
171*ec779b8eSAndroid Build Coastguard Worker } else if (CHANNELS == 2) {
172*ec779b8eSAndroid Build Coastguard Worker // accum2 contains R, fold in
173*ec779b8eSAndroid Build Coastguard Worker int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
174*ec779b8eSAndroid Build Coastguard Worker outAccum = vpadd_s32(outAccum, outAccum2);
175*ec779b8eSAndroid Build Coastguard Worker }
176*ec779b8eSAndroid Build Coastguard Worker outAccum = vqrdmulh_s32(outAccum, vLR);
177*ec779b8eSAndroid Build Coastguard Worker outSamp = vqadd_s32(outSamp, outAccum);
178*ec779b8eSAndroid Build Coastguard Worker vst1_s32(out, outSamp);
179*ec779b8eSAndroid Build Coastguard Worker }
180*ec779b8eSAndroid Build Coastguard Worker
181*ec779b8eSAndroid Build Coastguard Worker template <int CHANNELS, int STRIDE, bool FIXED>
ProcessNeonIntrinsic(int32_t * out,int count,const int32_t * coefsP,const int32_t * coefsN,const int16_t * sP,const int16_t * sN,const int32_t * volumeLR,uint32_t lerpP,const int32_t * coefsP1,const int32_t * coefsN1)182*ec779b8eSAndroid Build Coastguard Worker static inline void ProcessNeonIntrinsic(int32_t* out,
183*ec779b8eSAndroid Build Coastguard Worker int count,
184*ec779b8eSAndroid Build Coastguard Worker const int32_t* coefsP,
185*ec779b8eSAndroid Build Coastguard Worker const int32_t* coefsN,
186*ec779b8eSAndroid Build Coastguard Worker const int16_t* sP,
187*ec779b8eSAndroid Build Coastguard Worker const int16_t* sN,
188*ec779b8eSAndroid Build Coastguard Worker const int32_t* volumeLR,
189*ec779b8eSAndroid Build Coastguard Worker uint32_t lerpP,
190*ec779b8eSAndroid Build Coastguard Worker const int32_t* coefsP1,
191*ec779b8eSAndroid Build Coastguard Worker const int32_t* coefsN1)
192*ec779b8eSAndroid Build Coastguard Worker {
193*ec779b8eSAndroid Build Coastguard Worker ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
194*ec779b8eSAndroid Build Coastguard Worker static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");
195*ec779b8eSAndroid Build Coastguard Worker
196*ec779b8eSAndroid Build Coastguard Worker sP -= CHANNELS*((STRIDE>>1)-1);
197*ec779b8eSAndroid Build Coastguard Worker coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
198*ec779b8eSAndroid Build Coastguard Worker coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);
199*ec779b8eSAndroid Build Coastguard Worker
200*ec779b8eSAndroid Build Coastguard Worker int32x2_t interp;
201*ec779b8eSAndroid Build Coastguard Worker if (!FIXED) {
202*ec779b8eSAndroid Build Coastguard Worker interp = vdup_n_s32(lerpP);
203*ec779b8eSAndroid Build Coastguard Worker coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
204*ec779b8eSAndroid Build Coastguard Worker coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
205*ec779b8eSAndroid Build Coastguard Worker }
206*ec779b8eSAndroid Build Coastguard Worker int32x4_t accum, accum2;
207*ec779b8eSAndroid Build Coastguard Worker // warning uninitialized if we use veorq_s32
208*ec779b8eSAndroid Build Coastguard Worker // (alternative to below) accum = veorq_s32(accum, accum);
209*ec779b8eSAndroid Build Coastguard Worker accum = vdupq_n_s32(0);
210*ec779b8eSAndroid Build Coastguard Worker if (CHANNELS == 2) {
211*ec779b8eSAndroid Build Coastguard Worker // (alternative to below) accum2 = veorq_s32(accum2, accum2);
212*ec779b8eSAndroid Build Coastguard Worker accum2 = vdupq_n_s32(0);
213*ec779b8eSAndroid Build Coastguard Worker }
214*ec779b8eSAndroid Build Coastguard Worker do {
215*ec779b8eSAndroid Build Coastguard Worker #ifdef vld1q_s32_x2
216*ec779b8eSAndroid Build Coastguard Worker int32x4x2_t posCoef = vld1q_s32_x2(coefsP);
217*ec779b8eSAndroid Build Coastguard Worker coefsP += 8;
218*ec779b8eSAndroid Build Coastguard Worker int32x4x2_t negCoef = vld1q_s32_x2(coefsN);
219*ec779b8eSAndroid Build Coastguard Worker coefsN += 8;
220*ec779b8eSAndroid Build Coastguard Worker #else
221*ec779b8eSAndroid Build Coastguard Worker int32x4x2_t posCoef;
222*ec779b8eSAndroid Build Coastguard Worker posCoef.val[0] = vld1q_s32(coefsP);
223*ec779b8eSAndroid Build Coastguard Worker coefsP += 4;
224*ec779b8eSAndroid Build Coastguard Worker posCoef.val[1] = vld1q_s32(coefsP);
225*ec779b8eSAndroid Build Coastguard Worker coefsP += 4;
226*ec779b8eSAndroid Build Coastguard Worker int32x4x2_t negCoef;
227*ec779b8eSAndroid Build Coastguard Worker negCoef.val[0] = vld1q_s32(coefsN);
228*ec779b8eSAndroid Build Coastguard Worker coefsN += 4;
229*ec779b8eSAndroid Build Coastguard Worker negCoef.val[1] = vld1q_s32(coefsN);
230*ec779b8eSAndroid Build Coastguard Worker coefsN += 4;
231*ec779b8eSAndroid Build Coastguard Worker #endif
232*ec779b8eSAndroid Build Coastguard Worker if (!FIXED) { // interpolate
233*ec779b8eSAndroid Build Coastguard Worker #ifdef vld1q_s32_x2
234*ec779b8eSAndroid Build Coastguard Worker int32x4x2_t posCoef1 = vld1q_s32_x2(coefsP1);
235*ec779b8eSAndroid Build Coastguard Worker coefsP1 += 8;
236*ec779b8eSAndroid Build Coastguard Worker int32x4x2_t negCoef1 = vld1q_s32_x2(coefsN1);
237*ec779b8eSAndroid Build Coastguard Worker coefsN1 += 8;
238*ec779b8eSAndroid Build Coastguard Worker #else
239*ec779b8eSAndroid Build Coastguard Worker int32x4x2_t posCoef1;
240*ec779b8eSAndroid Build Coastguard Worker posCoef1.val[0] = vld1q_s32(coefsP1);
241*ec779b8eSAndroid Build Coastguard Worker coefsP1 += 4;
242*ec779b8eSAndroid Build Coastguard Worker posCoef1.val[1] = vld1q_s32(coefsP1);
243*ec779b8eSAndroid Build Coastguard Worker coefsP1 += 4;
244*ec779b8eSAndroid Build Coastguard Worker int32x4x2_t negCoef1;
245*ec779b8eSAndroid Build Coastguard Worker negCoef1.val[0] = vld1q_s32(coefsN1);
246*ec779b8eSAndroid Build Coastguard Worker coefsN1 += 4;
247*ec779b8eSAndroid Build Coastguard Worker negCoef1.val[1] = vld1q_s32(coefsN1);
248*ec779b8eSAndroid Build Coastguard Worker coefsN1 += 4;
249*ec779b8eSAndroid Build Coastguard Worker #endif
250*ec779b8eSAndroid Build Coastguard Worker
251*ec779b8eSAndroid Build Coastguard Worker posCoef1.val[0] = vsubq_s32(posCoef1.val[0], posCoef.val[0]);
252*ec779b8eSAndroid Build Coastguard Worker posCoef1.val[1] = vsubq_s32(posCoef1.val[1], posCoef.val[1]);
253*ec779b8eSAndroid Build Coastguard Worker negCoef.val[0] = vsubq_s32(negCoef.val[0], negCoef1.val[0]);
254*ec779b8eSAndroid Build Coastguard Worker negCoef.val[1] = vsubq_s32(negCoef.val[1], negCoef1.val[1]);
255*ec779b8eSAndroid Build Coastguard Worker
256*ec779b8eSAndroid Build Coastguard Worker posCoef1.val[0] = vqrdmulhq_lane_s32(posCoef1.val[0], interp, 0);
257*ec779b8eSAndroid Build Coastguard Worker posCoef1.val[1] = vqrdmulhq_lane_s32(posCoef1.val[1], interp, 0);
258*ec779b8eSAndroid Build Coastguard Worker negCoef.val[0] = vqrdmulhq_lane_s32(negCoef.val[0], interp, 0);
259*ec779b8eSAndroid Build Coastguard Worker negCoef.val[1] = vqrdmulhq_lane_s32(negCoef.val[1], interp, 0);
260*ec779b8eSAndroid Build Coastguard Worker
261*ec779b8eSAndroid Build Coastguard Worker posCoef.val[0] = vaddq_s32(posCoef.val[0], posCoef1.val[0]);
262*ec779b8eSAndroid Build Coastguard Worker posCoef.val[1] = vaddq_s32(posCoef.val[1], posCoef1.val[1]);
263*ec779b8eSAndroid Build Coastguard Worker negCoef.val[0] = vaddq_s32(negCoef.val[0], negCoef1.val[0]);
264*ec779b8eSAndroid Build Coastguard Worker negCoef.val[1] = vaddq_s32(negCoef.val[1], negCoef1.val[1]);
265*ec779b8eSAndroid Build Coastguard Worker }
266*ec779b8eSAndroid Build Coastguard Worker switch (CHANNELS) {
267*ec779b8eSAndroid Build Coastguard Worker case 1: {
268*ec779b8eSAndroid Build Coastguard Worker int16x8_t posSamp = vld1q_s16(sP);
269*ec779b8eSAndroid Build Coastguard Worker int16x8_t negSamp = vld1q_s16(sN);
270*ec779b8eSAndroid Build Coastguard Worker sN += 8;
271*ec779b8eSAndroid Build Coastguard Worker posSamp = vrev64q_s16(posSamp);
272*ec779b8eSAndroid Build Coastguard Worker
273*ec779b8eSAndroid Build Coastguard Worker int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp), 15);
274*ec779b8eSAndroid Build Coastguard Worker int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp), 15);
275*ec779b8eSAndroid Build Coastguard Worker int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp), 15);
276*ec779b8eSAndroid Build Coastguard Worker int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp), 15);
277*ec779b8eSAndroid Build Coastguard Worker
278*ec779b8eSAndroid Build Coastguard Worker // dot product
279*ec779b8eSAndroid Build Coastguard Worker posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
280*ec779b8eSAndroid Build Coastguard Worker posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
281*ec779b8eSAndroid Build Coastguard Worker negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
282*ec779b8eSAndroid Build Coastguard Worker negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);
283*ec779b8eSAndroid Build Coastguard Worker
284*ec779b8eSAndroid Build Coastguard Worker accum = vaddq_s32(accum, posSamp0);
285*ec779b8eSAndroid Build Coastguard Worker negSamp0 = vaddq_s32(negSamp0, negSamp1);
286*ec779b8eSAndroid Build Coastguard Worker accum = vaddq_s32(accum, posSamp1);
287*ec779b8eSAndroid Build Coastguard Worker accum = vaddq_s32(accum, negSamp0);
288*ec779b8eSAndroid Build Coastguard Worker
289*ec779b8eSAndroid Build Coastguard Worker sP -= 8;
290*ec779b8eSAndroid Build Coastguard Worker } break;
291*ec779b8eSAndroid Build Coastguard Worker case 2: {
292*ec779b8eSAndroid Build Coastguard Worker int16x8x2_t posSamp = vld2q_s16(sP);
293*ec779b8eSAndroid Build Coastguard Worker int16x8x2_t negSamp = vld2q_s16(sN);
294*ec779b8eSAndroid Build Coastguard Worker sN += 16;
295*ec779b8eSAndroid Build Coastguard Worker posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
296*ec779b8eSAndroid Build Coastguard Worker posSamp.val[1] = vrev64q_s16(posSamp.val[1]);
297*ec779b8eSAndroid Build Coastguard Worker
298*ec779b8eSAndroid Build Coastguard Worker // left
299*ec779b8eSAndroid Build Coastguard Worker int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[0]), 15);
300*ec779b8eSAndroid Build Coastguard Worker int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[0]), 15);
301*ec779b8eSAndroid Build Coastguard Worker int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[0]), 15);
302*ec779b8eSAndroid Build Coastguard Worker int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[0]), 15);
303*ec779b8eSAndroid Build Coastguard Worker
304*ec779b8eSAndroid Build Coastguard Worker // dot product
305*ec779b8eSAndroid Build Coastguard Worker posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
306*ec779b8eSAndroid Build Coastguard Worker posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
307*ec779b8eSAndroid Build Coastguard Worker negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
308*ec779b8eSAndroid Build Coastguard Worker negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);
309*ec779b8eSAndroid Build Coastguard Worker
310*ec779b8eSAndroid Build Coastguard Worker accum = vaddq_s32(accum, posSamp0);
311*ec779b8eSAndroid Build Coastguard Worker negSamp0 = vaddq_s32(negSamp0, negSamp1);
312*ec779b8eSAndroid Build Coastguard Worker accum = vaddq_s32(accum, posSamp1);
313*ec779b8eSAndroid Build Coastguard Worker accum = vaddq_s32(accum, negSamp0);
314*ec779b8eSAndroid Build Coastguard Worker
315*ec779b8eSAndroid Build Coastguard Worker // right
316*ec779b8eSAndroid Build Coastguard Worker posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[1]), 15);
317*ec779b8eSAndroid Build Coastguard Worker posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[1]), 15);
318*ec779b8eSAndroid Build Coastguard Worker negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[1]), 15);
319*ec779b8eSAndroid Build Coastguard Worker negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[1]), 15);
320*ec779b8eSAndroid Build Coastguard Worker
321*ec779b8eSAndroid Build Coastguard Worker // dot product
322*ec779b8eSAndroid Build Coastguard Worker posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
323*ec779b8eSAndroid Build Coastguard Worker posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
324*ec779b8eSAndroid Build Coastguard Worker negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
325*ec779b8eSAndroid Build Coastguard Worker negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);
326*ec779b8eSAndroid Build Coastguard Worker
327*ec779b8eSAndroid Build Coastguard Worker accum2 = vaddq_s32(accum2, posSamp0);
328*ec779b8eSAndroid Build Coastguard Worker negSamp0 = vaddq_s32(negSamp0, negSamp1);
329*ec779b8eSAndroid Build Coastguard Worker accum2 = vaddq_s32(accum2, posSamp1);
330*ec779b8eSAndroid Build Coastguard Worker accum2 = vaddq_s32(accum2, negSamp0);
331*ec779b8eSAndroid Build Coastguard Worker
332*ec779b8eSAndroid Build Coastguard Worker sP -= 16;
333*ec779b8eSAndroid Build Coastguard Worker } break;
334*ec779b8eSAndroid Build Coastguard Worker }
335*ec779b8eSAndroid Build Coastguard Worker } while (count -= 8);
336*ec779b8eSAndroid Build Coastguard Worker
337*ec779b8eSAndroid Build Coastguard Worker // multiply by volume and save
338*ec779b8eSAndroid Build Coastguard Worker volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
339*ec779b8eSAndroid Build Coastguard Worker int32x2_t vLR = vld1_s32(volumeLR);
340*ec779b8eSAndroid Build Coastguard Worker int32x2_t outSamp = vld1_s32(out);
341*ec779b8eSAndroid Build Coastguard Worker // combine and funnel down accumulator
342*ec779b8eSAndroid Build Coastguard Worker int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
343*ec779b8eSAndroid Build Coastguard Worker if (CHANNELS == 1) {
344*ec779b8eSAndroid Build Coastguard Worker // duplicate accum to both L and R
345*ec779b8eSAndroid Build Coastguard Worker outAccum = vpadd_s32(outAccum, outAccum);
346*ec779b8eSAndroid Build Coastguard Worker } else if (CHANNELS == 2) {
347*ec779b8eSAndroid Build Coastguard Worker // accum2 contains R, fold in
348*ec779b8eSAndroid Build Coastguard Worker int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
349*ec779b8eSAndroid Build Coastguard Worker outAccum = vpadd_s32(outAccum, outAccum2);
350*ec779b8eSAndroid Build Coastguard Worker }
351*ec779b8eSAndroid Build Coastguard Worker outAccum = vqrdmulh_s32(outAccum, vLR);
352*ec779b8eSAndroid Build Coastguard Worker outSamp = vqadd_s32(outSamp, outAccum);
353*ec779b8eSAndroid Build Coastguard Worker vst1_s32(out, outSamp);
354*ec779b8eSAndroid Build Coastguard Worker }
355*ec779b8eSAndroid Build Coastguard Worker
356*ec779b8eSAndroid Build Coastguard Worker template <int CHANNELS, int STRIDE, bool FIXED>
ProcessNeonIntrinsic(float * out,int count,const float * coefsP,const float * coefsN,const float * sP,const float * sN,const float * volumeLR,float lerpP,const float * coefsP1,const float * coefsN1)357*ec779b8eSAndroid Build Coastguard Worker static inline void ProcessNeonIntrinsic(float* out,
358*ec779b8eSAndroid Build Coastguard Worker int count,
359*ec779b8eSAndroid Build Coastguard Worker const float* coefsP,
360*ec779b8eSAndroid Build Coastguard Worker const float* coefsN,
361*ec779b8eSAndroid Build Coastguard Worker const float* sP,
362*ec779b8eSAndroid Build Coastguard Worker const float* sN,
363*ec779b8eSAndroid Build Coastguard Worker const float* volumeLR,
364*ec779b8eSAndroid Build Coastguard Worker float lerpP,
365*ec779b8eSAndroid Build Coastguard Worker const float* coefsP1,
366*ec779b8eSAndroid Build Coastguard Worker const float* coefsN1)
367*ec779b8eSAndroid Build Coastguard Worker {
368*ec779b8eSAndroid Build Coastguard Worker ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
369*ec779b8eSAndroid Build Coastguard Worker static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");
370*ec779b8eSAndroid Build Coastguard Worker
371*ec779b8eSAndroid Build Coastguard Worker sP -= CHANNELS*((STRIDE>>1)-1);
372*ec779b8eSAndroid Build Coastguard Worker coefsP = (const float*)__builtin_assume_aligned(coefsP, 16);
373*ec779b8eSAndroid Build Coastguard Worker coefsN = (const float*)__builtin_assume_aligned(coefsN, 16);
374*ec779b8eSAndroid Build Coastguard Worker
375*ec779b8eSAndroid Build Coastguard Worker float32x2_t interp;
376*ec779b8eSAndroid Build Coastguard Worker if (!FIXED) {
377*ec779b8eSAndroid Build Coastguard Worker interp = vdup_n_f32(lerpP);
378*ec779b8eSAndroid Build Coastguard Worker coefsP1 = (const float*)__builtin_assume_aligned(coefsP1, 16);
379*ec779b8eSAndroid Build Coastguard Worker coefsN1 = (const float*)__builtin_assume_aligned(coefsN1, 16);
380*ec779b8eSAndroid Build Coastguard Worker }
381*ec779b8eSAndroid Build Coastguard Worker float32x4_t accum, accum2;
382*ec779b8eSAndroid Build Coastguard Worker // warning uninitialized if we use veorq_s32
383*ec779b8eSAndroid Build Coastguard Worker // (alternative to below) accum = veorq_s32(accum, accum);
384*ec779b8eSAndroid Build Coastguard Worker accum = vdupq_n_f32(0);
385*ec779b8eSAndroid Build Coastguard Worker if (CHANNELS == 2) {
386*ec779b8eSAndroid Build Coastguard Worker // (alternative to below) accum2 = veorq_s32(accum2, accum2);
387*ec779b8eSAndroid Build Coastguard Worker accum2 = vdupq_n_f32(0);
388*ec779b8eSAndroid Build Coastguard Worker }
389*ec779b8eSAndroid Build Coastguard Worker do {
390*ec779b8eSAndroid Build Coastguard Worker #ifdef vld1q_f32_x2
391*ec779b8eSAndroid Build Coastguard Worker float32x4x2_t posCoef = vld1q_f32_x2(coefsP);
392*ec779b8eSAndroid Build Coastguard Worker coefsP += 8;
393*ec779b8eSAndroid Build Coastguard Worker float32x4x2_t negCoef = vld1q_f32_x2(coefsN);
394*ec779b8eSAndroid Build Coastguard Worker coefsN += 8;
395*ec779b8eSAndroid Build Coastguard Worker #else
396*ec779b8eSAndroid Build Coastguard Worker float32x4x2_t posCoef;
397*ec779b8eSAndroid Build Coastguard Worker posCoef.val[0] = vld1q_f32(coefsP);
398*ec779b8eSAndroid Build Coastguard Worker coefsP += 4;
399*ec779b8eSAndroid Build Coastguard Worker posCoef.val[1] = vld1q_f32(coefsP);
400*ec779b8eSAndroid Build Coastguard Worker coefsP += 4;
401*ec779b8eSAndroid Build Coastguard Worker float32x4x2_t negCoef;
402*ec779b8eSAndroid Build Coastguard Worker negCoef.val[0] = vld1q_f32(coefsN);
403*ec779b8eSAndroid Build Coastguard Worker coefsN += 4;
404*ec779b8eSAndroid Build Coastguard Worker negCoef.val[1] = vld1q_f32(coefsN);
405*ec779b8eSAndroid Build Coastguard Worker coefsN += 4;
406*ec779b8eSAndroid Build Coastguard Worker #endif
407*ec779b8eSAndroid Build Coastguard Worker if (!FIXED) { // interpolate
408*ec779b8eSAndroid Build Coastguard Worker #ifdef vld1q_f32_x2
409*ec779b8eSAndroid Build Coastguard Worker float32x4x2_t posCoef1 = vld1q_f32_x2(coefsP1);
410*ec779b8eSAndroid Build Coastguard Worker coefsP1 += 8;
411*ec779b8eSAndroid Build Coastguard Worker float32x4x2_t negCoef1 = vld1q_f32_x2(coefsN1);
412*ec779b8eSAndroid Build Coastguard Worker coefsN1 += 8;
413*ec779b8eSAndroid Build Coastguard Worker #else
414*ec779b8eSAndroid Build Coastguard Worker float32x4x2_t posCoef1;
415*ec779b8eSAndroid Build Coastguard Worker posCoef1.val[0] = vld1q_f32(coefsP1);
416*ec779b8eSAndroid Build Coastguard Worker coefsP1 += 4;
417*ec779b8eSAndroid Build Coastguard Worker posCoef1.val[1] = vld1q_f32(coefsP1);
418*ec779b8eSAndroid Build Coastguard Worker coefsP1 += 4;
419*ec779b8eSAndroid Build Coastguard Worker float32x4x2_t negCoef1;
420*ec779b8eSAndroid Build Coastguard Worker negCoef1.val[0] = vld1q_f32(coefsN1);
421*ec779b8eSAndroid Build Coastguard Worker coefsN1 += 4;
422*ec779b8eSAndroid Build Coastguard Worker negCoef1.val[1] = vld1q_f32(coefsN1);
423*ec779b8eSAndroid Build Coastguard Worker coefsN1 += 4;
424*ec779b8eSAndroid Build Coastguard Worker #endif
425*ec779b8eSAndroid Build Coastguard Worker posCoef1.val[0] = vsubq_f32(posCoef1.val[0], posCoef.val[0]);
426*ec779b8eSAndroid Build Coastguard Worker posCoef1.val[1] = vsubq_f32(posCoef1.val[1], posCoef.val[1]);
427*ec779b8eSAndroid Build Coastguard Worker negCoef.val[0] = vsubq_f32(negCoef.val[0], negCoef1.val[0]);
428*ec779b8eSAndroid Build Coastguard Worker negCoef.val[1] = vsubq_f32(negCoef.val[1], negCoef1.val[1]);
429*ec779b8eSAndroid Build Coastguard Worker
430*ec779b8eSAndroid Build Coastguard Worker posCoef.val[0] = vmlaq_lane_f32(posCoef.val[0], posCoef1.val[0], interp, 0);
431*ec779b8eSAndroid Build Coastguard Worker posCoef.val[1] = vmlaq_lane_f32(posCoef.val[1], posCoef1.val[1], interp, 0);
432*ec779b8eSAndroid Build Coastguard Worker negCoef.val[0] = vmlaq_lane_f32(negCoef1.val[0], negCoef.val[0], interp, 0); // rev
433*ec779b8eSAndroid Build Coastguard Worker negCoef.val[1] = vmlaq_lane_f32(negCoef1.val[1], negCoef.val[1], interp, 0); // rev
434*ec779b8eSAndroid Build Coastguard Worker }
435*ec779b8eSAndroid Build Coastguard Worker switch (CHANNELS) {
436*ec779b8eSAndroid Build Coastguard Worker case 1: {
437*ec779b8eSAndroid Build Coastguard Worker #ifdef vld1q_f32_x2
438*ec779b8eSAndroid Build Coastguard Worker float32x4x2_t posSamp = vld1q_f32_x2(sP);
439*ec779b8eSAndroid Build Coastguard Worker float32x4x2_t negSamp = vld1q_f32_x2(sN);
440*ec779b8eSAndroid Build Coastguard Worker sN += 8;
441*ec779b8eSAndroid Build Coastguard Worker sP -= 8;
442*ec779b8eSAndroid Build Coastguard Worker #else
443*ec779b8eSAndroid Build Coastguard Worker float32x4x2_t posSamp;
444*ec779b8eSAndroid Build Coastguard Worker posSamp.val[0] = vld1q_f32(sP);
445*ec779b8eSAndroid Build Coastguard Worker sP += 4;
446*ec779b8eSAndroid Build Coastguard Worker posSamp.val[1] = vld1q_f32(sP);
447*ec779b8eSAndroid Build Coastguard Worker sP -= 12;
448*ec779b8eSAndroid Build Coastguard Worker float32x4x2_t negSamp;
449*ec779b8eSAndroid Build Coastguard Worker negSamp.val[0] = vld1q_f32(sN);
450*ec779b8eSAndroid Build Coastguard Worker sN += 4;
451*ec779b8eSAndroid Build Coastguard Worker negSamp.val[1] = vld1q_f32(sN);
452*ec779b8eSAndroid Build Coastguard Worker sN += 4;
453*ec779b8eSAndroid Build Coastguard Worker #endif
454*ec779b8eSAndroid Build Coastguard Worker // effectively we want a vrev128q_f32()
455*ec779b8eSAndroid Build Coastguard Worker posSamp.val[0] = vrev64q_f32(posSamp.val[0]);
456*ec779b8eSAndroid Build Coastguard Worker posSamp.val[1] = vrev64q_f32(posSamp.val[1]);
457*ec779b8eSAndroid Build Coastguard Worker posSamp.val[0] = vcombine_f32(
458*ec779b8eSAndroid Build Coastguard Worker vget_high_f32(posSamp.val[0]), vget_low_f32(posSamp.val[0]));
459*ec779b8eSAndroid Build Coastguard Worker posSamp.val[1] = vcombine_f32(
460*ec779b8eSAndroid Build Coastguard Worker vget_high_f32(posSamp.val[1]), vget_low_f32(posSamp.val[1]));
461*ec779b8eSAndroid Build Coastguard Worker
462*ec779b8eSAndroid Build Coastguard Worker accum = vmlaq_f32(accum, posSamp.val[0], posCoef.val[1]);
463*ec779b8eSAndroid Build Coastguard Worker accum = vmlaq_f32(accum, posSamp.val[1], posCoef.val[0]);
464*ec779b8eSAndroid Build Coastguard Worker accum = vmlaq_f32(accum, negSamp.val[0], negCoef.val[0]);
465*ec779b8eSAndroid Build Coastguard Worker accum = vmlaq_f32(accum, negSamp.val[1], negCoef.val[1]);
466*ec779b8eSAndroid Build Coastguard Worker } break;
467*ec779b8eSAndroid Build Coastguard Worker case 2: {
468*ec779b8eSAndroid Build Coastguard Worker float32x4x2_t posSamp0 = vld2q_f32(sP);
469*ec779b8eSAndroid Build Coastguard Worker sP += 8;
470*ec779b8eSAndroid Build Coastguard Worker float32x4x2_t negSamp0 = vld2q_f32(sN);
471*ec779b8eSAndroid Build Coastguard Worker sN += 8;
472*ec779b8eSAndroid Build Coastguard Worker posSamp0.val[0] = vrev64q_f32(posSamp0.val[0]);
473*ec779b8eSAndroid Build Coastguard Worker posSamp0.val[1] = vrev64q_f32(posSamp0.val[1]);
474*ec779b8eSAndroid Build Coastguard Worker posSamp0.val[0] = vcombine_f32(
475*ec779b8eSAndroid Build Coastguard Worker vget_high_f32(posSamp0.val[0]), vget_low_f32(posSamp0.val[0]));
476*ec779b8eSAndroid Build Coastguard Worker posSamp0.val[1] = vcombine_f32(
477*ec779b8eSAndroid Build Coastguard Worker vget_high_f32(posSamp0.val[1]), vget_low_f32(posSamp0.val[1]));
478*ec779b8eSAndroid Build Coastguard Worker
479*ec779b8eSAndroid Build Coastguard Worker float32x4x2_t posSamp1 = vld2q_f32(sP);
480*ec779b8eSAndroid Build Coastguard Worker sP -= 24;
481*ec779b8eSAndroid Build Coastguard Worker float32x4x2_t negSamp1 = vld2q_f32(sN);
482*ec779b8eSAndroid Build Coastguard Worker sN += 8;
483*ec779b8eSAndroid Build Coastguard Worker posSamp1.val[0] = vrev64q_f32(posSamp1.val[0]);
484*ec779b8eSAndroid Build Coastguard Worker posSamp1.val[1] = vrev64q_f32(posSamp1.val[1]);
485*ec779b8eSAndroid Build Coastguard Worker posSamp1.val[0] = vcombine_f32(
486*ec779b8eSAndroid Build Coastguard Worker vget_high_f32(posSamp1.val[0]), vget_low_f32(posSamp1.val[0]));
487*ec779b8eSAndroid Build Coastguard Worker posSamp1.val[1] = vcombine_f32(
488*ec779b8eSAndroid Build Coastguard Worker vget_high_f32(posSamp1.val[1]), vget_low_f32(posSamp1.val[1]));
489*ec779b8eSAndroid Build Coastguard Worker
490*ec779b8eSAndroid Build Coastguard Worker // Note: speed is affected by accumulation order.
491*ec779b8eSAndroid Build Coastguard Worker // Also, speed appears slower using vmul/vadd instead of vmla for
492*ec779b8eSAndroid Build Coastguard Worker // stereo case, comparable for mono.
493*ec779b8eSAndroid Build Coastguard Worker
494*ec779b8eSAndroid Build Coastguard Worker accum = vmlaq_f32(accum, negSamp0.val[0], negCoef.val[0]);
495*ec779b8eSAndroid Build Coastguard Worker accum = vmlaq_f32(accum, negSamp1.val[0], negCoef.val[1]);
496*ec779b8eSAndroid Build Coastguard Worker accum2 = vmlaq_f32(accum2, negSamp0.val[1], negCoef.val[0]);
497*ec779b8eSAndroid Build Coastguard Worker accum2 = vmlaq_f32(accum2, negSamp1.val[1], negCoef.val[1]);
498*ec779b8eSAndroid Build Coastguard Worker
499*ec779b8eSAndroid Build Coastguard Worker accum = vmlaq_f32(accum, posSamp0.val[0], posCoef.val[1]); // reversed
500*ec779b8eSAndroid Build Coastguard Worker accum = vmlaq_f32(accum, posSamp1.val[0], posCoef.val[0]); // reversed
501*ec779b8eSAndroid Build Coastguard Worker accum2 = vmlaq_f32(accum2, posSamp0.val[1], posCoef.val[1]); // reversed
502*ec779b8eSAndroid Build Coastguard Worker accum2 = vmlaq_f32(accum2, posSamp1.val[1], posCoef.val[0]); // reversed
503*ec779b8eSAndroid Build Coastguard Worker } break;
504*ec779b8eSAndroid Build Coastguard Worker }
505*ec779b8eSAndroid Build Coastguard Worker } while (count -= 8);
506*ec779b8eSAndroid Build Coastguard Worker
507*ec779b8eSAndroid Build Coastguard Worker // multiply by volume and save
508*ec779b8eSAndroid Build Coastguard Worker volumeLR = (const float*)__builtin_assume_aligned(volumeLR, 8);
509*ec779b8eSAndroid Build Coastguard Worker float32x2_t vLR = vld1_f32(volumeLR);
510*ec779b8eSAndroid Build Coastguard Worker float32x2_t outSamp = vld1_f32(out);
511*ec779b8eSAndroid Build Coastguard Worker // combine and funnel down accumulator
512*ec779b8eSAndroid Build Coastguard Worker float32x2_t outAccum = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
513*ec779b8eSAndroid Build Coastguard Worker if (CHANNELS == 1) {
514*ec779b8eSAndroid Build Coastguard Worker // duplicate accum to both L and R
515*ec779b8eSAndroid Build Coastguard Worker outAccum = vpadd_f32(outAccum, outAccum);
516*ec779b8eSAndroid Build Coastguard Worker } else if (CHANNELS == 2) {
517*ec779b8eSAndroid Build Coastguard Worker // accum2 contains R, fold in
518*ec779b8eSAndroid Build Coastguard Worker float32x2_t outAccum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
519*ec779b8eSAndroid Build Coastguard Worker outAccum = vpadd_f32(outAccum, outAccum2);
520*ec779b8eSAndroid Build Coastguard Worker }
521*ec779b8eSAndroid Build Coastguard Worker outSamp = vmla_f32(outSamp, outAccum, vLR);
522*ec779b8eSAndroid Build Coastguard Worker vst1_f32(out, outSamp);
523*ec779b8eSAndroid Build Coastguard Worker }
524*ec779b8eSAndroid Build Coastguard Worker
// ProcessL<1, 16> for int16_t coefficients: mono samples, fixed coefficients
// (no second coefficient set and no lerpP, so nothing is interpolated).
//
// Parameters, as used below:
//   out      - passed to the assembly as memory operand out[0]; it is only
//              touched by ASSEMBLY_ACCUMULATE_MONO, a macro defined outside this
//              block (presumably volume-scale and accumulate - confirm there).
//   count    - taps remaining per side; reduced by 8 each iteration and the loop
//              exits only when it reaches exactly 0, so callers are expected to
//              pass a positive multiple of 8.
//   coefsP/N - coefficients for the positive/negative side, read 8 at a time with
//              a :128 alignment qualifier (16-byte aligned addresses expected).
//   sP       - positive-side samples, read walking backward in memory.
//   sN       - negative-side samples, read walking forward in memory.
//   volumeLR - handed to ASSEMBLY_ACCUMULATE_MONO as %[vLR].
525*ec779b8eSAndroid Build Coastguard Worker template <>
526*ec779b8eSAndroid Build Coastguard Worker inline void ProcessL<1, 16>(int32_t* const out,
527*ec779b8eSAndroid Build Coastguard Worker int count,
528*ec779b8eSAndroid Build Coastguard Worker const int16_t* coefsP,
529*ec779b8eSAndroid Build Coastguard Worker const int16_t* coefsN,
530*ec779b8eSAndroid Build Coastguard Worker const int16_t* sP,
531*ec779b8eSAndroid Build Coastguard Worker const int16_t* sN,
532*ec779b8eSAndroid Build Coastguard Worker const int32_t* const volumeLR)
533*ec779b8eSAndroid Build Coastguard Worker {
534*ec779b8eSAndroid Build Coastguard Worker #ifdef USE_INTRINSIC
// Intrinsics build: delegate to the shared kernel in its fixed-coefficient
// form (third template argument true), with no interpolation inputs.
535*ec779b8eSAndroid Build Coastguard Worker ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
536*ec779b8eSAndroid Build Coastguard Worker 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
537*ec779b8eSAndroid Build Coastguard Worker #else
538*ec779b8eSAndroid Build Coastguard Worker const int CHANNELS = 1; // template specialization does not preserve params
539*ec779b8eSAndroid Build Coastguard Worker const int STRIDE = 16;
// Back sP up by 7 samples so that the 8-sample load below ends on the sample
// sP pointed to on entry.
540*ec779b8eSAndroid Build Coastguard Worker sP -= CHANNELS*((STRIDE>>1)-1);
541*ec779b8eSAndroid Build Coastguard Worker asm (
542*ec779b8eSAndroid Build Coastguard Worker "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
543*ec779b8eSAndroid Build Coastguard Worker
544*ec779b8eSAndroid Build Coastguard Worker "1: \n"
545*ec779b8eSAndroid Build Coastguard Worker
// sP is loaded without writeback (it is stepped backward manually at the end
// of the iteration); sN and both coefficient pointers post-increment.
546*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples
547*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples
548*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
549*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
550*ec779b8eSAndroid Build Coastguard Worker
551*ec779b8eSAndroid Build Coastguard Worker "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
552*ec779b8eSAndroid Build Coastguard Worker
// vrev64 only reverses within each 64-bit half; pairing d4 with d17 and d5
// with d16 (halves crossed) completes the 8-element reversal, so the first
// positive coefficient multiplies the last sample loaded from sP.
// vmlal.s16 widens each 16x16 product and adds it into the 32-bit lanes of q0.
553*ec779b8eSAndroid Build Coastguard Worker // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
554*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply (reversed)samples by coef
555*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed)samples by coef
556*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
557*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples
558*ec779b8eSAndroid Build Coastguard Worker
559*ec779b8eSAndroid Build Coastguard Worker // moving these ARM instructions before neon above seems to be slower
// "subs" sets the flags tested by "bne"; the plain "sub" in between leaves
// them untouched. #16 bytes = 8 int16_t samples.
560*ec779b8eSAndroid Build Coastguard Worker "subs %[count], %[count], #8 \n"// (1) update loop counter
561*ec779b8eSAndroid Build Coastguard Worker "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
562*ec779b8eSAndroid Build Coastguard Worker
563*ec779b8eSAndroid Build Coastguard Worker // sP used after branch (warning)
564*ec779b8eSAndroid Build Coastguard Worker "bne 1b \n"// loop
565*ec779b8eSAndroid Build Coastguard Worker
// Epilogue supplied by a macro defined outside this block; it is the only
// consumer of %[out] and %[vLR].
566*ec779b8eSAndroid Build Coastguard Worker ASSEMBLY_ACCUMULATE_MONO
567*ec779b8eSAndroid Build Coastguard Worker
// Operands: out[0] is a memory output; the counter and all four pointers are
// read-write because the loop modifies them; volumeLR is a read-only input.
568*ec779b8eSAndroid Build Coastguard Worker : [out] "=Uv" (out[0]),
569*ec779b8eSAndroid Build Coastguard Worker [count] "+r" (count),
570*ec779b8eSAndroid Build Coastguard Worker [coefsP0] "+r" (coefsP),
571*ec779b8eSAndroid Build Coastguard Worker [coefsN0] "+r" (coefsN),
572*ec779b8eSAndroid Build Coastguard Worker [sP] "+r" (sP),
573*ec779b8eSAndroid Build Coastguard Worker [sN] "+r" (sN)
574*ec779b8eSAndroid Build Coastguard Worker : [vLR] "r" (volumeLR)
// Clobbers: q1 is not used by the loop above; presumably it is scratch for
// ASSEMBLY_ACCUMULATE_MONO - confirm against the macro.
575*ec779b8eSAndroid Build Coastguard Worker : "cc", "memory",
576*ec779b8eSAndroid Build Coastguard Worker "q0", "q1", "q2", "q3",
577*ec779b8eSAndroid Build Coastguard Worker "q8", "q10"
578*ec779b8eSAndroid Build Coastguard Worker );
579*ec779b8eSAndroid Build Coastguard Worker #endif
580*ec779b8eSAndroid Build Coastguard Worker }
581*ec779b8eSAndroid Build Coastguard Worker
// ProcessL<2, 16> for int16_t coefficients: interleaved stereo samples, fixed
// coefficients (no second coefficient set and no lerpP, so nothing is
// interpolated).
//
// Parameters, as used below:
//   out      - passed to the assembly as memory operand out[0]; it is only
//              touched by ASSEMBLY_ACCUMULATE_STEREO, a macro defined outside
//              this block (presumably volume-scale and accumulate - confirm there).
//   count    - taps remaining per side; reduced by 8 each iteration and the loop
//              exits only when it reaches exactly 0, so callers are expected to
//              pass a positive multiple of 8.
//   coefsP/N - coefficients for the positive/negative side, read 8 at a time with
//              a :128 alignment qualifier (16-byte aligned addresses expected).
//              The same coefficients are applied to both channels.
//   sP       - positive-side stereo frames, read walking backward in memory.
//   sN       - negative-side stereo frames, read walking forward in memory.
//   volumeLR - handed to ASSEMBLY_ACCUMULATE_STEREO as %[vLR].
582*ec779b8eSAndroid Build Coastguard Worker template <>
583*ec779b8eSAndroid Build Coastguard Worker inline void ProcessL<2, 16>(int32_t* const out,
584*ec779b8eSAndroid Build Coastguard Worker int count,
585*ec779b8eSAndroid Build Coastguard Worker const int16_t* coefsP,
586*ec779b8eSAndroid Build Coastguard Worker const int16_t* coefsN,
587*ec779b8eSAndroid Build Coastguard Worker const int16_t* sP,
588*ec779b8eSAndroid Build Coastguard Worker const int16_t* sN,
589*ec779b8eSAndroid Build Coastguard Worker const int32_t* const volumeLR)
590*ec779b8eSAndroid Build Coastguard Worker {
591*ec779b8eSAndroid Build Coastguard Worker #ifdef USE_INTRINSIC
// Intrinsics build: delegate to the shared kernel in its fixed-coefficient
// form (third template argument true), with no interpolation inputs.
592*ec779b8eSAndroid Build Coastguard Worker ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
593*ec779b8eSAndroid Build Coastguard Worker 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
594*ec779b8eSAndroid Build Coastguard Worker #else
595*ec779b8eSAndroid Build Coastguard Worker const int CHANNELS = 2; // template specialization does not preserve params
596*ec779b8eSAndroid Build Coastguard Worker const int STRIDE = 16;
// Back sP up by 14 int16_t values (7 stereo frames) so that the 8-frame load
// below ends on the frame sP pointed to on entry.
597*ec779b8eSAndroid Build Coastguard Worker sP -= CHANNELS*((STRIDE>>1)-1);
598*ec779b8eSAndroid Build Coastguard Worker asm (
599*ec779b8eSAndroid Build Coastguard Worker "veor q0, q0, q0 \n"// (1) acc_L = 0
600*ec779b8eSAndroid Build Coastguard Worker "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
601*ec779b8eSAndroid Build Coastguard Worker
602*ec779b8eSAndroid Build Coastguard Worker "1: \n"
603*ec779b8eSAndroid Build Coastguard Worker
// vld2.16 de-interleaves: even-indexed values land in the first register of
// each pair (q2, q5) and odd-indexed values in the second (q3, q6).
// sP is loaded without writeback (it is stepped backward manually at the end
// of the iteration); sN and both coefficient pointers post-increment.
604*ec779b8eSAndroid Build Coastguard Worker "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo frames
605*ec779b8eSAndroid Build Coastguard Worker "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo frames
606*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
607*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
608*ec779b8eSAndroid Build Coastguard Worker
609*ec779b8eSAndroid Build Coastguard Worker "vrev64.16 q2, q2 \n"// (1) reverse 8 samples of positive left
610*ec779b8eSAndroid Build Coastguard Worker "vrev64.16 q3, q3 \n"// (0 combines+) reverse positive right
611*ec779b8eSAndroid Build Coastguard Worker
// vrev64 only reverses within each 64-bit half; pairing the low sample half
// with d17 and the high half with d16 (halves crossed) completes the reversal,
// so the first positive coefficient multiplies the last frame loaded from sP.
// vmlal.s16 widens each 16x16 product into the 32-bit lanes of q0 (L) / q4 (R).
612*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q0, d4, d17 \n"// (1) multiply (reversed) samples left
613*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed) samples left
614*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q4, d6, d17 \n"// (1) multiply (reversed) samples right
615*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q4, d7, d16 \n"// (1) multiply (reversed) samples right
616*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left
617*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left
618*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right
619*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right
620*ec779b8eSAndroid Build Coastguard Worker
621*ec779b8eSAndroid Build Coastguard Worker // moving these ARM before neon seems to be slower
// "subs" sets the flags tested by "bne"; the plain "sub" in between leaves
// them untouched. #32 bytes = 8 stereo frames of int16_t.
622*ec779b8eSAndroid Build Coastguard Worker "subs %[count], %[count], #8 \n"// (1) update loop counter
623*ec779b8eSAndroid Build Coastguard Worker "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples
624*ec779b8eSAndroid Build Coastguard Worker
625*ec779b8eSAndroid Build Coastguard Worker // sP used after branch (warning)
626*ec779b8eSAndroid Build Coastguard Worker "bne 1b \n"// loop
627*ec779b8eSAndroid Build Coastguard Worker
// Epilogue supplied by a macro defined outside this block; it is the only
// consumer of %[out] and %[vLR].
628*ec779b8eSAndroid Build Coastguard Worker ASSEMBLY_ACCUMULATE_STEREO
629*ec779b8eSAndroid Build Coastguard Worker
// Operands: out[0] is a memory output; the counter and all four pointers are
// read-write because the loop modifies them; volumeLR is a read-only input.
630*ec779b8eSAndroid Build Coastguard Worker : [out] "=Uv" (out[0]),
631*ec779b8eSAndroid Build Coastguard Worker [count] "+r" (count),
632*ec779b8eSAndroid Build Coastguard Worker [coefsP0] "+r" (coefsP),
633*ec779b8eSAndroid Build Coastguard Worker [coefsN0] "+r" (coefsN),
634*ec779b8eSAndroid Build Coastguard Worker [sP] "+r" (sP),
635*ec779b8eSAndroid Build Coastguard Worker [sN] "+r" (sN)
636*ec779b8eSAndroid Build Coastguard Worker : [vLR] "r" (volumeLR)
// Clobbers: q1 is not used by the loop above; presumably it is scratch for
// ASSEMBLY_ACCUMULATE_STEREO - confirm against the macro.
637*ec779b8eSAndroid Build Coastguard Worker : "cc", "memory",
638*ec779b8eSAndroid Build Coastguard Worker "q0", "q1", "q2", "q3",
639*ec779b8eSAndroid Build Coastguard Worker "q4", "q5", "q6",
640*ec779b8eSAndroid Build Coastguard Worker "q8", "q10"
641*ec779b8eSAndroid Build Coastguard Worker );
642*ec779b8eSAndroid Build Coastguard Worker #endif
643*ec779b8eSAndroid Build Coastguard Worker }
644*ec779b8eSAndroid Build Coastguard Worker
// Process<1, 16> for int16_t coefficients: mono samples, with each coefficient
// linearly interpolated between two coefficient sets before use.
//
// Interpolation performed by the assembly (per 16-bit lane):
//   positive side: coefsP  + (coefsP1 - coefsP ) * lerpP
//   negative side: coefsN1 + (coefsN  - coefsN1) * lerpP   (roles swapped)
// where "* lerpP" is vqrdmulh.s16, a saturating, rounding, doubling multiply
// that keeps the high half of the product.
//
// Parameters, as used below:
//   out        - passed to the assembly as memory operand out[0]; it is only
//                touched by ASSEMBLY_ACCUMULATE_MONO, a macro defined outside
//                this block (presumably volume-scale and accumulate - confirm).
//   count      - taps remaining per side; reduced by 8 each iteration and the
//                loop exits only when it reaches exactly 0, so callers are
//                expected to pass a positive multiple of 8.
//   coefsP/N   - first coefficient set for the positive/negative side.
//   coefsP1/N1 - second coefficient set for the positive/negative side.
//                All four are read with a :128 alignment qualifier (16-byte
//                aligned addresses expected).
//   sP         - positive-side samples, read walking backward in memory.
//   sN         - negative-side samples, read walking forward in memory.
//   lerpP      - interpolation factor; moved into d2[0] as 32 bits, but the
//                .s16 multiplies use scalar lane d2[0], i.e. its low 16 bits.
//   volumeLR   - handed to ASSEMBLY_ACCUMULATE_MONO as %[vLR].
645*ec779b8eSAndroid Build Coastguard Worker template <>
646*ec779b8eSAndroid Build Coastguard Worker inline void Process<1, 16>(int32_t* const out,
647*ec779b8eSAndroid Build Coastguard Worker int count,
648*ec779b8eSAndroid Build Coastguard Worker const int16_t* coefsP,
649*ec779b8eSAndroid Build Coastguard Worker const int16_t* coefsN,
650*ec779b8eSAndroid Build Coastguard Worker const int16_t* coefsP1,
651*ec779b8eSAndroid Build Coastguard Worker const int16_t* coefsN1,
652*ec779b8eSAndroid Build Coastguard Worker const int16_t* sP,
653*ec779b8eSAndroid Build Coastguard Worker const int16_t* sN,
654*ec779b8eSAndroid Build Coastguard Worker uint32_t lerpP,
655*ec779b8eSAndroid Build Coastguard Worker const int32_t* const volumeLR)
656*ec779b8eSAndroid Build Coastguard Worker {
657*ec779b8eSAndroid Build Coastguard Worker #ifdef USE_INTRINSIC
// Intrinsics build: delegate to the shared kernel in its interpolating form
// (third template argument false).
658*ec779b8eSAndroid Build Coastguard Worker ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
659*ec779b8eSAndroid Build Coastguard Worker lerpP, coefsP1, coefsN1);
660*ec779b8eSAndroid Build Coastguard Worker #else
661*ec779b8eSAndroid Build Coastguard Worker
662*ec779b8eSAndroid Build Coastguard Worker const int CHANNELS = 1; // template specialization does not preserve params
663*ec779b8eSAndroid Build Coastguard Worker const int STRIDE = 16;
// Back sP up by 7 samples so that the 8-sample load below ends on the sample
// sP pointed to on entry.
664*ec779b8eSAndroid Build Coastguard Worker sP -= CHANNELS*((STRIDE>>1)-1);
665*ec779b8eSAndroid Build Coastguard Worker asm (
666*ec779b8eSAndroid Build Coastguard Worker "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15
667*ec779b8eSAndroid Build Coastguard Worker "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
668*ec779b8eSAndroid Build Coastguard Worker
669*ec779b8eSAndroid Build Coastguard Worker "1: \n"
670*ec779b8eSAndroid Build Coastguard Worker
// sP is loaded without writeback (it is stepped backward manually at the end
// of the iteration); sN and all four coefficient pointers post-increment.
// Note the negative side loads coefsN1 into q10 (the base) and coefsN0 into
// q11 (the other endpoint), the opposite of the positive side.
671*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples
672*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples
673*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
674*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
675*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
676*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
677*ec779b8eSAndroid Build Coastguard Worker
// Step 1: difference between the two coefficient sets (q9 = P1 - P0,
// q11 = N0 - N1).
678*ec779b8eSAndroid Build Coastguard Worker "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs
679*ec779b8eSAndroid Build Coastguard Worker "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coefs
680*ec779b8eSAndroid Build Coastguard Worker
// Step 2: scale the difference by the interpolation factor in d2[0].
681*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
682*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
683*ec779b8eSAndroid Build Coastguard Worker
684*ec779b8eSAndroid Build Coastguard Worker "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
685*ec779b8eSAndroid Build Coastguard Worker
// Step 3: add the scaled difference back onto the base set; q8 and q10 now
// hold the interpolated positive and negative coefficients.
686*ec779b8eSAndroid Build Coastguard Worker "vadd.s16 q8, q8, q9 \n"// (1+2d) interpolate (step3) 1st set
687*ec779b8eSAndroid Build Coastguard Worker "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set
688*ec779b8eSAndroid Build Coastguard Worker
// vrev64 only reverses within each 64-bit half; pairing d4 with d17 and d5
// with d16 (halves crossed) completes the 8-element reversal, so the first
// positive coefficient multiplies the last sample loaded from sP.
// vmlal.s16 widens each 16x16 product and adds it into the 32-bit lanes of q0.
689*ec779b8eSAndroid Build Coastguard Worker // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
690*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply reversed samples by coef
691*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples by coef
692*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
693*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples
694*ec779b8eSAndroid Build Coastguard Worker
695*ec779b8eSAndroid Build Coastguard Worker // moving these ARM instructions before neon above seems to be slower
// "subs" sets the flags tested by "bne"; the plain "sub" in between leaves
// them untouched. #16 bytes = 8 int16_t samples.
696*ec779b8eSAndroid Build Coastguard Worker "subs %[count], %[count], #8 \n"// (1) update loop counter
697*ec779b8eSAndroid Build Coastguard Worker "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
698*ec779b8eSAndroid Build Coastguard Worker
699*ec779b8eSAndroid Build Coastguard Worker // sP used after branch (warning)
700*ec779b8eSAndroid Build Coastguard Worker "bne 1b \n"// loop
701*ec779b8eSAndroid Build Coastguard Worker
// Epilogue supplied by a macro defined outside this block; it is the only
// consumer of %[out] and %[vLR].
702*ec779b8eSAndroid Build Coastguard Worker ASSEMBLY_ACCUMULATE_MONO
703*ec779b8eSAndroid Build Coastguard Worker
// Operands: out[0] is a memory output; the counter and all six pointers are
// read-write because the loop modifies them; lerpP and volumeLR are read-only.
704*ec779b8eSAndroid Build Coastguard Worker : [out] "=Uv" (out[0]),
705*ec779b8eSAndroid Build Coastguard Worker [count] "+r" (count),
706*ec779b8eSAndroid Build Coastguard Worker [coefsP0] "+r" (coefsP),
707*ec779b8eSAndroid Build Coastguard Worker [coefsN0] "+r" (coefsN),
708*ec779b8eSAndroid Build Coastguard Worker [coefsP1] "+r" (coefsP1),
709*ec779b8eSAndroid Build Coastguard Worker [coefsN1] "+r" (coefsN1),
710*ec779b8eSAndroid Build Coastguard Worker [sP] "+r" (sP),
711*ec779b8eSAndroid Build Coastguard Worker [sN] "+r" (sN)
712*ec779b8eSAndroid Build Coastguard Worker : [lerpP] "r" (lerpP),
713*ec779b8eSAndroid Build Coastguard Worker [vLR] "r" (volumeLR)
// Clobbers: q1 covers d2, which holds the interpolation factor.
714*ec779b8eSAndroid Build Coastguard Worker : "cc", "memory",
715*ec779b8eSAndroid Build Coastguard Worker "q0", "q1", "q2", "q3",
716*ec779b8eSAndroid Build Coastguard Worker "q8", "q9", "q10", "q11"
717*ec779b8eSAndroid Build Coastguard Worker );
718*ec779b8eSAndroid Build Coastguard Worker #endif
719*ec779b8eSAndroid Build Coastguard Worker }
720*ec779b8eSAndroid Build Coastguard Worker
// Process<2, 16> for int16_t coefficients: interleaved stereo samples, with
// each coefficient linearly interpolated between two coefficient sets before
// use. The same interpolated coefficients are applied to both channels.
//
// Interpolation performed by the assembly (per 16-bit lane):
//   positive side: coefsP  + (coefsP1 - coefsP ) * lerpP
//   negative side: coefsN1 + (coefsN  - coefsN1) * lerpP   (roles swapped)
// where "* lerpP" is vqrdmulh.s16, a saturating, rounding, doubling multiply
// that keeps the high half of the product.
//
// Parameters, as used below:
//   out        - passed to the assembly as memory operand out[0]; it is only
//                touched by ASSEMBLY_ACCUMULATE_STEREO, a macro defined outside
//                this block (presumably volume-scale and accumulate - confirm).
//   count      - taps remaining per side; reduced by 8 each iteration and the
//                loop exits only when it reaches exactly 0, so callers are
//                expected to pass a positive multiple of 8.
//   coefsP/N   - first coefficient set for the positive/negative side.
//   coefsP1/N1 - second coefficient set for the positive/negative side.
//                All four are read with a :128 alignment qualifier (16-byte
//                aligned addresses expected).
//   sP         - positive-side stereo frames, read walking backward in memory.
//   sN         - negative-side stereo frames, read walking forward in memory.
//   lerpP      - interpolation factor; moved into d2[0] as 32 bits, but the
//                .s16 multiplies use scalar lane d2[0], i.e. its low 16 bits.
//   volumeLR   - handed to ASSEMBLY_ACCUMULATE_STEREO as %[vLR].
721*ec779b8eSAndroid Build Coastguard Worker template <>
722*ec779b8eSAndroid Build Coastguard Worker inline void Process<2, 16>(int32_t* const out,
723*ec779b8eSAndroid Build Coastguard Worker int count,
724*ec779b8eSAndroid Build Coastguard Worker const int16_t* coefsP,
725*ec779b8eSAndroid Build Coastguard Worker const int16_t* coefsN,
726*ec779b8eSAndroid Build Coastguard Worker const int16_t* coefsP1,
727*ec779b8eSAndroid Build Coastguard Worker const int16_t* coefsN1,
728*ec779b8eSAndroid Build Coastguard Worker const int16_t* sP,
729*ec779b8eSAndroid Build Coastguard Worker const int16_t* sN,
730*ec779b8eSAndroid Build Coastguard Worker uint32_t lerpP,
731*ec779b8eSAndroid Build Coastguard Worker const int32_t* const volumeLR)
732*ec779b8eSAndroid Build Coastguard Worker {
733*ec779b8eSAndroid Build Coastguard Worker #ifdef USE_INTRINSIC
// Intrinsics build: delegate to the shared kernel in its interpolating form
// (third template argument false).
734*ec779b8eSAndroid Build Coastguard Worker ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
735*ec779b8eSAndroid Build Coastguard Worker lerpP, coefsP1, coefsN1);
736*ec779b8eSAndroid Build Coastguard Worker #else
737*ec779b8eSAndroid Build Coastguard Worker const int CHANNELS = 2; // template specialization does not preserve params
738*ec779b8eSAndroid Build Coastguard Worker const int STRIDE = 16;
// Back sP up by 14 int16_t values (7 stereo frames) so that the 8-frame load
// below ends on the frame sP pointed to on entry.
739*ec779b8eSAndroid Build Coastguard Worker sP -= CHANNELS*((STRIDE>>1)-1);
740*ec779b8eSAndroid Build Coastguard Worker asm (
741*ec779b8eSAndroid Build Coastguard Worker "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
742*ec779b8eSAndroid Build Coastguard Worker "veor q0, q0, q0 \n"// (1) acc_L = 0
743*ec779b8eSAndroid Build Coastguard Worker "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
744*ec779b8eSAndroid Build Coastguard Worker
745*ec779b8eSAndroid Build Coastguard Worker "1: \n"
746*ec779b8eSAndroid Build Coastguard Worker
// vld2.16 de-interleaves: even-indexed values land in the first register of
// each pair (q2, q5) and odd-indexed values in the second (q3, q6).
// sP is loaded without writeback (it is stepped backward manually at the end
// of the iteration); sN and all four coefficient pointers post-increment.
// Note the negative side loads coefsN1 into q10 (the base) and coefsN0 into
// q11 (the other endpoint), the opposite of the positive side.
747*ec779b8eSAndroid Build Coastguard Worker "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo frames
748*ec779b8eSAndroid Build Coastguard Worker "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo frames
749*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
750*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
751*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
752*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
753*ec779b8eSAndroid Build Coastguard Worker
// Step 1: difference between the two coefficient sets (q9 = P1 - P0,
// q11 = N0 - N1).
754*ec779b8eSAndroid Build Coastguard Worker "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs
755*ec779b8eSAndroid Build Coastguard Worker "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coefs
756*ec779b8eSAndroid Build Coastguard Worker
// Step 2: scale the difference by the interpolation factor in d2[0].
757*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
758*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
759*ec779b8eSAndroid Build Coastguard Worker
760*ec779b8eSAndroid Build Coastguard Worker "vrev64.16 q2, q2 \n"// (1) reverse 8 samples of positive left
761*ec779b8eSAndroid Build Coastguard Worker "vrev64.16 q3, q3 \n"// (1) reverse 8 samples of positive right
762*ec779b8eSAndroid Build Coastguard Worker
// Step 3: add the scaled difference back onto the base set; q8 and q10 now
// hold the interpolated positive and negative coefficients.
763*ec779b8eSAndroid Build Coastguard Worker "vadd.s16 q8, q8, q9 \n"// (1+1d) interpolate (step3) 1st set
764*ec779b8eSAndroid Build Coastguard Worker "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set
765*ec779b8eSAndroid Build Coastguard Worker
// vrev64 only reverses within each 64-bit half; pairing the low sample half
// with d17 and the high half with d16 (halves crossed) completes the reversal,
// so the first positive coefficient multiplies the last frame loaded from sP.
// vmlal.s16 widens each 16x16 product into the 32-bit lanes of q0 (L) / q4 (R).
766*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q0, d4, d17 \n"// (1) multiply reversed samples left
767*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples left
768*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q4, d6, d17 \n"// (1) multiply reversed samples right
769*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q4, d7, d16 \n"// (1) multiply reversed samples right
770*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left
771*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left
772*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right
773*ec779b8eSAndroid Build Coastguard Worker "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right
774*ec779b8eSAndroid Build Coastguard Worker
775*ec779b8eSAndroid Build Coastguard Worker // moving these ARM before neon seems to be slower
// "subs" sets the flags tested by "bne"; the plain "sub" in between leaves
// them untouched. #32 bytes = 8 stereo frames of int16_t.
776*ec779b8eSAndroid Build Coastguard Worker "subs %[count], %[count], #8 \n"// (1) update loop counter
777*ec779b8eSAndroid Build Coastguard Worker "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples
778*ec779b8eSAndroid Build Coastguard Worker
779*ec779b8eSAndroid Build Coastguard Worker // sP used after branch (warning)
780*ec779b8eSAndroid Build Coastguard Worker "bne 1b \n"// loop
781*ec779b8eSAndroid Build Coastguard Worker
// Epilogue supplied by a macro defined outside this block; it is the only
// consumer of %[out] and %[vLR].
782*ec779b8eSAndroid Build Coastguard Worker ASSEMBLY_ACCUMULATE_STEREO
783*ec779b8eSAndroid Build Coastguard Worker
// Operands: out[0] is a memory output; the counter and all six pointers are
// read-write because the loop modifies them; lerpP and volumeLR are read-only.
784*ec779b8eSAndroid Build Coastguard Worker : [out] "=Uv" (out[0]),
785*ec779b8eSAndroid Build Coastguard Worker [count] "+r" (count),
786*ec779b8eSAndroid Build Coastguard Worker [coefsP0] "+r" (coefsP),
787*ec779b8eSAndroid Build Coastguard Worker [coefsN0] "+r" (coefsN),
788*ec779b8eSAndroid Build Coastguard Worker [coefsP1] "+r" (coefsP1),
789*ec779b8eSAndroid Build Coastguard Worker [coefsN1] "+r" (coefsN1),
790*ec779b8eSAndroid Build Coastguard Worker [sP] "+r" (sP),
791*ec779b8eSAndroid Build Coastguard Worker [sN] "+r" (sN)
792*ec779b8eSAndroid Build Coastguard Worker : [lerpP] "r" (lerpP),
793*ec779b8eSAndroid Build Coastguard Worker [vLR] "r" (volumeLR)
// Clobbers: q1 covers d2, which holds the interpolation factor.
794*ec779b8eSAndroid Build Coastguard Worker : "cc", "memory",
795*ec779b8eSAndroid Build Coastguard Worker "q0", "q1", "q2", "q3",
796*ec779b8eSAndroid Build Coastguard Worker "q4", "q5", "q6",
797*ec779b8eSAndroid Build Coastguard Worker "q8", "q9", "q10", "q11"
798*ec779b8eSAndroid Build Coastguard Worker );
799*ec779b8eSAndroid Build Coastguard Worker #endif
800*ec779b8eSAndroid Build Coastguard Worker }
801*ec779b8eSAndroid Build Coastguard Worker
802*ec779b8eSAndroid Build Coastguard Worker template <>
803*ec779b8eSAndroid Build Coastguard Worker inline void ProcessL<1, 16>(int32_t* const out,
804*ec779b8eSAndroid Build Coastguard Worker int count,
805*ec779b8eSAndroid Build Coastguard Worker const int32_t* coefsP,
806*ec779b8eSAndroid Build Coastguard Worker const int32_t* coefsN,
807*ec779b8eSAndroid Build Coastguard Worker const int16_t* sP,
808*ec779b8eSAndroid Build Coastguard Worker const int16_t* sN,
809*ec779b8eSAndroid Build Coastguard Worker const int32_t* const volumeLR)
810*ec779b8eSAndroid Build Coastguard Worker {
811*ec779b8eSAndroid Build Coastguard Worker #ifdef USE_INTRINSIC
812*ec779b8eSAndroid Build Coastguard Worker ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
813*ec779b8eSAndroid Build Coastguard Worker 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
814*ec779b8eSAndroid Build Coastguard Worker #else
815*ec779b8eSAndroid Build Coastguard Worker const int CHANNELS = 1; // template specialization does not preserve params
816*ec779b8eSAndroid Build Coastguard Worker const int STRIDE = 16;
817*ec779b8eSAndroid Build Coastguard Worker sP -= CHANNELS*((STRIDE>>1)-1);
818*ec779b8eSAndroid Build Coastguard Worker asm (
819*ec779b8eSAndroid Build Coastguard Worker "veor q0, q0, q0 \n"// result, initialize to 0
820*ec779b8eSAndroid Build Coastguard Worker
821*ec779b8eSAndroid Build Coastguard Worker "1: \n"
822*ec779b8eSAndroid Build Coastguard Worker
823*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples
824*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples
825*ec779b8eSAndroid Build Coastguard Worker "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
826*ec779b8eSAndroid Build Coastguard Worker "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
827*ec779b8eSAndroid Build Coastguard Worker
828*ec779b8eSAndroid Build Coastguard Worker "vrev64.16 q2, q2 \n"// reverse 8 samples of the positive side
829*ec779b8eSAndroid Build Coastguard Worker
830*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
831*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
832*ec779b8eSAndroid Build Coastguard Worker
833*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
834*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
835*ec779b8eSAndroid Build Coastguard Worker
836*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples
837*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples
838*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples
839*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples
840*ec779b8eSAndroid Build Coastguard Worker
841*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q0, q0, q12 \n"// accumulate result
842*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q13, q13, q14 \n"// accumulate result
843*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q0, q0, q15 \n"// accumulate result
844*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q0, q0, q13 \n"// accumulate result
845*ec779b8eSAndroid Build Coastguard Worker
846*ec779b8eSAndroid Build Coastguard Worker "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
847*ec779b8eSAndroid Build Coastguard Worker "subs %[count], %[count], #8 \n"// update loop counter
848*ec779b8eSAndroid Build Coastguard Worker
849*ec779b8eSAndroid Build Coastguard Worker "bne 1b \n"// loop
850*ec779b8eSAndroid Build Coastguard Worker
851*ec779b8eSAndroid Build Coastguard Worker ASSEMBLY_ACCUMULATE_MONO
852*ec779b8eSAndroid Build Coastguard Worker
853*ec779b8eSAndroid Build Coastguard Worker : [out] "=Uv" (out[0]),
854*ec779b8eSAndroid Build Coastguard Worker [count] "+r" (count),
855*ec779b8eSAndroid Build Coastguard Worker [coefsP0] "+r" (coefsP),
856*ec779b8eSAndroid Build Coastguard Worker [coefsN0] "+r" (coefsN),
857*ec779b8eSAndroid Build Coastguard Worker [sP] "+r" (sP),
858*ec779b8eSAndroid Build Coastguard Worker [sN] "+r" (sN)
859*ec779b8eSAndroid Build Coastguard Worker : [vLR] "r" (volumeLR)
860*ec779b8eSAndroid Build Coastguard Worker : "cc", "memory",
861*ec779b8eSAndroid Build Coastguard Worker "q0", "q1", "q2", "q3",
862*ec779b8eSAndroid Build Coastguard Worker "q8", "q9", "q10", "q11",
863*ec779b8eSAndroid Build Coastguard Worker "q12", "q13", "q14", "q15"
864*ec779b8eSAndroid Build Coastguard Worker );
865*ec779b8eSAndroid Build Coastguard Worker #endif
866*ec779b8eSAndroid Build Coastguard Worker }
867*ec779b8eSAndroid Build Coastguard Worker
868*ec779b8eSAndroid Build Coastguard Worker template <>
869*ec779b8eSAndroid Build Coastguard Worker inline void ProcessL<2, 16>(int32_t* const out,
870*ec779b8eSAndroid Build Coastguard Worker int count,
871*ec779b8eSAndroid Build Coastguard Worker const int32_t* coefsP,
872*ec779b8eSAndroid Build Coastguard Worker const int32_t* coefsN,
873*ec779b8eSAndroid Build Coastguard Worker const int16_t* sP,
874*ec779b8eSAndroid Build Coastguard Worker const int16_t* sN,
875*ec779b8eSAndroid Build Coastguard Worker const int32_t* const volumeLR)
876*ec779b8eSAndroid Build Coastguard Worker {
877*ec779b8eSAndroid Build Coastguard Worker #ifdef USE_INTRINSIC
878*ec779b8eSAndroid Build Coastguard Worker ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
879*ec779b8eSAndroid Build Coastguard Worker 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
880*ec779b8eSAndroid Build Coastguard Worker #else
881*ec779b8eSAndroid Build Coastguard Worker const int CHANNELS = 2; // template specialization does not preserve params
882*ec779b8eSAndroid Build Coastguard Worker const int STRIDE = 16;
883*ec779b8eSAndroid Build Coastguard Worker sP -= CHANNELS*((STRIDE>>1)-1);
884*ec779b8eSAndroid Build Coastguard Worker asm (
885*ec779b8eSAndroid Build Coastguard Worker "veor q0, q0, q0 \n"// result, initialize to 0
886*ec779b8eSAndroid Build Coastguard Worker "veor q4, q4, q4 \n"// result, initialize to 0
887*ec779b8eSAndroid Build Coastguard Worker
888*ec779b8eSAndroid Build Coastguard Worker "1: \n"
889*ec779b8eSAndroid Build Coastguard Worker
890*ec779b8eSAndroid Build Coastguard Worker "vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo frames
891*ec779b8eSAndroid Build Coastguard Worker "vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo frames
892*ec779b8eSAndroid Build Coastguard Worker "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
893*ec779b8eSAndroid Build Coastguard Worker "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
894*ec779b8eSAndroid Build Coastguard Worker
895*ec779b8eSAndroid Build Coastguard Worker "vrev64.16 q2, q2 \n"// reverse 8 samples of positive left
896*ec779b8eSAndroid Build Coastguard Worker "vrev64.16 q3, q3 \n"// reverse 8 samples of positive right
897*ec779b8eSAndroid Build Coastguard Worker
898*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
899*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
900*ec779b8eSAndroid Build Coastguard Worker
901*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits
902*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits
903*ec779b8eSAndroid Build Coastguard Worker
904*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef
905*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
906*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
907*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef
908*ec779b8eSAndroid Build Coastguard Worker
909*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q0, q0, q12 \n"// accumulate result
910*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q13, q13, q14 \n"// accumulate result
911*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q0, q0, q15 \n"// accumulate result
912*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q0, q0, q13 \n"// accumulate result
913*ec779b8eSAndroid Build Coastguard Worker
914*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits
915*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits
916*ec779b8eSAndroid Build Coastguard Worker
917*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits
918*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits
919*ec779b8eSAndroid Build Coastguard Worker
920*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef
921*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
922*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
923*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef
924*ec779b8eSAndroid Build Coastguard Worker
925*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q4, q4, q12 \n"// accumulate result
926*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q13, q13, q14 \n"// accumulate result
927*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q4, q4, q15 \n"// accumulate result
928*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q4, q4, q13 \n"// accumulate result
929*ec779b8eSAndroid Build Coastguard Worker
930*ec779b8eSAndroid Build Coastguard Worker "subs %[count], %[count], #8 \n"// update loop counter
931*ec779b8eSAndroid Build Coastguard Worker "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples
932*ec779b8eSAndroid Build Coastguard Worker
933*ec779b8eSAndroid Build Coastguard Worker "bne 1b \n"// loop
934*ec779b8eSAndroid Build Coastguard Worker
935*ec779b8eSAndroid Build Coastguard Worker ASSEMBLY_ACCUMULATE_STEREO
936*ec779b8eSAndroid Build Coastguard Worker
937*ec779b8eSAndroid Build Coastguard Worker : [out] "=Uv" (out[0]),
938*ec779b8eSAndroid Build Coastguard Worker [count] "+r" (count),
939*ec779b8eSAndroid Build Coastguard Worker [coefsP0] "+r" (coefsP),
940*ec779b8eSAndroid Build Coastguard Worker [coefsN0] "+r" (coefsN),
941*ec779b8eSAndroid Build Coastguard Worker [sP] "+r" (sP),
942*ec779b8eSAndroid Build Coastguard Worker [sN] "+r" (sN)
943*ec779b8eSAndroid Build Coastguard Worker : [vLR] "r" (volumeLR)
944*ec779b8eSAndroid Build Coastguard Worker : "cc", "memory",
945*ec779b8eSAndroid Build Coastguard Worker "q0", "q1", "q2", "q3",
946*ec779b8eSAndroid Build Coastguard Worker "q4", "q5", "q6",
947*ec779b8eSAndroid Build Coastguard Worker "q8", "q9", "q10", "q11",
948*ec779b8eSAndroid Build Coastguard Worker "q12", "q13", "q14", "q15"
949*ec779b8eSAndroid Build Coastguard Worker );
950*ec779b8eSAndroid Build Coastguard Worker #endif
951*ec779b8eSAndroid Build Coastguard Worker }
952*ec779b8eSAndroid Build Coastguard Worker
953*ec779b8eSAndroid Build Coastguard Worker template <>
954*ec779b8eSAndroid Build Coastguard Worker inline void Process<1, 16>(int32_t* const out,
955*ec779b8eSAndroid Build Coastguard Worker int count,
956*ec779b8eSAndroid Build Coastguard Worker const int32_t* coefsP,
957*ec779b8eSAndroid Build Coastguard Worker const int32_t* coefsN,
958*ec779b8eSAndroid Build Coastguard Worker const int32_t* coefsP1,
959*ec779b8eSAndroid Build Coastguard Worker const int32_t* coefsN1,
960*ec779b8eSAndroid Build Coastguard Worker const int16_t* sP,
961*ec779b8eSAndroid Build Coastguard Worker const int16_t* sN,
962*ec779b8eSAndroid Build Coastguard Worker uint32_t lerpP,
963*ec779b8eSAndroid Build Coastguard Worker const int32_t* const volumeLR)
964*ec779b8eSAndroid Build Coastguard Worker {
965*ec779b8eSAndroid Build Coastguard Worker #ifdef USE_INTRINSIC
966*ec779b8eSAndroid Build Coastguard Worker ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
967*ec779b8eSAndroid Build Coastguard Worker lerpP, coefsP1, coefsN1);
968*ec779b8eSAndroid Build Coastguard Worker #else
969*ec779b8eSAndroid Build Coastguard Worker const int CHANNELS = 1; // template specialization does not preserve params
970*ec779b8eSAndroid Build Coastguard Worker const int STRIDE = 16;
971*ec779b8eSAndroid Build Coastguard Worker sP -= CHANNELS*((STRIDE>>1)-1);
972*ec779b8eSAndroid Build Coastguard Worker asm (
973*ec779b8eSAndroid Build Coastguard Worker "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
974*ec779b8eSAndroid Build Coastguard Worker "veor q0, q0, q0 \n"// result, initialize to 0
975*ec779b8eSAndroid Build Coastguard Worker
976*ec779b8eSAndroid Build Coastguard Worker "1: \n"
977*ec779b8eSAndroid Build Coastguard Worker
978*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples
979*ec779b8eSAndroid Build Coastguard Worker "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples
980*ec779b8eSAndroid Build Coastguard Worker "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
981*ec779b8eSAndroid Build Coastguard Worker "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
982*ec779b8eSAndroid Build Coastguard Worker "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
983*ec779b8eSAndroid Build Coastguard Worker "vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
984*ec779b8eSAndroid Build Coastguard Worker
985*ec779b8eSAndroid Build Coastguard Worker "vsub.s32 q12, q12, q8 \n"// interpolate (step1)
986*ec779b8eSAndroid Build Coastguard Worker "vsub.s32 q13, q13, q9 \n"// interpolate (step1)
987*ec779b8eSAndroid Build Coastguard Worker "vsub.s32 q14, q14, q10 \n"// interpolate (step1)
988*ec779b8eSAndroid Build Coastguard Worker "vsub.s32 q15, q15, q11 \n"// interpolate (step1)
989*ec779b8eSAndroid Build Coastguard Worker
990*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2)
991*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2)
992*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2)
993*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2)
994*ec779b8eSAndroid Build Coastguard Worker
995*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q8, q8, q12 \n"// interpolate (step3)
996*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q9, q9, q13 \n"// interpolate (step3)
997*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q10, q10, q14 \n"// interpolate (step3)
998*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q11, q11, q15 \n"// interpolate (step3)
999*ec779b8eSAndroid Build Coastguard Worker
1000*ec779b8eSAndroid Build Coastguard Worker "vrev64.16 q2, q2 \n"// reverse 8 samples of the positive side
1001*ec779b8eSAndroid Build Coastguard Worker
1002*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
1003*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
1004*ec779b8eSAndroid Build Coastguard Worker
1005*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
1006*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
1007*ec779b8eSAndroid Build Coastguard Worker
1008*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
1009*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
1010*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
1011*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
1012*ec779b8eSAndroid Build Coastguard Worker
1013*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q0, q0, q12 \n"// accumulate result
1014*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q13, q13, q14 \n"// accumulate result
1015*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q0, q0, q15 \n"// accumulate result
1016*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q0, q0, q13 \n"// accumulate result
1017*ec779b8eSAndroid Build Coastguard Worker
1018*ec779b8eSAndroid Build Coastguard Worker "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
1019*ec779b8eSAndroid Build Coastguard Worker "subs %[count], %[count], #8 \n"// update loop counter
1020*ec779b8eSAndroid Build Coastguard Worker
1021*ec779b8eSAndroid Build Coastguard Worker "bne 1b \n"// loop
1022*ec779b8eSAndroid Build Coastguard Worker
1023*ec779b8eSAndroid Build Coastguard Worker ASSEMBLY_ACCUMULATE_MONO
1024*ec779b8eSAndroid Build Coastguard Worker
1025*ec779b8eSAndroid Build Coastguard Worker : [out] "=Uv" (out[0]),
1026*ec779b8eSAndroid Build Coastguard Worker [count] "+r" (count),
1027*ec779b8eSAndroid Build Coastguard Worker [coefsP0] "+r" (coefsP),
1028*ec779b8eSAndroid Build Coastguard Worker [coefsN0] "+r" (coefsN),
1029*ec779b8eSAndroid Build Coastguard Worker [coefsP1] "+r" (coefsP1),
1030*ec779b8eSAndroid Build Coastguard Worker [coefsN1] "+r" (coefsN1),
1031*ec779b8eSAndroid Build Coastguard Worker [sP] "+r" (sP),
1032*ec779b8eSAndroid Build Coastguard Worker [sN] "+r" (sN)
1033*ec779b8eSAndroid Build Coastguard Worker : [lerpP] "r" (lerpP),
1034*ec779b8eSAndroid Build Coastguard Worker [vLR] "r" (volumeLR)
1035*ec779b8eSAndroid Build Coastguard Worker : "cc", "memory",
1036*ec779b8eSAndroid Build Coastguard Worker "q0", "q1", "q2", "q3",
1037*ec779b8eSAndroid Build Coastguard Worker "q8", "q9", "q10", "q11",
1038*ec779b8eSAndroid Build Coastguard Worker "q12", "q13", "q14", "q15"
1039*ec779b8eSAndroid Build Coastguard Worker );
1040*ec779b8eSAndroid Build Coastguard Worker #endif
1041*ec779b8eSAndroid Build Coastguard Worker }
1042*ec779b8eSAndroid Build Coastguard Worker
1043*ec779b8eSAndroid Build Coastguard Worker template <>
1044*ec779b8eSAndroid Build Coastguard Worker inline void Process<2, 16>(int32_t* const out,
1045*ec779b8eSAndroid Build Coastguard Worker int count,
1046*ec779b8eSAndroid Build Coastguard Worker const int32_t* coefsP,
1047*ec779b8eSAndroid Build Coastguard Worker const int32_t* coefsN,
1048*ec779b8eSAndroid Build Coastguard Worker const int32_t* coefsP1,
1049*ec779b8eSAndroid Build Coastguard Worker const int32_t* coefsN1,
1050*ec779b8eSAndroid Build Coastguard Worker const int16_t* sP,
1051*ec779b8eSAndroid Build Coastguard Worker const int16_t* sN,
1052*ec779b8eSAndroid Build Coastguard Worker uint32_t lerpP,
1053*ec779b8eSAndroid Build Coastguard Worker const int32_t* const volumeLR)
1054*ec779b8eSAndroid Build Coastguard Worker {
1055*ec779b8eSAndroid Build Coastguard Worker #ifdef USE_INTRINSIC
1056*ec779b8eSAndroid Build Coastguard Worker ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
1057*ec779b8eSAndroid Build Coastguard Worker lerpP, coefsP1, coefsN1);
1058*ec779b8eSAndroid Build Coastguard Worker #else
1059*ec779b8eSAndroid Build Coastguard Worker const int CHANNELS = 2; // template specialization does not preserve params
1060*ec779b8eSAndroid Build Coastguard Worker const int STRIDE = 16;
1061*ec779b8eSAndroid Build Coastguard Worker sP -= CHANNELS*((STRIDE>>1)-1);
1062*ec779b8eSAndroid Build Coastguard Worker asm (
1063*ec779b8eSAndroid Build Coastguard Worker "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
1064*ec779b8eSAndroid Build Coastguard Worker "veor q0, q0, q0 \n"// result, initialize to 0
1065*ec779b8eSAndroid Build Coastguard Worker "veor q4, q4, q4 \n"// result, initialize to 0
1066*ec779b8eSAndroid Build Coastguard Worker
1067*ec779b8eSAndroid Build Coastguard Worker "1: \n"
1068*ec779b8eSAndroid Build Coastguard Worker
1069*ec779b8eSAndroid Build Coastguard Worker "vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo frames
1070*ec779b8eSAndroid Build Coastguard Worker "vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo frames
1071*ec779b8eSAndroid Build Coastguard Worker "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
1072*ec779b8eSAndroid Build Coastguard Worker "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
1073*ec779b8eSAndroid Build Coastguard Worker "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
1074*ec779b8eSAndroid Build Coastguard Worker "vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
1075*ec779b8eSAndroid Build Coastguard Worker
1076*ec779b8eSAndroid Build Coastguard Worker "vsub.s32 q12, q12, q8 \n"// interpolate (step1)
1077*ec779b8eSAndroid Build Coastguard Worker "vsub.s32 q13, q13, q9 \n"// interpolate (step1)
1078*ec779b8eSAndroid Build Coastguard Worker "vsub.s32 q14, q14, q10 \n"// interpolate (step1)
1079*ec779b8eSAndroid Build Coastguard Worker "vsub.s32 q15, q15, q11 \n"// interpolate (step1)
1080*ec779b8eSAndroid Build Coastguard Worker
1081*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2)
1082*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2)
1083*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2)
1084*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2)
1085*ec779b8eSAndroid Build Coastguard Worker
1086*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q8, q8, q12 \n"// interpolate (step3)
1087*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q9, q9, q13 \n"// interpolate (step3)
1088*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q10, q10, q14 \n"// interpolate (step3)
1089*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q11, q11, q15 \n"// interpolate (step3)
1090*ec779b8eSAndroid Build Coastguard Worker
1091*ec779b8eSAndroid Build Coastguard Worker "vrev64.16 q2, q2 \n"// reverse 8 samples of positive left
1092*ec779b8eSAndroid Build Coastguard Worker "vrev64.16 q3, q3 \n"// reverse 8 samples of positive right
1093*ec779b8eSAndroid Build Coastguard Worker
1094*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
1095*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
1096*ec779b8eSAndroid Build Coastguard Worker
1097*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits
1098*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits
1099*ec779b8eSAndroid Build Coastguard Worker
1100*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
1101*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
1102*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
1103*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
1104*ec779b8eSAndroid Build Coastguard Worker
1105*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q0, q0, q12 \n"// accumulate result
1106*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q13, q13, q14 \n"// accumulate result
1107*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q0, q0, q15 \n"// accumulate result
1108*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q0, q0, q13 \n"// accumulate result
1109*ec779b8eSAndroid Build Coastguard Worker
1110*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits
1111*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits
1112*ec779b8eSAndroid Build Coastguard Worker
1113*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits
1114*ec779b8eSAndroid Build Coastguard Worker "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits
1115*ec779b8eSAndroid Build Coastguard Worker
1116*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
1117*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
1118*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
1119*ec779b8eSAndroid Build Coastguard Worker "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
1120*ec779b8eSAndroid Build Coastguard Worker
1121*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q4, q4, q12 \n"// accumulate result
1122*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q13, q13, q14 \n"// accumulate result
1123*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q4, q4, q15 \n"// accumulate result
1124*ec779b8eSAndroid Build Coastguard Worker "vadd.s32 q4, q4, q13 \n"// accumulate result
1125*ec779b8eSAndroid Build Coastguard Worker
1126*ec779b8eSAndroid Build Coastguard Worker "subs %[count], %[count], #8 \n"// update loop counter
1127*ec779b8eSAndroid Build Coastguard Worker "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples
1128*ec779b8eSAndroid Build Coastguard Worker
1129*ec779b8eSAndroid Build Coastguard Worker "bne 1b \n"// loop
1130*ec779b8eSAndroid Build Coastguard Worker
1131*ec779b8eSAndroid Build Coastguard Worker ASSEMBLY_ACCUMULATE_STEREO
1132*ec779b8eSAndroid Build Coastguard Worker
1133*ec779b8eSAndroid Build Coastguard Worker : [out] "=Uv" (out[0]),
1134*ec779b8eSAndroid Build Coastguard Worker [count] "+r" (count),
1135*ec779b8eSAndroid Build Coastguard Worker [coefsP0] "+r" (coefsP),
1136*ec779b8eSAndroid Build Coastguard Worker [coefsN0] "+r" (coefsN),
1137*ec779b8eSAndroid Build Coastguard Worker [coefsP1] "+r" (coefsP1),
1138*ec779b8eSAndroid Build Coastguard Worker [coefsN1] "+r" (coefsN1),
1139*ec779b8eSAndroid Build Coastguard Worker [sP] "+r" (sP),
1140*ec779b8eSAndroid Build Coastguard Worker [sN] "+r" (sN)
1141*ec779b8eSAndroid Build Coastguard Worker : [lerpP] "r" (lerpP),
1142*ec779b8eSAndroid Build Coastguard Worker [vLR] "r" (volumeLR)
1143*ec779b8eSAndroid Build Coastguard Worker : "cc", "memory",
1144*ec779b8eSAndroid Build Coastguard Worker "q0", "q1", "q2", "q3",
1145*ec779b8eSAndroid Build Coastguard Worker "q4", "q5", "q6",
1146*ec779b8eSAndroid Build Coastguard Worker "q8", "q9", "q10", "q11",
1147*ec779b8eSAndroid Build Coastguard Worker "q12", "q13", "q14", "q15"
1148*ec779b8eSAndroid Build Coastguard Worker );
1149*ec779b8eSAndroid Build Coastguard Worker #endif
1150*ec779b8eSAndroid Build Coastguard Worker }
1151*ec779b8eSAndroid Build Coastguard Worker
1152*ec779b8eSAndroid Build Coastguard Worker template<>
1153*ec779b8eSAndroid Build Coastguard Worker inline void ProcessL<1, 16>(float* const out,
1154*ec779b8eSAndroid Build Coastguard Worker int count,
1155*ec779b8eSAndroid Build Coastguard Worker const float* coefsP,
1156*ec779b8eSAndroid Build Coastguard Worker const float* coefsN,
1157*ec779b8eSAndroid Build Coastguard Worker const float* sP,
1158*ec779b8eSAndroid Build Coastguard Worker const float* sN,
1159*ec779b8eSAndroid Build Coastguard Worker const float* const volumeLR)
1160*ec779b8eSAndroid Build Coastguard Worker {
1161*ec779b8eSAndroid Build Coastguard Worker ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
1162*ec779b8eSAndroid Build Coastguard Worker 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
1163*ec779b8eSAndroid Build Coastguard Worker }
1164*ec779b8eSAndroid Build Coastguard Worker
1165*ec779b8eSAndroid Build Coastguard Worker template<>
1166*ec779b8eSAndroid Build Coastguard Worker inline void ProcessL<2, 16>(float* const out,
1167*ec779b8eSAndroid Build Coastguard Worker int count,
1168*ec779b8eSAndroid Build Coastguard Worker const float* coefsP,
1169*ec779b8eSAndroid Build Coastguard Worker const float* coefsN,
1170*ec779b8eSAndroid Build Coastguard Worker const float* sP,
1171*ec779b8eSAndroid Build Coastguard Worker const float* sN,
1172*ec779b8eSAndroid Build Coastguard Worker const float* const volumeLR)
1173*ec779b8eSAndroid Build Coastguard Worker {
1174*ec779b8eSAndroid Build Coastguard Worker ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
1175*ec779b8eSAndroid Build Coastguard Worker 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
1176*ec779b8eSAndroid Build Coastguard Worker }
1177*ec779b8eSAndroid Build Coastguard Worker
1178*ec779b8eSAndroid Build Coastguard Worker template<>
1179*ec779b8eSAndroid Build Coastguard Worker inline void Process<1, 16>(float* const out,
1180*ec779b8eSAndroid Build Coastguard Worker int count,
1181*ec779b8eSAndroid Build Coastguard Worker const float* coefsP,
1182*ec779b8eSAndroid Build Coastguard Worker const float* coefsN,
1183*ec779b8eSAndroid Build Coastguard Worker const float* coefsP1,
1184*ec779b8eSAndroid Build Coastguard Worker const float* coefsN1,
1185*ec779b8eSAndroid Build Coastguard Worker const float* sP,
1186*ec779b8eSAndroid Build Coastguard Worker const float* sN,
1187*ec779b8eSAndroid Build Coastguard Worker float lerpP,
1188*ec779b8eSAndroid Build Coastguard Worker const float* const volumeLR)
1189*ec779b8eSAndroid Build Coastguard Worker {
1190*ec779b8eSAndroid Build Coastguard Worker ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
1191*ec779b8eSAndroid Build Coastguard Worker lerpP, coefsP1, coefsN1);
1192*ec779b8eSAndroid Build Coastguard Worker }
1193*ec779b8eSAndroid Build Coastguard Worker
1194*ec779b8eSAndroid Build Coastguard Worker template<>
1195*ec779b8eSAndroid Build Coastguard Worker inline void Process<2, 16>(float* const out,
1196*ec779b8eSAndroid Build Coastguard Worker int count,
1197*ec779b8eSAndroid Build Coastguard Worker const float* coefsP,
1198*ec779b8eSAndroid Build Coastguard Worker const float* coefsN,
1199*ec779b8eSAndroid Build Coastguard Worker const float* coefsP1,
1200*ec779b8eSAndroid Build Coastguard Worker const float* coefsN1,
1201*ec779b8eSAndroid Build Coastguard Worker const float* sP,
1202*ec779b8eSAndroid Build Coastguard Worker const float* sN,
1203*ec779b8eSAndroid Build Coastguard Worker float lerpP,
1204*ec779b8eSAndroid Build Coastguard Worker const float* const volumeLR)
1205*ec779b8eSAndroid Build Coastguard Worker {
1206*ec779b8eSAndroid Build Coastguard Worker ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
1207*ec779b8eSAndroid Build Coastguard Worker lerpP, coefsP1, coefsN1);
1208*ec779b8eSAndroid Build Coastguard Worker }
1209*ec779b8eSAndroid Build Coastguard Worker
1210*ec779b8eSAndroid Build Coastguard Worker #endif //USE_NEON
1211*ec779b8eSAndroid Build Coastguard Worker
1212*ec779b8eSAndroid Build Coastguard Worker } // namespace android
1213*ec779b8eSAndroid Build Coastguard Worker
1214*ec779b8eSAndroid Build Coastguard Worker #endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/
1215