1*ec779b8eSAndroid Build Coastguard Worker /*
2*ec779b8eSAndroid Build Coastguard Worker * Copyright (C) 2016 The Android Open Source Project
3*ec779b8eSAndroid Build Coastguard Worker *
4*ec779b8eSAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License");
5*ec779b8eSAndroid Build Coastguard Worker * you may not use this file except in compliance with the License.
6*ec779b8eSAndroid Build Coastguard Worker * You may obtain a copy of the License at
7*ec779b8eSAndroid Build Coastguard Worker *
8*ec779b8eSAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0
9*ec779b8eSAndroid Build Coastguard Worker *
10*ec779b8eSAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software
11*ec779b8eSAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS,
12*ec779b8eSAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*ec779b8eSAndroid Build Coastguard Worker * See the License for the specific language governing permissions and
14*ec779b8eSAndroid Build Coastguard Worker * limitations under the License.
15*ec779b8eSAndroid Build Coastguard Worker */
16*ec779b8eSAndroid Build Coastguard Worker
17*ec779b8eSAndroid Build Coastguard Worker #ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
18*ec779b8eSAndroid Build Coastguard Worker #define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
19*ec779b8eSAndroid Build Coastguard Worker
20*ec779b8eSAndroid Build Coastguard Worker namespace android {
21*ec779b8eSAndroid Build Coastguard Worker
22*ec779b8eSAndroid Build Coastguard Worker // depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
23*ec779b8eSAndroid Build Coastguard Worker
24*ec779b8eSAndroid Build Coastguard Worker #if USE_SSE
25*ec779b8eSAndroid Build Coastguard Worker
26*ec779b8eSAndroid Build Coastguard Worker #define TO_STRING2(x) #x
27*ec779b8eSAndroid Build Coastguard Worker #define TO_STRING(x) TO_STRING2(x)
28*ec779b8eSAndroid Build Coastguard Worker // uncomment to print GCC version, may be relevant for intrinsic optimizations
29*ec779b8eSAndroid Build Coastguard Worker /* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
30*ec779b8eSAndroid Build Coastguard Worker "." TO_STRING(__GNUC_MINOR__) \
31*ec779b8eSAndroid Build Coastguard Worker "." TO_STRING(__GNUC_PATCHLEVEL__)) */
32*ec779b8eSAndroid Build Coastguard Worker
33*ec779b8eSAndroid Build Coastguard Worker //
34*ec779b8eSAndroid Build Coastguard Worker // SSEx specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
35*ec779b8eSAndroid Build Coastguard Worker //
36*ec779b8eSAndroid Build Coastguard Worker
37*ec779b8eSAndroid Build Coastguard Worker template <int CHANNELS, int STRIDE, bool FIXED>
ProcessSSEIntrinsic(float * out,int count,const float * coefsP,const float * coefsN,const float * sP,const float * sN,const float * volumeLR,float lerpP,const float * coefsP1,const float * coefsN1)38*ec779b8eSAndroid Build Coastguard Worker static inline void ProcessSSEIntrinsic(float* out,
39*ec779b8eSAndroid Build Coastguard Worker int count,
40*ec779b8eSAndroid Build Coastguard Worker const float* coefsP,
41*ec779b8eSAndroid Build Coastguard Worker const float* coefsN,
42*ec779b8eSAndroid Build Coastguard Worker const float* sP,
43*ec779b8eSAndroid Build Coastguard Worker const float* sN,
44*ec779b8eSAndroid Build Coastguard Worker const float* volumeLR,
45*ec779b8eSAndroid Build Coastguard Worker float lerpP,
46*ec779b8eSAndroid Build Coastguard Worker const float* coefsP1,
47*ec779b8eSAndroid Build Coastguard Worker const float* coefsN1)
48*ec779b8eSAndroid Build Coastguard Worker {
49*ec779b8eSAndroid Build Coastguard Worker ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
50*ec779b8eSAndroid Build Coastguard Worker static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");
51*ec779b8eSAndroid Build Coastguard Worker
52*ec779b8eSAndroid Build Coastguard Worker sP -= CHANNELS*(4-1); // adjust sP for a loop iteration of four
53*ec779b8eSAndroid Build Coastguard Worker
54*ec779b8eSAndroid Build Coastguard Worker __m128 interp;
55*ec779b8eSAndroid Build Coastguard Worker if (!FIXED) {
56*ec779b8eSAndroid Build Coastguard Worker interp = _mm_set1_ps(lerpP);
57*ec779b8eSAndroid Build Coastguard Worker }
58*ec779b8eSAndroid Build Coastguard Worker
59*ec779b8eSAndroid Build Coastguard Worker __m128 accL, accR;
60*ec779b8eSAndroid Build Coastguard Worker accL = _mm_setzero_ps();
61*ec779b8eSAndroid Build Coastguard Worker if (CHANNELS == 2) {
62*ec779b8eSAndroid Build Coastguard Worker accR = _mm_setzero_ps();
63*ec779b8eSAndroid Build Coastguard Worker }
64*ec779b8eSAndroid Build Coastguard Worker
65*ec779b8eSAndroid Build Coastguard Worker do {
66*ec779b8eSAndroid Build Coastguard Worker __m128 posCoef = _mm_load_ps(coefsP);
67*ec779b8eSAndroid Build Coastguard Worker __m128 negCoef = _mm_load_ps(coefsN);
68*ec779b8eSAndroid Build Coastguard Worker coefsP += 4;
69*ec779b8eSAndroid Build Coastguard Worker coefsN += 4;
70*ec779b8eSAndroid Build Coastguard Worker
71*ec779b8eSAndroid Build Coastguard Worker if (!FIXED) { // interpolate
72*ec779b8eSAndroid Build Coastguard Worker __m128 posCoef1 = _mm_load_ps(coefsP1);
73*ec779b8eSAndroid Build Coastguard Worker __m128 negCoef1 = _mm_load_ps(coefsN1);
74*ec779b8eSAndroid Build Coastguard Worker coefsP1 += 4;
75*ec779b8eSAndroid Build Coastguard Worker coefsN1 += 4;
76*ec779b8eSAndroid Build Coastguard Worker
77*ec779b8eSAndroid Build Coastguard Worker // Calculate the final coefficient for interpolation
78*ec779b8eSAndroid Build Coastguard Worker // posCoef = interp * (posCoef1 - posCoef) + posCoef
79*ec779b8eSAndroid Build Coastguard Worker // negCoef = interp * (negCoef - negCoef1) + negCoef1
80*ec779b8eSAndroid Build Coastguard Worker posCoef1 = _mm_sub_ps(posCoef1, posCoef);
81*ec779b8eSAndroid Build Coastguard Worker negCoef = _mm_sub_ps(negCoef, negCoef1);
82*ec779b8eSAndroid Build Coastguard Worker
83*ec779b8eSAndroid Build Coastguard Worker
84*ec779b8eSAndroid Build Coastguard Worker #if USE_AVX2
85*ec779b8eSAndroid Build Coastguard Worker posCoef = _mm_fmadd_ps(posCoef1, interp, posCoef);
86*ec779b8eSAndroid Build Coastguard Worker negCoef = _mm_fmadd_ps(negCoef, interp, negCoef1);
87*ec779b8eSAndroid Build Coastguard Worker #else
88*ec779b8eSAndroid Build Coastguard Worker posCoef1 = _mm_mul_ps(posCoef1, interp);
89*ec779b8eSAndroid Build Coastguard Worker negCoef = _mm_mul_ps(negCoef, interp);
90*ec779b8eSAndroid Build Coastguard Worker posCoef = _mm_add_ps(posCoef1, posCoef);
91*ec779b8eSAndroid Build Coastguard Worker negCoef = _mm_add_ps(negCoef, negCoef1);
92*ec779b8eSAndroid Build Coastguard Worker #endif //USE_AVX2
93*ec779b8eSAndroid Build Coastguard Worker }
94*ec779b8eSAndroid Build Coastguard Worker switch (CHANNELS) {
95*ec779b8eSAndroid Build Coastguard Worker case 1: {
96*ec779b8eSAndroid Build Coastguard Worker __m128 posSamp = _mm_loadu_ps(sP);
97*ec779b8eSAndroid Build Coastguard Worker __m128 negSamp = _mm_loadu_ps(sN);
98*ec779b8eSAndroid Build Coastguard Worker sP -= 4;
99*ec779b8eSAndroid Build Coastguard Worker sN += 4;
100*ec779b8eSAndroid Build Coastguard Worker
101*ec779b8eSAndroid Build Coastguard Worker posSamp = _mm_shuffle_ps(posSamp, posSamp, 0x1B);
102*ec779b8eSAndroid Build Coastguard Worker
103*ec779b8eSAndroid Build Coastguard Worker #if USE_AVX2
104*ec779b8eSAndroid Build Coastguard Worker accL = _mm_fmadd_ps(posSamp, posCoef, accL);
105*ec779b8eSAndroid Build Coastguard Worker accL = _mm_fmadd_ps(negSamp, negCoef, accL);
106*ec779b8eSAndroid Build Coastguard Worker #else
107*ec779b8eSAndroid Build Coastguard Worker posSamp = _mm_mul_ps(posSamp, posCoef);
108*ec779b8eSAndroid Build Coastguard Worker negSamp = _mm_mul_ps(negSamp, negCoef);
109*ec779b8eSAndroid Build Coastguard Worker accL = _mm_add_ps(accL, posSamp);
110*ec779b8eSAndroid Build Coastguard Worker accL = _mm_add_ps(accL, negSamp);
111*ec779b8eSAndroid Build Coastguard Worker #endif
112*ec779b8eSAndroid Build Coastguard Worker
113*ec779b8eSAndroid Build Coastguard Worker } break;
114*ec779b8eSAndroid Build Coastguard Worker case 2: {
115*ec779b8eSAndroid Build Coastguard Worker __m128 posSamp0 = _mm_loadu_ps(sP);
116*ec779b8eSAndroid Build Coastguard Worker __m128 posSamp1 = _mm_loadu_ps(sP+4);
117*ec779b8eSAndroid Build Coastguard Worker __m128 negSamp0 = _mm_loadu_ps(sN);
118*ec779b8eSAndroid Build Coastguard Worker __m128 negSamp1 = _mm_loadu_ps(sN+4);
119*ec779b8eSAndroid Build Coastguard Worker sP -= 8;
120*ec779b8eSAndroid Build Coastguard Worker sN += 8;
121*ec779b8eSAndroid Build Coastguard Worker
122*ec779b8eSAndroid Build Coastguard Worker // deinterleave everything and reverse the positives
123*ec779b8eSAndroid Build Coastguard Worker __m128 posSampL = _mm_shuffle_ps(posSamp1, posSamp0, 0x22);
124*ec779b8eSAndroid Build Coastguard Worker __m128 posSampR = _mm_shuffle_ps(posSamp1, posSamp0, 0x77);
125*ec779b8eSAndroid Build Coastguard Worker __m128 negSampL = _mm_shuffle_ps(negSamp0, negSamp1, 0x88);
126*ec779b8eSAndroid Build Coastguard Worker __m128 negSampR = _mm_shuffle_ps(negSamp0, negSamp1, 0xDD);
127*ec779b8eSAndroid Build Coastguard Worker
128*ec779b8eSAndroid Build Coastguard Worker #if USE_AVX2
129*ec779b8eSAndroid Build Coastguard Worker accL = _mm_fmadd_ps(posSampL, posCoef, accL);
130*ec779b8eSAndroid Build Coastguard Worker accR = _mm_fmadd_ps(posSampR, posCoef, accR);
131*ec779b8eSAndroid Build Coastguard Worker accL = _mm_fmadd_ps(negSampL, negCoef, accL);
132*ec779b8eSAndroid Build Coastguard Worker accR = _mm_fmadd_ps(negSampR, negCoef, accR);
133*ec779b8eSAndroid Build Coastguard Worker #else
134*ec779b8eSAndroid Build Coastguard Worker posSampL = _mm_mul_ps(posSampL, posCoef);
135*ec779b8eSAndroid Build Coastguard Worker posSampR = _mm_mul_ps(posSampR, posCoef);
136*ec779b8eSAndroid Build Coastguard Worker negSampL = _mm_mul_ps(negSampL, negCoef);
137*ec779b8eSAndroid Build Coastguard Worker negSampR = _mm_mul_ps(negSampR, negCoef);
138*ec779b8eSAndroid Build Coastguard Worker
139*ec779b8eSAndroid Build Coastguard Worker accL = _mm_add_ps(accL, posSampL);
140*ec779b8eSAndroid Build Coastguard Worker accR = _mm_add_ps(accR, posSampR);
141*ec779b8eSAndroid Build Coastguard Worker accL = _mm_add_ps(accL, negSampL);
142*ec779b8eSAndroid Build Coastguard Worker accR = _mm_add_ps(accR, negSampR);
143*ec779b8eSAndroid Build Coastguard Worker #endif
144*ec779b8eSAndroid Build Coastguard Worker
145*ec779b8eSAndroid Build Coastguard Worker } break;
146*ec779b8eSAndroid Build Coastguard Worker }
147*ec779b8eSAndroid Build Coastguard Worker } while (count -= 4);
148*ec779b8eSAndroid Build Coastguard Worker
149*ec779b8eSAndroid Build Coastguard Worker // multiply by volume and save
150*ec779b8eSAndroid Build Coastguard Worker __m128 vLR = _mm_setzero_ps();
151*ec779b8eSAndroid Build Coastguard Worker __m128 outSamp;
152*ec779b8eSAndroid Build Coastguard Worker vLR = _mm_loadl_pi(vLR, reinterpret_cast<const __m64*>(volumeLR));
153*ec779b8eSAndroid Build Coastguard Worker outSamp = _mm_loadl_pi(vLR, reinterpret_cast<__m64*>(out));
154*ec779b8eSAndroid Build Coastguard Worker
155*ec779b8eSAndroid Build Coastguard Worker // combine and funnel down accumulator
156*ec779b8eSAndroid Build Coastguard Worker __m128 outAccum = _mm_setzero_ps();
157*ec779b8eSAndroid Build Coastguard Worker if (CHANNELS == 1) {
158*ec779b8eSAndroid Build Coastguard Worker // duplicate accL to both L and R
159*ec779b8eSAndroid Build Coastguard Worker outAccum = _mm_add_ps(accL, _mm_movehl_ps(accL, accL));
160*ec779b8eSAndroid Build Coastguard Worker outAccum = _mm_add_ps(outAccum, _mm_shuffle_ps(outAccum, outAccum, 0x11));
161*ec779b8eSAndroid Build Coastguard Worker } else if (CHANNELS == 2) {
162*ec779b8eSAndroid Build Coastguard Worker // accR contains R, fold in
163*ec779b8eSAndroid Build Coastguard Worker outAccum = _mm_hadd_ps(accL, accR);
164*ec779b8eSAndroid Build Coastguard Worker outAccum = _mm_hadd_ps(outAccum, outAccum);
165*ec779b8eSAndroid Build Coastguard Worker }
166*ec779b8eSAndroid Build Coastguard Worker #if USE_AVX2
167*ec779b8eSAndroid Build Coastguard Worker outSamp = _mm_fmadd_ps(outAccum, vLR,outSamp);
168*ec779b8eSAndroid Build Coastguard Worker #else
169*ec779b8eSAndroid Build Coastguard Worker outAccum = _mm_mul_ps(outAccum, vLR);
170*ec779b8eSAndroid Build Coastguard Worker outSamp = _mm_add_ps(outSamp, outAccum);
171*ec779b8eSAndroid Build Coastguard Worker #endif
172*ec779b8eSAndroid Build Coastguard Worker
173*ec779b8eSAndroid Build Coastguard Worker _mm_storel_pi(reinterpret_cast<__m64*>(out), outSamp);
174*ec779b8eSAndroid Build Coastguard Worker }
175*ec779b8eSAndroid Build Coastguard Worker
176*ec779b8eSAndroid Build Coastguard Worker template<>
177*ec779b8eSAndroid Build Coastguard Worker inline void ProcessL<1, 16>(float* const out,
178*ec779b8eSAndroid Build Coastguard Worker int count,
179*ec779b8eSAndroid Build Coastguard Worker const float* coefsP,
180*ec779b8eSAndroid Build Coastguard Worker const float* coefsN,
181*ec779b8eSAndroid Build Coastguard Worker const float* sP,
182*ec779b8eSAndroid Build Coastguard Worker const float* sN,
183*ec779b8eSAndroid Build Coastguard Worker const float* const volumeLR)
184*ec779b8eSAndroid Build Coastguard Worker {
185*ec779b8eSAndroid Build Coastguard Worker ProcessSSEIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
186*ec779b8eSAndroid Build Coastguard Worker 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
187*ec779b8eSAndroid Build Coastguard Worker }
188*ec779b8eSAndroid Build Coastguard Worker
189*ec779b8eSAndroid Build Coastguard Worker template<>
190*ec779b8eSAndroid Build Coastguard Worker inline void ProcessL<2, 16>(float* const out,
191*ec779b8eSAndroid Build Coastguard Worker int count,
192*ec779b8eSAndroid Build Coastguard Worker const float* coefsP,
193*ec779b8eSAndroid Build Coastguard Worker const float* coefsN,
194*ec779b8eSAndroid Build Coastguard Worker const float* sP,
195*ec779b8eSAndroid Build Coastguard Worker const float* sN,
196*ec779b8eSAndroid Build Coastguard Worker const float* const volumeLR)
197*ec779b8eSAndroid Build Coastguard Worker {
198*ec779b8eSAndroid Build Coastguard Worker ProcessSSEIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
199*ec779b8eSAndroid Build Coastguard Worker 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
200*ec779b8eSAndroid Build Coastguard Worker }
201*ec779b8eSAndroid Build Coastguard Worker
202*ec779b8eSAndroid Build Coastguard Worker template<>
203*ec779b8eSAndroid Build Coastguard Worker inline void Process<1, 16>(float* const out,
204*ec779b8eSAndroid Build Coastguard Worker int count,
205*ec779b8eSAndroid Build Coastguard Worker const float* coefsP,
206*ec779b8eSAndroid Build Coastguard Worker const float* coefsN,
207*ec779b8eSAndroid Build Coastguard Worker const float* coefsP1,
208*ec779b8eSAndroid Build Coastguard Worker const float* coefsN1,
209*ec779b8eSAndroid Build Coastguard Worker const float* sP,
210*ec779b8eSAndroid Build Coastguard Worker const float* sN,
211*ec779b8eSAndroid Build Coastguard Worker float lerpP,
212*ec779b8eSAndroid Build Coastguard Worker const float* const volumeLR)
213*ec779b8eSAndroid Build Coastguard Worker {
214*ec779b8eSAndroid Build Coastguard Worker ProcessSSEIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
215*ec779b8eSAndroid Build Coastguard Worker lerpP, coefsP1, coefsN1);
216*ec779b8eSAndroid Build Coastguard Worker }
217*ec779b8eSAndroid Build Coastguard Worker
218*ec779b8eSAndroid Build Coastguard Worker template<>
219*ec779b8eSAndroid Build Coastguard Worker inline void Process<2, 16>(float* const out,
220*ec779b8eSAndroid Build Coastguard Worker int count,
221*ec779b8eSAndroid Build Coastguard Worker const float* coefsP,
222*ec779b8eSAndroid Build Coastguard Worker const float* coefsN,
223*ec779b8eSAndroid Build Coastguard Worker const float* coefsP1,
224*ec779b8eSAndroid Build Coastguard Worker const float* coefsN1,
225*ec779b8eSAndroid Build Coastguard Worker const float* sP,
226*ec779b8eSAndroid Build Coastguard Worker const float* sN,
227*ec779b8eSAndroid Build Coastguard Worker float lerpP,
228*ec779b8eSAndroid Build Coastguard Worker const float* const volumeLR)
229*ec779b8eSAndroid Build Coastguard Worker {
230*ec779b8eSAndroid Build Coastguard Worker ProcessSSEIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
231*ec779b8eSAndroid Build Coastguard Worker lerpP, coefsP1, coefsN1);
232*ec779b8eSAndroid Build Coastguard Worker }
233*ec779b8eSAndroid Build Coastguard Worker
234*ec779b8eSAndroid Build Coastguard Worker #endif //USE_SSE
235*ec779b8eSAndroid Build Coastguard Worker
236*ec779b8eSAndroid Build Coastguard Worker } // namespace android
237*ec779b8eSAndroid Build Coastguard Worker
238*ec779b8eSAndroid Build Coastguard Worker #endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H*/
239