xref: /aosp_15_r20/frameworks/av/media/libaudioprocessing/AudioResamplerFirProcessSSE.h (revision ec779b8e0859a360c3d303172224686826e6e0e1)
/*
 * Copyright (C) 2016 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16*ec779b8eSAndroid Build Coastguard Worker 
17*ec779b8eSAndroid Build Coastguard Worker #ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
18*ec779b8eSAndroid Build Coastguard Worker #define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
19*ec779b8eSAndroid Build Coastguard Worker 
20*ec779b8eSAndroid Build Coastguard Worker namespace android {
21*ec779b8eSAndroid Build Coastguard Worker 
// This header depends on AudioResamplerFirOps.h and AudioResamplerFirProcess.h;
// include them before including this file.
23*ec779b8eSAndroid Build Coastguard Worker 
24*ec779b8eSAndroid Build Coastguard Worker #if USE_SSE
25*ec779b8eSAndroid Build Coastguard Worker 
26*ec779b8eSAndroid Build Coastguard Worker #define TO_STRING2(x) #x
27*ec779b8eSAndroid Build Coastguard Worker #define TO_STRING(x) TO_STRING2(x)
28*ec779b8eSAndroid Build Coastguard Worker // uncomment to print GCC version, may be relevant for intrinsic optimizations
29*ec779b8eSAndroid Build Coastguard Worker /* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
30*ec779b8eSAndroid Build Coastguard Worker         "." TO_STRING(__GNUC_MINOR__) \
31*ec779b8eSAndroid Build Coastguard Worker         "." TO_STRING(__GNUC_PATCHLEVEL__)) */
32*ec779b8eSAndroid Build Coastguard Worker 
//
// The SSE specializations below replace the generic Process() and ProcessL()
// from AudioResamplerFirProcess.h when USE_SSE is enabled.
//
36*ec779b8eSAndroid Build Coastguard Worker 
37*ec779b8eSAndroid Build Coastguard Worker template <int CHANNELS, int STRIDE, bool FIXED>
ProcessSSEIntrinsic(float * out,int count,const float * coefsP,const float * coefsN,const float * sP,const float * sN,const float * volumeLR,float lerpP,const float * coefsP1,const float * coefsN1)38*ec779b8eSAndroid Build Coastguard Worker static inline void ProcessSSEIntrinsic(float* out,
39*ec779b8eSAndroid Build Coastguard Worker         int count,
40*ec779b8eSAndroid Build Coastguard Worker         const float* coefsP,
41*ec779b8eSAndroid Build Coastguard Worker         const float* coefsN,
42*ec779b8eSAndroid Build Coastguard Worker         const float* sP,
43*ec779b8eSAndroid Build Coastguard Worker         const float* sN,
44*ec779b8eSAndroid Build Coastguard Worker         const float* volumeLR,
45*ec779b8eSAndroid Build Coastguard Worker         float lerpP,
46*ec779b8eSAndroid Build Coastguard Worker         const float* coefsP1,
47*ec779b8eSAndroid Build Coastguard Worker         const float* coefsN1)
48*ec779b8eSAndroid Build Coastguard Worker {
49*ec779b8eSAndroid Build Coastguard Worker     ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
50*ec779b8eSAndroid Build Coastguard Worker     static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");
51*ec779b8eSAndroid Build Coastguard Worker 
52*ec779b8eSAndroid Build Coastguard Worker     sP -= CHANNELS*(4-1);   // adjust sP for a loop iteration of four
53*ec779b8eSAndroid Build Coastguard Worker 
54*ec779b8eSAndroid Build Coastguard Worker     __m128 interp;
55*ec779b8eSAndroid Build Coastguard Worker     if (!FIXED) {
56*ec779b8eSAndroid Build Coastguard Worker         interp = _mm_set1_ps(lerpP);
57*ec779b8eSAndroid Build Coastguard Worker     }
58*ec779b8eSAndroid Build Coastguard Worker 
59*ec779b8eSAndroid Build Coastguard Worker     __m128 accL, accR;
60*ec779b8eSAndroid Build Coastguard Worker     accL = _mm_setzero_ps();
61*ec779b8eSAndroid Build Coastguard Worker     if (CHANNELS == 2) {
62*ec779b8eSAndroid Build Coastguard Worker         accR = _mm_setzero_ps();
63*ec779b8eSAndroid Build Coastguard Worker     }
64*ec779b8eSAndroid Build Coastguard Worker 
65*ec779b8eSAndroid Build Coastguard Worker     do {
66*ec779b8eSAndroid Build Coastguard Worker         __m128 posCoef = _mm_load_ps(coefsP);
67*ec779b8eSAndroid Build Coastguard Worker         __m128 negCoef = _mm_load_ps(coefsN);
68*ec779b8eSAndroid Build Coastguard Worker         coefsP += 4;
69*ec779b8eSAndroid Build Coastguard Worker         coefsN += 4;
70*ec779b8eSAndroid Build Coastguard Worker 
71*ec779b8eSAndroid Build Coastguard Worker         if (!FIXED) { // interpolate
72*ec779b8eSAndroid Build Coastguard Worker             __m128 posCoef1 = _mm_load_ps(coefsP1);
73*ec779b8eSAndroid Build Coastguard Worker             __m128 negCoef1 = _mm_load_ps(coefsN1);
74*ec779b8eSAndroid Build Coastguard Worker             coefsP1 += 4;
75*ec779b8eSAndroid Build Coastguard Worker             coefsN1 += 4;
76*ec779b8eSAndroid Build Coastguard Worker 
77*ec779b8eSAndroid Build Coastguard Worker             // Calculate the final coefficient for interpolation
78*ec779b8eSAndroid Build Coastguard Worker             // posCoef = interp * (posCoef1 - posCoef) + posCoef
79*ec779b8eSAndroid Build Coastguard Worker             // negCoef = interp * (negCoef - negCoef1) + negCoef1
80*ec779b8eSAndroid Build Coastguard Worker             posCoef1 = _mm_sub_ps(posCoef1, posCoef);
81*ec779b8eSAndroid Build Coastguard Worker             negCoef = _mm_sub_ps(negCoef, negCoef1);
82*ec779b8eSAndroid Build Coastguard Worker 
83*ec779b8eSAndroid Build Coastguard Worker 
84*ec779b8eSAndroid Build Coastguard Worker             #if USE_AVX2
85*ec779b8eSAndroid Build Coastguard Worker             posCoef = _mm_fmadd_ps(posCoef1, interp, posCoef);
86*ec779b8eSAndroid Build Coastguard Worker             negCoef = _mm_fmadd_ps(negCoef, interp, negCoef1);
87*ec779b8eSAndroid Build Coastguard Worker             #else
88*ec779b8eSAndroid Build Coastguard Worker             posCoef1 = _mm_mul_ps(posCoef1, interp);
89*ec779b8eSAndroid Build Coastguard Worker             negCoef = _mm_mul_ps(negCoef, interp);
90*ec779b8eSAndroid Build Coastguard Worker             posCoef = _mm_add_ps(posCoef1, posCoef);
91*ec779b8eSAndroid Build Coastguard Worker             negCoef = _mm_add_ps(negCoef, negCoef1);
92*ec779b8eSAndroid Build Coastguard Worker             #endif //USE_AVX2
93*ec779b8eSAndroid Build Coastguard Worker         }
94*ec779b8eSAndroid Build Coastguard Worker         switch (CHANNELS) {
95*ec779b8eSAndroid Build Coastguard Worker         case 1: {
96*ec779b8eSAndroid Build Coastguard Worker             __m128 posSamp = _mm_loadu_ps(sP);
97*ec779b8eSAndroid Build Coastguard Worker             __m128 negSamp = _mm_loadu_ps(sN);
98*ec779b8eSAndroid Build Coastguard Worker             sP -= 4;
99*ec779b8eSAndroid Build Coastguard Worker             sN += 4;
100*ec779b8eSAndroid Build Coastguard Worker 
101*ec779b8eSAndroid Build Coastguard Worker             posSamp = _mm_shuffle_ps(posSamp, posSamp, 0x1B);
102*ec779b8eSAndroid Build Coastguard Worker 
103*ec779b8eSAndroid Build Coastguard Worker             #if USE_AVX2
104*ec779b8eSAndroid Build Coastguard Worker             accL = _mm_fmadd_ps(posSamp, posCoef, accL);
105*ec779b8eSAndroid Build Coastguard Worker             accL = _mm_fmadd_ps(negSamp, negCoef, accL);
106*ec779b8eSAndroid Build Coastguard Worker             #else
107*ec779b8eSAndroid Build Coastguard Worker             posSamp = _mm_mul_ps(posSamp, posCoef);
108*ec779b8eSAndroid Build Coastguard Worker             negSamp = _mm_mul_ps(negSamp, negCoef);
109*ec779b8eSAndroid Build Coastguard Worker             accL = _mm_add_ps(accL, posSamp);
110*ec779b8eSAndroid Build Coastguard Worker             accL = _mm_add_ps(accL, negSamp);
111*ec779b8eSAndroid Build Coastguard Worker             #endif
112*ec779b8eSAndroid Build Coastguard Worker 
113*ec779b8eSAndroid Build Coastguard Worker         } break;
114*ec779b8eSAndroid Build Coastguard Worker         case 2: {
115*ec779b8eSAndroid Build Coastguard Worker             __m128 posSamp0 = _mm_loadu_ps(sP);
116*ec779b8eSAndroid Build Coastguard Worker             __m128 posSamp1 = _mm_loadu_ps(sP+4);
117*ec779b8eSAndroid Build Coastguard Worker             __m128 negSamp0 = _mm_loadu_ps(sN);
118*ec779b8eSAndroid Build Coastguard Worker             __m128 negSamp1 = _mm_loadu_ps(sN+4);
119*ec779b8eSAndroid Build Coastguard Worker             sP -= 8;
120*ec779b8eSAndroid Build Coastguard Worker             sN += 8;
121*ec779b8eSAndroid Build Coastguard Worker 
122*ec779b8eSAndroid Build Coastguard Worker             // deinterleave everything and reverse the positives
123*ec779b8eSAndroid Build Coastguard Worker             __m128 posSampL = _mm_shuffle_ps(posSamp1, posSamp0, 0x22);
124*ec779b8eSAndroid Build Coastguard Worker             __m128 posSampR = _mm_shuffle_ps(posSamp1, posSamp0, 0x77);
125*ec779b8eSAndroid Build Coastguard Worker             __m128 negSampL = _mm_shuffle_ps(negSamp0, negSamp1, 0x88);
126*ec779b8eSAndroid Build Coastguard Worker             __m128 negSampR = _mm_shuffle_ps(negSamp0, negSamp1, 0xDD);
127*ec779b8eSAndroid Build Coastguard Worker 
128*ec779b8eSAndroid Build Coastguard Worker            #if USE_AVX2
129*ec779b8eSAndroid Build Coastguard Worker            accL = _mm_fmadd_ps(posSampL, posCoef, accL);
130*ec779b8eSAndroid Build Coastguard Worker            accR = _mm_fmadd_ps(posSampR, posCoef, accR);
131*ec779b8eSAndroid Build Coastguard Worker            accL = _mm_fmadd_ps(negSampL, negCoef, accL);
132*ec779b8eSAndroid Build Coastguard Worker            accR = _mm_fmadd_ps(negSampR, negCoef, accR);
133*ec779b8eSAndroid Build Coastguard Worker            #else
134*ec779b8eSAndroid Build Coastguard Worker            posSampL = _mm_mul_ps(posSampL, posCoef);
135*ec779b8eSAndroid Build Coastguard Worker            posSampR = _mm_mul_ps(posSampR, posCoef);
136*ec779b8eSAndroid Build Coastguard Worker            negSampL = _mm_mul_ps(negSampL, negCoef);
137*ec779b8eSAndroid Build Coastguard Worker            negSampR = _mm_mul_ps(negSampR, negCoef);
138*ec779b8eSAndroid Build Coastguard Worker 
139*ec779b8eSAndroid Build Coastguard Worker            accL = _mm_add_ps(accL, posSampL);
140*ec779b8eSAndroid Build Coastguard Worker            accR = _mm_add_ps(accR, posSampR);
141*ec779b8eSAndroid Build Coastguard Worker            accL = _mm_add_ps(accL, negSampL);
142*ec779b8eSAndroid Build Coastguard Worker            accR = _mm_add_ps(accR, negSampR);
143*ec779b8eSAndroid Build Coastguard Worker            #endif
144*ec779b8eSAndroid Build Coastguard Worker 
145*ec779b8eSAndroid Build Coastguard Worker         } break;
146*ec779b8eSAndroid Build Coastguard Worker         }
147*ec779b8eSAndroid Build Coastguard Worker     } while (count -= 4);
148*ec779b8eSAndroid Build Coastguard Worker 
149*ec779b8eSAndroid Build Coastguard Worker     // multiply by volume and save
150*ec779b8eSAndroid Build Coastguard Worker     __m128 vLR = _mm_setzero_ps();
151*ec779b8eSAndroid Build Coastguard Worker     __m128 outSamp;
152*ec779b8eSAndroid Build Coastguard Worker     vLR = _mm_loadl_pi(vLR, reinterpret_cast<const __m64*>(volumeLR));
153*ec779b8eSAndroid Build Coastguard Worker     outSamp = _mm_loadl_pi(vLR, reinterpret_cast<__m64*>(out));
154*ec779b8eSAndroid Build Coastguard Worker 
155*ec779b8eSAndroid Build Coastguard Worker     // combine and funnel down accumulator
156*ec779b8eSAndroid Build Coastguard Worker     __m128 outAccum = _mm_setzero_ps();
157*ec779b8eSAndroid Build Coastguard Worker     if (CHANNELS == 1) {
158*ec779b8eSAndroid Build Coastguard Worker         // duplicate accL to both L and R
159*ec779b8eSAndroid Build Coastguard Worker         outAccum = _mm_add_ps(accL, _mm_movehl_ps(accL, accL));
160*ec779b8eSAndroid Build Coastguard Worker         outAccum = _mm_add_ps(outAccum, _mm_shuffle_ps(outAccum, outAccum, 0x11));
161*ec779b8eSAndroid Build Coastguard Worker     } else if (CHANNELS == 2) {
162*ec779b8eSAndroid Build Coastguard Worker         // accR contains R, fold in
163*ec779b8eSAndroid Build Coastguard Worker         outAccum = _mm_hadd_ps(accL, accR);
164*ec779b8eSAndroid Build Coastguard Worker         outAccum = _mm_hadd_ps(outAccum, outAccum);
165*ec779b8eSAndroid Build Coastguard Worker     }
166*ec779b8eSAndroid Build Coastguard Worker     #if USE_AVX2
167*ec779b8eSAndroid Build Coastguard Worker     outSamp = _mm_fmadd_ps(outAccum, vLR,outSamp);
168*ec779b8eSAndroid Build Coastguard Worker     #else
169*ec779b8eSAndroid Build Coastguard Worker     outAccum = _mm_mul_ps(outAccum, vLR);
170*ec779b8eSAndroid Build Coastguard Worker     outSamp = _mm_add_ps(outSamp, outAccum);
171*ec779b8eSAndroid Build Coastguard Worker     #endif
172*ec779b8eSAndroid Build Coastguard Worker 
173*ec779b8eSAndroid Build Coastguard Worker     _mm_storel_pi(reinterpret_cast<__m64*>(out), outSamp);
174*ec779b8eSAndroid Build Coastguard Worker }
175*ec779b8eSAndroid Build Coastguard Worker 
176*ec779b8eSAndroid Build Coastguard Worker template<>
177*ec779b8eSAndroid Build Coastguard Worker inline void ProcessL<1, 16>(float* const out,
178*ec779b8eSAndroid Build Coastguard Worker         int count,
179*ec779b8eSAndroid Build Coastguard Worker         const float* coefsP,
180*ec779b8eSAndroid Build Coastguard Worker         const float* coefsN,
181*ec779b8eSAndroid Build Coastguard Worker         const float* sP,
182*ec779b8eSAndroid Build Coastguard Worker         const float* sN,
183*ec779b8eSAndroid Build Coastguard Worker         const float* const volumeLR)
184*ec779b8eSAndroid Build Coastguard Worker {
185*ec779b8eSAndroid Build Coastguard Worker     ProcessSSEIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
186*ec779b8eSAndroid Build Coastguard Worker             0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
187*ec779b8eSAndroid Build Coastguard Worker }
188*ec779b8eSAndroid Build Coastguard Worker 
189*ec779b8eSAndroid Build Coastguard Worker template<>
190*ec779b8eSAndroid Build Coastguard Worker inline void ProcessL<2, 16>(float* const out,
191*ec779b8eSAndroid Build Coastguard Worker         int count,
192*ec779b8eSAndroid Build Coastguard Worker         const float* coefsP,
193*ec779b8eSAndroid Build Coastguard Worker         const float* coefsN,
194*ec779b8eSAndroid Build Coastguard Worker         const float* sP,
195*ec779b8eSAndroid Build Coastguard Worker         const float* sN,
196*ec779b8eSAndroid Build Coastguard Worker         const float* const volumeLR)
197*ec779b8eSAndroid Build Coastguard Worker {
198*ec779b8eSAndroid Build Coastguard Worker     ProcessSSEIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
199*ec779b8eSAndroid Build Coastguard Worker             0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
200*ec779b8eSAndroid Build Coastguard Worker }
201*ec779b8eSAndroid Build Coastguard Worker 
202*ec779b8eSAndroid Build Coastguard Worker template<>
203*ec779b8eSAndroid Build Coastguard Worker inline void Process<1, 16>(float* const out,
204*ec779b8eSAndroid Build Coastguard Worker         int count,
205*ec779b8eSAndroid Build Coastguard Worker         const float* coefsP,
206*ec779b8eSAndroid Build Coastguard Worker         const float* coefsN,
207*ec779b8eSAndroid Build Coastguard Worker         const float* coefsP1,
208*ec779b8eSAndroid Build Coastguard Worker         const float* coefsN1,
209*ec779b8eSAndroid Build Coastguard Worker         const float* sP,
210*ec779b8eSAndroid Build Coastguard Worker         const float* sN,
211*ec779b8eSAndroid Build Coastguard Worker         float lerpP,
212*ec779b8eSAndroid Build Coastguard Worker         const float* const volumeLR)
213*ec779b8eSAndroid Build Coastguard Worker {
214*ec779b8eSAndroid Build Coastguard Worker     ProcessSSEIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
215*ec779b8eSAndroid Build Coastguard Worker             lerpP, coefsP1, coefsN1);
216*ec779b8eSAndroid Build Coastguard Worker }
217*ec779b8eSAndroid Build Coastguard Worker 
218*ec779b8eSAndroid Build Coastguard Worker template<>
219*ec779b8eSAndroid Build Coastguard Worker inline void Process<2, 16>(float* const out,
220*ec779b8eSAndroid Build Coastguard Worker         int count,
221*ec779b8eSAndroid Build Coastguard Worker         const float* coefsP,
222*ec779b8eSAndroid Build Coastguard Worker         const float* coefsN,
223*ec779b8eSAndroid Build Coastguard Worker         const float* coefsP1,
224*ec779b8eSAndroid Build Coastguard Worker         const float* coefsN1,
225*ec779b8eSAndroid Build Coastguard Worker         const float* sP,
226*ec779b8eSAndroid Build Coastguard Worker         const float* sN,
227*ec779b8eSAndroid Build Coastguard Worker         float lerpP,
228*ec779b8eSAndroid Build Coastguard Worker         const float* const volumeLR)
229*ec779b8eSAndroid Build Coastguard Worker {
230*ec779b8eSAndroid Build Coastguard Worker     ProcessSSEIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
231*ec779b8eSAndroid Build Coastguard Worker             lerpP, coefsP1, coefsN1);
232*ec779b8eSAndroid Build Coastguard Worker }
233*ec779b8eSAndroid Build Coastguard Worker 
234*ec779b8eSAndroid Build Coastguard Worker #endif //USE_SSE
235*ec779b8eSAndroid Build Coastguard Worker 
236*ec779b8eSAndroid Build Coastguard Worker } // namespace android
237*ec779b8eSAndroid Build Coastguard Worker 
238*ec779b8eSAndroid Build Coastguard Worker #endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H*/
239