xref: /aosp_15_r20/external/pffft/pffft_priv_impl.h (revision 3f1979aa0d7ad34fcf3763de7b7b8f8cd67e5bdd)
1*3f1979aaSAndroid Build Coastguard Worker /* Copyright (c) 2013  Julien Pommier ( [email protected] )
2*3f1979aaSAndroid Build Coastguard Worker    Copyright (c) 2020  Hayati Ayguen ( [email protected] )
3*3f1979aaSAndroid Build Coastguard Worker    Copyright (c) 2020  Dario Mambro ( [email protected] )
4*3f1979aaSAndroid Build Coastguard Worker 
5*3f1979aaSAndroid Build Coastguard Worker    Based on original fortran 77 code from FFTPACKv4 from NETLIB
6*3f1979aaSAndroid Build Coastguard Worker    (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
7*3f1979aaSAndroid Build Coastguard Worker    of NCAR, in 1985.
8*3f1979aaSAndroid Build Coastguard Worker 
9*3f1979aaSAndroid Build Coastguard Worker    As confirmed by the NCAR fftpack software curators, the following
10*3f1979aaSAndroid Build Coastguard Worker    FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
11*3f1979aaSAndroid Build Coastguard Worker    released under the same terms.
12*3f1979aaSAndroid Build Coastguard Worker 
13*3f1979aaSAndroid Build Coastguard Worker    FFTPACK license:
14*3f1979aaSAndroid Build Coastguard Worker 
15*3f1979aaSAndroid Build Coastguard Worker    http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
16*3f1979aaSAndroid Build Coastguard Worker 
17*3f1979aaSAndroid Build Coastguard Worker    Copyright (c) 2004 the University Corporation for Atmospheric
18*3f1979aaSAndroid Build Coastguard Worker    Research ("UCAR"). All rights reserved. Developed by NCAR's
19*3f1979aaSAndroid Build Coastguard Worker    Computational and Information Systems Laboratory, UCAR,
20*3f1979aaSAndroid Build Coastguard Worker    www.cisl.ucar.edu.
21*3f1979aaSAndroid Build Coastguard Worker 
22*3f1979aaSAndroid Build Coastguard Worker    Redistribution and use of the Software in source and binary forms,
23*3f1979aaSAndroid Build Coastguard Worker    with or without modification, is permitted provided that the
24*3f1979aaSAndroid Build Coastguard Worker    following conditions are met:
25*3f1979aaSAndroid Build Coastguard Worker 
26*3f1979aaSAndroid Build Coastguard Worker    - Neither the names of NCAR's Computational and Information Systems
27*3f1979aaSAndroid Build Coastguard Worker    Laboratory, the University Corporation for Atmospheric Research,
28*3f1979aaSAndroid Build Coastguard Worker    nor the names of its sponsors or contributors may be used to
29*3f1979aaSAndroid Build Coastguard Worker    endorse or promote products derived from this Software without
30*3f1979aaSAndroid Build Coastguard Worker    specific prior written permission.
31*3f1979aaSAndroid Build Coastguard Worker 
32*3f1979aaSAndroid Build Coastguard Worker    - Redistributions of source code must retain the above copyright
33*3f1979aaSAndroid Build Coastguard Worker    notices, this list of conditions, and the disclaimer below.
34*3f1979aaSAndroid Build Coastguard Worker 
35*3f1979aaSAndroid Build Coastguard Worker    - Redistributions in binary form must reproduce the above copyright
36*3f1979aaSAndroid Build Coastguard Worker    notice, this list of conditions, and the disclaimer below in the
37*3f1979aaSAndroid Build Coastguard Worker    documentation and/or other materials provided with the
38*3f1979aaSAndroid Build Coastguard Worker    distribution.
39*3f1979aaSAndroid Build Coastguard Worker 
40*3f1979aaSAndroid Build Coastguard Worker    THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
41*3f1979aaSAndroid Build Coastguard Worker    EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
42*3f1979aaSAndroid Build Coastguard Worker    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
43*3f1979aaSAndroid Build Coastguard Worker    NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
44*3f1979aaSAndroid Build Coastguard Worker    HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
45*3f1979aaSAndroid Build Coastguard Worker    EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
46*3f1979aaSAndroid Build Coastguard Worker    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
47*3f1979aaSAndroid Build Coastguard Worker    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
48*3f1979aaSAndroid Build Coastguard Worker    SOFTWARE.
49*3f1979aaSAndroid Build Coastguard Worker 
50*3f1979aaSAndroid Build Coastguard Worker 
51*3f1979aaSAndroid Build Coastguard Worker    PFFFT : a Pretty Fast FFT.
52*3f1979aaSAndroid Build Coastguard Worker 
   This file is largely based on the original FFTPACK implementation, modified in
   order to take advantage of SIMD instructions of modern CPUs.
55*3f1979aaSAndroid Build Coastguard Worker */
56*3f1979aaSAndroid Build Coastguard Worker 
/* This file requires architecture-specific preprocessor definitions;
 * it is for library-internal use only.
 */
60*3f1979aaSAndroid Build Coastguard Worker 
61*3f1979aaSAndroid Build Coastguard Worker 
/* Provide our own math constants so the code still builds when compiler
 * extensions (e.g. the g++ ones that expose M_PI) are turned off. */
#ifndef M_PI
  #define M_PI    3.14159265358979323846  /* pi */
#endif

#ifndef M_SQRT2
  #define M_SQRT2 1.41421356237309504880  /* sqrt(2) */
#endif
70*3f1979aaSAndroid Build Coastguard Worker 
71*3f1979aaSAndroid Build Coastguard Worker 
FUNC_SIMD_SIZE()72*3f1979aaSAndroid Build Coastguard Worker int FUNC_SIMD_SIZE() { return SIMD_SZ; }
73*3f1979aaSAndroid Build Coastguard Worker 
FUNC_SIMD_ARCH()74*3f1979aaSAndroid Build Coastguard Worker const char * FUNC_SIMD_ARCH() { return VARCH; }
75*3f1979aaSAndroid Build Coastguard Worker 
76*3f1979aaSAndroid Build Coastguard Worker 
77*3f1979aaSAndroid Build Coastguard Worker /*
78*3f1979aaSAndroid Build Coastguard Worker   passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2
79*3f1979aaSAndroid Build Coastguard Worker */
passf2_ps(int ido,int l1,const v4sf * cc,v4sf * ch,const float * wa1,float fsign)80*3f1979aaSAndroid Build Coastguard Worker static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
81*3f1979aaSAndroid Build Coastguard Worker   int k, i;
82*3f1979aaSAndroid Build Coastguard Worker   int l1ido = l1*ido;
83*3f1979aaSAndroid Build Coastguard Worker   if (ido <= 2) {
84*3f1979aaSAndroid Build Coastguard Worker     for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
85*3f1979aaSAndroid Build Coastguard Worker       ch[0]         = VADD(cc[0], cc[ido+0]);
86*3f1979aaSAndroid Build Coastguard Worker       ch[l1ido]     = VSUB(cc[0], cc[ido+0]);
87*3f1979aaSAndroid Build Coastguard Worker       ch[1]         = VADD(cc[1], cc[ido+1]);
88*3f1979aaSAndroid Build Coastguard Worker       ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
89*3f1979aaSAndroid Build Coastguard Worker     }
90*3f1979aaSAndroid Build Coastguard Worker   } else {
91*3f1979aaSAndroid Build Coastguard Worker     for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
92*3f1979aaSAndroid Build Coastguard Worker       for (i=0; i<ido-1; i+=2) {
93*3f1979aaSAndroid Build Coastguard Worker         v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
94*3f1979aaSAndroid Build Coastguard Worker         v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
95*3f1979aaSAndroid Build Coastguard Worker         v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
96*3f1979aaSAndroid Build Coastguard Worker         ch[i]   = VADD(cc[i+0], cc[i+ido+0]);
97*3f1979aaSAndroid Build Coastguard Worker         ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
98*3f1979aaSAndroid Build Coastguard Worker         VCPLXMUL(tr2, ti2, wr, wi);
99*3f1979aaSAndroid Build Coastguard Worker         ch[i+l1ido]   = tr2;
100*3f1979aaSAndroid Build Coastguard Worker         ch[i+l1ido+1] = ti2;
101*3f1979aaSAndroid Build Coastguard Worker       }
102*3f1979aaSAndroid Build Coastguard Worker     }
103*3f1979aaSAndroid Build Coastguard Worker   }
104*3f1979aaSAndroid Build Coastguard Worker }
105*3f1979aaSAndroid Build Coastguard Worker 
106*3f1979aaSAndroid Build Coastguard Worker /*
107*3f1979aaSAndroid Build Coastguard Worker   passf3 and passb3 has been merged here, fsign = -1 for passf3, +1 for passb3
108*3f1979aaSAndroid Build Coastguard Worker */
passf3_ps(int ido,int l1,const v4sf * cc,v4sf * ch,const float * wa1,const float * wa2,float fsign)109*3f1979aaSAndroid Build Coastguard Worker static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
110*3f1979aaSAndroid Build Coastguard Worker                                     const float *wa1, const float *wa2, float fsign) {
111*3f1979aaSAndroid Build Coastguard Worker   static const float taur = -0.5f;
112*3f1979aaSAndroid Build Coastguard Worker   float taui = 0.866025403784439f*fsign;
113*3f1979aaSAndroid Build Coastguard Worker   int i, k;
114*3f1979aaSAndroid Build Coastguard Worker   v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
115*3f1979aaSAndroid Build Coastguard Worker   int l1ido = l1*ido;
116*3f1979aaSAndroid Build Coastguard Worker   float wr1, wi1, wr2, wi2;
117*3f1979aaSAndroid Build Coastguard Worker   assert(ido > 2);
118*3f1979aaSAndroid Build Coastguard Worker   for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
119*3f1979aaSAndroid Build Coastguard Worker     for (i=0; i<ido-1; i+=2) {
120*3f1979aaSAndroid Build Coastguard Worker       tr2 = VADD(cc[i+ido], cc[i+2*ido]);
121*3f1979aaSAndroid Build Coastguard Worker       cr2 = VADD(cc[i], SVMUL(taur,tr2));
122*3f1979aaSAndroid Build Coastguard Worker       ch[i]    = VADD(cc[i], tr2);
123*3f1979aaSAndroid Build Coastguard Worker       ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
124*3f1979aaSAndroid Build Coastguard Worker       ci2 = VADD(cc[i    +1], SVMUL(taur,ti2));
125*3f1979aaSAndroid Build Coastguard Worker       ch[i+1]  = VADD(cc[i+1], ti2);
126*3f1979aaSAndroid Build Coastguard Worker       cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
127*3f1979aaSAndroid Build Coastguard Worker       ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
128*3f1979aaSAndroid Build Coastguard Worker       dr2 = VSUB(cr2, ci3);
129*3f1979aaSAndroid Build Coastguard Worker       dr3 = VADD(cr2, ci3);
130*3f1979aaSAndroid Build Coastguard Worker       di2 = VADD(ci2, cr3);
131*3f1979aaSAndroid Build Coastguard Worker       di3 = VSUB(ci2, cr3);
132*3f1979aaSAndroid Build Coastguard Worker       wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
133*3f1979aaSAndroid Build Coastguard Worker       VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
134*3f1979aaSAndroid Build Coastguard Worker       ch[i+l1ido] = dr2;
135*3f1979aaSAndroid Build Coastguard Worker       ch[i+l1ido + 1] = di2;
136*3f1979aaSAndroid Build Coastguard Worker       VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
137*3f1979aaSAndroid Build Coastguard Worker       ch[i+2*l1ido] = dr3;
138*3f1979aaSAndroid Build Coastguard Worker       ch[i+2*l1ido+1] = di3;
139*3f1979aaSAndroid Build Coastguard Worker     }
140*3f1979aaSAndroid Build Coastguard Worker   }
141*3f1979aaSAndroid Build Coastguard Worker } /* passf3 */
142*3f1979aaSAndroid Build Coastguard Worker 
passf4_ps(int ido,int l1,const v4sf * cc,v4sf * ch,const float * wa1,const float * wa2,const float * wa3,float fsign)143*3f1979aaSAndroid Build Coastguard Worker static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
144*3f1979aaSAndroid Build Coastguard Worker                                     const float *wa1, const float *wa2, const float *wa3, float fsign) {
145*3f1979aaSAndroid Build Coastguard Worker   /* isign == -1 for forward transform and +1 for backward transform */
146*3f1979aaSAndroid Build Coastguard Worker 
147*3f1979aaSAndroid Build Coastguard Worker   int i, k;
148*3f1979aaSAndroid Build Coastguard Worker   v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
149*3f1979aaSAndroid Build Coastguard Worker   int l1ido = l1*ido;
150*3f1979aaSAndroid Build Coastguard Worker   if (ido == 2) {
151*3f1979aaSAndroid Build Coastguard Worker     for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
152*3f1979aaSAndroid Build Coastguard Worker       tr1 = VSUB(cc[0], cc[2*ido + 0]);
153*3f1979aaSAndroid Build Coastguard Worker       tr2 = VADD(cc[0], cc[2*ido + 0]);
154*3f1979aaSAndroid Build Coastguard Worker       ti1 = VSUB(cc[1], cc[2*ido + 1]);
155*3f1979aaSAndroid Build Coastguard Worker       ti2 = VADD(cc[1], cc[2*ido + 1]);
156*3f1979aaSAndroid Build Coastguard Worker       ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
157*3f1979aaSAndroid Build Coastguard Worker       tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
158*3f1979aaSAndroid Build Coastguard Worker       tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
159*3f1979aaSAndroid Build Coastguard Worker       ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
160*3f1979aaSAndroid Build Coastguard Worker 
161*3f1979aaSAndroid Build Coastguard Worker       ch[0*l1ido + 0] = VADD(tr2, tr3);
162*3f1979aaSAndroid Build Coastguard Worker       ch[0*l1ido + 1] = VADD(ti2, ti3);
163*3f1979aaSAndroid Build Coastguard Worker       ch[1*l1ido + 0] = VADD(tr1, tr4);
164*3f1979aaSAndroid Build Coastguard Worker       ch[1*l1ido + 1] = VADD(ti1, ti4);
165*3f1979aaSAndroid Build Coastguard Worker       ch[2*l1ido + 0] = VSUB(tr2, tr3);
166*3f1979aaSAndroid Build Coastguard Worker       ch[2*l1ido + 1] = VSUB(ti2, ti3);
167*3f1979aaSAndroid Build Coastguard Worker       ch[3*l1ido + 0] = VSUB(tr1, tr4);
168*3f1979aaSAndroid Build Coastguard Worker       ch[3*l1ido + 1] = VSUB(ti1, ti4);
169*3f1979aaSAndroid Build Coastguard Worker     }
170*3f1979aaSAndroid Build Coastguard Worker   } else {
171*3f1979aaSAndroid Build Coastguard Worker     for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
172*3f1979aaSAndroid Build Coastguard Worker       for (i=0; i<ido-1; i+=2) {
173*3f1979aaSAndroid Build Coastguard Worker         float wr1, wi1, wr2, wi2, wr3, wi3;
174*3f1979aaSAndroid Build Coastguard Worker         tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
175*3f1979aaSAndroid Build Coastguard Worker         tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
176*3f1979aaSAndroid Build Coastguard Worker         ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
177*3f1979aaSAndroid Build Coastguard Worker         ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
178*3f1979aaSAndroid Build Coastguard Worker         tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
179*3f1979aaSAndroid Build Coastguard Worker         ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
180*3f1979aaSAndroid Build Coastguard Worker         tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
181*3f1979aaSAndroid Build Coastguard Worker         ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
182*3f1979aaSAndroid Build Coastguard Worker 
183*3f1979aaSAndroid Build Coastguard Worker         ch[i] = VADD(tr2, tr3);
184*3f1979aaSAndroid Build Coastguard Worker         cr3    = VSUB(tr2, tr3);
185*3f1979aaSAndroid Build Coastguard Worker         ch[i + 1] = VADD(ti2, ti3);
186*3f1979aaSAndroid Build Coastguard Worker         ci3 = VSUB(ti2, ti3);
187*3f1979aaSAndroid Build Coastguard Worker 
188*3f1979aaSAndroid Build Coastguard Worker         cr2 = VADD(tr1, tr4);
189*3f1979aaSAndroid Build Coastguard Worker         cr4 = VSUB(tr1, tr4);
190*3f1979aaSAndroid Build Coastguard Worker         ci2 = VADD(ti1, ti4);
191*3f1979aaSAndroid Build Coastguard Worker         ci4 = VSUB(ti1, ti4);
192*3f1979aaSAndroid Build Coastguard Worker         wr1=wa1[i], wi1=fsign*wa1[i+1];
193*3f1979aaSAndroid Build Coastguard Worker         VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
194*3f1979aaSAndroid Build Coastguard Worker         wr2=wa2[i], wi2=fsign*wa2[i+1];
195*3f1979aaSAndroid Build Coastguard Worker         ch[i + l1ido] = cr2;
196*3f1979aaSAndroid Build Coastguard Worker         ch[i + l1ido + 1] = ci2;
197*3f1979aaSAndroid Build Coastguard Worker 
198*3f1979aaSAndroid Build Coastguard Worker         VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
199*3f1979aaSAndroid Build Coastguard Worker         wr3=wa3[i], wi3=fsign*wa3[i+1];
200*3f1979aaSAndroid Build Coastguard Worker         ch[i + 2*l1ido] = cr3;
201*3f1979aaSAndroid Build Coastguard Worker         ch[i + 2*l1ido + 1] = ci3;
202*3f1979aaSAndroid Build Coastguard Worker 
203*3f1979aaSAndroid Build Coastguard Worker         VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
204*3f1979aaSAndroid Build Coastguard Worker         ch[i + 3*l1ido] = cr4;
205*3f1979aaSAndroid Build Coastguard Worker         ch[i + 3*l1ido + 1] = ci4;
206*3f1979aaSAndroid Build Coastguard Worker       }
207*3f1979aaSAndroid Build Coastguard Worker     }
208*3f1979aaSAndroid Build Coastguard Worker   }
209*3f1979aaSAndroid Build Coastguard Worker } /* passf4 */
210*3f1979aaSAndroid Build Coastguard Worker 
211*3f1979aaSAndroid Build Coastguard Worker /*
212*3f1979aaSAndroid Build Coastguard Worker   passf5 and passb5 has been merged here, fsign = -1 for passf5, +1 for passb5
213*3f1979aaSAndroid Build Coastguard Worker */
passf5_ps(int ido,int l1,const v4sf * cc,v4sf * ch,const float * wa1,const float * wa2,const float * wa3,const float * wa4,float fsign)214*3f1979aaSAndroid Build Coastguard Worker static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
215*3f1979aaSAndroid Build Coastguard Worker                                     const float *wa1, const float *wa2,
216*3f1979aaSAndroid Build Coastguard Worker                                     const float *wa3, const float *wa4, float fsign) {
217*3f1979aaSAndroid Build Coastguard Worker   static const float tr11 = .309016994374947f;
218*3f1979aaSAndroid Build Coastguard Worker   const float ti11 = .951056516295154f*fsign;
219*3f1979aaSAndroid Build Coastguard Worker   static const float tr12 = -.809016994374947f;
220*3f1979aaSAndroid Build Coastguard Worker   const float ti12 = .587785252292473f*fsign;
221*3f1979aaSAndroid Build Coastguard Worker 
222*3f1979aaSAndroid Build Coastguard Worker   /* Local variables */
223*3f1979aaSAndroid Build Coastguard Worker   int i, k;
224*3f1979aaSAndroid Build Coastguard Worker   v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
225*3f1979aaSAndroid Build Coastguard Worker     ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
226*3f1979aaSAndroid Build Coastguard Worker 
227*3f1979aaSAndroid Build Coastguard Worker   float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;
228*3f1979aaSAndroid Build Coastguard Worker 
229*3f1979aaSAndroid Build Coastguard Worker #define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
230*3f1979aaSAndroid Build Coastguard Worker #define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]
231*3f1979aaSAndroid Build Coastguard Worker 
232*3f1979aaSAndroid Build Coastguard Worker   assert(ido > 2);
233*3f1979aaSAndroid Build Coastguard Worker   for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
234*3f1979aaSAndroid Build Coastguard Worker     for (i = 0; i < ido-1; i += 2) {
235*3f1979aaSAndroid Build Coastguard Worker       ti5 = VSUB(cc_ref(i  , 2), cc_ref(i  , 5));
236*3f1979aaSAndroid Build Coastguard Worker       ti2 = VADD(cc_ref(i  , 2), cc_ref(i  , 5));
237*3f1979aaSAndroid Build Coastguard Worker       ti4 = VSUB(cc_ref(i  , 3), cc_ref(i  , 4));
238*3f1979aaSAndroid Build Coastguard Worker       ti3 = VADD(cc_ref(i  , 3), cc_ref(i  , 4));
239*3f1979aaSAndroid Build Coastguard Worker       tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
240*3f1979aaSAndroid Build Coastguard Worker       tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
241*3f1979aaSAndroid Build Coastguard Worker       tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
242*3f1979aaSAndroid Build Coastguard Worker       tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
243*3f1979aaSAndroid Build Coastguard Worker       ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
244*3f1979aaSAndroid Build Coastguard Worker       ch_ref(i  , 1) = VADD(cc_ref(i  , 1), VADD(ti2, ti3));
245*3f1979aaSAndroid Build Coastguard Worker       cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
246*3f1979aaSAndroid Build Coastguard Worker       ci2 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
247*3f1979aaSAndroid Build Coastguard Worker       cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
248*3f1979aaSAndroid Build Coastguard Worker       ci3 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
249*3f1979aaSAndroid Build Coastguard Worker       cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
250*3f1979aaSAndroid Build Coastguard Worker       ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
251*3f1979aaSAndroid Build Coastguard Worker       cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
252*3f1979aaSAndroid Build Coastguard Worker       ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
253*3f1979aaSAndroid Build Coastguard Worker       dr3 = VSUB(cr3, ci4);
254*3f1979aaSAndroid Build Coastguard Worker       dr4 = VADD(cr3, ci4);
255*3f1979aaSAndroid Build Coastguard Worker       di3 = VADD(ci3, cr4);
256*3f1979aaSAndroid Build Coastguard Worker       di4 = VSUB(ci3, cr4);
257*3f1979aaSAndroid Build Coastguard Worker       dr5 = VADD(cr2, ci5);
258*3f1979aaSAndroid Build Coastguard Worker       dr2 = VSUB(cr2, ci5);
259*3f1979aaSAndroid Build Coastguard Worker       di5 = VSUB(ci2, cr5);
260*3f1979aaSAndroid Build Coastguard Worker       di2 = VADD(ci2, cr5);
261*3f1979aaSAndroid Build Coastguard Worker       wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
262*3f1979aaSAndroid Build Coastguard Worker       wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1];
263*3f1979aaSAndroid Build Coastguard Worker       VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
264*3f1979aaSAndroid Build Coastguard Worker       ch_ref(i - 1, 2) = dr2;
265*3f1979aaSAndroid Build Coastguard Worker       ch_ref(i, 2)     = di2;
266*3f1979aaSAndroid Build Coastguard Worker       VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
267*3f1979aaSAndroid Build Coastguard Worker       ch_ref(i - 1, 3) = dr3;
268*3f1979aaSAndroid Build Coastguard Worker       ch_ref(i, 3)     = di3;
269*3f1979aaSAndroid Build Coastguard Worker       VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
270*3f1979aaSAndroid Build Coastguard Worker       ch_ref(i - 1, 4) = dr4;
271*3f1979aaSAndroid Build Coastguard Worker       ch_ref(i, 4)     = di4;
272*3f1979aaSAndroid Build Coastguard Worker       VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
273*3f1979aaSAndroid Build Coastguard Worker       ch_ref(i - 1, 5) = dr5;
274*3f1979aaSAndroid Build Coastguard Worker       ch_ref(i, 5)     = di5;
275*3f1979aaSAndroid Build Coastguard Worker     }
276*3f1979aaSAndroid Build Coastguard Worker   }
277*3f1979aaSAndroid Build Coastguard Worker #undef ch_ref
278*3f1979aaSAndroid Build Coastguard Worker #undef cc_ref
279*3f1979aaSAndroid Build Coastguard Worker }
280*3f1979aaSAndroid Build Coastguard Worker 
radf2_ps(int ido,int l1,const v4sf * RESTRICT cc,v4sf * RESTRICT ch,const float * wa1)281*3f1979aaSAndroid Build Coastguard Worker static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
282*3f1979aaSAndroid Build Coastguard Worker   static const float minus_one = -1.f;
283*3f1979aaSAndroid Build Coastguard Worker   int i, k, l1ido = l1*ido;
284*3f1979aaSAndroid Build Coastguard Worker   for (k=0; k < l1ido; k += ido) {
285*3f1979aaSAndroid Build Coastguard Worker     v4sf a = cc[k], b = cc[k + l1ido];
286*3f1979aaSAndroid Build Coastguard Worker     ch[2*k] = VADD(a, b);
287*3f1979aaSAndroid Build Coastguard Worker     ch[2*(k+ido)-1] = VSUB(a, b);
288*3f1979aaSAndroid Build Coastguard Worker   }
289*3f1979aaSAndroid Build Coastguard Worker   if (ido < 2) return;
290*3f1979aaSAndroid Build Coastguard Worker   if (ido != 2) {
291*3f1979aaSAndroid Build Coastguard Worker     for (k=0; k < l1ido; k += ido) {
292*3f1979aaSAndroid Build Coastguard Worker       for (i=2; i<ido; i+=2) {
293*3f1979aaSAndroid Build Coastguard Worker         v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
294*3f1979aaSAndroid Build Coastguard Worker         v4sf br = cc[i - 1 + k], bi = cc[i + k];
295*3f1979aaSAndroid Build Coastguard Worker         VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
296*3f1979aaSAndroid Build Coastguard Worker         ch[i + 2*k] = VADD(bi, ti2);
297*3f1979aaSAndroid Build Coastguard Worker         ch[2*(k+ido) - i] = VSUB(ti2, bi);
298*3f1979aaSAndroid Build Coastguard Worker         ch[i - 1 + 2*k] = VADD(br, tr2);
299*3f1979aaSAndroid Build Coastguard Worker         ch[2*(k+ido) - i -1] = VSUB(br, tr2);
300*3f1979aaSAndroid Build Coastguard Worker       }
301*3f1979aaSAndroid Build Coastguard Worker     }
302*3f1979aaSAndroid Build Coastguard Worker     if (ido % 2 == 1) return;
303*3f1979aaSAndroid Build Coastguard Worker   }
304*3f1979aaSAndroid Build Coastguard Worker   for (k=0; k < l1ido; k += ido) {
305*3f1979aaSAndroid Build Coastguard Worker     ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
306*3f1979aaSAndroid Build Coastguard Worker     ch[2*k + ido-1] = cc[k + ido-1];
307*3f1979aaSAndroid Build Coastguard Worker   }
308*3f1979aaSAndroid Build Coastguard Worker } /* radf2 */
309*3f1979aaSAndroid Build Coastguard Worker 
310*3f1979aaSAndroid Build Coastguard Worker 
radb2_ps(int ido,int l1,const v4sf * cc,v4sf * ch,const float * wa1)311*3f1979aaSAndroid Build Coastguard Worker static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
312*3f1979aaSAndroid Build Coastguard Worker   static const float minus_two=-2;
313*3f1979aaSAndroid Build Coastguard Worker   int i, k, l1ido = l1*ido;
314*3f1979aaSAndroid Build Coastguard Worker   v4sf a,b,c,d, tr2, ti2;
315*3f1979aaSAndroid Build Coastguard Worker   for (k=0; k < l1ido; k += ido) {
316*3f1979aaSAndroid Build Coastguard Worker     a = cc[2*k]; b = cc[2*(k+ido) - 1];
317*3f1979aaSAndroid Build Coastguard Worker     ch[k] = VADD(a, b);
318*3f1979aaSAndroid Build Coastguard Worker     ch[k + l1ido] =VSUB(a, b);
319*3f1979aaSAndroid Build Coastguard Worker   }
320*3f1979aaSAndroid Build Coastguard Worker   if (ido < 2) return;
321*3f1979aaSAndroid Build Coastguard Worker   if (ido != 2) {
322*3f1979aaSAndroid Build Coastguard Worker     for (k = 0; k < l1ido; k += ido) {
323*3f1979aaSAndroid Build Coastguard Worker       for (i = 2; i < ido; i += 2) {
324*3f1979aaSAndroid Build Coastguard Worker         a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
325*3f1979aaSAndroid Build Coastguard Worker         c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
326*3f1979aaSAndroid Build Coastguard Worker         ch[i-1 + k] = VADD(a, b);
327*3f1979aaSAndroid Build Coastguard Worker         tr2 = VSUB(a, b);
328*3f1979aaSAndroid Build Coastguard Worker         ch[i+0 + k] = VSUB(c, d);
329*3f1979aaSAndroid Build Coastguard Worker         ti2 = VADD(c, d);
330*3f1979aaSAndroid Build Coastguard Worker         VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
331*3f1979aaSAndroid Build Coastguard Worker         ch[i-1 + k + l1ido] = tr2;
332*3f1979aaSAndroid Build Coastguard Worker         ch[i+0 + k + l1ido] = ti2;
333*3f1979aaSAndroid Build Coastguard Worker       }
334*3f1979aaSAndroid Build Coastguard Worker     }
335*3f1979aaSAndroid Build Coastguard Worker     if (ido % 2 == 1) return;
336*3f1979aaSAndroid Build Coastguard Worker   }
337*3f1979aaSAndroid Build Coastguard Worker   for (k = 0; k < l1ido; k += ido) {
338*3f1979aaSAndroid Build Coastguard Worker     a = cc[2*k + ido-1]; b = cc[2*k + ido];
339*3f1979aaSAndroid Build Coastguard Worker     ch[k + ido-1] = VADD(a,a);
340*3f1979aaSAndroid Build Coastguard Worker     ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
341*3f1979aaSAndroid Build Coastguard Worker   }
342*3f1979aaSAndroid Build Coastguard Worker } /* radb2 */
343*3f1979aaSAndroid Build Coastguard Worker 
/*
  Radix-3 pass of the real forward transform.

  ido      : number of v4sf values per sub-sequence
  l1       : number of sub-sequences per input plane
  cc       : input, read as three planes that are l1*ido vectors apart
  ch       : output, sub-sequence k occupies the three consecutive blocks
             starting at ch[3*k*ido]; must not alias cc (RESTRICT)
  wa1, wa2 : twiddle factors as interleaved (re, im) floats; only read when
             ido > 2
*/
static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                     const float *wa1, const float *wa2) {
  static const float taur = -0.5f;               /* cos(2*pi/3) */
  static const float taui = 0.866025403784439f;  /* sin(2*pi/3) */
  int i, k, ic;
  v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
  /* first element of every sub-sequence: no twiddle needed */
  for (k=0; k<l1; k++) {
    cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
    ch[3*k*ido] = VADD(cc[k*ido], cr2);
    ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
    ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
  }
  if (ido == 1) return;
  for (k=0; k<l1; k++) {
    for (i=2; i<ido; i+=2) {
      ic = ido - i;  /* mirrored index into the middle output block */
      wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
      dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
      VCPLXMULCONJ(dr2, di2, wr1, wi1);  /* updates dr2/di2 in place */

      wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
      dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
      VCPLXMULCONJ(dr3, di3, wr2, wi2);  /* updates dr3/di3 in place */

      cr2 = VADD(dr2, dr3);
      ci2 = VADD(di2, di3);
      ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
      ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
      tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
      ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
      tr3 = SVMUL(taui, VSUB(di2, di3));
      ti3 = SVMUL(taui, VSUB(dr3, dr2));
      ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
      ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
      ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
      ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
    }
  }
} /* radf3 */
383*3f1979aaSAndroid Build Coastguard Worker 
384*3f1979aaSAndroid Build Coastguard Worker 
/* Radix-3 butterfly pass of the backward real transform (the counterpart of
   radf3_ps; derived from the FFTPACK radb3 routine).

   ido, l1 : sub-transform length and number of sub-transforms for this pass.
   cc      : input, read as cc[(3*k + j)*ido + i], j = 0..2, k = 0..l1-1.
   ch      : output, written as ch[(k + j*l1)*ido + i]; must not alias cc.
   wa1/wa2 : twiddle tables, read in pairs wa[i-2], wa[i-1] for even i >= 2
             and handed to VCPLXMUL as its second complex operand.

   taur and taui are cos(2*pi/3) and sin(2*pi/3). */
static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
                     const float *wa1, const float *wa2)
{
  static const float taur = -0.5f;
  static const float taui = 0.866025403784439f;
  static const float taui_2 = 0.866025403784439f*2;
  const int plane = l1*ido;  /* distance between the three output planes of ch */
  int k;

  /* First pass: the elements at offsets 0 and ido-1 of each input row, which
     need no twiddle multiplication. */
  for (k = 0; k < l1; ++k) {
    const v4sf *src = cc + 3*k*ido;
    v4sf *dst = ch + k*ido;
    v4sf twice, centre, side;

    twice = src[2*ido - 1];
    twice = VADD(twice, twice);
    centre = VMADD(LD_PS1(taur), twice, src[0]);
    dst[0] = VADD(src[0], twice);
    side = SVMUL(taui_2, src[2*ido]);
    dst[plane] = VSUB(centre, side);
    dst[2*plane] = VADD(centre, side);
  }

  if (ido == 1) return;

  /* Second pass: the remaining elements, processed as (i-1, i) pairs together
     with their mirrored partners at ic = ido - i. */
  for (k = 0; k < l1; ++k) {
    const v4sf *src0 = cc + 3*k*ido;
    const v4sf *src1 = src0 + ido;
    const v4sf *src2 = src1 + ido;
    v4sf *dst0 = ch + k*ido;
    v4sf *dst1 = dst0 + plane;
    v4sf *dst2 = dst1 + plane;
    int i;

    for (i = 2; i < ido; i += 2) {
      const int ic = ido - i;
      v4sf sum_a, sum_b, mid_a, mid_b, rot_a, rot_b;
      v4sf out1_a, out1_b, out2_a, out2_b;

      sum_a = VADD(src2[i - 1], src1[ic - 1]);
      mid_a = VMADD(LD_PS1(taur), sum_a, src0[i - 1]);
      dst0[i - 1] = VADD(src0[i - 1], sum_a);

      sum_b = VSUB(src2[i], src1[ic]);
      mid_b = VMADD(LD_PS1(taur), sum_b, src0[i]);
      dst0[i] = VADD(src0[i], sum_b);

      rot_a = SVMUL(taui, VSUB(src2[i - 1], src1[ic - 1]));
      rot_b = SVMUL(taui, VADD(src2[i], src1[ic]));

      out1_a = VSUB(mid_a, rot_b);
      out2_a = VADD(mid_a, rot_b);
      out1_b = VADD(mid_b, rot_a);
      out2_b = VSUB(mid_b, rot_a);

      /* VCPLXMUL updates its first two arguments in place. */
      VCPLXMUL(out1_a, out1_b, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
      dst1[i - 1] = out1_a;
      dst1[i] = out1_b;

      VCPLXMUL(out2_a, out2_b, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
      dst2[i - 1] = out2_a;
      dst2[i] = out2_b;
    }
  }
} /* radb3 */
426*3f1979aaSAndroid Build Coastguard Worker 
radf4_ps(int ido,int l1,const v4sf * RESTRICT cc,v4sf * RESTRICT ch,const float * RESTRICT wa1,const float * RESTRICT wa2,const float * RESTRICT wa3)427*3f1979aaSAndroid Build Coastguard Worker static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
428*3f1979aaSAndroid Build Coastguard Worker                                    const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
429*3f1979aaSAndroid Build Coastguard Worker {
430*3f1979aaSAndroid Build Coastguard Worker   static const float minus_hsqt2 = (float)-0.7071067811865475;
431*3f1979aaSAndroid Build Coastguard Worker   int i, k, l1ido = l1*ido;
432*3f1979aaSAndroid Build Coastguard Worker   {
433*3f1979aaSAndroid Build Coastguard Worker     const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
434*3f1979aaSAndroid Build Coastguard Worker     v4sf * RESTRICT ch_ = ch;
435*3f1979aaSAndroid Build Coastguard Worker     while (cc < cc_end) {
436*3f1979aaSAndroid Build Coastguard Worker       /* this loop represents between 25% and 40% of total radf4_ps cost ! */
437*3f1979aaSAndroid Build Coastguard Worker       v4sf a0 = cc[0], a1 = cc[l1ido];
438*3f1979aaSAndroid Build Coastguard Worker       v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
439*3f1979aaSAndroid Build Coastguard Worker       v4sf tr1 = VADD(a1, a3);
440*3f1979aaSAndroid Build Coastguard Worker       v4sf tr2 = VADD(a0, a2);
441*3f1979aaSAndroid Build Coastguard Worker       ch[2*ido-1] = VSUB(a0, a2);
442*3f1979aaSAndroid Build Coastguard Worker       ch[2*ido  ] = VSUB(a3, a1);
443*3f1979aaSAndroid Build Coastguard Worker       ch[0      ] = VADD(tr1, tr2);
444*3f1979aaSAndroid Build Coastguard Worker       ch[4*ido-1] = VSUB(tr2, tr1);
445*3f1979aaSAndroid Build Coastguard Worker       cc += ido; ch += 4*ido;
446*3f1979aaSAndroid Build Coastguard Worker     }
447*3f1979aaSAndroid Build Coastguard Worker     cc = cc_; ch = ch_;
448*3f1979aaSAndroid Build Coastguard Worker   }
449*3f1979aaSAndroid Build Coastguard Worker   if (ido < 2) return;
450*3f1979aaSAndroid Build Coastguard Worker   if (ido != 2) {
451*3f1979aaSAndroid Build Coastguard Worker     for (k = 0; k < l1ido; k += ido) {
452*3f1979aaSAndroid Build Coastguard Worker       const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
453*3f1979aaSAndroid Build Coastguard Worker       for (i=2; i<ido; i += 2, pc += 2) {
454*3f1979aaSAndroid Build Coastguard Worker         int ic = ido - i;
455*3f1979aaSAndroid Build Coastguard Worker         v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
456*3f1979aaSAndroid Build Coastguard Worker         v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
457*3f1979aaSAndroid Build Coastguard Worker 
458*3f1979aaSAndroid Build Coastguard Worker         cr2 = pc[1*l1ido+0];
459*3f1979aaSAndroid Build Coastguard Worker         ci2 = pc[1*l1ido+1];
460*3f1979aaSAndroid Build Coastguard Worker         wr=LD_PS1(wa1[i - 2]);
461*3f1979aaSAndroid Build Coastguard Worker         wi=LD_PS1(wa1[i - 1]);
462*3f1979aaSAndroid Build Coastguard Worker         VCPLXMULCONJ(cr2,ci2,wr,wi);
463*3f1979aaSAndroid Build Coastguard Worker 
464*3f1979aaSAndroid Build Coastguard Worker         cr3 = pc[2*l1ido+0];
465*3f1979aaSAndroid Build Coastguard Worker         ci3 = pc[2*l1ido+1];
466*3f1979aaSAndroid Build Coastguard Worker         wr = LD_PS1(wa2[i-2]);
467*3f1979aaSAndroid Build Coastguard Worker         wi = LD_PS1(wa2[i-1]);
468*3f1979aaSAndroid Build Coastguard Worker         VCPLXMULCONJ(cr3, ci3, wr, wi);
469*3f1979aaSAndroid Build Coastguard Worker 
470*3f1979aaSAndroid Build Coastguard Worker         cr4 = pc[3*l1ido];
471*3f1979aaSAndroid Build Coastguard Worker         ci4 = pc[3*l1ido+1];
472*3f1979aaSAndroid Build Coastguard Worker         wr = LD_PS1(wa3[i-2]);
473*3f1979aaSAndroid Build Coastguard Worker         wi = LD_PS1(wa3[i-1]);
474*3f1979aaSAndroid Build Coastguard Worker         VCPLXMULCONJ(cr4, ci4, wr, wi);
475*3f1979aaSAndroid Build Coastguard Worker 
476*3f1979aaSAndroid Build Coastguard Worker         /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
477*3f1979aaSAndroid Build Coastguard Worker 
478*3f1979aaSAndroid Build Coastguard Worker         tr1 = VADD(cr2,cr4);
479*3f1979aaSAndroid Build Coastguard Worker         tr4 = VSUB(cr4,cr2);
480*3f1979aaSAndroid Build Coastguard Worker         tr2 = VADD(pc[0],cr3);
481*3f1979aaSAndroid Build Coastguard Worker         tr3 = VSUB(pc[0],cr3);
482*3f1979aaSAndroid Build Coastguard Worker         ch[i - 1 + 4*k] = VADD(tr1,tr2);
483*3f1979aaSAndroid Build Coastguard Worker         ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); /* at this point tr1 and tr2 can be disposed */
484*3f1979aaSAndroid Build Coastguard Worker         ti1 = VADD(ci2,ci4);
485*3f1979aaSAndroid Build Coastguard Worker         ti4 = VSUB(ci2,ci4);
486*3f1979aaSAndroid Build Coastguard Worker         ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
487*3f1979aaSAndroid Build Coastguard Worker         ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); /* dispose tr3, ti4 */
488*3f1979aaSAndroid Build Coastguard Worker         ti2 = VADD(pc[1],ci3);
489*3f1979aaSAndroid Build Coastguard Worker         ti3 = VSUB(pc[1],ci3);
490*3f1979aaSAndroid Build Coastguard Worker         ch[i + 4*k] = VADD(ti1, ti2);
491*3f1979aaSAndroid Build Coastguard Worker         ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
492*3f1979aaSAndroid Build Coastguard Worker         ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
493*3f1979aaSAndroid Build Coastguard Worker         ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
494*3f1979aaSAndroid Build Coastguard Worker       }
495*3f1979aaSAndroid Build Coastguard Worker     }
496*3f1979aaSAndroid Build Coastguard Worker     if (ido % 2 == 1) return;
497*3f1979aaSAndroid Build Coastguard Worker   }
498*3f1979aaSAndroid Build Coastguard Worker   for (k=0; k<l1ido; k += ido) {
499*3f1979aaSAndroid Build Coastguard Worker     v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
500*3f1979aaSAndroid Build Coastguard Worker     v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
501*3f1979aaSAndroid Build Coastguard Worker     v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
502*3f1979aaSAndroid Build Coastguard Worker     v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
503*3f1979aaSAndroid Build Coastguard Worker     ch[ido-1 + 4*k] = VADD(tr1, c);
504*3f1979aaSAndroid Build Coastguard Worker     ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
505*3f1979aaSAndroid Build Coastguard Worker     ch[4*k + 1*ido] = VSUB(ti1, d);
506*3f1979aaSAndroid Build Coastguard Worker     ch[4*k + 3*ido] = VADD(ti1, d);
507*3f1979aaSAndroid Build Coastguard Worker   }
508*3f1979aaSAndroid Build Coastguard Worker } /* radf4 */
509*3f1979aaSAndroid Build Coastguard Worker 
510*3f1979aaSAndroid Build Coastguard Worker 
radb4_ps(int ido,int l1,const v4sf * RESTRICT cc,v4sf * RESTRICT ch,const float * RESTRICT wa1,const float * RESTRICT wa2,const float * RESTRICT wa3)511*3f1979aaSAndroid Build Coastguard Worker static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
512*3f1979aaSAndroid Build Coastguard Worker                                    const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
513*3f1979aaSAndroid Build Coastguard Worker {
514*3f1979aaSAndroid Build Coastguard Worker   static const float minus_sqrt2 = (float)-1.414213562373095;
515*3f1979aaSAndroid Build Coastguard Worker   static const float two = 2.f;
516*3f1979aaSAndroid Build Coastguard Worker   int i, k, l1ido = l1*ido;
517*3f1979aaSAndroid Build Coastguard Worker   v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
518*3f1979aaSAndroid Build Coastguard Worker   {
519*3f1979aaSAndroid Build Coastguard Worker     const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
520*3f1979aaSAndroid Build Coastguard Worker     v4sf *ch_ = ch;
521*3f1979aaSAndroid Build Coastguard Worker     while (ch < ch_end) {
522*3f1979aaSAndroid Build Coastguard Worker       v4sf a = cc[0], b = cc[4*ido-1];
523*3f1979aaSAndroid Build Coastguard Worker       v4sf c = cc[2*ido], d = cc[2*ido-1];
524*3f1979aaSAndroid Build Coastguard Worker       tr3 = SVMUL(two,d);
525*3f1979aaSAndroid Build Coastguard Worker       tr2 = VADD(a,b);
526*3f1979aaSAndroid Build Coastguard Worker       tr1 = VSUB(a,b);
527*3f1979aaSAndroid Build Coastguard Worker       tr4 = SVMUL(two,c);
528*3f1979aaSAndroid Build Coastguard Worker       ch[0*l1ido] = VADD(tr2, tr3);
529*3f1979aaSAndroid Build Coastguard Worker       ch[2*l1ido] = VSUB(tr2, tr3);
530*3f1979aaSAndroid Build Coastguard Worker       ch[1*l1ido] = VSUB(tr1, tr4);
531*3f1979aaSAndroid Build Coastguard Worker       ch[3*l1ido] = VADD(tr1, tr4);
532*3f1979aaSAndroid Build Coastguard Worker 
533*3f1979aaSAndroid Build Coastguard Worker       cc += 4*ido; ch += ido;
534*3f1979aaSAndroid Build Coastguard Worker     }
535*3f1979aaSAndroid Build Coastguard Worker     cc = cc_; ch = ch_;
536*3f1979aaSAndroid Build Coastguard Worker   }
537*3f1979aaSAndroid Build Coastguard Worker   if (ido < 2) return;
538*3f1979aaSAndroid Build Coastguard Worker   if (ido != 2) {
539*3f1979aaSAndroid Build Coastguard Worker     for (k = 0; k < l1ido; k += ido) {
540*3f1979aaSAndroid Build Coastguard Worker       const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
541*3f1979aaSAndroid Build Coastguard Worker       v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
542*3f1979aaSAndroid Build Coastguard Worker       for (i = 2; i < ido; i += 2) {
543*3f1979aaSAndroid Build Coastguard Worker 
544*3f1979aaSAndroid Build Coastguard Worker         tr1 = VSUB(pc[i], pc[4*ido - i]);
545*3f1979aaSAndroid Build Coastguard Worker         tr2 = VADD(pc[i], pc[4*ido - i]);
546*3f1979aaSAndroid Build Coastguard Worker         ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
547*3f1979aaSAndroid Build Coastguard Worker         tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
548*3f1979aaSAndroid Build Coastguard Worker         ph[0] = VADD(tr2, tr3);
549*3f1979aaSAndroid Build Coastguard Worker         cr3 = VSUB(tr2, tr3);
550*3f1979aaSAndroid Build Coastguard Worker 
551*3f1979aaSAndroid Build Coastguard Worker         ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
552*3f1979aaSAndroid Build Coastguard Worker         tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
553*3f1979aaSAndroid Build Coastguard Worker         cr2 = VSUB(tr1, tr4);
554*3f1979aaSAndroid Build Coastguard Worker         cr4 = VADD(tr1, tr4);
555*3f1979aaSAndroid Build Coastguard Worker 
556*3f1979aaSAndroid Build Coastguard Worker         ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
557*3f1979aaSAndroid Build Coastguard Worker         ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
558*3f1979aaSAndroid Build Coastguard Worker 
559*3f1979aaSAndroid Build Coastguard Worker         ph[1] = VADD(ti2, ti3); ph += l1ido;
560*3f1979aaSAndroid Build Coastguard Worker         ci3 = VSUB(ti2, ti3);
561*3f1979aaSAndroid Build Coastguard Worker         ci2 = VADD(ti1, ti4);
562*3f1979aaSAndroid Build Coastguard Worker         ci4 = VSUB(ti1, ti4);
563*3f1979aaSAndroid Build Coastguard Worker         VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
564*3f1979aaSAndroid Build Coastguard Worker         ph[0] = cr2;
565*3f1979aaSAndroid Build Coastguard Worker         ph[1] = ci2; ph += l1ido;
566*3f1979aaSAndroid Build Coastguard Worker         VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
567*3f1979aaSAndroid Build Coastguard Worker         ph[0] = cr3;
568*3f1979aaSAndroid Build Coastguard Worker         ph[1] = ci3; ph += l1ido;
569*3f1979aaSAndroid Build Coastguard Worker         VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
570*3f1979aaSAndroid Build Coastguard Worker         ph[0] = cr4;
571*3f1979aaSAndroid Build Coastguard Worker         ph[1] = ci4; ph = ph - 3*l1ido + 2;
572*3f1979aaSAndroid Build Coastguard Worker       }
573*3f1979aaSAndroid Build Coastguard Worker     }
574*3f1979aaSAndroid Build Coastguard Worker     if (ido % 2 == 1) return;
575*3f1979aaSAndroid Build Coastguard Worker   }
576*3f1979aaSAndroid Build Coastguard Worker   for (k=0; k < l1ido; k+=ido) {
577*3f1979aaSAndroid Build Coastguard Worker     int i0 = 4*k + ido;
578*3f1979aaSAndroid Build Coastguard Worker     v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
579*3f1979aaSAndroid Build Coastguard Worker     v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
580*3f1979aaSAndroid Build Coastguard Worker     tr1 = VSUB(c,d);
581*3f1979aaSAndroid Build Coastguard Worker     tr2 = VADD(c,d);
582*3f1979aaSAndroid Build Coastguard Worker     ti1 = VADD(b,a);
583*3f1979aaSAndroid Build Coastguard Worker     ti2 = VSUB(b,a);
584*3f1979aaSAndroid Build Coastguard Worker     ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
585*3f1979aaSAndroid Build Coastguard Worker     ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
586*3f1979aaSAndroid Build Coastguard Worker     ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
587*3f1979aaSAndroid Build Coastguard Worker     ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
588*3f1979aaSAndroid Build Coastguard Worker   }
589*3f1979aaSAndroid Build Coastguard Worker } /* radb4 */
590*3f1979aaSAndroid Build Coastguard Worker 
/* Radix-5 butterfly pass of the forward real transform (derived from the
   FFTPACK radf5 routine; the 1-based cc_ref/ch_ref indexing is inherited
   from that Fortran origin).

   ido, l1  : sub-transform length and number of sub-transforms.
   cc       : input, addressed as cc_ref(i, k, j) with i = 1..ido,
              k = 1..l1, j = 1..5.
   ch       : output, addressed as ch_ref(i, j, k); must not alias cc.
   wa1..wa4 : twiddle tables, read in pairs wa[i-3], wa[i-2] for odd i >= 3.

   tr11/ti11 are cos/sin(2*pi/5); tr12/ti12 are cos/sin(4*pi/5). */
static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                     const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
  static const float tr11 = .309016994374947f;
  static const float ti11 = .951056516295154f;
  static const float tr12 = -.809016994374947f;
  static const float ti12 = .587785252292473f;

  /* Amount by which a 1-based (i, k, j) subscript overshoots the matching
     0-based element.  The offsets are subtracted inside the accessor macros;
     the pointers themselves are left alone, because moving cc/ch back by
     these amounts would place them before the start of their arrays, which
     is undefined behaviour in C. */
  const int cc_offset = 1 + ido * (1 + l1);
  const int ch_offset = 1 + ido * 6;

  /* Local variables */
  int i, k, ic;
  v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
    cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
  int idp2;


#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + (a_1) - cc_offset]
#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + (a_1) - ch_offset]

  /* First element (i == 1) of every sub-transform: no twiddles needed. */
  for (k = 1; k <= l1; ++k) {
    cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
    ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
    cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
    ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
    ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
    ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
    ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
    ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
    ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
  }
  if (ido == 1) {
    return;
  }
  idp2 = ido + 2;
  /* Remaining elements, handled as (i-1, i) pairs with mirrored outputs
     at ic = ido + 2 - i. */
  for (k = 1; k <= l1; ++k) {
    for (i = 3; i <= ido; i += 2) {
      ic = idp2 - i;
      /* Load the twiddles into dr/di; VCPLXMULCONJ then updates them in
         place using the matching input pair as its second operand. */
      dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
      dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
      dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
      dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
      VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
      VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
      VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
      VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
      cr2 = VADD(dr2, dr5);
      ci5 = VSUB(dr5, dr2);
      cr5 = VSUB(di2, di5);
      ci2 = VADD(di2, di5);
      cr3 = VADD(dr3, dr4);
      ci4 = VSUB(dr4, dr3);
      cr4 = VSUB(di3, di4);
      ci3 = VADD(di3, di4);
      ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
      ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));
      tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
      ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));
      tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
      ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));
      tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
      ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
      tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
      ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
      ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
      ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
      ch_ref(i, 3, k) = VADD(ti2, ti5);
      ch_ref(ic, 2, k) = VSUB(ti5, ti2);
      ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
      ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
      ch_ref(i, 5, k) = VADD(ti3, ti4);
      ch_ref(ic, 4, k) = VSUB(ti4, ti3);
    }
  }
#undef cc_ref
#undef ch_ref
} /* radf5 */
677*3f1979aaSAndroid Build Coastguard Worker 
/* Radix-5 butterfly pass of the backward real transform (the counterpart of
   radf5_ps; derived from the FFTPACK radb5 routine, whose 1-based indexing
   the cc_ref/ch_ref accessors retain).

   ido, l1  : sub-transform length and number of sub-transforms.
   cc       : input, addressed as cc_ref(i, j, k) with i = 1..ido,
              j = 1..5, k = 1..l1.
   ch       : output, addressed as ch_ref(i, k, j); must not alias cc.
   wa1..wa4 : twiddle tables, read in pairs wa[i-3], wa[i-2] for odd i >= 3
              and handed to VCPLXMUL.

   tr11/ti11 are cos/sin(2*pi/5); tr12/ti12 are cos/sin(4*pi/5). */
static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
                  const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
  static const float tr11 = .309016994374947f;
  static const float ti11 = .951056516295154f;
  static const float tr12 = -.809016994374947f;
  static const float ti12 = .587785252292473f;

  /* Amount by which a 1-based subscript overshoots the matching 0-based
     element.  The offsets are subtracted inside the accessor macros; the
     pointers themselves are left alone, because moving cc/ch back by these
     amounts would place them before the start of their arrays, which is
     undefined behaviour in C. */
  const int ch_offset = 1 + ido * (1 + l1);
  const int cc_offset = 1 + ido * 6;

  /* Local variables */
  int i, k, ic;
  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
  int idp2;

#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + (a_1) - cc_offset]
#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + (a_1) - ch_offset]

  /* First element (i == 1) of every sub-transform: no twiddles needed. */
  for (k = 1; k <= l1; ++k) {
    ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
    ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
    tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
    tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
    ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
    cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
    cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
    ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
    ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
    ch_ref(1, k, 2) = VSUB(cr2, ci5);
    ch_ref(1, k, 3) = VSUB(cr3, ci4);
    ch_ref(1, k, 4) = VADD(cr3, ci4);
    ch_ref(1, k, 5) = VADD(cr2, ci5);
  }
  if (ido == 1) {
    return;
  }
  idp2 = ido + 2;
  /* Remaining elements, handled as (i-1, i) pairs combined with their
     mirrored partners at ic = ido + 2 - i. */
  for (k = 1; k <= l1; ++k) {
    for (i = 3; i <= ido; i += 2) {
      ic = idp2 - i;
      ti5 = VADD(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
      ti2 = VSUB(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
      ti4 = VADD(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
      ti3 = VSUB(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
      tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
      tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
      tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
      tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
      ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
      ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
      cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
      ci2 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
      cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
      ci3 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
      dr3 = VSUB(cr3, ci4);
      dr4 = VADD(cr3, ci4);
      di3 = VADD(ci3, cr4);
      di4 = VSUB(ci3, cr4);
      dr5 = VADD(cr2, ci5);
      dr2 = VSUB(cr2, ci5);
      di5 = VSUB(ci2, cr5);
      di2 = VADD(ci2, cr5);
      /* VCPLXMUL updates its first two arguments in place. */
      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
      VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
      VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));

      ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
      ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
      ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
      ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
    }
  }
#undef cc_ref
#undef ch_ref
} /* radb5 */
766*3f1979aaSAndroid Build Coastguard Worker 
/* Forward real-FFT driver.

   Runs one radix pass (radf2/3/4/5) per factor stored in ifac[2..], walking
   the factors from the last one to the first, and ping-ponging between the
   two work buffers.

   n              : length that was factorized into ifac
   input_readonly : data read by the first pass (never written here)
   work1, work2   : two distinct buffers used alternately as pass output
   wa             : twiddle table; each pass reads (ip-1)*ido entries, taken
                    from the end of the table towards its start
   ifac           : ifac[1] = number of factors, ifac[2..] = the factors

   Returns the buffer written by the last pass (or the input pointer when
   there are no factors). */
static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
                                      const float *wa, const int *ifac) {
  v4sf *in  = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  const int nf = ifac[1];
  int stage;
  int l2 = n;
  int iw = n-1;
  assert(in != out && work1 != work2);

  for (stage = nf - 1; stage >= 0; --stage) {
    const int ip  = ifac[stage + 2];
    const int l1  = l2 / ip;
    const int ido = n / l2;

    /* step back to the start of this pass' twiddles */
    iw -= (ip - 1)*ido;

    if (ip == 5) {
      radf5_ps(ido, l1, in, out, &wa[iw], &wa[iw + ido], &wa[iw + 2*ido], &wa[iw + 3*ido]);
    } else if (ip == 4) {
      radf4_ps(ido, l1, in, out, &wa[iw], &wa[iw + ido], &wa[iw + 2*ido]);
    } else if (ip == 3) {
      radf3_ps(ido, l1, in, out, &wa[iw], &wa[iw + ido]);
    } else if (ip == 2) {
      radf2_ps(ido, l1, in, out, &wa[iw]);
    } else {
      assert(0); /* unsupported radix */
    }

    l2 = l1;

    /* what was just written becomes the next input; the other work buffer
       becomes the next output */
    in  = out;
    out = (in == work2 ? work1 : work2);
  }
  return in; /* this is in fact the output .. */
} /* rfftf1 */
813*3f1979aaSAndroid Build Coastguard Worker 
/* Backward real-FFT driver.

   Runs one radix pass (radb2/3/4/5) per factor stored in ifac[2..], in
   order, ping-ponging between the two work buffers.

   n              : length that was factorized into ifac
   input_readonly : data read by the first pass (never written here)
   work1, work2   : two DISTINCT buffers used alternately as pass output;
                    if they aliased, every pass after the first would read
                    and write the same buffer
   wa             : twiddle table; each pass consumes (ip-1)*ido entries,
                    starting at offset 0
   ifac           : ifac[1] = number of factors, ifac[2..] = the factors

   Returns the buffer written by the last pass (or the input pointer when
   there are no factors). */
static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
                                      const float *wa, const int *ifac) {
  v4sf *in  = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  int nf = ifac[1], k1;
  int l1 = 1;
  int iw = 0;
  /* same precondition as rfftf1_ps / cfftf1_ps: the work buffers must not alias */
  assert(in != out && work1 != work2);
  for (k1=1; k1<=nf; k1++) {
    int ip = ifac[k1 + 1];
    int l2 = ip*l1;
    int ido = n / l2;
    switch (ip) {
      case 5: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        int ix4 = ix3 + ido;
        radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
      } break;
      case 4: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
      } break;
      case 3: {
        int ix2 = iw + ido;
        radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
      } break;
      case 2:
        radb2_ps(ido, l1, in, out, &wa[iw]);
        break;
      default:
        assert(0); /* unsupported radix */
        break;
    }
    l1 = l2;
    iw += (ip - 1)*ido; /* skip the twiddles consumed by this pass */

    /* swap roles: the buffer just written is the next pass' input */
    if (out == work2) {
      out = work1; in = work2;
    } else {
      out = work2; in = work1;
    }
  }
  return in; /* this is in fact the output .. */
} /* rfftb1 */
860*3f1979aaSAndroid Build Coastguard Worker 
/* Factorize n using only the candidate factors listed in ntryh.

   n     : number to factorize
   ifac  : output; ifac[0] = n, ifac[1] = number of factors found,
           ifac[2 .. 2+nf-1] = the factors. The caller must provide enough
           room for every factor (this function cannot check the capacity).
   ntryh : zero-terminated list of candidate factors, tried in order

   Candidates are divided out greedily in list order. Whenever a factor 2 is
   found and it is not the first factor, it is moved to the front of the
   list (the earlier factors are shifted up by one slot).

   If n has a prime factor that is not covered by ntryh, the factorization
   is only partial: the product of the returned factors is then different
   from n, which is how callers detect an unsupported size.

   n <= 0 cannot be factorized and yields zero factors (n == 0 would
   otherwise never terminate, since 0 is divisible by every candidate).

   Returns the number of factors found. */
static int decompose(int n, int *ifac, const int *ntryh) {
  int nl = n, nf = 0, i, j;

  if (n <= 0) {
    ifac[0] = n;
    ifac[1] = 0;
    return 0;
  }

  for (j = 0; ntryh[j]; ++j) {
    const int ntry = ntryh[j];
    while (nl != 1 && (nl % ntry) == 0) {
      ifac[2 + nf++] = ntry;
      nl /= ntry;
      if (ntry == 2 && nf != 1) {
        /* shift the factors found so far up by one and put the 2 first */
        for (i = nf; i >= 2; --i) {
          ifac[i + 1] = ifac[i];
        }
        ifac[2] = 2;
      }
    }
  }
  ifac[0] = n;
  ifac[1] = nf;
  return nf;
}
885*3f1979aaSAndroid Build Coastguard Worker 
886*3f1979aaSAndroid Build Coastguard Worker 
887*3f1979aaSAndroid Build Coastguard Worker 
rffti1_ps(int n,float * wa,int * ifac)888*3f1979aaSAndroid Build Coastguard Worker static void rffti1_ps(int n, float *wa, int *ifac)
889*3f1979aaSAndroid Build Coastguard Worker {
890*3f1979aaSAndroid Build Coastguard Worker   static const int ntryh[] = { 4,2,3,5,0 };
891*3f1979aaSAndroid Build Coastguard Worker   int k1, j, ii;
892*3f1979aaSAndroid Build Coastguard Worker 
893*3f1979aaSAndroid Build Coastguard Worker   int nf = decompose(n,ifac,ntryh);
894*3f1979aaSAndroid Build Coastguard Worker   float argh = (2*(float)M_PI) / n;
895*3f1979aaSAndroid Build Coastguard Worker   int is = 0;
896*3f1979aaSAndroid Build Coastguard Worker   int nfm1 = nf - 1;
897*3f1979aaSAndroid Build Coastguard Worker   int l1 = 1;
898*3f1979aaSAndroid Build Coastguard Worker   for (k1 = 1; k1 <= nfm1; k1++) {
899*3f1979aaSAndroid Build Coastguard Worker     int ip = ifac[k1 + 1];
900*3f1979aaSAndroid Build Coastguard Worker     int ld = 0;
901*3f1979aaSAndroid Build Coastguard Worker     int l2 = l1*ip;
902*3f1979aaSAndroid Build Coastguard Worker     int ido = n / l2;
903*3f1979aaSAndroid Build Coastguard Worker     int ipm = ip - 1;
904*3f1979aaSAndroid Build Coastguard Worker     for (j = 1; j <= ipm; ++j) {
905*3f1979aaSAndroid Build Coastguard Worker       float argld;
906*3f1979aaSAndroid Build Coastguard Worker       int i = is, fi=0;
907*3f1979aaSAndroid Build Coastguard Worker       ld += l1;
908*3f1979aaSAndroid Build Coastguard Worker       argld = ld*argh;
909*3f1979aaSAndroid Build Coastguard Worker       for (ii = 3; ii <= ido; ii += 2) {
910*3f1979aaSAndroid Build Coastguard Worker         i += 2;
911*3f1979aaSAndroid Build Coastguard Worker         fi += 1;
912*3f1979aaSAndroid Build Coastguard Worker         wa[i - 2] = FUNC_COS(fi*argld);
913*3f1979aaSAndroid Build Coastguard Worker         wa[i - 1] = FUNC_SIN(fi*argld);
914*3f1979aaSAndroid Build Coastguard Worker       }
915*3f1979aaSAndroid Build Coastguard Worker       is += ido;
916*3f1979aaSAndroid Build Coastguard Worker     }
917*3f1979aaSAndroid Build Coastguard Worker     l1 = l2;
918*3f1979aaSAndroid Build Coastguard Worker   }
919*3f1979aaSAndroid Build Coastguard Worker } /* rffti1 */
920*3f1979aaSAndroid Build Coastguard Worker 
cffti1_ps(int n,float * wa,int * ifac)921*3f1979aaSAndroid Build Coastguard Worker static void cffti1_ps(int n, float *wa, int *ifac)
922*3f1979aaSAndroid Build Coastguard Worker {
923*3f1979aaSAndroid Build Coastguard Worker   static const int ntryh[] = { 5,3,4,2,0 };
924*3f1979aaSAndroid Build Coastguard Worker   int k1, j, ii;
925*3f1979aaSAndroid Build Coastguard Worker 
926*3f1979aaSAndroid Build Coastguard Worker   int nf = decompose(n,ifac,ntryh);
927*3f1979aaSAndroid Build Coastguard Worker   float argh = (2*(float)M_PI) / n;
928*3f1979aaSAndroid Build Coastguard Worker   int i = 1;
929*3f1979aaSAndroid Build Coastguard Worker   int l1 = 1;
930*3f1979aaSAndroid Build Coastguard Worker   for (k1=1; k1<=nf; k1++) {
931*3f1979aaSAndroid Build Coastguard Worker     int ip = ifac[k1+1];
932*3f1979aaSAndroid Build Coastguard Worker     int ld = 0;
933*3f1979aaSAndroid Build Coastguard Worker     int l2 = l1*ip;
934*3f1979aaSAndroid Build Coastguard Worker     int ido = n / l2;
935*3f1979aaSAndroid Build Coastguard Worker     int idot = ido + ido + 2;
936*3f1979aaSAndroid Build Coastguard Worker     int ipm = ip - 1;
937*3f1979aaSAndroid Build Coastguard Worker     for (j=1; j<=ipm; j++) {
938*3f1979aaSAndroid Build Coastguard Worker       float argld;
939*3f1979aaSAndroid Build Coastguard Worker       int i1 = i, fi = 0;
940*3f1979aaSAndroid Build Coastguard Worker       wa[i-1] = 1;
941*3f1979aaSAndroid Build Coastguard Worker       wa[i] = 0;
942*3f1979aaSAndroid Build Coastguard Worker       ld += l1;
943*3f1979aaSAndroid Build Coastguard Worker       argld = ld*argh;
944*3f1979aaSAndroid Build Coastguard Worker       for (ii = 4; ii <= idot; ii += 2) {
945*3f1979aaSAndroid Build Coastguard Worker         i += 2;
946*3f1979aaSAndroid Build Coastguard Worker         fi += 1;
947*3f1979aaSAndroid Build Coastguard Worker         wa[i-1] = FUNC_COS(fi*argld);
948*3f1979aaSAndroid Build Coastguard Worker         wa[i] = FUNC_SIN(fi*argld);
949*3f1979aaSAndroid Build Coastguard Worker       }
950*3f1979aaSAndroid Build Coastguard Worker       if (ip > 5) {
951*3f1979aaSAndroid Build Coastguard Worker         wa[i1-1] = wa[i-1];
952*3f1979aaSAndroid Build Coastguard Worker         wa[i1] = wa[i];
953*3f1979aaSAndroid Build Coastguard Worker       }
954*3f1979aaSAndroid Build Coastguard Worker     }
955*3f1979aaSAndroid Build Coastguard Worker     l1 = l2;
956*3f1979aaSAndroid Build Coastguard Worker   }
957*3f1979aaSAndroid Build Coastguard Worker } /* cffti1 */
958*3f1979aaSAndroid Build Coastguard Worker 
959*3f1979aaSAndroid Build Coastguard Worker 
/* Complex-FFT driver, used for both directions (selected by isign, which is
   simply forwarded to the radix passes).

   Runs one pass (passf2/3/4/5) per factor stored in ifac[2..], in order,
   ping-ponging between the two work buffers.

   n              : length that was factorized into ifac
   input_readonly : data read by the first pass (never written here)
   work1, work2   : two distinct buffers used alternately as pass output
   wa             : twiddle table; each pass consumes (ip-1)*2*ido entries,
                    starting at offset 0
   ifac           : ifac[1] = number of factors, ifac[2..] = the factors

   Returns the buffer written by the last pass (or the input pointer when
   there are no factors). */
static v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
  v4sf *in  = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  const int nf = ifac[1];
  int stage;
  int l1 = 1;
  int iw = 0;
  assert(in != out && work1 != work2);

  for (stage = 0; stage < nf; ++stage) {
    const int ip   = ifac[stage + 2];
    const int l2   = ip*l1;
    const int ido  = n / l2;
    const int idot = 2*ido;

    if (ip == 5) {
      passf5_ps(idot, l1, in, out, &wa[iw], &wa[iw + idot], &wa[iw + 2*idot], &wa[iw + 3*idot], isign);
    } else if (ip == 4) {
      passf4_ps(idot, l1, in, out, &wa[iw], &wa[iw + idot], &wa[iw + 2*idot], isign);
    } else if (ip == 3) {
      passf3_ps(idot, l1, in, out, &wa[iw], &wa[iw + idot], isign);
    } else if (ip == 2) {
      passf2_ps(idot, l1, in, out, &wa[iw], isign);
    } else {
      assert(0); /* unsupported radix */
    }

    l1 = l2;
    iw += (ip - 1)*idot; /* skip the twiddles consumed by this pass */

    /* what was just written becomes the next input; the other work buffer
       becomes the next output */
    in  = out;
    out = (in == work2 ? work1 : work2);
  }

  return in; /* this is in fact the output .. */
}
1005*3f1979aaSAndroid Build Coastguard Worker 
1006*3f1979aaSAndroid Build Coastguard Worker 
1007*3f1979aaSAndroid Build Coastguard Worker struct SETUP_STRUCT {
1008*3f1979aaSAndroid Build Coastguard Worker   int     N;
1009*3f1979aaSAndroid Build Coastguard Worker   int     Ncvec;  /* nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) */
1010*3f1979aaSAndroid Build Coastguard Worker   int ifac[15];
1011*3f1979aaSAndroid Build Coastguard Worker   pffft_transform_t transform;
1012*3f1979aaSAndroid Build Coastguard Worker   v4sf *data;     /* allocated room for twiddle coefs */
1013*3f1979aaSAndroid Build Coastguard Worker   float *e;       /* points into 'data', N/4*3 elements */
1014*3f1979aaSAndroid Build Coastguard Worker   float *twiddle; /* points into 'data', N/4 elements */
1015*3f1979aaSAndroid Build Coastguard Worker };
1016*3f1979aaSAndroid Build Coastguard Worker 
FUNC_NEW_SETUP(int N,pffft_transform_t transform)1017*3f1979aaSAndroid Build Coastguard Worker SETUP_STRUCT *FUNC_NEW_SETUP(int N, pffft_transform_t transform) {
1018*3f1979aaSAndroid Build Coastguard Worker   SETUP_STRUCT *s = (SETUP_STRUCT*)malloc(sizeof(SETUP_STRUCT));
1019*3f1979aaSAndroid Build Coastguard Worker   int k, m;
1020*3f1979aaSAndroid Build Coastguard Worker   /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
1021*3f1979aaSAndroid Build Coastguard Worker      and 32 for real FFTs -- a lot of stuff would need to be rewritten to
1022*3f1979aaSAndroid Build Coastguard Worker      handle other cases (or maybe just switch to a scalar fft, I don't know..) */
1023*3f1979aaSAndroid Build Coastguard Worker   if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
1024*3f1979aaSAndroid Build Coastguard Worker   if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
1025*3f1979aaSAndroid Build Coastguard Worker   /* assert((N % 32) == 0); */
1026*3f1979aaSAndroid Build Coastguard Worker   s->N = N;
1027*3f1979aaSAndroid Build Coastguard Worker   s->transform = transform;
1028*3f1979aaSAndroid Build Coastguard Worker   /* nb of complex simd vectors */
1029*3f1979aaSAndroid Build Coastguard Worker   s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
1030*3f1979aaSAndroid Build Coastguard Worker   s->data = (v4sf*)FUNC_ALIGNED_MALLOC(2*s->Ncvec * sizeof(v4sf));
1031*3f1979aaSAndroid Build Coastguard Worker   s->e = (float*)s->data;
1032*3f1979aaSAndroid Build Coastguard Worker   s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
1033*3f1979aaSAndroid Build Coastguard Worker 
1034*3f1979aaSAndroid Build Coastguard Worker   if (transform == PFFFT_REAL) {
1035*3f1979aaSAndroid Build Coastguard Worker     for (k=0; k < s->Ncvec; ++k) {
1036*3f1979aaSAndroid Build Coastguard Worker       int i = k/SIMD_SZ;
1037*3f1979aaSAndroid Build Coastguard Worker       int j = k%SIMD_SZ;
1038*3f1979aaSAndroid Build Coastguard Worker       for (m=0; m < SIMD_SZ-1; ++m) {
1039*3f1979aaSAndroid Build Coastguard Worker         float A = -2*(float)M_PI*(m+1)*k / N;
1040*3f1979aaSAndroid Build Coastguard Worker         s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = FUNC_COS(A);
1041*3f1979aaSAndroid Build Coastguard Worker         s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = FUNC_SIN(A);
1042*3f1979aaSAndroid Build Coastguard Worker       }
1043*3f1979aaSAndroid Build Coastguard Worker     }
1044*3f1979aaSAndroid Build Coastguard Worker     rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
1045*3f1979aaSAndroid Build Coastguard Worker   } else {
1046*3f1979aaSAndroid Build Coastguard Worker     for (k=0; k < s->Ncvec; ++k) {
1047*3f1979aaSAndroid Build Coastguard Worker       int i = k/SIMD_SZ;
1048*3f1979aaSAndroid Build Coastguard Worker       int j = k%SIMD_SZ;
1049*3f1979aaSAndroid Build Coastguard Worker       for (m=0; m < SIMD_SZ-1; ++m) {
1050*3f1979aaSAndroid Build Coastguard Worker         float A = -2*(float)M_PI*(m+1)*k / N;
1051*3f1979aaSAndroid Build Coastguard Worker         s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = FUNC_COS(A);
1052*3f1979aaSAndroid Build Coastguard Worker         s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = FUNC_SIN(A);
1053*3f1979aaSAndroid Build Coastguard Worker       }
1054*3f1979aaSAndroid Build Coastguard Worker     }
1055*3f1979aaSAndroid Build Coastguard Worker     cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
1056*3f1979aaSAndroid Build Coastguard Worker   }
1057*3f1979aaSAndroid Build Coastguard Worker 
1058*3f1979aaSAndroid Build Coastguard Worker   /* check that N is decomposable with allowed prime factors */
1059*3f1979aaSAndroid Build Coastguard Worker   for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
1060*3f1979aaSAndroid Build Coastguard Worker   if (m != N/SIMD_SZ) {
1061*3f1979aaSAndroid Build Coastguard Worker     FUNC_DESTROY(s); s = 0;
1062*3f1979aaSAndroid Build Coastguard Worker   }
1063*3f1979aaSAndroid Build Coastguard Worker 
1064*3f1979aaSAndroid Build Coastguard Worker   return s;
1065*3f1979aaSAndroid Build Coastguard Worker }
1066*3f1979aaSAndroid Build Coastguard Worker 
1067*3f1979aaSAndroid Build Coastguard Worker 
/* Release a setup obtained from FUNC_NEW_SETUP: frees the coefficient
   storage and then the struct itself. Passing a null pointer is allowed and
   does nothing, so the result of a failed FUNC_NEW_SETUP can be handed over
   unconditionally. */
void FUNC_DESTROY(SETUP_STRUCT *s) {
  if (!s) {
    return;
  }
  FUNC_ALIGNED_FREE(s->data);
  free(s);
}
1072*3f1979aaSAndroid Build Coastguard Worker 
1073*3f1979aaSAndroid Build Coastguard Worker #if ( SIMD_SZ == 4 )    /* !defined(PFFFT_SIMD_DISABLE) */
1074*3f1979aaSAndroid Build Coastguard Worker 
1075*3f1979aaSAndroid Build Coastguard Worker /* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
reversed_copy(int N,const v4sf * in,int in_stride,v4sf * out)1076*3f1979aaSAndroid Build Coastguard Worker static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
1077*3f1979aaSAndroid Build Coastguard Worker   v4sf g0, g1;
1078*3f1979aaSAndroid Build Coastguard Worker   int k;
1079*3f1979aaSAndroid Build Coastguard Worker   INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
1080*3f1979aaSAndroid Build Coastguard Worker 
1081*3f1979aaSAndroid Build Coastguard Worker   *--out = VSWAPHL(g0, g1); /* [g0l, g0h], [g1l g1h] -> [g1l, g0h] */
1082*3f1979aaSAndroid Build Coastguard Worker   for (k=1; k < N; ++k) {
1083*3f1979aaSAndroid Build Coastguard Worker     v4sf h0, h1;
1084*3f1979aaSAndroid Build Coastguard Worker     INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
1085*3f1979aaSAndroid Build Coastguard Worker     *--out = VSWAPHL(g1, h0);
1086*3f1979aaSAndroid Build Coastguard Worker     *--out = VSWAPHL(h0, h1);
1087*3f1979aaSAndroid Build Coastguard Worker     g1 = h1;
1088*3f1979aaSAndroid Build Coastguard Worker   }
1089*3f1979aaSAndroid Build Coastguard Worker   *--out = VSWAPHL(g1, g0);
1090*3f1979aaSAndroid Build Coastguard Worker }
1091*3f1979aaSAndroid Build Coastguard Worker 
/* Counterpart of reversed_copy: reads 2*N consecutive vectors from 'in' and
   writes N pairs (out[0], out[1]), stepping 'out' by out_stride vectors
   between pairs (the stride may be negative). */
static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
  v4sf head, carry, a, b;
  int blk;

  /* the first vector is needed twice: right away and for the last pair */
  head = in[0];
  carry = head;
  ++in;

  for (blk = 1; blk < N; ++blk) {
    a = in[0];
    b = in[1];
    in += 2;
    carry = VSWAPHL(carry, a);
    a = VSWAPHL(a, b);
    UNINTERLEAVE2(a, carry, out[0], out[1]);
    out += out_stride;
    carry = b;
  }

  /* last pair: one more input vector combined with the saved first one */
  a = in[0];
  b = head;
  carry = VSWAPHL(carry, a);
  a = VSWAPHL(a, b);
  UNINTERLEAVE2(a, carry, out[0], out[1]);
}
1108*3f1979aaSAndroid Build Coastguard Worker 
/* Reorder transform data between two layouts, out of place.

   setup     : plan giving N, Ncvec and the transform kind
   in, out   : source and destination buffers; they must be different
   direction : PFFFT_FORWARD applies the reordering built on INTERLEAVE2 /
               reversed_copy; any other value applies the inverse one built on
               UNINTERLEAVE2 / unreversed_copy

   Both buffers are accessed as arrays of v4sf.
   NOTE(review): this presumably requires the same alignment as the transform
   buffers -- confirm against the public header. */
void FUNC_ZREORDER(SETUP_STRUCT *setup, const float *in, float *out, pffft_direction_t direction) {
  const int N = setup->N;
  const int Ncvec = setup->Ncvec;
  const v4sf *vin = (const v4sf*)in;
  v4sf *vout = (v4sf*)out;
  int k;
  assert(in != out);

  if (setup->transform != PFFFT_REAL) {
    /* complex transform: pair k of one layout corresponds to pair kk of the other */
    const int quarter = Ncvec/4;
    for (k = 0; k < Ncvec; ++k) {
      const int kk = (k/4) + (k%4)*quarter;
      if (direction == PFFFT_FORWARD) {
        INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
      } else {
        UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
      }
    }
    return;
  }

  /* real transform: the data is handled in dk groups of 8 vectors */
  {
    const int dk = N/32;
    if (direction == PFFFT_FORWARD) {
      for (k = 0; k < dk; ++k) {
        INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*k + 0], vout[2*k + 1]);
        INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
      }
      /* vectors 2,3 and 6,7 of each group are stored backwards, ending at N/2 and at N */
      reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
      reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
    } else {
      for (k = 0; k < dk; ++k) {
        UNINTERLEAVE2(vin[2*k + 0], vin[2*k + 1], vout[k*8 + 0], vout[k*8 + 1]);
        UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
      }
      /* undo the two backward runs, writing the groups from the last one to the first */
      unreversed_copy(dk, (const v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
      unreversed_copy(dk, (const v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
    }
  }
}
1145*3f1979aaSAndroid Build Coastguard Worker 
/* Processes Ncvec/SIMD_SZ blocks of 8 vectors (4 real + 4 imaginary), out of
   place: each block is transposed, rows 1..3 are multiplied by the
   coefficients e[6*k .. 6*k+5], and a 4-point butterfly is applied.

   Ncvec : number of complex simd vectors
   in    : input, 8 vectors per block, stored as r0 i0 r1 i1 r2 i2 r3 i3
   out   : output, same layout; must be different from 'in'
   e     : coefficient table, 6 vectors (3 re/im pairs) per block */
void FUNC_CPLX_FINALIZE(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  const int nblocks = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
  int blk;
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
  assert(in != out);

  for (blk = 0; blk < nblocks; ++blk) {
    const v4sf *src = in + 8*blk;
    const v4sf *tw  = e + 6*blk;

    r0 = src[0]; i0 = src[1];
    r1 = src[2]; i1 = src[3];
    r2 = src[4]; i2 = src[5];
    r3 = src[6]; i3 = src[7];

    VTRANSPOSE4(r0,r1,r2,r3);
    VTRANSPOSE4(i0,i1,i2,i3);

    /* row 0 is left as is, rows 1..3 are rotated by their coefficient */
    VCPLXMUL(r1,i1,tw[0],tw[1]);
    VCPLXMUL(r2,i2,tw[2],tw[3]);
    VCPLXMUL(r3,i3,tw[4],tw[5]);

    /* sums and differences of the even rows (0,2) and of the odd rows (1,3) */
    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);

    /*
      transformation for each column is:

      [1   1   1   1   0   0   0   0]   [r0]
      [1   0  -1   0   0  -1   0   1]   [r1]
      [1  -1   1  -1   0   0   0   0]   [r2]
      [1   0  -1   0   0   1   0  -1]   [r3]
      [0   0   0   0   1   1   1   1] * [i0]
      [0   1   0  -1   1   0  -1   0]   [i1]
      [0   0   0   0   1  -1   1  -1]   [i2]
      [0  -1   0   1   1   0  -1   0]   [i3]
    */

    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
    r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
    r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);

    out[0] = r0; out[1] = i0; out[2] = r1; out[3] = i1;
    out[4] = r2; out[5] = i2; out[6] = r3; out[7] = i3;
    out += 8;
  }
}
1189*3f1979aaSAndroid Build Coastguard Worker 
/*
  Input stage used by FUNC_TRANSFORM_INTERNAL on the backward path of a
  complex transform, just before the cfftf1_ps pass.

  Ncvec : vector count; Ncvec/SIMD_SZ blocks are processed.
  in    : source, 8 vectors per block ordered r0 i0 r1 i1 r2 i2 r3 i3.
  out   : destination, 8 vectors per block; must not be the same pointer
          as in (asserted).
  e     : 6 coefficient vectors per block, consumed as three pairs.

  Per block: a two-layer 4-point butterfly, then rows 1..3 are combined with
  the e pairs through VCPLXMULCONJ, then the real and the imaginary 4x4
  matrices are transposed and stored.
*/
void FUNC_CPLX_PREPROCESS(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  const int nblocks = Ncvec/SIMD_SZ; /* one pass per 4x4 matrix block */
  int blk;
  assert(in != out);
  for (blk = 0; blk < nblocks; ++blk) {
    const v4sf *src  = in  + 8*blk;
    const v4sf *coef = e   + 6*blk;
    v4sf       *dst  = out + 8*blk;

    v4sf re0 = src[0], im0 = src[1];
    v4sf re1 = src[2], im1 = src[3];
    v4sf re2 = src[4], im2 = src[5];
    v4sf re3 = src[6], im3 = src[7];

    /* first layer: rows (0,2) and rows (1,3) */
    v4sf sum_re02 = VADD(re0, re2), dif_re02 = VSUB(re0, re2);
    v4sf sum_re13 = VADD(re1, re3), dif_re13 = VSUB(re1, re3);
    v4sf sum_im02 = VADD(im0, im2), dif_im02 = VSUB(im0, im2);
    v4sf sum_im13 = VADD(im1, im3), dif_im13 = VSUB(im1, im3);

    /* second layer: the odd rows mix real and imaginary differences */
    re0 = VADD(sum_re02, sum_re13);
    im0 = VADD(sum_im02, sum_im13);
    re1 = VSUB(dif_re02, dif_im13);
    im1 = VADD(dif_im02, dif_re13);
    re2 = VSUB(sum_re02, sum_re13);
    im2 = VSUB(sum_im02, sum_im13);
    re3 = VADD(dif_re02, dif_im13);
    im3 = VSUB(dif_im02, dif_re13);

    /* row 0 is left untouched, rows 1..3 each take one (e, e+1) pair */
    VCPLXMULCONJ(re1, im1, coef[0], coef[1]);
    VCPLXMULCONJ(re2, im2, coef[2], coef[3]);
    VCPLXMULCONJ(re3, im3, coef[4], coef[5]);

    VTRANSPOSE4(re0, re1, re2, re3);
    VTRANSPOSE4(im0, im1, im2, im3);

    dst[0] = re0; dst[1] = im0;
    dst[2] = re1; dst[3] = im1;
    dst[4] = re2; dst[5] = im2;
    dst[6] = re3; dst[7] = im3;
  }
}
1222*3f1979aaSAndroid Build Coastguard Worker 
1223*3f1979aaSAndroid Build Coastguard Worker 
FUNC_REAL_FINALIZE_4X4(const v4sf * in0,const v4sf * in1,const v4sf * in,const v4sf * e,v4sf * out)1224*3f1979aaSAndroid Build Coastguard Worker static ALWAYS_INLINE(void) FUNC_REAL_FINALIZE_4X4(const v4sf *in0, const v4sf *in1, const v4sf *in,
1225*3f1979aaSAndroid Build Coastguard Worker                             const v4sf *e, v4sf *out) {
1226*3f1979aaSAndroid Build Coastguard Worker   v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1227*3f1979aaSAndroid Build Coastguard Worker   v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1228*3f1979aaSAndroid Build Coastguard Worker   r0 = *in0; i0 = *in1;
1229*3f1979aaSAndroid Build Coastguard Worker   r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
1230*3f1979aaSAndroid Build Coastguard Worker   VTRANSPOSE4(r0,r1,r2,r3);
1231*3f1979aaSAndroid Build Coastguard Worker   VTRANSPOSE4(i0,i1,i2,i3);
1232*3f1979aaSAndroid Build Coastguard Worker 
1233*3f1979aaSAndroid Build Coastguard Worker   /*
1234*3f1979aaSAndroid Build Coastguard Worker     transformation for each column is:
1235*3f1979aaSAndroid Build Coastguard Worker 
1236*3f1979aaSAndroid Build Coastguard Worker     [1   1   1   1   0   0   0   0]   [r0]
1237*3f1979aaSAndroid Build Coastguard Worker     [1   0  -1   0   0  -1   0   1]   [r1]
1238*3f1979aaSAndroid Build Coastguard Worker     [1   0  -1   0   0   1   0  -1]   [r2]
1239*3f1979aaSAndroid Build Coastguard Worker     [1  -1   1  -1   0   0   0   0]   [r3]
1240*3f1979aaSAndroid Build Coastguard Worker     [0   0   0   0   1   1   1   1] * [i0]
1241*3f1979aaSAndroid Build Coastguard Worker     [0  -1   0   1  -1   0   1   0]   [i1]
1242*3f1979aaSAndroid Build Coastguard Worker     [0  -1   0   1   1   0  -1   0]   [i2]
1243*3f1979aaSAndroid Build Coastguard Worker     [0   0   0   0  -1   1  -1   1]   [i3]
1244*3f1979aaSAndroid Build Coastguard Worker   */
1245*3f1979aaSAndroid Build Coastguard Worker 
1246*3f1979aaSAndroid Build Coastguard Worker   /* cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; */
1247*3f1979aaSAndroid Build Coastguard Worker   /* cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; */
1248*3f1979aaSAndroid Build Coastguard Worker 
1249*3f1979aaSAndroid Build Coastguard Worker   VCPLXMUL(r1,i1,e[0],e[1]);
1250*3f1979aaSAndroid Build Coastguard Worker   VCPLXMUL(r2,i2,e[2],e[3]);
1251*3f1979aaSAndroid Build Coastguard Worker   VCPLXMUL(r3,i3,e[4],e[5]);
1252*3f1979aaSAndroid Build Coastguard Worker 
1253*3f1979aaSAndroid Build Coastguard Worker   /* cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; */
1254*3f1979aaSAndroid Build Coastguard Worker   /* cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; */
1255*3f1979aaSAndroid Build Coastguard Worker 
1256*3f1979aaSAndroid Build Coastguard Worker   sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
1257*3f1979aaSAndroid Build Coastguard Worker   sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
1258*3f1979aaSAndroid Build Coastguard Worker   si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
1259*3f1979aaSAndroid Build Coastguard Worker   si1 = VADD(i1,i3); di1 = VSUB(i3,i1);
1260*3f1979aaSAndroid Build Coastguard Worker 
1261*3f1979aaSAndroid Build Coastguard Worker   r0 = VADD(sr0, sr1);
1262*3f1979aaSAndroid Build Coastguard Worker   r3 = VSUB(sr0, sr1);
1263*3f1979aaSAndroid Build Coastguard Worker   i0 = VADD(si0, si1);
1264*3f1979aaSAndroid Build Coastguard Worker   i3 = VSUB(si1, si0);
1265*3f1979aaSAndroid Build Coastguard Worker   r1 = VADD(dr0, di1);
1266*3f1979aaSAndroid Build Coastguard Worker   r2 = VSUB(dr0, di1);
1267*3f1979aaSAndroid Build Coastguard Worker   i1 = VSUB(dr1, di0);
1268*3f1979aaSAndroid Build Coastguard Worker   i2 = VADD(dr1, di0);
1269*3f1979aaSAndroid Build Coastguard Worker 
1270*3f1979aaSAndroid Build Coastguard Worker   *out++ = r0;
1271*3f1979aaSAndroid Build Coastguard Worker   *out++ = i0;
1272*3f1979aaSAndroid Build Coastguard Worker   *out++ = r1;
1273*3f1979aaSAndroid Build Coastguard Worker   *out++ = i1;
1274*3f1979aaSAndroid Build Coastguard Worker   *out++ = r2;
1275*3f1979aaSAndroid Build Coastguard Worker   *out++ = i2;
1276*3f1979aaSAndroid Build Coastguard Worker   *out++ = r3;
1277*3f1979aaSAndroid Build Coastguard Worker   *out++ = i3;
1278*3f1979aaSAndroid Build Coastguard Worker 
1279*3f1979aaSAndroid Build Coastguard Worker }
1280*3f1979aaSAndroid Build Coastguard Worker 
FUNC_REAL_FINALIZE(int Ncvec,const v4sf * in,v4sf * out,const v4sf * e)1281*3f1979aaSAndroid Build Coastguard Worker static NEVER_INLINE(void) FUNC_REAL_FINALIZE(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1282*3f1979aaSAndroid Build Coastguard Worker   int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
1283*3f1979aaSAndroid Build Coastguard Worker   /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1284*3f1979aaSAndroid Build Coastguard Worker 
1285*3f1979aaSAndroid Build Coastguard Worker   v4sf_union cr, ci, *uout = (v4sf_union*)out;
1286*3f1979aaSAndroid Build Coastguard Worker   v4sf save = in[7], zero=VZERO();
1287*3f1979aaSAndroid Build Coastguard Worker   float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
1288*3f1979aaSAndroid Build Coastguard Worker   static const float s = (float)M_SQRT2/2;
1289*3f1979aaSAndroid Build Coastguard Worker 
1290*3f1979aaSAndroid Build Coastguard Worker   cr.v = in[0]; ci.v = in[Ncvec*2-1];
1291*3f1979aaSAndroid Build Coastguard Worker   assert(in != out);
1292*3f1979aaSAndroid Build Coastguard Worker   FUNC_REAL_FINALIZE_4X4(&zero, &zero, in+1, e, out);
1293*3f1979aaSAndroid Build Coastguard Worker 
1294*3f1979aaSAndroid Build Coastguard Worker   /*
1295*3f1979aaSAndroid Build Coastguard Worker     [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
1296*3f1979aaSAndroid Build Coastguard Worker 
1297*3f1979aaSAndroid Build Coastguard Worker     [Xr(1)]  ] [1   1   1   1   0   0   0   0]
1298*3f1979aaSAndroid Build Coastguard Worker     [Xr(N/4) ] [0   0   0   0   1   s   0  -s]
1299*3f1979aaSAndroid Build Coastguard Worker     [Xr(N/2) ] [1   0  -1   0   0   0   0   0]
1300*3f1979aaSAndroid Build Coastguard Worker     [Xr(3N/4)] [0   0   0   0   1  -s   0   s]
1301*3f1979aaSAndroid Build Coastguard Worker     [Xi(1)   ] [1  -1   1  -1   0   0   0   0]
1302*3f1979aaSAndroid Build Coastguard Worker     [Xi(N/4) ] [0   0   0   0   0  -s  -1  -s]
1303*3f1979aaSAndroid Build Coastguard Worker     [Xi(N/2) ] [0  -1   0   1   0   0   0   0]
1304*3f1979aaSAndroid Build Coastguard Worker     [Xi(3N/4)] [0   0   0   0   0  -s   1  -s]
1305*3f1979aaSAndroid Build Coastguard Worker   */
1306*3f1979aaSAndroid Build Coastguard Worker 
1307*3f1979aaSAndroid Build Coastguard Worker   xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
1308*3f1979aaSAndroid Build Coastguard Worker   xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
1309*3f1979aaSAndroid Build Coastguard Worker   xr2=(cr.f[0]-cr.f[2]);                     uout[4].f[0] = xr2;
1310*3f1979aaSAndroid Build Coastguard Worker   xi2=(cr.f[3]-cr.f[1]);                     uout[5].f[0] = xi2;
1311*3f1979aaSAndroid Build Coastguard Worker   xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]);        uout[2].f[0] = xr1;
1312*3f1979aaSAndroid Build Coastguard Worker   xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[3].f[0] = xi1;
1313*3f1979aaSAndroid Build Coastguard Worker   xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]);        uout[6].f[0] = xr3;
1314*3f1979aaSAndroid Build Coastguard Worker   xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[7].f[0] = xi3;
1315*3f1979aaSAndroid Build Coastguard Worker 
1316*3f1979aaSAndroid Build Coastguard Worker   for (k=1; k < dk; ++k) {
1317*3f1979aaSAndroid Build Coastguard Worker     v4sf save_next = in[8*k+7];
1318*3f1979aaSAndroid Build Coastguard Worker     FUNC_REAL_FINALIZE_4X4(&save, &in[8*k+0], in + 8*k+1,
1319*3f1979aaSAndroid Build Coastguard Worker                            e + k*6, out + k*8);
1320*3f1979aaSAndroid Build Coastguard Worker     save = save_next;
1321*3f1979aaSAndroid Build Coastguard Worker   }
1322*3f1979aaSAndroid Build Coastguard Worker 
1323*3f1979aaSAndroid Build Coastguard Worker }
1324*3f1979aaSAndroid Build Coastguard Worker 
FUNC_REAL_PREPROCESS_4X4(const v4sf * in,const v4sf * e,v4sf * out,int first)1325*3f1979aaSAndroid Build Coastguard Worker static ALWAYS_INLINE(void) FUNC_REAL_PREPROCESS_4X4(const v4sf *in,
1326*3f1979aaSAndroid Build Coastguard Worker                                              const v4sf *e, v4sf *out, int first) {
1327*3f1979aaSAndroid Build Coastguard Worker   v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
1328*3f1979aaSAndroid Build Coastguard Worker   /*
1329*3f1979aaSAndroid Build Coastguard Worker     transformation for each column is:
1330*3f1979aaSAndroid Build Coastguard Worker 
1331*3f1979aaSAndroid Build Coastguard Worker     [1   1   1   1   0   0   0   0]   [r0]
1332*3f1979aaSAndroid Build Coastguard Worker     [1   0   0  -1   0  -1  -1   0]   [r1]
1333*3f1979aaSAndroid Build Coastguard Worker     [1  -1  -1   1   0   0   0   0]   [r2]
1334*3f1979aaSAndroid Build Coastguard Worker     [1   0   0  -1   0   1   1   0]   [r3]
1335*3f1979aaSAndroid Build Coastguard Worker     [0   0   0   0   1  -1   1  -1] * [i0]
1336*3f1979aaSAndroid Build Coastguard Worker     [0  -1   1   0   1   0   0   1]   [i1]
1337*3f1979aaSAndroid Build Coastguard Worker     [0   0   0   0   1   1  -1  -1]   [i2]
1338*3f1979aaSAndroid Build Coastguard Worker     [0   1  -1   0   1   0   0   1]   [i3]
1339*3f1979aaSAndroid Build Coastguard Worker   */
1340*3f1979aaSAndroid Build Coastguard Worker 
1341*3f1979aaSAndroid Build Coastguard Worker   v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
1342*3f1979aaSAndroid Build Coastguard Worker   v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
1343*3f1979aaSAndroid Build Coastguard Worker   v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
1344*3f1979aaSAndroid Build Coastguard Worker   v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);
1345*3f1979aaSAndroid Build Coastguard Worker 
1346*3f1979aaSAndroid Build Coastguard Worker   r0 = VADD(sr0, sr1);
1347*3f1979aaSAndroid Build Coastguard Worker   r2 = VSUB(sr0, sr1);
1348*3f1979aaSAndroid Build Coastguard Worker   r1 = VSUB(dr0, si1);
1349*3f1979aaSAndroid Build Coastguard Worker   r3 = VADD(dr0, si1);
1350*3f1979aaSAndroid Build Coastguard Worker   i0 = VSUB(di0, di1);
1351*3f1979aaSAndroid Build Coastguard Worker   i2 = VADD(di0, di1);
1352*3f1979aaSAndroid Build Coastguard Worker   i1 = VSUB(si0, dr1);
1353*3f1979aaSAndroid Build Coastguard Worker   i3 = VADD(si0, dr1);
1354*3f1979aaSAndroid Build Coastguard Worker 
1355*3f1979aaSAndroid Build Coastguard Worker   VCPLXMULCONJ(r1,i1,e[0],e[1]);
1356*3f1979aaSAndroid Build Coastguard Worker   VCPLXMULCONJ(r2,i2,e[2],e[3]);
1357*3f1979aaSAndroid Build Coastguard Worker   VCPLXMULCONJ(r3,i3,e[4],e[5]);
1358*3f1979aaSAndroid Build Coastguard Worker 
1359*3f1979aaSAndroid Build Coastguard Worker   VTRANSPOSE4(r0,r1,r2,r3);
1360*3f1979aaSAndroid Build Coastguard Worker   VTRANSPOSE4(i0,i1,i2,i3);
1361*3f1979aaSAndroid Build Coastguard Worker 
1362*3f1979aaSAndroid Build Coastguard Worker   if (!first) {
1363*3f1979aaSAndroid Build Coastguard Worker     *out++ = r0;
1364*3f1979aaSAndroid Build Coastguard Worker     *out++ = i0;
1365*3f1979aaSAndroid Build Coastguard Worker   }
1366*3f1979aaSAndroid Build Coastguard Worker   *out++ = r1;
1367*3f1979aaSAndroid Build Coastguard Worker   *out++ = i1;
1368*3f1979aaSAndroid Build Coastguard Worker   *out++ = r2;
1369*3f1979aaSAndroid Build Coastguard Worker   *out++ = i2;
1370*3f1979aaSAndroid Build Coastguard Worker   *out++ = r3;
1371*3f1979aaSAndroid Build Coastguard Worker   *out++ = i3;
1372*3f1979aaSAndroid Build Coastguard Worker }
1373*3f1979aaSAndroid Build Coastguard Worker 
FUNC_REAL_PREPROCESS(int Ncvec,const v4sf * in,v4sf * out,const v4sf * e)1374*3f1979aaSAndroid Build Coastguard Worker static NEVER_INLINE(void) FUNC_REAL_PREPROCESS(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1375*3f1979aaSAndroid Build Coastguard Worker   int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
1376*3f1979aaSAndroid Build Coastguard Worker   /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1377*3f1979aaSAndroid Build Coastguard Worker 
1378*3f1979aaSAndroid Build Coastguard Worker   v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
1379*3f1979aaSAndroid Build Coastguard Worker   float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
1380*3f1979aaSAndroid Build Coastguard Worker   static const float s = (float)M_SQRT2;
1381*3f1979aaSAndroid Build Coastguard Worker   assert(in != out);
1382*3f1979aaSAndroid Build Coastguard Worker   for (k=0; k < 4; ++k) {
1383*3f1979aaSAndroid Build Coastguard Worker     Xr.f[k] = ((float*)in)[8*k];
1384*3f1979aaSAndroid Build Coastguard Worker     Xi.f[k] = ((float*)in)[8*k+4];
1385*3f1979aaSAndroid Build Coastguard Worker   }
1386*3f1979aaSAndroid Build Coastguard Worker 
1387*3f1979aaSAndroid Build Coastguard Worker   FUNC_REAL_PREPROCESS_4X4(in, e, out+1, 1); /* will write only 6 values */
1388*3f1979aaSAndroid Build Coastguard Worker 
1389*3f1979aaSAndroid Build Coastguard Worker   /*
1390*3f1979aaSAndroid Build Coastguard Worker     [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
1391*3f1979aaSAndroid Build Coastguard Worker 
1392*3f1979aaSAndroid Build Coastguard Worker     [cr0] [1   0   2   0   1   0   0   0]
1393*3f1979aaSAndroid Build Coastguard Worker     [cr1] [1   0   0   0  -1   0  -2   0]
1394*3f1979aaSAndroid Build Coastguard Worker     [cr2] [1   0  -2   0   1   0   0   0]
1395*3f1979aaSAndroid Build Coastguard Worker     [cr3] [1   0   0   0  -1   0   2   0]
1396*3f1979aaSAndroid Build Coastguard Worker     [ci0] [0   2   0   2   0   0   0   0]
1397*3f1979aaSAndroid Build Coastguard Worker     [ci1] [0   s   0  -s   0  -s   0  -s]
1398*3f1979aaSAndroid Build Coastguard Worker     [ci2] [0   0   0   0   0  -2   0   2]
1399*3f1979aaSAndroid Build Coastguard Worker     [ci3] [0  -s   0   s   0  -s   0  -s]
1400*3f1979aaSAndroid Build Coastguard Worker   */
1401*3f1979aaSAndroid Build Coastguard Worker   for (k=1; k < dk; ++k) {
1402*3f1979aaSAndroid Build Coastguard Worker     FUNC_REAL_PREPROCESS_4X4(in+8*k, e + k*6, out-1+k*8, 0);
1403*3f1979aaSAndroid Build Coastguard Worker   }
1404*3f1979aaSAndroid Build Coastguard Worker 
1405*3f1979aaSAndroid Build Coastguard Worker   cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
1406*3f1979aaSAndroid Build Coastguard Worker   cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
1407*3f1979aaSAndroid Build Coastguard Worker   cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
1408*3f1979aaSAndroid Build Coastguard Worker   cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
1409*3f1979aaSAndroid Build Coastguard Worker   ci0= 2*(Xr.f[1]+Xr.f[3]);                       uout[2*Ncvec-1].f[0] = ci0;
1410*3f1979aaSAndroid Build Coastguard Worker   ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
1411*3f1979aaSAndroid Build Coastguard Worker   ci2= 2*(Xi.f[3]-Xi.f[1]);                       uout[2*Ncvec-1].f[2] = ci2;
1412*3f1979aaSAndroid Build Coastguard Worker   ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
1413*3f1979aaSAndroid Build Coastguard Worker }
1414*3f1979aaSAndroid Build Coastguard Worker 
1415*3f1979aaSAndroid Build Coastguard Worker 
/*
  Runs one forward or backward pass described by `setup`, reading finput and
  leaving the result in foutput.

  setup     : supplies Ncvec, ifac[], twiddle, e and the transform kind.
  finput    : source samples; must satisfy VALIGNED (asserted).
  foutput   : destination; must satisfy VALIGNED (asserted). It may be the
              same pointer as finput -- the copy at the end covers that case.
  scratch   : second work buffer. When it is 0, a stack array of Ncvec*2
              v4sf is used instead.
  direction : PFFFT_FORWARD selects the forward branch, any other value the
              backward branch.
  ordered   : non-zero adds a FUNC_ZREORDER pass (after the forward
              computation, before the backward one).

  buff[0] is foutput and buff[1] is the scratch area; `ib` is the index used
  to pick between them at each step. On exit buff[ib] == foutput (asserted).
*/
FUNC_TRANSFORM_INTERNAL(SETUP_STRUCT * setup,const float * finput,float * foutput,v4sf * scratch,pffft_direction_t direction,int ordered)1416*3f1979aaSAndroid Build Coastguard Worker void FUNC_TRANSFORM_INTERNAL(SETUP_STRUCT *setup, const float *finput, float *foutput, v4sf *scratch,
1417*3f1979aaSAndroid Build Coastguard Worker                              pffft_direction_t direction, int ordered) {
1418*3f1979aaSAndroid Build Coastguard Worker   int k, Ncvec   = setup->Ncvec;
/* parity of ifac[1] (presumably the factor count of the decomposition -- confirm) */
1419*3f1979aaSAndroid Build Coastguard Worker   int nf_odd = (setup->ifac[1] & 1);
1420*3f1979aaSAndroid Build Coastguard Worker 
1421*3f1979aaSAndroid Build Coastguard Worker   /* temporary buffer is allocated on the stack if the scratch pointer is NULL */
/* a size of 1 is used when scratch is supplied, so the array is never declared with size 0 */
1422*3f1979aaSAndroid Build Coastguard Worker   int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
1423*3f1979aaSAndroid Build Coastguard Worker   VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
1424*3f1979aaSAndroid Build Coastguard Worker 
1425*3f1979aaSAndroid Build Coastguard Worker   const v4sf *vinput = (const v4sf*)finput;
1426*3f1979aaSAndroid Build Coastguard Worker   v4sf *voutput      = (v4sf*)foutput;
1427*3f1979aaSAndroid Build Coastguard Worker   v4sf *buff[2]      = { voutput, scratch ? scratch : scratch_on_stack };
/* starting buffer index: depends on the parity above and on `ordered` */
1428*3f1979aaSAndroid Build Coastguard Worker   int ib = (nf_odd ^ ordered ? 1 : 0);
1429*3f1979aaSAndroid Build Coastguard Worker 
1430*3f1979aaSAndroid Build Coastguard Worker   assert(VALIGNED(finput) && VALIGNED(foutput));
1431*3f1979aaSAndroid Build Coastguard Worker 
1432*3f1979aaSAndroid Build Coastguard Worker   /* assert(finput != foutput); */
1433*3f1979aaSAndroid Build Coastguard Worker   if (direction == PFFFT_FORWARD) {
1434*3f1979aaSAndroid Build Coastguard Worker     ib = !ib;
1435*3f1979aaSAndroid Build Coastguard Worker     if (setup->transform == PFFFT_REAL) {
/* rfftf1_ps receives both buffers; its return value is compared with buff[0]
   to get the index to continue from (presumably it returns whichever buffer
   holds its output -- confirm). FUNC_REAL_FINALIZE then reads buff[ib] and
   writes buff[!ib]. */
1436*3f1979aaSAndroid Build Coastguard Worker       ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
1437*3f1979aaSAndroid Build Coastguard Worker                       setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1438*3f1979aaSAndroid Build Coastguard Worker       FUNC_REAL_FINALIZE(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
1439*3f1979aaSAndroid Build Coastguard Worker     } else {
/* the input is first passed through UNINTERLEAVE2 into buff[ib] (presumably
   splitting interleaved re/im samples into separate vectors -- confirm),
   then transformed with a last argument of -1 */
1440*3f1979aaSAndroid Build Coastguard Worker       v4sf *tmp = buff[ib];
1441*3f1979aaSAndroid Build Coastguard Worker       for (k=0; k < Ncvec; ++k) {
1442*3f1979aaSAndroid Build Coastguard Worker         UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
1443*3f1979aaSAndroid Build Coastguard Worker       }
1444*3f1979aaSAndroid Build Coastguard Worker       ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
1445*3f1979aaSAndroid Build Coastguard Worker                       setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
1446*3f1979aaSAndroid Build Coastguard Worker       FUNC_CPLX_FINALIZE(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
1447*3f1979aaSAndroid Build Coastguard Worker     }
/* the finalize step wrote buff[!ib]: either hand that buffer to FUNC_ZREORDER
   with buff[ib] as the other buffer (presumably source then destination --
   confirm), or flip ib so that buff[ib] designates it */
1448*3f1979aaSAndroid Build Coastguard Worker     if (ordered) {
1449*3f1979aaSAndroid Build Coastguard Worker       FUNC_ZREORDER(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
1450*3f1979aaSAndroid Build Coastguard Worker     } else ib = !ib;
1451*3f1979aaSAndroid Build Coastguard Worker   } else {
/* backward branch: the first step must not target the buffer that is the input */
1452*3f1979aaSAndroid Build Coastguard Worker     if (vinput == buff[ib]) {
1453*3f1979aaSAndroid Build Coastguard Worker       ib = !ib; /* may happen when finput == foutput */
1454*3f1979aaSAndroid Build Coastguard Worker     }
/* when ordered, FUNC_ZREORDER is run first and buff[ib] becomes the input of
   the following steps */
1455*3f1979aaSAndroid Build Coastguard Worker     if (ordered) {
1456*3f1979aaSAndroid Build Coastguard Worker       FUNC_ZREORDER(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
1457*3f1979aaSAndroid Build Coastguard Worker       vinput = buff[ib]; ib = !ib;
1458*3f1979aaSAndroid Build Coastguard Worker     }
1459*3f1979aaSAndroid Build Coastguard Worker     if (setup->transform == PFFFT_REAL) {
1460*3f1979aaSAndroid Build Coastguard Worker       FUNC_REAL_PREPROCESS(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
1461*3f1979aaSAndroid Build Coastguard Worker       ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
1462*3f1979aaSAndroid Build Coastguard Worker                       setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1463*3f1979aaSAndroid Build Coastguard Worker     } else {
/* same cfftf1_ps routine as the forward branch, with +1 as last argument;
   the result is then passed through INTERLEAVE2 in place */
1464*3f1979aaSAndroid Build Coastguard Worker       FUNC_CPLX_PREPROCESS(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
1465*3f1979aaSAndroid Build Coastguard Worker       ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
1466*3f1979aaSAndroid Build Coastguard Worker                       setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
1467*3f1979aaSAndroid Build Coastguard Worker       for (k=0; k < Ncvec; ++k) {
1468*3f1979aaSAndroid Build Coastguard Worker         INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
1469*3f1979aaSAndroid Build Coastguard Worker       }
1470*3f1979aaSAndroid Build Coastguard Worker     }
1471*3f1979aaSAndroid Build Coastguard Worker   }
1472*3f1979aaSAndroid Build Coastguard Worker 
/* at this point buff[ib] holds the result; copy it to foutput if it ended up
   in the scratch buffer */
1473*3f1979aaSAndroid Build Coastguard Worker   if (buff[ib] != voutput) {
1474*3f1979aaSAndroid Build Coastguard Worker     /* extra copy required -- this situation should only happen when finput == foutput */
1475*3f1979aaSAndroid Build Coastguard Worker     assert(finput==foutput);
1476*3f1979aaSAndroid Build Coastguard Worker     for (k=0; k < Ncvec; ++k) {
1477*3f1979aaSAndroid Build Coastguard Worker       v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
1478*3f1979aaSAndroid Build Coastguard Worker       voutput[2*k] = a; voutput[2*k+1] = b;
1479*3f1979aaSAndroid Build Coastguard Worker     }
1480*3f1979aaSAndroid Build Coastguard Worker     ib = !ib;
1481*3f1979aaSAndroid Build Coastguard Worker   }
1482*3f1979aaSAndroid Build Coastguard Worker   assert(buff[ib] == voutput);
1483*3f1979aaSAndroid Build Coastguard Worker }
1484*3f1979aaSAndroid Build Coastguard Worker 
/*
  Multiply-accumulate on v4sf data: for every (re, im) vector pair,
  ab += (a * b) * scaling, where a * b is what VCPLXMUL leaves in its first
  two arguments (a complex product, going by the macro name).

  s       : setup; supplies Ncvec and the transform kind.
  a, b    : the two operands; must satisfy VALIGNED (asserted).
  ab      : accumulator, read and written; must satisfy VALIGNED (asserted).
  scaling : factor applied to each product before it is added to ab.

  The C loop handles two (re, im) pairs per iteration (i += 2) and therefore
  relies on Ncvec being even.

  When s->transform == PFFFT_REAL, lane 0 of ab[0] and of ab[1] is rewritten
  after the loop as two independent real products, old + ar*br*scaling and
  old + ai*bi*scaling, from scalars captured before the loop (presumably these
  two slots hold purely real bins in the real-transform layout -- confirm).

  NOTE(review): the __arm__ section defines ZCONVOLVE_USING_INLINE_NEON_ASM,
  but the code below tests ZCONVOLVE_USING_INLINE_ASM. Unless the latter is
  defined somewhere outside this function, the inline-asm branch is never
  compiled and the C loop is always used. The macro defined here is not
  #undef'd within this function.
*/
FUNC_ZCONVOLVE_ACCUMULATE(SETUP_STRUCT * s,const float * a,const float * b,float * ab,float scaling)1485*3f1979aaSAndroid Build Coastguard Worker void FUNC_ZCONVOLVE_ACCUMULATE(SETUP_STRUCT *s, const float *a, const float *b, float *ab, float scaling) {
1486*3f1979aaSAndroid Build Coastguard Worker   int Ncvec = s->Ncvec;
1487*3f1979aaSAndroid Build Coastguard Worker   const v4sf * RESTRICT va = (const v4sf*)a;
1488*3f1979aaSAndroid Build Coastguard Worker   const v4sf * RESTRICT vb = (const v4sf*)b;
1489*3f1979aaSAndroid Build Coastguard Worker   v4sf * RESTRICT vab = (v4sf*)ab;
1490*3f1979aaSAndroid Build Coastguard Worker 
/* on ARM, prefetch the first vectors of all three arrays */
1491*3f1979aaSAndroid Build Coastguard Worker #ifdef __arm__
1492*3f1979aaSAndroid Build Coastguard Worker   __builtin_prefetch(va);
1493*3f1979aaSAndroid Build Coastguard Worker   __builtin_prefetch(vb);
1494*3f1979aaSAndroid Build Coastguard Worker   __builtin_prefetch(vab);
1495*3f1979aaSAndroid Build Coastguard Worker   __builtin_prefetch(va+2);
1496*3f1979aaSAndroid Build Coastguard Worker   __builtin_prefetch(vb+2);
1497*3f1979aaSAndroid Build Coastguard Worker   __builtin_prefetch(vab+2);
1498*3f1979aaSAndroid Build Coastguard Worker   __builtin_prefetch(va+4);
1499*3f1979aaSAndroid Build Coastguard Worker   __builtin_prefetch(vb+4);
1500*3f1979aaSAndroid Build Coastguard Worker   __builtin_prefetch(vab+4);
1501*3f1979aaSAndroid Build Coastguard Worker   __builtin_prefetch(va+6);
1502*3f1979aaSAndroid Build Coastguard Worker   __builtin_prefetch(vb+6);
1503*3f1979aaSAndroid Build Coastguard Worker   __builtin_prefetch(vab+6);
1504*3f1979aaSAndroid Build Coastguard Worker # ifndef __clang__
1505*3f1979aaSAndroid Build Coastguard Worker #   define ZCONVOLVE_USING_INLINE_NEON_ASM
1506*3f1979aaSAndroid Build Coastguard Worker # endif
1507*3f1979aaSAndroid Build Coastguard Worker #endif
1508*3f1979aaSAndroid Build Coastguard Worker 
/* scalar copies of lane 0 of the first two vectors of a, b and ab; only used
   by the PFFFT_REAL fix-up at the end */
1509*3f1979aaSAndroid Build Coastguard Worker   float ar, ai, br, bi, abr, abi;
1510*3f1979aaSAndroid Build Coastguard Worker #ifndef ZCONVOLVE_USING_INLINE_ASM
1511*3f1979aaSAndroid Build Coastguard Worker   v4sf vscal = LD_PS1(scaling);
1512*3f1979aaSAndroid Build Coastguard Worker   int i;
1513*3f1979aaSAndroid Build Coastguard Worker #endif
1514*3f1979aaSAndroid Build Coastguard Worker 
1515*3f1979aaSAndroid Build Coastguard Worker   assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
/* captured before the main loop overwrites ab[0] and ab[1] */
1516*3f1979aaSAndroid Build Coastguard Worker   ar = ((v4sf_union*)va)[0].f[0];
1517*3f1979aaSAndroid Build Coastguard Worker   ai = ((v4sf_union*)va)[1].f[0];
1518*3f1979aaSAndroid Build Coastguard Worker   br = ((v4sf_union*)vb)[0].f[0];
1519*3f1979aaSAndroid Build Coastguard Worker   bi = ((v4sf_union*)vb)[1].f[0];
1520*3f1979aaSAndroid Build Coastguard Worker   abr = ((v4sf_union*)vab)[0].f[0];
1521*3f1979aaSAndroid Build Coastguard Worker   abi = ((v4sf_union*)vab)[1].f[0];
1522*3f1979aaSAndroid Build Coastguard Worker 
1523*3f1979aaSAndroid Build Coastguard Worker #ifdef ZCONVOLVE_USING_INLINE_ASM
1524*3f1979aaSAndroid Build Coastguard Worker   /* inline asm version, unfortunately miscompiled by clang 3.2,
1525*3f1979aaSAndroid Build Coastguard Worker    * at least on ubuntu.. so this will be restricted to gcc */
/* asm operands: %0 = a_, %1 = b_, %2 = ab_ (used for the stores), %3 = N
   (decremented by 2 per iteration), %4 = scaling (broadcast into q15).
   r8 is a second cursor over ab, used for the loads. */
1526*3f1979aaSAndroid Build Coastguard Worker   const float *a_ = a, *b_ = b; float *ab_ = ab;
1527*3f1979aaSAndroid Build Coastguard Worker   int N = Ncvec;
1528*3f1979aaSAndroid Build Coastguard Worker   asm volatile("mov         r8, %2                  \n"
1529*3f1979aaSAndroid Build Coastguard Worker                "vdup.f32    q15, %4                 \n"
1530*3f1979aaSAndroid Build Coastguard Worker                "1:                                  \n"
1531*3f1979aaSAndroid Build Coastguard Worker                "pld         [%0,#64]                \n"
1532*3f1979aaSAndroid Build Coastguard Worker                "pld         [%1,#64]                \n"
1533*3f1979aaSAndroid Build Coastguard Worker                "pld         [%2,#64]                \n"
1534*3f1979aaSAndroid Build Coastguard Worker                "pld         [%0,#96]                \n"
1535*3f1979aaSAndroid Build Coastguard Worker                "pld         [%1,#96]                \n"
1536*3f1979aaSAndroid Build Coastguard Worker                "pld         [%2,#96]                \n"
1537*3f1979aaSAndroid Build Coastguard Worker                "vld1.f32    {q0,q1},   [%0,:128]!         \n"
1538*3f1979aaSAndroid Build Coastguard Worker                "vld1.f32    {q4,q5},   [%1,:128]!         \n"
1539*3f1979aaSAndroid Build Coastguard Worker                "vld1.f32    {q2,q3},   [%0,:128]!         \n"
1540*3f1979aaSAndroid Build Coastguard Worker                "vld1.f32    {q6,q7},   [%1,:128]!         \n"
1541*3f1979aaSAndroid Build Coastguard Worker                "vld1.f32    {q8,q9},   [r8,:128]!          \n"
1542*3f1979aaSAndroid Build Coastguard Worker 
1543*3f1979aaSAndroid Build Coastguard Worker                "vmul.f32    q10, q0, q4             \n"
1544*3f1979aaSAndroid Build Coastguard Worker                "vmul.f32    q11, q0, q5             \n"
1545*3f1979aaSAndroid Build Coastguard Worker                "vmul.f32    q12, q2, q6             \n"
1546*3f1979aaSAndroid Build Coastguard Worker                "vmul.f32    q13, q2, q7             \n"
1547*3f1979aaSAndroid Build Coastguard Worker                "vmls.f32    q10, q1, q5             \n"
1548*3f1979aaSAndroid Build Coastguard Worker                "vmla.f32    q11, q1, q4             \n"
1549*3f1979aaSAndroid Build Coastguard Worker                "vld1.f32    {q0,q1}, [r8,:128]!     \n"
1550*3f1979aaSAndroid Build Coastguard Worker                "vmls.f32    q12, q3, q7             \n"
1551*3f1979aaSAndroid Build Coastguard Worker                "vmla.f32    q13, q3, q6             \n"
1552*3f1979aaSAndroid Build Coastguard Worker                "vmla.f32    q8, q10, q15            \n"
1553*3f1979aaSAndroid Build Coastguard Worker                "vmla.f32    q9, q11, q15            \n"
1554*3f1979aaSAndroid Build Coastguard Worker                "vmla.f32    q0, q12, q15            \n"
1555*3f1979aaSAndroid Build Coastguard Worker                "vmla.f32    q1, q13, q15            \n"
1556*3f1979aaSAndroid Build Coastguard Worker                "vst1.f32    {q8,q9},[%2,:128]!    \n"
1557*3f1979aaSAndroid Build Coastguard Worker                "vst1.f32    {q0,q1},[%2,:128]!    \n"
1558*3f1979aaSAndroid Build Coastguard Worker                "subs        %3, #2                  \n"
1559*3f1979aaSAndroid Build Coastguard Worker                "bne         1b                      \n"
1560*3f1979aaSAndroid Build Coastguard Worker                : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
1561*3f1979aaSAndroid Build Coastguard Worker #else
1562*3f1979aaSAndroid Build Coastguard Worker   /* default routine, works fine for non-arm cpus with current compilers */
1563*3f1979aaSAndroid Build Coastguard Worker   for (i=0; i < Ncvec; i += 2) {
/* these v4sf locals shadow the scalar ar/ai/br/bi of the enclosing scope,
   which keep their pre-loop values for the fix-up below */
1564*3f1979aaSAndroid Build Coastguard Worker     v4sf ar, ai, br, bi;
1565*3f1979aaSAndroid Build Coastguard Worker     ar = va[2*i+0]; ai = va[2*i+1];
1566*3f1979aaSAndroid Build Coastguard Worker     br = vb[2*i+0]; bi = vb[2*i+1];
1567*3f1979aaSAndroid Build Coastguard Worker     VCPLXMUL(ar, ai, br, bi);
1568*3f1979aaSAndroid Build Coastguard Worker     vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
1569*3f1979aaSAndroid Build Coastguard Worker     vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
1570*3f1979aaSAndroid Build Coastguard Worker     ar = va[2*i+2]; ai = va[2*i+3];
1571*3f1979aaSAndroid Build Coastguard Worker     br = vb[2*i+2]; bi = vb[2*i+3];
1572*3f1979aaSAndroid Build Coastguard Worker     VCPLXMUL(ar, ai, br, bi);
1573*3f1979aaSAndroid Build Coastguard Worker     vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
1574*3f1979aaSAndroid Build Coastguard Worker     vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
1575*3f1979aaSAndroid Build Coastguard Worker   }
1576*3f1979aaSAndroid Build Coastguard Worker #endif
/* real transforms: replace lane 0 of ab[0] and ab[1] with the two
   independent real products computed from the saved scalars */
1577*3f1979aaSAndroid Build Coastguard Worker   if (s->transform == PFFFT_REAL) {
1578*3f1979aaSAndroid Build Coastguard Worker     ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
1579*3f1979aaSAndroid Build Coastguard Worker     ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
1580*3f1979aaSAndroid Build Coastguard Worker   }
1581*3f1979aaSAndroid Build Coastguard Worker }
1582*3f1979aaSAndroid Build Coastguard Worker 
/*
 * SIMD variant: ab = (a * b) * scaling, without accumulating into ab.
 *
 * a, b and ab are read/written as arrays of v4sf and must satisfy VALIGNED().
 * Vectors are consumed as (real, imag) pairs and multiplied with VCPLXMUL.
 * For PFFFT_REAL setups, lane 0 of the first two vectors is not treated as a
 * complex pair: the scalars are saved up front and their plain products are
 * written back after the vector loop (presumably these lanes hold the purely
 * real edge bins of the packed real spectrum -- confirm against the
 * transform's output layout).
 */
void FUNC_ZCONVOLVE_NO_ACCU(SETUP_STRUCT *s, const float *a, const float *b, float *ab, float scaling) {
  v4sf scale_v = LD_PS1(scaling);
  const v4sf * RESTRICT a_v = (const v4sf*)a;
  const v4sf * RESTRICT b_v = (const v4sf*)b;
  v4sf * RESTRICT ab_v = (v4sf*)ab;
  float a_re0, a_im0, b_re0, b_im0;
  const int n_vectors = 2*s->Ncvec;  /* number of v4sf vectors per spectrum */
  int base;

#ifdef __arm__
  {
    /* warm the cache for the first eight vectors of every buffer */
    int ahead;
    for (ahead = 0; ahead < 8; ahead += 2) {
      __builtin_prefetch(a_v + ahead);
      __builtin_prefetch(b_v + ahead);
      __builtin_prefetch(ab_v + ahead);
    }
  }
# ifndef __clang__
#   define ZCONVOLVE_USING_INLINE_NEON_ASM
# endif
#endif

  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));

  /* remember lane 0 of the first two input vectors before ab gets written */
  a_re0 = ((v4sf_union*)a_v)[0].f[0];
  a_im0 = ((v4sf_union*)a_v)[1].f[0];
  b_re0 = ((v4sf_union*)b_v)[0].f[0];
  b_im0 = ((v4sf_union*)b_v)[1].f[0];

  /* portable vector loop: two complex vector products per outer iteration */
  for (base = 0; base < n_vectors; base += 4) {
    int pair;
    for (pair = base; pair < base + 4; pair += 2) {
      v4sf re = a_v[pair];
      v4sf im = a_v[pair+1];
      v4sf b_re = b_v[pair];
      v4sf b_im = b_v[pair+1];
      VCPLXMUL(re, im, b_re, b_im);  /* (re, im) *= (b_re, b_im) */
      ab_v[pair]   = VMUL(re, scale_v);
      ab_v[pair+1] = VMUL(im, scale_v);
    }
  }

  if (s->transform == PFFFT_REAL) {
    /* overwrite lane 0 of the first two vectors with the plain scalar products */
    ((v4sf_union*)ab_v)[0].f[0] = a_re0*b_re0*scaling;
    ((v4sf_union*)ab_v)[1].f[0] = a_im0*b_im0*scaling;
  }
}
1636*3f1979aaSAndroid Build Coastguard Worker 
1637*3f1979aaSAndroid Build Coastguard Worker 
1638*3f1979aaSAndroid Build Coastguard Worker #else  /* #if ( SIMD_SZ == 4 )   * !defined(PFFFT_SIMD_DISABLE) */
1639*3f1979aaSAndroid Build Coastguard Worker 
1640*3f1979aaSAndroid Build Coastguard Worker /* standard routine using scalar floats, without SIMD stuff. */
1641*3f1979aaSAndroid Build Coastguard Worker 
1642*3f1979aaSAndroid Build Coastguard Worker #define pffft_zreorder_nosimd FUNC_ZREORDER
pffft_zreorder_nosimd(SETUP_STRUCT * setup,const float * in,float * out,pffft_direction_t direction)1643*3f1979aaSAndroid Build Coastguard Worker void pffft_zreorder_nosimd(SETUP_STRUCT *setup, const float *in, float *out, pffft_direction_t direction) {
1644*3f1979aaSAndroid Build Coastguard Worker   int k, N = setup->N;
1645*3f1979aaSAndroid Build Coastguard Worker   if (setup->transform == PFFFT_COMPLEX) {
1646*3f1979aaSAndroid Build Coastguard Worker     for (k=0; k < 2*N; ++k) out[k] = in[k];
1647*3f1979aaSAndroid Build Coastguard Worker     return;
1648*3f1979aaSAndroid Build Coastguard Worker   }
1649*3f1979aaSAndroid Build Coastguard Worker   else if (direction == PFFFT_FORWARD) {
1650*3f1979aaSAndroid Build Coastguard Worker     float x_N = in[N-1];
1651*3f1979aaSAndroid Build Coastguard Worker     for (k=N-1; k > 1; --k) out[k] = in[k-1];
1652*3f1979aaSAndroid Build Coastguard Worker     out[0] = in[0];
1653*3f1979aaSAndroid Build Coastguard Worker     out[1] = x_N;
1654*3f1979aaSAndroid Build Coastguard Worker   } else {
1655*3f1979aaSAndroid Build Coastguard Worker     float x_N = in[1];
1656*3f1979aaSAndroid Build Coastguard Worker     for (k=1; k < N-1; ++k) out[k] = in[k+1];
1657*3f1979aaSAndroid Build Coastguard Worker     out[0] = in[0];
1658*3f1979aaSAndroid Build Coastguard Worker     out[N-1] = x_N;
1659*3f1979aaSAndroid Build Coastguard Worker   }
1660*3f1979aaSAndroid Build Coastguard Worker }
1661*3f1979aaSAndroid Build Coastguard Worker 
1662*3f1979aaSAndroid Build Coastguard Worker #define pffft_transform_internal_nosimd FUNC_TRANSFORM_INTERNAL
pffft_transform_internal_nosimd(SETUP_STRUCT * setup,const float * input,float * output,float * scratch,pffft_direction_t direction,int ordered)1663*3f1979aaSAndroid Build Coastguard Worker void pffft_transform_internal_nosimd(SETUP_STRUCT *setup, const float *input, float *output, float *scratch,
1664*3f1979aaSAndroid Build Coastguard Worker                                     pffft_direction_t direction, int ordered) {
1665*3f1979aaSAndroid Build Coastguard Worker   int Ncvec   = setup->Ncvec;
1666*3f1979aaSAndroid Build Coastguard Worker   int nf_odd = (setup->ifac[1] & 1);
1667*3f1979aaSAndroid Build Coastguard Worker 
1668*3f1979aaSAndroid Build Coastguard Worker   /* temporary buffer is allocated on the stack if the scratch pointer is NULL */
1669*3f1979aaSAndroid Build Coastguard Worker   int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
1670*3f1979aaSAndroid Build Coastguard Worker   VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
1671*3f1979aaSAndroid Build Coastguard Worker   float *buff[2];
1672*3f1979aaSAndroid Build Coastguard Worker   int ib;
1673*3f1979aaSAndroid Build Coastguard Worker   if (scratch == 0) scratch = scratch_on_stack;
1674*3f1979aaSAndroid Build Coastguard Worker   buff[0] = output; buff[1] = scratch;
1675*3f1979aaSAndroid Build Coastguard Worker 
1676*3f1979aaSAndroid Build Coastguard Worker   if (setup->transform == PFFFT_COMPLEX) ordered = 0; /* it is always ordered. */
1677*3f1979aaSAndroid Build Coastguard Worker   ib = (nf_odd ^ ordered ? 1 : 0);
1678*3f1979aaSAndroid Build Coastguard Worker 
1679*3f1979aaSAndroid Build Coastguard Worker   if (direction == PFFFT_FORWARD) {
1680*3f1979aaSAndroid Build Coastguard Worker     if (setup->transform == PFFFT_REAL) {
1681*3f1979aaSAndroid Build Coastguard Worker       ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
1682*3f1979aaSAndroid Build Coastguard Worker                       setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1683*3f1979aaSAndroid Build Coastguard Worker     } else {
1684*3f1979aaSAndroid Build Coastguard Worker       ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
1685*3f1979aaSAndroid Build Coastguard Worker                       setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
1686*3f1979aaSAndroid Build Coastguard Worker     }
1687*3f1979aaSAndroid Build Coastguard Worker     if (ordered) {
1688*3f1979aaSAndroid Build Coastguard Worker       FUNC_ZREORDER(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
1689*3f1979aaSAndroid Build Coastguard Worker     }
1690*3f1979aaSAndroid Build Coastguard Worker   } else {
1691*3f1979aaSAndroid Build Coastguard Worker     if (input == buff[ib]) {
1692*3f1979aaSAndroid Build Coastguard Worker       ib = !ib; /* may happen when finput == foutput */
1693*3f1979aaSAndroid Build Coastguard Worker     }
1694*3f1979aaSAndroid Build Coastguard Worker     if (ordered) {
1695*3f1979aaSAndroid Build Coastguard Worker       FUNC_ZREORDER(setup, input, buff[!ib], PFFFT_BACKWARD);
1696*3f1979aaSAndroid Build Coastguard Worker       input = buff[!ib];
1697*3f1979aaSAndroid Build Coastguard Worker     }
1698*3f1979aaSAndroid Build Coastguard Worker     if (setup->transform == PFFFT_REAL) {
1699*3f1979aaSAndroid Build Coastguard Worker       ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
1700*3f1979aaSAndroid Build Coastguard Worker                       setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
1701*3f1979aaSAndroid Build Coastguard Worker     } else {
1702*3f1979aaSAndroid Build Coastguard Worker       ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
1703*3f1979aaSAndroid Build Coastguard Worker                       setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
1704*3f1979aaSAndroid Build Coastguard Worker     }
1705*3f1979aaSAndroid Build Coastguard Worker   }
1706*3f1979aaSAndroid Build Coastguard Worker   if (buff[ib] != output) {
1707*3f1979aaSAndroid Build Coastguard Worker     int k;
1708*3f1979aaSAndroid Build Coastguard Worker     /* extra copy required -- this situation should happens only when finput == foutput */
1709*3f1979aaSAndroid Build Coastguard Worker     assert(input==output);
1710*3f1979aaSAndroid Build Coastguard Worker     for (k=0; k < Ncvec; ++k) {
1711*3f1979aaSAndroid Build Coastguard Worker       float a = buff[ib][2*k], b = buff[ib][2*k+1];
1712*3f1979aaSAndroid Build Coastguard Worker       output[2*k] = a; output[2*k+1] = b;
1713*3f1979aaSAndroid Build Coastguard Worker     }
1714*3f1979aaSAndroid Build Coastguard Worker     ib = !ib;
1715*3f1979aaSAndroid Build Coastguard Worker   }
1716*3f1979aaSAndroid Build Coastguard Worker   assert(buff[ib] == output);
1717*3f1979aaSAndroid Build Coastguard Worker }
1718*3f1979aaSAndroid Build Coastguard Worker 
1719*3f1979aaSAndroid Build Coastguard Worker #define pffft_zconvolve_accumulate_nosimd FUNC_ZCONVOLVE_ACCUMULATE
pffft_zconvolve_accumulate_nosimd(SETUP_STRUCT * s,const float * a,const float * b,float * ab,float scaling)1720*3f1979aaSAndroid Build Coastguard Worker void pffft_zconvolve_accumulate_nosimd(SETUP_STRUCT *s, const float *a, const float *b,
1721*3f1979aaSAndroid Build Coastguard Worker                                        float *ab, float scaling) {
1722*3f1979aaSAndroid Build Coastguard Worker   int NcvecMulTwo = 2*s->Ncvec;  /* int Ncvec = s->Ncvec; */
1723*3f1979aaSAndroid Build Coastguard Worker   int k; /* was i -- but always used "2*i" - except at for() */
1724*3f1979aaSAndroid Build Coastguard Worker 
1725*3f1979aaSAndroid Build Coastguard Worker   if (s->transform == PFFFT_REAL) {
1726*3f1979aaSAndroid Build Coastguard Worker     /* take care of the fftpack ordering */
1727*3f1979aaSAndroid Build Coastguard Worker     ab[0] += a[0]*b[0]*scaling;
1728*3f1979aaSAndroid Build Coastguard Worker     ab[NcvecMulTwo-1] += a[NcvecMulTwo-1]*b[NcvecMulTwo-1]*scaling;
1729*3f1979aaSAndroid Build Coastguard Worker     ++ab; ++a; ++b; NcvecMulTwo -= 2;
1730*3f1979aaSAndroid Build Coastguard Worker   }
1731*3f1979aaSAndroid Build Coastguard Worker   for (k=0; k < NcvecMulTwo; k += 2) {
1732*3f1979aaSAndroid Build Coastguard Worker     float ar, ai, br, bi;
1733*3f1979aaSAndroid Build Coastguard Worker     ar = a[k+0]; ai = a[k+1];
1734*3f1979aaSAndroid Build Coastguard Worker     br = b[k+0]; bi = b[k+1];
1735*3f1979aaSAndroid Build Coastguard Worker     VCPLXMUL(ar, ai, br, bi);
1736*3f1979aaSAndroid Build Coastguard Worker     ab[k+0] += ar*scaling;
1737*3f1979aaSAndroid Build Coastguard Worker     ab[k+1] += ai*scaling;
1738*3f1979aaSAndroid Build Coastguard Worker   }
1739*3f1979aaSAndroid Build Coastguard Worker }
1740*3f1979aaSAndroid Build Coastguard Worker 
1741*3f1979aaSAndroid Build Coastguard Worker #define pffft_zconvolve_no_accu_nosimd FUNC_ZCONVOLVE_NO_ACCU
pffft_zconvolve_no_accu_nosimd(SETUP_STRUCT * s,const float * a,const float * b,float * ab,float scaling)1742*3f1979aaSAndroid Build Coastguard Worker void pffft_zconvolve_no_accu_nosimd(SETUP_STRUCT *s, const float *a, const float *b,
1743*3f1979aaSAndroid Build Coastguard Worker                                     float *ab, float scaling) {
1744*3f1979aaSAndroid Build Coastguard Worker   int NcvecMulTwo = 2*s->Ncvec;  /* int Ncvec = s->Ncvec; */
1745*3f1979aaSAndroid Build Coastguard Worker   int k; /* was i -- but always used "2*i" - except at for() */
1746*3f1979aaSAndroid Build Coastguard Worker 
1747*3f1979aaSAndroid Build Coastguard Worker   if (s->transform == PFFFT_REAL) {
1748*3f1979aaSAndroid Build Coastguard Worker     /* take care of the fftpack ordering */
1749*3f1979aaSAndroid Build Coastguard Worker     ab[0] += a[0]*b[0]*scaling;
1750*3f1979aaSAndroid Build Coastguard Worker     ab[NcvecMulTwo-1] += a[NcvecMulTwo-1]*b[NcvecMulTwo-1]*scaling;
1751*3f1979aaSAndroid Build Coastguard Worker     ++ab; ++a; ++b; NcvecMulTwo -= 2;
1752*3f1979aaSAndroid Build Coastguard Worker   }
1753*3f1979aaSAndroid Build Coastguard Worker   for (k=0; k < NcvecMulTwo; k += 2) {
1754*3f1979aaSAndroid Build Coastguard Worker     float ar, ai, br, bi;
1755*3f1979aaSAndroid Build Coastguard Worker     ar = a[k+0]; ai = a[k+1];
1756*3f1979aaSAndroid Build Coastguard Worker     br = b[k+0]; bi = b[k+1];
1757*3f1979aaSAndroid Build Coastguard Worker     VCPLXMUL(ar, ai, br, bi);
1758*3f1979aaSAndroid Build Coastguard Worker     ab[k+0] = ar*scaling;
1759*3f1979aaSAndroid Build Coastguard Worker     ab[k+1] = ai*scaling;
1760*3f1979aaSAndroid Build Coastguard Worker   }
1761*3f1979aaSAndroid Build Coastguard Worker }
1762*3f1979aaSAndroid Build Coastguard Worker 
1763*3f1979aaSAndroid Build Coastguard Worker 
1764*3f1979aaSAndroid Build Coastguard Worker #endif /* #if ( SIMD_SZ == 4 )    * !defined(PFFFT_SIMD_DISABLE) */
1765*3f1979aaSAndroid Build Coastguard Worker 
1766*3f1979aaSAndroid Build Coastguard Worker 
/* Public entry point: forwards to FUNC_TRANSFORM_INTERNAL with the `ordered` flag set to 0. */
void FUNC_TRANSFORM_UNORDRD(SETUP_STRUCT *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  v4sf *scratch = (v4sf*)work;  /* the internal routine takes its work buffer as v4sf* */
  FUNC_TRANSFORM_INTERNAL(setup, input, output, scratch, direction, 0);
}
1770*3f1979aaSAndroid Build Coastguard Worker 
/* Public entry point: forwards to FUNC_TRANSFORM_INTERNAL with the `ordered` flag set to 1. */
void FUNC_TRANSFORM_ORDERED(SETUP_STRUCT *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  v4sf *scratch = (v4sf*)work;  /* the internal routine takes its work buffer as v4sf* */
  FUNC_TRANSFORM_INTERNAL(setup, input, output, scratch, direction, 1);
}
1774*3f1979aaSAndroid Build Coastguard Worker 
1775*3f1979aaSAndroid Build Coastguard Worker 
1776*3f1979aaSAndroid Build Coastguard Worker #if ( SIMD_SZ == 4 )
1777*3f1979aaSAndroid Build Coastguard Worker 
1778*3f1979aaSAndroid Build Coastguard Worker #define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
1779*3f1979aaSAndroid Build Coastguard Worker 
1780*3f1979aaSAndroid Build Coastguard Worker /* detect bugs with the vector support macros */
FUNC_VALIDATE_SIMD_A()1781*3f1979aaSAndroid Build Coastguard Worker void FUNC_VALIDATE_SIMD_A() {
1782*3f1979aaSAndroid Build Coastguard Worker   float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
1783*3f1979aaSAndroid Build Coastguard Worker   v4sf_union a0, a1, a2, a3, t, u;
1784*3f1979aaSAndroid Build Coastguard Worker   memcpy(a0.f, f, 4*sizeof(float));
1785*3f1979aaSAndroid Build Coastguard Worker   memcpy(a1.f, f+4, 4*sizeof(float));
1786*3f1979aaSAndroid Build Coastguard Worker   memcpy(a2.f, f+8, 4*sizeof(float));
1787*3f1979aaSAndroid Build Coastguard Worker   memcpy(a3.f, f+12, 4*sizeof(float));
1788*3f1979aaSAndroid Build Coastguard Worker 
1789*3f1979aaSAndroid Build Coastguard Worker   t = a0; u = a1; t.v = VZERO();
1790*3f1979aaSAndroid Build Coastguard Worker   printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
1791*3f1979aaSAndroid Build Coastguard Worker   t.v = VADD(a1.v, a2.v);
1792*3f1979aaSAndroid Build Coastguard Worker   printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
1793*3f1979aaSAndroid Build Coastguard Worker   t.v = VMUL(a1.v, a2.v);
1794*3f1979aaSAndroid Build Coastguard Worker   printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
1795*3f1979aaSAndroid Build Coastguard Worker   t.v = VMADD(a1.v, a2.v,a0.v);
1796*3f1979aaSAndroid Build Coastguard Worker   printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
1797*3f1979aaSAndroid Build Coastguard Worker 
1798*3f1979aaSAndroid Build Coastguard Worker   INTERLEAVE2(a1.v,a2.v,t.v,u.v);
1799*3f1979aaSAndroid Build Coastguard Worker   printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
1800*3f1979aaSAndroid Build Coastguard Worker   assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
1801*3f1979aaSAndroid Build Coastguard Worker   UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
1802*3f1979aaSAndroid Build Coastguard Worker   printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
1803*3f1979aaSAndroid Build Coastguard Worker   assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
1804*3f1979aaSAndroid Build Coastguard Worker 
1805*3f1979aaSAndroid Build Coastguard Worker   t.v=LD_PS1(f[15]);
1806*3f1979aaSAndroid Build Coastguard Worker   printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
1807*3f1979aaSAndroid Build Coastguard Worker   assertv4(t, 15, 15, 15, 15);
1808*3f1979aaSAndroid Build Coastguard Worker   t.v = VSWAPHL(a1.v, a2.v);
1809*3f1979aaSAndroid Build Coastguard Worker   printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
1810*3f1979aaSAndroid Build Coastguard Worker   assertv4(t, 8, 9, 6, 7);
1811*3f1979aaSAndroid Build Coastguard Worker   VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
1812*3f1979aaSAndroid Build Coastguard Worker   printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
1813*3f1979aaSAndroid Build Coastguard Worker          a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
1814*3f1979aaSAndroid Build Coastguard Worker          a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
1815*3f1979aaSAndroid Build Coastguard Worker   assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
1816*3f1979aaSAndroid Build Coastguard Worker }
1817*3f1979aaSAndroid Build Coastguard Worker 
1818*3f1979aaSAndroid Build Coastguard Worker 
/*
 * Soft assertion on a single value: when `result` is not within 0.01 of
 * `ref` (a NaN difference counts as a failure too), prints a diagnostic to
 * stderr and increments *numErrs. Never aborts.
 *
 * vartxt / functxt : labels for the checked value and the tested operation.
 * f / lineNo       : source location included in the diagnostic.
 */
static void pffft_assert1( float result, float ref, const char * vartxt, const char * functxt, int * numErrs, const char * f, int lineNo )
{
  const int within_tolerance = ( fabsf( result - ref ) < 0.01F );

  if ( within_tolerance )
    return;

  fprintf(stderr, "%s: assert for %s at %s(%d)\n  expected %f  value %f\n", functxt, vartxt, f, lineNo, ref, result);
  *numErrs += 1;
}
1827*3f1979aaSAndroid Build Coastguard Worker 
/*
 * Soft assertion on four values: checks v0..v3 against a..d, in that order,
 * by calling pffft_assert1 once per value with the labels "[0]".."[3]".
 */
static void pffft_assert4(  vsfscalar v0, vsfscalar v1, vsfscalar v2, vsfscalar v3,
  float a, float b, float c, float d, const char * functxt, int * numErrs, const char * f, int lineNo )
{
  static const char * const lane_txt[4] = { "[0]", "[1]", "[2]", "[3]" };
  float got[4];
  float want[4];
  int lane;

  got[0] = (float)v0;  want[0] = a;
  got[1] = (float)v1;  want[1] = b;
  got[2] = (float)v2;  want[2] = c;
  got[3] = (float)v3;  want[3] = d;

  for ( lane = 0; lane < 4; ++lane )
    pffft_assert1( got[lane], want[lane], lane_txt[lane], functxt, numErrs, f, lineNo );
}

#define PFFFT_ASSERT4( V, a, b, c, d, FUNCTXT )  pffft_assert4( (V).f[0], (V).f[1], (V).f[2], (V).f[3], a, b, c, d, FUNCTXT, &numErrs, __FILE__, __LINE__ )
1838*3f1979aaSAndroid Build Coastguard Worker 
1839*3f1979aaSAndroid Build Coastguard Worker 
FUNC_VALIDATE_SIMD_EX(FILE * DbgOut)1840*3f1979aaSAndroid Build Coastguard Worker int FUNC_VALIDATE_SIMD_EX(FILE * DbgOut)
1841*3f1979aaSAndroid Build Coastguard Worker {
1842*3f1979aaSAndroid Build Coastguard Worker   int numErrs = 0;
1843*3f1979aaSAndroid Build Coastguard Worker 
1844*3f1979aaSAndroid Build Coastguard Worker   {
1845*3f1979aaSAndroid Build Coastguard Worker     v4sf_union C;
1846*3f1979aaSAndroid Build Coastguard Worker     int k;
1847*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
1848*3f1979aaSAndroid Build Coastguard Worker 
1849*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
1850*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "\ninput: { }\n" );
1851*3f1979aaSAndroid Build Coastguard Worker     }
1852*3f1979aaSAndroid Build Coastguard Worker     C.v = VZERO();
1853*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
1854*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "VZERO(a) => C) => {\n" );
1855*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
1856*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
1857*3f1979aaSAndroid Build Coastguard Worker     }
1858*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( C, 0.0F, 0.0F, 0.0F, 0.0F, "VZERO() Out C" );
1859*3f1979aaSAndroid Build Coastguard Worker   }
1860*3f1979aaSAndroid Build Coastguard Worker 
1861*3f1979aaSAndroid Build Coastguard Worker   {
1862*3f1979aaSAndroid Build Coastguard Worker     v4sf_union C;
1863*3f1979aaSAndroid Build Coastguard Worker     float a = 42.0F;
1864*3f1979aaSAndroid Build Coastguard Worker     int k;
1865*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
1866*3f1979aaSAndroid Build Coastguard Worker 
1867*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
1868*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "\ninput: a = {\n" );
1869*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp a:  %f\n", a );
1870*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
1871*3f1979aaSAndroid Build Coastguard Worker     }
1872*3f1979aaSAndroid Build Coastguard Worker     C.v = LD_PS1(a);
1873*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
1874*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "LD_PS1(a) => C) => {\n" );
1875*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
1876*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
1877*3f1979aaSAndroid Build Coastguard Worker     }
1878*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( C, 42.0F, 42.0F, 42.0F, 42.0F, "LD_PS1() Out C" );
1879*3f1979aaSAndroid Build Coastguard Worker   }
1880*3f1979aaSAndroid Build Coastguard Worker 
1881*3f1979aaSAndroid Build Coastguard Worker   {
1882*3f1979aaSAndroid Build Coastguard Worker     v4sf_union C;
1883*3f1979aaSAndroid Build Coastguard Worker     float a[16];
1884*3f1979aaSAndroid Build Coastguard Worker     int numAligned = 0, numUnaligned = 0;
1885*3f1979aaSAndroid Build Coastguard Worker     int k;
1886*3f1979aaSAndroid Build Coastguard Worker     const char * pUn;
1887*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 16; ++k ) a[k] = k+1;
1888*3f1979aaSAndroid Build Coastguard Worker 
1889*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k + 3 < 16; ++k )
1890*3f1979aaSAndroid Build Coastguard Worker     {
1891*3f1979aaSAndroid Build Coastguard Worker       const float * ptr = &a[k];
1892*3f1979aaSAndroid Build Coastguard Worker       if (DbgOut)
1893*3f1979aaSAndroid Build Coastguard Worker         fprintf(DbgOut, "\ninput: a = [ %f, %f, %f, %f ]\n", ptr[0], ptr[1], ptr[2], ptr[3] );
1894*3f1979aaSAndroid Build Coastguard Worker       if ( VALIGNED(ptr) )
1895*3f1979aaSAndroid Build Coastguard Worker       {
1896*3f1979aaSAndroid Build Coastguard Worker         C.v = VLOAD_ALIGNED( ptr );
1897*3f1979aaSAndroid Build Coastguard Worker         pUn = "";
1898*3f1979aaSAndroid Build Coastguard Worker         ++numAligned;
1899*3f1979aaSAndroid Build Coastguard Worker       }
1900*3f1979aaSAndroid Build Coastguard Worker       else
1901*3f1979aaSAndroid Build Coastguard Worker       {
1902*3f1979aaSAndroid Build Coastguard Worker         C.v = VLOAD_UNALIGNED( ptr );
1903*3f1979aaSAndroid Build Coastguard Worker         pUn = "UN";
1904*3f1979aaSAndroid Build Coastguard Worker         ++numUnaligned;
1905*3f1979aaSAndroid Build Coastguard Worker       }
1906*3f1979aaSAndroid Build Coastguard Worker       if (DbgOut) {
1907*3f1979aaSAndroid Build Coastguard Worker         fprintf(DbgOut, "C = VLOAD_%sALIGNED(&a[%d]) => {\n", pUn, k );
1908*3f1979aaSAndroid Build Coastguard Worker         fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
1909*3f1979aaSAndroid Build Coastguard Worker         fprintf(DbgOut, "}\n" );
1910*3f1979aaSAndroid Build Coastguard Worker       }
1911*3f1979aaSAndroid Build Coastguard Worker       //PFFFT_ASSERT4( C, 32.0F, 34.0F, 36.0F, 38.0F, "VADD(): Out C" );
1912*3f1979aaSAndroid Build Coastguard Worker 
1913*3f1979aaSAndroid Build Coastguard Worker       if ( numAligned >= 1 && numUnaligned >= 4 )
1914*3f1979aaSAndroid Build Coastguard Worker         break;
1915*3f1979aaSAndroid Build Coastguard Worker     }
1916*3f1979aaSAndroid Build Coastguard Worker     if ( numAligned < 1 ) {
1917*3f1979aaSAndroid Build Coastguard Worker       fprintf(stderr, "VALIGNED() should have found at least 1 occurence!");
1918*3f1979aaSAndroid Build Coastguard Worker       ++numErrs;
1919*3f1979aaSAndroid Build Coastguard Worker     }
1920*3f1979aaSAndroid Build Coastguard Worker     if ( numUnaligned < 4 ) {
1921*3f1979aaSAndroid Build Coastguard Worker       fprintf(stderr, "!VALIGNED() should have found at least 4 occurences!");
1922*3f1979aaSAndroid Build Coastguard Worker       ++numErrs;
1923*3f1979aaSAndroid Build Coastguard Worker     }
1924*3f1979aaSAndroid Build Coastguard Worker   }
1925*3f1979aaSAndroid Build Coastguard Worker 
1926*3f1979aaSAndroid Build Coastguard Worker   {
1927*3f1979aaSAndroid Build Coastguard Worker     v4sf_union A, B, C;
1928*3f1979aaSAndroid Build Coastguard Worker     int k;
1929*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
1930*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  B.f[k] = 20 + k+1;
1931*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
1932*3f1979aaSAndroid Build Coastguard Worker 
1933*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
1934*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "\ninput: A,B = {\n" );
1935*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
1936*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
1937*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
1938*3f1979aaSAndroid Build Coastguard Worker     }
1939*3f1979aaSAndroid Build Coastguard Worker     C.v = VADD(A.v, B.v);
1940*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
1941*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "C = VADD(A,B) => {\n" );
1942*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
1943*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
1944*3f1979aaSAndroid Build Coastguard Worker     }
1945*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VADD(): Inp A" );
1946*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "VADD(): Inp B" );
1947*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( C, 32.0F, 34.0F, 36.0F, 38.0F, "VADD(): Out C" );
1948*3f1979aaSAndroid Build Coastguard Worker   }
1949*3f1979aaSAndroid Build Coastguard Worker 
1950*3f1979aaSAndroid Build Coastguard Worker   {
1951*3f1979aaSAndroid Build Coastguard Worker     v4sf_union A, B, C;
1952*3f1979aaSAndroid Build Coastguard Worker     int k;
1953*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  A.f[k] = 20 + 2*k+1;
1954*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  B.f[k] = 10 + k+1;
1955*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
1956*3f1979aaSAndroid Build Coastguard Worker 
1957*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
1958*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "\ninput: A,B = {\n" );
1959*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
1960*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
1961*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
1962*3f1979aaSAndroid Build Coastguard Worker     }
1963*3f1979aaSAndroid Build Coastguard Worker     C.v = VSUB(A.v, B.v);
1964*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
1965*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "C = VSUB(A,B) => {\n" );
1966*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
1967*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
1968*3f1979aaSAndroid Build Coastguard Worker     }
1969*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( A, 21.0F, 23.0F, 25.0F, 27.0F, "VSUB(): Inp A" );
1970*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( B, 11.0F, 12.0F, 13.0F, 14.0F, "VSUB(): Inp B" );
1971*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( C, 10.0F, 11.0F, 12.0F, 13.0F, "VSUB(): Out C" );
1972*3f1979aaSAndroid Build Coastguard Worker   }
1973*3f1979aaSAndroid Build Coastguard Worker 
1974*3f1979aaSAndroid Build Coastguard Worker   {
1975*3f1979aaSAndroid Build Coastguard Worker     v4sf_union A, B, C;
1976*3f1979aaSAndroid Build Coastguard Worker     int k;
1977*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
1978*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  B.f[k] = k+1;
1979*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
1980*3f1979aaSAndroid Build Coastguard Worker 
1981*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
1982*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "\ninput: A,B = {\n" );
1983*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
1984*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
1985*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
1986*3f1979aaSAndroid Build Coastguard Worker     }
1987*3f1979aaSAndroid Build Coastguard Worker     C.v = VMUL(A.v, B.v);
1988*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
1989*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "C = VMUL(A,B) => {\n" );
1990*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
1991*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
1992*3f1979aaSAndroid Build Coastguard Worker     }
1993*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VMUL(): Inp A" );
1994*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( B,  1.0F,  2.0F,  3.0F,  4.0F, "VMUL(): Inp B" );
1995*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( C, 11.0F, 24.0F, 39.0F, 56.0F, "VMUL(): Out C" );
1996*3f1979aaSAndroid Build Coastguard Worker   }
1997*3f1979aaSAndroid Build Coastguard Worker 
1998*3f1979aaSAndroid Build Coastguard Worker   {
1999*3f1979aaSAndroid Build Coastguard Worker     v4sf_union A, B, C, D;
2000*3f1979aaSAndroid Build Coastguard Worker     int k;
2001*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
2002*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  B.f[k] = k+1;
2003*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  C.f[k] = 10 + k;
2004*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  D.f[k] = 40 + k+1;
2005*3f1979aaSAndroid Build Coastguard Worker 
2006*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
2007*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "\ninput: A,B,C = {\n" );
2008*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
2009*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
2010*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
2011*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
2012*3f1979aaSAndroid Build Coastguard Worker     }
2013*3f1979aaSAndroid Build Coastguard Worker     D.v = VMADD(A.v, B.v, C.v);
2014*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
2015*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "D = VMADD(A,B,C) => {\n" );
2016*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out D:  %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] );
2017*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
2018*3f1979aaSAndroid Build Coastguard Worker     }
2019*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VMADD(): Inp A" );
2020*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( B,  1.0F,  2.0F,  3.0F,  4.0F, "VMADD(): Inp B" );
2021*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( C, 10.0F, 11.0F, 12.0F, 13.0F, "VMADD(): Inp C" );
2022*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( D, 21.0F, 35.0F, 51.0F, 69.0F, "VMADD(): Out D" );
2023*3f1979aaSAndroid Build Coastguard Worker   }
2024*3f1979aaSAndroid Build Coastguard Worker 
2025*3f1979aaSAndroid Build Coastguard Worker   {
2026*3f1979aaSAndroid Build Coastguard Worker     v4sf_union A, B, C, D;
2027*3f1979aaSAndroid Build Coastguard Worker     int k;
2028*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
2029*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  B.f[k] = 20 + k+1;
2030*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
2031*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  D.f[k] = 40 + k+1;
2032*3f1979aaSAndroid Build Coastguard Worker 
2033*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
2034*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "\ninput: A,B = {\n" );
2035*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
2036*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
2037*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
2038*3f1979aaSAndroid Build Coastguard Worker     }
2039*3f1979aaSAndroid Build Coastguard Worker     INTERLEAVE2(A.v, B.v, C.v, D.v);
2040*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
2041*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "INTERLEAVE2(A,B, => C,D) => {\n" );
2042*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
2043*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out D:  %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] );
2044*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
2045*3f1979aaSAndroid Build Coastguard Worker     }
2046*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "INTERLEAVE2() Inp A" );
2047*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "INTERLEAVE2() Inp B" );
2048*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( C, 11.0F, 21.0F, 12.0F, 22.0F, "INTERLEAVE2() Out C" );
2049*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( D, 13.0F, 23.0F, 14.0F, 24.0F, "INTERLEAVE2() Out D" );
2050*3f1979aaSAndroid Build Coastguard Worker   }
2051*3f1979aaSAndroid Build Coastguard Worker 
2052*3f1979aaSAndroid Build Coastguard Worker   {
2053*3f1979aaSAndroid Build Coastguard Worker     v4sf_union A, B, C, D;
2054*3f1979aaSAndroid Build Coastguard Worker     int k;
2055*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
2056*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  B.f[k] = 20 + k+1;
2057*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
2058*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  D.f[k] = 40 + k+1;
2059*3f1979aaSAndroid Build Coastguard Worker 
2060*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
2061*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "\ninput: A,B = {\n" );
2062*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
2063*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
2064*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
2065*3f1979aaSAndroid Build Coastguard Worker     }
2066*3f1979aaSAndroid Build Coastguard Worker     UNINTERLEAVE2(A.v, B.v, C.v, D.v);
2067*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
2068*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "UNINTERLEAVE2(A,B, => C,D) => {\n" );
2069*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
2070*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out D:  %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] );
2071*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
2072*3f1979aaSAndroid Build Coastguard Worker     }
2073*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "UNINTERLEAVE2() Inp A" );
2074*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "UNINTERLEAVE2() Inp B" );
2075*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( C, 11.0F, 13.0F, 21.0F, 23.0F, "UNINTERLEAVE2() Out C" );
2076*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( D, 12.0F, 14.0F, 22.0F, 24.0F, "UNINTERLEAVE2() Out D" );
2077*3f1979aaSAndroid Build Coastguard Worker   }
2078*3f1979aaSAndroid Build Coastguard Worker 
2079*3f1979aaSAndroid Build Coastguard Worker   {
2080*3f1979aaSAndroid Build Coastguard Worker     v4sf_union A, B, C, D;
2081*3f1979aaSAndroid Build Coastguard Worker     int k;
2082*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
2083*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  B.f[k] = 20 + k+1;
2084*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
2085*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  D.f[k] = 40 + k+1;
2086*3f1979aaSAndroid Build Coastguard Worker 
2087*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
2088*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "\ninput: A,B,C,D = {\n" );
2089*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
2090*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
2091*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
2092*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp D:  %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] );
2093*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
2094*3f1979aaSAndroid Build Coastguard Worker     }
2095*3f1979aaSAndroid Build Coastguard Worker     VTRANSPOSE4(A.v, B.v, C.v, D.v);
2096*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
2097*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "VTRANSPOSE4(A,B,C,D) => {\n" );
2098*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
2099*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
2100*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
2101*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out D:  %f, %f, %f, %f\n", D.f[0], D.f[1], D.f[2], D.f[3] );
2102*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
2103*3f1979aaSAndroid Build Coastguard Worker     }
2104*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( A, 11.0F, 21.0F, 31.0F, 41.0F, "VTRANSPOSE4(): Out A" );
2105*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( B, 12.0F, 22.0F, 32.0F, 42.0F, "VTRANSPOSE4(): Out B" );
2106*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( C, 13.0F, 23.0F, 33.0F, 43.0F, "VTRANSPOSE4(): Out C" );
2107*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( D, 14.0F, 24.0F, 34.0F, 44.0F, "VTRANSPOSE4(): Out D" );
2108*3f1979aaSAndroid Build Coastguard Worker   }
2109*3f1979aaSAndroid Build Coastguard Worker 
2110*3f1979aaSAndroid Build Coastguard Worker   {
2111*3f1979aaSAndroid Build Coastguard Worker     v4sf_union A, B, C;
2112*3f1979aaSAndroid Build Coastguard Worker     int k;
2113*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
2114*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  B.f[k] = 20 + k+1;
2115*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
2116*3f1979aaSAndroid Build Coastguard Worker 
2117*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
2118*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "\ninput: A,B = {\n" );
2119*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
2120*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp B:  %f, %f, %f, %f\n", B.f[0], B.f[1], B.f[2], B.f[3] );
2121*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
2122*3f1979aaSAndroid Build Coastguard Worker     }
2123*3f1979aaSAndroid Build Coastguard Worker     C.v = VSWAPHL(A.v, B.v);
2124*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
2125*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "C = VSWAPHL(A,B) => {\n" );
2126*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
2127*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
2128*3f1979aaSAndroid Build Coastguard Worker     }
2129*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VSWAPHL(): Inp A" );
2130*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( B, 21.0F, 22.0F, 23.0F, 24.0F, "VSWAPHL(): Inp B" );
2131*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( C, 21.0F, 22.0F, 13.0F, 14.0F, "VSWAPHL(): Out C" );
2132*3f1979aaSAndroid Build Coastguard Worker   }
2133*3f1979aaSAndroid Build Coastguard Worker 
2134*3f1979aaSAndroid Build Coastguard Worker   {
2135*3f1979aaSAndroid Build Coastguard Worker     v4sf_union A, C;
2136*3f1979aaSAndroid Build Coastguard Worker     int k;
2137*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
2138*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  C.f[k] = 30 + k+1;
2139*3f1979aaSAndroid Build Coastguard Worker 
2140*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
2141*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "\ninput: A = {\n" );
2142*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
2143*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
2144*3f1979aaSAndroid Build Coastguard Worker     }
2145*3f1979aaSAndroid Build Coastguard Worker     C.v = VREV_S(A.v);
2146*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
2147*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "C = VREV_S(A) => {\n" );
2148*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
2149*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
2150*3f1979aaSAndroid Build Coastguard Worker     }
2151*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VREV_S(): Inp A" );
2152*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( C, 14.0F, 13.0F, 12.0F, 11.0F, "VREV_S(): Out C" );
2153*3f1979aaSAndroid Build Coastguard Worker   }
2154*3f1979aaSAndroid Build Coastguard Worker 
2155*3f1979aaSAndroid Build Coastguard Worker   {
2156*3f1979aaSAndroid Build Coastguard Worker     v4sf_union A, C;
2157*3f1979aaSAndroid Build Coastguard Worker     int k;
2158*3f1979aaSAndroid Build Coastguard Worker     for ( k = 0; k < 4; ++k )  A.f[k] = 10 + k+1;
2159*3f1979aaSAndroid Build Coastguard Worker 
2160*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
2161*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "\ninput: A = {\n" );
2162*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Inp A:  %f, %f, %f, %f\n", A.f[0], A.f[1], A.f[2], A.f[3] );
2163*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
2164*3f1979aaSAndroid Build Coastguard Worker     }
2165*3f1979aaSAndroid Build Coastguard Worker     C.v = VREV_C(A.v);
2166*3f1979aaSAndroid Build Coastguard Worker     if (DbgOut) {
2167*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "C = VREV_C(A) => {\n" );
2168*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "  Out C:  %f, %f, %f, %f\n", C.f[0], C.f[1], C.f[2], C.f[3] );
2169*3f1979aaSAndroid Build Coastguard Worker       fprintf(DbgOut, "}\n" );
2170*3f1979aaSAndroid Build Coastguard Worker     }
2171*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( A, 11.0F, 12.0F, 13.0F, 14.0F, "VREV_C(): Inp A" );
2172*3f1979aaSAndroid Build Coastguard Worker     PFFFT_ASSERT4( C, 13.0F, 14.0F, 11.0F, 12.0F, "VREV_C(): Out A" );
2173*3f1979aaSAndroid Build Coastguard Worker   }
2174*3f1979aaSAndroid Build Coastguard Worker 
2175*3f1979aaSAndroid Build Coastguard Worker   return numErrs;
2176*3f1979aaSAndroid Build Coastguard Worker }
2177*3f1979aaSAndroid Build Coastguard Worker 
2178*3f1979aaSAndroid Build Coastguard Worker #else  /* if ( SIMD_SZ == 4 ) */
2179*3f1979aaSAndroid Build Coastguard Worker 
/* Variant compiled when the `SIMD_SZ == 4` condition does not hold:
   intentionally empty, it performs no checks and has no side effects.
   Takes no arguments and returns nothing. */
void FUNC_VALIDATE_SIMD_A(void)
{
}
2183*3f1979aaSAndroid Build Coastguard Worker 
/* Variant compiled when the `SIMD_SZ == 4` condition does not hold.

   DbgOut: accepted so the signature matches the SIMD_SZ == 4 variant;
           it is never read or written here.

   Always returns -1, whereas the SIMD_SZ == 4 variant returns its error
   count. Presumably -1 tells the caller that no validation was performed
   -- confirm against the callers. */
int FUNC_VALIDATE_SIMD_EX(FILE * DbgOut)
{
  (void)DbgOut;  /* parameter deliberately unused in this variant */
  return -1;
}
2189*3f1979aaSAndroid Build Coastguard Worker 
2190*3f1979aaSAndroid Build Coastguard Worker #endif  /* end if ( SIMD_SZ == 4 ) */
2191*3f1979aaSAndroid Build Coastguard Worker 
2192