xref: /aosp_15_r20/external/pffft/bench_pffft.c (revision 3f1979aa0d7ad34fcf3763de7b7b8f8cd67e5bdd)
1 /*
2   Copyright (c) 2013 Julien Pommier.
3   Copyright (c) 2019  Hayati Ayguen ( [email protected] )
4 
5   Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, and Apple vDSP
6 
7   How to build:
8 
9   on linux, with fftw3:
10   gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
11 
12   on macos, without fftw3:
13   clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
14 
15   on macos, with fftw3:
16   clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
17 
18   as alternative: replace clang by gcc.
19 
20   on windows, with visual c++:
21   cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
22 
23   build without SIMD instructions:
24   gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
25 
26  */
27 
28 #define CONCAT_TOKENS(A, B)  A ## B
29 #define CONCAT_THREE_TOKENS(A, B, C)  A ## B ## C
30 
31 #ifdef PFFFT_ENABLE_FLOAT
32 #include "pffft.h"
33 
34 typedef float pffft_scalar;
35 typedef PFFFT_Setup PFFFT_SETUP;
36 #define PFFFT_FUNC(F)  CONCAT_TOKENS(pffft_, F)
37 
38 #else
39 /*
40 Note: adapted for double precision dynamic range version.
41 */
42 #include "pffft_double.h"
43 
44 typedef double pffft_scalar;
45 typedef PFFFTD_Setup PFFFT_SETUP;
46 #define PFFFT_FUNC(F)  CONCAT_TOKENS(pffftd_, F)
47 #endif
48 
49 #ifdef PFFFT_ENABLE_FLOAT
50 
51 #include "fftpack.h"
52 
53 #ifdef HAVE_GREEN_FFTS
54 #include "fftext.h"
55 #endif
56 
57 #ifdef HAVE_KISS_FFT
58 #include <kiss_fft.h>
59 #include <kiss_fftr.h>
60 #endif
61 
62 #endif
63 
64 #ifdef HAVE_POCKET_FFT
65 #include <pocketfft_double.h>
66 #include <pocketfft_single.h>
67 #endif
68 
69 #ifdef PFFFT_ENABLE_FLOAT
70   #define POCKFFTR_PRE(R)   CONCAT_TOKENS(rffts, R)
71   #define POCKFFTC_PRE(R)   CONCAT_TOKENS(cffts, R)
72   #define POCKFFTR_MID(L,R) CONCAT_THREE_TOKENS(L, rffts, R)
73   #define POCKFFTC_MID(L,R) CONCAT_THREE_TOKENS(L, cffts, R)
74 #else
75   #define POCKFFTR_PRE(R)   CONCAT_TOKENS(rfft, R)
76   #define POCKFFTC_PRE(R)   CONCAT_TOKENS(cfft, R)
77   #define POCKFFTR_MID(L,R) CONCAT_THREE_TOKENS(L, rfft, R)
78   #define POCKFFTC_MID(L,R) CONCAT_THREE_TOKENS(L, cfft, R)
79 #endif
80 
81 
82 
83 #include <math.h>
84 #include <stdio.h>
85 #include <stdlib.h>
86 #include <time.h>
87 #include <assert.h>
88 #include <string.h>
89 
90 #ifdef HAVE_SYS_TIMES
91 #  include <sys/times.h>
92 #  include <unistd.h>
93 #endif
94 
95 #ifdef HAVE_VECLIB
96 #  include <Accelerate/Accelerate.h>
97 #endif
98 
99 #ifdef HAVE_FFTW
100 #  include <fftw3.h>
101 
102 #ifdef PFFFT_ENABLE_FLOAT
103 typedef fftwf_plan FFTW_PLAN;
104 typedef fftwf_complex FFTW_COMPLEX;
105 #define FFTW_FUNC(F)  CONCAT_TOKENS(fftwf_, F)
106 #else
107 typedef fftw_plan FFTW_PLAN;
108 typedef fftw_complex FFTW_COMPLEX;
109 #define FFTW_FUNC(F)  CONCAT_TOKENS(fftw_, F)
110 #endif
111 
112 #endif /* HAVE_FFTW */
113 
114 #ifndef M_LN2
115   #define M_LN2   0.69314718055994530942  /* log_e 2 */
116 #endif
117 
118 
119 #define NUM_FFT_ALGOS  9
120 enum {
121   ALGO_FFTPACK = 0,
122   ALGO_VECLIB,
123   ALGO_FFTW_ESTIM,
124   ALGO_FFTW_AUTO,
125   ALGO_GREEN,
126   ALGO_KISS,
127   ALGO_POCKET,
128   ALGO_PFFFT_U, /* = 7 */
129   ALGO_PFFFT_O  /* = 8 */
130 };
131 
132 #define NUM_TYPES      7
133 enum {
134   TYPE_PREP = 0,         /* time for preparation in ms */
135   TYPE_DUR_NS = 1,       /* time per fft in ns */
136   TYPE_DUR_FASTEST = 2,  /* relative time to fastest */
137   TYPE_REL_PFFFT = 3,    /* relative time to ALGO_PFFFT */
138   TYPE_ITER = 4,         /* # of iterations in measurement */
139   TYPE_MFLOPS = 5,       /* MFlops/sec */
140   TYPE_DUR_TOT = 6       /* test duration in sec */
141 };
142 /* double tmeas[NUM_TYPES][NUM_FFT_ALGOS]; */
143 
144 const char * algoName[NUM_FFT_ALGOS] = {
145   "FFTPack      ",
146   "vDSP (vec)   ",
147   "FFTW F(estim)",
148   "FFTW F(auto) ",
149   "Green        ",
150   "Kiss         ",
151   "Pocket       ",
152   "PFFFT-U(simd)",  /* unordered */
153   "PFFFT (simd) "   /* ordered */
154 };
155 
156 
157 int compiledInAlgo[NUM_FFT_ALGOS] = {
158 #ifdef PFFFT_ENABLE_FLOAT
159   1, /* "FFTPack    " */
160 #else
161   0, /* "FFTPack    " */
162 #endif
163 #if defined(HAVE_VECLIB) && defined(PFFFT_ENABLE_FLOAT)
164   1, /* "vDSP (vec) " */
165 #else
166   0,
167 #endif
168 #if defined(HAVE_FFTW)
169   1, /* "FFTW(estim)" */
170   1, /* "FFTW (auto)" */
171 #else
172   0, 0,
173 #endif
174 #if defined(HAVE_GREEN_FFTS) && defined(PFFFT_ENABLE_FLOAT)
175   1, /* "Green      " */
176 #else
177   0,
178 #endif
179 #if defined(HAVE_KISS_FFT) && defined(PFFFT_ENABLE_FLOAT)
180   1, /* "Kiss       " */
181 #else
182   0,
183 #endif
184 #if defined(HAVE_POCKET_FFT)
185   1, /* "Pocket     " */
186 #else
187   0,
188 #endif
189   1, /* "PFFFT_U    " */
190   1  /* "PFFFT_O    " */
191 };
192 
193 const char * algoTableHeader[NUM_FFT_ALGOS][2] = {
194 { "| real FFTPack ", "| cplx FFTPack " },
195 { "|  real   vDSP ", "|  cplx   vDSP " },
196 { "|real FFTWestim", "|cplx FFTWestim" },
197 { "|real FFTWauto ", "|cplx FFTWauto " },
198 { "|  real  Green ", "|  cplx  Green " },
199 { "|  real   Kiss ", "|  cplx   Kiss " },
200 { "|  real Pocket ", "|  cplx Pocket " },
201 { "| real PFFFT-U ", "| cplx PFFFT-U " },
202 { "|  real  PFFFT ", "|  cplx  PFFFT " } };
203 
204 const char * typeText[NUM_TYPES] = {
205   "preparation in ms",
206   "time per fft in ns",
207   "relative to fastest",
208   "relative to pffft",
209   "measured_num_iters",
210   "mflops",
211   "test duration in sec"
212 };
213 
214 const char * typeFilenamePart[NUM_TYPES] = {
215   "1-preparation-in-ms",
216   "2-timePerFft-in-ns",
217   "3-rel-fastest",
218   "4-rel-pffft",
219   "5-num-iter",
220   "6-mflops",
221   "7-duration-in-sec"
222 };
223 
224 #define SAVE_ALL_TYPES  0
225 
226 const int saveType[NUM_TYPES] = {
227   1, /* "1-preparation-in-ms" */
228   0, /* "2-timePerFft-in-ns"  */
229   0, /* "3-rel-fastest"       */
230   1, /* "4-rel-pffft"         */
231   1, /* "5-num-iter"          */
232   1, /* "6-mflops"            */
233   1, /* "7-duration-in-sec"   */
234 };
235 
236 
237 #define MAX(x,y) ((x)>(y)?(x):(y))
238 #define MIN(x,y) ((x)<(y)?(x):(y))
239 
Log2(unsigned v)240 unsigned Log2(unsigned v) {
241   /* we don't need speed records .. obvious way is good enough */
242   /* https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogObvious */
243   /* Find the log base 2 of an integer with the MSB N set in O(N) operations (the obvious way):
244    * unsigned v: 32-bit word to find the log base 2 of */
245   unsigned r = 0; /* r will be lg(v) */
246   while (v >>= 1)
247   {
248     r++;
249   }
250   return r;
251 }
252 
253 
frand()254 double frand() {
255   return rand()/(double)RAND_MAX;
256 }
257 
258 #if defined(HAVE_SYS_TIMES)
uclock_sec(void)259   inline double uclock_sec(void) {
260     static double ttclk = 0.;
261     struct tms t;
262     if (ttclk == 0.)
263       ttclk = sysconf(_SC_CLK_TCK);
264     times(&t);
265     /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
266     return ((double)t.tms_utime)) / ttclk;
267   }
268 # else
269   double uclock_sec(void)
270 { return (double)clock()/(double)CLOCKS_PER_SEC; }
271 #endif
272 
273 
274 /* compare results with the regular fftpack */
275 void pffft_validate_N(int N, int cplx) {
276 
277 #ifdef PFFFT_ENABLE_FLOAT
278 
279   int Nfloat = N*(cplx?2:1);
280   int Nbytes = Nfloat * sizeof(pffft_scalar);
281   float *ref, *in, *out, *tmp, *tmp2;
282   PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
283   int pass;
284 
285 
286   if (!s) { printf("Skipping N=%d, not supported\n", N); return; }
287   ref = PFFFT_FUNC(aligned_malloc)(Nbytes);
288   in = PFFFT_FUNC(aligned_malloc)(Nbytes);
289   out = PFFFT_FUNC(aligned_malloc)(Nbytes);
290   tmp = PFFFT_FUNC(aligned_malloc)(Nbytes);
291   tmp2 = PFFFT_FUNC(aligned_malloc)(Nbytes);
292 
293   for (pass=0; pass < 2; ++pass) {
294     float ref_max = 0;
295     int k;
296     /* printf("N=%d pass=%d cplx=%d\n", N, pass, cplx); */
297     /* compute reference solution with FFTPACK */
298     if (pass == 0) {
299       float *wrk = malloc(2*Nbytes+15*sizeof(pffft_scalar));
300       for (k=0; k < Nfloat; ++k) {
301         ref[k] = in[k] = (float)( frand()*2-1 );
302         out[k] = 1e30F;
303       }
304       if (!cplx) {
305         rffti(N, wrk);
306         rfftf(N, ref, wrk);
307         /* use our ordering for real ffts instead of the one of fftpack */
308         {
309           float refN=ref[N-1];
310           for (k=N-2; k >= 1; --k) ref[k+1] = ref[k];
311           ref[1] = refN;
312         }
313       } else {
314         cffti(N, wrk);
315         cfftf(N, ref, wrk);
316       }
317       free(wrk);
318     }
319 
320     for (k = 0; k < Nfloat; ++k) ref_max = MAX(ref_max, (float)( fabs(ref[k]) ));
321 
322 
323     /* pass 0 : non canonical ordering of transform coefficients */
324     if (pass == 0) {
325       /* test forward transform, with different input / output */
326       PFFFT_FUNC(transform)(s, in, tmp, 0, PFFFT_FORWARD);
327       memcpy(tmp2, tmp, Nbytes);
328       memcpy(tmp, in, Nbytes);
329       PFFFT_FUNC(transform)(s, tmp, tmp, 0, PFFFT_FORWARD);
330       for (k = 0; k < Nfloat; ++k) {
331         assert(tmp2[k] == tmp[k]);
332       }
333 
334       /* test reordering */
335       PFFFT_FUNC(zreorder)(s, tmp, out, PFFFT_FORWARD);
336       PFFFT_FUNC(zreorder)(s, out, tmp, PFFFT_BACKWARD);
337       for (k = 0; k < Nfloat; ++k) {
338         assert(tmp2[k] == tmp[k]);
339       }
340       PFFFT_FUNC(zreorder)(s, tmp, out, PFFFT_FORWARD);
341     } else {
342       /* pass 1 : canonical ordering of transform coeffs. */
343       PFFFT_FUNC(transform_ordered)(s, in, tmp, 0, PFFFT_FORWARD);
344       memcpy(tmp2, tmp, Nbytes);
345       memcpy(tmp, in, Nbytes);
346       PFFFT_FUNC(transform_ordered)(s, tmp, tmp, 0, PFFFT_FORWARD);
347       for (k = 0; k < Nfloat; ++k) {
348         assert(tmp2[k] == tmp[k]);
349       }
350       memcpy(out, tmp, Nbytes);
351     }
352 
353     {
354       for (k=0; k < Nfloat; ++k) {
355         if (!(fabs(ref[k] - out[k]) < 1e-3*ref_max)) {
356           printf("%s forward PFFFT mismatch found for N=%d\n", (cplx?"CPLX":"REAL"), N);
357           exit(1);
358         }
359       }
360 
361       if (pass == 0) PFFFT_FUNC(transform)(s, tmp, out, 0, PFFFT_BACKWARD);
362       else   PFFFT_FUNC(transform_ordered)(s, tmp, out, 0, PFFFT_BACKWARD);
363       memcpy(tmp2, out, Nbytes);
364       memcpy(out, tmp, Nbytes);
365       if (pass == 0) PFFFT_FUNC(transform)(s, out, out, 0, PFFFT_BACKWARD);
366       else   PFFFT_FUNC(transform_ordered)(s, out, out, 0, PFFFT_BACKWARD);
367       for (k = 0; k < Nfloat; ++k) {
368         assert(tmp2[k] == out[k]);
369         out[k] *= 1.f/N;
370       }
371       for (k = 0; k < Nfloat; ++k) {
372         if (fabs(in[k] - out[k]) > 1e-3 * ref_max) {
373           printf("pass=%d, %s IFFFT does not match for N=%d\n", pass, (cplx?"CPLX":"REAL"), N); break;
374           exit(1);
375         }
376       }
377     }
378 
379     /* quick test of the circular convolution in fft domain */
380     {
381       float conv_err = 0, conv_max = 0;
382 
383       PFFFT_FUNC(zreorder)(s, ref, tmp, PFFFT_FORWARD);
384       memset(out, 0, Nbytes);
385       PFFFT_FUNC(zconvolve_accumulate)(s, ref, ref, out, 1.0);
386       PFFFT_FUNC(zreorder)(s, out, tmp2, PFFFT_FORWARD);
387 
388       for (k=0; k < Nfloat; k += 2) {
389         float ar = tmp[k], ai=tmp[k+1];
390         if (cplx || k > 0) {
391           tmp[k] = ar*ar - ai*ai;
392           tmp[k+1] = 2*ar*ai;
393         } else {
394           tmp[0] = ar*ar;
395           tmp[1] = ai*ai;
396         }
397       }
398 
399       for (k=0; k < Nfloat; ++k) {
400         float d = fabs(tmp[k] - tmp2[k]), e = fabs(tmp[k]);
401         if (d > conv_err) conv_err = d;
402         if (e > conv_max) conv_max = e;
403       }
404       if (conv_err > 1e-5*conv_max) {
405         printf("zconvolve error ? %g %g\n", conv_err, conv_max); exit(1);
406       }
407     }
408 
409   }
410 
411   printf("%s PFFFT is OK for N=%d\n", (cplx?"CPLX":"REAL"), N); fflush(stdout);
412 
413   PFFFT_FUNC(destroy_setup)(s);
414   PFFFT_FUNC(aligned_free)(ref);
415   PFFFT_FUNC(aligned_free)(in);
416   PFFFT_FUNC(aligned_free)(out);
417   PFFFT_FUNC(aligned_free)(tmp);
418   PFFFT_FUNC(aligned_free)(tmp2);
419 
420 #endif /* PFFFT_ENABLE_FLOAT */
421 }
422 
423 void pffft_validate(int cplx) {
424   static int Ntest[] = { 16, 32, 64, 96, 128, 160, 192, 256, 288, 384, 5*96, 512, 576, 5*128, 800, 864, 1024, 2048, 2592, 4000, 4096, 12000, 36864, 0};
425   int k;
426   for (k = 0; Ntest[k]; ++k) {
427     int N = Ntest[k];
428     if (N == 16 && !cplx) continue;
429     pffft_validate_N(N, cplx);
430   }
431 }
432 
433 int array_output_format = 1;
434 
435 
436 void print_table(const char *txt, FILE *tableFile) {
437   fprintf(stdout, "%s", txt);
438   if (tableFile && tableFile != stdout)
439     fprintf(tableFile, "%s", txt);
440 }
441 
442 void print_table_flops(float mflops, FILE *tableFile) {
443   fprintf(stdout, "|%11.0f   ", mflops);
444   if (tableFile && tableFile != stdout)
445     fprintf(tableFile, "|%11.0f   ", mflops);
446 }
447 
448 void print_table_fftsize(int N, FILE *tableFile) {
449   fprintf(stdout, "|%9d  ", N);
450   if (tableFile && tableFile != stdout)
451     fprintf(tableFile, "|%9d  ", N);
452 }
453 
454 double show_output(const char *name, int N, int cplx, float flops, float t0, float t1, int max_iter, FILE *tableFile) {
455   double T = (double)(t1-t0)/2/max_iter * 1e9;
456   float mflops = flops/1e6/(t1 - t0 + 1e-16);
457   if (array_output_format) {
458     if (flops != -1)
459       print_table_flops(mflops, tableFile);
460     else
461       print_table("|        n/a   ", tableFile);
462   } else {
463     if (flops != -1) {
464       printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N, (cplx?"CPLX":"REAL"), name, mflops, (t1-t0)/2/max_iter * 1e9, max_iter);
465     }
466   }
467   fflush(stdout);
468   return T;
469 }
470 
471 double cal_benchmark(int N, int cplx) {
472   const int log2N = Log2(N);
473   int Nfloat = (cplx ? N*2 : N);
474   int Nbytes = Nfloat * sizeof(pffft_scalar);
475   pffft_scalar *X = PFFFT_FUNC(aligned_malloc)(Nbytes), *Y = PFFFT_FUNC(aligned_malloc)(Nbytes), *Z = PFFFT_FUNC(aligned_malloc)(Nbytes);
476   double t0, t1, tstop, T, nI;
477   int k, iter;
478 
479   assert( PFFFT_FUNC(is_power_of_two)(N) );
480   for (k = 0; k < Nfloat; ++k) {
481     X[k] = sqrtf(k+1);
482   }
483 
484   /* PFFFT-U (unordered) benchmark */
485   PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
486   assert(s);
487   iter = 0;
488   t0 = uclock_sec();
489   tstop = t0 + 0.25;  /* benchmark duration: 250 ms */
490   do {
491     for ( k = 0; k < 512; ++k ) {
492       PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_FORWARD);
493       PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_BACKWARD);
494       ++iter;
495     }
496     t1 = uclock_sec();
497   } while ( t1 < tstop );
498   PFFFT_FUNC(destroy_setup)(s);
499   PFFFT_FUNC(aligned_free)(X);
500   PFFFT_FUNC(aligned_free)(Y);
501   PFFFT_FUNC(aligned_free)(Z);
502 
503   T = ( t1 - t0 );  /* duration per fft() */
504   nI = ((double)iter) * ( log2N * N );  /* number of iterations "normalized" to O(N) = N*log2(N) */
505   return (nI / T);    /* normalized iterations per second */
506 }
507 
508 
509 
510 void benchmark_ffts(int N, int cplx, int withFFTWfullMeas, double iterCal, double tmeas[NUM_TYPES][NUM_FFT_ALGOS], int haveAlgo[NUM_FFT_ALGOS], FILE *tableFile ) {
511   const int log2N = Log2(N);
512   int nextPow2N = PFFFT_FUNC(next_power_of_two)(N);
513   int log2NextN = Log2(nextPow2N);
514   int pffftPow2N = nextPow2N;
515 
516   int Nfloat = (cplx ? MAX(nextPow2N, pffftPow2N)*2 : MAX(nextPow2N, pffftPow2N));
517   int Nmax, k, iter;
518   int Nbytes = Nfloat * sizeof(pffft_scalar);
519 
520   pffft_scalar *X = PFFFT_FUNC(aligned_malloc)(Nbytes + sizeof(pffft_scalar)), *Y = PFFFT_FUNC(aligned_malloc)(Nbytes + 2*sizeof(pffft_scalar) ), *Z = PFFFT_FUNC(aligned_malloc)(Nbytes);
521   double te, t0, t1, tstop, flops, Tfastest;
522 
523   const double max_test_duration = 0.150;   /* test duration 150 ms */
524   double numIter = max_test_duration * iterCal / ( log2N * N );  /* number of iteration for max_test_duration */
525   const int step_iter = MAX(1, ((int)(0.01 * numIter)) );  /* one hundredth */
526   int max_iter = MAX(1, ((int)numIter) );  /* minimum 1 iteration */
527 
528   const float checkVal = 12345.0F;
529 
530   /* printf("benchmark_ffts(N = %d, cplx = %d): Nfloat = %d, X_mem = 0x%p, X = %p\n", N, cplx, Nfloat, X_mem, X); */
531 
532   memset( X, 0, Nfloat * sizeof(pffft_scalar) );
533   if ( Nfloat < 32 ) {
534     for (k = 0; k < Nfloat; k += 4)
535       X[k] = sqrtf(k+1);
536   } else {
537     for (k = 0; k < Nfloat; k += (Nfloat/16) )
538       X[k] = sqrtf(k+1);
539   }
540 
541   for ( k = 0; k < NUM_TYPES; ++k )
542   {
543     for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
544       tmeas[k][iter] = 0.0;
545   }
546 
547 
548   /* FFTPack benchmark */
549   Nmax = (cplx ? N*2 : N);
550   X[Nmax] = checkVal;
551 #ifdef PFFFT_ENABLE_FLOAT
552   {
553     float *wrk = malloc(2*Nbytes + 15*sizeof(pffft_scalar));
554     te = uclock_sec();
555     if (cplx) cffti(N, wrk);
556     else      rffti(N, wrk);
557     t0 = uclock_sec();
558     tstop = t0 + max_test_duration;
559     max_iter = 0;
560     do {
561       for ( k = 0; k < step_iter; ++k ) {
562         if (cplx) {
563           assert( X[Nmax] == checkVal );
564           cfftf(N, X, wrk);
565           assert( X[Nmax] == checkVal );
566           cfftb(N, X, wrk);
567           assert( X[Nmax] == checkVal );
568         } else {
569           assert( X[Nmax] == checkVal );
570           rfftf(N, X, wrk);
571           assert( X[Nmax] == checkVal );
572           rfftb(N, X, wrk);
573           assert( X[Nmax] == checkVal );
574         }
575         ++max_iter;
576       }
577       t1 = uclock_sec();
578     } while ( t1 < tstop );
579 
580     free(wrk);
581 
582     flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
583     tmeas[TYPE_ITER][ALGO_FFTPACK] = max_iter;
584     tmeas[TYPE_MFLOPS][ALGO_FFTPACK] = flops/1e6/(t1 - t0 + 1e-16);
585     tmeas[TYPE_DUR_TOT][ALGO_FFTPACK] = t1 - t0;
586     tmeas[TYPE_DUR_NS][ALGO_FFTPACK] = show_output("FFTPack", N, cplx, flops, t0, t1, max_iter, tableFile);
587     tmeas[TYPE_PREP][ALGO_FFTPACK] = (t0 - te) * 1e3;
588     haveAlgo[ALGO_FFTPACK] = 1;
589   }
590 #endif
591 
592 #if defined(HAVE_VECLIB) && defined(PFFFT_ENABLE_FLOAT)
593   Nmax = (cplx ? nextPow2N*2 : nextPow2N);
594   X[Nmax] = checkVal;
595   te = uclock_sec();
596   if ( 1 || PFFFT_FUNC(is_power_of_two)(N) ) {
597     FFTSetup setup;
598 
599     setup = vDSP_create_fftsetup(log2NextN, FFT_RADIX2);
600     DSPSplitComplex zsamples;
601     zsamples.realp = &X[0];
602     zsamples.imagp = &X[Nfloat/2];
603     t0 = uclock_sec();
604     tstop = t0 + max_test_duration;
605     max_iter = 0;
606     do {
607       for ( k = 0; k < step_iter; ++k ) {
608         if (cplx) {
609           assert( X[Nmax] == checkVal );
610           vDSP_fft_zip(setup, &zsamples, 1, log2NextN, kFFTDirection_Forward);
611           assert( X[Nmax] == checkVal );
612           vDSP_fft_zip(setup, &zsamples, 1, log2NextN, kFFTDirection_Inverse);
613           assert( X[Nmax] == checkVal );
614         } else {
615           assert( X[Nmax] == checkVal );
616           vDSP_fft_zrip(setup, &zsamples, 1, log2NextN, kFFTDirection_Forward);
617           assert( X[Nmax] == checkVal );
618           vDSP_fft_zrip(setup, &zsamples, 1, log2NextN, kFFTDirection_Inverse);
619           assert( X[Nmax] == checkVal );
620         }
621         ++max_iter;
622       }
623       t1 = uclock_sec();
624     } while ( t1 < tstop );
625 
626     vDSP_destroy_fftsetup(setup);
627     flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
628     tmeas[TYPE_ITER][ALGO_VECLIB] = max_iter;
629     tmeas[TYPE_MFLOPS][ALGO_VECLIB] = flops/1e6/(t1 - t0 + 1e-16);
630     tmeas[TYPE_DUR_TOT][ALGO_VECLIB] = t1 - t0;
631     tmeas[TYPE_DUR_NS][ALGO_VECLIB] = show_output("vDSP", N, cplx, flops, t0, t1, max_iter, tableFile);
632     tmeas[TYPE_PREP][ALGO_VECLIB] = (t0 - te) * 1e3;
633     haveAlgo[ALGO_VECLIB] = 1;
634   } else {
635     show_output("vDSP", N, cplx, -1, -1, -1, -1, tableFile);
636   }
637 #endif
638 
639 #if defined(HAVE_FFTW)
640   Nmax = (cplx ? N*2 : N);
641   X[Nmax] = checkVal;
642   {
643     /* int flags = (N <= (256*1024) ? FFTW_MEASURE : FFTW_ESTIMATE);  measure takes a lot of time on largest ffts */
644     int flags = FFTW_ESTIMATE;
645     te = uclock_sec();
646 
647     FFTW_PLAN planf, planb;
648     FFTW_COMPLEX *in = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N);
649     FFTW_COMPLEX *out = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N);
650     memset(in, 0, sizeof(FFTW_COMPLEX) * N);
651     if (cplx) {
652       planf = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_FORWARD, flags);
653       planb = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_BACKWARD, flags);
654     } else {
655       planf = FFTW_FUNC(plan_dft_r2c_1d)(N, (pffft_scalar*)in, out, flags);
656       planb = FFTW_FUNC(plan_dft_c2r_1d)(N, in, (pffft_scalar*)out, flags);
657     }
658 
659     t0 = uclock_sec();
660     tstop = t0 + max_test_duration;
661     max_iter = 0;
662     do {
663       for ( k = 0; k < step_iter; ++k ) {
664         assert( X[Nmax] == checkVal );
665         FFTW_FUNC(execute)(planf);
666         assert( X[Nmax] == checkVal );
667         FFTW_FUNC(execute)(planb);
668         assert( X[Nmax] == checkVal );
669         ++max_iter;
670       }
671       t1 = uclock_sec();
672     } while ( t1 < tstop );
673 
674     FFTW_FUNC(destroy_plan)(planf);
675     FFTW_FUNC(destroy_plan)(planb);
676     FFTW_FUNC(free)(in); FFTW_FUNC(free)(out);
677 
678     flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
679     tmeas[TYPE_ITER][ALGO_FFTW_ESTIM] = max_iter;
680     tmeas[TYPE_MFLOPS][ALGO_FFTW_ESTIM] = flops/1e6/(t1 - t0 + 1e-16);
681     tmeas[TYPE_DUR_TOT][ALGO_FFTW_ESTIM] = t1 - t0;
682     tmeas[TYPE_DUR_NS][ALGO_FFTW_ESTIM] = show_output((flags == FFTW_MEASURE ? algoName[ALGO_FFTW_AUTO] : algoName[ALGO_FFTW_ESTIM]), N, cplx, flops, t0, t1, max_iter, tableFile);
683     tmeas[TYPE_PREP][ALGO_FFTW_ESTIM] = (t0 - te) * 1e3;
684     haveAlgo[ALGO_FFTW_ESTIM] = 1;
685   }
686   Nmax = (cplx ? N*2 : N);
687   X[Nmax] = checkVal;
688   do {
689     /* int flags = (N <= (256*1024) ? FFTW_MEASURE : FFTW_ESTIMATE);  measure takes a lot of time on largest ffts */
690     /* int flags = FFTW_MEASURE; */
691 #if ( defined(__arm__) || defined(__aarch64__) || defined(__arm64__) )
692     int limitFFTsize = 31;  /* takes over a second on Raspberry Pi 3 B+ -- and much much more on higher ffts sizes! */
693 #else
694     int limitFFTsize = 2400;  /* take over a second on i7 for fft size 2400 */
695 #endif
696     int flags = (N < limitFFTsize ? FFTW_MEASURE : (withFFTWfullMeas ? FFTW_MEASURE : FFTW_ESTIMATE));
697 
698     if (flags == FFTW_ESTIMATE) {
699       show_output((flags == FFTW_MEASURE ? algoName[ALGO_FFTW_AUTO] : algoName[ALGO_FFTW_ESTIM]), N, cplx, -1, -1, -1, -1, tableFile);
700       /* copy values from estimation */
701       tmeas[TYPE_ITER][ALGO_FFTW_AUTO] = tmeas[TYPE_ITER][ALGO_FFTW_ESTIM];
702       tmeas[TYPE_DUR_TOT][ALGO_FFTW_AUTO] = tmeas[TYPE_DUR_TOT][ALGO_FFTW_ESTIM];
703       tmeas[TYPE_DUR_NS][ALGO_FFTW_AUTO] = tmeas[TYPE_DUR_NS][ALGO_FFTW_ESTIM];
704       tmeas[TYPE_PREP][ALGO_FFTW_AUTO] = tmeas[TYPE_PREP][ALGO_FFTW_ESTIM];
705     } else {
706       te = uclock_sec();
707       FFTW_PLAN planf, planb;
708       FFTW_COMPLEX *in = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N);
709       FFTW_COMPLEX *out = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N);
710       memset(in, 0, sizeof(FFTW_COMPLEX) * N);
711       if (cplx) {
712         planf = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_FORWARD, flags);
713         planb = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_BACKWARD, flags);
714       } else {
715         planf = FFTW_FUNC(plan_dft_r2c_1d)(N, (pffft_scalar*)in, out, flags);
716         planb = FFTW_FUNC(plan_dft_c2r_1d)(N, in, (pffft_scalar*)out, flags);
717       }
718 
719       t0 = uclock_sec();
720       tstop = t0 + max_test_duration;
721       max_iter = 0;
722       do {
723         for ( k = 0; k < step_iter; ++k ) {
724           assert( X[Nmax] == checkVal );
725           FFTW_FUNC(execute)(planf);
726           assert( X[Nmax] == checkVal );
727           FFTW_FUNC(execute)(planb);
728           assert( X[Nmax] == checkVal );
729           ++max_iter;
730         }
731         t1 = uclock_sec();
732       } while ( t1 < tstop );
733 
734       FFTW_FUNC(destroy_plan)(planf);
735       FFTW_FUNC(destroy_plan)(planb);
736       FFTW_FUNC(free)(in); FFTW_FUNC(free)(out);
737 
738       flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
739       tmeas[TYPE_ITER][ALGO_FFTW_AUTO] = max_iter;
740       tmeas[TYPE_MFLOPS][ALGO_FFTW_AUTO] = flops/1e6/(t1 - t0 + 1e-16);
741       tmeas[TYPE_DUR_TOT][ALGO_FFTW_AUTO] = t1 - t0;
742       tmeas[TYPE_DUR_NS][ALGO_FFTW_AUTO] = show_output((flags == FFTW_MEASURE ? algoName[ALGO_FFTW_AUTO] : algoName[ALGO_FFTW_ESTIM]), N, cplx, flops, t0, t1, max_iter, tableFile);
743       tmeas[TYPE_PREP][ALGO_FFTW_AUTO] = (t0 - te) * 1e3;
744       haveAlgo[ALGO_FFTW_AUTO] = 1;
745     }
746   } while (0);
747 #else
748   (void)withFFTWfullMeas;
749 #endif
750 
751 #if defined(HAVE_GREEN_FFTS) && defined(PFFFT_ENABLE_FLOAT)
752   Nmax = (cplx ? nextPow2N*2 : nextPow2N);
753   X[Nmax] = checkVal;
754   if ( 1 || PFFFT_FUNC(is_power_of_two)(N) )
755   {
756     te = uclock_sec();
757     fftInit(log2NextN);
758 
759     t0 = uclock_sec();
760     tstop = t0 + max_test_duration;
761     max_iter = 0;
762     do {
763       for ( k = 0; k < step_iter; ++k ) {
764         if (cplx) {
765           assert( X[Nmax] == checkVal );
766           ffts(X, log2NextN, 1);
767           assert( X[Nmax] == checkVal );
768           iffts(X, log2NextN, 1);
769           assert( X[Nmax] == checkVal );
770         } else {
771           rffts(X, log2NextN, 1);
772           riffts(X, log2NextN, 1);
773         }
774 
775         ++max_iter;
776       }
777       t1 = uclock_sec();
778     } while ( t1 < tstop );
779 
780     fftFree();
781 
782     flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
783     tmeas[TYPE_ITER][ALGO_GREEN] = max_iter;
784     tmeas[TYPE_MFLOPS][ALGO_GREEN] = flops/1e6/(t1 - t0 + 1e-16);
785     tmeas[TYPE_DUR_TOT][ALGO_GREEN] = t1 - t0;
786     tmeas[TYPE_DUR_NS][ALGO_GREEN] = show_output("Green", N, cplx, flops, t0, t1, max_iter, tableFile);
787     tmeas[TYPE_PREP][ALGO_GREEN] = (t0 - te) * 1e3;
788     haveAlgo[ALGO_GREEN] = 1;
789   } else {
790     show_output("Green", N, cplx, -1, -1, -1, -1, tableFile);
791   }
792 #endif
793 
794 #if defined(HAVE_KISS_FFT) && defined(PFFFT_ENABLE_FLOAT)
795   Nmax = (cplx ? nextPow2N*2 : nextPow2N);
796   X[Nmax] = checkVal;
797   if ( 1 || PFFFT_FUNC(is_power_of_two)(N) )
798   {
799     kiss_fft_cfg stf;
800     kiss_fft_cfg sti;
801     kiss_fftr_cfg stfr;
802     kiss_fftr_cfg stir;
803 
804     te = uclock_sec();
805     if (cplx) {
806       stf = kiss_fft_alloc(nextPow2N, 0, 0, 0);
807       sti = kiss_fft_alloc(nextPow2N, 1, 0, 0);
808     } else {
809       stfr = kiss_fftr_alloc(nextPow2N, 0, 0, 0);
810       stir = kiss_fftr_alloc(nextPow2N, 1, 0, 0);
811     }
812 
813     t0 = uclock_sec();
814     tstop = t0 + max_test_duration;
815     max_iter = 0;
816     do {
817       for ( k = 0; k < step_iter; ++k ) {
818         if (cplx) {
819           assert( X[Nmax] == checkVal );
820           kiss_fft(stf, (const kiss_fft_cpx *)X, (kiss_fft_cpx *)Y);
821           assert( X[Nmax] == checkVal );
822           kiss_fft(sti, (const kiss_fft_cpx *)Y, (kiss_fft_cpx *)X);
823           assert( X[Nmax] == checkVal );
824         } else {
825           assert( X[Nmax] == checkVal );
826           kiss_fftr(stfr, X, (kiss_fft_cpx *)Y);
827           assert( X[Nmax] == checkVal );
828           kiss_fftri(stir, (const kiss_fft_cpx *)Y, X);
829           assert( X[Nmax] == checkVal );
830         }
831         ++max_iter;
832       }
833       t1 = uclock_sec();
834     } while ( t1 < tstop );
835 
836     kiss_fft_cleanup();
837 
838     flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
839     tmeas[TYPE_ITER][ALGO_KISS] = max_iter;
840     tmeas[TYPE_MFLOPS][ALGO_KISS] = flops/1e6/(t1 - t0 + 1e-16);
841     tmeas[TYPE_DUR_TOT][ALGO_KISS] = t1 - t0;
842     tmeas[TYPE_DUR_NS][ALGO_KISS] = show_output("Kiss", N, cplx, flops, t0, t1, max_iter, tableFile);
843     tmeas[TYPE_PREP][ALGO_KISS] = (t0 - te) * 1e3;
844     haveAlgo[ALGO_KISS] = 1;
845   } else {
846     show_output("Kiss", N, cplx, -1, -1, -1, -1, tableFile);
847   }
848 #endif
849 
850 #if defined(HAVE_POCKET_FFT)
851 
852   Nmax = (cplx ? nextPow2N*2 : nextPow2N);
853   X[Nmax] = checkVal;
854   if ( 1 || PFFFT_FUNC(is_power_of_two)(N) )
855   {
856     POCKFFTR_PRE(_plan) planr;
857     POCKFFTC_PRE(_plan) planc;
858 
859     te = uclock_sec();
860     if (cplx) {
861       planc = POCKFFTC_MID(make_,_plan)(nextPow2N);
862     } else {
863       planr = POCKFFTR_MID(make_,_plan)(nextPow2N);
864     }
865 
866     t0 = uclock_sec();
867     tstop = t0 + max_test_duration;
868     max_iter = 0;
869     do {
870       for ( k = 0; k < step_iter; ++k ) {
871         if (cplx) {
872           assert( X[Nmax] == checkVal );
873           memcpy(Y, X, 2*nextPow2N * sizeof(pffft_scalar) );
874           POCKFFTC_PRE(_forward)(planc, Y, 1.);
875           assert( X[Nmax] == checkVal );
876           memcpy(X, Y, 2*nextPow2N * sizeof(pffft_scalar) );
877           POCKFFTC_PRE(_backward)(planc, X, 1./nextPow2N);
878           assert( X[Nmax] == checkVal );
879         } else {
880           assert( X[Nmax] == checkVal );
881           memcpy(Y, X, nextPow2N * sizeof(pffft_scalar) );
882           POCKFFTR_PRE(_forward)(planr, Y, 1.);
883           assert( X[Nmax] == checkVal );
884           memcpy(X, Y, nextPow2N * sizeof(pffft_scalar) );
885           POCKFFTR_PRE(_backward)(planr, X, 1./nextPow2N);
886           assert( X[Nmax] == checkVal );
887         }
888         ++max_iter;
889       }
890       t1 = uclock_sec();
891     } while ( t1 < tstop );
892 
893     if (cplx) {
894       POCKFFTC_MID(destroy_,_plan)(planc);
895     } else {
896       POCKFFTR_MID(destroy_,_plan)(planr);
897     }
898 
899     flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
900     tmeas[TYPE_ITER][ALGO_POCKET] = max_iter;
901     tmeas[TYPE_MFLOPS][ALGO_POCKET] = flops/1e6/(t1 - t0 + 1e-16);
902     tmeas[TYPE_DUR_TOT][ALGO_POCKET] = t1 - t0;
903     tmeas[TYPE_DUR_NS][ALGO_POCKET] = show_output("Pocket", N, cplx, flops, t0, t1, max_iter, tableFile);
904     tmeas[TYPE_PREP][ALGO_POCKET] = (t0 - te) * 1e3;
905     haveAlgo[ALGO_POCKET] = 1;
906   } else {
907     show_output("Pocket", N, cplx, -1, -1, -1, -1, tableFile);
908   }
909 #endif
910 
911 
912   /* PFFFT-U (unordered) benchmark */
913   Nmax = (cplx ? pffftPow2N*2 : pffftPow2N);
914   X[Nmax] = checkVal;
915   if ( pffftPow2N >= PFFFT_FUNC(min_fft_size)(cplx ? PFFFT_COMPLEX : PFFFT_REAL) )
916   {
917     te = uclock_sec();
918     PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(pffftPow2N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
919     if (s) {
920       t0 = uclock_sec();
921       tstop = t0 + max_test_duration;
922       max_iter = 0;
923       do {
924         for ( k = 0; k < step_iter; ++k ) {
925           assert( X[Nmax] == checkVal );
926           PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_FORWARD);
927           assert( X[Nmax] == checkVal );
928           PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_BACKWARD);
929           assert( X[Nmax] == checkVal );
930           ++max_iter;
931         }
932         t1 = uclock_sec();
933       } while ( t1 < tstop );
934 
935       PFFFT_FUNC(destroy_setup)(s);
936 
937       flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
938       tmeas[TYPE_ITER][ALGO_PFFFT_U] = max_iter;
939       tmeas[TYPE_MFLOPS][ALGO_PFFFT_U] = flops/1e6/(t1 - t0 + 1e-16);
940       tmeas[TYPE_DUR_TOT][ALGO_PFFFT_U] = t1 - t0;
941       tmeas[TYPE_DUR_NS][ALGO_PFFFT_U] = show_output("PFFFT-U", N, cplx, flops, t0, t1, max_iter, tableFile);
942       tmeas[TYPE_PREP][ALGO_PFFFT_U] = (t0 - te) * 1e3;
943       haveAlgo[ALGO_PFFFT_U] = 1;
944     }
945   } else {
946     show_output("PFFFT-U", N, cplx, -1, -1, -1, -1, tableFile);
947   }
948 
949 
950   if ( pffftPow2N >= PFFFT_FUNC(min_fft_size)(cplx ? PFFFT_COMPLEX : PFFFT_REAL) )
951   {
952     te = uclock_sec();
953     PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(pffftPow2N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
954     if (s) {
955       t0 = uclock_sec();
956       tstop = t0 + max_test_duration;
957       max_iter = 0;
958       do {
959         for ( k = 0; k < step_iter; ++k ) {
960           assert( X[Nmax] == checkVal );
961           PFFFT_FUNC(transform_ordered)(s, X, Z, Y, PFFFT_FORWARD);
962           assert( X[Nmax] == checkVal );
963           PFFFT_FUNC(transform_ordered)(s, X, Z, Y, PFFFT_BACKWARD);
964           assert( X[Nmax] == checkVal );
965           ++max_iter;
966         }
967         t1 = uclock_sec();
968       } while ( t1 < tstop );
969 
970       PFFFT_FUNC(destroy_setup)(s);
971 
972       flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
973       tmeas[TYPE_ITER][ALGO_PFFFT_O] = max_iter;
974       tmeas[TYPE_MFLOPS][ALGO_PFFFT_O] = flops/1e6/(t1 - t0 + 1e-16);
975       tmeas[TYPE_DUR_TOT][ALGO_PFFFT_O] = t1 - t0;
976       tmeas[TYPE_DUR_NS][ALGO_PFFFT_O] = show_output("PFFFT", N, cplx, flops, t0, t1, max_iter, tableFile);
977       tmeas[TYPE_PREP][ALGO_PFFFT_O] = (t0 - te) * 1e3;
978       haveAlgo[ALGO_PFFFT_O] = 1;
979     }
980   } else {
981     show_output("PFFFT", N, cplx, -1, -1, -1, -1, tableFile);
982   }
983 
984   if (!array_output_format)
985   {
986     printf("prepare/ms:     ");
987     for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
988     {
989       if ( haveAlgo[iter] && tmeas[TYPE_DUR_NS][iter] > 0.0 ) {
990         printf("%s %.3f    ", algoName[iter], tmeas[TYPE_PREP][iter] );
991       }
992     }
993     printf("\n");
994   }
995   Tfastest = 0.0;
996   for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
997   {
998     if ( Tfastest == 0.0 || ( tmeas[TYPE_DUR_NS][iter] != 0.0 && tmeas[TYPE_DUR_NS][iter] < Tfastest ) )
999       Tfastest = tmeas[TYPE_DUR_NS][iter];
1000   }
1001   if ( Tfastest > 0.0 )
1002   {
1003     if (!array_output_format)
1004       printf("relative fast:  ");
1005     for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
1006     {
1007       if ( haveAlgo[iter] && tmeas[TYPE_DUR_NS][iter] > 0.0 ) {
1008         tmeas[TYPE_DUR_FASTEST][iter] = tmeas[TYPE_DUR_NS][iter] / Tfastest;
1009         if (!array_output_format)
1010           printf("%s %.3f    ", algoName[iter], tmeas[TYPE_DUR_FASTEST][iter] );
1011       }
1012     }
1013     if (!array_output_format)
1014       printf("\n");
1015   }
1016 
1017   {
1018     if (!array_output_format)
1019       printf("relative pffft: ");
1020     for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
1021     {
1022       if ( haveAlgo[iter] && tmeas[TYPE_DUR_NS][iter] > 0.0 ) {
1023         tmeas[TYPE_REL_PFFFT][iter] = tmeas[TYPE_DUR_NS][iter] / tmeas[TYPE_DUR_NS][ALGO_PFFFT_O];
1024         if (!array_output_format)
1025           printf("%s %.3f    ", algoName[iter], tmeas[TYPE_REL_PFFFT][iter] );
1026       }
1027     }
1028     if (!array_output_format)
1029       printf("\n");
1030   }
1031 
1032   if (!array_output_format) {
1033     printf("--\n");
1034   }
1035 
1036   PFFFT_FUNC(aligned_free)(X);
1037   PFFFT_FUNC(aligned_free)(Y);
1038   PFFFT_FUNC(aligned_free)(Z);
1039 }
1040 
1041 
1042 /* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */
1043 void validate_pffft_simd();
1044 int  validate_pffft_simd_ex(FILE * DbgOut);
1045 void validate_pffftd_simd();
1046 int  validate_pffftd_simd_ex(FILE * DbgOut);
1047 
1048 
1049 
1050 int main(int argc, char **argv) {
1051   /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
1052      and 32 for real FFTs -- a lot of stuff would need to be rewritten to
1053      handle other cases (or maybe just switch to a scalar fft, I don't know..) */
1054 
1055 #if 0  /* include powers of 2 ? */
1056 #define NUMNONPOW2LENS  23
1057   int NnonPow2[NUMNONPOW2LENS] = {
1058     64, 96, 128, 160, 192,   256, 384, 5*96, 512, 5*128,
1059     3*256, 800, 1024, 2048, 2400,   4096, 8192, 9*1024, 16384, 32768,
1060     256*1024, 1024*1024, -1 };
1061 #else
1062 #define NUMNONPOW2LENS  11
1063   int NnonPow2[NUMNONPOW2LENS] = {
1064     96, 160, 192, 384, 5*96,   5*128,3*256, 800, 2400, 9*1024,
1065     -1 };
1066 #endif
1067 
1068 #define NUMPOW2FFTLENS  22
1069 #define MAXNUMFFTLENS MAX( NUMPOW2FFTLENS, NUMNONPOW2LENS )
1070   int Npow2[NUMPOW2FFTLENS];  /* exp = 1 .. 21, -1 */
1071   const int *Nvalues = NULL;
1072   double tmeas[2][MAXNUMFFTLENS][NUM_TYPES][NUM_FFT_ALGOS];
1073   double iterCalReal, iterCalCplx;
1074 
1075   int benchReal=1, benchCplx=1, withFFTWfullMeas=0, outputTable2File=1, usePow2=1;
1076   int realCplxIdx, typeIdx;
1077   int i, k;
1078   FILE *tableFile = NULL;
1079 
1080   int haveAlgo[NUM_FFT_ALGOS];
1081   char acCsvFilename[64];
1082 
1083   for ( k = 1; k <= NUMPOW2FFTLENS; ++k )
1084     Npow2[k-1] = (k == NUMPOW2FFTLENS) ? -1 : (1 << k);
1085   Nvalues = Npow2;  /* set default .. for comparisons .. */
1086 
1087   for ( i = 0; i < NUM_FFT_ALGOS; ++i )
1088     haveAlgo[i] = 0;
1089 
1090   printf("pffft architecture:    '%s'\n", PFFFT_FUNC(simd_arch)());
1091   printf("pffft SIMD size:       %d\n", PFFFT_FUNC(simd_size)());
1092   printf("pffft min real fft:    %d\n", PFFFT_FUNC(min_fft_size)(PFFFT_REAL));
1093   printf("pffft min complex fft: %d\n", PFFFT_FUNC(min_fft_size)(PFFFT_COMPLEX));
1094   printf("\n");
1095 
1096   for ( i = 1; i < argc; ++i ) {
1097     if (!strcmp(argv[i], "--array-format") || !strcmp(argv[i], "--table")) {
1098       array_output_format = 1;
1099     }
1100     else if (!strcmp(argv[i], "--no-tab")) {
1101       array_output_format = 0;
1102     }
1103     else if (!strcmp(argv[i], "--real")) {
1104       benchCplx = 0;
1105     }
1106     else if (!strcmp(argv[i], "--cplx")) {
1107       benchReal = 0;
1108     }
1109     else if (!strcmp(argv[i], "--fftw-full-measure")) {
1110       withFFTWfullMeas = 1;
1111     }
1112     else if (!strcmp(argv[i], "--non-pow2")) {
1113       Nvalues = NnonPow2;
1114       usePow2 = 0;
1115     }
1116     else /* if (!strcmp(argv[i], "--help")) */ {
1117       printf("usage: %s [--array-format|--table] [--no-tab] [--real|--cplx] [--fftw-full-measure] [--non-pow2]\n", argv[0]);
1118       exit(0);
1119     }
1120   }
1121 
1122 #ifdef HAVE_FFTW
1123 #ifdef PFFFT_ENABLE_DOUBLE
1124   algoName[ALGO_FFTW_ESTIM] = "FFTW D(estim)";
1125   algoName[ALGO_FFTW_AUTO]  = "FFTW D(auto) ";
1126 #endif
1127 
1128   if (withFFTWfullMeas)
1129   {
1130 #ifdef PFFFT_ENABLE_FLOAT
1131     algoName[ALGO_FFTW_AUTO] = "FFTWF(meas)"; /* "FFTW (auto)" */
1132 #else
1133     algoName[ALGO_FFTW_AUTO] = "FFTWD(meas)"; /* "FFTW (auto)" */
1134 #endif
1135     algoTableHeader[NUM_FFT_ALGOS][0] = "|real FFTWmeas "; /* "|real FFTWauto " */
1136     algoTableHeader[NUM_FFT_ALGOS][0] = "|cplx FFTWmeas "; /* "|cplx FFTWauto " */
1137   }
1138 #endif
1139 
1140   if ( PFFFT_FUNC(simd_size)() == 1 )
1141   {
1142     algoName[ALGO_PFFFT_U] = "PFFFTU scal-1";
1143     algoName[ALGO_PFFFT_O] = "PFFFT scal-1 ";
1144   }
1145   else if ( !strcmp(PFFFT_FUNC(simd_arch)(), "4xScalar") )
1146   {
1147     algoName[ALGO_PFFFT_U] = "PFFFT-U scal4";
1148     algoName[ALGO_PFFFT_O] = "PFFFT scal-4 ";
1149   }
1150 
1151 
1152   clock();
1153   /* double TClockDur = 1.0 / CLOCKS_PER_SEC;
1154   printf("clock() duration for CLOCKS_PER_SEC = %f sec = %f ms\n", TClockDur, 1000.0 * TClockDur );
1155   */
1156 
1157   /* calibrate test duration */
1158   {
1159     double t0, t1, dur;
1160     printf("calibrating fft benchmark duration at size N = 512 ..\n");
1161     t0 = uclock_sec();
1162     if (benchReal) {
1163       iterCalReal = cal_benchmark(512, 0 /* real fft */);
1164       printf("real fft iterCal = %f\n", iterCalReal);
1165     }
1166     if (benchCplx) {
1167       iterCalCplx = cal_benchmark(512, 1 /* cplx fft */);
1168       printf("cplx fft iterCal = %f\n", iterCalCplx);
1169     }
1170     t1 = uclock_sec();
1171     dur = t1 - t0;
1172     printf("calibration done in %f sec.\n\n", dur);
1173   }
1174 
1175   if (!array_output_format) {
1176     if (benchReal) {
1177       for (i=0; Nvalues[i] > 0; ++i)
1178         benchmark_ffts(Nvalues[i], 0 /* real fft */, withFFTWfullMeas, iterCalReal, tmeas[0][i], haveAlgo, NULL);
1179     }
1180     if (benchCplx) {
1181       for (i=0; Nvalues[i] > 0; ++i)
1182         benchmark_ffts(Nvalues[i], 1 /* cplx fft */, withFFTWfullMeas, iterCalCplx, tmeas[1][i], haveAlgo, NULL);
1183     }
1184 
1185   } else {
1186 
1187     if (outputTable2File) {
1188       tableFile = fopen( usePow2 ? "bench-fft-table-pow2.txt" : "bench-fft-table-non2.txt", "w");
1189     }
1190     /* print table headers */
1191     printf("table shows MFlops; higher values indicate faster computation\n\n");
1192 
1193     {
1194       print_table("| input len ", tableFile);
1195       for (realCplxIdx = 0; realCplxIdx < 2; ++realCplxIdx)
1196       {
1197         if ( (realCplxIdx == 0 && !benchReal) || (realCplxIdx == 1 && !benchCplx) )
1198           continue;
1199         for (k=0; k < NUM_FFT_ALGOS; ++k)
1200         {
1201           if ( compiledInAlgo[k] )
1202             print_table(algoTableHeader[k][realCplxIdx], tableFile);
1203         }
1204       }
1205       print_table("|\n", tableFile);
1206     }
1207     /* print table value seperators */
1208     {
1209       print_table("|----------", tableFile);
1210       for (realCplxIdx = 0; realCplxIdx < 2; ++realCplxIdx)
1211       {
1212         if ( (realCplxIdx == 0 && !benchReal) || (realCplxIdx == 1 && !benchCplx) )
1213           continue;
1214         for (k=0; k < NUM_FFT_ALGOS; ++k)
1215         {
1216           if ( compiledInAlgo[k] )
1217             print_table(":|-------------", tableFile);
1218         }
1219       }
1220       print_table(":|\n", tableFile);
1221     }
1222 
1223     for (i=0; Nvalues[i] > 0; ++i) {
1224       {
1225         double t0, t1;
1226         print_table_fftsize(Nvalues[i], tableFile);
1227         t0 = uclock_sec();
1228         if (benchReal)
1229           benchmark_ffts(Nvalues[i], 0, withFFTWfullMeas, iterCalReal, tmeas[0][i], haveAlgo, tableFile);
1230         if (benchCplx)
1231           benchmark_ffts(Nvalues[i], 1, withFFTWfullMeas, iterCalCplx, tmeas[1][i], haveAlgo, tableFile);
1232         t1 = uclock_sec();
1233         print_table("|\n", tableFile);
1234         /* printf("all ffts for size %d took %f sec\n", Nvalues[i], t1-t0); */
1235         (void)t0;
1236         (void)t1;
1237       }
1238     }
1239     fprintf(stdout, " (numbers are given in MFlops)\n");
1240     if (outputTable2File) {
1241       fclose(tableFile);
1242     }
1243   }
1244 
1245   printf("\n");
1246   printf("now writing .csv files ..\n");
1247 
1248   for (realCplxIdx = 0; realCplxIdx < 2; ++realCplxIdx)
1249   {
1250     if ( (benchReal && realCplxIdx == 0) || (benchCplx && realCplxIdx == 1) )
1251     {
1252       for (typeIdx = 0; typeIdx < NUM_TYPES; ++typeIdx)
1253       {
1254         FILE *f = NULL;
1255         if ( !(SAVE_ALL_TYPES || saveType[typeIdx]) )
1256           continue;
1257         acCsvFilename[0] = 0;
1258 #ifdef PFFFT_SIMD_DISABLE
1259         strcat(acCsvFilename, "scal-");
1260 #else
1261         strcat(acCsvFilename, "simd-");
1262 #endif
1263         strcat(acCsvFilename, (realCplxIdx == 0 ? "real-" : "cplx-"));
1264         strcat(acCsvFilename, ( usePow2 ? "pow2-" : "non2-"));
1265         assert( strlen(acCsvFilename) + strlen(typeFilenamePart[typeIdx]) + 5 < (sizeof(acCsvFilename) / sizeof(acCsvFilename[0])) );
1266         strcat(acCsvFilename, typeFilenamePart[typeIdx]);
1267         strcat(acCsvFilename, ".csv");
1268         f = fopen(acCsvFilename, "w");
1269         if (!f)
1270           continue;
1271         {
1272           fprintf(f, "size, log2, ");
1273           for (k=0; k < NUM_FFT_ALGOS; ++k)
1274             if ( haveAlgo[k] )
1275               fprintf(f, "%s, ", algoName[k]);
1276           fprintf(f, "\n");
1277         }
1278         for (i=0; Nvalues[i] > 0; ++i)
1279         {
1280           {
1281             fprintf(f, "%d, %.3f, ", Nvalues[i], log10((double)Nvalues[i])/log10(2.0) );
1282             for (k=0; k < NUM_FFT_ALGOS; ++k)
1283               if ( haveAlgo[k] )
1284                 fprintf(f, "%f, ", tmeas[realCplxIdx][i][typeIdx][k]);
1285             fprintf(f, "\n");
1286           }
1287         }
1288         fclose(f);
1289       }
1290     }
1291   }
1292 
1293   return 0;
1294 }
1295 
1296