1 /*
2 Copyright (c) 2013 Julien Pommier.
3 Copyright (c) 2019 Hayati Ayguen ( [email protected] )
4
5 Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, and Apple vDSP
6
7 How to build:
8
9 on linux, with fftw3:
10 gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
11
12 on macos, without fftw3:
13 clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
14
15 on macos, with fftw3:
16 clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
17
18 as alternative: replace clang by gcc.
19
20 on windows, with visual c++:
21 cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
22
23 build without SIMD instructions:
24 gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
25
26 */
27
28 #define CONCAT_TOKENS(A, B) A ## B
29 #define CONCAT_THREE_TOKENS(A, B, C) A ## B ## C
30
31 #ifdef PFFFT_ENABLE_FLOAT
32 #include "pffft.h"
33
34 typedef float pffft_scalar;
35 typedef PFFFT_Setup PFFFT_SETUP;
36 #define PFFFT_FUNC(F) CONCAT_TOKENS(pffft_, F)
37
38 #else
39 /*
40 Note: adapted for double precision dynamic range version.
41 */
42 #include "pffft_double.h"
43
44 typedef double pffft_scalar;
45 typedef PFFFTD_Setup PFFFT_SETUP;
46 #define PFFFT_FUNC(F) CONCAT_TOKENS(pffftd_, F)
47 #endif
48
49 #ifdef PFFFT_ENABLE_FLOAT
50
51 #include "fftpack.h"
52
53 #ifdef HAVE_GREEN_FFTS
54 #include "fftext.h"
55 #endif
56
57 #ifdef HAVE_KISS_FFT
58 #include <kiss_fft.h>
59 #include <kiss_fftr.h>
60 #endif
61
62 #endif
63
64 #ifdef HAVE_POCKET_FFT
65 #include <pocketfft_double.h>
66 #include <pocketfft_single.h>
67 #endif
68
69 #ifdef PFFFT_ENABLE_FLOAT
70 #define POCKFFTR_PRE(R) CONCAT_TOKENS(rffts, R)
71 #define POCKFFTC_PRE(R) CONCAT_TOKENS(cffts, R)
72 #define POCKFFTR_MID(L,R) CONCAT_THREE_TOKENS(L, rffts, R)
73 #define POCKFFTC_MID(L,R) CONCAT_THREE_TOKENS(L, cffts, R)
74 #else
75 #define POCKFFTR_PRE(R) CONCAT_TOKENS(rfft, R)
76 #define POCKFFTC_PRE(R) CONCAT_TOKENS(cfft, R)
77 #define POCKFFTR_MID(L,R) CONCAT_THREE_TOKENS(L, rfft, R)
78 #define POCKFFTC_MID(L,R) CONCAT_THREE_TOKENS(L, cfft, R)
79 #endif
80
81
82
83 #include <math.h>
84 #include <stdio.h>
85 #include <stdlib.h>
86 #include <time.h>
87 #include <assert.h>
88 #include <string.h>
89
90 #ifdef HAVE_SYS_TIMES
91 # include <sys/times.h>
92 # include <unistd.h>
93 #endif
94
95 #ifdef HAVE_VECLIB
96 # include <Accelerate/Accelerate.h>
97 #endif
98
99 #ifdef HAVE_FFTW
100 # include <fftw3.h>
101
102 #ifdef PFFFT_ENABLE_FLOAT
103 typedef fftwf_plan FFTW_PLAN;
104 typedef fftwf_complex FFTW_COMPLEX;
105 #define FFTW_FUNC(F) CONCAT_TOKENS(fftwf_, F)
106 #else
107 typedef fftw_plan FFTW_PLAN;
108 typedef fftw_complex FFTW_COMPLEX;
109 #define FFTW_FUNC(F) CONCAT_TOKENS(fftw_, F)
110 #endif
111
112 #endif /* HAVE_FFTW */
113
114 #ifndef M_LN2
115 #define M_LN2 0.69314718055994530942 /* log_e 2 */
116 #endif
117
118
119 #define NUM_FFT_ALGOS 9
120 enum {
121 ALGO_FFTPACK = 0,
122 ALGO_VECLIB,
123 ALGO_FFTW_ESTIM,
124 ALGO_FFTW_AUTO,
125 ALGO_GREEN,
126 ALGO_KISS,
127 ALGO_POCKET,
128 ALGO_PFFFT_U, /* = 7 */
129 ALGO_PFFFT_O /* = 8 */
130 };
131
132 #define NUM_TYPES 7
133 enum {
134 TYPE_PREP = 0, /* time for preparation in ms */
135 TYPE_DUR_NS = 1, /* time per fft in ns */
136 TYPE_DUR_FASTEST = 2, /* relative time to fastest */
137 TYPE_REL_PFFFT = 3, /* relative time to ALGO_PFFFT */
138 TYPE_ITER = 4, /* # of iterations in measurement */
139 TYPE_MFLOPS = 5, /* MFlops/sec */
140 TYPE_DUR_TOT = 6 /* test duration in sec */
141 };
142 /* double tmeas[NUM_TYPES][NUM_FFT_ALGOS]; */
143
144 const char * algoName[NUM_FFT_ALGOS] = {
145 "FFTPack ",
146 "vDSP (vec) ",
147 "FFTW F(estim)",
148 "FFTW F(auto) ",
149 "Green ",
150 "Kiss ",
151 "Pocket ",
152 "PFFFT-U(simd)", /* unordered */
153 "PFFFT (simd) " /* ordered */
154 };
155
156
157 int compiledInAlgo[NUM_FFT_ALGOS] = {
158 #ifdef PFFFT_ENABLE_FLOAT
159 1, /* "FFTPack " */
160 #else
161 0, /* "FFTPack " */
162 #endif
163 #if defined(HAVE_VECLIB) && defined(PFFFT_ENABLE_FLOAT)
164 1, /* "vDSP (vec) " */
165 #else
166 0,
167 #endif
168 #if defined(HAVE_FFTW)
169 1, /* "FFTW(estim)" */
170 1, /* "FFTW (auto)" */
171 #else
172 0, 0,
173 #endif
174 #if defined(HAVE_GREEN_FFTS) && defined(PFFFT_ENABLE_FLOAT)
175 1, /* "Green " */
176 #else
177 0,
178 #endif
179 #if defined(HAVE_KISS_FFT) && defined(PFFFT_ENABLE_FLOAT)
180 1, /* "Kiss " */
181 #else
182 0,
183 #endif
184 #if defined(HAVE_POCKET_FFT)
185 1, /* "Pocket " */
186 #else
187 0,
188 #endif
189 1, /* "PFFFT_U " */
190 1 /* "PFFFT_O " */
191 };
192
193 const char * algoTableHeader[NUM_FFT_ALGOS][2] = {
194 { "| real FFTPack ", "| cplx FFTPack " },
195 { "| real vDSP ", "| cplx vDSP " },
196 { "|real FFTWestim", "|cplx FFTWestim" },
197 { "|real FFTWauto ", "|cplx FFTWauto " },
198 { "| real Green ", "| cplx Green " },
199 { "| real Kiss ", "| cplx Kiss " },
200 { "| real Pocket ", "| cplx Pocket " },
201 { "| real PFFFT-U ", "| cplx PFFFT-U " },
202 { "| real PFFFT ", "| cplx PFFFT " } };
203
204 const char * typeText[NUM_TYPES] = {
205 "preparation in ms",
206 "time per fft in ns",
207 "relative to fastest",
208 "relative to pffft",
209 "measured_num_iters",
210 "mflops",
211 "test duration in sec"
212 };
213
214 const char * typeFilenamePart[NUM_TYPES] = {
215 "1-preparation-in-ms",
216 "2-timePerFft-in-ns",
217 "3-rel-fastest",
218 "4-rel-pffft",
219 "5-num-iter",
220 "6-mflops",
221 "7-duration-in-sec"
222 };
223
224 #define SAVE_ALL_TYPES 0
225
226 const int saveType[NUM_TYPES] = {
227 1, /* "1-preparation-in-ms" */
228 0, /* "2-timePerFft-in-ns" */
229 0, /* "3-rel-fastest" */
230 1, /* "4-rel-pffft" */
231 1, /* "5-num-iter" */
232 1, /* "6-mflops" */
233 1, /* "7-duration-in-sec" */
234 };
235
236
237 #define MAX(x,y) ((x)>(y)?(x):(y))
238 #define MIN(x,y) ((x)<(y)?(x):(y))
239
Log2(unsigned v)240 unsigned Log2(unsigned v) {
241 /* we don't need speed records .. obvious way is good enough */
242 /* https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogObvious */
243 /* Find the log base 2 of an integer with the MSB N set in O(N) operations (the obvious way):
244 * unsigned v: 32-bit word to find the log base 2 of */
245 unsigned r = 0; /* r will be lg(v) */
246 while (v >>= 1)
247 {
248 r++;
249 }
250 return r;
251 }
252
253
frand()254 double frand() {
255 return rand()/(double)RAND_MAX;
256 }
257
258 #if defined(HAVE_SYS_TIMES)
uclock_sec(void)259 inline double uclock_sec(void) {
260 static double ttclk = 0.;
261 struct tms t;
262 if (ttclk == 0.)
263 ttclk = sysconf(_SC_CLK_TCK);
264 times(&t);
265 /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
266 return ((double)t.tms_utime)) / ttclk;
267 }
268 # else
269 double uclock_sec(void)
270 { return (double)clock()/(double)CLOCKS_PER_SEC; }
271 #endif
272
273
274 /* compare results with the regular fftpack */
275 void pffft_validate_N(int N, int cplx) {
276
277 #ifdef PFFFT_ENABLE_FLOAT
278
279 int Nfloat = N*(cplx?2:1);
280 int Nbytes = Nfloat * sizeof(pffft_scalar);
281 float *ref, *in, *out, *tmp, *tmp2;
282 PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
283 int pass;
284
285
286 if (!s) { printf("Skipping N=%d, not supported\n", N); return; }
287 ref = PFFFT_FUNC(aligned_malloc)(Nbytes);
288 in = PFFFT_FUNC(aligned_malloc)(Nbytes);
289 out = PFFFT_FUNC(aligned_malloc)(Nbytes);
290 tmp = PFFFT_FUNC(aligned_malloc)(Nbytes);
291 tmp2 = PFFFT_FUNC(aligned_malloc)(Nbytes);
292
293 for (pass=0; pass < 2; ++pass) {
294 float ref_max = 0;
295 int k;
296 /* printf("N=%d pass=%d cplx=%d\n", N, pass, cplx); */
297 /* compute reference solution with FFTPACK */
298 if (pass == 0) {
299 float *wrk = malloc(2*Nbytes+15*sizeof(pffft_scalar));
300 for (k=0; k < Nfloat; ++k) {
301 ref[k] = in[k] = (float)( frand()*2-1 );
302 out[k] = 1e30F;
303 }
304 if (!cplx) {
305 rffti(N, wrk);
306 rfftf(N, ref, wrk);
307 /* use our ordering for real ffts instead of the one of fftpack */
308 {
309 float refN=ref[N-1];
310 for (k=N-2; k >= 1; --k) ref[k+1] = ref[k];
311 ref[1] = refN;
312 }
313 } else {
314 cffti(N, wrk);
315 cfftf(N, ref, wrk);
316 }
317 free(wrk);
318 }
319
320 for (k = 0; k < Nfloat; ++k) ref_max = MAX(ref_max, (float)( fabs(ref[k]) ));
321
322
323 /* pass 0 : non canonical ordering of transform coefficients */
324 if (pass == 0) {
325 /* test forward transform, with different input / output */
326 PFFFT_FUNC(transform)(s, in, tmp, 0, PFFFT_FORWARD);
327 memcpy(tmp2, tmp, Nbytes);
328 memcpy(tmp, in, Nbytes);
329 PFFFT_FUNC(transform)(s, tmp, tmp, 0, PFFFT_FORWARD);
330 for (k = 0; k < Nfloat; ++k) {
331 assert(tmp2[k] == tmp[k]);
332 }
333
334 /* test reordering */
335 PFFFT_FUNC(zreorder)(s, tmp, out, PFFFT_FORWARD);
336 PFFFT_FUNC(zreorder)(s, out, tmp, PFFFT_BACKWARD);
337 for (k = 0; k < Nfloat; ++k) {
338 assert(tmp2[k] == tmp[k]);
339 }
340 PFFFT_FUNC(zreorder)(s, tmp, out, PFFFT_FORWARD);
341 } else {
342 /* pass 1 : canonical ordering of transform coeffs. */
343 PFFFT_FUNC(transform_ordered)(s, in, tmp, 0, PFFFT_FORWARD);
344 memcpy(tmp2, tmp, Nbytes);
345 memcpy(tmp, in, Nbytes);
346 PFFFT_FUNC(transform_ordered)(s, tmp, tmp, 0, PFFFT_FORWARD);
347 for (k = 0; k < Nfloat; ++k) {
348 assert(tmp2[k] == tmp[k]);
349 }
350 memcpy(out, tmp, Nbytes);
351 }
352
353 {
354 for (k=0; k < Nfloat; ++k) {
355 if (!(fabs(ref[k] - out[k]) < 1e-3*ref_max)) {
356 printf("%s forward PFFFT mismatch found for N=%d\n", (cplx?"CPLX":"REAL"), N);
357 exit(1);
358 }
359 }
360
361 if (pass == 0) PFFFT_FUNC(transform)(s, tmp, out, 0, PFFFT_BACKWARD);
362 else PFFFT_FUNC(transform_ordered)(s, tmp, out, 0, PFFFT_BACKWARD);
363 memcpy(tmp2, out, Nbytes);
364 memcpy(out, tmp, Nbytes);
365 if (pass == 0) PFFFT_FUNC(transform)(s, out, out, 0, PFFFT_BACKWARD);
366 else PFFFT_FUNC(transform_ordered)(s, out, out, 0, PFFFT_BACKWARD);
367 for (k = 0; k < Nfloat; ++k) {
368 assert(tmp2[k] == out[k]);
369 out[k] *= 1.f/N;
370 }
371 for (k = 0; k < Nfloat; ++k) {
372 if (fabs(in[k] - out[k]) > 1e-3 * ref_max) {
373 printf("pass=%d, %s IFFFT does not match for N=%d\n", pass, (cplx?"CPLX":"REAL"), N); break;
374 exit(1);
375 }
376 }
377 }
378
379 /* quick test of the circular convolution in fft domain */
380 {
381 float conv_err = 0, conv_max = 0;
382
383 PFFFT_FUNC(zreorder)(s, ref, tmp, PFFFT_FORWARD);
384 memset(out, 0, Nbytes);
385 PFFFT_FUNC(zconvolve_accumulate)(s, ref, ref, out, 1.0);
386 PFFFT_FUNC(zreorder)(s, out, tmp2, PFFFT_FORWARD);
387
388 for (k=0; k < Nfloat; k += 2) {
389 float ar = tmp[k], ai=tmp[k+1];
390 if (cplx || k > 0) {
391 tmp[k] = ar*ar - ai*ai;
392 tmp[k+1] = 2*ar*ai;
393 } else {
394 tmp[0] = ar*ar;
395 tmp[1] = ai*ai;
396 }
397 }
398
399 for (k=0; k < Nfloat; ++k) {
400 float d = fabs(tmp[k] - tmp2[k]), e = fabs(tmp[k]);
401 if (d > conv_err) conv_err = d;
402 if (e > conv_max) conv_max = e;
403 }
404 if (conv_err > 1e-5*conv_max) {
405 printf("zconvolve error ? %g %g\n", conv_err, conv_max); exit(1);
406 }
407 }
408
409 }
410
411 printf("%s PFFFT is OK for N=%d\n", (cplx?"CPLX":"REAL"), N); fflush(stdout);
412
413 PFFFT_FUNC(destroy_setup)(s);
414 PFFFT_FUNC(aligned_free)(ref);
415 PFFFT_FUNC(aligned_free)(in);
416 PFFFT_FUNC(aligned_free)(out);
417 PFFFT_FUNC(aligned_free)(tmp);
418 PFFFT_FUNC(aligned_free)(tmp2);
419
420 #endif /* PFFFT_ENABLE_FLOAT */
421 }
422
423 void pffft_validate(int cplx) {
424 static int Ntest[] = { 16, 32, 64, 96, 128, 160, 192, 256, 288, 384, 5*96, 512, 576, 5*128, 800, 864, 1024, 2048, 2592, 4000, 4096, 12000, 36864, 0};
425 int k;
426 for (k = 0; Ntest[k]; ++k) {
427 int N = Ntest[k];
428 if (N == 16 && !cplx) continue;
429 pffft_validate_N(N, cplx);
430 }
431 }
432
433 int array_output_format = 1;
434
435
436 void print_table(const char *txt, FILE *tableFile) {
437 fprintf(stdout, "%s", txt);
438 if (tableFile && tableFile != stdout)
439 fprintf(tableFile, "%s", txt);
440 }
441
442 void print_table_flops(float mflops, FILE *tableFile) {
443 fprintf(stdout, "|%11.0f ", mflops);
444 if (tableFile && tableFile != stdout)
445 fprintf(tableFile, "|%11.0f ", mflops);
446 }
447
448 void print_table_fftsize(int N, FILE *tableFile) {
449 fprintf(stdout, "|%9d ", N);
450 if (tableFile && tableFile != stdout)
451 fprintf(tableFile, "|%9d ", N);
452 }
453
454 double show_output(const char *name, int N, int cplx, float flops, float t0, float t1, int max_iter, FILE *tableFile) {
455 double T = (double)(t1-t0)/2/max_iter * 1e9;
456 float mflops = flops/1e6/(t1 - t0 + 1e-16);
457 if (array_output_format) {
458 if (flops != -1)
459 print_table_flops(mflops, tableFile);
460 else
461 print_table("| n/a ", tableFile);
462 } else {
463 if (flops != -1) {
464 printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N, (cplx?"CPLX":"REAL"), name, mflops, (t1-t0)/2/max_iter * 1e9, max_iter);
465 }
466 }
467 fflush(stdout);
468 return T;
469 }
470
471 double cal_benchmark(int N, int cplx) {
472 const int log2N = Log2(N);
473 int Nfloat = (cplx ? N*2 : N);
474 int Nbytes = Nfloat * sizeof(pffft_scalar);
475 pffft_scalar *X = PFFFT_FUNC(aligned_malloc)(Nbytes), *Y = PFFFT_FUNC(aligned_malloc)(Nbytes), *Z = PFFFT_FUNC(aligned_malloc)(Nbytes);
476 double t0, t1, tstop, T, nI;
477 int k, iter;
478
479 assert( PFFFT_FUNC(is_power_of_two)(N) );
480 for (k = 0; k < Nfloat; ++k) {
481 X[k] = sqrtf(k+1);
482 }
483
484 /* PFFFT-U (unordered) benchmark */
485 PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
486 assert(s);
487 iter = 0;
488 t0 = uclock_sec();
489 tstop = t0 + 0.25; /* benchmark duration: 250 ms */
490 do {
491 for ( k = 0; k < 512; ++k ) {
492 PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_FORWARD);
493 PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_BACKWARD);
494 ++iter;
495 }
496 t1 = uclock_sec();
497 } while ( t1 < tstop );
498 PFFFT_FUNC(destroy_setup)(s);
499 PFFFT_FUNC(aligned_free)(X);
500 PFFFT_FUNC(aligned_free)(Y);
501 PFFFT_FUNC(aligned_free)(Z);
502
503 T = ( t1 - t0 ); /* duration per fft() */
504 nI = ((double)iter) * ( log2N * N ); /* number of iterations "normalized" to O(N) = N*log2(N) */
505 return (nI / T); /* normalized iterations per second */
506 }
507
508
509
510 void benchmark_ffts(int N, int cplx, int withFFTWfullMeas, double iterCal, double tmeas[NUM_TYPES][NUM_FFT_ALGOS], int haveAlgo[NUM_FFT_ALGOS], FILE *tableFile ) {
511 const int log2N = Log2(N);
512 int nextPow2N = PFFFT_FUNC(next_power_of_two)(N);
513 int log2NextN = Log2(nextPow2N);
514 int pffftPow2N = nextPow2N;
515
516 int Nfloat = (cplx ? MAX(nextPow2N, pffftPow2N)*2 : MAX(nextPow2N, pffftPow2N));
517 int Nmax, k, iter;
518 int Nbytes = Nfloat * sizeof(pffft_scalar);
519
520 pffft_scalar *X = PFFFT_FUNC(aligned_malloc)(Nbytes + sizeof(pffft_scalar)), *Y = PFFFT_FUNC(aligned_malloc)(Nbytes + 2*sizeof(pffft_scalar) ), *Z = PFFFT_FUNC(aligned_malloc)(Nbytes);
521 double te, t0, t1, tstop, flops, Tfastest;
522
523 const double max_test_duration = 0.150; /* test duration 150 ms */
524 double numIter = max_test_duration * iterCal / ( log2N * N ); /* number of iteration for max_test_duration */
525 const int step_iter = MAX(1, ((int)(0.01 * numIter)) ); /* one hundredth */
526 int max_iter = MAX(1, ((int)numIter) ); /* minimum 1 iteration */
527
528 const float checkVal = 12345.0F;
529
530 /* printf("benchmark_ffts(N = %d, cplx = %d): Nfloat = %d, X_mem = 0x%p, X = %p\n", N, cplx, Nfloat, X_mem, X); */
531
532 memset( X, 0, Nfloat * sizeof(pffft_scalar) );
533 if ( Nfloat < 32 ) {
534 for (k = 0; k < Nfloat; k += 4)
535 X[k] = sqrtf(k+1);
536 } else {
537 for (k = 0; k < Nfloat; k += (Nfloat/16) )
538 X[k] = sqrtf(k+1);
539 }
540
541 for ( k = 0; k < NUM_TYPES; ++k )
542 {
543 for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
544 tmeas[k][iter] = 0.0;
545 }
546
547
548 /* FFTPack benchmark */
549 Nmax = (cplx ? N*2 : N);
550 X[Nmax] = checkVal;
551 #ifdef PFFFT_ENABLE_FLOAT
552 {
553 float *wrk = malloc(2*Nbytes + 15*sizeof(pffft_scalar));
554 te = uclock_sec();
555 if (cplx) cffti(N, wrk);
556 else rffti(N, wrk);
557 t0 = uclock_sec();
558 tstop = t0 + max_test_duration;
559 max_iter = 0;
560 do {
561 for ( k = 0; k < step_iter; ++k ) {
562 if (cplx) {
563 assert( X[Nmax] == checkVal );
564 cfftf(N, X, wrk);
565 assert( X[Nmax] == checkVal );
566 cfftb(N, X, wrk);
567 assert( X[Nmax] == checkVal );
568 } else {
569 assert( X[Nmax] == checkVal );
570 rfftf(N, X, wrk);
571 assert( X[Nmax] == checkVal );
572 rfftb(N, X, wrk);
573 assert( X[Nmax] == checkVal );
574 }
575 ++max_iter;
576 }
577 t1 = uclock_sec();
578 } while ( t1 < tstop );
579
580 free(wrk);
581
582 flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
583 tmeas[TYPE_ITER][ALGO_FFTPACK] = max_iter;
584 tmeas[TYPE_MFLOPS][ALGO_FFTPACK] = flops/1e6/(t1 - t0 + 1e-16);
585 tmeas[TYPE_DUR_TOT][ALGO_FFTPACK] = t1 - t0;
586 tmeas[TYPE_DUR_NS][ALGO_FFTPACK] = show_output("FFTPack", N, cplx, flops, t0, t1, max_iter, tableFile);
587 tmeas[TYPE_PREP][ALGO_FFTPACK] = (t0 - te) * 1e3;
588 haveAlgo[ALGO_FFTPACK] = 1;
589 }
590 #endif
591
592 #if defined(HAVE_VECLIB) && defined(PFFFT_ENABLE_FLOAT)
593 Nmax = (cplx ? nextPow2N*2 : nextPow2N);
594 X[Nmax] = checkVal;
595 te = uclock_sec();
596 if ( 1 || PFFFT_FUNC(is_power_of_two)(N) ) {
597 FFTSetup setup;
598
599 setup = vDSP_create_fftsetup(log2NextN, FFT_RADIX2);
600 DSPSplitComplex zsamples;
601 zsamples.realp = &X[0];
602 zsamples.imagp = &X[Nfloat/2];
603 t0 = uclock_sec();
604 tstop = t0 + max_test_duration;
605 max_iter = 0;
606 do {
607 for ( k = 0; k < step_iter; ++k ) {
608 if (cplx) {
609 assert( X[Nmax] == checkVal );
610 vDSP_fft_zip(setup, &zsamples, 1, log2NextN, kFFTDirection_Forward);
611 assert( X[Nmax] == checkVal );
612 vDSP_fft_zip(setup, &zsamples, 1, log2NextN, kFFTDirection_Inverse);
613 assert( X[Nmax] == checkVal );
614 } else {
615 assert( X[Nmax] == checkVal );
616 vDSP_fft_zrip(setup, &zsamples, 1, log2NextN, kFFTDirection_Forward);
617 assert( X[Nmax] == checkVal );
618 vDSP_fft_zrip(setup, &zsamples, 1, log2NextN, kFFTDirection_Inverse);
619 assert( X[Nmax] == checkVal );
620 }
621 ++max_iter;
622 }
623 t1 = uclock_sec();
624 } while ( t1 < tstop );
625
626 vDSP_destroy_fftsetup(setup);
627 flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
628 tmeas[TYPE_ITER][ALGO_VECLIB] = max_iter;
629 tmeas[TYPE_MFLOPS][ALGO_VECLIB] = flops/1e6/(t1 - t0 + 1e-16);
630 tmeas[TYPE_DUR_TOT][ALGO_VECLIB] = t1 - t0;
631 tmeas[TYPE_DUR_NS][ALGO_VECLIB] = show_output("vDSP", N, cplx, flops, t0, t1, max_iter, tableFile);
632 tmeas[TYPE_PREP][ALGO_VECLIB] = (t0 - te) * 1e3;
633 haveAlgo[ALGO_VECLIB] = 1;
634 } else {
635 show_output("vDSP", N, cplx, -1, -1, -1, -1, tableFile);
636 }
637 #endif
638
639 #if defined(HAVE_FFTW)
640 Nmax = (cplx ? N*2 : N);
641 X[Nmax] = checkVal;
642 {
643 /* int flags = (N <= (256*1024) ? FFTW_MEASURE : FFTW_ESTIMATE); measure takes a lot of time on largest ffts */
644 int flags = FFTW_ESTIMATE;
645 te = uclock_sec();
646
647 FFTW_PLAN planf, planb;
648 FFTW_COMPLEX *in = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N);
649 FFTW_COMPLEX *out = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N);
650 memset(in, 0, sizeof(FFTW_COMPLEX) * N);
651 if (cplx) {
652 planf = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_FORWARD, flags);
653 planb = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_BACKWARD, flags);
654 } else {
655 planf = FFTW_FUNC(plan_dft_r2c_1d)(N, (pffft_scalar*)in, out, flags);
656 planb = FFTW_FUNC(plan_dft_c2r_1d)(N, in, (pffft_scalar*)out, flags);
657 }
658
659 t0 = uclock_sec();
660 tstop = t0 + max_test_duration;
661 max_iter = 0;
662 do {
663 for ( k = 0; k < step_iter; ++k ) {
664 assert( X[Nmax] == checkVal );
665 FFTW_FUNC(execute)(planf);
666 assert( X[Nmax] == checkVal );
667 FFTW_FUNC(execute)(planb);
668 assert( X[Nmax] == checkVal );
669 ++max_iter;
670 }
671 t1 = uclock_sec();
672 } while ( t1 < tstop );
673
674 FFTW_FUNC(destroy_plan)(planf);
675 FFTW_FUNC(destroy_plan)(planb);
676 FFTW_FUNC(free)(in); FFTW_FUNC(free)(out);
677
678 flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
679 tmeas[TYPE_ITER][ALGO_FFTW_ESTIM] = max_iter;
680 tmeas[TYPE_MFLOPS][ALGO_FFTW_ESTIM] = flops/1e6/(t1 - t0 + 1e-16);
681 tmeas[TYPE_DUR_TOT][ALGO_FFTW_ESTIM] = t1 - t0;
682 tmeas[TYPE_DUR_NS][ALGO_FFTW_ESTIM] = show_output((flags == FFTW_MEASURE ? algoName[ALGO_FFTW_AUTO] : algoName[ALGO_FFTW_ESTIM]), N, cplx, flops, t0, t1, max_iter, tableFile);
683 tmeas[TYPE_PREP][ALGO_FFTW_ESTIM] = (t0 - te) * 1e3;
684 haveAlgo[ALGO_FFTW_ESTIM] = 1;
685 }
686 Nmax = (cplx ? N*2 : N);
687 X[Nmax] = checkVal;
688 do {
689 /* int flags = (N <= (256*1024) ? FFTW_MEASURE : FFTW_ESTIMATE); measure takes a lot of time on largest ffts */
690 /* int flags = FFTW_MEASURE; */
691 #if ( defined(__arm__) || defined(__aarch64__) || defined(__arm64__) )
692 int limitFFTsize = 31; /* takes over a second on Raspberry Pi 3 B+ -- and much much more on higher ffts sizes! */
693 #else
694 int limitFFTsize = 2400; /* take over a second on i7 for fft size 2400 */
695 #endif
696 int flags = (N < limitFFTsize ? FFTW_MEASURE : (withFFTWfullMeas ? FFTW_MEASURE : FFTW_ESTIMATE));
697
698 if (flags == FFTW_ESTIMATE) {
699 show_output((flags == FFTW_MEASURE ? algoName[ALGO_FFTW_AUTO] : algoName[ALGO_FFTW_ESTIM]), N, cplx, -1, -1, -1, -1, tableFile);
700 /* copy values from estimation */
701 tmeas[TYPE_ITER][ALGO_FFTW_AUTO] = tmeas[TYPE_ITER][ALGO_FFTW_ESTIM];
702 tmeas[TYPE_DUR_TOT][ALGO_FFTW_AUTO] = tmeas[TYPE_DUR_TOT][ALGO_FFTW_ESTIM];
703 tmeas[TYPE_DUR_NS][ALGO_FFTW_AUTO] = tmeas[TYPE_DUR_NS][ALGO_FFTW_ESTIM];
704 tmeas[TYPE_PREP][ALGO_FFTW_AUTO] = tmeas[TYPE_PREP][ALGO_FFTW_ESTIM];
705 } else {
706 te = uclock_sec();
707 FFTW_PLAN planf, planb;
708 FFTW_COMPLEX *in = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N);
709 FFTW_COMPLEX *out = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N);
710 memset(in, 0, sizeof(FFTW_COMPLEX) * N);
711 if (cplx) {
712 planf = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_FORWARD, flags);
713 planb = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_BACKWARD, flags);
714 } else {
715 planf = FFTW_FUNC(plan_dft_r2c_1d)(N, (pffft_scalar*)in, out, flags);
716 planb = FFTW_FUNC(plan_dft_c2r_1d)(N, in, (pffft_scalar*)out, flags);
717 }
718
719 t0 = uclock_sec();
720 tstop = t0 + max_test_duration;
721 max_iter = 0;
722 do {
723 for ( k = 0; k < step_iter; ++k ) {
724 assert( X[Nmax] == checkVal );
725 FFTW_FUNC(execute)(planf);
726 assert( X[Nmax] == checkVal );
727 FFTW_FUNC(execute)(planb);
728 assert( X[Nmax] == checkVal );
729 ++max_iter;
730 }
731 t1 = uclock_sec();
732 } while ( t1 < tstop );
733
734 FFTW_FUNC(destroy_plan)(planf);
735 FFTW_FUNC(destroy_plan)(planb);
736 FFTW_FUNC(free)(in); FFTW_FUNC(free)(out);
737
738 flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
739 tmeas[TYPE_ITER][ALGO_FFTW_AUTO] = max_iter;
740 tmeas[TYPE_MFLOPS][ALGO_FFTW_AUTO] = flops/1e6/(t1 - t0 + 1e-16);
741 tmeas[TYPE_DUR_TOT][ALGO_FFTW_AUTO] = t1 - t0;
742 tmeas[TYPE_DUR_NS][ALGO_FFTW_AUTO] = show_output((flags == FFTW_MEASURE ? algoName[ALGO_FFTW_AUTO] : algoName[ALGO_FFTW_ESTIM]), N, cplx, flops, t0, t1, max_iter, tableFile);
743 tmeas[TYPE_PREP][ALGO_FFTW_AUTO] = (t0 - te) * 1e3;
744 haveAlgo[ALGO_FFTW_AUTO] = 1;
745 }
746 } while (0);
747 #else
748 (void)withFFTWfullMeas;
749 #endif
750
751 #if defined(HAVE_GREEN_FFTS) && defined(PFFFT_ENABLE_FLOAT)
752 Nmax = (cplx ? nextPow2N*2 : nextPow2N);
753 X[Nmax] = checkVal;
754 if ( 1 || PFFFT_FUNC(is_power_of_two)(N) )
755 {
756 te = uclock_sec();
757 fftInit(log2NextN);
758
759 t0 = uclock_sec();
760 tstop = t0 + max_test_duration;
761 max_iter = 0;
762 do {
763 for ( k = 0; k < step_iter; ++k ) {
764 if (cplx) {
765 assert( X[Nmax] == checkVal );
766 ffts(X, log2NextN, 1);
767 assert( X[Nmax] == checkVal );
768 iffts(X, log2NextN, 1);
769 assert( X[Nmax] == checkVal );
770 } else {
771 rffts(X, log2NextN, 1);
772 riffts(X, log2NextN, 1);
773 }
774
775 ++max_iter;
776 }
777 t1 = uclock_sec();
778 } while ( t1 < tstop );
779
780 fftFree();
781
782 flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
783 tmeas[TYPE_ITER][ALGO_GREEN] = max_iter;
784 tmeas[TYPE_MFLOPS][ALGO_GREEN] = flops/1e6/(t1 - t0 + 1e-16);
785 tmeas[TYPE_DUR_TOT][ALGO_GREEN] = t1 - t0;
786 tmeas[TYPE_DUR_NS][ALGO_GREEN] = show_output("Green", N, cplx, flops, t0, t1, max_iter, tableFile);
787 tmeas[TYPE_PREP][ALGO_GREEN] = (t0 - te) * 1e3;
788 haveAlgo[ALGO_GREEN] = 1;
789 } else {
790 show_output("Green", N, cplx, -1, -1, -1, -1, tableFile);
791 }
792 #endif
793
794 #if defined(HAVE_KISS_FFT) && defined(PFFFT_ENABLE_FLOAT)
795 Nmax = (cplx ? nextPow2N*2 : nextPow2N);
796 X[Nmax] = checkVal;
797 if ( 1 || PFFFT_FUNC(is_power_of_two)(N) )
798 {
799 kiss_fft_cfg stf;
800 kiss_fft_cfg sti;
801 kiss_fftr_cfg stfr;
802 kiss_fftr_cfg stir;
803
804 te = uclock_sec();
805 if (cplx) {
806 stf = kiss_fft_alloc(nextPow2N, 0, 0, 0);
807 sti = kiss_fft_alloc(nextPow2N, 1, 0, 0);
808 } else {
809 stfr = kiss_fftr_alloc(nextPow2N, 0, 0, 0);
810 stir = kiss_fftr_alloc(nextPow2N, 1, 0, 0);
811 }
812
813 t0 = uclock_sec();
814 tstop = t0 + max_test_duration;
815 max_iter = 0;
816 do {
817 for ( k = 0; k < step_iter; ++k ) {
818 if (cplx) {
819 assert( X[Nmax] == checkVal );
820 kiss_fft(stf, (const kiss_fft_cpx *)X, (kiss_fft_cpx *)Y);
821 assert( X[Nmax] == checkVal );
822 kiss_fft(sti, (const kiss_fft_cpx *)Y, (kiss_fft_cpx *)X);
823 assert( X[Nmax] == checkVal );
824 } else {
825 assert( X[Nmax] == checkVal );
826 kiss_fftr(stfr, X, (kiss_fft_cpx *)Y);
827 assert( X[Nmax] == checkVal );
828 kiss_fftri(stir, (const kiss_fft_cpx *)Y, X);
829 assert( X[Nmax] == checkVal );
830 }
831 ++max_iter;
832 }
833 t1 = uclock_sec();
834 } while ( t1 < tstop );
835
836 kiss_fft_cleanup();
837
838 flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
839 tmeas[TYPE_ITER][ALGO_KISS] = max_iter;
840 tmeas[TYPE_MFLOPS][ALGO_KISS] = flops/1e6/(t1 - t0 + 1e-16);
841 tmeas[TYPE_DUR_TOT][ALGO_KISS] = t1 - t0;
842 tmeas[TYPE_DUR_NS][ALGO_KISS] = show_output("Kiss", N, cplx, flops, t0, t1, max_iter, tableFile);
843 tmeas[TYPE_PREP][ALGO_KISS] = (t0 - te) * 1e3;
844 haveAlgo[ALGO_KISS] = 1;
845 } else {
846 show_output("Kiss", N, cplx, -1, -1, -1, -1, tableFile);
847 }
848 #endif
849
850 #if defined(HAVE_POCKET_FFT)
851
852 Nmax = (cplx ? nextPow2N*2 : nextPow2N);
853 X[Nmax] = checkVal;
854 if ( 1 || PFFFT_FUNC(is_power_of_two)(N) )
855 {
856 POCKFFTR_PRE(_plan) planr;
857 POCKFFTC_PRE(_plan) planc;
858
859 te = uclock_sec();
860 if (cplx) {
861 planc = POCKFFTC_MID(make_,_plan)(nextPow2N);
862 } else {
863 planr = POCKFFTR_MID(make_,_plan)(nextPow2N);
864 }
865
866 t0 = uclock_sec();
867 tstop = t0 + max_test_duration;
868 max_iter = 0;
869 do {
870 for ( k = 0; k < step_iter; ++k ) {
871 if (cplx) {
872 assert( X[Nmax] == checkVal );
873 memcpy(Y, X, 2*nextPow2N * sizeof(pffft_scalar) );
874 POCKFFTC_PRE(_forward)(planc, Y, 1.);
875 assert( X[Nmax] == checkVal );
876 memcpy(X, Y, 2*nextPow2N * sizeof(pffft_scalar) );
877 POCKFFTC_PRE(_backward)(planc, X, 1./nextPow2N);
878 assert( X[Nmax] == checkVal );
879 } else {
880 assert( X[Nmax] == checkVal );
881 memcpy(Y, X, nextPow2N * sizeof(pffft_scalar) );
882 POCKFFTR_PRE(_forward)(planr, Y, 1.);
883 assert( X[Nmax] == checkVal );
884 memcpy(X, Y, nextPow2N * sizeof(pffft_scalar) );
885 POCKFFTR_PRE(_backward)(planr, X, 1./nextPow2N);
886 assert( X[Nmax] == checkVal );
887 }
888 ++max_iter;
889 }
890 t1 = uclock_sec();
891 } while ( t1 < tstop );
892
893 if (cplx) {
894 POCKFFTC_MID(destroy_,_plan)(planc);
895 } else {
896 POCKFFTR_MID(destroy_,_plan)(planr);
897 }
898
899 flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
900 tmeas[TYPE_ITER][ALGO_POCKET] = max_iter;
901 tmeas[TYPE_MFLOPS][ALGO_POCKET] = flops/1e6/(t1 - t0 + 1e-16);
902 tmeas[TYPE_DUR_TOT][ALGO_POCKET] = t1 - t0;
903 tmeas[TYPE_DUR_NS][ALGO_POCKET] = show_output("Pocket", N, cplx, flops, t0, t1, max_iter, tableFile);
904 tmeas[TYPE_PREP][ALGO_POCKET] = (t0 - te) * 1e3;
905 haveAlgo[ALGO_POCKET] = 1;
906 } else {
907 show_output("Pocket", N, cplx, -1, -1, -1, -1, tableFile);
908 }
909 #endif
910
911
912 /* PFFFT-U (unordered) benchmark */
913 Nmax = (cplx ? pffftPow2N*2 : pffftPow2N);
914 X[Nmax] = checkVal;
915 if ( pffftPow2N >= PFFFT_FUNC(min_fft_size)(cplx ? PFFFT_COMPLEX : PFFFT_REAL) )
916 {
917 te = uclock_sec();
918 PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(pffftPow2N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
919 if (s) {
920 t0 = uclock_sec();
921 tstop = t0 + max_test_duration;
922 max_iter = 0;
923 do {
924 for ( k = 0; k < step_iter; ++k ) {
925 assert( X[Nmax] == checkVal );
926 PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_FORWARD);
927 assert( X[Nmax] == checkVal );
928 PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_BACKWARD);
929 assert( X[Nmax] == checkVal );
930 ++max_iter;
931 }
932 t1 = uclock_sec();
933 } while ( t1 < tstop );
934
935 PFFFT_FUNC(destroy_setup)(s);
936
937 flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
938 tmeas[TYPE_ITER][ALGO_PFFFT_U] = max_iter;
939 tmeas[TYPE_MFLOPS][ALGO_PFFFT_U] = flops/1e6/(t1 - t0 + 1e-16);
940 tmeas[TYPE_DUR_TOT][ALGO_PFFFT_U] = t1 - t0;
941 tmeas[TYPE_DUR_NS][ALGO_PFFFT_U] = show_output("PFFFT-U", N, cplx, flops, t0, t1, max_iter, tableFile);
942 tmeas[TYPE_PREP][ALGO_PFFFT_U] = (t0 - te) * 1e3;
943 haveAlgo[ALGO_PFFFT_U] = 1;
944 }
945 } else {
946 show_output("PFFFT-U", N, cplx, -1, -1, -1, -1, tableFile);
947 }
948
949
950 if ( pffftPow2N >= PFFFT_FUNC(min_fft_size)(cplx ? PFFFT_COMPLEX : PFFFT_REAL) )
951 {
952 te = uclock_sec();
953 PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(pffftPow2N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
954 if (s) {
955 t0 = uclock_sec();
956 tstop = t0 + max_test_duration;
957 max_iter = 0;
958 do {
959 for ( k = 0; k < step_iter; ++k ) {
960 assert( X[Nmax] == checkVal );
961 PFFFT_FUNC(transform_ordered)(s, X, Z, Y, PFFFT_FORWARD);
962 assert( X[Nmax] == checkVal );
963 PFFFT_FUNC(transform_ordered)(s, X, Z, Y, PFFFT_BACKWARD);
964 assert( X[Nmax] == checkVal );
965 ++max_iter;
966 }
967 t1 = uclock_sec();
968 } while ( t1 < tstop );
969
970 PFFFT_FUNC(destroy_setup)(s);
971
972 flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
973 tmeas[TYPE_ITER][ALGO_PFFFT_O] = max_iter;
974 tmeas[TYPE_MFLOPS][ALGO_PFFFT_O] = flops/1e6/(t1 - t0 + 1e-16);
975 tmeas[TYPE_DUR_TOT][ALGO_PFFFT_O] = t1 - t0;
976 tmeas[TYPE_DUR_NS][ALGO_PFFFT_O] = show_output("PFFFT", N, cplx, flops, t0, t1, max_iter, tableFile);
977 tmeas[TYPE_PREP][ALGO_PFFFT_O] = (t0 - te) * 1e3;
978 haveAlgo[ALGO_PFFFT_O] = 1;
979 }
980 } else {
981 show_output("PFFFT", N, cplx, -1, -1, -1, -1, tableFile);
982 }
983
984 if (!array_output_format)
985 {
986 printf("prepare/ms: ");
987 for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
988 {
989 if ( haveAlgo[iter] && tmeas[TYPE_DUR_NS][iter] > 0.0 ) {
990 printf("%s %.3f ", algoName[iter], tmeas[TYPE_PREP][iter] );
991 }
992 }
993 printf("\n");
994 }
995 Tfastest = 0.0;
996 for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
997 {
998 if ( Tfastest == 0.0 || ( tmeas[TYPE_DUR_NS][iter] != 0.0 && tmeas[TYPE_DUR_NS][iter] < Tfastest ) )
999 Tfastest = tmeas[TYPE_DUR_NS][iter];
1000 }
1001 if ( Tfastest > 0.0 )
1002 {
1003 if (!array_output_format)
1004 printf("relative fast: ");
1005 for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
1006 {
1007 if ( haveAlgo[iter] && tmeas[TYPE_DUR_NS][iter] > 0.0 ) {
1008 tmeas[TYPE_DUR_FASTEST][iter] = tmeas[TYPE_DUR_NS][iter] / Tfastest;
1009 if (!array_output_format)
1010 printf("%s %.3f ", algoName[iter], tmeas[TYPE_DUR_FASTEST][iter] );
1011 }
1012 }
1013 if (!array_output_format)
1014 printf("\n");
1015 }
1016
1017 {
1018 if (!array_output_format)
1019 printf("relative pffft: ");
1020 for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
1021 {
1022 if ( haveAlgo[iter] && tmeas[TYPE_DUR_NS][iter] > 0.0 ) {
1023 tmeas[TYPE_REL_PFFFT][iter] = tmeas[TYPE_DUR_NS][iter] / tmeas[TYPE_DUR_NS][ALGO_PFFFT_O];
1024 if (!array_output_format)
1025 printf("%s %.3f ", algoName[iter], tmeas[TYPE_REL_PFFFT][iter] );
1026 }
1027 }
1028 if (!array_output_format)
1029 printf("\n");
1030 }
1031
1032 if (!array_output_format) {
1033 printf("--\n");
1034 }
1035
1036 PFFFT_FUNC(aligned_free)(X);
1037 PFFFT_FUNC(aligned_free)(Y);
1038 PFFFT_FUNC(aligned_free)(Z);
1039 }
1040
1041
1042 /* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */
1043 void validate_pffft_simd();
1044 int validate_pffft_simd_ex(FILE * DbgOut);
1045 void validate_pffftd_simd();
1046 int validate_pffftd_simd_ex(FILE * DbgOut);
1047
1048
1049
1050 int main(int argc, char **argv) {
1051 /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
1052 and 32 for real FFTs -- a lot of stuff would need to be rewritten to
1053 handle other cases (or maybe just switch to a scalar fft, I don't know..) */
1054
1055 #if 0 /* include powers of 2 ? */
1056 #define NUMNONPOW2LENS 23
1057 int NnonPow2[NUMNONPOW2LENS] = {
1058 64, 96, 128, 160, 192, 256, 384, 5*96, 512, 5*128,
1059 3*256, 800, 1024, 2048, 2400, 4096, 8192, 9*1024, 16384, 32768,
1060 256*1024, 1024*1024, -1 };
1061 #else
1062 #define NUMNONPOW2LENS 11
1063 int NnonPow2[NUMNONPOW2LENS] = {
1064 96, 160, 192, 384, 5*96, 5*128,3*256, 800, 2400, 9*1024,
1065 -1 };
1066 #endif
1067
1068 #define NUMPOW2FFTLENS 22
1069 #define MAXNUMFFTLENS MAX( NUMPOW2FFTLENS, NUMNONPOW2LENS )
1070 int Npow2[NUMPOW2FFTLENS]; /* exp = 1 .. 21, -1 */
1071 const int *Nvalues = NULL;
1072 double tmeas[2][MAXNUMFFTLENS][NUM_TYPES][NUM_FFT_ALGOS];
1073 double iterCalReal, iterCalCplx;
1074
1075 int benchReal=1, benchCplx=1, withFFTWfullMeas=0, outputTable2File=1, usePow2=1;
1076 int realCplxIdx, typeIdx;
1077 int i, k;
1078 FILE *tableFile = NULL;
1079
1080 int haveAlgo[NUM_FFT_ALGOS];
1081 char acCsvFilename[64];
1082
1083 for ( k = 1; k <= NUMPOW2FFTLENS; ++k )
1084 Npow2[k-1] = (k == NUMPOW2FFTLENS) ? -1 : (1 << k);
1085 Nvalues = Npow2; /* set default .. for comparisons .. */
1086
1087 for ( i = 0; i < NUM_FFT_ALGOS; ++i )
1088 haveAlgo[i] = 0;
1089
1090 printf("pffft architecture: '%s'\n", PFFFT_FUNC(simd_arch)());
1091 printf("pffft SIMD size: %d\n", PFFFT_FUNC(simd_size)());
1092 printf("pffft min real fft: %d\n", PFFFT_FUNC(min_fft_size)(PFFFT_REAL));
1093 printf("pffft min complex fft: %d\n", PFFFT_FUNC(min_fft_size)(PFFFT_COMPLEX));
1094 printf("\n");
1095
1096 for ( i = 1; i < argc; ++i ) {
1097 if (!strcmp(argv[i], "--array-format") || !strcmp(argv[i], "--table")) {
1098 array_output_format = 1;
1099 }
1100 else if (!strcmp(argv[i], "--no-tab")) {
1101 array_output_format = 0;
1102 }
1103 else if (!strcmp(argv[i], "--real")) {
1104 benchCplx = 0;
1105 }
1106 else if (!strcmp(argv[i], "--cplx")) {
1107 benchReal = 0;
1108 }
1109 else if (!strcmp(argv[i], "--fftw-full-measure")) {
1110 withFFTWfullMeas = 1;
1111 }
1112 else if (!strcmp(argv[i], "--non-pow2")) {
1113 Nvalues = NnonPow2;
1114 usePow2 = 0;
1115 }
1116 else /* if (!strcmp(argv[i], "--help")) */ {
1117 printf("usage: %s [--array-format|--table] [--no-tab] [--real|--cplx] [--fftw-full-measure] [--non-pow2]\n", argv[0]);
1118 exit(0);
1119 }
1120 }
1121
1122 #ifdef HAVE_FFTW
1123 #ifdef PFFFT_ENABLE_DOUBLE
1124 algoName[ALGO_FFTW_ESTIM] = "FFTW D(estim)";
1125 algoName[ALGO_FFTW_AUTO] = "FFTW D(auto) ";
1126 #endif
1127
1128 if (withFFTWfullMeas)
1129 {
1130 #ifdef PFFFT_ENABLE_FLOAT
1131 algoName[ALGO_FFTW_AUTO] = "FFTWF(meas)"; /* "FFTW (auto)" */
1132 #else
1133 algoName[ALGO_FFTW_AUTO] = "FFTWD(meas)"; /* "FFTW (auto)" */
1134 #endif
1135 algoTableHeader[NUM_FFT_ALGOS][0] = "|real FFTWmeas "; /* "|real FFTWauto " */
1136 algoTableHeader[NUM_FFT_ALGOS][0] = "|cplx FFTWmeas "; /* "|cplx FFTWauto " */
1137 }
1138 #endif
1139
1140 if ( PFFFT_FUNC(simd_size)() == 1 )
1141 {
1142 algoName[ALGO_PFFFT_U] = "PFFFTU scal-1";
1143 algoName[ALGO_PFFFT_O] = "PFFFT scal-1 ";
1144 }
1145 else if ( !strcmp(PFFFT_FUNC(simd_arch)(), "4xScalar") )
1146 {
1147 algoName[ALGO_PFFFT_U] = "PFFFT-U scal4";
1148 algoName[ALGO_PFFFT_O] = "PFFFT scal-4 ";
1149 }
1150
1151
1152 clock();
1153 /* double TClockDur = 1.0 / CLOCKS_PER_SEC;
1154 printf("clock() duration for CLOCKS_PER_SEC = %f sec = %f ms\n", TClockDur, 1000.0 * TClockDur );
1155 */
1156
1157 /* calibrate test duration */
1158 {
1159 double t0, t1, dur;
1160 printf("calibrating fft benchmark duration at size N = 512 ..\n");
1161 t0 = uclock_sec();
1162 if (benchReal) {
1163 iterCalReal = cal_benchmark(512, 0 /* real fft */);
1164 printf("real fft iterCal = %f\n", iterCalReal);
1165 }
1166 if (benchCplx) {
1167 iterCalCplx = cal_benchmark(512, 1 /* cplx fft */);
1168 printf("cplx fft iterCal = %f\n", iterCalCplx);
1169 }
1170 t1 = uclock_sec();
1171 dur = t1 - t0;
1172 printf("calibration done in %f sec.\n\n", dur);
1173 }
1174
1175 if (!array_output_format) {
1176 if (benchReal) {
1177 for (i=0; Nvalues[i] > 0; ++i)
1178 benchmark_ffts(Nvalues[i], 0 /* real fft */, withFFTWfullMeas, iterCalReal, tmeas[0][i], haveAlgo, NULL);
1179 }
1180 if (benchCplx) {
1181 for (i=0; Nvalues[i] > 0; ++i)
1182 benchmark_ffts(Nvalues[i], 1 /* cplx fft */, withFFTWfullMeas, iterCalCplx, tmeas[1][i], haveAlgo, NULL);
1183 }
1184
1185 } else {
1186
1187 if (outputTable2File) {
1188 tableFile = fopen( usePow2 ? "bench-fft-table-pow2.txt" : "bench-fft-table-non2.txt", "w");
1189 }
1190 /* print table headers */
1191 printf("table shows MFlops; higher values indicate faster computation\n\n");
1192
1193 {
1194 print_table("| input len ", tableFile);
1195 for (realCplxIdx = 0; realCplxIdx < 2; ++realCplxIdx)
1196 {
1197 if ( (realCplxIdx == 0 && !benchReal) || (realCplxIdx == 1 && !benchCplx) )
1198 continue;
1199 for (k=0; k < NUM_FFT_ALGOS; ++k)
1200 {
1201 if ( compiledInAlgo[k] )
1202 print_table(algoTableHeader[k][realCplxIdx], tableFile);
1203 }
1204 }
1205 print_table("|\n", tableFile);
1206 }
1207 /* print table value seperators */
1208 {
1209 print_table("|----------", tableFile);
1210 for (realCplxIdx = 0; realCplxIdx < 2; ++realCplxIdx)
1211 {
1212 if ( (realCplxIdx == 0 && !benchReal) || (realCplxIdx == 1 && !benchCplx) )
1213 continue;
1214 for (k=0; k < NUM_FFT_ALGOS; ++k)
1215 {
1216 if ( compiledInAlgo[k] )
1217 print_table(":|-------------", tableFile);
1218 }
1219 }
1220 print_table(":|\n", tableFile);
1221 }
1222
1223 for (i=0; Nvalues[i] > 0; ++i) {
1224 {
1225 double t0, t1;
1226 print_table_fftsize(Nvalues[i], tableFile);
1227 t0 = uclock_sec();
1228 if (benchReal)
1229 benchmark_ffts(Nvalues[i], 0, withFFTWfullMeas, iterCalReal, tmeas[0][i], haveAlgo, tableFile);
1230 if (benchCplx)
1231 benchmark_ffts(Nvalues[i], 1, withFFTWfullMeas, iterCalCplx, tmeas[1][i], haveAlgo, tableFile);
1232 t1 = uclock_sec();
1233 print_table("|\n", tableFile);
1234 /* printf("all ffts for size %d took %f sec\n", Nvalues[i], t1-t0); */
1235 (void)t0;
1236 (void)t1;
1237 }
1238 }
1239 fprintf(stdout, " (numbers are given in MFlops)\n");
1240 if (outputTable2File) {
1241 fclose(tableFile);
1242 }
1243 }
1244
1245 printf("\n");
1246 printf("now writing .csv files ..\n");
1247
1248 for (realCplxIdx = 0; realCplxIdx < 2; ++realCplxIdx)
1249 {
1250 if ( (benchReal && realCplxIdx == 0) || (benchCplx && realCplxIdx == 1) )
1251 {
1252 for (typeIdx = 0; typeIdx < NUM_TYPES; ++typeIdx)
1253 {
1254 FILE *f = NULL;
1255 if ( !(SAVE_ALL_TYPES || saveType[typeIdx]) )
1256 continue;
1257 acCsvFilename[0] = 0;
1258 #ifdef PFFFT_SIMD_DISABLE
1259 strcat(acCsvFilename, "scal-");
1260 #else
1261 strcat(acCsvFilename, "simd-");
1262 #endif
1263 strcat(acCsvFilename, (realCplxIdx == 0 ? "real-" : "cplx-"));
1264 strcat(acCsvFilename, ( usePow2 ? "pow2-" : "non2-"));
1265 assert( strlen(acCsvFilename) + strlen(typeFilenamePart[typeIdx]) + 5 < (sizeof(acCsvFilename) / sizeof(acCsvFilename[0])) );
1266 strcat(acCsvFilename, typeFilenamePart[typeIdx]);
1267 strcat(acCsvFilename, ".csv");
1268 f = fopen(acCsvFilename, "w");
1269 if (!f)
1270 continue;
1271 {
1272 fprintf(f, "size, log2, ");
1273 for (k=0; k < NUM_FFT_ALGOS; ++k)
1274 if ( haveAlgo[k] )
1275 fprintf(f, "%s, ", algoName[k]);
1276 fprintf(f, "\n");
1277 }
1278 for (i=0; Nvalues[i] > 0; ++i)
1279 {
1280 {
1281 fprintf(f, "%d, %.3f, ", Nvalues[i], log10((double)Nvalues[i])/log10(2.0) );
1282 for (k=0; k < NUM_FFT_ALGOS; ++k)
1283 if ( haveAlgo[k] )
1284 fprintf(f, "%f, ", tmeas[realCplxIdx][i][typeIdx][k]);
1285 fprintf(f, "\n");
1286 }
1287 }
1288 fclose(f);
1289 }
1290 }
1291 }
1292
1293 return 0;
1294 }
1295
1296