external/pffft/bench_pffft.c

/*
  Copyright (c) 2013 Julien Pommier.
  Copyright (c) 2019  Hayati Ayguen ( [email protected] )

  Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, and Apple vDSP

  How to build:

  on linux, with fftw3:
  gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm

  on macos, without fftw3:
  clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate

  on macos, with fftw3:
  clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate

  as alternative: replace clang by gcc.

  on windows, with visual c++:
  cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c

  build without SIMD instructions:
  gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm

 */

#define CONCAT_TOKENS(A, B)  A ## B
#define CONCAT_THREE_TOKENS(A, B, C)  A ## B ## C

#ifdef PFFFT_ENABLE_FLOAT
#include "pffft.h"

typedef float pffft_scalar;
typedef PFFFT_Setup PFFFT_SETUP;
#define PFFFT_FUNC(F)  CONCAT_TOKENS(pffft_, F)

#else
/*
Note: adapted for double precision dynamic range version.
*/
#include "pffft_double.h"

typedef double pffft_scalar;
typedef PFFFTD_Setup PFFFT_SETUP;
#define PFFFT_FUNC(F)  CONCAT_TOKENS(pffftd_, F)
#endif

#ifdef PFFFT_ENABLE_FLOAT

#include "fftpack.h"

#ifdef HAVE_GREEN_FFTS
#include "fftext.h"
#endif

#ifdef HAVE_KISS_FFT
#include <kiss_fft.h>
#include <kiss_fftr.h>
#endif

#endif

#ifdef HAVE_POCKET_FFT
#include <pocketfft_double.h>
#include <pocketfft_single.h>
#endif

#ifdef PFFFT_ENABLE_FLOAT
  #define POCKFFTR_PRE(R)   CONCAT_TOKENS(rffts, R)
  #define POCKFFTC_PRE(R)   CONCAT_TOKENS(cffts, R)
  #define POCKFFTR_MID(L,R) CONCAT_THREE_TOKENS(L, rffts, R)
  #define POCKFFTC_MID(L,R) CONCAT_THREE_TOKENS(L, cffts, R)
#else
  #define POCKFFTR_PRE(R)   CONCAT_TOKENS(rfft, R)
  #define POCKFFTC_PRE(R)   CONCAT_TOKENS(cfft, R)
  #define POCKFFTR_MID(L,R) CONCAT_THREE_TOKENS(L, rfft, R)
  #define POCKFFTC_MID(L,R) CONCAT_THREE_TOKENS(L, cfft, R)
#endif


#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <assert.h>
#include <string.h>

#ifdef HAVE_SYS_TIMES
#  include <sys/times.h>
#  include <unistd.h>
#endif

#ifdef HAVE_VECLIB
#  include <Accelerate/Accelerate.h>
#endif

#ifdef HAVE_FFTW
#  include <fftw3.h>

#ifdef PFFFT_ENABLE_FLOAT
typedef fftwf_plan FFTW_PLAN;
typedef fftwf_complex FFTW_COMPLEX;
#define FFTW_FUNC(F)  CONCAT_TOKENS(fftwf_, F)
#else
typedef fftw_plan FFTW_PLAN;
typedef fftw_complex FFTW_COMPLEX;
#define FFTW_FUNC(F)  CONCAT_TOKENS(fftw_, F)
#endif

#endif /* HAVE_FFTW */

#ifndef M_LN2
  #define M_LN2   0.69314718055994530942  /* log_e 2 */
#endif


#define NUM_FFT_ALGOS  9
enum {
  ALGO_FFTPACK = 0,
  ALGO_VECLIB,
  ALGO_FFTW_ESTIM,
  ALGO_FFTW_AUTO,
  ALGO_GREEN,
  ALGO_KISS,
  ALGO_POCKET,
  ALGO_PFFFT_U, /* = 7 */
  ALGO_PFFFT_O  /* = 8 */
};

#define NUM_TYPES      7
enum {
  TYPE_PREP = 0,         /* time for preparation in ms */
  TYPE_DUR_NS = 1,       /* time per fft in ns */
  TYPE_DUR_FASTEST = 2,  /* relative time to fastest */
  TYPE_REL_PFFFT = 3,    /* relative time to ALGO_PFFFT */
  TYPE_ITER = 4,         /* # of iterations in measurement */
  TYPE_MFLOPS = 5,       /* MFlops/sec */
  TYPE_DUR_TOT = 6       /* test duration in sec */
};
/* double tmeas[NUM_TYPES][NUM_FFT_ALGOS]; */

const char * algoName[NUM_FFT_ALGOS] = {
  "FFTPack      ",
  "vDSP (vec)   ",
  "FFTW F(estim)",
  "FFTW F(auto) ",
  "Green        ",
  "Kiss         ",
  "Pocket       ",
  "PFFFT-U(simd)",  /* unordered */
  "PFFFT (simd) "   /* ordered */
};


int compiledInAlgo[NUM_FFT_ALGOS] = {
#ifdef PFFFT_ENABLE_FLOAT
  1, /* "FFTPack    " */
#else
  0, /* "FFTPack    " */
#endif
#if defined(HAVE_VECLIB) && defined(PFFFT_ENABLE_FLOAT)
  1, /* "vDSP (vec) " */
#else
  0,
#endif
#if defined(HAVE_FFTW)
  1, /* "FFTW(estim)" */
  1, /* "FFTW (auto)" */
#else
  0, 0,
#endif
#if defined(HAVE_GREEN_FFTS) && defined(PFFFT_ENABLE_FLOAT)
  1, /* "Green      " */
#else
  0,
#endif
#if defined(HAVE_KISS_FFT) && defined(PFFFT_ENABLE_FLOAT)
  1, /* "Kiss       " */
#else
  0,
#endif
#if defined(HAVE_POCKET_FFT)
  1, /* "Pocket     " */
#else
  0,
#endif
  1, /* "PFFFT_U    " */
  1  /* "PFFFT_O    " */
};

const char * algoTableHeader[NUM_FFT_ALGOS][2] = {
{ "| real FFTPack ", "| cplx FFTPack " },
{ "|  real   vDSP ", "|  cplx   vDSP " },
{ "|real FFTWestim", "|cplx FFTWestim" },
{ "|real FFTWauto ", "|cplx FFTWauto " },
{ "|  real  Green ", "|  cplx  Green " },
{ "|  real   Kiss ", "|  cplx   Kiss " },
{ "|  real Pocket ", "|  cplx Pocket " },
{ "| real PFFFT-U ", "| cplx PFFFT-U " },
{ "|  real  PFFFT ", "|  cplx  PFFFT " } };

const char * typeText[NUM_TYPES] = {
  "preparation in ms",
  "time per fft in ns",
  "relative to fastest",
  "relative to pffft",
  "measured_num_iters",
  "mflops",
  "test duration in sec"
};

const char * typeFilenamePart[NUM_TYPES] = {
  "1-preparation-in-ms",
  "2-timePerFft-in-ns",
  "3-rel-fastest",
  "4-rel-pffft",
  "5-num-iter",
  "6-mflops",
  "7-duration-in-sec"
};

#define SAVE_ALL_TYPES  0

const int saveType[NUM_TYPES] = {
  1, /* "1-preparation-in-ms" */
  0, /* "2-timePerFft-in-ns"  */
  0, /* "3-rel-fastest"       */
  1, /* "4-rel-pffft"         */
  1, /* "5-num-iter"          */
  1, /* "6-mflops"            */
  1, /* "7-duration-in-sec"   */
};


#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)<(y)?(x):(y))

unsigned Log2(unsigned v) {
  /* we don't need speed records .. obvious way is good enough */
  /* https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogObvious */
  /* Find the log base 2 of an integer with the MSB N set in O(N) operations (the obvious way):
   * unsigned v: 32-bit word to find the log base 2 of */
  unsigned r = 0; /* r will be lg(v) */
  while (v >>= 1)
  {
    r++;
  }
  return r;
}


double frand() {
  return rand()/(double)RAND_MAX;
}

#if defined(HAVE_SYS_TIMES)
  inline double uclock_sec(void) {
    static double ttclk = 0.;
    struct tms t;
    if (ttclk == 0.)
      ttclk = sysconf(_SC_CLK_TCK);
    times(&t);
    /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
    return ((double)t.tms_utime)) / ttclk;
  }
# else
  double uclock_sec(void)
{ return (double)clock()/(double)CLOCKS_PER_SEC; }
#endif


/* compare results with the regular fftpack */
void pffft_validate_N(int N, int cplx) {

#ifdef PFFFT_ENABLE_FLOAT

  int Nfloat = N*(cplx?2:1);
  int Nbytes = Nfloat * sizeof(pffft_scalar);
  float *ref, *in, *out, *tmp, *tmp2;
  PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
  int pass;


  if (!s) { printf("Skipping N=%d, not supported\n", N); return; }
  ref = PFFFT_FUNC(aligned_malloc)(Nbytes);
  in = PFFFT_FUNC(aligned_malloc)(Nbytes);
  out = PFFFT_FUNC(aligned_malloc)(Nbytes);
  tmp = PFFFT_FUNC(aligned_malloc)(Nbytes);
  tmp2 = PFFFT_FUNC(aligned_malloc)(Nbytes);

  for (pass=0; pass < 2; ++pass) {
    float ref_max = 0;
    int k;
    /* printf("N=%d pass=%d cplx=%d\n", N, pass, cplx); */
    /* compute reference solution with FFTPACK */
    if (pass == 0) {
      float *wrk = malloc(2*Nbytes+15*sizeof(pffft_scalar));
      for (k=0; k < Nfloat; ++k) {
        ref[k] = in[k] = (float)( frand()*2-1 );
        out[k] = 1e30F;
      }
      if (!cplx) {
        rffti(N, wrk);
        rfftf(N, ref, wrk);
        /* use our ordering for real ffts instead of the one of fftpack */
        {
          float refN=ref[N-1];
          for (k=N-2; k >= 1; --k) ref[k+1] = ref[k];
          ref[1] = refN;
        }
      } else {
        cffti(N, wrk);
        cfftf(N, ref, wrk);
      }
      free(wrk);
    }

    for (k = 0; k < Nfloat; ++k) ref_max = MAX(ref_max, (float)( fabs(ref[k]) ));


    /* pass 0 : non canonical ordering of transform coefficients */
    if (pass == 0) {
      /* test forward transform, with different input / output */
      PFFFT_FUNC(transform)(s, in, tmp, 0, PFFFT_FORWARD);
      memcpy(tmp2, tmp, Nbytes);
      memcpy(tmp, in, Nbytes);
      PFFFT_FUNC(transform)(s, tmp, tmp, 0, PFFFT_FORWARD);
      for (k = 0; k < Nfloat; ++k) {
        assert(tmp2[k] == tmp[k]);
      }

      /* test reordering */
      PFFFT_FUNC(zreorder)(s, tmp, out, PFFFT_FORWARD);
      PFFFT_FUNC(zreorder)(s, out, tmp, PFFFT_BACKWARD);
      for (k = 0; k < Nfloat; ++k) {
        assert(tmp2[k] == tmp[k]);
      }
      PFFFT_FUNC(zreorder)(s, tmp, out, PFFFT_FORWARD);
    } else {
      /* pass 1 : canonical ordering of transform coeffs. */
      PFFFT_FUNC(transform_ordered)(s, in, tmp, 0, PFFFT_FORWARD);
      memcpy(tmp2, tmp, Nbytes);
      memcpy(tmp, in, Nbytes);
      PFFFT_FUNC(transform_ordered)(s, tmp, tmp, 0, PFFFT_FORWARD);
      for (k = 0; k < Nfloat; ++k) {
        assert(tmp2[k] == tmp[k]);
      }
      memcpy(out, tmp, Nbytes);
    }

    {
      for (k=0; k < Nfloat; ++k) {
        if (!(fabs(ref[k] - out[k]) < 1e-3*ref_max)) {
          printf("%s forward PFFFT mismatch found for N=%d\n", (cplx?"CPLX":"REAL"), N);
          exit(1);
        }
      }

      if (pass == 0) PFFFT_FUNC(transform)(s, tmp, out, 0, PFFFT_BACKWARD);
      else   PFFFT_FUNC(transform_ordered)(s, tmp, out, 0, PFFFT_BACKWARD);
      memcpy(tmp2, out, Nbytes);
      memcpy(out, tmp, Nbytes);
      if (pass == 0) PFFFT_FUNC(transform)(s, out, out, 0, PFFFT_BACKWARD);
      else   PFFFT_FUNC(transform_ordered)(s, out, out, 0, PFFFT_BACKWARD);
      for (k = 0; k < Nfloat; ++k) {
        assert(tmp2[k] == out[k]);
        out[k] *= 1.f/N;
      }
      for (k = 0; k < Nfloat; ++k) {
        if (fabs(in[k] - out[k]) > 1e-3 * ref_max) {
          printf("pass=%d, %s IFFFT does not match for N=%d\n", pass, (cplx?"CPLX":"REAL"), N); break;
          exit(1);
        }
      }
    }

    /* quick test of the circular convolution in fft domain */
    {
      float conv_err = 0, conv_max = 0;

      PFFFT_FUNC(zreorder)(s, ref, tmp, PFFFT_FORWARD);
      memset(out, 0, Nbytes);
      PFFFT_FUNC(zconvolve_accumulate)(s, ref, ref, out, 1.0);
      PFFFT_FUNC(zreorder)(s, out, tmp2, PFFFT_FORWARD);

      for (k=0; k < Nfloat; k += 2) {
        float ar = tmp[k], ai=tmp[k+1];
        if (cplx || k > 0) {
          tmp[k] = ar*ar - ai*ai;
          tmp[k+1] = 2*ar*ai;
        } else {
          tmp[0] = ar*ar;
          tmp[1] = ai*ai;
        }
      }

      for (k=0; k < Nfloat; ++k) {
        float d = fabs(tmp[k] - tmp2[k]), e = fabs(tmp[k]);
        if (d > conv_err) conv_err = d;
        if (e > conv_max) conv_max = e;
      }
      if (conv_err > 1e-5*conv_max) {
        printf("zconvolve error ? %g %g\n", conv_err, conv_max); exit(1);
      }
    }

  }

  printf("%s PFFFT is OK for N=%d\n", (cplx?"CPLX":"REAL"), N); fflush(stdout);

  PFFFT_FUNC(destroy_setup)(s);
  PFFFT_FUNC(aligned_free)(ref);
  PFFFT_FUNC(aligned_free)(in);
  PFFFT_FUNC(aligned_free)(out);
  PFFFT_FUNC(aligned_free)(tmp);
  PFFFT_FUNC(aligned_free)(tmp2);

#endif /* PFFFT_ENABLE_FLOAT */
}

void pffft_validate(int cplx) {
  static int Ntest[] = { 16, 32, 64, 96, 128, 160, 192, 256, 288, 384, 5*96, 512, 576, 5*128, 800, 864, 1024, 2048, 2592, 4000, 4096, 12000, 36864, 0};
  int k;
  for (k = 0; Ntest[k]; ++k) {
    int N = Ntest[k];
    if (N == 16 && !cplx) continue;
    pffft_validate_N(N, cplx);
  }
}

int array_output_format = 1;


void print_table(const char *txt, FILE *tableFile) {
  fprintf(stdout, "%s", txt);
  if (tableFile && tableFile != stdout)
    fprintf(tableFile, "%s", txt);
}

void print_table_flops(float mflops, FILE *tableFile) {
  fprintf(stdout, "|%11.0f   ", mflops);
  if (tableFile && tableFile != stdout)
    fprintf(tableFile, "|%11.0f   ", mflops);
}

void print_table_fftsize(int N, FILE *tableFile) {
  fprintf(stdout, "|%9d  ", N);
  if (tableFile && tableFile != stdout)
    fprintf(tableFile, "|%9d  ", N);
}

double show_output(const char *name, int N, int cplx, float flops, float t0, float t1, int max_iter, FILE *tableFile) {
  double T = (double)(t1-t0)/2/max_iter * 1e9;
  float mflops = flops/1e6/(t1 - t0 + 1e-16);
  if (array_output_format) {
    if (flops != -1)
      print_table_flops(mflops, tableFile);
    else
      print_table("|        n/a   ", tableFile);
  } else {
    if (flops != -1) {
      printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N, (cplx?"CPLX":"REAL"), name, mflops, (t1-t0)/2/max_iter * 1e9, max_iter);
    }
  }
  fflush(stdout);
  return T;
}

double cal_benchmark(int N, int cplx) {
  const int log2N = Log2(N);
  int Nfloat = (cplx ? N*2 : N);
  int Nbytes = Nfloat * sizeof(pffft_scalar);
  pffft_scalar *X = PFFFT_FUNC(aligned_malloc)(Nbytes), *Y = PFFFT_FUNC(aligned_malloc)(Nbytes), *Z = PFFFT_FUNC(aligned_malloc)(Nbytes);
  double t0, t1, tstop, T, nI;
  int k, iter;

  assert( PFFFT_FUNC(is_power_of_two)(N) );
  for (k = 0; k < Nfloat; ++k) {
    X[k] = sqrtf(k+1);
  }

  /* PFFFT-U (unordered) benchmark */
  PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
  assert(s);
  iter = 0;
  t0 = uclock_sec();
  tstop = t0 + 0.25;  /* benchmark duration: 250 ms */
  do {
    for ( k = 0; k < 512; ++k ) {
      PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_FORWARD);
      PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_BACKWARD);
      ++iter;
    }
    t1 = uclock_sec();
  } while ( t1 < tstop );
  PFFFT_FUNC(destroy_setup)(s);
  PFFFT_FUNC(aligned_free)(X);
  PFFFT_FUNC(aligned_free)(Y);
  PFFFT_FUNC(aligned_free)(Z);

  T = ( t1 - t0 );  /* duration per fft() */
  nI = ((double)iter) * ( log2N * N );  /* number of iterations "normalized" to O(N) = N*log2(N) */
  return (nI / T);    /* normalized iterations per second */
}


void benchmark_ffts(int N, int cplx, int withFFTWfullMeas, double iterCal, double tmeas[NUM_TYPES][NUM_FFT_ALGOS], int haveAlgo[NUM_FFT_ALGOS], FILE *tableFile ) {
  const int log2N = Log2(N);
  int nextPow2N = PFFFT_FUNC(next_power_of_two)(N);
  int log2NextN = Log2(nextPow2N);
  int pffftPow2N = nextPow2N;

  int Nfloat = (cplx ? MAX(nextPow2N, pffftPow2N)*2 : MAX(nextPow2N, pffftPow2N));
  int Nmax, k, iter;
  int Nbytes = Nfloat * sizeof(pffft_scalar);

  pffft_scalar *X = PFFFT_FUNC(aligned_malloc)(Nbytes + sizeof(pffft_scalar)), *Y = PFFFT_FUNC(aligned_malloc)(Nbytes + 2*sizeof(pffft_scalar) ), *Z = PFFFT_FUNC(aligned_malloc)(Nbytes);
  double te, t0, t1, tstop, flops, Tfastest;

  const double max_test_duration = 0.150;   /* test duration 150 ms */
  double numIter = max_test_duration * iterCal / ( log2N * N );  /* number of iteration for max_test_duration */
  const int step_iter = MAX(1, ((int)(0.01 * numIter)) );  /* one hundredth */
  int max_iter = MAX(1, ((int)numIter) );  /* minimum 1 iteration */

  const float checkVal = 12345.0F;

  /* printf("benchmark_ffts(N = %d, cplx = %d): Nfloat = %d, X_mem = 0x%p, X = %p\n", N, cplx, Nfloat, X_mem, X); */

  memset( X, 0, Nfloat * sizeof(pffft_scalar) );
  if ( Nfloat < 32 ) {
    for (k = 0; k < Nfloat; k += 4)
      X[k] = sqrtf(k+1);
  } else {
    for (k = 0; k < Nfloat; k += (Nfloat/16) )
      X[k] = sqrtf(k+1);
  }

  for ( k = 0; k < NUM_TYPES; ++k )
  {
    for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
      tmeas[k][iter] = 0.0;
  }


  /* FFTPack benchmark */
  Nmax = (cplx ? N*2 : N);
  X[Nmax] = checkVal;
#ifdef PFFFT_ENABLE_FLOAT
  {
    float *wrk = malloc(2*Nbytes + 15*sizeof(pffft_scalar));
    te = uclock_sec();
    if (cplx) cffti(N, wrk);
    else      rffti(N, wrk);
    t0 = uclock_sec();
    tstop = t0 + max_test_duration;
    max_iter = 0;
    do {
      for ( k = 0; k < step_iter; ++k ) {
        if (cplx) {
          assert( X[Nmax] == checkVal );
          cfftf(N, X, wrk);
          assert( X[Nmax] == checkVal );
          cfftb(N, X, wrk);
          assert( X[Nmax] == checkVal );
        } else {
          assert( X[Nmax] == checkVal );
          rfftf(N, X, wrk);
          assert( X[Nmax] == checkVal );
          rfftb(N, X, wrk);
          assert( X[Nmax] == checkVal );
        }
        ++max_iter;
      }
      t1 = uclock_sec();
    } while ( t1 < tstop );

    free(wrk);

    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
    tmeas[TYPE_ITER][ALGO_FFTPACK] = max_iter;
    tmeas[TYPE_MFLOPS][ALGO_FFTPACK] = flops/1e6/(t1 - t0 + 1e-16);
    tmeas[TYPE_DUR_TOT][ALGO_FFTPACK] = t1 - t0;
    tmeas[TYPE_DUR_NS][ALGO_FFTPACK] = show_output("FFTPack", N, cplx, flops, t0, t1, max_iter, tableFile);
    tmeas[TYPE_PREP][ALGO_FFTPACK] = (t0 - te) * 1e3;
    haveAlgo[ALGO_FFTPACK] = 1;
  }
#endif

#if defined(HAVE_VECLIB) && defined(PFFFT_ENABLE_FLOAT)
  Nmax = (cplx ? nextPow2N*2 : nextPow2N);
  X[Nmax] = checkVal;
  te = uclock_sec();
  if ( 1 || PFFFT_FUNC(is_power_of_two)(N) ) {
    FFTSetup setup;

    setup = vDSP_create_fftsetup(log2NextN, FFT_RADIX2);
    DSPSplitComplex zsamples;
    zsamples.realp = &X[0];
    zsamples.imagp = &X[Nfloat/2];
    t0 = uclock_sec();
    tstop = t0 + max_test_duration;
    max_iter = 0;
    do {
      for ( k = 0; k < step_iter; ++k ) {
        if (cplx) {
          assert( X[Nmax] == checkVal );
          vDSP_fft_zip(setup, &zsamples, 1, log2NextN, kFFTDirection_Forward);
          assert( X[Nmax] == checkVal );
          vDSP_fft_zip(setup, &zsamples, 1, log2NextN, kFFTDirection_Inverse);
          assert( X[Nmax] == checkVal );
        } else {
          assert( X[Nmax] == checkVal );
          vDSP_fft_zrip(setup, &zsamples, 1, log2NextN, kFFTDirection_Forward);
          assert( X[Nmax] == checkVal );
          vDSP_fft_zrip(setup, &zsamples, 1, log2NextN, kFFTDirection_Inverse);
          assert( X[Nmax] == checkVal );
        }
        ++max_iter;
      }
      t1 = uclock_sec();
    } while ( t1 < tstop );

    vDSP_destroy_fftsetup(setup);
    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
    tmeas[TYPE_ITER][ALGO_VECLIB] = max_iter;
    tmeas[TYPE_MFLOPS][ALGO_VECLIB] = flops/1e6/(t1 - t0 + 1e-16);
    tmeas[TYPE_DUR_TOT][ALGO_VECLIB] = t1 - t0;
    tmeas[TYPE_DUR_NS][ALGO_VECLIB] = show_output("vDSP", N, cplx, flops, t0, t1, max_iter, tableFile);
    tmeas[TYPE_PREP][ALGO_VECLIB] = (t0 - te) * 1e3;
    haveAlgo[ALGO_VECLIB] = 1;
  } else {
    show_output("vDSP", N, cplx, -1, -1, -1, -1, tableFile);
  }
#endif

#if defined(HAVE_FFTW)
  Nmax = (cplx ? N*2 : N);
  X[Nmax] = checkVal;
  {
    /* int flags = (N <= (256*1024) ? FFTW_MEASURE : FFTW_ESTIMATE);  measure takes a lot of time on largest ffts */
    int flags = FFTW_ESTIMATE;
    te = uclock_sec();

    FFTW_PLAN planf, planb;
    FFTW_COMPLEX *in = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N);
    FFTW_COMPLEX *out = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N);
    memset(in, 0, sizeof(FFTW_COMPLEX) * N);
    if (cplx) {
      planf = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_FORWARD, flags);
      planb = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_BACKWARD, flags);
    } else {
      planf = FFTW_FUNC(plan_dft_r2c_1d)(N, (pffft_scalar*)in, out, flags);
      planb = FFTW_FUNC(plan_dft_c2r_1d)(N, in, (pffft_scalar*)out, flags);
    }

    t0 = uclock_sec();
    tstop = t0 + max_test_duration;
    max_iter = 0;
    do {
      for ( k = 0; k < step_iter; ++k ) {
        assert( X[Nmax] == checkVal );
        FFTW_FUNC(execute)(planf);
        assert( X[Nmax] == checkVal );
        FFTW_FUNC(execute)(planb);
        assert( X[Nmax] == checkVal );
        ++max_iter;
      }
      t1 = uclock_sec();
    } while ( t1 < tstop );

    FFTW_FUNC(destroy_plan)(planf);
    FFTW_FUNC(destroy_plan)(planb);
    FFTW_FUNC(free)(in); FFTW_FUNC(free)(out);

    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
    tmeas[TYPE_ITER][ALGO_FFTW_ESTIM] = max_iter;
    tmeas[TYPE_MFLOPS][ALGO_FFTW_ESTIM] = flops/1e6/(t1 - t0 + 1e-16);
    tmeas[TYPE_DUR_TOT][ALGO_FFTW_ESTIM] = t1 - t0;
    tmeas[TYPE_DUR_NS][ALGO_FFTW_ESTIM] = show_output((flags == FFTW_MEASURE ? algoName[ALGO_FFTW_AUTO] : algoName[ALGO_FFTW_ESTIM]), N, cplx, flops, t0, t1, max_iter, tableFile);
    tmeas[TYPE_PREP][ALGO_FFTW_ESTIM] = (t0 - te) * 1e3;
    haveAlgo[ALGO_FFTW_ESTIM] = 1;
  }
  Nmax = (cplx ? N*2 : N);
  X[Nmax] = checkVal;
  do {
    /* int flags = (N <= (256*1024) ? FFTW_MEASURE : FFTW_ESTIMATE);  measure takes a lot of time on largest ffts */
    /* int flags = FFTW_MEASURE; */
#if ( defined(__arm__) || defined(__aarch64__) || defined(__arm64__) )
    int limitFFTsize = 31;  /* takes over a second on Raspberry Pi 3 B+ -- and much much more on higher ffts sizes! */
#else
    int limitFFTsize = 2400;  /* take over a second on i7 for fft size 2400 */
#endif
    int flags = (N < limitFFTsize ? FFTW_MEASURE : (withFFTWfullMeas ? FFTW_MEASURE : FFTW_ESTIMATE));

    if (flags == FFTW_ESTIMATE) {
      show_output((flags == FFTW_MEASURE ? algoName[ALGO_FFTW_AUTO] : algoName[ALGO_FFTW_ESTIM]), N, cplx, -1, -1, -1, -1, tableFile);
      /* copy values from estimation */
      tmeas[TYPE_ITER][ALGO_FFTW_AUTO] = tmeas[TYPE_ITER][ALGO_FFTW_ESTIM];
      tmeas[TYPE_DUR_TOT][ALGO_FFTW_AUTO] = tmeas[TYPE_DUR_TOT][ALGO_FFTW_ESTIM];
      tmeas[TYPE_DUR_NS][ALGO_FFTW_AUTO] = tmeas[TYPE_DUR_NS][ALGO_FFTW_ESTIM];
      tmeas[TYPE_PREP][ALGO_FFTW_AUTO] = tmeas[TYPE_PREP][ALGO_FFTW_ESTIM];
    } else {
      te = uclock_sec();
      FFTW_PLAN planf, planb;
      FFTW_COMPLEX *in = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N);
      FFTW_COMPLEX *out = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N);
      memset(in, 0, sizeof(FFTW_COMPLEX) * N);
      if (cplx) {
        planf = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_FORWARD, flags);
        planb = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_BACKWARD, flags);
      } else {
        planf = FFTW_FUNC(plan_dft_r2c_1d)(N, (pffft_scalar*)in, out, flags);
        planb = FFTW_FUNC(plan_dft_c2r_1d)(N, in, (pffft_scalar*)out, flags);
      }

      t0 = uclock_sec();
      tstop = t0 + max_test_duration;
      max_iter = 0;
      do {
        for ( k = 0; k < step_iter; ++k ) {
          assert( X[Nmax] == checkVal );
          FFTW_FUNC(execute)(planf);
          assert( X[Nmax] == checkVal );
          FFTW_FUNC(execute)(planb);
          assert( X[Nmax] == checkVal );
          ++max_iter;
        }
        t1 = uclock_sec();
      } while ( t1 < tstop );

      FFTW_FUNC(destroy_plan)(planf);
      FFTW_FUNC(destroy_plan)(planb);
      FFTW_FUNC(free)(in); FFTW_FUNC(free)(out);

      flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
      tmeas[TYPE_ITER][ALGO_FFTW_AUTO] = max_iter;
      tmeas[TYPE_MFLOPS][ALGO_FFTW_AUTO] = flops/1e6/(t1 - t0 + 1e-16);
      tmeas[TYPE_DUR_TOT][ALGO_FFTW_AUTO] = t1 - t0;
      tmeas[TYPE_DUR_NS][ALGO_FFTW_AUTO] = show_output((flags == FFTW_MEASURE ? algoName[ALGO_FFTW_AUTO] : algoName[ALGO_FFTW_ESTIM]), N, cplx, flops, t0, t1, max_iter, tableFile);
      tmeas[TYPE_PREP][ALGO_FFTW_AUTO] = (t0 - te) * 1e3;
      haveAlgo[ALGO_FFTW_AUTO] = 1;
    }
  } while (0);
#else
  (void)withFFTWfullMeas;
#endif

#if defined(HAVE_GREEN_FFTS) && defined(PFFFT_ENABLE_FLOAT)
  Nmax = (cplx ? nextPow2N*2 : nextPow2N);
  X[Nmax] = checkVal;
  if ( 1 || PFFFT_FUNC(is_power_of_two)(N) )
  {
    te = uclock_sec();
    fftInit(log2NextN);

    t0 = uclock_sec();
    tstop = t0 + max_test_duration;
    max_iter = 0;
    do {
      for ( k = 0; k < step_iter; ++k ) {
        if (cplx) {
          assert( X[Nmax] == checkVal );
          ffts(X, log2NextN, 1);
          assert( X[Nmax] == checkVal );
          iffts(X, log2NextN, 1);
          assert( X[Nmax] == checkVal );
        } else {
          rffts(X, log2NextN, 1);
          riffts(X, log2NextN, 1);
        }

        ++max_iter;
      }
      t1 = uclock_sec();
    } while ( t1 < tstop );

    fftFree();

    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
    tmeas[TYPE_ITER][ALGO_GREEN] = max_iter;
    tmeas[TYPE_MFLOPS][ALGO_GREEN] = flops/1e6/(t1 - t0 + 1e-16);
    tmeas[TYPE_DUR_TOT][ALGO_GREEN] = t1 - t0;
    tmeas[TYPE_DUR_NS][ALGO_GREEN] = show_output("Green", N, cplx, flops, t0, t1, max_iter, tableFile);
    tmeas[TYPE_PREP][ALGO_GREEN] = (t0 - te) * 1e3;
    haveAlgo[ALGO_GREEN] = 1;
  } else {
    show_output("Green", N, cplx, -1, -1, -1, -1, tableFile);
  }
#endif

#if defined(HAVE_KISS_FFT) && defined(PFFFT_ENABLE_FLOAT)
  Nmax = (cplx ? nextPow2N*2 : nextPow2N);
  X[Nmax] = checkVal;
  if ( 1 || PFFFT_FUNC(is_power_of_two)(N) )
  {
    kiss_fft_cfg stf;
    kiss_fft_cfg sti;
    kiss_fftr_cfg stfr;
    kiss_fftr_cfg stir;

    te = uclock_sec();
    if (cplx) {
      stf = kiss_fft_alloc(nextPow2N, 0, 0, 0);
      sti = kiss_fft_alloc(nextPow2N, 1, 0, 0);
    } else {
      stfr = kiss_fftr_alloc(nextPow2N, 0, 0, 0);
      stir = kiss_fftr_alloc(nextPow2N, 1, 0, 0);
    }

    t0 = uclock_sec();
    tstop = t0 + max_test_duration;
    max_iter = 0;
    do {
      for ( k = 0; k < step_iter; ++k ) {
        if (cplx) {
          assert( X[Nmax] == checkVal );
          kiss_fft(stf, (const kiss_fft_cpx *)X, (kiss_fft_cpx *)Y);
          assert( X[Nmax] == checkVal );
          kiss_fft(sti, (const kiss_fft_cpx *)Y, (kiss_fft_cpx *)X);
          assert( X[Nmax] == checkVal );
        } else {
          assert( X[Nmax] == checkVal );
          kiss_fftr(stfr, X, (kiss_fft_cpx *)Y);
          assert( X[Nmax] == checkVal );
          kiss_fftri(stir, (const kiss_fft_cpx *)Y, X);
          assert( X[Nmax] == checkVal );
        }
        ++max_iter;
      }
      t1 = uclock_sec();
    } while ( t1 < tstop );

    kiss_fft_cleanup();

    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
    tmeas[TYPE_ITER][ALGO_KISS] = max_iter;
    tmeas[TYPE_MFLOPS][ALGO_KISS] = flops/1e6/(t1 - t0 + 1e-16);
    tmeas[TYPE_DUR_TOT][ALGO_KISS] = t1 - t0;
    tmeas[TYPE_DUR_NS][ALGO_KISS] = show_output("Kiss", N, cplx, flops, t0, t1, max_iter, tableFile);
    tmeas[TYPE_PREP][ALGO_KISS] = (t0 - te) * 1e3;
    haveAlgo[ALGO_KISS] = 1;
  } else {
    show_output("Kiss", N, cplx, -1, -1, -1, -1, tableFile);
  }
#endif

#if defined(HAVE_POCKET_FFT)

  Nmax = (cplx ? nextPow2N*2 : nextPow2N);
  X[Nmax] = checkVal;
  if ( 1 || PFFFT_FUNC(is_power_of_two)(N) )
  {
    POCKFFTR_PRE(_plan) planr;
    POCKFFTC_PRE(_plan) planc;

    te = uclock_sec();
    if (cplx) {
      planc = POCKFFTC_MID(make_,_plan)(nextPow2N);
    } else {
      planr = POCKFFTR_MID(make_,_plan)(nextPow2N);
    }

    t0 = uclock_sec();
    tstop = t0 + max_test_duration;
    max_iter = 0;
    do {
      for ( k = 0; k < step_iter; ++k ) {
        if (cplx) {
          assert( X[Nmax] == checkVal );
          memcpy(Y, X, 2*nextPow2N * sizeof(pffft_scalar) );
          POCKFFTC_PRE(_forward)(planc, Y, 1.);
          assert( X[Nmax] == checkVal );
          memcpy(X, Y, 2*nextPow2N * sizeof(pffft_scalar) );
          POCKFFTC_PRE(_backward)(planc, X, 1./nextPow2N);
          assert( X[Nmax] == checkVal );
        } else {
          assert( X[Nmax] == checkVal );
          memcpy(Y, X, nextPow2N * sizeof(pffft_scalar) );
          POCKFFTR_PRE(_forward)(planr, Y, 1.);
          assert( X[Nmax] == checkVal );
          memcpy(X, Y, nextPow2N * sizeof(pffft_scalar) );
          POCKFFTR_PRE(_backward)(planr, X, 1./nextPow2N);
          assert( X[Nmax] == checkVal );
        }
        ++max_iter;
      }
      t1 = uclock_sec();
    } while ( t1 < tstop );

    if (cplx) {
      POCKFFTC_MID(destroy_,_plan)(planc);
    } else {
      POCKFFTR_MID(destroy_,_plan)(planr);
    }

    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
    tmeas[TYPE_ITER][ALGO_POCKET] = max_iter;
    tmeas[TYPE_MFLOPS][ALGO_POCKET] = flops/1e6/(t1 - t0 + 1e-16);
    tmeas[TYPE_DUR_TOT][ALGO_POCKET] = t1 - t0;
    tmeas[TYPE_DUR_NS][ALGO_POCKET] = show_output("Pocket", N, cplx, flops, t0, t1, max_iter, tableFile);
    tmeas[TYPE_PREP][ALGO_POCKET] = (t0 - te) * 1e3;
    haveAlgo[ALGO_POCKET] = 1;
  } else {
    show_output("Pocket", N, cplx, -1, -1, -1, -1, tableFile);
  }
#endif


  /* PFFFT-U (unordered) benchmark */
  Nmax = (cplx ? pffftPow2N*2 : pffftPow2N);
  X[Nmax] = checkVal;
  if ( pffftPow2N >= PFFFT_FUNC(min_fft_size)(cplx ? PFFFT_COMPLEX : PFFFT_REAL) )
  {
    te = uclock_sec();
    PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(pffftPow2N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
    if (s) {
      t0 = uclock_sec();
      tstop = t0 + max_test_duration;
      max_iter = 0;
      do {
        for ( k = 0; k < step_iter; ++k ) {
          assert( X[Nmax] == checkVal );
          PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_FORWARD);
          assert( X[Nmax] == checkVal );
          PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_BACKWARD);
          assert( X[Nmax] == checkVal );
          ++max_iter;
        }
        t1 = uclock_sec();
      } while ( t1 < tstop );

      PFFFT_FUNC(destroy_setup)(s);

      flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
      tmeas[TYPE_ITER][ALGO_PFFFT_U] = max_iter;
      tmeas[TYPE_MFLOPS][ALGO_PFFFT_U] = flops/1e6/(t1 - t0 + 1e-16);
      tmeas[TYPE_DUR_TOT][ALGO_PFFFT_U] = t1 - t0;
      tmeas[TYPE_DUR_NS][ALGO_PFFFT_U] = show_output("PFFFT-U", N, cplx, flops, t0, t1, max_iter, tableFile);
      tmeas[TYPE_PREP][ALGO_PFFFT_U] = (t0 - te) * 1e3;
      haveAlgo[ALGO_PFFFT_U] = 1;
    }
  } else {
    show_output("PFFFT-U", N, cplx, -1, -1, -1, -1, tableFile);
  }


  if ( pffftPow2N >= PFFFT_FUNC(min_fft_size)(cplx ? PFFFT_COMPLEX : PFFFT_REAL) )
  {
    te = uclock_sec();
    PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(pffftPow2N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
    if (s) {
      t0 = uclock_sec();
      tstop = t0 + max_test_duration;
      max_iter = 0;
      do {
        for ( k = 0; k < step_iter; ++k ) {
          assert( X[Nmax] == checkVal );
          PFFFT_FUNC(transform_ordered)(s, X, Z, Y, PFFFT_FORWARD);
          assert( X[Nmax] == checkVal );
          PFFFT_FUNC(transform_ordered)(s, X, Z, Y, PFFFT_BACKWARD);
          assert( X[Nmax] == checkVal );
          ++max_iter;
        }
        t1 = uclock_sec();
      } while ( t1 < tstop );

      PFFFT_FUNC(destroy_setup)(s);

      flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
      tmeas[TYPE_ITER][ALGO_PFFFT_O] = max_iter;
      tmeas[TYPE_MFLOPS][ALGO_PFFFT_O] = flops/1e6/(t1 - t0 + 1e-16);
      tmeas[TYPE_DUR_TOT][ALGO_PFFFT_O] = t1 - t0;
      tmeas[TYPE_DUR_NS][ALGO_PFFFT_O] = show_output("PFFFT", N, cplx, flops, t0, t1, max_iter, tableFile);
      tmeas[TYPE_PREP][ALGO_PFFFT_O] = (t0 - te) * 1e3;
      haveAlgo[ALGO_PFFFT_O] = 1;
    }
  } else {
    show_output("PFFFT", N, cplx, -1, -1, -1, -1, tableFile);
  }

  if (!array_output_format)
  {
    printf("prepare/ms:     ");
    for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
    {
      if ( haveAlgo[iter] && tmeas[TYPE_DUR_NS][iter] > 0.0 ) {
        printf("%s %.3f    ", algoName[iter], tmeas[TYPE_PREP][iter] );
      }
    }
    printf("\n");
  }
  Tfastest = 0.0;
  for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
  {
    if ( Tfastest == 0.0 || ( tmeas[TYPE_DUR_NS][iter] != 0.0 && tmeas[TYPE_DUR_NS][iter] < Tfastest ) )
      Tfastest = tmeas[TYPE_DUR_NS][iter];
  }
  if ( Tfastest > 0.0 )
  {
    if (!array_output_format)
      printf("relative fast:  ");
    for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
    {
      if ( haveAlgo[iter] && tmeas[TYPE_DUR_NS][iter] > 0.0 ) {
        tmeas[TYPE_DUR_FASTEST][iter] = tmeas[TYPE_DUR_NS][iter] / Tfastest;
        if (!array_output_format)
          printf("%s %.3f    ", algoName[iter], tmeas[TYPE_DUR_FASTEST][iter] );
      }
    }
    if (!array_output_format)
      printf("\n");
  }

  {
    if (!array_output_format)
      printf("relative pffft: ");
    for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
    {
      if ( haveAlgo[iter] && tmeas[TYPE_DUR_NS][iter] > 0.0 ) {
        tmeas[TYPE_REL_PFFFT][iter] = tmeas[TYPE_DUR_NS][iter] / tmeas[TYPE_DUR_NS][ALGO_PFFFT_O];
        if (!array_output_format)
          printf("%s %.3f    ", algoName[iter], tmeas[TYPE_REL_PFFFT][iter] );
      }
    }
    if (!array_output_format)
      printf("\n");
  }

  if (!array_output_format) {
    printf("--\n");
  }

  PFFFT_FUNC(aligned_free)(X);
  PFFFT_FUNC(aligned_free)(Y);
  PFFFT_FUNC(aligned_free)(Z);
}


/* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */
void validate_pffft_simd();
int  validate_pffft_simd_ex(FILE * DbgOut);
void validate_pffftd_simd();
int  validate_pffftd_simd_ex(FILE * DbgOut);


int main(int argc, char **argv) {
  /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
     and 32 for real FFTs -- a lot of stuff would need to be rewritten to
     handle other cases (or maybe just switch to a scalar fft, I don't know..) */

#if 0  /* include powers of 2 ? */
#define NUMNONPOW2LENS  23
  int NnonPow2[NUMNONPOW2LENS] = {
    64, 96, 128, 160, 192,   256, 384, 5*96, 512, 5*128,
    3*256, 800, 1024, 2048, 2400,   4096, 8192, 9*1024, 16384, 32768,
    256*1024, 1024*1024, -1 };
#else
#define NUMNONPOW2LENS  11
  int NnonPow2[NUMNONPOW2LENS] = {
    96, 160, 192, 384, 5*96,   5*128,3*256, 800, 2400, 9*1024,
    -1 };
#endif

#define NUMPOW2FFTLENS  22
#define MAXNUMFFTLENS MAX( NUMPOW2FFTLENS, NUMNONPOW2LENS )
  int Npow2[NUMPOW2FFTLENS];  /* exp = 1 .. 21, -1 */
  const int *Nvalues = NULL;
  double tmeas[2][MAXNUMFFTLENS][NUM_TYPES][NUM_FFT_ALGOS];
  double iterCalReal, iterCalCplx;

  int benchReal=1, benchCplx=1, withFFTWfullMeas=0, outputTable2File=1, usePow2=1;
  int realCplxIdx, typeIdx;
  int i, k;
  FILE *tableFile = NULL;

  int haveAlgo[NUM_FFT_ALGOS];
  char acCsvFilename[64];

  for ( k = 1; k <= NUMPOW2FFTLENS; ++k )
    Npow2[k-1] = (k == NUMPOW2FFTLENS) ? -1 : (1 << k);
  Nvalues = Npow2;  /* set default .. for comparisons .. */

  for ( i = 0; i < NUM_FFT_ALGOS; ++i )
    haveAlgo[i] = 0;

  printf("pffft architecture:    '%s'\n", PFFFT_FUNC(simd_arch)());
  printf("pffft SIMD size:       %d\n", PFFFT_FUNC(simd_size)());
  printf("pffft min real fft:    %d\n", PFFFT_FUNC(min_fft_size)(PFFFT_REAL));
  printf("pffft min complex fft: %d\n", PFFFT_FUNC(min_fft_size)(PFFFT_COMPLEX));
  printf("\n");

  for ( i = 1; i < argc; ++i ) {
    if (!strcmp(argv[i], "--array-format") || !strcmp(argv[i], "--table")) {
      array_output_format = 1;
    }
    else if (!strcmp(argv[i], "--no-tab")) {
      array_output_format = 0;
    }
    else if (!strcmp(argv[i], "--real")) {
      benchCplx = 0;
    }
    else if (!strcmp(argv[i], "--cplx")) {
      benchReal = 0;
    }
    else if (!strcmp(argv[i], "--fftw-full-measure")) {
      withFFTWfullMeas = 1;
    }
    else if (!strcmp(argv[i], "--non-pow2")) {
      Nvalues = NnonPow2;
      usePow2 = 0;
    }
    else /* if (!strcmp(argv[i], "--help")) */ {
      printf("usage: %s [--array-format|--table] [--no-tab] [--real|--cplx] [--fftw-full-measure] [--non-pow2]\n", argv[0]);
      exit(0);
    }
  }

#ifdef HAVE_FFTW
#ifdef PFFFT_ENABLE_DOUBLE
  algoName[ALGO_FFTW_ESTIM] = "FFTW D(estim)";
  algoName[ALGO_FFTW_AUTO]  = "FFTW D(auto) ";
#endif

  if (withFFTWfullMeas)
  {
#ifdef PFFFT_ENABLE_FLOAT
    algoName[ALGO_FFTW_AUTO] = "FFTWF(meas)"; /* "FFTW (auto)" */
#else
    algoName[ALGO_FFTW_AUTO] = "FFTWD(meas)"; /* "FFTW (auto)" */
#endif
    algoTableHeader[NUM_FFT_ALGOS][0] = "|real FFTWmeas "; /* "|real FFTWauto " */
    algoTableHeader[NUM_FFT_ALGOS][0] = "|cplx FFTWmeas "; /* "|cplx FFTWauto " */
  }
#endif

  if ( PFFFT_FUNC(simd_size)() == 1 )
  {
    algoName[ALGO_PFFFT_U] = "PFFFTU scal-1";
    algoName[ALGO_PFFFT_O] = "PFFFT scal-1 ";
  }
  else if ( !strcmp(PFFFT_FUNC(simd_arch)(), "4xScalar") )
  {
    algoName[ALGO_PFFFT_U] = "PFFFT-U scal4";
    algoName[ALGO_PFFFT_O] = "PFFFT scal-4 ";
  }


  clock();
  /* double TClockDur = 1.0 / CLOCKS_PER_SEC;
  printf("clock() duration for CLOCKS_PER_SEC = %f sec = %f ms\n", TClockDur, 1000.0 * TClockDur );
  */

  /* calibrate test duration */
  {
    double t0, t1, dur;
    printf("calibrating fft benchmark duration at size N = 512 ..\n");
    t0 = uclock_sec();
    if (benchReal) {
      iterCalReal = cal_benchmark(512, 0 /* real fft */);
      printf("real fft iterCal = %f\n", iterCalReal);
    }
    if (benchCplx) {
      iterCalCplx = cal_benchmark(512, 1 /* cplx fft */);
      printf("cplx fft iterCal = %f\n", iterCalCplx);
    }
    t1 = uclock_sec();
    dur = t1 - t0;
    printf("calibration done in %f sec.\n\n", dur);
  }

  if (!array_output_format) {
    if (benchReal) {
      for (i=0; Nvalues[i] > 0; ++i)
        benchmark_ffts(Nvalues[i], 0 /* real fft */, withFFTWfullMeas, iterCalReal, tmeas[0][i], haveAlgo, NULL);
    }
    if (benchCplx) {
      for (i=0; Nvalues[i] > 0; ++i)
        benchmark_ffts(Nvalues[i], 1 /* cplx fft */, withFFTWfullMeas, iterCalCplx, tmeas[1][i], haveAlgo, NULL);
    }

  } else {

    if (outputTable2File) {
      tableFile = fopen( usePow2 ? "bench-fft-table-pow2.txt" : "bench-fft-table-non2.txt", "w");
    }
    /* print table headers */
    printf("table shows MFlops; higher values indicate faster computation\n\n");

    {
      print_table("| input len ", tableFile);
      for (realCplxIdx = 0; realCplxIdx < 2; ++realCplxIdx)
      {
        if ( (realCplxIdx == 0 && !benchReal) || (realCplxIdx == 1 && !benchCplx) )
          continue;
        for (k=0; k < NUM_FFT_ALGOS; ++k)
        {
          if ( compiledInAlgo[k] )
            print_table(algoTableHeader[k][realCplxIdx], tableFile);
        }
      }
      print_table("|\n", tableFile);
    }
    /* print table value seperators */
    {
      print_table("|----------", tableFile);
      for (realCplxIdx = 0; realCplxIdx < 2; ++realCplxIdx)
      {
        if ( (realCplxIdx == 0 && !benchReal) || (realCplxIdx == 1 && !benchCplx) )
          continue;
        for (k=0; k < NUM_FFT_ALGOS; ++k)
        {
          if ( compiledInAlgo[k] )
            print_table(":|-------------", tableFile);
        }
      }
      print_table(":|\n", tableFile);
    }

    for (i=0; Nvalues[i] > 0; ++i) {
      {
        double t0, t1;
        print_table_fftsize(Nvalues[i], tableFile);
        t0 = uclock_sec();
        if (benchReal)
          benchmark_ffts(Nvalues[i], 0, withFFTWfullMeas, iterCalReal, tmeas[0][i], haveAlgo, tableFile);
        if (benchCplx)
          benchmark_ffts(Nvalues[i], 1, withFFTWfullMeas, iterCalCplx, tmeas[1][i], haveAlgo, tableFile);
        t1 = uclock_sec();
        print_table("|\n", tableFile);
        /* printf("all ffts for size %d took %f sec\n", Nvalues[i], t1-t0); */
        (void)t0;
        (void)t1;
      }
    }
    fprintf(stdout, " (numbers are given in MFlops)\n");
    if (outputTable2File) {
      fclose(tableFile);
    }
  }

  printf("\n");
  printf("now writing .csv files ..\n");

  for (realCplxIdx = 0; realCplxIdx < 2; ++realCplxIdx)
  {
    if ( (benchReal && realCplxIdx == 0) || (benchCplx && realCplxIdx == 1) )
    {
      for (typeIdx = 0; typeIdx < NUM_TYPES; ++typeIdx)
      {
        FILE *f = NULL;
        if ( !(SAVE_ALL_TYPES || saveType[typeIdx]) )
          continue;
        acCsvFilename[0] = 0;
#ifdef PFFFT_SIMD_DISABLE
        strcat(acCsvFilename, "scal-");
#else
        strcat(acCsvFilename, "simd-");
#endif
        strcat(acCsvFilename, (realCplxIdx == 0 ? "real-" : "cplx-"));
        strcat(acCsvFilename, ( usePow2 ? "pow2-" : "non2-"));
        assert( strlen(acCsvFilename) + strlen(typeFilenamePart[typeIdx]) + 5 < (sizeof(acCsvFilename) / sizeof(acCsvFilename[0])) );
        strcat(acCsvFilename, typeFilenamePart[typeIdx]);
        strcat(acCsvFilename, ".csv");
        f = fopen(acCsvFilename, "w");
        if (!f)
          continue;
        {
          fprintf(f, "size, log2, ");
          for (k=0; k < NUM_FFT_ALGOS; ++k)
            if ( haveAlgo[k] )
              fprintf(f, "%s, ", algoName[k]);
          fprintf(f, "\n");
        }
        for (i=0; Nvalues[i] > 0; ++i)
        {
          {
            fprintf(f, "%d, %.3f, ", Nvalues[i], log10((double)Nvalues[i])/log10(2.0) );
            for (k=0; k < NUM_FFT_ALGOS; ++k)
              if ( haveAlgo[k] )
                fprintf(f, "%f, ", tmeas[realCplxIdx][i][typeIdx][k]);
            fprintf(f, "\n");
          }
        }
        fclose(f);
      }
    }
  }

  return 0;
}