//---------------------------------------------------------------------------
// File  :  C:\cbproj\Remote_CW_Keyer\FFT_API.c
// Date  :  2025-02-16 (ISO 8601,  YYYY-MM-DD)
// Author:  Wolfgang Buescher  (DL4YHF)
//
// Description:
//     Application interface for the Fast Fourier Transform function.
//     Based on DL4YHF's "Sound Utilities", but stripped-down
//     for simpler projects like the "Remote CW Keyer" (anno 2025).
//
// Revision history (YYYY-MM-DD):
//
//   2025-02-16  Based on 'SoundMaths.c', created THIS module.
//               Threw out everything not related to the FFT itself.
//               Decided to use the old FFT implementation by
//               Takuya Ooura because it's pretty small, self-contained,
//               has NO DEPENDENCIES AT ALL (unlike PFFFT, etc).
//
// Literature:
//   [SGDSP] = Steven W. Smith,
//          "The Scientists and Engineer's Guide to Digital Signal Processing",
//          Chapter 12, "The Fast Fourier Transform".    www.DSPguide.com .
//          Locally saved by W.B. under c:\Literat1\dspguide\*.pdf .
//
//   [HFTWIN] ("Heinzel, New Flat-Top Windows") :
//          "Spectrum and spectral density estimation by the Discrete Fourier
//           transform (DFT), including a comprehensive list of window
//           functions and some new flat-top windows." (2002-02-15)
//              by G. Heinzel, A. Ruediger, R. Schilling,
//              Max-Planck-Institut fuer Gravitationsphysik, Hannover.
//           Locally saved as
//    C:\literatur\Signal_processing_and_filters\New_FFT_Windowing_Functions_2002.pdf
//---------------------------------------------------------------------------


#include "SWITCHES.H"  // project specific compiler switches ("options")
         // like SWI_FLOAT_PRECISION (1:float=float, 2:float=double).
         // Must be included before anything else !


#include <windows.h>
#include <math.h>
#include <float.h> // _isnan() and other not-really-standardized stuff

#pragma hdrstop   // no precompiled headers after this point

#include "FFT_API.h" // header for this module (Fast Fourier Transformations)

#ifndef  SWI_SOUND_TAB_INCLUDED  /* may be defined in SWITCHES.H ... */
 #define SWI_SOUND_TAB_INCLUDED 0
#endif
#if( SWI_SOUND_TAB_INCLUDED ) /* Module SoundTab.c included in the project ? */
 #include "SoundTab.h"  // common tables used by many audio processing modules
 // (most important here: a large SINE table to speed things up)
#endif

#if( SWI_USE_OOURAS_FFT ) /* use Takuya Ooura's FFT (a bit faster than textbook-FFT) ? */
# undef   T_FFT_FLOAT
# define  T_FFT_FLOAT float
# include "FFT_Ooura.c"   // compile T. Ooura's code for SINGLE PRECISION
  // With a C macro trick, the same FFT source can be compiled a second time
  // for DOUBLE precision (besides SINGLE precision), generating
  // different function names. In this stripped-down library, that is only
  // done if SWI_FFT_NEED_DOUBLE_PRECISION is nonzero (see below).
# if(SWI_FFT_NEED_DOUBLE_PRECISION) // compile the same FFT algorithm into functions for DOUBLE precision ? 0=no, 1=yes..
#  undef   T_FFT_FLOAT
#  define  T_FFT_FLOAT double
#  include "FFT_Ooura.c"   // compile T. Ooura's code for DOUBLE PRECISION
# endif // SWI_FFT_NEED_DOUBLE_PRECISION ?
#endif // SWI_USE_OOURAS_FFT ?


// #pragma warn -8017
#pragma warn -8004  // .. is assigned a value that is never used (well..)
#pragma warn -8080  // .. is declared but never used (.. so what ?)
#pragma warn -8057  // suppress "parameter 'xyz' is never used"

// Debugging stuff ...
//   FFT_iSourceCodeLine holds the number of the source line (in THIS module)
//   at which DOBINI() was executed most recently. Watching it in the debugger
//   shows how far a function got before something went wrong.
int  FFT_iSourceCodeLine = __LINE__;  // last source code line (WATCH this!)
#define DOBINI() FFT_iSourceCodeLine=__LINE__
 // DOBINI() added here 2009-07-10 to find the reason for the "phantom breakpoint"

// Incremented by _matherr() for each math library error it handles.
long FFT_i32MathErrorCounter = 0;

//---------------------------------------------------------------------------
int _matherr(struct exception *e)
  // Application-specific handler for errors inside math library functions.
  //   Added 2008-08-08 for 'Spectrum Lab' to get rid of the runtime library's
  //   default reaction, like the annoying " * DING-DONG *   sqrt: DOMAIN error" .
  //   Chooses a harmless substitute result (e->retval) for the failed call,
  //   counts the event in FFT_i32MathErrorCounter, and returns 1
  //   ("error has been handled").
  //
  // Note: Adding a _matherr() to a project only has an effect
  //       if it's implemented in a *.C - module ,   NOT  *.CPP !
  //       Not sure about the reason, most likely a namespace problem
  //       or caused by the C++ "name mangling" .
  //
  // Note: the proper type is "struct exception", NOT _exception !
  //       The struct 'exception' is defined in math.h  .
  //       It can tell us a bit about the cause of the exception,
  //       and allows setting the result of the function call
  //       (like sqrt(), log10(), or the other usual "suspects") .
  //
  // Copied from Borland's help system :
  // > Certain math errors can also occur in library functions; for instance,
  // > if you try to take the square root of a negative number.
  // > The default behavior is to print an error message to the screen,
  // > and to return a NAN (an IEEE not-a-number). Use of the NAN is likely
  // > to cause a floating-point exception later, which will abort the program
  // > if unmasked. If you don't want the message to be printed,
  // > insert the following version of _matherr into your program (...)
  // > Any other use of _matherr to intercept math errors is not encouraged;
  // > it is considered obsolete and might not be supported in future versions
  // > of C++Builder.
  // Oh well. "Considered obsolete" but what's the replacement for it ? ?
  // The C++ exceptions -try..catch(...)- could not catch
  //     floating point errors; at least not in BCB V4 and V6 .
{
  #ifndef DWORD
    #define DWORD unsigned long
  #endif
  #define C2N(a,b,c,d) ((DWORD)a|(((DWORD)b)<<8)|(((DWORD)c)<<16)|(((DWORD)d)<<24))
  union
   { DWORD dw;
     unsigned char b[4];
   } name4;
  int i;

  // Pack the first (up to) four characters of the function name into
  // a 32-bit 'DWORD'. Copying stops after the string's trailing zero,
  // so the remaining bytes stay zero. The packed name can then be compared
  // against C2N(..) constants in a switch..case, without "strcmp" .
  name4.dw = 0;
  for( i=0; i<4; ++i )
   { name4.b[i] = e->name[i];
     if( name4.b[i] == 0 )
      { break;   // end of the name reached, don't read beyond it
      }
   }

  // Find the "best suited return value" for this application .
  //      (But caution, the return value is ignored when the ERROR
  //       is signalled as an EXCEPTION to the caller with RaiseException() !)
  // The function name gives the most clues ...
  switch( name4.dw )
   { case C2N('l','o','g','1') :  // most likely, log10(0) or log10(-X)
        e->retval = -100.0;       // emit a "very negative magnitude" (logarithm)
                 // Note: log10 is often used in a formula like 20*log10(X)
                 //       to convert a voltage ratio X into dB ,
                 //   or 10*log10(P) to convert a power ratio P into dB .
                 // With -100 as substitute, such formulas deliver
                 // -2000 dB or -1000 dB, which will never occur in a real system .
        // Unfortunately, if RaiseException() was called below,
        // this 'return value' (retval) would *NOT* be returned to the caller.
        // So, to keep it simple, we don't call RaiseException ;-)
        break;

     case C2N('s','q','r','t') :   // most likely, sqrt(-X) ..
     case C2N('p','o','w','\0') :  // most likely, pow(x,y) ...
        // 2010-05-19 : got here (for "pow") with the following call stack:
        //    CalculateArbitraryWaveform("cos(pi*x)^1000000")
        //      -> CLI_CalcSum_Float() -> CLI_CalcSum2() -> CLI_CalcProd()
        //          -> CLI_CalcPow() -> __org_pow() -> __matherr() -> _matherr()
     default:  // all OTHER functions, regardless of the exception TYPE (e->type):
        //   DOMAIN    : argument not in domain of function, such as log(-1)
        //   SING      : argument would result in a singularity, such as pow(0, -2)
        //   OVERFLOW  : result greater than DBL_MAX (or LDBL_MAX), such as exp(1000)
        //   UNDERFLOW : result less than DBL_MIN (or LDBL_MIN), such as exp(-1000)
        //   TLOSS     : total loss of significant digits, such as sin(10e70)
        //   (and whatever else the type may be)
        // In all these cases, the substitute result is zero .
        e->retval = 0.0;
        break;
   } // end switch( name4.dw )

  // The following was just a TEST, but to the author's big surprise
  //  it really helped to catch the floating point errors
  //  in the caller's C++ standard exception handler (try/catch).   WHOW....
  // ex: RaiseException( EXCEPTION_FLT_INVALID_OPERATION,  0, 0 , NULL );
  // BUT:  It's the author's feeling that calling RaiseException() from here
  //       may be unsafe, because RaiseException() is a Win32 API function .

  ++FFT_i32MathErrorCounter; // count the number of errors (for the debug-log)

  return 1;       /* error has been handled */
} // end _matherr()


//---------------------------------------------------------------------------
CPROT void FFT_Init( void )
  // One-time initialisation of this module; should be called ONCE on init.
  // In the older (bloated) variants, this function tried to dynamically load
  // a DLL with a faster FFT (feature removed many years ago).
  // If only the FFT by Takuya Ooura or the 'textbook FFT' from www.DSPguide.com
  // is in use, FFT_Init() doesn't do anything at all - but anyway, CALL IT.
{
} // end FFT_Init()

//---------------------------------------------------------------------------
CPROT void FFT_Exit( void )
  // Counterpart of FFT_Init(); should be called ONCE when the application exits.
  // Intended as the place to free resources; formerly unloaded
  // the 'pretty fast FFT' DLL (or whatever the name was).
  // In this stripped-down module there is nothing to release, so the body is empty.
{
} // end FFT_Exit()


//---------------------------------------------------------------------------
void FFT_ClearFloatArray(float *pfltArray, int iLength )
  // Clears an array with floating point values (sets all elements to 0.0) .
  //   pfltArray : first element to clear. A NULL pointer is tolerated
  //               (nothing happens then).
  //   iLength   : number of elements to clear. Zero or negative lengths
  //               are tolerated, too (nothing happens then).
  // Older versions counted iLength down until it reached zero, so a NEGATIVE
  // length made them run through (almost) the entire integer range,
  // writing far beyond the array.
{
  int i;

  if( pfltArray==NULL )
   { return;
   }
  for( i=0; i<iLength; ++i )
   { pfltArray[i] = 0.0f;
   }
} // end FFT_ClearFloatArray()

//---------------------------------------------------------------------------
void FFT_CopyFloatArray( float *pfltSource, float *pfltDest, int iLength )
{ // Copies iLength floats from pfltSource[] to pfltDest[], element by element
  // in ascending order. Nothing is copied if iLength is zero or negative.
  // Don't try to be smart and use "memcpy" or "memmove" here:
  // The data are moved as whole floats (4-byte chunks), which is often
  // faster than BYTE-wise copying.
  int nGroupsOfFour, nRemaining;

  if( iLength <= 0 )
   { return;
   }
  nGroupsOfFour = iLength / 4;
  nRemaining    = iLength % 4;

  // 'Partially unrolled loop' to reduce the CPU time spent here :
  for( ; nGroupsOfFour > 0; --nGroupsOfFour )
   { pfltDest[0] = pfltSource[0];
     pfltDest[1] = pfltSource[1];
     pfltDest[2] = pfltSource[2];
     pfltDest[3] = pfltSource[3];
     pfltDest   += 4;
     pfltSource += 4;
   }
  // Process the 'last few elements' (0 to 3) :
  for( ; nRemaining > 0; --nRemaining )
   { *pfltDest++ = *pfltSource++;
   }
} // end FFT_CopyFloatArray()


//---------------------------------------------------------------------------
void FFT_SwapFloatArrayHalves(float *pfltArray, int iLength )
  // Exchanges the lower half of an array with the upper half, i.e.
  //           pfltArray[0...iLength/2-1]
  //      with pfltArray[iLength/2..iLength-1] .
  // Strange function indeed; but required somewhere to bring the frequency bins
  //                          into a more 'intuitive' order ;-)
  // With an ODD iLength, the last element (index iLength-1) stays where it is,
  // because only 2 * (iLength/2) elements take part in the exchange.
  // A NULL pointer is tolerated (nothing happens then).
{
  int   iHalf, i;
  float fltHold;

  if( !pfltArray )
   { return;
   }
  iHalf = iLength / 2;
  for( i=0; i<iHalf; ++i )
   { fltHold            = pfltArray[i];
     pfltArray[i]       = pfltArray[i+iHalf];
     pfltArray[i+iHalf] = fltHold;
   }
} // end FFT_SwapFloatArrayHalves()

//---------------------------------------------------------------------------
void FFT_SwapDoubleArrayHalves(double *pfltArray, int iLength )
  // Same as FFT_SwapFloatArrayHalves(), but for DOUBLE PRECISION floats :
  // Exchanges pfltArray[0...iLength/2-1]
  //      with pfltArray[iLength/2..iLength-1] .
  // With an ODD iLength, the last element (index iLength-1) stays where it is.
  // A NULL pointer is tolerated (nothing happens then).
{
  int    iHalf, i;
  double dblHold;

  if( !pfltArray )
   { return;
   }
  iHalf = iLength / 2;
  for( i=0; i<iHalf; ++i )
   { dblHold            = pfltArray[i];
     pfltArray[i]       = pfltArray[i+iHalf];
     pfltArray[i+iHalf] = dblHold;
   }
} // end FFT_SwapDoubleArrayHalves()


//---------------------------------------------------------------------------
void FFT_ResampleFloatArray(float *pfltArray, int iSourceLength, int iDestLength )
  // Stretches or shrinks an array IN PLACE, using linear interpolation
  // between neighbouring source elements.  Originally used for the FFT-based
  // filter, to adapt the frequency response curve when changing the FFT size .
  //   pfltArray     : on entry, holds iSourceLength valid elements;
  //                   on return, holds iDestLength resampled elements.
  //                   The caller's buffer must be large enough for the
  //                   LARGER of both lengths.
  //   iSourceLength : number of valid elements on entry
  //   iDestLength   : number of elements to produce
  // Neither iSourceLength nor iDestLength may be zero or negative !
  // Such calls (and a NULL pointer) are ignored now; the array is left untouched.
  // Without this check, iDestLength==0 caused a division by zero,
  // and iSourceLength<=0 caused a read access at pfltArray[-1] .
{
  float fltStretchFactor;
  float fltSourceIndex, fltSrcLeft, fltSrcRight, fltTemp;
  int iDstIdx, iSrcIdx, iStartIdx, iStep;

  if( (!pfltArray) || (iSourceLength<=0) || (iDestLength<=0) )
   { return;  // nothing that could be resampled
   }
  fltStretchFactor = (float)iSourceLength / (float)iDestLength;

  if( iDestLength > iSourceLength  )  // "stretching" (array gets LARGER) :
   {  // begin at the END of the array to avoid overwriting values
      // which are still needed as input
      iStartIdx = iDestLength-1;
      iStep     = -1;
   }
  else // ( iDestLength <= iSourceLength ) -> "shrinking" (array gets SMALLER) :
   {  // begin at the START of the array ...
      iStartIdx = 0;
      iStep     = +1;
   }
  for(iDstIdx=iStartIdx; iDstIdx>=0 && iDstIdx<iDestLength; iDstIdx+=iStep)
   { fltSourceIndex = (float)iDstIdx * fltStretchFactor;
     // Keep the source index inside 0 .. iSourceLength-1 :
     if( fltSourceIndex < 0.0 )
         fltSourceIndex = 0.0;
     if( fltSourceIndex >= iSourceLength )
         fltSourceIndex = iSourceLength-1;
     iSrcIdx = (int)fltSourceIndex;
     fltSrcLeft = pfltArray[iSrcIdx];
     if( (iSrcIdx+1) < iSourceLength)
          fltSrcRight = pfltArray[iSrcIdx+1];
     else fltSrcRight = fltSrcLeft;  // no right neighbour at the end of the source
     // Interpolate between "left" and "right" value :
     fltTemp = fltSourceIndex - (float)iSrcIdx;  // -> fractional index, 0 .. 0.999999
     fltTemp = fltSrcLeft * (1.0-fltTemp) + fltSrcRight * fltTemp;
     pfltArray[iDstIdx] = fltTemp;
   } // end for(iDstIdx ..

} // end FFT_ResampleFloatArray()

#if( SWI_FFT_NEED_DOUBLE_PRECISION )
//---------------------------------------------------------------------------
void FFT_ResampleDoubleArray(double *pfltArray, int iSourceLength, int iDestLength )
  // Stretches or shrinks an array of doubles IN PLACE, using linear
  // interpolation between neighbouring source elements.
  // Besides the data type, same function as FFT_ResampleFloatArray() :
  //   pfltArray     : on entry, holds iSourceLength valid elements;
  //                   on return, holds iDestLength resampled elements.
  //                   The caller's buffer must be large enough for the
  //                   LARGER of both lengths.
  //   iSourceLength : number of valid elements on entry
  //   iDestLength   : number of elements to produce
  // Calls with a NULL pointer, or with a zero or negative length, are ignored;
  // the array is left untouched then. Without this check, iDestLength==0
  // caused a division by zero, and iSourceLength<=0 caused a read access
  // at pfltArray[-1] .
{
  double fltStretchFactor;
  double fltSourceIndex, fltSrcLeft, fltSrcRight, fltTemp;
  int iDstIdx, iSrcIdx, iStartIdx, iStep;

  if( (!pfltArray) || (iSourceLength<=0) || (iDestLength<=0) )
   { return;  // nothing that could be resampled
   }
  fltStretchFactor = (double)iSourceLength / (double)iDestLength;

  if( iDestLength > iSourceLength  )  // "stretching" (array gets LARGER) :
   {  // begin at the END of the array to avoid overwriting values
      // which are still needed as input
      iStartIdx = iDestLength-1;
      iStep     = -1;
   }
  else // ( iDestLength <= iSourceLength ) -> "shrinking" (array gets SMALLER) :
   {  // begin at the START of the array ...
      iStartIdx = 0;
      iStep     = +1;
   }
  for(iDstIdx=iStartIdx; iDstIdx>=0 && iDstIdx<iDestLength; iDstIdx+=iStep)
   { fltSourceIndex = (double)iDstIdx * fltStretchFactor;
     // Keep the source index inside 0 .. iSourceLength-1 :
     if( fltSourceIndex < 0.0 )
         fltSourceIndex = 0.0;
     if( fltSourceIndex >= iSourceLength )
         fltSourceIndex = iSourceLength-1;
     iSrcIdx = (int)fltSourceIndex;
     fltSrcLeft = pfltArray[iSrcIdx];
     if( (iSrcIdx+1) < iSourceLength)
          fltSrcRight = pfltArray[iSrcIdx+1];
     else fltSrcRight = fltSrcLeft;  // no right neighbour at the end of the source
     // Interpolate between "left" and "right" value :
     fltTemp = fltSourceIndex - (double)iSrcIdx;  // -> fractional index, 0 .. 0.999999
     fltTemp = fltSrcLeft * (1.0-fltTemp) + fltSrcRight * fltTemp;
     pfltArray[iDstIdx] = fltTemp;
   } // end for(iDstIdx ..

} // end FFT_ResampleDoubleArray()
#endif // ( SWI_FFT_NEED_DOUBLE_PRECISION )

//---------------------------------------------------------------------------
double FFT_CalculateAngle(double re, double im)
  // Four-quadrant conversion of a complex pair ("I/Q")
  //   into a phase value (in radians, but explained in degrees here).
  //   Typically operates on a complex frequency bin from the FFT.
  //     re : real part ("I"),   im : imaginary part ("Q")
  // A positive real value (with im=0) gives an angle of zero,
  // a negative real value (with im=0) gives +180 degrees.
  // Returned value range is -180 .. +180 =  -pi .. +pi .
  // If both real and imaginary part are zero, the returned value
  // is zero.
  // Revision history:
  //   Jan 13, 2002, by DL4YHF:
  //       Implemented for a phase-sensitive spectrum analyser
  //   Sept  15, 2002:
  //       Copied into AM_FM_DeMod.cpp (sound utilities) .
  //   2007-03-11 :
  //       Copied into the FFT-filter-plugin demo .
{
#define ANGLE_RANGE_PLUS_MINUS_180_DEGREES 1  // 1: result_range = -pi..pi = -180..+180
     // (-180..+180 is preferred because angles tend to be +-0.x degrees,
     //  and it looks ugly if the display jumps from "0.1" to "359.9" and back)
  double dblAngle;

  if( im > 0.0 )        // upper half plane: first or second quadrant
   {
     if( re > 0.0 )         // first quadrant (0..90 degrees)
      { dblAngle = atan(im/re);
      }
     else if( re < 0.0 )    // second quadrant (90..180 degrees)
      { dblAngle = atan(im/re) + C_PI;
      }
     else                   // re=0, im>0  -> exactly 90 degrees
      { dblAngle = 0.5 * C_PI;
      }
   }
  else if( im < 0.0 )   // lower half plane: third or fourth quadrant
   {
     if( re < 0.0 )         // third quadrant
      {
#if(ANGLE_RANGE_PLUS_MINUS_180_DEGREES)
        dblAngle = atan(im/re) - C_PI;     // for result range -180..-90
#else
        dblAngle = atan(im/re) + C_PI;     // for result range 180..270
#endif
      }
     else if( re > 0.0 )    // fourth quadrant
      {
#if(ANGLE_RANGE_PLUS_MINUS_180_DEGREES)
        dblAngle = atan(im/re);            // for result range -90..0
#else
        dblAngle = atan(im/re) + 2*C_PI;   // for result range 270..360
#endif
      }
     else                   // re=0, im<0  -> 270 degrees, alias -90 degrees
      {
#if(ANGLE_RANGE_PLUS_MINUS_180_DEGREES)
        dblAngle = -0.5 * C_PI;
#else
        dblAngle = 1.5 *  C_PI;
#endif
      }
   }
  else                  // im=0, a "real" number
   {
     if( re>=0 )
      { dblAngle = 0;
      }
     else
      { dblAngle = C_PI;    // negative -> 180 degrees
      }
   }
  return dblAngle;
} // end ..CalculateAngle()


//---------------------------------------------------------------------------
float FFT_CalculateAngleFast(float x, float y)
{ // Fast atan2 approximation with self normalization.
  //   x = Re(z),  y = Im(z) of the complex number z .
  // Returned value range is  -pi..pi =  -180 .. +180 .
  // Note: Because of the 'kludge to prevent 0/0' (see below), the input
  //       x=0, y=0 delivers approximately +pi/2 here,
  //       not zero as in FFT_CalculateAngle() .
  //
  // Based on an article by Jim Shima, found at
  //       http://www.dspguru.com/comp.dsp/tricks/alg/fxdatan2.htm .
  // The Trick:
  //  compute a self-normalizing ratio depending on the quadrant
  //  that the complex number resides in.
  //
  //  For a complex number in quadrant I (0<=theta<=pi/4), compute the ratio:
  //
  //     x-y
  // r = ---     (1)
  //     x+y
  //
  // To get the phase angle, compute:
  //
  // theta1 = pi/4 - pi/4*r (2)
  //
  // Likewise, if the complex number resides in quadrant II (pi/4<=theta<=3*pi/4),
  // compute the ratio:
  //
  //     x+y
  // r = ---     (3)
  //     y-x
  //
  // And to get the quadrant II phase angle, compute:
  //
  // theta2 = 3*pi/4 - pi/4*r (4)
  //
  // If it turns out that the complex number was really in quad IV
  //  instead of quad I, just negate the answer resulting from (2).
  // Likewise, do the same if the number was in quad III
  // instead of quad II. By doing this, you have a 4-quadrant arctan function.
  //
  // The max error using equations (2) or (4) is a little less than 0.07 rads
  // (only at a few angles though). The accuracy of the estimator is actually
  // quite good considering using a 1st-order polynomial to estimate the phase angle.
  // If you use a higher degree polynomial, it turns out that the even powers
  // of the poly will disappear (due to the odd function), thus relaxing some
  // of the computational load.
  //
#define ATAN2_HIGH_ACCURACY 1
  // FOR BETTER ACCURACY:
  //   To obtain better accuracy (a max error of .01 rads =~ 0.6 degrees),
  //   one can replace equations (2) and (4) with:
  //       theta1 = 0.1963 * r^3 - 0.9817 * r + pi/4   (2a)
  //       theta2 = 0.1963 * r^3 - 0.9817 * r + 3*pi/4 (4a)
  //
  //  Equations (2a) or (4a) can be computed using 2 MACs on a DSP,  // (YHF: hw^3 ?)
  //  which does not involve much more computation for a 7x increase
  //  in accuracy.
  //
  static const float fltQuarterPi      = C_PI/4;     // offset for equations (2),(2a)
  static const float fltThreeQuarterPi = 3*C_PI/4;   // offset for equations (4),(4a)
  float fltNorm, fltRatio, fltOffset, fltAngle;
  // ex:  float abs_y = fabs(y)+1e-10;    // kludge to prevent 0/0 condition
  float fltAbsY = fabs(y)+1e-30;    // kludge to prevent 0/0 condition, more accurate result

  // 2013-11-25: Suspected a problem with FFT_CalculateAngleFast()
  //    in C:\cbproj\ColorDF\ColourDF.cpp, when x or y were 'very large'.
  //    Since then, 'large' inputs are scaled down before calculating the ratio.
  //    (Had no effect on that problem, neither with a limit of 1 nor 100,
  //     but it doesn't hurt.)
  fltNorm = fabs(x) + fltAbsY;   // -> 'Manhattan length' is ok for this purpose !
  if( fltNorm > 1.0 )
   {  x /= fltNorm;
      y /= fltNorm;
      fltAbsY = fabs(y)+1e-30;
   }

  // Self-normalizing ratio and angle offset, depending on the sign of x :
  if( x>=0 )
   {  fltRatio  = (x - fltAbsY) / (x + fltAbsY);         // (equation 1)
      fltOffset = fltQuarterPi;
   }
  else // x<0
   {  fltRatio  = (x + fltAbsY) / (fltAbsY - x);         // (equation 3)
      fltOffset = fltThreeQuarterPi;
   }

#if(ATAN2_HIGH_ACCURACY)
  fltAngle = fltOffset - 0.9817 * fltRatio + 0.1963 * fltRatio*fltRatio*fltRatio; // (2a),(4a)
#else
  fltAngle = fltOffset - fltQuarterPi * fltRatio;        // (equations 2 and 4)
#endif // (ATAN2_HIGH_ACCURACY)

  // Negate the result if the number is in quad III or IV :
  return (y < 0) ? (-fltAngle) : fltAngle;
} // end FFT_CalculateAngleFast()




//---------------------------------------------------------------------------
void FFT_CalcComplexFft( // ... for SINGLE PRECISION floating point values
          int iNrOfPoints,    // N =  number of points in the DFT *AND* in the time domain
          float *pfltRe,      // REX[] = real parts of input and output
          float *pfltIm )     // IMX[] = imaginary parts of input and output
 //  THE FAST FOURIER TRANSFORM  - inspired by [SGDSP] TABLE 12-3 or -4 .
 //     No cluttered classes, global vars, windowing, averaging or whatever else.
 //     Just the classic complex FFT (complex input, complex output),
 //     calculated IN PLACE: both arrays are input as well as output.
 //  Upon entry, N contains the number of points in the DFT, REX[ ] and
 //     IMX[ ] contain the real and imaginary parts of the input.
 //     All signals run from 0 to N-1.
 //  Upon return, REX[0..N-1] & IMX[0..N-1] contain the DFT output:
 //     The frequencies between 0 and N/2 are positive,
 //     while the frequencies between N/2 and N-1 are negative.
 //     Remember, the frequency spectrum of a discrete signal is
 //     periodic, making the negative frequencies between N/2 and N-1
 //     the same as between -N/2 and 0. The samples at 0 and N/2
 //     straddle the line between positive and negative.
 // More specific: Upon return,
 //   - pfltXX[0] contains the DC component
 //   - pfltXX[1] contains the smallest positive frequency
 //   - pfltXX[N/2-1] contains the largest positive frequency
 //   - pfltXX[N/2] is a special case, see TEST RESULTS further below !
 //   - pfltXX[N/2+1] is the bin with the "most negative" frequency
 //   - pfltXX[N-1] contains the smallest negative frequency ("small but negative")
 // Output range:  A pure sine wave ... A*sin(wt)... will produce an
 //                fft output peak of (N*A/4)^2  where N is FFT_SIZE.
 //   [ this is for a HANN- or similar window, with m_fltWindowAvrg=0.5 ]
 //
 // Restrictions and caveats (as far as visible in the code below) :
 //   - Which implementation is used is decided at COMPILE TIME,
 //     by SWI_USE_OOURAS_FFT and SWI_USE_KISS_FFT (from the project's switches).
 //     If SWI_USE_KISS_FFT is set without SWI_USE_OOURAS_FFT, none of the
 //     two transforms below is compiled into this function.
 //   - iNrOfPoints is NOT checked. Ooura's cdft requires a power of two
 //     (see the usage notes quoted further below). The textbook variant
 //     derives its number of stages from log2(N), rounded down.
 //     NOTE(review): presumably N must be a power of two for BOTH variants - confirm.
 //   - There is no return value. If the temporary buffer for Ooura's FFT
 //     cannot be allocated, pfltRe[] and pfltIm[] are left UNTRANSFORMED,
 //     and the caller isn't informed about it.
 //
 // TEST RESULTS (copied from PhaseAmplMeter.cpp, 2010-05-02) :
 //  *  TEST A: feed a "DC" test signal into the FFT :
 //         for(i=0; i<8192; ++i)
 //          { pPAM->fltFftBuf_re[i] = 1.0;
 //            pPAM->fltFftBuf_im[i] = 2.0;
 //          }
 //     Test result (directly after calling FFT_CalcComplexFft(8192,..) ) :
 //     pPAM->fltFftBuf_re[0] =  8192
 //     pPAM->fltFftBuf_im[0] = 16384  (all other bins were ZERO)
 //
 //  *  TEST B: feed a test signal with a "positive" (??) frequency, at fs/4,
 //       into the FFT :
 //       const int t4[4]={ 0,1,0,-1 };
 //       for(i=0; i<pPAM->m_i32SamplesPerDFT; ++i)
 //        { pPAM->fltFftBuf_re[i] = t4[(i+1)&3];  // "re" leads "im" ...
 //          pPAM->fltFftBuf_im[i] = t4[(i+0)&3];  // (not sure if this should be
 //          // a positive or negative frequency.. see the NCO multiplication..)
 //        }
 //      Test result (directly after calling FFT_CalcComplexFft(8192,..) ) :
 //      pPAM->fltFftBuf_re[2048] =  8192
 //      pPAM->fltFftBuf_im[2048] =   0     (all other bins were ZERO)
 //
 //  *  TEST C: feed a test signal with a "negative" (??) frequency, at fs/4,
 //             into the FFT ?
 //       const int t4[4]={ 0,1,0,-1 };
 //       for(i=0; i<pPAM->m_i32SamplesPerDFT; ++i)
 //        { pPAM->fltFftBuf_re[i] = t4[(i+0)&3];  // "re" LAGS "im" ...
 //          pPAM->fltFftBuf_im[i] = t4[(i+1)&3];  // (not sure if this should be
 //          // a positive or negative frequency.. see the NCO multiplication..)
 //        }
 //      Test result (directly after calling FFT_CalcComplexFft(8192,..) ) :
 //      pPAM->fltFftBuf_re[6144] =   0
 //      pPAM->fltFftBuf_im[6144] = 8192   (all other bins were ZERO)
 //
 //  *  TEST D: feed a test signal with "the largest possible" frequency, at fs/2,
 //       into the FFT. Note: It's impossible to say if
 //       this frequency is "negative" or "positive". Go figure....
 //        for(i=0; i<pPAM->m_i32SamplesPerDFT; ++i)
 //         { pPAM->fltFftBuf_re[i] = (i&1) ? 1 : -1;
 //           pPAM->fltFftBuf_im[i] = 0;
 //           // Try to shift the phase for the Q-channel by +90 or -90 :
 //           //     the result is always zero .. so no "leading" nor "lagging" !
 //           // Conclusion: The complex frequency bin at index <FftSize/2>
 //           // is 'something special', just as well as the "DC" bin at index zero;
 //           // but for real-world application the "DC" bin is much more important
 //           // than this 'maximum-possible-frequency' bin (at the Shannon limit).
 //         }
 //       Test result (directly after calling FFT_CalcComplexFft(8192,..) ) :
 //       pPAM->fltFftBuf_re[4096] = -8192
 //       pPAM->fltFftBuf_im[4096] =   0   (all other bins were ZERO)
 //
 // Benchmarks on a HP laptop, Intel i7 CPU in "economy" power setting,
 //               32768 complex samples in/out :
 //  PFFFT (Pretty Fast FFT by Julien Pommier) : 1.17 ms / FFT (using SIMD instructions, only possible in GCC but not Borland)
 //  FFTPACK (old Fortran code converted to C) : 2.45 ms / FFT
 //  original FFT_CalcComplexFft( 2k)       : 4.7 ms  / FFT
 //  original FFT_CalcComplexInverseFft(32k): 4.7 ms  / FFT (almost the same as FFT_CalcComplexFft)

{  // begin FFT_CalcComplexFft() ...


#if( SWI_USE_OOURAS_FFT ) // Use Takuya Ooura's FFT (fftsg_h.c) :
  // Local variables for the variant with Ooura's FFT :
  // Note that much in contrast to the FFT from the 'DSP Guide',
  //  Ooura's FFT uses a single array with complex numbers .
  //  For MANY (if not all) applications, this is more convenient,
  //  but -unfortunately- utterly incompatible with the older FFT  :-(
  //  So the data must pass through a temporary, interleaved array (pfltTemp).
  float *pfltTemp, *pflt;
  int i;
#endif // SWI_USE_OOURAS_FFT ?


#if ( ! SWI_USE_OOURAS_FFT ) && ( ! SWI_USE_KISS_FFT )
  // Here, for comparison and because of THIS FFT's simplicity,
  //       a clean, textbook style FFT based on the 'DSP Guide' .
  //       The variable names (and the numbers in some of the comments below)
  //       follow the BASIC listing from which this code was translated.
  int I,J,JM1,K,L,M,LE,LE2, IP;
  int NM1 = iNrOfPoints - 1;
  int ND2 = iNrOfPoints / 2;
  float UR, UI, SR, SI, TR, TI;
#endif // ( ! SWI_USE_OOURAS_FFT ) && ( ! SWI_USE_KISS_FFT )



#if ( ! SWI_USE_OOURAS_FFT ) && ( ! SWI_USE_KISS_FFT )
  // ex: m = CINT(LOG(N%)/LOG(2))
  M = 0; I=iNrOfPoints; while(I>1){ ++M; I = (I>>1); }   // -> m = log2( n )
  J = ND2;

  DOBINI();

  // Step 1: Re-order the input samples. J runs through the bit-reversed
  //         counterparts of I; each pair is swapped only once (if I<J).
  for(I=1; I<NM1; ++I)                  // Bit reversal sorting
   {
     if(I<J) // 1120   IF I% >= J% THEN GOTO 1190
      { TR = pfltRe[J];
        TI = pfltIm[J];
        pfltRe[J] = pfltRe[I];   // 2008-08-19: Crashed here with an access violation,
           // after switching from the "default config" to "SDR-IQ with audio output".
        pfltIm[J] = pfltIm[I];
        pfltRe[I] = TR;
       pfltIm[I] = TI;
      }
     K = ND2;    // 1190

     while(K<=J) // 1200   IF K% > J% THEN GOTO 1240
      { J = J - K;
        K = K / 2;
      }          // 1230  GOTO 1200
     J += K;     // 1240   J% = J%+K%
   }             // 1250 NEXT I%

  DOBINI();

  // Step 2: M stages of butterflies. In stage L, sub-DFTs of LE = 2^L points
  //         are combined from two halves with LE2 = LE/2 points each.
  for( L=1; L<=M; ++L)             // 1270 Loop for each stage
   {
     LE = 1<<L;   // 1280  LE% = CINT(2^L%)
     LE2 = LE/2;  // 1290  LE2% = LE%/2
     UR = 1;      // (UR,UI) = complex 'twiddle factor', starts at 1 + j*0
     UI = 0;
     DOBINI();
     // Use the standard trig functions instead of table lookup.
     // (these calculations are rarely done; not worth to eliminate sin+cos here)
     // (SR,SI) is the complex factor by which (UR,UI) is rotated for each J.
     SR = cos(C_PI/(float)LE2);   // Calculate sine & cosine values
     SI = -sin(C_PI/(float)LE2);
     for(J=1; J<=LE2; ++J)        // 1340 Loop for each sub DFT
      { JM1 = J-1;
        DOBINI();
        for(I=JM1; I<=NM1; I+=LE) // 1360 Loop for each butterfly
         { IP = I+LE2;
           TR = pfltRe[IP]*UR - pfltIm[IP]*UI;  // Butterfly calculation
           TI = pfltRe[IP]*UI + pfltIm[IP]*UR;
           pfltRe[IP] = pfltRe[I]-TR;
           pfltIm[IP] = pfltIm[I]-TI;
           pfltRe[I]  = pfltRe[I]+TR;
           pfltIm[I]  = pfltIm[I]+TI;
         } // NEXT I
        DOBINI();
        // Rotate the twiddle factor: (UR,UI) *= (SR,SI). TR keeps the old UR,
        // because UR is overwritten before UI is calculated.
        TR = UR;                  // 1450
        UR = TR*SR - UI*SI;
        UI = TR*SI + UI*SR;
      } // NEXT J
     DOBINI();
   } // NEXT L
#endif // < use 'plain textbook-style FFT from the 'DSP Guide' > ?

#if( SWI_USE_OOURAS_FFT ) // Use Takuya Ooura's FFT (fftsg_h.c) :
  // Note that much in contrast to the FFT from the 'DSP Guide',
  //  Ooura's FFT uses a single array with complex numbers .
  //  For MANY (if not all) applications, this is more convenient,
  //  but -unfortunately- utterly incompatible with the older FFT  :-(
  // The temporary array is allocated per call (not a static buffer),
  // because this function ...
  pfltTemp = (float*)malloc( ( 2*iNrOfPoints ) * sizeof(float) );   // MUST BE THREAD-SAFE / reentrant !
  if( pfltTemp != NULL )
   {
     // Combine the input (separate real and imaginary parts) .
     // It remained unclear why the imaginary part had to be inverted
     // before and after the FFT, to produce the same result
     // with Ooura's FFT as with the simple FFT code from the DSP-Guide !
     // NOTE(review): Negating the imaginary part of input AND output is
     //     equivalent to flipping the sign of the exponent in the DFT.
     //     Presumably cdft_flt(.., 1, ..) uses the opposite sign convention
     //     than the 'DSP Guide' code - confirm with the notes in FFT_Ooura.c .
     pflt = pfltTemp;
     for(i=0;i<iNrOfPoints; ++i)
      { *pflt++ =  pfltRe[i];
        *pflt++ = -pfltIm[i];   // !!!?!
      }

     cdft_flt( 2*iNrOfPoints, 1/*forward*/, pfltTemp );
     // Complex Discrete Fourier Transform  .
     // > Usage:  cdft_flt(2*n, 1, a) : forward FFT, single precision float
     // > Parameters:
     // >   2*n            :data length (int)
     // >                   n >= 1, n = power of 2
     // >   a[0...2*n-1]   :input/output data (float *)
     // >                   input data
     // >                       a[2*j] = Re(x[j]),
     // >                       a[2*j+1] = Im(x[j]), 0<=j<n
     // >                   output data
     // >                       a[2*k] = Re(X[k]),
     // >                       a[2*k+1] = Im(X[k]), 0<=k<n
     //

     // Split up the real and imaginary part again:
     pflt = pfltTemp;
     for(i=0;i<iNrOfPoints; ++i)
      { pfltRe[i] =  *pflt++;
        pfltIm[i] = -*pflt++;   // !!
      }
     // Remember, the output of the complex FFT from 'DSP-Guide' chapter 12 was this :
     //   - pfltXX[1]     contained the smallest positive frequency
     //   - pfltXX[N/2-1] contained the largest positive frequency
     //   - pfltXX[N/2]   contained the most negative frequency ("very negative")
     //   - pfltXX[N-1]   contained the smallest negative frequency ("small but negative")
     //   - pfltXX[0]     obviously contained the "DC" bin (?)
     free( pfltTemp );    // clean up the temporary 'complex' array WITHOUT guard-area
   }
  else
   { // heavens no; running out of memory on a PC ?!
     // Nothing is done in this case: the caller's arrays still contain
     // the (untransformed) input, and there is no way to report the error.
   }

#endif // SWI_USE_OOURAS_FFT ?



  DOBINI();
} // end FFT_CalcComplexFft()  [ for SINGLE PRECISION, i.e. 4-byte floating point ]

#if( SWI_FFT_NEED_DOUBLE_PRECISION )
//---------------------------------------------------------------------------
void FFT_CalcComplexFft_Double(  // in-place complex FFT, double precision variant of FFT_CalcComplexFft()
          int iNrOfPoints,  // N =  number of points in the DFT *AND* in the time domain
          double *pdblRe,   // REX[] = real parts of input and output      (N entries)
          double *pdblIm )  // IMX[] = imaginary parts of input and output (N entries)
{ // Forward FFT for complex signals using DOUBLE precision floats.
  // The transform is done IN PLACE: pdblRe[] / pdblIm[] deliver the time domain
  // and receive the frequency domain. The result is NOT scaled (no division by N).
  // Processing steps, as visible below:
  //   1. "bit reversal sorting" of the input samples,
  //   2. log2(N) butterfly stages; stage L combines sub-DFTs of length 2^(L-1)
  //      into sub-DFTs of length 2^L .
  // The number of stages is derived by counting right-shifts of N, so the
  // stages only cover N points when N is a power of two.
  //   NOTE(review): presumably callers always pass a power of two - confirm.
  // The numbers in some trailing comments (1120, 1190, ..) look like line numbers
  // of the BASIC listing this code was ported from.
  // Not used by Spectrum Lab so don't care about optimisation. KEEP IT SIMPLE!
  int I,J,JM1,K,L,M,LE,LE2, IP;
  int NM1 = iNrOfPoints - 1;    // highest valid array index
  int ND2 = iNrOfPoints / 2;    // N/2
  double UR, UI, SR, SI, TR, TI; // U = running twiddle factor, S = its rotation step, T = temporary

  DOBINI();

  // ex: m = CINT(LOG(N%)/LOG(2))
  M = 0; I=iNrOfPoints; while(I>1){ ++M; I = (I>>1); }   // -> m = log2( n ), rounded down
  J = ND2;   // J runs through the bit-reversed counterparts of I (starting with the one for I=1)

  DOBINI();
  for(I=1; I<NM1; ++I)                  // Bit reversal sorting
   {
     if(I<J) // 1120   IF I% >= J% THEN GOTO 1190
      { // swap samples I and J; the test I<J makes sure each pair is exchanged only once
        TR = pdblRe[J];
        TI = pdblIm[J];
        pdblRe[J] = pdblRe[I];
        pdblIm[J] = pdblIm[I];
        pdblRe[I] = TR;
       pdblIm[I] = TI;
      }
     K = ND2;    // 1190

     // Advance J to the bit-reversed counterpart of I+1 :
     //  clear leading '1' bits (from the top downwards), then set the first '0' bit.
     while(K<=J) // 1200   IF K% > J% THEN GOTO 1240
      { J = J - K;
        K = K / 2;
      }          // 1230  GOTO 1200
     J += K;     // 1240   J% = J%+K%
   }             // 1250 NEXT I%

  DOBINI();
  for( L=1; L<=M; ++L)             // 1270 Loop for each stage
   {
     LE = 1<<L;   // 1280  LE% = CINT(2^L%)   = length of the sub-DFTs produced by this stage
     LE2 = LE/2;  // 1290  LE2% = LE%/2       = distance between the two inputs of a butterfly
     UR = 1;      // twiddle factor U starts at 1 + j*0 ...
     UI = 0;
     // Use the standard trig functions instead of table lookup.
     // (these calculations are rarely done; not worth to eliminate sin+cos here)
     SR = cos(C_PI/(double)LE2);   // Calculate sine & cosine values
     SI = -sin(C_PI/(double)LE2);  // .. i.e. S = exp( -j * pi / LE2 )
     for(J=1; J<=LE2; ++J)        // 1340 Loop for each sub DFT
      { JM1 = J-1;
        for(I=JM1; I<=NM1; I+=LE) // 1360 Loop for each butterfly
         { IP = I+LE2;            // index of the butterfly's second input
           TR = pdblRe[IP]*UR - pdblIm[IP]*UI;  // Butterfly calculation: T = X[IP] * U  (complex product)
           TI = pdblRe[IP]*UI + pdblIm[IP]*UR;
           pdblRe[IP] = pdblRe[I]-TR;           // X[IP] = X[I] - T   (must be written before X[I] is modified)
           pdblIm[IP] = pdblIm[I]-TI;
           pdblRe[I]  = pdblRe[I]+TR;           // X[I]  = X[I] + T
           pdblIm[I]  = pdblIm[I]+TI;
         } // NEXT I
        // Rotate the twiddle factor for the next J :  U = U * S  (complex product).
        // TR keeps the old UR because UI must be calculated from the un-modified value.
        TR = UR;                  // 1450
        UR = TR*SR - UI*SI;
        UI = TR*SI + UI*SR;
      } // NEXT J
   } // NEXT L
  DOBINI();

} // end FFT_CalcComplexFft_Double()
#endif // SWI_FFT_NEED_DOUBLE_PRECISION ?

//---------------------------------------------------------------------------
void FFT_SortComplexFftForIncreasingFreqBins(
          int iNrOfPoints,    // N =  number of points in the DFT *AND* in the time domain
          float *pfltRe,      // REX[] = real parts, re-ordered in place
          float *pfltIm )     // IMX[] = imaginary parts, re-ordered in place
  // Re-orders the result from FFT_CalcComplexFft() (=the DFT) so that the
  // bins are sorted by increasing frequency. Implemented by passing each of
  // the two arrays, together with N, to FFT_SwapFloatArrayHalves() .
  //   NOTE(review): that helper is not visible here; judging by its name it
  //   exchanges the lower and the upper half of the array - confirm.
  // Expected order after the call (the "(?)" is inherited from the original author):
  //   - pfltRe+Im[0]     contains the most negative frequency (?)
  //   - pfltRe+Im[N/2-1] contains the smallest negative frequency
  //   - pfltRe+Im[N/2]   contains the DC component
  //   - pfltRe+Im[N/2+1] contains the smallest positive frequency
  //   - pfltRe+Im[N-1]   contains the largest positive frequency
{
   FFT_SwapFloatArrayHalves(pfltRe, iNrOfPoints );  // real parts
   FFT_SwapFloatArrayHalves(pfltIm, iNrOfPoints );  // imaginary parts, same re-ordering
} // end FFT_SortComplexFftForIncreasingFreqBins()

#if( SWI_FFT_NEED_DOUBLE_PRECISION )
//---------------------------------------------------------------------------
void FFT_SortComplexFftForIncreasingFreqBins_Double(
          int iNrOfPoints,    // N =  number of points in the DFT *AND* in the time domain
          double *pfltRe,     // REX[] = real parts, re-ordered in place (DOUBLE, despite the 'pflt' prefix)
          double *pfltIm )    // IMX[] = imaginary parts, re-ordered in place (DOUBLE, despite the 'pflt' prefix)
  // Double precision counterpart of FFT_SortComplexFftForIncreasingFreqBins() :
  // passes each of the two arrays, together with N, to FFT_SwapDoubleArrayHalves() .
  //   NOTE(review): that helper is not visible here; judging by its name it
  //   exchanges the lower and the upper half of the array - confirm.
  // Details about the intended bin order in FFT_SortComplexFftForIncreasingFreqBins() .
{
   FFT_SwapDoubleArrayHalves(pfltRe, iNrOfPoints );  // real parts
   FFT_SwapDoubleArrayHalves(pfltIm, iNrOfPoints );  // imaginary parts, same re-ordering
}
#endif // SWI_FFT_NEED_DOUBLE_PRECISION ?

//---------------------------------------------------------------------------
void FFT_CalcComplexInverseFft(
       int iNrOfPoints, // N = number of complex points (frequency domain AND time domain)
       float *pfltRe,   // REX[] = in: real parts of the frequency domain, out: re(time domain)
       float *pfltIm )  // IMX[] = in: imag. parts of the frequency domain, out: im(time domain)
 //  INVERSE FFT FOR COMPLEX SIGNALS - inspired by [SGDSP] TABLE 12-5 .
 //  Works in place, and re-uses the FORWARD transform :
 //     conjugate the spectrum -> forward FFT -> conjugate again and scale by 1/N .
 //
 //  Layout of the frequency domain on entry (as far as the original author
 //  could tell; he wrote "seems to") :
 //    [0 .. N/2-1] : positive frequencies, index 0 = DC, rising with the index;
 //    [N/2 .. N-1] : negative frequencies, [N/2] being the "most negative" one,
 //                   getting "less negative" with rising index.
 //  On return, both arrays hold the complex time domain, index 0 to N-1.
 //
 //  Benchmarks: Similar as for FFT_CalcComplexFft() [details THERE] .
{
  int   iIndex;
  float fltScale;   // 1/N, applied to the result of the forward transform

  DOBINI();

  // Step 1 : complex conjugate of the spectrum (flip the sign of IMX[ ])
  iIndex = 0;
  while( iIndex < iNrOfPoints )
   { pfltIm[iIndex] = -pfltIm[iIndex];
     ++iIndex;
   }

  DOBINI();
  // Step 2 : forward FFT, in place
  FFT_CalcComplexFft( iNrOfPoints, pfltRe, pfltIm );
  DOBINI();

  // Step 3 : divide the time domain by N, and undo the conjugation
  fltScale = 1.0 / (float)iNrOfPoints;
  iIndex = 0;
  while( iIndex < iNrOfPoints )
   { pfltRe[iIndex] *= fltScale;
     pfltIm[iIndex]  = fltScale * -pfltIm[iIndex];
     ++iIndex;
   }
  DOBINI();

} // end FFT_CalcComplexInverseFft()


#if( SWI_FFT_NEED_DOUBLE_PRECISION )
//---------------------------------------------------------------------------
void FFT_CalcComplexInverseFft_Double(  // double precision variant of FFT_CalcComplexInverseFft()
       int iNrOfPoints, // N = number of complex points (frequency domain AND time domain)
       double *pfltRe,   // REX[] = in: real parts of the frequency domain, out: re(time domain)
       double *pfltIm )  // IMX[] = in: imag. parts of the frequency domain, out: im(time domain)
 //  Inverse complex FFT, in place, built on the FORWARD transform :
 //     conjugate the spectrum -> forward FFT -> conjugate again and scale by 1/N .
 //  (the arrays are DOUBLE here, despite the 'pflt' prefix in the names)
 //  For details, see FFT_CalcComplexInverseFft() !
{
  int    iIndex;
  double dblScale;  // 1/N, applied to the result of the forward transform

  DOBINI();

  // Step 1 : complex conjugate of the spectrum (flip the sign of IMX[ ])
  iIndex = 0;
  while( iIndex < iNrOfPoints )
   { pfltIm[iIndex] = -pfltIm[iIndex];
     ++iIndex;
   }

  DOBINI();
  // Step 2 : forward FFT, in place
  FFT_CalcComplexFft_Double( iNrOfPoints, pfltRe, pfltIm );
  DOBINI();

  // Step 3 : divide the time domain by N, and undo the conjugation
  dblScale = 1.0 / (double)iNrOfPoints;
  iIndex = 0;
  while( iIndex < iNrOfPoints )
   { pfltRe[iIndex] *= dblScale;
     pfltIm[iIndex]  = dblScale * -pfltIm[iIndex];
     ++iIndex;
   }
  DOBINI();

} // end FFT_CalcComplexInverseFft_Double()
#endif // SWI_FFT_NEED_DOUBLE_PRECISION ?

//---------------------------------------------------------------------------
void FFT_CalcRealFft(
          int iNrOfPoints, // number of points in the time domain (input), 2^N
          float *pfltRe,   // the real input signal, also used as result (real part)
          float *pfltIm )  // output, imaginary part
 // FFT FOR REAL SIGNALS  - inspired by [SGDSP] TABLE 12-7 / .
 // Upon entry, iNrOfPoints contains the number of points in the "DFT" (oh really?!),
 //              pfltRe[0..iNrOfPoints-1] contains the real input signal,
 //              while values in pfltIm[ ] are ignored.
 //              The INPUT signals run from 0 to iNrOfPoints-1  .
 // Upon return, pfltRe[ ] & pfltIm[ ] contain the DFT output.
 //
 // Output range:  A pure sine wave ... A*sin(wt)... will produce an
 //                fft output peak of (N*A/4)^2  where N is FFT_SIZE.
 //   [ this is for a HANN- or similar window, with m_fltWindowAvrg=0.5 ]
 //
 //  Note: The output signals run from  0...iNrOfPoints/2 !
 //        A "1024 point REAL FFT" produces 513(!) POINTS in re[]
 //                                     and 513(!) POINTS in im[] !
 //
 //  CAUTION: Even though only bins 0..N/2 are meant to be used by the caller,
 //        BOTH arrays must provide room for iNrOfPoints elements:
 //        the decomposition and the last butterfly stage below
 //        read and write pfltRe[] AND pfltIm[] up to index iNrOfPoints-1 .
{
  int I, IM, IP, IP2, IPM, J,JM1, LE, LE2, NH, NM1, ND2, N4;
  float UR, UI, SR, SI, TR, TI;  // U = running twiddle factor, S = its rotation step, T = temporary

    // Info: "The Scientist and Engineer's Guide to Digital Signal Processing",
    //        www.DSPguide.com, chapter 12 :
    // Even/odd decomposition for the REAL-input FFT
    // -----------------------------------------------------------------
    // > The input signal is broken in by half using an interlaced decomposition.
    // > The N/2 even points are placed into the real part of the
    // > time domain signal, while the N/2 odd points go into the imaginary part.
    // > An N/2 point FFT is then calculated, requiring about one-half
    // > the time as an N point FFT. The resulting frequency domain is then
    // > separated by the even/odd decomposition, resulting in the frequency
    // > spectra of the two interlaced time domain signals.
    // > These two frequency spectra are then combined into a single spectrum,
    // > just as in the last synthesis stage of the FFT.

  // Separate even and odd points
  //  (safe in place: the read index 2*I is never below the write index I)
  NH = iNrOfPoints/2-1;
  for(I=0; I<=NH; ++I)
   { pfltRe[I] = pfltRe[2*I];     // even samples -> "real part"
     pfltIm[I] = pfltRe[2*I+1];   // odd samples  -> "imaginary part"
   }

  // Calculate N/2 point FFT complex FFT
  FFT_CalcComplexFft(
             iNrOfPoints / 2, // N =  number of points in the DFT
             pfltRe,          // real parts of input and output
             pfltIm );        // imaginary parts of input and output


  // Even/odd frequency domain decomposition :
  //  Splits the N/2-point result into two spectra. Indices I and IM (lower half)
  //  receive the spectrum belonging to the "real part" (= even samples),
  //  indices IP2 and IPM (upper half, offset by N/2) the one belonging to
  //  the "imaginary part" (= odd samples).
  //  The ORDER of the statements matters: the upper-half values are calculated
  //  first, from the still un-modified pfltRe/pfltIm[I] and [IM] .
  NM1 = iNrOfPoints-1 ;
  ND2 = iNrOfPoints/2 ;
  N4  = iNrOfPoints/4-1;
  for(I=1; I<=N4; ++I)
   { IM = ND2-I;      // mirror index of I within the lower half
     IP2 = I+ND2;     // I  shifted into the upper half
     IPM = IM+ND2;    // IM shifted into the upper half ( = N - I )
     pfltRe[IP2] = (pfltIm[I] + pfltIm[IM]) * 0.5;
     pfltRe[IPM] =  pfltRe[IP2];
     pfltIm[IP2] = -(pfltRe[I] - pfltRe[IM]) * 0.5;
     pfltIm[IPM] = -pfltIm[IP2];
     pfltRe[I]   = (pfltRe[I] + pfltRe[IM]) * 0.5;
     pfltRe[IM]  =  pfltRe[I];
     pfltIm[I]   = (pfltIm[I] - pfltIm[IM]) * 0.5;  // pfltIm[I] and [IM] were not modified until here
     pfltIm[IM]  = -pfltIm[I];
   }
  // The bins 0 and N/4 (and their upper-half counterparts N/2 and 3N/4)
  // are not covered by the loop above, and are treated separately :
  pfltRe[iNrOfPoints*3/4] = pfltIm[iNrOfPoints/4];
  pfltRe[ND2] = pfltIm[0];
  pfltIm[iNrOfPoints*3/4] = 0;
  pfltIm[ND2] = 0;
  pfltIm[iNrOfPoints/4] = 0;
  pfltIm[0]   = 0;

  DOBINI();

  // Complete the last FFT stage
  // L  = CINT(LOG(N)/LOG(2));
  LE = 0; I=iNrOfPoints; while(I>1){ ++LE; I=(I>>1); } // -> LE = log2( N )
  LE = 1<<LE;  // LE = CINT(2^LE);   ( = N, if N is a power of two )

  LE2= LE/2;   // distance between the two inputs of a butterfly
  UR = 1;      // twiddle factor U starts at 1 + j*0 ...
  UI = 0;
  SR =  cos(C_PI/(float)LE2); // only once per calculation.. no need for an array
  SI = -sin(C_PI/(float)LE2); // .. S = exp( -j * pi / LE2 )
  for(J=1; J<=LE2; ++J)
   { JM1 = J-1;
     for( I=JM1; I<=NM1; I+=LE )
      { IP = I+LE2;
        TR = pfltRe[IP]*UR - pfltIm[IP]*UI;  // T = X[IP] * U  (complex product)
        TI = pfltRe[IP]*UI + pfltIm[IP]*UR;
        pfltRe[IP] = pfltRe[I]-TR;           // X[IP] = X[I] - T  (before X[I] is modified)
        pfltIm[IP] = pfltIm[I]-TI;
        pfltRe[I]  = pfltRe[I]+TR;           // X[I]  = X[I] + T
        pfltIm[I]  = pfltIm[I]+TI;
      }
     // Rotate the twiddle factor: U = U * S . TR keeps the old UR for the second line.
     TR = UR;
     UR = TR*SR - UI*SI;
     UI = TR*SI + UI*SR;
   } // NEXT J%

  DOBINI();

} // end FFT_CalcRealFft()

#if( SWI_FFT_NEED_DOUBLE_PRECISION )
//---------------------------------------------------------------------------
void FFT_CalcRealFft_Double(  // similar as FFT_CalcRealFft(), for double precision
          int iNrOfPoints, // number of points in the time domain (input), 2^N
          double *pfltRe,   // the real input signal, also used as result (real part); DOUBLE despite the 'pflt' prefix
          double *pfltIm )  // output, imaginary part; input values are ignored (overwritten)
 // FFT for REAL input signals, double precision, calculated in place:
 //   even/odd split of the input -> N/2 point complex FFT
 //   -> even/odd decomposition in the frequency domain -> last butterfly stage.
 // CAUTION: BOTH arrays must provide room for iNrOfPoints elements, because
 //   the code below reads and writes them up to index iNrOfPoints-1 .
 // For details, see FFT_CalcRealFft() !
{

  int I, IM, IP, IP2, IPM, J,JM1, LE, LE2, NH, NM1, ND2, N4;
  double UR, UI, SR, SI, TR, TI;  // U = running twiddle factor, S = its rotation step, T = temporary

  // Separate even and odd points
  //  (safe in place: the read index 2*I is never below the write index I)
  NH = iNrOfPoints/2-1;
  for(I=0; I<=NH; ++I)
   { pfltRe[I] = pfltRe[2*I];     // even samples -> "real part"
     pfltIm[I] = pfltRe[2*I+1];   // odd samples  -> "imaginary part"
   }

  // Calculate N/2 point FFT complex FFT
  FFT_CalcComplexFft_Double(
             iNrOfPoints / 2, // N =  number of points in the DFT
             pfltRe,          // real parts of input and output
             pfltIm );        // imaginary parts of input and output

  // Even/odd frequency domain decomposition :
  //  Indices I and IM (lower half) receive the spectrum belonging to the
  //  "real part" (= even samples), indices IP2 and IPM (upper half) the one
  //  belonging to the "imaginary part" (= odd samples).
  //  The ORDER of the statements matters: the upper-half values are calculated
  //  first, from the still un-modified pfltRe/pfltIm[I] and [IM] .
  NM1 = iNrOfPoints-1 ;
  ND2 = iNrOfPoints/2 ;
  N4  = iNrOfPoints/4-1;
  for(I=1; I<=N4; ++I)
   { IM = ND2-I;      // mirror index of I within the lower half
     IP2 = I+ND2;     // I  shifted into the upper half
     IPM = IM+ND2;    // IM shifted into the upper half ( = N - I )
     pfltRe[IP2] = (pfltIm[I] + pfltIm[IM]) * 0.5;
     pfltRe[IPM] =  pfltRe[IP2];
     pfltIm[IP2] = -(pfltRe[I] - pfltRe[IM]) * 0.5;
     pfltIm[IPM] = -pfltIm[IP2];
     pfltRe[I]   = (pfltRe[I] + pfltRe[IM]) * 0.5;
     pfltRe[IM]  =  pfltRe[I];
     pfltIm[I]   = (pfltIm[I] - pfltIm[IM]) * 0.5;  // pfltIm[I] and [IM] were not modified until here
     pfltIm[IM]  = -pfltIm[I];
   }
  // The bins 0 and N/4 (and their upper-half counterparts N/2 and 3N/4)
  // are not covered by the loop above, and are treated separately :
  pfltRe[iNrOfPoints*3/4] = pfltIm[iNrOfPoints/4];
  pfltRe[ND2] = pfltIm[0];
  pfltIm[iNrOfPoints*3/4] = 0;
  pfltIm[ND2] = 0;
  pfltIm[iNrOfPoints/4] = 0;
  pfltIm[0]   = 0;


  // Complete the last FFT stage
  LE = 0; I=iNrOfPoints; while(I>1){ ++LE; I=(I>>1); } // -> LE = log2( N )
  LE = 1<<LE;  // LE = CINT(2^LE);   ( = N, if N is a power of two )

  LE2= LE/2;   // distance between the two inputs of a butterfly
  UR = 1;      // twiddle factor U starts at 1 + j*0 ...
  UI = 0;
  SR =  cos(C_PI/(double)LE2); // only once per calculation.. no need for an array
  SI = -sin(C_PI/(double)LE2); // .. S = exp( -j * pi / LE2 )
  for(J=1; J<=LE2; ++J)
   { JM1 = J-1;
     for( I=JM1; I<=NM1; I+=LE )
      { IP = I+LE2;
        TR = pfltRe[IP]*UR - pfltIm[IP]*UI;  // T = X[IP] * U  (complex product)
        TI = pfltRe[IP]*UI + pfltIm[IP]*UR;
        pfltRe[IP] = pfltRe[I]-TR;           // X[IP] = X[I] - T  (before X[I] is modified)
        pfltIm[IP] = pfltIm[I]-TI;
        pfltRe[I]  = pfltRe[I]+TR;           // X[I]  = X[I] + T
        pfltIm[I]  = pfltIm[I]+TI;
      }
     // Rotate the twiddle factor: U = U * S . TR keeps the old UR for the second line.
     TR = UR;
     UR = TR*SR - UI*SI;
     UI = TR*SI + UI*SR;
   } // NEXT J%

  DOBINI();

} // end FFT_CalcRealFft_Double()
#endif // SWI_FFT_NEED_DOUBLE_PRECISION ?

//---------------------------------------------------------------------------
void FFT_CalcRealInverseFft(
          int iNrOfPoints, // N = number of points IN THE TIME DOMAIN (result)
          float *pfltRe,   // REX[] = in: real parts of the frequency domain, out: real time domain
          float *pfltIm )  // IMX[] = in: imaginary parts of the frequency domain, out: all zero
 //  INVERSE FFT FOR REAL SIGNALS  - inspired by [SGDSP] TABLE 12-6 .
 //  On entry, only the bins [0 .. N/2] of REX[ ] and IMX[ ] are evaluated;
 //  whatever is stored above N/2 is overwritten with the mirrored spectrum.
 //  On return, REX[0..N-1] holds the real time domain, and IMX[0..N-1]
 //  has been SET to zero (so IMX[] cannot be used to check the algorithm).
 //  Both arrays must provide room for N elements, because both are
 //  written up to index N-1 here.
 //  Historic note (2008-08-15): an access violation was once observed inside
 //  FFT_CalcRealFft() when called from here, after switching from the
 //  "default configuration" to "SDR-IQ with converter and audio filter".
 //  The cause was never confirmed ("Multithreading ?").
{
  int   iBin, iMirrorBin;
  float fltScale;   // 1/N

  DOBINI();

  // Make the frequency domain symmetrical (as in [SGDSP] Table 12-1) :
  // each bin above N/2 gets the complex conjugate of its mirror bin.
  iBin = iNrOfPoints/2 + 1;
  while( iBin < iNrOfPoints )
   { iMirrorBin   = iNrOfPoints - iBin;
     pfltRe[iBin] =  pfltRe[iMirrorBin];
     pfltIm[iBin] = -pfltIm[iMirrorBin];
     ++iBin;
   }

  DOBINI();
  // Add real and imaginary parts together; the sum is the input of the REAL forward FFT
  for( iBin=0; iBin<iNrOfPoints; ++iBin )
   { pfltRe[iBin] += pfltIm[iBin];
   }

  DOBINI();
  // Forward real DFT (TABLE 12-6, ex: "GOSUB 3000"), in place
  FFT_CalcRealFft( iNrOfPoints, pfltRe, pfltIm );
  DOBINI();

  // Add real and imaginary parts together once more, and divide the time domain by N.
  // ( iNrOfPoints is the number of samples IN THE TIME DOMAIN again )
  fltScale = 1.0 / (float)iNrOfPoints;
  for( iBin=0; iBin<iNrOfPoints; ++iBin )
   { pfltRe[iBin] = fltScale * (pfltRe[iBin] + pfltIm[iBin]);
     pfltIm[iBin] = 0; // imaginary part of a real signal: zero, for "mathematical correctness"
   }
} // end FFT_CalcRealInverseFft()


#if( SWI_FFT_NEED_DOUBLE_PRECISION )
//---------------------------------------------------------------------------
void FFT_CalcRealInverseFft_Double(
          int iNrOfPoints, // N = number of points IN THE TIME DOMAIN (result)
          double *pfltRe,  // REX[] = in: real parts of the frequency domain, out: real time domain
          double *pfltIm)  // IMX[] = in: imaginary parts of the frequency domain, out: all zero
 // Same purpose as FFT_CalcRealInverseFft(), but for DOUBLE PRECISION
 // (the arrays are DOUBLE here, despite the 'pflt' prefix in the names).
 //  On entry, only the bins [0 .. N/2] of REX[ ] and IMX[ ] are evaluated;
 //  whatever is stored above N/2 is overwritten with the mirrored spectrum.
 //  On return, REX[0..N-1] holds the real time domain, IMX[0..N-1] is zero.
 //  Both arrays must provide room for N elements, because both are
 //  written up to index N-1 here.
 //  Historic note (2008-08-15): an access violation was once observed inside
 //  FFT_CalcRealFft() in this call sequence; the cause was never confirmed
 //  ("Multithreading ?").
{
  int    iBin, iMirrorBin;
  double dblScale;  // 1/N

  DOBINI();

  // Make the frequency domain symmetrical (as in [SGDSP] Table 12-1) :
  // each bin above N/2 gets the complex conjugate of its mirror bin.
  iBin = iNrOfPoints/2 + 1;
  while( iBin < iNrOfPoints )
   { iMirrorBin   = iNrOfPoints - iBin;
     pfltRe[iBin] =  pfltRe[iMirrorBin];
     pfltIm[iBin] = -pfltIm[iMirrorBin];
     ++iBin;
   }

  DOBINI();
  // Add real and imaginary parts together; the sum is the input of the REAL forward FFT
  for( iBin=0; iBin<iNrOfPoints; ++iBin )
   { pfltRe[iBin] += pfltIm[iBin];
   }

  DOBINI();
  // Forward real DFT (TABLE 12-6, ex: "GOSUB 3000"), in place
  FFT_CalcRealFft_Double( iNrOfPoints, pfltRe, pfltIm );
  DOBINI();

  // Add real and imaginary parts together once more, and divide the time domain by N.
  // ( iNrOfPoints is the number of samples IN THE TIME DOMAIN again )
  dblScale = 1.0 / (double)iNrOfPoints;
  for( iBin=0; iBin<iNrOfPoints; ++iBin )
   { pfltRe[iBin] = dblScale * (pfltRe[iBin] + pfltIm[iBin]);
     pfltIm[iBin] = 0; // imaginary part of a real signal: zero, for "mathematical correctness"
   }

  DOBINI();

} // end FFT_CalcRealInverseFft_Double()
#endif // SWI_FFT_NEED_DOUBLE_PRECISION

//---------------------------------------------------------------------------
void FFT_MultiplyHannWindow( float *pfltArray, int iLength ) // .. aka raised cosine
  // Multiplies pfltArray[0..iLength-1] IN PLACE with a symmetric Hann window,
  //      w[i] = 0.5 - 0.5 * cos( 2 * pi * i / (iLength-1) ) ,
  // so the first and the last sample are multiplied by (almost) zero.
  // CALCULATES the window coefficient in each call.
  // Much faster for 'continuous' signal processing :
  //   * Precalculate the window in an array ONCE : FFT_BuildWindowTable()
  //   * Multiply the samples with the window :     FFT_MultiplyWindow_XYZ()
  // Nothing happens if iLength is zero or negative.
{ int i;
  double dblAnglePerSample;  // phase advance (in radians) from one sample to the next

   // For a single sample there is no "last index" to divide by. A step of zero
   // avoids the division by zero, and leaves w[0] = 0.5 - 0.5*cos(0) = 0,
   // i.e. the same result as before and as for w[0] of any longer window.
   dblAnglePerSample = ( iLength > 1 ) ? ( (2.0 * C_PI) / (double)(iLength-1) ) : 0.0;

   for(i=0; i<iLength; i++) // multiply the table with FFT WINDOW FUNCTION ..
    { // The phase is derived from the sample index, in double precision.
      // Accumulating a single-precision angle ( angle += increment ) would add
      // one rounding error per sample, which distorts long windows noticeably.
      pfltArray[i] *= ( .5 - .5*cos( dblAnglePerSample * (double)i ) );
    }
} // end FFT_MultiplyHannWindow()


//---------------------------------------------------------------------------
float FFT_BuildWindowTable(
           float *pfltWindowTbl, // [out] window table, iFftSize entries (single precision)
           int   iFftSize,         // [in] number of points (usually 2^n)
           int   iWindowFunction)  // [in] FFT_WINDOW_HANN, etc etc
  // Builds a table with one of the usual FFT windowing functions,
  //  and returns the AVERAGE of that window (which is usually ~ 0.5),
  //  later used to normalize amplitudes.
  //  An unknown iWindowFunction produces a rectangular window (all ones).
  // Special cases:
  //  - iFftSize <= 0 : nothing is written to the table, the result is 0.0
  //                    (instead of the NaN from "0 / 0").
  //  - iFftSize == 1 : the 'symmetric' windows (Hann, Gauss), which divide
  //                    by N-1, deliver a single 1.0 (instead of NaN),
  //                    like the one-point windows of common DSP libraries.
  // The average is accumulated in double precision, because a single
  // precision sum loses accuracy for long tables.
{
  int i;
  double dblSum = 0.0;  // sum of all table entries (as stored, i.e. rounded to 'float')
  double dbl, z;

  DOBINI();
   for(i=0; i<iFftSize; i++) // fill table with FFT WINDOW FUNCTION ..
    { // 'z' introduced 2014-05-02 to simplify the implementation
      //     of the 'new' flat-top windows, as used in [HFTWIN],
      //     appendix C, "List of window functions",
      //     same for MOST (but not all) window functions calculated
      //     as a sum of cosines.
      // Example: C.5 Hamming window
      //        2 * pi * i
      //   z = ------------  ,     where i = 0... N-1,   N=number of points (discrete window length)
      //           N
      // Note 1: Engineers don't use 'j' as a counting index, because  j * j := -1 .
      // Note 2: [HFTWIN] divides by N, not N-1; aka use as "periodic" window.
      //         Because in SL, these windows are applied to periodic short-term fourier transforms,
      //         dividing by N (i.e. "periodic" window) seems appropriate.
      //
      //   w[i] = 0.54 - 0.46 * cos(z)   [coefficients of the Hamming window]
      //
      z = (2.0 * C_PI * (double)i) / (double)iFftSize; // divide by N=iFftSize or (N-1) ?
      //
      // The above 'z' applies to the following window functions as implemented in [HFTWIN] :
      //    Hamming, Blackman-Harris, all Nuttall windows, 'Salvatore' flat-top windows,
      //    'Old HP flat-top window', 'Stanford Research' flat-top window,
      //    and all 'New flat-top windows' by G. Heinzel (D.3) .
      // The above 'z' does definitely NOT apply to the Kaiser window in [HFTWIN],
      //    and it is NOT USED for the Hann and Gauss windows below (they divide by N-1).
      switch(iWindowFunction) // Pick a data windowing function:
       {
        case FFT_WINDOW_RECTANGLE:   // rectangle (bad but "fast reacting")
              pfltWindowTbl[i] = 1.0;
              break;
        case FFT_WINDOW_HAMMING:     // Hamming
              // Not significantly better (compared to the Hann window) .
              // ex: From http://en.wikipedia.org/wiki/Window_function :
              // pfltWindowTbl[i] = .53836 - .46164*cos( (2.0*C_PI*(float)i)/(float)(iFftSize-1));
              // From [HFTWIN] C.5 Hamming window:
              pfltWindowTbl[i] = 0.54 - 0.46 * cos(z);
              break;
        case FFT_WINDOW_HANN: // Hann aka "cos^2" aka "raised cosine" (still a good tradeoff between frequency resolution and dynamic range!)
              // From http://en.wikipedia.org/wiki/Window_function :
              // > The Hann window is sometimes called the "Hanning" window,
              // > in analogy to the Hamming window. However, this is incorrect,
              // > because the windows were named after Julius von Hann
              // > and Richard Hamming, respectively.
              if( iFftSize > 1 )
               { pfltWindowTbl[i] = .5 - .5*cos( (2.0*C_PI*(float)i)/(float)(iFftSize-1) );
               }
              else // single point: N-1 is zero, "0 / 0" would put a NaN into the table
               { pfltWindowTbl[i] = 1.0;
               }
              break;
        case FFT_WINDOW_GAUSS:       // Gauss window
              // Also found at http://en.wikipedia.org/wiki/Window_function .
              if( iFftSize > 1 )
               { dbl = ( (double)i-(iFftSize-1)/2.0 ) / ( 0.4 *(iFftSize-1)/2.0 );
                 pfltWindowTbl[i] = exp( -.5 * dbl * dbl );
               }
              else // single point = centre of the bell curve; avoids "0 / 0" = NaN
               { pfltWindowTbl[i] = 1.0;
               }
              break;
        case FFT_WINDOW_NUTTALL4B:  // Nuttall4b :
              // Low resolution but large dynamic range (low sidelobes, theor. 93 dB below main lobe).
              // See discussion / comparison with other (newer) Flat-Top windows in [HFTWIN] :
              // > The window called Nuttall4b is derived by requiring a SLDR of f^-3
              // > for a four-term function and using the remaining two degrees of freedom
              // > to minimize the PSLL (Peak SideLobe Level) .
              // > NENBW = 2.0212 bins  (Normalized Equivalent Noise BandWidth)
              // > PSLL  = -93.3 dB
              // > The first zero  is located  at  f  =  4.00  bins.
              // > The highest  sidelobe  is - 93.3 dB,  located  at f  =  4.57 bins.
              // > At the optimal overlap of 66.3%, the amplitude flatness is 0.924,
              // > the power flatness is 0.715, and the overlap correlation is 0.233 .
              //
              // Due to their almost 'flat top', this window (and similar ones)
              // is utterly un-suited for frequency measurements using SL's interpolation !
              //
              // ex: pfltWindowTbl[i] = .355768
              //              - .487396*cos( (2.0*C_PI*i)/(iFftSize-1) )
              //               + .144232*cos( (4.0*C_PI*i)/(iFftSize-1) )
              //               - .012604*cos( (6.0*C_PI*i)/(iFftSize-1) );
              pfltWindowTbl[i] = .355768 /* "c0" */
                               - .487396 /* "c1" */ * cos( 1.0/*"k"*/ * z )
                               + .144232 /* "c2" */ * cos( 2.0/*"k"*/ * z )
                               - .012604 /* "c3" */ * cos( 3.0/*"k"*/ * z );
              // In http://en.wikipedia.org/wiki/Window_function,
              //    the above window function is titled
              //  "Nuttall window, continuous first derivative"
              //    but the denominator (in "z") is N-1, not N  !
              break;

#if(0)  // old "flat top" window.  Which ? There are hundreds of flat-top windows out there,
        // and THIS ONE was one of the worst ever seen (compared to [HFTWIN]'s) ..
        case FFT_WINDOW_FLAT_TOP:    // Flat Top (for specialists too)
              // Low resolution, mediocre stopband attenuation,
              // but 'low passband ripple' for whatever it's worth.
              pfltWindowTbl[i] = 1.0
                               - 1.93*cos( (2.0*C_PI*i)/(iFftSize-1) )
                               + 1.29*cos( (4.0*C_PI*i)/(iFftSize-1) )
                               - 0.388*cos( (6.0*C_PI*i)/(iFftSize-1) )
                               + 0.032*cos( (8.0*C_PI*i)/(iFftSize-1) );
              break;
#endif

        case FFT_WINDOW_FLATTOP5F: // "Fast decaying 5-term flat top window" (from [HFTWIN] D.1.3, "SFT5F")
              // See discussion / comparison with other ("newer") Flat-Top windows in [HFTWIN] :
              // > NENBW = 4.3412 bins
              // > PSLL  = -57.3 dB
              // > emax = 0.0025 dB = 0.0282 %.
              pfltWindowTbl[i] = 0.1881   /* "c0" */
                               - .36923   /* "c1" */ * cos( 1.0/*"k"*/ * z )
                               + .28702   /* "c2" */ * cos( 2.0/*"k"*/ * z )
                               - .13077   /* "c3" */ * cos( 3.0/*"k"*/ * z )
                               + .02488   /* "c4" */ * cos( 4.0/*"k"*/ * z );
              break;

        case FFT_WINDOW_FLATTOP5M: // "Minimum sidelobe 5-term flat top window" (from [HFTWIN] D.1.6, "SFT5M")
              // See discussion / comparison with other ("newer") Flat-Top windows in [HFTWIN] :
              // > NENBW = 3.8852 bins
              // > PSLL  = -89.9 dB
              // > emax = 0.0039 dB = 0.0449 %.
              pfltWindowTbl[i] = 0.209671 /* "c0" */
                               - .407331  /* "c1" */ * cos( 1.0/*"k"*/ * z )
                               + .281225  /* "c2" */ * cos( 2.0/*"k"*/ * z )
                               - .092669  /* "c3" */ * cos( 3.0/*"k"*/ * z )
                               + .0091036 /* "c4" */ * cos( 4.0/*"k"*/ * z );
              break;

        case FFT_WINDOW_HFT95: // "Heinzel Flat-Top -95 dB sidelobe" (from [HFTWIN] D.3.2, "HFT95")
              // > This window was optimized for the lowest sidelobe level
              // > that is achieveable with 4 cosine terms.
              // > NENBW = 3.8112 bins
              // > PSLL  = -95.0 dB
              // > emax = 0.0044 dB = 0.0507 %.
              pfltWindowTbl[i] = 1.0 - 1.9383379*cos(z)     + 1.3045202*cos(2.0*z)
                                     - 0.4028270*cos(3.0*z) + 0.0350665*cos(4.0*z);
              break;

        case FFT_WINDOW_HFT144D: // "Heinzel Flat-Top -144 dB sidelobe" (from [HFTWIN] D.3.5, "HFT144D")
              // > This window was optimized for the lowest sidelobe level
              // > that is achieveable with 6 cosine terms (..)
              // > NENBW = 4.5386 bins
              // > PSLL  = -144.1 dB  (highest sidelobe located at f +/- 7.07 bins)
              // > emax  = 0.0021 dB
              pfltWindowTbl[i] = 1.0 - 1.96760033*cos(z)     + 1.57983607*cos(2.0*z)
                                     - 0.81123644*cos(3.0*z) + 0.22583558*cos(4.0*z)
                                     - 0.02773848*cos(5.0*z) + 0.00090360*cos(6.0*z);
              break;

     case FFT_WINDOW_HFT196D  :  /* "Heinzel Flat-Top -196 dB sidelobe" (from [HFTWIN] D.3.7, "HFT196D") */
              // > This window was optimized for the lowest sidelobe level
              // > that is achieveable with 8 cosine terms (..)
              // > NENBW = 5.1134 bins
              // > PSLL  = -196.2 dB  (highest sidelobe located at f +/- 9.06 bins)
              // > Optimal overlap = 82.3 %
              pfltWindowTbl[i] = 1.0 - 1.979280420*cos(z)     + 1.710288951*cos(2.0*z)
                                     - 1.081629853*cos(3.0*z) + 0.448734314*cos(4.0*z)
                                     - 0.112376628*cos(5.0*z) + 0.015122992*cos(6.0*z)
                                     - 0.000871252*cos(7.0*z) + 0.000011896*cos(8.0*z);
              break;
     case FFT_WINDOW_HFT248D  :  /* "Heinzel Flat-Top -248 dB sidelobe" (from [HFTWIN] D.3.9, "HFT248D") */
              // > This window was optimized for the lowest sidelobe level
              // > that is achieveable with 10 cosine terms (..)
              // > NENBW =  5.6512 bins
              // > PSLL  = -248.4 dB  (highest sidelobe located at f +/- 13.37 bins)
              // > Optimal overlap = 84.1 %
              // Exceeds the dynamic range of 32-bit floating point arithmetics,
              // but WB decided to keep this remarkable function for future projects
              // (which would use 'double precision' floating point numbers
              //  in the entire processing chain) .
              //
              pfltWindowTbl[i] = 1.0 - 1.985844164102*cos(z)     + 1.791176438506*cos(2.0*z)
                                     - 1.282075284005*cos(3.0*z) + 0.667777530266*cos(4.0*z)
                                     - 0.240160796576*cos(5.0*z) + 0.056656381764*cos(6.0*z)
                                     - 0.008134974479*cos(7.0*z) + 0.000624544650*cos(8.0*z)
                                     - 0.000019808998*cos(9.0*z) + 0.000000132974*cos(10.0*z);
              break;


        default: // unknown window type: fall back to 'rectangle', also for all following points
              iWindowFunction = FFT_WINDOW_RECTANGLE;
              pfltWindowTbl[i] = 1.0;   //rectangle
              break;
       } // end switch( window_function )
      dblSum += pfltWindowTbl[i];  // for 'window weighting factor'
    }  // end for

  DOBINI();
  if( iFftSize > 0 )  // an empty table has no average; don't calculate "0 / 0"
   { dblSum /= (double)iFftSize;  // -> RECT:1.0,  HAMMING:0.54,  HANN:0.5
   }
  return (float)dblSum; // returns the AVERAGE of the window function (later used to normalize amplitudes)
} // end FFT_BuildWindowTable()

//---------------------------------------------------------------------------
void FFT_MultiplyWindow_Real( float *pfltInputSamples, float *pfltWindow,
                              float *pfltDestSamples,  int iLength )
  // Applies a pre-calculated WINDOW to a block of samples in the time domain:
  //      pfltDestSamples[i] = pfltInputSamples[i] * pfltWindow[i]
  //      for i = 0 .. iLength-1  (nothing happens if iLength <= 0) .
  // Plain variant: REAL-VALUED input, REAL-VALUED window, REAL-VALUED output.
  // The samples are processed one after the other with rising index,
  // each one being read before the result is stored.
  // Faster for 'continuous' signal processing than e.g. FFT_MultiplyHannWindow(),
  // because the window is not calculated again in each call.
  // How to use:
  //   * Precalculate the window in an array ONCE : FFT_BuildWindowTable()
  //   * Multiply the samples with the window :     FFT_MultiplyWindow_Real()
{
  int i;

  // Main part, 'partially unrolled' (four samples per loop) to save CPU time
  for( i=0; (iLength-i) >= 4; i+=4 )
   { pfltDestSamples[i  ] = pfltInputSamples[i  ] * pfltWindow[i  ];
     pfltDestSamples[i+1] = pfltInputSamples[i+1] * pfltWindow[i+1];
     pfltDestSamples[i+2] = pfltInputSamples[i+2] * pfltWindow[i+2];
     pfltDestSamples[i+3] = pfltInputSamples[i+3] * pfltWindow[i+3];
   }

  // Remainder: the 'last few samples' (less than four)
  for( ; i<iLength; ++i )
   { pfltDestSamples[i] = pfltInputSamples[i] * pfltWindow[i];
   }
} // end FFT_MultiplyWindow_Real()



/* EOF < ?/cbproj/Remote_CW_Keyer/FFT_API.c > */
