//---------------------------------------------------------------------------
// File  :  C:\CBproj\SoundUtl\SoundDec.cpp
// Date  :  2015-03-11 (ISO 8601, YYYY-MM-DD)
// Author:  Wolfgang Buescher  (DL4YHF)
//
// Description:
//     Implementation of a class for decimating and buffering audio samples.
//
// Revision history (yyyy-mm-dd):
//
// 2015-03-11: Using a Noxon DAB Stick, running at 1.6 MSamples/second,
//             got here with t2-t1 = 152395 / pf_clock_3.57MHz = 43 milliseconds
//             to decimate 65536 I/Q samples by EIGHT into 8192 samples @ fs=200 kHz,
//             using SoundTab_DEC_FIR_LENGTH = 25 .
//             That's 43 ms to process 65536/1.6MHz = 41 ms of data.
//             Too slow for real-time processing on an old Centrino notebook !
//             Considered boosting the performance of CSoundDecimatingBuffer
//             by using a CIC filter as the "frontend" .
// 2004-01-29  Minor bugfixes in ProcessWithoutBuffering(), which is now
//             also used in the PSK detector/demodulator (pskdet.cpp) .
// 2003-07-03  Added the method ProcessWithoutBuffering() for SpecLab .
//
// 2002-05-05  Written for the "Audio Input Tool" in c:\CBProj\SndInput\..
//
//---------------------------------------------------------------------------


#include "SWITCHES.H"  // project specific compiler switches ("options")
                       // must be included before anything else !


#include <windows.h>
#include <math.h>

#pragma hdrstop   // BORLAND stuff: no precompiled headers after this point

#include "SoundTab.h"  // some required tables, filter coeffs, etc.
#include "utility1.h"  // uses UTL_NamedMalloc instead of malloc (for debugging)
#include "SoundDec.h"  // header for this module

#pragma warn -8017
#pragma warn -8004 // <var is a assigned a value that is never used> - so what


#ifndef  SWI_USE_UNROLLED_LOOPS
# define SWI_USE_UNROLLED_LOOPS 1  /* 1 = yes (faster but larger code), 0 = no (shorter but slower code) */
#endif


//**************************************************************************
//    Global Variables
//**************************************************************************

BOOL  SndDec_cpu_FMA4_available = FALSE;  // see https://msdn.microsoft.com/de-de/library/vstudio/gg445140%28v=vs.100%29.aspx  on FMA4 / __cpuid




//**************************************************************************
//    Functions (no class methods)
//**************************************************************************

/***************************************************************************/
void SndDec_Init(void)
  // Intended to check the CPU type, and which instruction set is available .
  // [out] (planned) : SndDec_cpu_FMA4_available , etc .
  // NOTE: The body is currently EMPTY, i.e. no detection is performed here
  //       and this function does not modify SndDec_cpu_FMA4_available .
{

} // end SndDec_Init()

/***************************************************************************/
int SndDec_GetClosestRealizeableDecimatorRatio(
     int iWantedRatio,   // [in] wanted decimation ratio, ideally 2^N * 3^M
     int nMaxDecimators) // [in] max number of usable stages
  // Finds the decimation ratio closest to iWantedRatio which can be realized
  // with a chain of at most nMaxDecimators stages, where each stage divides
  // the sample rate by 2 or by 3. In other words: the value 2^n2 * 3^n3
  // (with n2+n3 <= nMaxDecimators) with the smallest distance to iWantedRatio.
  // If two candidates are equally close, the one found first is kept
  // (search order: n2 ascending, then n3 ascending).
  // Return value: the realizable ratio; 1 if iWantedRatio <= 1
  //               or if nMaxDecimators <= 0 .
  // Example: looking for 'decimate-by-100' with 5 stages,
  //          realizable: 2*2 * 3*3*3 = 108 .
{
  int best_ratio = 1;  // ratio 1 (no stage used at all) is always realizable
  int n2_ratio   = 1;  // 2^n2
  int n3_ratio, n2, n3, realized_ratio;

  // Simply try all combinations of 2^(n2) * 3^(n3) and use the one
  // which works best.
  for(n2=0; n2<=nMaxDecimators; ++n2)
   {
     n3_ratio = 1;     // 3^n3
     for(n3=0; n3+n2<=nMaxDecimators; ++n3)
      { // Note: n2+n3 must never exceed the number of available stages !
        realized_ratio = n2_ratio * n3_ratio;
        if(   abs(realized_ratio - iWantedRatio)
            < abs(best_ratio - iWantedRatio) )
         { // found a better combination to realize the decimation ratio
           best_ratio = realized_ratio;
         }
        // Larger powers of three only move further away from the wanted
        // ratio, so they can never win the 'strictly closer' test above.
        // Stopping here doesn't change the result, but it keeps all
        // intermediate products below 3*iWantedRatio (no signed integer
        // overflow for a large number of stages anymore) .
        if( realized_ratio >= iWantedRatio )
           break;
        n3_ratio *= 3;  // next power of three for next 'n3' loop
      } // end for (n3)
     // Same argument for the powers of two: every remaining candidate
     // would be larger than n2_ratio, which has already been tested.
     if( n2_ratio >= iWantedRatio )
        break;
     n2_ratio *= 2;  // next power of two for next 'n2' loop
   } // end for (n2)
  return best_ratio;
} // end SndDec_GetClosestRealizeableDecimatorRatio()


/***************************************************************************/
void  SndDec_InitDecimator( T_SOUND_DECIMATOR *pDecimator,
                                  int decimation_ratio )
  // Resets ONE decimator stage: clears the delay line, rewinds the queue
  // pointer and selects the filter coefficients for the given ratio.
  // [in] decimation_ratio : 3 selects the 'decimate-by-3' coefficients,
  //                         anything else the 'decimate-by-2' coefficients.
  // Note: The stage's 'ratio' member is always reset to 1 here, regardless
  //       of decimation_ratio; SndDec_InitDecimatorChain() overwrites it
  //       after calling this function.
  // NOTE(review): Init() calls this with ratio=2 for m_Complex2RealDecimator[]
  //       and doesn't set .ratio afterwards - confirm that this is intended.
{
  int iTap;

  // Flush the circular delay line (re AND im, so also good for COMPLEX processing)
  for( iTap=SoundTab_DEC_FIR_LENGTH-1; iTap>=0; --iTap )
   { pDecimator->queue[iTap].re = 0.0;
     pDecimator->queue[iTap].im = 0.0;
   }

  // Pick the lowpass coefficients (tables are in SoundTab)
  if( decimation_ratio==3 )
   { pDecimator->coeffs = SoundTab_Dec3FilterCoeffs;
   }
  else // ratio 2, and also the fallback for all unsupported ratios (!)
   { pDecimator->coeffs = SoundTab_Dec2FilterCoeffs;
   }

  // Rewind the stage's state
  pDecimator->inptr = pDecimator->queue;
  pDecimator->count = 0;
  pDecimator->ratio = 1;

} // end SndDec_InitDecimator()


/***************************************************************************/
int  SndDec_InitDecimatorChain(
      T_SOUND_DECIMATOR *decimator, // [in,out] array of decimator stages
                 int nr_decimators, // [in] max number of usable stages
              int decimation_ratio, // [in] wanted(!) decimation ratio
          int *pi_NrOfUsedStages  ) // optional result, may be NULL
   /* Configures an array ("chain") of decimation stages so that the product
    * of all stage ratios gets as close as possible to the wanted ratio.
    * Every stage divides the sample rate by 1 (unused), 2 or 3, so not all
    * ratios are possible. The 'decimate-by-2' stages come first in the array,
    * followed by the 'decimate-by-3' stages, followed by the unused stages.
    * Return value: the decimation ratio which could be REALIZED.
    */
{
  int  iStage, nUsedStages;
  int  candidate;
  int  total_ratio = 1;
  long nTwos, nThrees, pow2, pow3;
  long best_ratio = 1, best_n2 = 0, best_n3 = 0;

  // Step 1: bring all stages into a defined state (cleared delay lines)
  for(iStage=0; iStage<nr_decimators; ++iStage)
   { SndDec_InitDecimator( decimator+iStage, 2/*default ratio*/ );
   }

  // Step 2: try all combinations of 2^nTwos * 3^nThrees which fit into
  //         the available number of stages, and remember the closest one.
  //         Example: wanted 'decimate-by-100', realizable: 2*2 * 3*3*3 = 108
  //                  -> best_n2 = 2,   best_n3 = 3
  for( nTwos=0, pow2=1; nTwos<=nr_decimators; ++nTwos, pow2*=2 )
   { for( nThrees=0, pow3=1; nThrees+nTwos<=nr_decimators; ++nThrees, pow3*=3 )
      { candidate = pow2 * pow3;
        if(   abs(candidate - decimation_ratio)
            < abs(best_ratio - decimation_ratio) )
         { best_ratio = candidate;
           best_n2    = nTwos;
           best_n3    = nThrees;
         }
      }
   }

  // Step 3: distribute the ratios over the stages. The search above never
  //         accepts more than nr_decimators stages, so no clipping is needed.
  nUsedStages = (int)(best_n2 + best_n3);
  if( pi_NrOfUsedStages != NULL )
   { *pi_NrOfUsedStages = nUsedStages;  // optional result: number of used stages
   }
  for(iStage=0; iStage<nr_decimators; ++iStage)
   { T_SOUND_DECIMATOR *pStage = &decimator[iStage];
     if( iStage < best_n2 )         // from example: iStage=0,1
      { pStage->ratio = 2;
        pStage->coeffs= SoundTab_Dec2FilterCoeffs;
      }
     else if( iStage < nUsedStages ) // from example: iStage=2,3,4
      { pStage->ratio = 3;
        pStage->coeffs= SoundTab_Dec3FilterCoeffs;
      }
     else                            // all the rest is 'unused'
      { pStage->ratio = 1;
        pStage->coeffs= SoundTab_Dec2FilterCoeffs;
      }
     total_ratio *= pStage->ratio;   // resulting 'total decimation ratio'
   }
  return total_ratio;
} // end SndDec_InitDecimatorChain(..)

#if( SWI_USE_UNROLLED_LOOPS==1 ) /* 1 = yes (faster but larger code), 0 = no (shorter but slower code) */
/***************************************************************************/
static void SndDec_CalcMACs_2Channels_NoWrap(  float *pInput, float *pCoeffs, BYTE nMACs, float *pResult )
  // Calculates up to 32 multiply-accumulate operations for TWO channels
  // on a LINEAR (non-wrapping) run of samples, using an unrolled loop.
  // Subroutine, called from SndDec_CalcMACs_2Channels(),
  //             with various attempts to make it "a bit faster" .
  //             Does NOT care about circular buffer indices (for speed)
  //             thus called TWO TIMES (before and after the wrapping point)
  //             from SndDec_CalcMACs_2Channels() .
  //             Used when the FMA4 instruction 'vfmaddps' isn't available
  //             because the compiler or the CPU on which we're running doesn't support it.
  // [in]  pInput  : interleaved samples; pInput[2*i] belongs to the 1st channel,
  //                 pInput[2*i+1] to the 2nd channel. Indices 0 .. 2*nMACs-1 are read.
  // [in]  pCoeffs : typically points to SoundTab_Dec2FilterCoeffs[SoundTab_DEC_FIR_LENGTH] ;
  //                 pCoeffs[i] is multiplied with the i-th sample of BOTH channels.
  // [in]  nMACs   : number of MACs per channel, 0..32 .
  //                 CAUTION: any value above 32 is processed like 32 (see 'default' below).
  // [out] pResult : pResult[0] = sum for the 1st channel, pResult[1] = sum for the 2nd channel.
  //                 With nMACs==0, both results are zero.
{ register float acc1, acc2;
  acc1 = acc2 = 0.0;  // -> fld; fst; fstp.

  // fmaf(x,y,z) = (x * y) + z, for 'float' type ? Not available for Borland .
  // The switch below only selects the ENTRY POINT into the unrolled chain:
  // entering at label _N executes exactly N MAC pairs (indices N-1 down to 0).
  switch( nMACs )  // looks terrible but doesn't cost much (cycle-wise) :
   { case  0 : goto  _0;  // because of the constant "stepwidth" of the opcodes
     case  1 : goto  _1;  // between the case marks,
     case  2 : goto  _2;  // any non-brain-damaged compiler will use
     case  3 : goto  _3;  // a 'computed goto', or maybe a simple lookup table,
     case  4 : goto  _4;  // to implement this stupid switch..case list .
     case  5 : goto  _5;  // Result when compiled with Borland C++ Builder:
     case  6 : goto  _6;  //   xor eax, eax             ; clear 32 bit, omg..
     case  7 : goto  _7;  //   mov al, [epb+something]  ; = nMACs
     case  8 : goto  _8;  //   cmp eax, 0x1F            ; may use jump table ?
     case  9 : goto  _9;  //   jnbe +[to the default case]
     case 10 : goto _10;  //   jmp  dword ptr [eax+jump_table_base]
     case 11 : goto _11;  //   (which jumps straight to the label, as intended)
     case 12 : goto _12;
     case 13 : goto _13;
     case 14 : goto _14;
     case 15 : goto _15;
     case 16 : goto _16;
     case 17 : goto _17;
     case 18 : goto _18;
     case 19 : goto _19;
     case 20 : goto _20;
     case 21 : goto _21;
     case 22 : goto _22;
     case 23 : goto _23;
     case 24 : goto _24;
     case 25 : goto _25;
     case 26 : goto _26;
     case 27 : goto _27;
     case 28 : goto _28;
     case 29 : goto _29;
     case 30 : goto _30;
     case 31 : goto _31;
     default : goto _32;  // 32, and also everything ABOVE 32 (silently limited)
   }

  // Calculate 0 to 32 MAC (multiply,accumulate) operations, for TWO CHANNELS.
  // Here: UNROLLED LOOP, and no checking for a circular index wrap.
  //       (it's the caller's responsibility to check how many samples
  //        can be processed BEFORE, and AFTER the circular buffer's end).
  //   When convolving the input samples (in the circular queue)
  //   with the filter coefficients (from a simple linear array),
  //   the sequence of the MACs doesn't matter as long as
  //   the i-th sample is multiplied with the i-th coefficient
  // (   pInput[qi+0]  * coeffs[0] + pInput[qi+1] * coeffs[1] + ... pInput[qi+25, wrapped] * coeffs[25]
  // or  pInput[qi+25] * coeffs[25]+ pInput[qi+24] * coeffs[24] + ... + pInput[qi+0]  * coeffs[0] ).
  //   Thus, beginning with the LAST ARRAY entry here (due to the goto-labels
  //   as entry points) isn't a problem .
  //   TO COMPILE WITH BORLAND, SET "OPTIMISATION FOR SPEED, PENTIUM PRO" !
_32: acc1 +=  pInput[62] * pCoeffs[31]; // -> fld; fmul; fadd; fstp.
     acc2 +=  pInput[63] * pCoeffs[31]; // -> fld; fmul; fadd; fstp.....
_31: acc1 +=  pInput[60] * pCoeffs[30]; //     ' ' '
     acc2 +=  pInput[61] * pCoeffs[30]; //
_30: acc1 +=  pInput[58] * pCoeffs[29]; //
     acc2 +=  pInput[59] * pCoeffs[29]; //
_29: acc1 +=  pInput[56] * pCoeffs[28]; //
     acc2 +=  pInput[57] * pCoeffs[28]; //
_28: acc1 +=  pInput[54] * pCoeffs[27]; //
     acc2 +=  pInput[55] * pCoeffs[27]; //
_27: acc1 +=  pInput[52] * pCoeffs[26]; //
     acc2 +=  pInput[53] * pCoeffs[26]; //
_26: acc1 +=  pInput[50] * pCoeffs[25]; //
     acc2 +=  pInput[51] * pCoeffs[25]; //
_25: acc1 +=  pInput[48] * pCoeffs[24]; //
     acc2 +=  pInput[49] * pCoeffs[24]; //
_24: acc1 +=  pInput[46] * pCoeffs[23]; //
     acc2 +=  pInput[47] * pCoeffs[23]; //
_23: acc1 +=  pInput[44] * pCoeffs[22]; //
     acc2 +=  pInput[45] * pCoeffs[22]; //
_22: acc1 +=  pInput[42] * pCoeffs[21]; //
     acc2 +=  pInput[43] * pCoeffs[21]; //
_21: acc1 +=  pInput[40] * pCoeffs[20]; //
     acc2 +=  pInput[41] * pCoeffs[20]; //
_20: acc1 +=  pInput[38] * pCoeffs[19]; //
     acc2 +=  pInput[39] * pCoeffs[19]; //
_19: acc1 +=  pInput[36] * pCoeffs[18]; //
     acc2 +=  pInput[37] * pCoeffs[18]; //
_18: acc1 +=  pInput[34] * pCoeffs[17]; //
     acc2 +=  pInput[35] * pCoeffs[17]; //
_17: acc1 +=  pInput[32] * pCoeffs[16]; //
     acc2 +=  pInput[33] * pCoeffs[16]; //
_16: acc1 +=  pInput[30] * pCoeffs[15]; //
     acc2 +=  pInput[31] * pCoeffs[15]; //
_15: acc1 +=  pInput[28] * pCoeffs[14]; //
     acc2 +=  pInput[29] * pCoeffs[14]; //
_14: acc1 +=  pInput[26] * pCoeffs[13]; //
     acc2 +=  pInput[27] * pCoeffs[13]; //
_13: acc1 +=  pInput[24] * pCoeffs[12]; //
     acc2 +=  pInput[25] * pCoeffs[12]; //
_12: acc1 +=  pInput[22] * pCoeffs[11]; //
     acc2 +=  pInput[23] * pCoeffs[11]; //
_11: acc1 +=  pInput[20] * pCoeffs[10]; //
     acc2 +=  pInput[21] * pCoeffs[10]; //
_10: acc1 +=  pInput[18] * pCoeffs[9];  //
     acc2 +=  pInput[19] * pCoeffs[9];  //
_9:  acc1 +=  pInput[16] * pCoeffs[8];  //
     acc2 +=  pInput[17] * pCoeffs[8];  //
_8:  acc1 +=  pInput[14] * pCoeffs[7];  //
     acc2 +=  pInput[15] * pCoeffs[7];  //
_7:  acc1 +=  pInput[12] * pCoeffs[6];  //
     acc2 +=  pInput[13] * pCoeffs[6];  //
_6:  acc1 +=  pInput[10] * pCoeffs[5];  //
     acc2 +=  pInput[11] * pCoeffs[5];  //
_5:  acc1 +=  pInput[8]  * pCoeffs[4];  //
     acc2 +=  pInput[9]  * pCoeffs[4];  //
_4:  acc1 +=  pInput[6]  * pCoeffs[3];  //
     acc2 +=  pInput[7]  * pCoeffs[3];  //
_3:  acc1 +=  pInput[4]  * pCoeffs[2];  //
     acc2 +=  pInput[5]  * pCoeffs[2];  //
_2:  acc1 +=  pInput[2]  * pCoeffs[1];  //
     acc2 +=  pInput[3]  * pCoeffs[1];  //
_1:  acc1 +=  pInput[0]  * pCoeffs[0];  //
     acc2 +=  pInput[1]  * pCoeffs[0];  //  -> fld; fmul; fadd; fstp. (last MAC)
_0:  // that's all .. ZERO MACs left to calculate at this point ...
     pResult[0] = acc1;  // sum for the 1st channel
     pResult[1] = acc2;  // sum for the 2nd channel
} // end SndDec_CalcMACs_2Channels_NoWrap()
#endif // SWI_USE_UNROLLED_LOOPS ?


/***************************************************************************/
void SndDec_RunComplexLowpass(
      T_SOUND_DECIMATOR *pDecimator, // pointer to struct with coeffs and queue
      T_Complex *pcplxValue )        // pointer to in- and output value
        // ( T_Complex defined in \cbproj\SoundUtl\SoundTab.h )
   // Runs ONE complex sample through the FIR lowpass of a decimator stage:
   //   the sample is placed in the stage's circular queue, convolved with
   //   the stage's coefficients, and replaced by the filter output.
   // Don't be fooled by the term "Decimator" - we don't DECIMATE here !
   //   This function is mostly used to cut the bandwidth by two .
   // [in,out] pDecimator : queue, queue pointer and coefficients of the stage;
   //                       the queue pointer is advanced by one sample.
   // [in,out] pcplxValue : in: new input sample, out: filtered sample.
   // Fixed: The MAC loop used to be excluded via "#if(0)", with an EMPTY
   //   'unrolled' alternative. The function therefore neither filtered
   //   the sample nor stored the new queue position, regardless of
   //   SWI_USE_UNROLLED_LOOPS. The plain loop is now always compiled.
{
 T_Complex acc;
 T_Complex *qptr    = pDecimator->queue;               // begin of the circular queue
 T_Complex *qend    = qptr + SoundTab_DEC_FIR_LENGTH;  // one past the last queue entry
 T_Complex *inptr   = pDecimator->inptr;               // local copy for better speed..
 T_Complex *firptr;
 const T_Float *kptr = pDecimator->coeffs;             // pointer to FIR-filter coefficients
 int j;

   // The queue is filled 'backwards', so the newest sample always meets
   // the first coefficient. Step back by one entry, with pointer wrap:
   if( inptr <= qptr )
      inptr = qend-1;
   else
      --inptr;
   inptr->re = pcplxValue->re; // place real part in circular Queue
   inptr->im = pcplxValue->im; // place imaginary part "   "   "

   pDecimator->count = 0;
   acc.re = 0.0;            // prepare accumulation
   acc.im = 0.0;
   firptr = inptr;
   for(j=0; j<SoundTab_DEC_FIR_LENGTH; ++j )  // do the complex MAC's
    {
     acc.re += ( (firptr->re)*(*kptr) );
     acc.im += ( (firptr->im)*(*kptr) );
     ++kptr;
     if( (++firptr) >= qend ) // deal with wraparound
            firptr  =  qptr;
    }
   // filter output now in acc .
   *pcplxValue = acc;              // re+im back to the caller

   // save position in circular delay line
   pDecimator->inptr = inptr;

} // end SndDec_RunComplexLowpass(..)



//***************************************************************************
//  Implementation of methods for the CSoundDecimatingBuffer class
//***************************************************************************


//***************************************************************************
CSoundDecimatingBuffer::CSoundDecimatingBuffer()     // constructor
  // Brings a new instance into the "constructed but not initialized" state:
  // closed for processing, no buffer, decimation ratio 1, no frequency
  // conversion. Init() must be called before the object can be used.
{
  // Remain closed until Init() is through (prevents multithreading issues, 2015-05-14)
  m_fOpenForBusiness = FALSE;

  // Optional sample buffer: nothing allocated yet, buffer capacity ZERO
  m_pfltBuffer  = NULL;
  m_lBufferSize = 0;
  m_numInputStreams       = 0;
  m_nOutBufCompsPerSample = 0;
  m_i64OutputSampleCount  = 0;
  m_dwOutputSampleCountModified = 0;

  // Sample rates and decimator chain
  m_dblOutputSampleRate  = 0;
  m_dblInputSampleRate   = 0;
  m_iDecimationRatio     = 1;
  m_iUsedDecimatorStages = 0;
  m_iFastAndUgly         = 0;  // use proper anti-aliasing by default

  // Frequency conversion (NCO) and conversion mode
  m_fFreqConversion = FALSE;
  m_dblNcoFrequency = 0;
  m_dblNcoPhase     = 0.0;
  m_iConversionMode = SNDDEC_MODE_REAL_IN_REAL_OUT;
  m_iComplex2RealMixerPhase = 0;

  // History of chunk infos, if compiled in
#if( SWI_USE_CHUNK_INFO2 )    // SoundUtl/ChunkInfo2.c (with T_ChunkArray) present ?
  memset( &m_ChunkArray, 0, sizeof(m_ChunkArray) );
#elif ( SWI_USE_CHUNK_INFO )
  memset( &m_ChunkInfo, 0, sizeof(m_ChunkInfo) );
#endif // SWI_USE_CHUNK_INFO[2] ?
} // end CSoundDecimatingBuffer::CSoundDecimatingBuffer()

void CSoundDecimatingBuffer::Close(void) // clean up (for static class instances)
{
  if( m_fOpenForBusiness )    // to cure multithreading issues (2015-05-14)...
   { m_fOpenForBusiness = FALSE;
     Sleep( 100 ); // give other thread(s) the chance to return from ProcessSamples(), etc
   }
  if(m_pfltBuffer)
   { UTL_free(m_pfltBuffer);  // fsssh
     m_pfltBuffer = NULL;
   }
}

CSoundDecimatingBuffer::~CSoundDecimatingBuffer() // destructor, clean up
  // All the cleanup is done in Close(), which may also be called directly.
{
  this->Close();
} // end CSoundDecimatingBuffer::~CSoundDecimatingBuffer() [destructor]


//***************************************************************************
BOOL CSoundDecimatingBuffer::Init(
       long i32BufferLength,      // max number of sample points for OPTIONAL buffering
                                //  (use 0 if you need no buffered processing)
            // internally multiplied with "the number of components per sample",
            //         so i32BufferLength is in fact the NUMBER OF SAMPLE POINTS .
     double fltInputSampleRate, // often 11025, 22050 or 44100
       long lSampleRateDivisor, // powers of two are preferred, powers of three possible
     double fltNcoFrequency,  // "L.O." frequency in Hertz, 0 = no conversion
        int iConversionMode,  // SNDDEC_MODE_xxx_IN_yyy_OUT (xxx,yyy = REAL or COMPLEX)
        int numInputStreams)  // 1=ONE input stream (which may be REAL or COMPLEX), 2=TWO independent inputs, ..
   // Prepares the decimation of audio samples by a certain ratio
   //       with optional frequency shift and optional COMPLEX output.
   //       Some "less important" parameters can be defined with other
   //       SetXxxx-methods of the CSoundDecimatingBuffer class.
   //
   // Note: Not all ratios are possible, because every decimation stage
   //       can divide the input sample rate either by 1,2 or 3 !
   //       m_iDecimationRatio  is the decimation factor
   //            (or the divisor for a simple integrate-and-dump filter)
   //            which COULD BE REALIZED.
   //       numInputStreams is limited to the range 1..2 .
   //
   // Return value: TRUE  = ready for processing (with or without buffer),
   //               FALSE = the buffer could not be allocated; the object
   //                       remains closed ("not open for business").
   //
   // Fixed: m_lBufferSize is only set to the new capacity if the buffer
   //        really exists. It used to be set even if the allocation failed,
   //        and an old capacity wasn't cleared when there was no buffer to free.
{
 long lOldBufferSize,lNewBufferSize;

  if( m_fOpenForBusiness )    // to cure multithreading issues (2015-05-14)...
   { m_fOpenForBusiness = FALSE;
     Sleep( 100 ); // give other thread(s) the chance to return from ProcessSamples(), etc
   }

  lOldBufferSize = m_lBufferSize;

  if(numInputStreams<1)
     numInputStreams=1;
  if(numInputStreams>2)
     numInputStreams=2;
  m_numInputStreams = numInputStreams;

  m_dblInputSampleRate= fltInputSampleRate;
  m_dblNcoFrequency  = fltNcoFrequency;
  m_dblNcoPhase      = 0.0;
  m_ldblPrevCheckedTimestamp = 0.0; // only for debugging (timestamp-check)
  m_ldblNextExpectedOutputTimestamp = 0.0;  // " " "
  m_iDecimationRatio = lSampleRateDivisor;
  if(m_iDecimationRatio < 1)
     m_iDecimationRatio = 1;

  // Program the decimator chains for both streams. The first call tells us
  // which ratio can be REALIZED; the second chain is set up for that ratio.
  m_iDecimationRatio = SndDec_InitDecimatorChain(
      m_Decimator[0], SoundDec_MAX_DECIMATION_STAGES,
      m_iDecimationRatio, &m_iUsedDecimatorStages );
  SndDec_InitDecimatorChain( m_Decimator[1], SoundDec_MAX_DECIMATION_STAGES,
      m_iDecimationRatio, &m_iUsedDecimatorStages );
  if( m_iDecimationRatio < 1 )  // avoid div-by-zero below
      m_iDecimationRatio = 1;
  m_dblOutputSampleRate = m_dblInputSampleRate / (double)m_iDecimationRatio;

  // To convert a decimated COMPLEX signal (with 'negative and positive' frequencies
  //                    into a REAL signal (with only 'positive' frequencies),
  //                    these extra decimating half-band filters are necessary:
  SndDec_InitDecimator( &m_Complex2RealDecimator[0], 2/*ratio*/ );
  SndDec_InitDecimator( &m_Complex2RealDecimator[1], 2/*ratio*/ );

  // Delete the old contents of the 'buffer'
  // (because with new a sample rate, the old data are useless)
  //  SOUND_big_buffer_index = 0;
  m_i64OutputSampleCount = 0; // reset buffer counter AND BUFFER INDEX(!)
  m_dwOutputSampleCountModified = 0;
#if( SWI_USE_CHUNK_INFO2 )    // SoundUtl/ChunkInfo2.c (with T_ChunkArray) present ?
  ChunkInfoArray_Init( &m_ChunkArray );
#elif ( SWI_USE_CHUNK_INFO )
  ChunkInfo_Init( &m_ChunkInfo );
#endif // SWI_USE_CHUNK_INFO[2] ?

  // Save the type of samples contained in the buffer:
  m_iConversionMode = iConversionMode; // initial conversion mode : SNDDEC_MODE_xxx_IN_yyy_OUT (xxx,yyy = REAL or COMPLEX)

  m_fFreqConversion = (fltNcoFrequency!=0);  // frequency conversion enabled ?

  // Which buffer structure ?  1, 2, or 4 components per "sample pair" ,
  //  depends on channels+complex/real output.
  // In ProcessSamples(), the input signal is optionally converted
  //    via multiplication with a complex oscillator signal (NCO with cos+sin)
  //    *BEFORE* the result enters the buffer,
  //    thus the buffer structure (m_nOutBufCompsPerSample) depends on the OUTPUT.
  switch( m_iConversionMode )
   { case SNDDEC_MODE_REAL_IN_REAL_OUT       :  // REAL input (not complex), real output, even if downconverted and decimated
     default:
        m_nOutBufCompsPerSample = m_numInputStreams;  // 1 component per channel (in the buffer)
        break;
     case SNDDEC_MODE_COMPLEX_IN_COMPLEX_OUT :  // COMPLEX input, complex output
     case SNDDEC_MODE_REAL_IN_COMPLEX_OUT    :  // REAL    input, COMPLEX output (e.g. with complex NCO multiplication AND decimation)
        m_nOutBufCompsPerSample = 2 * m_numInputStreams; // complex output -> 2 components/channel *in the buffer*
        break;
     case SNDDEC_MODE_COMPLEX_IN_REAL_OUT    :  // complex input, REAL output (2015-01-14: future plan)
        m_nOutBufCompsPerSample = m_numInputStreams;  // 1 component per channel (in the buffer)
        break;
   } // end switch( m_iConversionMode )

  // Allocate a "big" ring-buffer where the decimated samples are collected .
  //  Note that buffering is OPTIONAL (important for Spectrum Lab) ,
  //  and that the buffer size in BYTES can get incredibly large :
  // With 4 components per sample, and 4 bytes per component (=sizeof T_Float),
  //  a buffer capturing 10 seconds of audio sampled at 96 kHz
  //  the buffer size will be  4 * 4 * 10 * 96000 = 15 MByte (!) .
  lNewBufferSize = i32BufferLength * (long)m_nOutBufCompsPerSample;
  if(lNewBufferSize!=lOldBufferSize)
   { // buffer size was changed: the old capacity isn't valid anymore,
     // no matter if there is a buffer to free or not.
     m_lBufferSize = 0;
     if(m_pfltBuffer!=NULL)
       { // must free the old buffer and allocate a new because size has been changed
         UTL_free(m_pfltBuffer);  // fsssh
         m_pfltBuffer = NULL;
       }
   }
  if(lNewBufferSize>0)
   { // only for "buffered" processing:
     if(m_pfltBuffer==NULL)   // only allocate a new buffer if really necessary
      { m_pfltBuffer = (T_Float*)UTL_NamedMalloc( "SoundDec", lNewBufferSize * sizeof(T_Float) );
        // announce the new capacity only if the buffer really exists :
        m_lBufferSize = (m_pfltBuffer!=NULL) ? lNewBufferSize : 0;
      }
     if( m_pfltBuffer!=NULL )
      { m_fOpenForBusiness = TRUE;   // open for business now
        return TRUE;
      }
     else // out of memory: stay closed
      { return FALSE;
      }
   }
  else // no buffered operation, result will overwrite the non-decimated input:
   { m_fOpenForBusiness = TRUE;
     return TRUE;
   }
} // end CSoundDecimatingBuffer::Init()


/***************************************************************************/
double CSoundDecimatingBuffer::GetNcoFrequency(void)
  // Returns the NCO ("L.O.") frequency as last set
  // by Init() or SetNcoFrequency() .
{
  const double dblCurrentNcoFreq = m_dblNcoFrequency;
  return dblCurrentNcoFreq;
} // end CSoundDecimatingBuffer::GetNcoFrequency()


/***************************************************************************/
BOOL   CSoundDecimatingBuffer::SetNcoFrequency(double fltNewFreq)
  // [in] fltNewFreq : center frequency for the "downconversion" .
  //      If the input (fed into the decimator) is complex, aka "I/Q",
  //      a POSITIVE "center frequency" moves a signal on a "positive frequency" *DOWN*
  //      so it appears at 0 Hz in the decimated OUTPUT signal .
  //   See notes in msk_modem.cpp and CSoundDecimatingBuffer::ProcessSamples()
  //      (2015-09-09) !
  // Return value: always TRUE; the new value is accepted without any check.
  // NOTE(review): Only m_dblNcoFrequency is updated here. m_fFreqConversion
  //      (set in Init() from "frequency != 0") is NOT refreshed - confirm
  //      if that's intended when switching between zero and non-zero.
{
  const BOOL fAccepted = TRUE; // a plausibility check may decide this one fine day..
  m_dblNcoFrequency = fltNewFreq;
  return fAccepted;
} // end CSoundDecimatingBuffer::SetNcoFrequency()


/***************************************************************************/
int CSoundDecimatingBuffer::GetDecimationRatio(void)  // safe read-access to .m_iDecimationRatio
  // Returns the decimation ratio, but never less than ONE,
  // so the caller may safely divide by the result.
{
  if( m_iDecimationRatio < 1 )
   { return 1;  // avoid div-by-zero in the caller !
   }
  return m_iDecimationRatio;
} // end CSoundDecimatingBuffer::GetDecimationRatio()

/***************************************************************************/
double CSoundDecimatingBuffer::GetDecimatedSampleRate(void)
  // Returns the sample rate at the output of the decimator chain:
  // the input sample rate, divided by the ratio of every used stage
  // of the FIRST chain (m_Decimator[0]). Stages with a ratio of one
  // or less are skipped.
{
  double dblRate = m_dblInputSampleRate;
  int    iStage  = 0;

  while( iStage < m_iUsedDecimatorStages )
   { const int iStageRatio = m_Decimator[0][iStage].ratio;
     ++iStage;
     if( iStageRatio <= 1 )
        continue;          // nothing to divide in this stage
     dblRate /= (double)iStageRatio;
   }
  return dblRate;  // assume both channels use the same decimation parameters !
} // end CSoundDecimatingBuffer::GetDecimatedSampleRate()


/***************************************************************************/
static void SndDec_CalcMACs_2Channels(  T_Complex *firptr, T_Complex *qptr, T_Float *pCoeffs, int qlen, T_Complex *pResult )
  // Calculates the MACs (Multiply-Accumulate) of a FIR filter for TWO
  //             channels, with the samples taken from a CIRCULAR queue.
  //             Called from CSoundDecimatingBuffer::ProcessSamples() .
  // [in]  firptr  : pointer to the queue entry which belongs to the FIRST
  //                 coefficient; not necessarily the begin of the queue.
  // [in]  qptr    : pointer to the begin of the circular queue
  // [in]  pCoeffs : typically points to SoundTab_Dec2FilterCoeffs[SoundTab_DEC_FIR_LENGTH]
  // [in]  qlen    : length of the queue = number of FIR filter coefficients (taps)
  // [out] pResult : 1st channel in pResult->re, 2nd channel in pResult->im .
{
#if( SWI_USE_UNROLLED_LOOPS==0 ) /* 0 = no (shorter but slower code), 1 = yes (faster but larger code) */
  // Straightforward loop with a wrap-check per tap.
  // Too slow, at least too slow for 1.6 MSamples/second on a Centrino notebook..
  register T_Complex *pQueueEnd = qptr+qlen;
  register T_Float sumCh1, sumCh2;
  int nTapsLeft;
  sumCh2 = 0.0;
  sumCh1 = sumCh2;
  for( nTapsLeft=qlen; nTapsLeft>0; --nTapsLeft )  // one MAC per channel and tap
   {
     sumCh1 += ( (firptr->re)*(*pCoeffs) );
     sumCh2 += ( (firptr->im)*(*pCoeffs) );
     ++pCoeffs;
     ++firptr;
     if( firptr >= pQueueEnd ) // arrived at the end of the queue: wrap around
      { firptr = qptr;
      }
   }
  pResult->re = sumCh1;
  pResult->im = sumCh2;
#else // speed-optimized: split the circular queue into (max.) two LINEAR runs,
      // and let the unrolled subroutine process each of them without wrap-checks.
  float headSum[2], tailSum[2];
  int iFirstEntry = firptr - qptr;      // -> 0..24 (for qlen=25); example: 3
  int nHeadMACs   = qlen - iFirstEntry; // from firptr to the end of the queue; example: 25-3 = 22
  // 1st run: from the current entry up to the end of the queue
  SndDec_CalcMACs_2Channels_NoWrap( &firptr->re, pCoeffs, nHeadMACs, headSum );
  if( iFirstEntry <= 0 )
   { // was lucky because all MACs could be calculated in a single call..
     pResult->re = headSum[0];
     pResult->im = headSum[1];
     return;
   }
  // 2nd run: the remaining entries (example: 3) at the begin of the queue,
  //          continuing with the coefficients where the 1st run stopped
  SndDec_CalcMACs_2Channels_NoWrap( &qptr->re, pCoeffs+nHeadMACs, iFirstEntry, tailSum );
  pResult->re = headSum[0] + tailSum[0];
  pResult->im = headSum[1] + tailSum[1];
#endif // SWI_USE_UNROLLED_LOOPS ?
} // end SndDec_CalcMACs_2Channels()


/***************************************************************************/
int CSoundDecimatingBuffer::ProcessSamples(
       T_Float *pfltSource,  // pointer to LEFT SOURCE CHANNEL (or 'paired' data, or "I"
       T_Float *pfltSource2, // pointer to RIGHT SOURCE CHANNEL (NULL for 'paired' data), or "Q"
       int     iConversionMode,     // SNDDEC_MODE_xx_IN_yy_OUT ('xx','yy' = REAL or COMPLEX)
       // ex: int iNrSamplePairs, // number of sample points (not single floats) to process
       T_ChunkInfo *pChunkInfo,   // [in] #samples, #channels per sample, precise SR, Timestamps, GPS, ...
       T_Float *pfltDest,    // pointer to LEFT DESTINATION BLOCK (or 'paired' data)
       T_Float *pfltDest2,   // pointer to RIGHT SOURCE CHANNEL (only for TWO audio channels)
       int iDestIndex, int iDestSize )  // index and length of destination buffer (may be CIRCULAR)
  /* Decimates a number of audio samples    with    or without buffering .
   * One or two channels are supported, depending on
   *     the parameters during construction of the class.
   * The source data for dual-channel-operation may be either "paired" : ..
   *     pfltSource[0] = left channel, pfltSource[1] = right channel,
   *     pfltSource[2] = left channel, pfltSource[3] = right channel, etc..
   *  ... or "separate blocks", in this case with a 2nd source pointer:
   *     pfltSource[0..NrSamplePairs-1]  = left channel,
   *     pfltSource2[0..NrSamplePairs-1] = right channel.
   * If pfltSource2 is NULL but the the source data are "paired",  MUST be NULL .
   *
   * Optionally, a COMPLEX FREQUENCY CONVERSION and/or DECIMATION
   *             can take place (BEFORE putting the values into the buffer!)
   *
   * Return value: Count of decimated sample-pairs (or -quads).
   *     MAY VARY if the number of samples is not a multiple of the decimation ratio !
   *     A negative value indicates an error; usually a buffer overflow .
   *
   *    Called by the real-time sound thread,
   *           also as the "frontend" (input pre-processor) for an SDR running
   *           at > 1 MSample/second ["Moxon" DAB-Stick : 1.6 MSamples/sec]
   *           so every microsecond counts !
   */
{
 int i,j,chn;
 int iNrSamplePairs;
 int stage;
 double nco_re,nco_im;
 T_SOUND_DECIMATOR *pDecimator;
 T_Complex acc;
 T_Complex *inptr;
 T_Complex *qptr;
 T_Complex *firptr;
 const T_Float *kptr;
 int    qlen;
 bool   filter_output_ready;
 int   iCountOutputSamples = 0;
 T_ChunkInfo sDecimatedChunkInfo;
#if( SWI_USE_CHUNK_INFO2 )
 T_ChunkInfo *pDstChunkInfo;
#endif // SWI_USE_CHUNK_INFO2 ?

  if( pChunkInfo == NULL ) // NOT OPTIONAL anymore, but MANDATORY since 2011-12
     return -1;

  iNrSamplePairs = pChunkInfo->dwNrOfSamplePoints; // number of sample points (not single floats) to process
  if( (iNrSamplePairs<=0) || (m_numInputStreams<=0) )
     return -2;

  if( ! m_fOpenForBusiness )
     return -3;

  if( iConversionMode != m_iConversionMode )  // now incompatible with the initialisation parameter
     return -4;  // 2017-11-13 : MSK demodulator failed because of this

  if( iConversionMode==SNDDEC_MODE_COMPLEX_IN_COMPLEX_OUT )
   { // ex: if( pfltSource2==NULL )
     //      { return -4;   // ERROR: input shall be COMPLEX, in SEPARATE channels,
     //        // but there is no 2nd input channel (with the IMAGINARY parts / "Q")
     //      }
     // Since 2015-11, COMPLEX_IN must also support 'paired' input (in ONE source block).
     //                Used to tap the input for the audio recorder
     //                at the COMPLEX DECIMATED output of the main frequency analyser.
     //                See SoundThd.cpp : SoundThd_WaveSaveProcess()  !
   }



  // Copy the 'CHUNK INFO' into an internal queue ?
  //   (The 'chunk info array' is limited to CHUNK_ARRAY_MAX_ENTRIES,
  //    so be careful not to flood the queue from here. The 'chunk info history'
  //    must cover as many 'chunk infos' as there are SAMPLES in the decimating buffer.
  //    Not much more, and never any less. Modified 2016-10-28; grep for the date.
  //    ChunkInfoArray_Append() decides if a new entry shall be appended to the history
  //    depending on the entry's 'age', and for this, it needs the SAMPLING RATE.
  //    Because the decimating buffer stores DECIMATED samples, we cannot pass
  //    the caller's pChunkInfo, but a *modified* T_ChunkInfo to ChunkInfoArray_Append() :
  sDecimatedChunkInfo = *pChunkInfo;

  // A few members of the T_ChunkInfo copied above must be modified
  //               *before* passing it to ChunkInfoArray_Append() :
  //  - the sampling rate of the buffer's OUTPUT may be lower than the INPUT;
  //  - the 'radio frequency offset' must be adjusted if the buffer
  //    "decimates and translates frequency" (i.e. downconverts). Example:
  //    Input (from soundcard) : fs=48000 Hz, covering 0..24 kHz,
  //         Decimator with frequency shift:
  //      -> T_ChunkInfo.dblRadioFrequency must be INCREMENTED by the NCO frequency !
  //  - T_ChunkInfo.i64TotalSampleCounter shall count the OUTPUT samples (decimated),
  //      because that's what also the SAMPLE BUFFER contains: *DECIMATED* samples .
  if( m_iDecimationRatio > 1)
   { sDecimatedChunkInfo.dblPrecSamplingRate /= (double)m_iDecimationRatio;
   }
  if( m_fFreqConversion )
   { sDecimatedChunkInfo.dblRadioFrequency += m_dblNcoFrequency;
     // (added 2015-01-10; the result can be seen in the "inf1" chunk
     //  if the decimator is used as 'pre-processor' when saving WAVE FILES.
     //  Example: m_dblNcoFrequency = 8270.0 Hz (VLF "center of activity");
     //     c:\cproj\SoundUtl\CWaveIO.cpp : C_WaveIO::WriteHeader()
     //       -> c:\cbproj\SoundUtl\ChunkInfo2.c : ChunkInfoToString()
     //            -> sz1kInfo = "sr=98.7654321 rf=8270.000000000 ut=1420930608.62 ...."
     //
     // 2015-01-14: Another instance of a CSoundDecimatingBuffer object is used as part
     //  of Spectrum Lab's INPUT PREPROCESSOR, to downconvert and decimate
     //  an "RF" input signal. Example from preproc_test_dcf77_and_gps.usr :
     //    m_dblNcoFrequency = 77 kHz, m_dblInputSampleRate = 192 kHz,
     //    decimate by 16   ->          output sampling rate = 12 kHz.
     //  Soundcard running in STEREO, but one of the two
     //    inputs was used for the GPS/NMEA/sampling rate calibration,
     //    thus the input for CSoundDecimatingBuffer was a REAL signal
     //     ( iConversionMode = 0 = SNDDEC_MODE_REAL_IN_REAL_OUT ),
     //    and, as described further below ("Frequency conversion, but NO complex output"),
     //    the output is also REAL (thus m_fComplexBuffer=FALSE)
     //    and appears as a bandpass-filtered(!) signal.
     //
   }
  sDecimatedChunkInfo.i64TotalSampleCounter = m_i64OutputSampleCount; // *before* incrementing m_i64OutputSampleCount !
  sDecimatedChunkInfo.nChannelsPerSample = m_nOutBufCompsPerSample;   // not 'm_numInputStreams' !
  // now our 'local copy' of the chunk info, describing the DECIMATED buffer contents, is ready for the history buffer:
#if( SWI_USE_CHUNK_INFO2 )    // SoundUtl/ChunkInfo2.c (with T_ChunkArray) present ?
  pDstChunkInfo = ChunkInfoArray_Append(
        &m_ChunkArray, // [out] an array of ChunkInfo entries
        &sDecimatedChunkInfo, // [in] precise sampling rate, sample index, date+time, "radio" frequency, GPS data,....
        GetTotalBufferSpace_seconds() ); // [in] dblHistoryLen_s, to tell how many old info chunks must remain available in the history
#elif( SWI_USE_CHUNK_INFO )
  ChunkInfo_CopyFromTo( &sDecimatedChunkInfo, &m_ChunkInfo, sizeof(m_ChunkInfo) );
#endif // ..CHUNK_INFO[2] ?
  if(  (iDestIndex<0) || (iDestIndex >= iDestSize) )
    {  /* handle 'old' buffer wrap  */
      iDestIndex=0;
    }

  if( (!m_fFreqConversion) && (iConversionMode==SNDDEC_MODE_REAL_IN_REAL_OUT) )
   { // Not COMPLEX, no frequency conversion,
     //  but a REAL-VALUE input AND OUTPUT (means: no I/Q-mixing):

     /* Copy the chunk into the "big buffer".                              */
     /* If required, also do the "downsampling" here to save buffer space. */
     if( m_iDecimationRatio <= 1)
      { // NO downsampling to a lower sample rate for the FFT:
        iCountOutputSamples = iNrSamplePairs;
        i = iNrSamplePairs;
        switch(m_numInputStreams) // expecting(!) 1 or 2 channels; depends on channels (here: no complex values)
         { case 1:   // one channel, "mono", no decimation, no I/Q processing
              while(i--)
               { // Copy the next sample from the input chunk to the buffer
                 pfltDest[iDestIndex++] = *pfltSource++;
                 if (iDestIndex >= iDestSize)
                     iDestIndex = 0;       // circular buffer wrap
               } // end for(i..)
              break;
         case 2: // Two channels, "stereo", no decimation, no I/Q processing.
                 // There are FOUR combinations how source- and destination chunks
                 // may be structured; to optimize speed each has its own loop:
              if( pfltSource2 )
               { // pointer to RIGHT SOURCE CHANNEL exists
                 //    (a NULL pointer would mean 'paired' data, see below)
                 if( pfltDest2 )
                  {  // also two separate destination blocks ?
                    while(i--)
                     { // Separate input- and output blocks:
                       pfltDest[iDestIndex]    = *pfltSource++;
                       pfltDest2[iDestIndex++] = *pfltSource2++; // << big difference!
                       if(iDestIndex >= iDestSize)
                          iDestIndex = 0;       // buffer wrap, ONLY AT EVEN INDEX
                     }
                  }
                 else // two separate source blocks, but only one destination:
                  { while(i--)
                     { // Combine the next sample from the input chunks
                       // into a "pair" for the buffer:
                       pfltDest[iDestIndex++] = *pfltSource++;
                       pfltDest[iDestIndex++] = *pfltSource2++; // << big difference!
                       if(iDestIndex >= iDestSize)
                          iDestIndex = 0;       // buffer wrap, ONLY AT EVEN INDEX
                     }
                  } // end else <two source blocks, but only one destination>
               }
              else // no "split source blocks" for LEFT and RIGHT channel, but "paired" samples:
               {
                if( pfltDest2 )
                 {  // a single source block but split destination blocks :
                   if( pChunkInfo->nChannelsPerSample==2 ) // TWO source channels ?
                    { while(i--)
                       { pfltDest[iDestIndex]   = *pfltSource++;
                         pfltDest2[iDestIndex++]= *pfltSource++;
                         if(iDestIndex >= iDestSize)
                            iDestIndex = 0;       // circular buffer wrap
                       }
                    }
                   else // two separate destination blocks but only ONE source-channel:
                    { while(i--)
                       { pfltDest[iDestIndex]   = *pfltSource++;
                         pfltDest2[iDestIndex++]= 0.0;
                         if(iDestIndex >= iDestSize)
                            iDestIndex = 0;       // circular buffer wrap
                       }
                    }
                 }
                else // a single source- and a single destination block for 2 channels:
                 {
                   if( pChunkInfo->nChannelsPerSample==2 ) // TWO source channels ?
                    { while(i--)
                       { // Copy the next sample from the input chunk to the buffer
                         pfltDest[iDestIndex++] = *pfltSource++;
                         pfltDest[iDestIndex++] = *pfltSource++;
                         if(iDestIndex >= iDestSize)
                            iDestIndex = 0;  // buffer wrap, ONLY AT EVEN INDEX
                       }
                    }
                   else  // destination with two channels per sample (interleaved in one block), source with only one channel:
                    { while(i--)
                       { // Copy the next sample from the input chunk to the buffer
                         pfltDest[iDestIndex++] = *pfltSource++;
                         pfltDest[iDestIndex++] = 0.0;
                         if(iDestIndex >= iDestSize)
                            iDestIndex = 0;  // buffer wrap, ONLY AT EVEN INDEX
                       }
                    }
                 }
               } // end if (pfltSource2)
              break;
        } // end switch(m_nOutBufCompsPerSample)
      }
     else // m_iDecimationRatio >= 2 : DECIMATE when copying the chunk into the "big buffer(s)".
      {   // Here: still WITHOUT frequency conversion.
          // With m_numInputStreams==2, this was often used as the 'frontend'
          // for SDRs with an unacceptably large sampling rate,
          // for example 'RTL-SDR' (Terratec/Noxon "DAB Stick" : 1.6 MSamples/sec);
          // so it may pay off to optimize this for TWO CHANNELS,
          // so the filter coefficients only need to be picked ONCE (for both I+Q).
        if( m_numInputStreams == 2 )   // 2015-03-11 : Optimized for TWO channels (but still no CIC filter..)
         {
           // 2015-03-11: Using a Noxon DAB Stick, running at 1.6 MSamples/second,
           //             the following 'slightly optimized loop' for TWO CHANNELS
           //             required 25 (instead of 43) milliseconds to decimate
           //             65536 I/Q samples (at fs=1.6 MHz; Noxon DAB stick) by EIGHT
           //             into 8192 samples @ fs=200 kHz.
           //             That's 25 ms to process 65536/1.6MHz = 41 ms of data.
           //             Still an unacceptable average CPU load for the old Centrino.
           //
           for(i=0; i<iNrSamplePairs; ++i)
            {
              // get next audio sample for filter input:
              if( (pfltSource2!=NULL) && (m_numInputStreams>1) )
               { // pointer to RIGHT SOURCE CHANNEL exists
                 //    (a NULL pointer would mean 'paired' data, see below)
                 acc.re = pfltSource[i];
                 acc.im = pfltSource2[i];
               }
              else
               { // no "split source blocks", but "pairs" of samples:
                 acc.re = pfltSource[i*m_numInputStreams + 0];
                 acc.im = pfltSource[i*m_numInputStreams + 1];
               }
              // Run the sample (in acc) through some decimation stages
              //     with decent low-pass filtering to avoid aliasing .
              //     Details about FIR-based decimation further below (in the "universal" loop).
              //   Here: Only decimation but no frequency shifting for exactly TWO channels.
              filter_output_ready = true;
              for(stage=0; (stage<m_iUsedDecimatorStages) && filter_output_ready; ++stage )
               {
                 pDecimator = &m_Decimator[0][stage]; // local pointer to "current" decimator for speed
                 inptr = pDecimator->inptr;  // local copy for speed: pointer to current element in circular queue
                 qptr  = pDecimator->queue;  //   "     "   "    "  : pointer to the queue itself
                 qlen  = SoundTab_DEC_FIR_LENGTH;  // queue length, aka 'number of taps', for example 25 (see ... )
                 if(--inptr < qptr)   // deal with FIR pointer wrap around
                      inptr = qptr+qlen-1;
                 inptr->re = acc.re;  // place in circular Queue (here: 1st channel in the real part)
                 inptr->im = acc.im;  // place in circular Queue (here: 2nd channel in imaginary part)
                            // (the above must be done even if we don't actually CALCULATE THE FILTER OUTPUT)
                 if( (++pDecimator->count) >= pDecimator->ratio)
                  { // calculate the decimation filter now (example decimate by two: only get here for every 2nd sample)
                    pDecimator->count = 0;
#                  if( SWI_USE_UNROLLED_LOOPS==0 ) /* 0 = no (shorter but slower code), 1 = yes (faster but larger code) */
                    // Prepare decimation (by 2 or 3).
                    kptr = pDecimator->coeffs; // typically points to SoundTab_Dec2FilterCoeffs[SoundTab_DEC_FIR_LENGTH]
                    acc.re = acc.im = 0.0;
                    firptr = inptr;
                    for(j=0; j<qlen; ++j )   // do the MAC's (multiply,accumulate)
                     {
                       acc.re += ( (firptr->re)*(*kptr) );
                       acc.im += ( (firptr->im)*(*kptr) );
                       // Disassembly of the above two MACs (compiled with Borland C++ Builder V6):
                       //   mov  ecx, [ebp-0x40]       ; input for 1st MAC
                       //   fld  dword ptr [ecx]
                       //   fmul dword ptr [eax]
                       //   fadd dword ptr [ebp-0x34]
                       //   fstp dword ptr [ebp-0x34]
                       //   mov  edx, [ebp-0x40]       ; input for 2nd MAC
                       //   fld  dword ptr [edx+0x04]
                       //   mov  ecx, [ebp-0x44]
                       //   fmul dword ptr [ecx]
                       //   fadd dword ptr [ebp-0x30]
                       //   fstp dword ptr [ebp-0x30]
                       //
                       ++kptr;
                       if( (++firptr) >= qptr+qlen ) // deal with wraparound
                              firptr  =  qptr;
                     }
#                  else //  2015-03-12: speed optimisation by LOOP UNROLLING (for the FIR MACs) ..
                    SndDec_CalcMACs_2Channels( inptr, qptr, (float*)pDecimator->coeffs, qlen, &acc );
                       // -> 47% CPU load to decimate 1.6 MSamples/sec by 8 on the Z61m,
                       //    using SndDec_CalcMACs_2Channels_NoWrap() with SIX INSTRUCTIONS PER 'MAC' .
#                  endif // (0,1); optimisation 2015-03-12
                    // filter output for the next stage now in acc.re,im (2 channels).
                  } // end if ( ..m_Decimator[].count >= .ratio )
                 else
                  { // Do not CALCULATE the decimation filter now (only place sample in its queue),
                    // because the result would be thrown away anyway .. save a couple of MAC's .
                    //  (example decimate by two: only get here for every 2nd sample)
                    filter_output_ready = false; // no output from the filter this time
                  }
                 // save position in circular delay line
                 pDecimator->inptr = inptr;
               } // end for <all lowpass & decimation stages>

              if (filter_output_ready)
               { // "decimated" sample is complete.
                 // Emit the value into the destination block..
                 if(m_numInputStreams<=1)
                  { // only one channel:
                    pfltDest[iDestIndex++] = acc.re;
                  }
                 else      // TWO channels; two separate or one common destination block ?
                  { if( pfltDest2 )
                     { // TWO separate destination blocks :
                       pfltDest[iDestIndex]    = acc.re;
                       pfltDest2[iDestIndex++] = acc.im;
                     }
                    else // two channels, but only ONE destination block:
                     { pfltDest[iDestIndex++] = acc.re;
                       pfltDest[iDestIndex++] = acc.im;
                     }
                  } // end if <two channels>

                 if(iDestIndex>= iDestSize)  // handle buffer wrap ..
                  { iDestIndex = 0;  // (the destination MAY BE a circular buffer!)
                  }

                 // Count the samples emitted during THIS call ..
                 iCountOutputSamples++;
               } // end if <filter_output_ready at the end of the filter cascade>
            } // end for <all input samples> [optimized for TWO input channels]
         } // end if( m_numInputStreams == 2 )
        else  // m_numInputStreams != 2 : Use the slower, but universal decimate-without-frequency-conversion loop....
         {
           for(i=0; i<iNrSamplePairs; ++i)
            {
              for(chn=0; chn<m_numInputStreams; ++chn)
               {
                 // get next audio sample for filter input:
                 if( (pfltSource2!=NULL) && (m_numInputStreams>1) )
                  { // pointer to RIGHT SOURCE CHANNEL exists
                    //    (a NULL pointer would mean 'paired' data, see below)
                    if(chn==0)  acc.re = pfltSource[i];
                          else  acc.re = pfltSource2[i];
                  }
                 else
                  { // no "split source blocks", but "pairs" of samples:
                    acc.re = pfltSource[i*m_numInputStreams + chn];
                  }
#               if(1) // 1=normal compilation, 0=TEST: real FIR filtering or crude 'skipping' of samples ?
                 // Run the sample (acc.re) through some decimation stages
                 //     with decent low-pass filtering to avoid aliasing .
                 // Principle (for decimate-by-2^n) : Cascaded halfband filters.
                 //     The 1st filter runs at f_sample, and cuts off everything
                 //         above f_sample/4.
                 //     The 2nd filter runs at f_sample/2, so the first (FIR!-)filter
                 //         only needs to calculate every 2nd output value.
                 //         Every (2nd+1)-th sample is only shifted through the
                 //         filter's chain but no MAC's are needed then ...
                 //     and so on.
                 //  For a simple halfband filter, we don't need a long FIR kernel
                 //      (here: SoundTab_DEC_FIR_LENGTH = 25 ) .
                 //  If the necessary lowpass-filtering would be performed in a
                 //  SINGLE FIR-stage, a very long kernel would be required
                 //  for large decimation ratios.
                 //
                 //   Here: Only decimation but no frequency shifting .
                 filter_output_ready = true;
                 for(stage=0; (stage<m_iUsedDecimatorStages) && filter_output_ready; ++stage )
                  {
                    pDecimator = &m_Decimator[chn][stage]; // local pointer to "current" decimator for speed
                    inptr = pDecimator->inptr;  // local copy for speed: pointer to current element in circular queue
                    qptr  = pDecimator->queue;  //   "     "   "    "  : pointer to the queue itself
                    qlen  = SoundTab_DEC_FIR_LENGTH;  // queue length, aka 'number of taps', for example 25 (see ... )
                    if(--inptr < qptr)   // deal with FIR pointer wrap around
                         inptr = qptr+qlen-1;
                    inptr->re = acc.re;  // place in circular Queue (here only the real part)
                    if( (++pDecimator->count) >= pDecimator->ratio)
                     { // calculate the decimation filter now (example decimate by two: only get here for every 2nd sample)
                       pDecimator->count = 0;
                       // Prepare decimation (by 2 or 3).
                       kptr = pDecimator->coeffs; // typically points to SoundTab_Dec2FilterCoeffs[SoundTab_DEC_FIR_LENGTH]
                       acc.re  = 0.0;
                       firptr = inptr;
                       for(j=0; j<qlen; ++j )   // do the MAC's (multiply,accumulate)
                        {
                          acc.re += ( (firptr->re)*(*kptr++) );
                          if( (++firptr) >= qptr+qlen ) //deal with wraparound
                                 firptr  =  qptr;
                        }
                       // filter output for the next stage now in acc.re .
                     } // end if ( ..m_Decimator[].count >= .ratio )
                    else
                     { // Do not CALCULATE the decimation filter now (only place sample in its queue),
                       // because the result would be thrown away anyway .. save a couple of MAC's .
                       //  (example decimate by two: only get here for every 2nd sample)
                       filter_output_ready = false; // no output from the filter this time
                     }
                    // save position in circular delay line
                    pDecimator->inptr = inptr;
                  } // end for <all lowpass & decimation stages>
#               else   // Test: just SKIP samples from the input, without filtering, without antialiasing:
                 filter_output_ready = (i % m_iDecimationRatio) == 0;
                 // 2009-03-14, test result: the spurious signal was NOT caused by the decimator !
#               endif // TEST: real FIR filtering or crude integrate-and-dump ?

                 if (filter_output_ready)
                  { // "decimated" sample is complete.
                    // Emit the value into the destination block..
                    if(m_numInputStreams<=1)
                     {   // only one channel:
                       pfltDest[iDestIndex++] = acc.re;
                     }
                    else      // TWO channels; two separate or one common destination block ?
                     { if( pfltDest2 )
                        { // TWO separate destination blocks :
                          if(chn==0)
                           {  pfltDest[iDestIndex] = acc.re;
                           }
                          else
                           {  pfltDest2[iDestIndex++] = acc.re;
                           }
                        }
                       else // two channels, but only ONE destination block:
                        {     pfltDest[iDestIndex++] = acc.re;
                        }
                     } // end if <two channels>

                    if(iDestIndex>= iDestSize)  // handle buffer wrap ..
                     { iDestIndex = 0;  // (the destination MAY BE a circular buffer!)
                     }

                    // Count the samples emitted during THIS call ..
                    if(chn==0) // (only count ONCE per channel loop)
                     { iCountOutputSamples++;
                     }
                  } // end if <filter_output_ready at the end of the filter cascade>
               } // end for <chn... >
            } // end for <all input samples> [for ANY number of input channels]
         } // end else < m_numInputStreams != 2 >
      } // end if <put sampled data into the ring buffer WITH DOWNSAMPLING>
   } // end if (!complex_input && !frequency_conversion)
  else  // COMPLEX output samples ("I/Q")  and/or FREQUENCY CONVERSION :
   { // Multiply the REAL input samples with the COMPLEX output
     // of a 'numerical controlled oscillator',
     // before running them through a chain of decimation stages.
     int  iCosTableIndex, iSineTableOffset;
     double dblPhzInc;

     // Prepare generation of a complex NCO signal using a cos/sin table .
     // (the IN-phase component in a digital downconverter is multiplied with a COSINE wave,
     //  the Quadrature component should be multiplied with a SINE wave,
     //  see  en.wikipedia.org/wiki/Digital_down_converter .
     // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     // The "I" component is multiplied with a COSINE,
     // the "Q" component with a SINE (*).
     //                  Q (quadrature component)
     //                 /|\ +im
     //                  |
     //                  |
     //        -re <-----+-----> +re I (in-phase comp.)
     //                  |             |
     //                  |             |
     //                 \|/ -im       /
     //                             |/_
     //                            Rotation CLOCKWISE
     //                            for POSITIVE NCO frequency
     //
     //  Complex NCO (as a rotating complex pointer in the above coordinate system)
     //      starts rotating at t=0, I=cos(0)=1,  Q=sin(0)=0 (->"points right") .
     //  (*) To read a SINEWAVE from the COSINE table :  sin(x) = cos(x - 90),
     //                   _____________________________________________|____|
     //                  |
     //      which means iSineTableOffset is NEGATIVE if the NCO frequency is positive(!),
     //                                  and POSITIVE if the NCO frequency is negative(!).
     //  But that would result in negative array indices - see NCO example below.
     //       To avoid the risk of negative array indices, use
     //                   nco.re = cos(x)
     //                   nco.im = sin(x) = -cos(X+90) instead of cos(x-90) .
     //                                      _____________________________|
     //                                     |
     //                                     iSineTableOffset determined below
     SoundTab_GetNcoParams( // prepare the "numeric controlled oscillator" :
        m_dblNcoFrequency/m_dblInputSampleRate, // input : NCO frequency (may be NEGATIVE) divided by sampling rate
        &iSineTableOffset,  // output: positive offset representing 90 phase shift (between cos and -sin)
        &dblPhzInc );       // output: phase increment value (floating point!!)
     if(m_dblNcoPhase<0)
        m_dblNcoPhase=0;
     for(i=0; i<iNrSamplePairs; ++i)
      {
       if (dblPhzInc != 0.0)
        {
          // Increment the NCO phase, and calculate an index into the cosine table:
          m_dblNcoPhase  = fmod( m_dblNcoPhase+dblPhzInc, SOUND_COS_TABLE_LEN );
          // 2015-05-16 : Crashed here with an access violation when TERMINATING.
          iCosTableIndex = (int)m_dblNcoPhase;
          nco_re =  SoundTab_fltCosTable[iCosTableIndex];
          nco_im = -SoundTab_fltCosTable[(iCosTableIndex+iSineTableOffset) % SOUND_COS_TABLE_LEN];
          // 2015-01-21 : After the above modification, a decimated complex WAVE FILE
          //              with m_dblNcoFrequency = 77490 Hz recorded a 77500 Hz signal
          //              at a complex baseband frequency of "+10 Hz" (before that, -10 Hz).
          // 2015-09-09 : A DGPS beacon at 303.5 kHz, SDR-IQ VFO at 300 kHz,
          //              baseband RX frequency = 3500 Hz (=m_dblNcoFrequency),
          //              was not successfully demodulated with I/Q-input anymore.
          //     Got here with iSineTableOffset = 24576 = (3/4) * SOUND_COS_TABLE_LEN,
          //     dblPhzInc = 2064.47, InputSampleRate = 55555 Hz.
          //              I- and Q- input had to be REVERSED (at the input)
          //              for the MSK-demodulator (in C:\CBproj\Digimodes\msk_mod.cpp)
          //              to make the demodulator/DGPS decoder work again.
          //              To avoid trashing OTHER SL-components (which also use CSoundDecimatingBuffer
          //              with complex input and complex "down"-conversion),
          //        NO MODIFICATIONS WERE MADE HERE (in CSoundDecimatingBuffer::ProcessSamples) !
          //        Instead of fixing the bug HERE (at the risk of adding new bugs),
          //        the modification was made ONLY in C:\cbproj\Digimodes\msk_modem.cpp .
          //              Time will tell other places where the "sign of the Q-channel" was wrong...
          //
        }
       else // no complex_mixing (because the DDS's phase increment is ZERO)
        {
          nco_re = 1.0;
          nco_im = 0.0;  // !
        }


        int nChannels = m_numInputStreams;
        if( iConversionMode==SNDDEC_MODE_COMPLEX_IN_COMPLEX_OUT )
         { // If the COMPLEX input is in separate blocks,
           //               pfltSource[] contains the REAL parts,
           //               pfltSource2[] contains the IMAGINARY parts,
           //     and nChannels = 1 (regardless of m_numInputStreams) .
           nChannels = 1;
         }
        for(chn=0; chn<nChannels; ++chn)
         {
           // get next audio sample for filter input.... input may be complex or real !
           switch(iConversionMode)
            {
              case SNDDEC_MODE_REAL_IN_REAL_OUT    : // REAL input, real output
              case SNDDEC_MODE_REAL_IN_COMPLEX_OUT : // real input, COMPLEX output (no difference in this INPUT part)
              default:
                 if( m_numInputStreams==1 )
                  { acc.re = pfltSource[i];   // only one REAL input channel -> simple !
                  }
                 else // (m_numInputStreams>1)
                  {
                    if(pfltSource2!=NULL)
                     { // pointer to RIGHT SOURCE CHANNEL exists
                       //    (a NULL pointer would mean 'paired' data in one block, see below)
                       if(chn==0)  acc.re = pfltSource[i];     // LEFT channel
                             else  acc.re = pfltSource2[i];    // RIGHT channel
                     }
                    else
                     { // no "split source blocks", but "pairs" of samples:
                       acc.re = pfltSource[i*m_numInputStreams +  chn];
                     }
                  }
                 // Generate a complex sample by mixing input sample with NCO's sin/cos
                 //  (this is NOT a complex multiplication; see below)
                 acc.im =  nco_im * acc.re; // acc.re=REAL input -> acc=OMPLEX output !
                 acc.re *= nco_re;
                 break; // end case SNDDEC_MODE_REAL_IN_REAL_OUT (that applies to the INPUT !)

              case SNDDEC_MODE_COMPLEX_IN_COMPLEX_OUT: // COMPLEX input (usually in separate blocks)
              case SNDDEC_MODE_COMPLEX_IN_REAL_OUT:    // future plan, not supported yet
                 // Anything with COMPLEX input, multiplied with a COMPLEX oscillator signal..
                 if( pfltSource2 != NULL )
                  {
                    // In this case, pfltSource[] contains the REAL parts,
                    //               pfltSource2[] contains the IMAGINARY parts,
                    //        and a multiplication of TWO complex values takes place here:
                    //  (a+jb) * (nco_re+j*nco_im)
                    //      = a*nco_re + a*j*nco_im + j*b*nco_re + j*j*b*nco_im
                    //      = a*nco_re - b*nco_im   + j * (a*nco_im + b*nco_re)
                    // with  j*j = -1;  a = pfltSource[i];   b = pfltSource2[i]..
                    // Ex (ended up in the "wrong sideband" with this : )
                    // acc.re = pfltSource[i]*nco_re -  pfltSource2[i]*nco_im;
                    // acc.im = pfltSource[i]*nco_im +  pfltSource2[i]*nco_re;
                    acc.re = pfltSource[i]*nco_re +  pfltSource2[i]*nco_im;
                    acc.im = pfltSource[i]*nco_im -  pfltSource2[i]*nco_re;
                  }
                 else // the input shall be COMPLEX but there's only ONE source channel:
                  {   // in this case, but 'I' and 'Q' are interlaced in one array (re1,im1,  re2,im2, .. ).
                    // The calculation is the same as above, but since there is only ONE SOURCE POINTER,
                    // the input samples can only be arranged like this:
                    //      a = pfltSource[ 2*i ]    ( real part of the input )
                    //      b = pfltSource[ 2*i + 1] ( imaginary part of the input )
                    T_Float a,b;
                    int i2 = 2*i;
                    if( chn==0 )  // complex number from FIRST channel:
                     { a = pfltSource[i2];     // real part
                       b = pfltSource[i2+1];   // imaginary part
                     }
                    else // 2nd channel (also with COMPLEX pairs):
                     { a = pfltSource[i2];     // real part
                       b = pfltSource[i2+1];   // imaginary part
                     }
                    // Multiply COMPLEX input with COMPLEX ocillator signal:
                    acc.re = a*nco_re + b*nco_im;
                    acc.im = a*nco_im - b*nco_re;
                  } // end if < COMPLEX input but only ONE "source pointer" >
                 break; // end case < anything with COMPLEX INPUT >
            } // end switch ( iConversionMode )


          // Pass the complex product 'acc' (an I/Q sample) through some decimation stages
          //     with decent low-pass filtering to avoid aliasing .
          // Decimation principle same as for real-valued decimation,
          //     but complex numbers are used here ("I"+"Q" components).
          filter_output_ready = true;
          for(stage=0; (stage<SoundDec_MAX_DECIMATION_STAGES)
                      && filter_output_ready;    ++stage )
            {
             pDecimator = &m_Decimator[chn][stage];
             if( (filter_output_ready) && (pDecimator->ratio > 1) )
              {
               inptr = pDecimator->inptr; // local copy for better speed..
               qptr  = pDecimator->queue;
               qlen  = SoundTab_DEC_FIR_LENGTH;  // 25 (?)
               inptr--;
               if(inptr < qptr)        // deal with FIR pointer wrap
                  inptr = qptr+qlen-1; // (2 indices per COMPLEX sample!)
               inptr->re = acc.re;     // place real part in circular Queue
               inptr->im = acc.im;     // place imaginary part "   "   "
               if( (++pDecimator->count) >= pDecimator->ratio)
                { // calc decimation filter now:
                  pDecimator->count = 0;
                  // Prepare decimation (by 2 or 3).
                  kptr = pDecimator->coeffs;
                  acc.re  = 0.0;
                  acc.im = 0.0;
                  firptr = inptr;
                  for(j=0; j<qlen; ++j )   // do the complex MAC's (this is where most of the time will be spent...)
                   {                       // 2015-01-14 : qlen=25
                    acc.re += ( (firptr->re)*(*kptr) );
                    acc.im += ( (firptr->im)*(*kptr++) );
                    if( (++firptr) >= qptr+qlen ) //deal with wraparound
                           firptr  =  qptr;
                   }
                  // filter output for the next stage now in acc .
                } // end if ( ..pDecimator->count.. >= .ratio )
               else
                { // do not calc decimation filter now (only shifted queue)
                 filter_output_ready = false; // no output from the filter this time
                }
               // save position in circular delay line
               pDecimator->inptr = inptr;
              } // end if (filter_output_ready && ..Ratio.. > 1) )
            } // end for <all lowpass & decimation stages>
           if (filter_output_ready)
            { // "decimated" sample is complete.
             if(  (m_iConversionMode==SNDDEC_MODE_REAL_IN_COMPLEX_OUT)
                ||(m_iConversionMode==SNDDEC_MODE_COMPLEX_IN_COMPLEX_OUT) )
              { // Emit a truly COMPLEX sample into the buffer.
                if(m_numInputStreams<=1)
                 {   // only one channel:
                     pfltDest[iDestIndex++] = acc.re;
                     pfltDest[iDestIndex++] = acc.im;
                 }
                else  // TWO channels; two separate or one common destination block ?
                 { if( pfltDest2 )
                    { // TWO separate destination blocks :
                      if(chn==0)
                       { pfltDest[iDestIndex]   = acc.re;
                         pfltDest[iDestIndex+1] = acc.im;
                         // don't increment the index; see below !
                       }
                      else // 2nd channel
                       {
                         pfltDest2[iDestIndex++] = acc.re;
                         pfltDest2[iDestIndex++] = acc.im;
                       }
                    }
                   else // two channels, but only ONE destination block:
                    {
                      // Note: For stereo input, and complex (I/Q) output,
                      //         iDestIndex will be incremented
                      //         FOUR times per sample.  Thats ok.
                      pfltDest[iDestIndex++] = acc.re;
                      pfltDest[iDestIndex++] = acc.im;
                    }
                 } // end if <two channels>
              } // end if(m_fComplexBuffer)
             else // filter_output_ready, frequency conversion, but NO complex output (since 2003-07) !
              {   // Principle to convert the complex values back into real numbers:
                  // Assume ..
                  //  fs_in   = sample rate of the input,
                  //            for example 5512 samples/sec.
                  //  fs_cout = sample rate of the last complex decimator,
                  //            for example fs_in/dec=5512/8 = 689 samples/sec.
                  //  The theoretic (almost impossible) frequency range in the
                  //    complex output is -fs_cout/2..+fs_cout/2 = -344..+344 Hz.
                  //    This signal runs through another COMPLEX "halfband" LOWPASS
                  //    with f_cutoff <= fs_cout/4, and -in contrast to the
                  //    other decimation stages- is calculated for EVERY sample
                  //    The output of this final stage (example: -172..+172 Hz),
                  //    is effectively moved up by fs_cout/4 = 172 Hz
                  //    (to produce a REAL-valued output, "positive frequencies")
                  //    by multiplying it with a complex 172Hz-signal.
                  //    No sin/cos required for this, just pick real and imginary
                  //    parts of the complex basband signal (n=acc) as shown below:
                  //    Sample-Index modulo 4: |   Output:
                  //                [0]        |    re(n)
                  //                [1]        |    im(n+1)
                  //                [2]        |    -re(n+2)
                  //                [3]        |    -im(n+2)
                T_Float d;
                SndDec_RunComplexLowpass(  // halfband filter, cutoff at fs/4 (!)
                     &m_Complex2RealDecimator[chn], // pointer to lowpass (or decimator)
                     &acc );        // pointer to in- and output value
                if(chn==0)
                    m_iComplex2RealMixerPhase = (m_iComplex2RealMixerPhase+1) & 3;
                // Multiply the complex stream 'acc' with fs_cout/4,
                //    which moves it into "positive frequencies" (details above)
                switch(m_iComplex2RealMixerPhase)
                 { case 0:  d= acc.re; break;
                   case 1:  d= acc.im; break;
                   case 2:  d=-acc.re; break;
                   default: d=-acc.im; break;
                 }
                // Emit a single, REAL(!) floating-point value :
                if( (m_numInputStreams<=1) || (!pfltDest2) )
                 {   // only one channel, or only one output block :
                     pfltDest[iDestIndex++] = d;
                 }
                else  // TWO channels && two separate destination blocks :
                 {
                   if(chn==0)
                    { pfltDest[iDestIndex] = d;
                      // don't increment the index; see below !
                    }
                   else // 2nd channel
                    {
                      pfltDest2[iDestIndex++] = d;
                    }
                 } // end if <two separate output blocks ?>
              } // end else < !m_fComplexBuffer >
             // wrap the output buffer index if required :
             if (iDestIndex>= iDestSize)
                 iDestIndex = 0;
             // Count the samples emitted during THIS call ..
             if(chn==0) // (only count ONCE per channel loop)
              { iCountOutputSamples++;
              }
            } // end if <filter_output_ready at the end of the filter cascade>
         } // end for(chn=0; chn<m_numInputStreams; ++chn)
      } // end for <all REAL input samples>
   } // end else (complex_input)

  // Add the number of decimated samples to the "total" count
  //  (the "total" count is important for buffer index calculation)
  m_i64OutputSampleCount += iCountOutputSamples; // NON-WRAPPING INDEX !

  // m_dblLatestRecordingTime is closely related to m_i64OutputSampleCount ,
  // so update that 'timestamp' here, too:
  if( pChunkInfo->dblPrecSamplingRate > 0.0 )
   {
     m_dblLatestRecordingTime = pChunkInfo->ldblUnixDateAndTime
         + (double)pChunkInfo->dwNrOfSamplePoints
                 / pChunkInfo->dblPrecSamplingRate;
         // 2019-02-28 : Got an exception with div-by-zero here.
         //              Not sure who the real culprit was.
   }
  else // don't let errors like invalid "chunk info" slip through:
   { DEBUG_EnterErrorHistory( DEBUG_LEVEL_ERROR, 0, UTL_USE_CURRENT_TIME,
       "DecimatingBuffer: Illegal chunk info (PrecSamplingRate=0)" );
   }

  ++m_dwOutputSampleCountModified; // to allow checking integrity of m_i64OutputSampleCount + m_dblLatestRecordingTime,
                                   // see ..GetUnixTimeForOutputSampleIndex() !

#if( SWI_USE_CHUNK_INFO2 )
  if( pDstChunkInfo != NULL )  // added 2012-01-08 22:20 : Make the T_ChunkInfo "valid" now:
   {  pDstChunkInfo->dwSizeOfStruct = sizeof( T_ChunkInfo );
   }
#endif // SWI_USE_CHUNK_INFO2 ?

  return iCountOutputSamples;   // number of samples AFTER decimation
                                // 2015-01-14 : iCountOutputSamples = 256 (input was 4096)
} // end CSoundDecimatingBuffer::ProcessSamples()


/***************************************************************************/
int CSoundDecimatingBuffer::ProcessWithoutBuffering(
       T_Float *pfltData,     // [in,out] first data block (or 'paired' data); receives the result
       T_Float *pfltData2,    // [in,out] second data block (NULL for 'paired' data)
         int iConversionMode,   // [in] SNDDEC_MODE_xxx_IN_yyy_OUT (xxx,yyy = REAL or COMPLEX)
       T_ChunkInfo *pChunkInfo) // [in] #samples, precise SR, Timestamps, GPS, ... (must not be NULL)
  /* In-place decimation: runs ProcessSamples() with the caller's block(s)
   * used as SOURCE and as DESTINATION at the same time, i.e. the input
   * samples are overwritten by the (fewer) output samples.
   *
   * Layout of the data for two-channel operation:
   *   - "paired" (interleaved) in ONE block, pfltData2 MUST be NULL then :
   *        pfltData[0] = left,  pfltData[1] = right,
   *        pfltData[2] = left,  pfltData[3] = right,  ...
   *   - or "separate blocks" :
   *        pfltData [0..NrSamplePairs-1] = left channel,
   *        pfltData2[0..NrSamplePairs-1] = right channel.
   *
   * Depending on iConversionMode, a COMPLEX FREQUENCY CONVERSION
   * and/or DECIMATION may take place.
   *
   * Return value: whatever ProcessSamples() returns, i.e. the number of
   *     sample points AFTER decimation.
   *     MAY VARY from call to call if the number of input samples
   *     is not a multiple of the decimation ratio !
   *
   * Called by the real-time sound thread, also as the "frontend"
   *     (input pre-processor) for an SDR running at > 1 MSample/second
   *     ["Noxon" DAB-Stick : 1.6 MSamples/sec], so every microsecond counts !
   *
   * NOTE(review): the destination wrap limit handed to ProcessSamples()
   *     is pChunkInfo->dwNrOfSamplePoints (a number of sample POINTS),
   *     while ProcessSamples() advances its destination index once per
   *     emitted T_Float.  For 'paired' multi-channel or complex output
   *     WITHOUT decimation the index may wrap inside the block - confirm.
   */
{
  const int nSamplesAfterDecimation = ProcessSamples(
       pfltData,  pfltData2,    // source block(s) ...
       iConversionMode,
       pChunkInfo,
       pfltData,  pfltData2,    // ... are also the destination block(s)
       0,                                 // start writing at the begin of the block
       pChunkInfo->dwNrOfSamplePoints );  // destination index wraps at this value

  return nSamplesAfterDecimation;
} // end CSoundDecimatingBuffer::ProcessWithoutBuffering()



/***************************************************************************/
int CSoundDecimatingBuffer::ProcessAndCopy(  // returns the number of samples AFTER decimation
       T_Float *pfltData,     // [in] first SOURCE block (or 'paired' data)
       T_Float *pfltData2,    // [in] second SOURCE block (NULL for 'paired' data)
         int iConversionMode,   // [in] SNDDEC_MODE_xxx_IN_yyy_OUT (xxx,yyy = REAL or COMPLEX)
       T_ChunkInfo *pChunkInfo, // [in] #samples, precise SR, Timestamps, GPS, ...
       T_Float *pfltDest,     // [out] first DESTINATION block (or 'paired' data)
       T_Float *pfltDest2,    // [out] second DESTINATION block (NULL for 'paired' data)
         int iMaxDestSize)    // [in] max number of T_Floats in each of the destination blocks
  /* Non-buffered processing WITHOUT overwriting the input block:
   * Simply forwards everything to ProcessSamples(), with the caller's
   * own destination block(s) instead of the internal circular buffer.
   * Writing starts at destination index zero;  iMaxDestSize is passed on
   * as the value at which ProcessSamples() wraps its destination index
   * back to zero.
   * Return value: result of ProcessSamples(), i.e. the number of
   *               sample points AFTER decimation.
   */
{
  const int nSamplesAfterDecimation = ProcessSamples(
       pfltData,  pfltData2,   // where the input comes from
       iConversionMode,
       pChunkInfo,
       pfltDest,  pfltDest2,   // where the output goes to
       0,                      // first destination index
       iMaxDestSize );         // destination size (index wraps here)

  return nSamplesAfterDecimation;
} // end CSoundDecimatingBuffer::ProcessAndCopy()


/***************************************************************************/
BOOL CSoundDecimatingBuffer::EnterSamples(
       T_Float *pfltSource,  // pointer to LEFT SOURCE CHANNEL (or 'paired' data, e.g. I+Q interleaved)
       T_Float *pfltSource2, // pointer to RIGHT SOURCE CHANNEL (NULL for 'paired' data)
         int iConversionMode,  // SNDDEC_MODE_xxx_IN_yyy_OUT (xxx,yyy = REAL or COMPLEX)
       T_ChunkInfo *pInChunkInfo) // [in] #sample points, precise sample rate, date+time,
                                  //     'radio' frequency, GPS position
                                  // (for the first sample in the source buffer)
  /* Puts a number of audio samples into a circular buffer.
   * One or two channels are supported, depending on parameters set in :Init() .
   * The source data for 2-channel-operation may be either "paired" :
   *     pfltSource[0] = left channel, pfltSource[1] = right channel,
   *     pfltSource[2] = left channel, pfltSource[3] = right channel, etc..
   *  ... or "separate blocks", in this case with a 2nd source pointer:
   *     pfltSource[0..NrSamplePairs-1]  = left channel,
   *     pfltSource2[0..NrSamplePairs-1] = right channel.
   * If the source data are "paired", pfltSource2 MUST be NULL .
   *
   * Optionally, a COMPLEX FREQUENCY CONVERSION and/or DECIMATION
   *             can take place (BEFORE putting the values into the buffer!)
   *
   * Doesn't care if the buffer is already full: The buffer will always
   *             contain the "most recent" samples, available for
   *             MULTIPLE readers (with different 'i64UniqueIndex' values).
   *
   * Return value: TRUE  = at least one (decimated) sample was placed in the buffer,
   *               FALSE = nothing entered.  This includes a NULL chunk info,
   *                       an empty input chunk, or a buffer which has not
   *                       been allocated / configured yet.
   */
{
 long lBufferIndex;
 int  iCountDecimatedSamples;

  // The chunk info is mandatory: It carries the number of samples, and it is
  // dereferenced unconditionally further below (and in ProcessSamples).
  // Formerly a NULL pointer was only tested AFTER it had already been
  // dereferenced, so the test could never do its job.
  if( pInChunkInfo == NULL )
   { m_ldblPrevCheckedTimestamp = 0.0; // no timestamp -> restart the plausibility check
     return FALSE;
   }

  if( (pInChunkInfo->dwNrOfSamplePoints<=0) || (m_lBufferSize<=0) || (m_pfltBuffer==NULL) || (m_numInputStreams<=0) )
     return FALSE;


#   if(1)   // plausibility check for the timestamps (put into the buffer) ?
     if( pInChunkInfo->dblPrecSamplingRate <= 0.0 )  // ERROR (don't ignore) !
      { DEBUG_EnterErrorHistory( DEBUG_LEVEL_ERROR, 0, UTL_USE_CURRENT_TIME,
          "DecBuffer.EnterSamples: Illegal chunk info (PrecSamplingRate=0)" );
      }
     else
     if( m_ldblPrevCheckedTimestamp > 0 )
      { // Timestamp expected for this chunk, extrapolated from the previous one:
        long double ldblTimestamp = m_ldblPrevCheckedTimestamp
         + (long double)pInChunkInfo->dwNrOfSamplePoints / pInChunkInfo->dblPrecSamplingRate;
        double d = ( ldblTimestamp - pInChunkInfo->ldblUnixDateAndTime )
                   * pInChunkInfo->dblPrecSamplingRate; // -> d = number of SAMPLES (!)
        if( (d<-1.5) || (d>1.5) )
         { d = d;  // <<< set breakpoint here <<<
           // 2012-01-08 : Added this after problems with timestamps in Vorbis streams.
           //  2012-01-08 21:20 : Got here with d = 1.01 [samples] ?!
           //      Caller: SOUND_RunProcessingChain() in SoundThd.cpp .
           //   But this cannot explain the 0.127 second error observed
           //   when timestamped data were pulled out of the CSoundDecimatingBuffer
           //   - see C_VorbisFileIO::WriteSamples_Float() .
         }
      }
     m_ldblPrevCheckedTimestamp = pInChunkInfo->ldblUnixDateAndTime;
#   endif   // plausibility check for the timestamps ?


  // Sample counter (m_i64OutputSampleCount) used as buffer index:
  lBufferIndex = (long)( (m_i64OutputSampleCount * m_nOutBufCompsPerSample) % m_lBufferSize);

  if(  (lBufferIndex<0) || (lBufferIndex >= m_lBufferSize) )
    {  /* handle 'old' buffer wrap-around : */
      lBufferIndex=0;
    }

  iCountDecimatedSamples = ProcessSamples(
       pfltSource,  // pointer to LEFT SOURCE CHANNEL (or 'paired' data)
       pfltSource2, // pointer to RIGHT SOURCE CHANNEL (may be NULL)
       iConversionMode,  // SNDDEC_MODE_xxx_IN_yyy_OUT (xxx,yyy = REAL or COMPLEX)
       pInChunkInfo,   // number of sample points (not single floats), etc etc
       m_pfltBuffer, // DESTINATION: the internal circular buffer
       NULL    ,     // no second destination block (everything goes into ONE buffer)
       lBufferIndex, // start index for destination buffer
       m_lBufferSize );  // length of destination buffer (for index wrap!)

 if( m_iDecimationRatio==1 ) // sanity check ... only without decimation :
  { if( iCountDecimatedSamples != (int)pInChunkInfo->dwNrOfSamplePoints )
     { iCountDecimatedSamples = iCountDecimatedSamples; // <<< something fishy.. set breakpoint here
       // 2014-05-24: Audio streamed via CSoundDecimatingBuffer + C_VorbisFileIO was stuttering,
       //             but the sanity check here did NOT fail .
     }
  }

  return (iCountDecimatedSamples>0);
} // end CSoundDecimatingBuffer::EnterSamples()


/***************************************************************************/
long double CSoundDecimatingBuffer::GetUnixTimeForOutputSampleIndex(
              LONGLONG i64UniqueIndex) // [in] index of sampling point
  // Determines the 'timestamp' for a certain OUTPUT-SAMPLE-INDEX in the buffer .
  // The result is m_dblLatestRecordingTime, moved back in time by the
  // distance between i64UniqueIndex and the current output sample count
  // (that distance is limited to the buffer capacity, and the result
  //  is never negative).  For an index at or beyond the current output
  // sample count, m_dblLatestRecordingTime is returned unmodified.
  //
  // Caution: i64UniqueIndex is not compatible with
  //          T_ChunkInfo.i64TotalSampleCounter of the *INPUT* samples,
  //     i.e. the chunk-info passed into CSoundDecimatingBuffer::EnterSamples() !
  //          (reason: different sampling rates at input and output,
  //                   which makes those 'sample counters' incompatible)
{
  long double ldblResult;
  DWORD dwModCountAtStart;

  // m_dblLatestRecordingTime and m_i64OutputSampleCount may be written
  // in a different thread.  If the 'modified'-counter changed while we were
  // busy here, the calculation is simply repeated (happens VERY rarely) :
  do
   { dwModCountAtStart = m_dwOutputSampleCountModified;
     ldblResult = m_dblLatestRecordingTime;
     if( (m_dblOutputSampleRate>0) && (m_nOutBufCompsPerSample>0) ) // prevent div-by-zero..
      {
        LONGLONG i64SamplesBack = m_i64OutputSampleCount - i64UniqueIndex;
        if( i64SamplesBack >= 0 )
         { // caller isn't asking for the "latest entry", but for an older one.
           // Cannot look back further than the capacity of the buffer:
           const LONGLONG i64Capacity = m_lBufferSize/m_nOutBufCompsPerSample;
           if( i64SamplesBack > i64Capacity )
            {  i64SamplesBack = i64Capacity;
            }
           ldblResult -= (double)i64SamplesBack / m_dblOutputSampleRate;
           if( ldblResult < 0.0 )
            {  ldblResult = 0.0;
            }
         }
      }
   } while( dwModCountAtStart != m_dwOutputSampleCountModified );

  return ldblResult;
} // end CSoundDecimatingBuffer::GetUnixTimeForOutputSampleIndex()

/***************************************************************************/
LONGLONG CSoundDecimatingBuffer::GetLatestOutputSampleIndex( void )
  // Returns the current value of the non-wrapping output sample counter
  // (m_i64OutputSampleCount).  EnterSamples() uses the same counter as the
  // write position for the NEXT sample, so a reader whose 'unique index'
  // equals this value is completely up-to-date.
{
  return m_i64OutputSampleCount;
} // end GetLatestOutputSampleIndex()


/***************************************************************************/
LONGLONG CSoundDecimatingBuffer::GetOldestOutputSampleIndex( void )
  // Returns the "output-sample-index for the OLDEST available sample in the buffer":
  // the current output sample count minus the buffer capacity (in sample points),
  // but never less than zero (as long as the buffer hasn't been filled once).
{
  LONGLONG i64Oldest = m_i64OutputSampleCount;

  if( m_nOutBufCompsPerSample > 0 )  // avoid div-by-zero
     i64Oldest -= m_lBufferSize/m_nOutBufCompsPerSample;

  return (i64Oldest < 0) ? 0 : i64Oldest;
} // end GetOldestOutputSampleIndex()


/***************************************************************************/
long CSoundDecimatingBuffer::GetOccupiedBufferSpace(
              LONGLONG i64UniqueIndex) // pair index of 1st sample to be read
  /* Calculates the number of SAMPLE PAIRS still waiting in the buffer,
   *      seen from the reader who owns i64UniqueIndex :
   *      output sample count minus i64UniqueIndex, limited to the
   *      range 0 .. buffer capacity (in sample points).
   *      Returns 0 if the number of components per sample isn't valid yet.
   *      Originally used for debugging and to detect how much buffer
   *      was really required (depending on the CPU speed).
   */
{
  if( m_nOutBufCompsPerSample <= 0 )
   { return 0;
   }

  // NOTE: In SoundUps, 'm_iNrInputComponents' is used (INTERPOLATE on READ)
  //       In SoundDec, 'm_nOutBufCompsPerSample' is used (DECIMATE on ENTER)
  const long     lCapacity  = m_lBufferSize/m_nOutBufCompsPerSample;
  const LONGLONG i64Waiting = m_i64OutputSampleCount - i64UniqueIndex;

  if( i64Waiting > lCapacity )
   { // reader fell behind by more than the buffer can hold:
     return lCapacity;
   }
  if( i64Waiting < 0 )
   { // the reader's index is "totally off" (ahead of the writer):
     return 0;
   }
  return (long)i64Waiting;
} // end CSoundDecimatingBuffer::GetOccupiedBufferSpace()

/***************************************************************************/
double CSoundDecimatingBuffer::GetOccupiedBufferSpace_seconds(LONGLONG i64UniqueIndex)
  /* Returns the occupied buffer capacity in SECONDS OF RECORDING TIME,
   * or zero as long as the output sample rate isn't known.
   */
{
  const long nSamplePoints = GetOccupiedBufferSpace(i64UniqueIndex);

  // The buffer contains DECIMATED samples, thus the number of sample points
  // is divided by m_dblOutputSampleRate, not m_dblInputSampleRate .
  if( m_dblOutputSampleRate > 0.0 )
   { return (double)nSamplePoints / m_dblOutputSampleRate;
   }
  return 0.0;  // avoid div-by-zero when called "too early"
} // GetOccupiedBufferSpace_seconds()


/***************************************************************************/
long CSoundDecimatingBuffer::GetFreeBufferSpace(
              LONGLONG i64UniqueIndex) // pair index of 1st sample to be read
  /* Calculates the FREE (unused) number of SAMPLE PAIRS in the internal buffer,
   *      from a certain READERS's point of view :
   *      total buffer space minus the space occupied for that reader.
   *    (remember, there may be
   *      multiple readers but one writer; and each reader maintains
   *      his own 64-bit "unique index". Example in SoundThd.cpp ) .
   *
   *  Note: The 'internal buffer' possibly contains DECIMATED samples.
   *        To get a rough estimate for the "INPUT buffer space"
   *        (from the WRITER's point of view, i.e. for the caller of
   *         CSoundDecimatingBuffer::EnterSamples() ),
   *        multiply the result with m_iDecimationRatio !
   *        (example in  SoundThd.cpp : SOUND_RunProcessingChain() )
   *
   *      Originally used for debugging and to detect how much buffer
   *      was really required (depending on the CPU speed).
   */
{
  const long lTotal    = GetTotalBufferSpace();
  const long lOccupied = GetOccupiedBufferSpace(i64UniqueIndex);
  long lFree = lTotal - lOccupied;

  if( lFree < 0 )   // fishy: more occupied than the buffer can hold ?
   { lFree = 0;
     DEBUGGER_BREAK();  // set breakpoint here
   }
  return lFree;  // -> number of SAMPLE POINTS which are still "free",
                 //    from the caller's point of view .
} // end CSoundDecimatingBuffer::GetFreeBufferSpace()


/***************************************************************************/
long CSoundDecimatingBuffer::GetTotalBufferSpace(void)
  /* Returns the total number of SAMPLE POINTS (pairs, quads, or whatever)
   * which can be placed in the buffer :  buffer size (in single components)
   * divided by the number of components per sample point.
   * Returns zero if the number of components per sample isn't valid.
   */
{
  return ( m_nOutBufCompsPerSample > 0 )
       ? ( m_lBufferSize / m_nOutBufCompsPerSample )
       : 0;  // no "zero-length" components are allowed in my pretty buffer ;-)
} // GetTotalBufferSpace()

/***************************************************************************/
double CSoundDecimatingBuffer::GetTotalBufferSpace_seconds(void)
  /* Returns the total buffer capacity in SECONDS OF RECORDING TIME,
   * or zero as long as the output sample rate isn't known.
   */
{
  const long nSamplePoints = GetTotalBufferSpace();

  // The buffer contains DECIMATED samples, thus the number of sample points
  // is divided by m_dblOutputSampleRate, not m_dblInputSampleRate .
  if( m_dblOutputSampleRate > 0.0 )
   { return (double)nSamplePoints / m_dblOutputSampleRate;
   }
  return 0.0;  // avoid div-by-zero when called "too early"
} // GetTotalBufferSpace_seconds()


/***************************************************************************/
long CSoundDecimatingBuffer::GetSamplePoints(
               int iNrSamplePoints, // [in] max number of SAMPLE PAIRS (with one or more "CHANNELS" per sample)
         LONGLONG *pi64UniqueIndex, // [in,out] pair index of 1st sample to be read, will be updated for next call
                 T_Float *pfltDest, // [out] destination buffer, IF 32-BIT FLOATING POINT
                  SHORT  *pi16Dest, // [out} destination buffer, IF 16-BIT INTEGERS
           T_ChunkInfo *pChunkInfo) // [out] precise sampling rate, date+time, "radio" frequency, GPS data
  /* Copies some audio samples from an internal buffer into the caller's buffer.
   *    The maximum number of samples copied this way should be  less than
   *    the "internal" buffer size (which has been defined upon creation).
   *  Reading samples this way doesn't affect the buffer's content, because
   *    MULTIPLE callers can read from the same buffer at different positions.
   *    (which makes checking for 'buffer overflows' on entry a bit difficult)
   *
   *
   * Parameters:
   *   lMaxSamples:  Count of samples the caller wants to get from the buffer
   *                 as A MAXIMUM COUNT (caller can handle less than this size).
   *                 If the buffer contains complex samples (I/Q pairs),
   *                 lMaxSamples must be an even number.
   *
   *  *pi64UniqueIndex: A continously growing index for the first sample to be
   *         copied. For continuous processing, this index will be incremented
   *         by the number of samples (or SAMPLE PAIRS).
   *         If the value is totally wrong, it will be corrected here.
   *   Return value:  <0 : error, i64UniqueIndex is completely invalid.
   *                   0 : no samples available
   *                  >0 : success, number of samples placed in dst.
   * Notes:
   *  -  If the buffer holds COMPLEX values (I/Q samples),
   *     2 * <iNrSamplePairs> 64-bit floating point values are copied !
   *     Buffer format for complex samples:
   *             dst[2*k]   = real part of complex value[k]
   *             dst[2*k+1] = imaginary part,  with 0 <= k <= lMaxSamples/2 .
   *  -  If the buffer is configured for STEREO MODE, with complex (I/Q-)output,
   *     4 * <iNrSamplePairs> 64-bit floating point values are copied !
   */
{
 long l, lBufIndex;
 int  iComponent;
 LONGLONG i64;
 // static T_Float fltVcoPhase=0;  // statics are EVIL ! Weg damit !
 long lBufferSizeAsSamplePairs;
 if( m_nOutBufCompsPerSample<=0 || m_lBufferSize<=0 )
  { return -1;  // this will never work
  }

 lBufferSizeAsSamplePairs = m_lBufferSize/m_nOutBufCompsPerSample;

 // How many SAMPLE PAIRS can be delivered for the caller's index ?
 //  ( the 'unique sample index' must be limited BEFORE calling
 //    ChunkInfoArray_GetInterpolatedEntry(), because ChunkInfo2.c
 //    only keeps a limited number of 'chunk infos' in memory )
 //
 i64 = m_i64OutputSampleCount - *pi64UniqueIndex;    // 2016-10-28 : i64=120000 (10 seconds of "pre-trigger" data with fs=12 kHz)
 if(i64 >= lBufferSizeAsSamplePairs)                 // 2016-10-28 : lBufferSizeAsSamplePairs = 132000 (i.e. "ok, we still have those samples")
  { // the caller's index is too large / 'too old'. He must have been sleeping...
    *pi64UniqueIndex = m_i64OutputSampleCount - lBufferSizeAsSamplePairs;
    if(*pi64UniqueIndex<0)  // correct the caller's buffer index..
       *pi64UniqueIndex=0;  // ..to get 'up-to-date' again
    i64 = m_i64OutputSampleCount - *pi64UniqueIndex; // cannot be < 0 now
  }

 if(i64<0)
  { // the caller's index is 'looking at the future' - tell him he's wrong
    *pi64UniqueIndex = m_i64OutputSampleCount;
    return -1;
  }


  if(iNrSamplePoints > i64)  // limit the number of 'delivered samples'
   { iNrSamplePoints = (int)i64;
     // 2012-01-08 : Repeatedly got here with i64 = 7. That's not worth
     //              delivering anything to the caller -> modified !
     //              The MINIMUM number of samples returned to the caller
     //              is now a quarter of the "requested" (wanted) value .
   }

  if( iNrSamplePoints<=0 )
   { return 0;    // nothing in the buffer now, see you later...
     // (modified 2015-01-19, formerly bailed out if less than a quarter of the requested size
     //  was available, but due to decimation the caller must be able to process even A SINGLE sample)
   }


  /* Calculate the true buffer start index for the requested samples.      */
  /* Note: CSoundDecimatingBuffer uses a circular buffer which only holds  */
  /*       the "latest" recorded samples                                   */
  lBufIndex = (long)((*pi64UniqueIndex * m_nOutBufCompsPerSample) % m_lBufferSize);
  if (lBufIndex<0)
   {  lBufIndex += m_lBufferSize;  // "inverse" wrap (rarely ever happens)
   }

  // Retrieve the 'chunk info' (including UTC, radio frequency, GPS pos)
  // for the requested buffer index (counter or OUTPUT samples):
  if( pChunkInfo )    // caller needs a CHUNK INFO which exactly matches the first sample
   {                  // which we're about to read from the (decimating) buffer.
     // This is not as trivial as it seems .. so WB put it in an extra module :
#   if( SWI_USE_CHUNK_INFO2 ) // SoundUtl/ChunkInfo2.c (with T_ChunkArray) present ?
     if( ! ChunkInfoArray_GetInterpolatedEntry(
                        &m_ChunkArray, // [in] an array of ChunkInfo entries
                         pChunkInfo,   // [out] precise sampling rate, date+time, "radio" frequency, GPS data
                     *pi64UniqueIndex))// [in] compared with pChunkInfoArray->i64TotalSampleCounter
       { return 0;   // <<< set breakpoint here <<< ... not a single T_ChunkInfo available ?
         // 2016-10-28 : GetSamplePoints() bailed out here because the 'chunk history'
         //              had been flooded by calling ChunkInfoArray_Append() too frequently
         //                                     from __ ,
         //              so that m_ChunkArray.nUsedEntries reached CHUNK_ARRAY_MAX_ENTRIES
         //              before the SAMPLE BUFFER was full !
       }
     else   // Successfully retrieved a time-interpolated chunk info.
       {    // Note: A few(!) of the T_ChunkInfo's members have already
            // been modified when ENTERING those chunk-infos in the queue
            //  - see CSoundDecimatingBuffer::ProcessSamples() !
#   if(1)   // plausibility check for the timestamps (pulled from the buffer) ?
        if( m_ldblNextExpectedOutputTimestamp > 0 )
         {
           double d = ( pChunkInfo->ldblUnixDateAndTime - m_ldblNextExpectedOutputTimestamp )
                      * pChunkInfo->dblPrecSamplingRate; // -> d = number of SAMPLES (!)
           if( (d<-1.5) || (d>1.5) )
            { d = d;  // <<< set breakpoint here <<<
              // 2012-01-08 : Added this after problems with timestamps in Vorbis streams.
              //  2012-01-08 21:00 : NEVER got here even though
              //                     a problem with the timestamp was detected
              //                     a bit later in C_VorbisFileIO::WriteSamples_Float() .
            }
         }
#   endif   // plausibility check for the timestamps ?
       }
#   elif( SWI_USE_CHUNK_INFO ) // SoundUtl/ChunkInfo.c present ?
     // ! SWI_USE_CHUNK_INFO2 -> cannot interpolate, so just return the latest chunk-info we have
     ChunkInfo_CopyFromTo( &m_ChunkInfo, pChunkInfo, sizeof(T_ChunkInfo) );
#   endif //  SWI_USE_CHUNK_INFO[2] ?

   } // end if( pChunkInfo )

 // Now copy the audio samples into the caller's buffer, so that the LAST
 // copied sample will be the LATEST sample that has been recorded.
 if(pfltDest!=NULL)
  {
   for(l=0; l<iNrSamplePoints; ++l)
    {
     for(iComponent=0; iComponent<m_nOutBufCompsPerSample; ++iComponent)
      {
       if (lBufIndex>=m_lBufferSize)
           lBufIndex=0;
       *pfltDest++ = m_pfltBuffer[lBufIndex++];
      } // end <loop for all COMPONENTS of a sample point>
    } // end <loop for SAMPLE POINTS>
  } // end if <copy into destination as FLOAT's >
 else
 if(pi16Dest!=NULL)
  {
   T_Float d; // << only for sample values, not for the NCO
   for(l=0; l<iNrSamplePoints; ++l)
    {
     for(iComponent=0; iComponent<m_nOutBufCompsPerSample; ++iComponent)
      {
       if (lBufIndex>=m_lBufferSize)
           lBufIndex=0;
       d = m_pfltBuffer[lBufIndex++];
       if(d<-32767) d=-32767;  // "-32768" is reserved for special occasions ;-)
       if(d> 32767) d= 32767;
       *pi16Dest++ = (SHORT)d;
      } // end <loop for COMPONENTS in a sample point>
    } // end <loop for SAMPLE POINTS>
  } // end if <copy into destination as 16-BIT INTEGERs >

 if(iNrSamplePoints>0)    // keep the caller's buffer index up-to-date
   { *pi64UniqueIndex+=iNrSamplePoints;
     if( pChunkInfo )  // don't assume the caller always uses 'Chunk Info' !
      {                // (for example, the 'Sound-Input-Utility' does NOT)
        m_ldblNextExpectedOutputTimestamp = pChunkInfo->ldblUnixDateAndTime
           + (long double)iNrSamplePoints / pChunkInfo->dblPrecSamplingRate;
        pChunkInfo->dwNrOfSamplePoints = iNrSamplePoints; // !!
      }
   }
 return iNrSamplePoints;  // returns the number of samples placed in the caller's buffer
} // end   CSoundDecimatingBuffer::GetSamplePoints()


/***************************************************************************/
void CSoundDecimatingBuffer::SetFastMode(int iFastMode)
{ // "fast mode" means less accurate anti-aliasing filtering but higher speed
  m_iFastAndUgly = iFastMode;
} // end CSoundDecimatingBuffer::SetFastMode()

/***************************************************************************/
int CSoundDecimatingBuffer::GetFastMode(void)
{
  return m_iFastAndUgly;
} // end CSoundDecimatingBuffer::GetFastMode()
