diff --git a/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt b/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt
index e004746f046157c22a0c677b50aec3d0549a439d..cbd24ffa5edd3f502c5840525e726117043e3ce8 100644
--- a/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt
+++ b/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt
@@ -22,85 +22,96 @@ Pipeline
 For max size, we assume:
  - 48 stations
  - 1 subband
- - 1 second blocks (195312.5 samples) rounded to next multiple of 4096
+ - 1 second blocks (195312.5 samples) rounded up to the next multiple of 4096 (= 196608 samples).
 
 Note:
-  MiB = 2^20 bytes (1048576)
-  (*) = requires change from current implementation
+  MiB = 2^20 bytes (= 1048576 bytes).
 
 Flow:           Data dimensions:                        Max size (fcomplex):                        Buffer:
 ===================================================================================================================
-(input)         [station][samples][pol]                 [48][196608][2]     =  72 MiB               A
+(input)         [station][sample][pol]                  [48][196608][2]     =  72 MiB               A
    |                                                    (i16complex)
    V
 IntToFloat + Transpose
-   |            [station][pol][samples]                 [48][2][196608]     = 144 MiB               B
+   |            [station][pol][sample]                  [48][2][196608]     = 144 MiB               B
+   V
+FFT-shift {inplace}
+   |            [station][pol][sample]                  [48][2][196608]     = 144 MiB               B
    V
 FFT-64 {inplace}
-   |            [station][pol][samples][channel]        [48][2][3072][64]   = 144 MiB               B
+   |            [station][pol][sample][channel]         [48][2][3072][64]   = 144 MiB               B
+   V
+Delay compensation + Transpose {I/O: delays}
+   |            [station][pol][channel][sample]         [48][2][64][3072]   = 144 MiB               A
    V
-Delay compensation (*: no transpose) {I/O: delays}
-   |            [station][pol][channel][samples]        [48][2][64][3072]   = 144 MiB               A
+FFT-shift {inplace}
+   |            [station][pol][channel][sample]         [48][2][64][3072]   = 144 MiB               A
    V
 FFT-64 {inplace}
-   |            [station][pol][chan1][samples][chan2]   [48][2][64][48][64] = 144 MiB               A
+   |            [station][pol][chan1][sample][chan2]    [48][2][64][48][64] = 144 MiB               A
    V
 BandPass + Transpose {I/O: weights}
-   |            [station][chan1][chan2][samples][pol]   [48][64][64][48][2] = 144 MiB               B
-   V          = [stations][channel][samples][pol]
+   |            [station][chan1][chan2][sample][pol]    [48][64][64][48][2] = 144 MiB               B
+   V          = [station][channel][sample][pol]
    X
 
 Complex Voltages/Coherent Stokes:
 -----------------------------------
-   X            [station][channel][samples][pol]        [48][4096][48][2]   = 144 MiB               B
+   X            [station][channel][sample][pol]         [48][4096][48][2]   = 144 MiB               B
    |
    V
 BeamFormer {I/O: weights}
-   |            [channel][samples][tab][pol]            [4096][48][tab][2]  = 3 MiB/TAB             A
+   |            [channel][sample][tab][pol]             [4096][48][tab][2]  = 3 MiB/TAB             A
    V
 Transpose  
-   |            [tab][pol][samples][channel]            [tab][2][48][4096]  = 3 MiB/TAB             1ch: CS: C, CV: D
+   |            [tab][pol][sample][channel]             [tab][2][48][4096]  = 3 MiB/TAB             1ch: CS: C, CV: D
    |                                                                                                Nch: CS: D, CV: C
    V
 iFFT-4k {inplace}
-   |            [tab][pol][samples]                     [tab][2][196608]    = 3 MiB/TAB             1ch: CS: C, CV: D
+   |            [tab][pol][sample]                      [tab][2][196608]    = 3 MiB/TAB             1ch: CS: C, CV: D
+   |                                                                                                Nch: CS: D, CV: C
+   V
+FFT-shift {inplace}
+   |            [tab][pol][sample]                      [tab][2][196608]    = 3 MiB/TAB             1ch: CS: C, CV: D
    |                                                                                                Nch: CS: D, CV: C
    V
 FIR-16 (if >1ch)
-   |            [tab][pol][samples]                     [tab][2][196608]    = 3 MiB/TAB             1ch: CS: -, CV: -
+   |            [tab][pol][sample]                      [tab][2][196608]    = 3 MiB/TAB             1ch: CS: -, CV: -
    |                                                                                                Nch: CS: C, CV: D
    V
 FFT-16 {inplace} (if >1ch)
-   |            [tab][pol][samples][channel]            [tab][2][12288][16] = 3 MiB/TAB             1ch: CS: -, CV: -
+   |            [tab][pol][sample][channel]             [tab][2][12288][16] = 3 MiB/TAB             1ch: CS: -, CV: -
    |                                                                                                Nch: CS: C, CV: D
    V
-Coherent Stokes (*: no transpose)
-   |            [tab][stokes][samples][channel]         [tab][4][12288][16] = 0.75 MiB/TAB/Stokes   1ch: CS: D, CV: -
+Coherent Stokes
+   |            [tab][stokes][sample][channel]          [tab][4][12288][16] = 0.75 MiB/TAB/Stokes   1ch: CS: D, CV: -
    |                                                    (float)                                     Nch: CS: D, CV: -
    V
 (output)
 
 Incoherent Stokes:
 -----------------------------------
-   X            [station][channel][samples][pol]        [48][4096][48][2]   = 144 MiB               B
+   X            [station][channel][sample][pol]         [48][4096][48][2]   = 144 MiB               B
    |
    V
 Transpose + Copy
-   |            [station][pol][samples][channel]        [48][2][48][4096]   = 144 MiB               A
+   |            [station][pol][sample][channel]         [48][2][48][4096]   = 144 MiB               A
    V
 iFFT-4k {inplace}
-   |            [station][pol][samples]                 [48][2][196608]     = 144 MiB               A
+   |            [station][pol][sample]                  [48][2][196608]     = 144 MiB               A
+   V
+FFT-shift {inplace}
+   |            [station][pol][sample]                  [48][2][196608]     = 144 MiB               A
    V
 FIR-16 (if >1ch) 
-   |            [station][pol][samples]                 [48][2][196608]     = 144 MiB               B
+   |            [station][pol][sample]                  [48][2][196608]     = 144 MiB               B
    |
    V
 FFT-16 {inplace} (if >1ch)
-   |            [station][pol][samples][channel]        [48][2][12288][16]  = 144 MiB               B
-   |
+   |            [station][pol][sample][channel]         [48][2][12288][16]  = 144 MiB               B
    V
-Incoherent Stokes (*: no transpose)
-   |            [stokes][samples][channel]              [4][12288][16]      = 3 MiB                 E
+Incoherent Stokes
+   |            [stokes][sample][channel]               [4][12288][16]      = 3 MiB                 E
    V                                                    (float)
 (output)
 
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc
index 78601b3b78abb6ea8f02af3e24db5fd9cfc20c00..f33b69542d2b811aa67402b17df6704f784d91f6 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc
@@ -34,7 +34,7 @@ namespace LOFAR
         delayCompensation(delayCompensationParams(ps)),
         beamFormer(beamFormerParams(ps)),
         coherentTranspose(coherentTransposeParams(ps)),
-        fftShiftKernel(FFTShiftKernelParams(ps)),
+        fftShift(fftShiftParams(ps)),
         firFilter(firFilterParams(ps, nrSubbandsPerSubbandProc)),
         coherentStokes(coherentStokesParams(ps)),
         incoherentStokes(incoherentStokesParams(ps)),
@@ -129,7 +129,7 @@ namespace LOFAR
       }
 
       FFTShiftKernel::Parameters
-      BeamFormerFactories::FFTShiftKernelParams(const Parset &ps) const
+      BeamFormerFactories::fftShiftParams(const Parset &ps) const
       {
         FFTShiftKernel::Parameters params(ps);
         // Currently a static in the subband proc
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h
index 9cbc56f4ce3ba9758a837ee867fcf8c9214352e5..fe2400634f55219c62b19421a4b838f8f525c9ad 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h
+++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h
@@ -50,7 +50,7 @@ namespace LOFAR
       KernelFactory<DelayAndBandPassKernel> delayCompensation;
       KernelFactory<BeamFormerKernel> beamFormer;
       KernelFactory<CoherentStokesTransposeKernel> coherentTranspose;
-      KernelFactory<FFTShiftKernel> fftShiftKernel;
+      KernelFactory<FFTShiftKernel> fftShift;
       KernelFactory<FIR_FilterKernel> firFilter;
       KernelFactory<CoherentStokesKernel> coherentStokes;
       KernelFactory<IncoherentStokesKernel> incoherentStokes;
@@ -74,7 +74,7 @@ namespace LOFAR
       delayCompensationParams(const Parset &ps) const;
 
       FFTShiftKernel::Parameters
-        FFTShiftKernelParams(const Parset &ps) const;
+      fftShiftParams(const Parset &ps) const;
 
       FIR_FilterKernel::Parameters
       firFilterParams(const Parset &ps, size_t nrSubbandsPerSubbandProc) const;
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc
index 8738668893b08229f54e6b3fc5644393c70a39ae..ec57b6b3e6ced765564b4f3a33d435bbc684321f 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc
@@ -98,6 +98,11 @@ namespace LOFAR
       intToFloatBuffers(devInput.inputSamples, devB),
       intToFloatKernel(factories.intToFloat.create(queue, intToFloatBuffers)),
 
+      // FFTShift: B -> B
+      firstFFTShiftBuffers(devB, devB),
+      firstFFTShiftKernel(
+        factories.fftShift.create(queue, firstFFTShiftBuffers)),
+
       // FFT: B -> B
       firstFFT(queue,
         ps.settings.beamFormer.nrDelayCompensationChannels,
@@ -112,6 +117,11 @@ namespace LOFAR
       delayCompensationKernel(
         factories.delayCompensation.create(queue, delayCompensationBuffers)),
 
+      // FFTShift: A -> A
+      secondFFTShiftBuffers(devA, devA),
+      secondFFTShiftKernel(
+        factories.fftShift.create(queue, secondFFTShiftBuffers)),
+
       // FFT: A -> A
       secondFFT(queue,
         ps.settings.beamFormer.nrHighResolutionChannels /
@@ -167,6 +177,12 @@ namespace LOFAR
          ps.settings.beamFormer.nrHighResolutionChannels),
          false, coherentTransposeBuffers.output),
 
+      // inverse FFTShift: C/D -> C/D (in-place) = coherentTransposeBuffers.output
+      inverseFFTShiftBuffers(
+        coherentTransposeBuffers.output, coherentTransposeBuffers.output),
+      inverseFFTShiftKernel(
+        factories.fftShift.create(queue, inverseFFTShiftBuffers)),
+
       // FIR filter: D/C -> C/D
       //
       // Input buffer:
@@ -225,6 +241,11 @@ namespace LOFAR
          ps.settings.beamFormer.nrHighResolutionChannels),
         false, devA),
 
+      // inverse FFTShift: A -> A
+      incoherentInverseFFTShiftBuffers(devA, devA),
+      incoherentInverseFFTShiftKernel(
+        factories.fftShift.create(queue, incoherentInverseFFTShiftBuffers)),
+
       // FIR filter: A -> B
       devIncoherentFilterWeights(
         context,
@@ -335,17 +356,21 @@ namespace LOFAR
     BeamFormerSubbandProc::Counters::Counters(gpu::Context &context)
       :
     intToFloat(context),
+    firstFFTShift(context),
     firstFFT(context),
     delayBp(context),
+    secondFFTShift(context),
     secondFFT(context),
     correctBandpass(context),
     beamformer(context),
     transpose(context),
     inverseFFT(context),
+    inverseFFTShift(context),
     firFilterKernel(context),
     finalFFT(context),
     coherentStokes(context),
     incoherentInverseFFT(context),
+    incoherentInverseFFTShift(context),
     incoherentFirFilterKernel(context),
     incoherentFinalFFT(context),
     incoherentStokes(context),
@@ -362,13 +387,16 @@ namespace LOFAR
       LOG_INFO_STR(
         "**** BeamFormerSubbandProc GPU mean and stDev ****" << endl <<
         std::setw(20) << "(intToFloat)" << intToFloat.stats << endl <<
+        std::setw(20) << "(firstFFTShift)" << firstFFTShift.stats << endl <<
         std::setw(20) << "(firstFFT)" << firstFFT.stats << endl <<
         std::setw(20) << "(delayBp)" << delayBp.stats << endl <<
+        std::setw(20) << "(secondFFTShift)" << secondFFTShift.stats << endl <<
         std::setw(20) << "(secondFFT)" << secondFFT.stats << endl <<
         std::setw(20) << "(correctBandpass)" << correctBandpass.stats << endl <<
         std::setw(20) << "(beamformer)" << beamformer.stats << endl <<
         std::setw(20) << "(transpose)" << transpose.stats << endl <<
         std::setw(20) << "(inverseFFT)" << inverseFFT.stats << endl <<
+        std::setw(20) << "(inverseFFTShift)" << inverseFFTShift.stats << endl <<
         std::setw(20) << "(firFilterKernel)" << firFilterKernel.stats << endl <<
         std::setw(20) << "(finalFFT)" << finalFFT.stats << endl <<
         std::setw(20) << "(coherentStokes)" << coherentStokes.stats << endl <<
@@ -376,6 +404,7 @@ namespace LOFAR
         std::setw(20) << "(visibilities)" << visibilities.stats << endl <<
         std::setw(20) << "(incoherentOutput )" << incoherentOutput.stats << endl <<
         std::setw(20) << "(incoherentInverseFFT)" << incoherentInverseFFT.stats << endl <<
+        std::setw(20) << "(incoherentInverseFFTShift)" << incoherentInverseFFTShift.stats << endl <<
         std::setw(20) << "(incoherentFirFilterKernel)" << incoherentFirFilterKernel.stats << endl <<
         std::setw(20) << "(incoherentFinalFFT)" << incoherentFinalFFT.stats << endl <<
         std::setw(20) << "(incoherentStokes)" << incoherentStokes.stats << endl <<
@@ -419,6 +448,7 @@ namespace LOFAR
       // Otherwise, a kernel arg may not be set...
       intToFloatKernel->enqueue(input.blockID, counters.intToFloat);
 
+      firstFFTShiftKernel->enqueue(input.blockID, counters.firstFFTShift);
       firstFFT.enqueue(input.blockID, counters.firstFFT);
 
       delayCompensationKernel->enqueue(
@@ -426,6 +456,7 @@ namespace LOFAR
         ps.settings.subbands[subband].centralFrequency,
         ps.settings.subbands[subband].SAP);
 
+      secondFFTShiftKernel->enqueue(input.blockID, counters.secondFFTShift);
       secondFFT.enqueue(input.blockID, counters.secondFFT);
 
       bandPassCorrectionKernel->enqueue(
@@ -440,7 +471,9 @@ namespace LOFAR
           ps.settings.subbands[subband].SAP);
 
         coherentTransposeKernel->enqueue(input.blockID, counters.transpose);
+
         inverseFFT.enqueue(input.blockID, counters.inverseFFT);
+        inverseFFTShiftKernel->enqueue(input.blockID, counters.inverseFFTShift);
 
         if (coherentStokesPPF) 
         {
@@ -464,6 +497,8 @@ namespace LOFAR
 
         incoherentInverseFFT.enqueue(
           input.blockID, counters.incoherentInverseFFT);
+        incoherentInverseFFTShiftKernel->enqueue(
+          input.blockID, counters.incoherentInverseFFTShift);
 
         if (incoherentStokesPPF) 
         {
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h
index 9e7dc2562a28897ac83e9a73e17f326ada61abab..c946e76b1004d899a63f6913aeef21ea82f500a4 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h
+++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h
@@ -37,6 +37,7 @@
 #include <GPUProc/Kernels/CoherentStokesTransposeKernel.h>
 #include <GPUProc/Kernels/CoherentStokesKernel.h>
 #include <GPUProc/Kernels/DelayAndBandPassKernel.h>
+#include <GPUProc/Kernels/FFTShiftKernel.h>
 #include <GPUProc/Kernels/FFT_Kernel.h>
 #include <GPUProc/Kernels/FIR_FilterKernel.h>
 #include <GPUProc/Kernels/IncoherentStokesKernel.h>
@@ -90,18 +91,22 @@ namespace LOFAR
 
         // gpu kernel counters
         PerformanceCounter intToFloat;
+        PerformanceCounter firstFFTShift;
         PerformanceCounter firstFFT;
         PerformanceCounter delayBp;
+        PerformanceCounter secondFFTShift;
         PerformanceCounter secondFFT;
         PerformanceCounter correctBandpass;
         PerformanceCounter beamformer;
         PerformanceCounter transpose;
         PerformanceCounter inverseFFT;
+        PerformanceCounter inverseFFTShift;
         PerformanceCounter firFilterKernel;
         PerformanceCounter finalFFT;
         PerformanceCounter coherentStokes;
 
         PerformanceCounter incoherentInverseFFT;
+        PerformanceCounter incoherentInverseFFTShift;
         PerformanceCounter incoherentFirFilterKernel;
         PerformanceCounter incoherentFinalFFT;
         PerformanceCounter incoherentStokes;
@@ -149,6 +154,10 @@ namespace LOFAR
       IntToFloatKernel::Buffers intToFloatBuffers;
       std::auto_ptr<IntToFloatKernel> intToFloatKernel;
 
+      // First FFT-shift
+      FFTShiftKernel::Buffers firstFFTShiftBuffers;
+      std::auto_ptr<FFTShiftKernel> firstFFTShiftKernel;
+
       // First (64 points) FFT
       FFT_Kernel firstFFT;
 
@@ -156,6 +165,10 @@ namespace LOFAR
       DelayAndBandPassKernel::Buffers delayCompensationBuffers;
       std::auto_ptr<DelayAndBandPassKernel> delayCompensationKernel;
 
+      // Second FFT-shift
+      FFTShiftKernel::Buffers secondFFTShiftBuffers;
+      std::auto_ptr<FFTShiftKernel> secondFFTShiftKernel;
+
       // Second (64 points) FFT
       FFT_Kernel secondFFT;
 
@@ -182,6 +195,10 @@ namespace LOFAR
       // inverse (4k points) FFT
       FFT_Kernel inverseFFT;
 
+      // inverse FFT-shift
+      FFTShiftKernel::Buffers inverseFFTShiftBuffers;
+      std::auto_ptr<FFTShiftKernel> inverseFFTShiftKernel;
+
       // Poly-phase filter (FIR + FFT)
       gpu::DeviceMemory devFilterWeights;
       gpu::DeviceMemory devFilterHistoryData;
@@ -205,6 +222,10 @@ namespace LOFAR
       // Inverse (4k points) FFT
       FFT_Kernel incoherentInverseFFT;
 
+      // Inverse FFT-shift
+      FFTShiftKernel::Buffers incoherentInverseFFTShiftBuffers;
+      std::auto_ptr<FFTShiftKernel> incoherentInverseFFTShiftKernel;
+
       // Poly-phase filter (FIR + FFT)
       gpu::DeviceMemory devIncoherentFilterWeights;
       gpu::DeviceMemory devIncoherentFilterHistoryData;
diff --git a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc
index a01887b3da6827c31b0fbdb981e9957af99d0197..3b75c8ea5530d0aa09c9f989143431ca562f89ec 100644
--- a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc
+++ b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc
@@ -23,6 +23,7 @@
 
 #include <complex>
 #include <cmath>
+#include <iomanip>
 
 #include <Common/LofarLogger.h>
 #include <CoInterface/Parset.h>
@@ -39,16 +40,20 @@ using namespace LOFAR::TYPES;
 template<typename T> T inputSignal(size_t t)
 {
   size_t nrBits = sizeof(T) / 2 * 8;
-  double freq = 1.0 / 4.0; // in samples
-  // double freq = (2 * 64.0 + 17.0) / 4096.0; // in samples
   double amp = (1 << (nrBits - 1)) - 1;
-
+#if 1 // Set to 0 to use the pulse-train input below instead of the sine wave
+  // Sine wave
+  // double freq = 1.0 / 4.0; // in samples
+  double freq = (2 * 64.0 + 17.0) / 4096.0; // in samples
   double angle = (double)t * 2.0 * M_PI * freq;
-
   double s = ::sin(angle);
   double c = ::cos(angle);
-
   return T(::round(amp * c), ::round(amp * s));
+#else
+  // Pulse train
+  if (t % (2 * 64 + 17) == 0) return T(amp);
+  else return T(0);
+#endif
 }
 
 int main() {
@@ -152,7 +157,7 @@ int main() {
   // Block number: 0 .. inf
   in.blockID.block = 0;
 
- // Subband index in the observation: [0, ps.nrSubbands())
+  // Subband index in the observation: [0, ps.nrSubbands())
   in.blockID.globalSubbandIdx = 0;
 
   // Subband index for this pipeline/workqueue: [0, subbandIndices.size())
@@ -206,9 +211,10 @@ int main() {
   for (size_t s = 0; s < nrStokes; s++)
     for (size_t t = 0; t < nrSamples; t++)
       for (size_t c = 0; c < nrChannels; c++)
-        ASSERTSTR(fpEquals(out[0][s][t][c], outVal), 
+        ASSERTSTR(fpEquals(out[0][s][t][c], outVal, 1e-4f), 
                   "out[" << s << "][" << t << "][" << c << "] = " << 
-                  out[0][s][t][c] << "; outVal = " << outVal);
+                  setprecision(12) << out[0][s][t][c] << 
+                  "; outVal = " << outVal);
   
   return 0;
 }
diff --git a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCoherentStokesBeamFormerSubbandProcProcessSb.cc b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCoherentStokesBeamFormerSubbandProcProcessSb.cc
index eff715074769797ed01c8da923f7f235ba9fd3ab..19b703c87012bf0588faf5d5e7e2c7f370d3dceb 100644
--- a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCoherentStokesBeamFormerSubbandProcProcessSb.cc
+++ b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCoherentStokesBeamFormerSubbandProcProcessSb.cc
@@ -40,16 +40,20 @@ using namespace LOFAR::TYPES;
 template<typename T> T inputSignal(size_t t)
 {
   size_t nrBits = sizeof(T) / 2 * 8;
-  double freq = 1.0 / 4.0; // in samples
-  // double freq = (2 * 64.0 + 17.0) / 4096.0; // in samples
   double amp = (1 << (nrBits - 1)) - 1;
-
+#if 1 // Set to 0 to use the pulse-train input below instead of the sine wave
+  // Sine wave
+  // double freq = 1.0 / 4.0; // in samples
+  double freq = (2 * 64.0 + 17.0) / 4096.0; // in samples
   double angle = (double)t * 2.0 * M_PI * freq;
-
   double s = ::sin(angle);
   double c = ::cos(angle);
-
   return T(::round(amp * c), ::round(amp * s));
+#else
+  // Pulse train
+  if (t % (2 * 64 + 17) == 0) return T(amp);
+  else return T(0);
+#endif
 }
 
 int main() {
@@ -219,7 +223,7 @@ int main() {
     for (size_t t = 0; t < nrSamples; t++)
     for (size_t c = 0; c < nrChannels; c++)
     {
-      ASSERTSTR(fpEquals(out[tab][s][t][c], outVal),
+      ASSERTSTR(fpEquals(out[tab][s][t][c], outVal, 1e-4f),
         "out[" << tab << "][" << s << "][" << t << "][" << c << "] = " << setprecision(12) <<
         out[tab][s][t][c] << "; outVal = " << outVal);
     }