diff --git a/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt b/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt
index 1354c2e361631b2781e94d293f1f2a43aa2528d7..fc638f25c4f72ea52a354950285d27442a668d67 100644
--- a/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt
+++ b/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt
@@ -100,7 +100,7 @@ FFT-16 {inplace} (if >1ch)
    |            [station][pol][samples][channel]        [48][2][12288][16]  = 144 MiB               B
    |
    V
-Incoherent Stokes (*: no transpose) (*** kernel must be fixed: wrong input format ***)
+Incoherent Stokes (*: no transpose)
    |            [stokes][samples][channel]              [4][12288][16]      = 3 MiB                 E
    V                                                    (float)
 (output)
diff --git a/RTCP/Cobalt/GPUProc/share/gpu/kernels/BandPassCorrection.cu b/RTCP/Cobalt/GPUProc/share/gpu/kernels/BandPassCorrection.cu
index 61a95f5aa260ddf99d8472c5ec35037dca2a5052..10f68b42f871e5e049c0cb0f9d7af1f244c41e86 100644
--- a/RTCP/Cobalt/GPUProc/share/gpu/kernels/BandPassCorrection.cu
+++ b/RTCP/Cobalt/GPUProc/share/gpu/kernels/BandPassCorrection.cu
@@ -34,6 +34,7 @@
  * - @c NR_CHANNELS_2: a multiple of 16
  * - @c NR_SAMPLES_PER_CHANNEL: > a multiple of 16
  * - @c NR_BITS_PER_SAMPLE: 8 or 16
+ * - @c DO_BANDPASS_CORRECTION: if defined, perform bandpass correction
  */
 
 #include "gpu_math.cuh"
@@ -69,7 +70,8 @@ typedef  const float (* BandPassFactorsType)[NR_CHANNELS_1 * NR_CHANNELS_2];
 
 /**
  * This kernel performs on the input data:
- * - Apply a bandpass correction to compensate for the errors introduced by the
+ * - If the preprocessor variable \c DO_BANDPASS_CORRECTION is defined, apply a
+ *   bandpass correction to compensate for the errors introduced by the
  *   polyphase filter that produced the subbands. This error is deterministic,
  *   hence it can be fully compensated for.
  * - Transpose the data so that the samples for each channel are placed
@@ -92,9 +94,11 @@ __global__ void bandPassCorrection( fcomplex * outputDataPtr,
   OutputDataType outputData = (OutputDataType) outputDataPtr;
   InputDataType inputData   = (InputDataType)  inputDataPtr;
 
+#if defined DO_BANDPASS_CORRECTION
   // Band pass to apply to the channels  
   BandPassFactorsType bandPassFactors = (BandPassFactorsType) bandPassFactorsPtr;
-  
+#endif
+
   // fasted dims
   unsigned chan2        = blockIdx.x * blockDim.x + threadIdx.x;
   unsigned sample       = blockIdx.y * blockDim.y + threadIdx.y;
@@ -109,17 +113,18 @@ __global__ void bandPassCorrection( fcomplex * outputDataPtr,
 
   for (unsigned idx_channel1 = 0; idx_channel1 < NR_CHANNELS_1; ++idx_channel1)
   {
-    // idx_channel1 steps with NR_CHANNELS_2 tru the channel weights 
-    float weight((*bandPassFactors)[idx_channel1 * NR_CHANNELS_2 + chan2]);
-
     // Read from global memory in the quickest dimension (optimal)
     fcomplex sampleX = (*inputData)[station][0][idx_channel1][sample][chan2];
     fcomplex sampleY = (*inputData)[station][1][idx_channel1][sample][chan2];
-    
+
+#if defined DO_BANDPASS_CORRECTION
+    // idx_channel1 steps with NR_CHANNELS_2 tru the channel weights 
+    float weight((*bandPassFactors)[idx_channel1 * NR_CHANNELS_2 + chan2]);
     sampleX.x *= weight;
     sampleX.y *= weight;
     sampleY.x *= weight;
     sampleY.y *= weight;
+#endif
 
     // Write the data to shared memory
     tmp[threadIdx.y][threadIdx.x][0] = sampleX;
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/BandPassCorrectionKernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/BandPassCorrectionKernel.cc
index 5b72388f988c5fbe4a8313e16013c41ca265dc09..ee638a8ced69b66722b19ba6e6fe6cd25c3adad9 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/BandPassCorrectionKernel.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/BandPassCorrectionKernel.cc
@@ -48,7 +48,10 @@ namespace LOFAR
       Kernel::Parameters(ps),
       nrBitsPerSample(ps.settings.nrBitsPerSample),
       nrBytesPerComplexSample(ps.nrBytesPerComplexSample()),
-      nrSAPs(ps.settings.SAPs.size())
+      nrSAPs(ps.settings.SAPs.size()),
+      nrChannels1(64),  // TODO: Must be read from parset?
+      nrChannels2(64),  // TODO: Must be read from parset?
+      correctBandPass(ps.settings.corrections.bandPass)
     {
       dumpBuffers = 
         ps.getBool("Cobalt.Kernels.BandPassCorrectionKernel.dumpOutput", false);
@@ -77,7 +80,7 @@ namespace LOFAR
       nrBytesRead = nrBytesWritten = nrSamples * sizeof(std::complex<float>);
 
       gpu::HostMemory bpWeights(stream.getContext(), buffers.bandPassCorrectionWeights.size());
-      BandPass::computeCorrectionFactors(bpWeights.get<float>(), params.nrChannelsPerSubband);
+      BandPass::computeCorrectionFactors(bpWeights.get<float>(), params.nrChannels1 * params.nrChannels2);
       stream.writeBuffer(buffers.bandPassCorrectionWeights, bpWeights, true);
      
     }
@@ -125,6 +128,8 @@ namespace LOFAR
         lexical_cast<string>(itsParameters.nrChannels1);
       defs["NR_CHANNELS_2"] =
         lexical_cast<string>(itsParameters.nrChannels2);
+      if (itsParameters.correctBandPass)
+        defs["DO_BANDPASS_CORRECTION"] = "1";
       return defs;
     }
   }
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/BandPassCorrectionKernel.h b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/BandPassCorrectionKernel.h
index 2e8dbf86b59b842939f1dd6c6f2e8597ff231f75..556ba7564428e10638ff65d20c462ad59abeee2d 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/BandPassCorrectionKernel.h
+++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/BandPassCorrectionKernel.h
@@ -48,6 +48,7 @@ namespace LOFAR
         size_t nrSAPs;
         size_t nrChannels1;
         size_t nrChannels2;
+        bool correctBandPass;
       };
 
       enum BufferType
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.cc
index ad05e85795463acd3ada4f5b87e731534cd83051..b419476869311631f36644a477c13d8cf9061e35 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.cc
@@ -70,6 +70,13 @@ namespace LOFAR
                                        const Parameters& params) :
       Kernel(stream, gpu::Function(module, theirFunction), buffers, params)
     {
+      LOG_DEBUG_STR("DelayAndBandPassKernel: " <<
+                    "correctBandPass=" << 
+                    (params.correctBandPass ? "true" : "false") <<
+                    ", delayCompensation=" <<
+                    (params.delayCompensation ? "true" : "false") <<
+                    ", transpose=" << (params.transpose ? "true" : "false"));
+
       ASSERT(params.nrChannelsPerSubband % 16 == 0 || params.nrChannelsPerSubband == 1);
       ASSERT(params.nrSamplesPerChannel % 16 == 0);
 
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/FFT_Kernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/FFT_Kernel.cc
index 1f7b6362f38f229c336fdafa83231221ce34d0b6..59f1d71013778caf12fc66033b22ad197e425529 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/FFT_Kernel.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/FFT_Kernel.cc
@@ -46,6 +46,10 @@ namespace LOFAR
       buffer(buffer),
       itsStream(stream)
     {
+      LOG_DEBUG_STR("FFT_Kernel: " <<
+                    "fftSize=" << fftSize << 
+                    ", direction=" << (forward ? "forward" : "inverse") <<
+                    ", nrFFTs=" << nrFFTs);
     }
 
     void FFT_Kernel::enqueue(const BlockID &blockId, 
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.cc
index 6e217c691c35805472b27d7394797b2b12500b7f..f4ad933533cdb9c0cd11221228ed4d0466c91e64 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.cc
@@ -58,7 +58,8 @@ namespace LOFAR
                    const Parameters &params)
       : 
       gpu::Function(function),
-      maxThreadsPerBlock(stream.getContext().getDevice().getMaxThreadsPerBlock()),
+      maxThreadsPerBlock(
+        stream.getContext().getDevice().getMaxThreadsPerBlock()),
       itsStream(stream),
       itsBuffers(buffers),
       itsParameters(params)
@@ -71,27 +72,37 @@ namespace LOFAR
         function.getAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS));
     }
 
-    void Kernel::setEnqueueWorkSizes(gpu::Grid globalWorkSize, gpu::Block localWorkSize)
+    void Kernel::setEnqueueWorkSizes(gpu::Grid globalWorkSize, 
+                                     gpu::Block localWorkSize)
     {
       gpu::Grid grid;
       ostringstream errMsgs;
 
-      // Enforce by the hardware supported work sizes to see errors clearly and early.
+      // Enforce by the hardware supported work sizes to see errors clearly and
+      // early.
 
-      gpu::Block maxLocalWorkSize = itsStream.getContext().getDevice().getMaxBlockDims();
+      gpu::Block maxLocalWorkSize = 
+        itsStream.getContext().getDevice().getMaxBlockDims();
       if (localWorkSize.x > maxLocalWorkSize.x ||
           localWorkSize.y > maxLocalWorkSize.y ||
           localWorkSize.z > maxLocalWorkSize.z)
-        errMsgs << "  - localWorkSize must be at most " << maxLocalWorkSize << endl;
-
-      if (localWorkSize.x * localWorkSize.y * localWorkSize.z > maxThreadsPerBlock)
-        errMsgs << "  - localWorkSize total must be at most " << maxThreadsPerBlock << " threads/block" << endl;
-
-      // globalWorkSize may (in theory) be all zero (no work). Reject such localWorkSize.
-      if (localWorkSize.x == 0 || localWorkSize.y == 0 || localWorkSize.z == 0) {
+        errMsgs << "  - localWorkSize must be at most " << maxLocalWorkSize 
+                << endl;
+
+      if (localWorkSize.x * localWorkSize.y * localWorkSize.z > 
+          maxThreadsPerBlock)
+        errMsgs << "  - localWorkSize total must be at most " 
+                << maxThreadsPerBlock << " threads/block" << endl;
+
+      // globalWorkSize may (in theory) be all zero (no work). Reject such
+      // localWorkSize.
+      if (localWorkSize.x == 0 || 
+          localWorkSize.y == 0 ||
+          localWorkSize.z == 0) {
         errMsgs << "  - localWorkSize components must be non-zero" << endl;
       } else {
-        // TODO: to globalWorkSize in terms of localWorkSize (CUDA) ('gridWorkSize').
+        // TODO: to globalWorkSize in terms of localWorkSize (CUDA)
+        // ('gridWorkSize').
         if (globalWorkSize.x % localWorkSize.x != 0 ||
             globalWorkSize.y % localWorkSize.y != 0 ||
             globalWorkSize.z % localWorkSize.z != 0)
@@ -100,19 +111,21 @@ namespace LOFAR
                          globalWorkSize.y / localWorkSize.y,
                          globalWorkSize.z / localWorkSize.z);
 
-        gpu::Grid maxGridWorkSize = itsStream.getContext().getDevice().getMaxGridDims();
+        gpu::Grid maxGridWorkSize =
+          itsStream.getContext().getDevice().getMaxGridDims();
         if (grid.x > maxGridWorkSize.x ||
             grid.y > maxGridWorkSize.y ||
             grid.z > maxGridWorkSize.z)
-          errMsgs << "  - globalWorkSize / localWorkSize must be at most " << maxGridWorkSize << endl;
+          errMsgs << "  - globalWorkSize / localWorkSize must be at most "
+                  << maxGridWorkSize << endl;
       }
 
-
       string errStr(errMsgs.str());
       if (!errStr.empty())
-        THROW(gpu::GPUException, "setEnqueueWorkSizes(): unsupported globalWorkSize " <<
-              globalWorkSize << " and/or localWorkSize " << localWorkSize << " selected:" <<
-              endl << errStr);
+        THROW(gpu::GPUException,
+              "setEnqueueWorkSizes(): unsupported globalWorkSize " <<
+              globalWorkSize << " and/or localWorkSize " << localWorkSize <<
+              " selected:" << endl << errStr);
 
       LOG_DEBUG_STR("CUDA Grid size: " << grid);
       LOG_DEBUG_STR("CUDA Block size: " << localWorkSize);
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.h b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.h
index 701842f84bbe0deb33b3fb683f25597ee8ea5951..591f8a7f333e1f6d98373ead29a5523a688e2e17 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.h
+++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.h
@@ -41,7 +41,9 @@ namespace LOFAR
     {
     public:
       // Parameters that must be passed to the constructor of this Kernel class.
-      // TODO: more at constructor passed immediates can be turned into defines (blockDim/gridDim too if enforced fixed (consider conditional define) or drop opt)
+      // TODO: more at constructor passed immediates can be turned into defines
+      // (blockDim/gridDim too if enforced fixed (consider conditional define)
+      // or drop opt)
       struct Parameters
       {
         Parameters(const Parset& ps);
@@ -85,7 +87,8 @@ namespace LOFAR
       // Explicit destructor, because the implicitly generated one is public.
       ~Kernel();
 
-      void setEnqueueWorkSizes(gpu::Grid globalWorkSize, gpu::Block localWorkSize);
+      void setEnqueueWorkSizes(gpu::Grid globalWorkSize, 
+                               gpu::Block localWorkSize);
       
 
       const unsigned maxThreadsPerBlock;
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Pipelines/BeamFormerPipeline.cc b/RTCP/Cobalt/GPUProc/src/cuda/Pipelines/BeamFormerPipeline.cc
index 6b12ced4112103f2774144c877da897bdbef49a5..e514450fe4cf0034c533247d1eea7dcfeff0426a 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/Pipelines/BeamFormerPipeline.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/Pipelines/BeamFormerPipeline.cc
@@ -112,7 +112,6 @@ namespace LOFAR
           
             samples += proc->counters.samples.stats;
             visibilities += proc->counters.visibilities.stats;
-            copyBuffers += proc->counters.copyBuffers.stats;
             incoherentOutput += proc->counters.incoherentOutput.stats;
           }
 
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc
index 63381eaa60305dd5e999e77a9d1835d4daa6176d..45532308ec28f281ec4a75ca07e1091d519a1fe0 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc
@@ -32,15 +32,15 @@ namespace LOFAR
                                              size_t nrSubbandsPerSubbandProc) :
         intToFloat(ps),
         delayCompensation(delayCompensationParams(ps)),
-        correctBandPass(correctBandPassParams(ps)),
         beamFormer(beamFormerParams(ps)),
         transpose(transposeParams(ps)),
         firFilter(firFilterParams(ps, nrSubbandsPerSubbandProc)),
         coherentStokes(coherentStokesParams(ps)),
         incoherentStokes(incoherentStokesParams(ps)),
-        incoherentFirFilter(incoherentFirFilterParams(ps, nrSubbandsPerSubbandProc)),
+        incoherentStokesTranspose(incoherentStokesTransposeParams(ps)),
+        incoherentFirFilter(
+          incoherentFirFilterParams(ps, nrSubbandsPerSubbandProc)),
         bandPassCorrection(bandPassCorrectionParams(ps))
-
       {
       }
 
@@ -74,21 +74,6 @@ namespace LOFAR
         return params;
       }
 
-      DelayAndBandPassKernel::Parameters
-      BeamFormerFactories::correctBandPassParams(const Parset &ps) const
-      {
-        DelayAndBandPassKernel::Parameters params(ps);
-        params.nrChannelsPerSubband =
-          BeamFormerSubbandProc::BEAM_FORMER_NR_CHANNELS;
-        params.nrSamplesPerChannel =
-          ps.nrSamplesPerSubband() /
-          BeamFormerSubbandProc::BEAM_FORMER_NR_CHANNELS;
-        params.delayCompensation = false;
-        params.transpose = true;
-
-        return params;
-      }
-
       BeamFormerKernel::Parameters 
       BeamFormerFactories::beamFormerParams(const Parset &ps) const
       {
@@ -158,15 +143,15 @@ namespace LOFAR
       {
         FIR_FilterKernel::Parameters params(ps);
 
-        params.nrSTABs = ps.settings.beamFormer.maxNrTABsPerSAP();
+        params.nrSTABs = ps.nrStations();
 
-        // define at least 16 channels to get the FIR_Filter.cu to compile, even
-        // if we won't use it.
-        params.nrChannelsPerSubband = std::max(16U,
-          ps.settings.beamFormer.incoherentSettings.nrChannels);
+        params.nrChannelsPerSubband = 
+          ps.settings.beamFormer.incoherentSettings.nrChannels;
 
-        // time integration has not taken place yet, so calculate the nrSamples manually
-        params.nrSamplesPerChannel = ps.nrSamplesPerSubband() / params.nrChannelsPerSubband;
+        // time integration has not taken place yet, so calculate the nrSamples
+        // manually
+        params.nrSamplesPerChannel = 
+          ps.nrSamplesPerSubband() / params.nrChannelsPerSubband;
 
         params.nrSubbands = nrSubbandsPerSubbandProc;
 
@@ -178,12 +163,26 @@ namespace LOFAR
       incoherentStokesParams(const Parset &ps) const 
       {
         IncoherentStokesKernel::Parameters params(ps);
-        //TODO: beamformer params
-        params.nrChannelsPerSubband = ps.settings.beamFormer.incoherentSettings.nrChannels;
-        params.nrSamplesPerChannel = ps.nrSamplesPerSubband() / params.nrChannelsPerSubband;
+        params.nrChannelsPerSubband = 
+          ps.settings.beamFormer.incoherentSettings.nrChannels;
+        params.nrSamplesPerChannel = 
+          ps.nrSamplesPerSubband() / params.nrChannelsPerSubband;
 
         return params;
       }
 
+      IncoherentStokesTransposeKernel::Parameters 
+      BeamFormerFactories::
+      incoherentStokesTransposeParams(const Parset &ps) const 
+      {
+        IncoherentStokesTransposeKernel::Parameters params(ps);
+        params.nrChannelsPerSubband =
+          BeamFormerSubbandProc::BEAM_FORMER_NR_CHANNELS;
+        params.nrSamplesPerChannel =
+          ps.nrSamplesPerSubband() /
+          BeamFormerSubbandProc::BEAM_FORMER_NR_CHANNELS;
+
+        return params;
+      }
   }
 }
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h
index 813f52d356b040c1fcac16e4dfc890bcc4ce4f74..ccc2875c261f1bbc131032dbbae1b6ed076be313 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h
+++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h
@@ -23,6 +23,7 @@
 #define LOFAR_GPUPROC_CUDA_BEAM_FORMER_FACTORIES_H
 
 #include <GPUProc/KernelFactory.h>
+#include <GPUProc/Kernels/BandPassCorrectionKernel.h>
 #include <GPUProc/Kernels/BeamFormerKernel.h>
 #include <GPUProc/Kernels/BeamFormerTransposeKernel.h>
 #include <GPUProc/Kernels/CoherentStokesKernel.h>
@@ -30,7 +31,7 @@
 #include <GPUProc/Kernels/FIR_FilterKernel.h>
 #include <GPUProc/Kernels/IntToFloatKernel.h>
 #include <GPUProc/Kernels/IncoherentStokesKernel.h>
-#include <GPUProc/Kernels/BandPassCorrectionKernel.h>
+#include <GPUProc/Kernels/IncoherentStokesTransposeKernel.h>
 
 namespace LOFAR
 {
@@ -46,12 +47,12 @@ namespace LOFAR
 
       KernelFactory<IntToFloatKernel> intToFloat;
       KernelFactory<DelayAndBandPassKernel> delayCompensation;
-      KernelFactory<DelayAndBandPassKernel> correctBandPass;
       KernelFactory<BeamFormerKernel> beamFormer;
       KernelFactory<BeamFormerTransposeKernel> transpose;
       KernelFactory<FIR_FilterKernel> firFilter;
       KernelFactory<CoherentStokesKernel> coherentStokes;
       KernelFactory<IncoherentStokesKernel> incoherentStokes;
+      KernelFactory<IncoherentStokesTransposeKernel> incoherentStokesTranspose;
       KernelFactory<FIR_FilterKernel> incoherentFirFilter;
       KernelFactory<BandPassCorrectionKernel> bandPassCorrection;
 
@@ -67,9 +68,6 @@ namespace LOFAR
       CoherentStokesKernel::Parameters
       coherentStokesParams(const Parset &ps) const;
 
-      DelayAndBandPassKernel::Parameters
-      correctBandPassParams(const Parset &ps) const;
-
       DelayAndBandPassKernel::Parameters
       delayCompensationParams(const Parset &ps) const;
 
@@ -83,6 +81,9 @@ namespace LOFAR
       IncoherentStokesKernel::Parameters 
       incoherentStokesParams(const Parset &ps) const;
 
+      IncoherentStokesTransposeKernel::Parameters 
+      incoherentStokesTransposeParams(const Parset &ps) const;
+
 
     };
 
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc
index 32d0d04e09ec5c8928bc4bf688312fab3676f226..d9f0baaab2a05890d3c63990afe86b0e1b4f7b7b 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc
@@ -25,6 +25,7 @@
 
 #include <GPUProc/global_defines.h>
 #include <GPUProc/gpu_wrapper.h>
+#include <GPUProc/gpu_utils.h>   // for dumpBuffer()
 
 #include <CoInterface/Parset.h>
 #include <ApplCommon/PosixTime.h>
@@ -70,19 +71,21 @@ namespace LOFAR
       // FIR, which the beam former does in a later stage!
       devInput(
         std::max(
-                 factories.intToFloat.bufferSize(IntToFloatKernel::INPUT_DATA),
+          factories.intToFloat.bufferSize(IntToFloatKernel::OUTPUT_DATA),
           factories.beamFormer.bufferSize(BeamFormerKernel::OUTPUT_DATA)),
         factories.delayCompensation.bufferSize(
           DelayAndBandPassKernel::DELAYS),
         factories.delayCompensation.bufferSize(
           DelayAndBandPassKernel::PHASE_OFFSETS),
-               context),
+        context),
       // coherent stokes buffers
       devA(devInput.inputSamples),
       devB(context, devA.size()),
       // Buffers for incoherent stokes
       devC(context, devA.size()),
       devD(context, devA.size()),
+      devE(context, factories.incoherentStokes.bufferSize(
+             IncoherentStokesKernel::OUTPUT_DATA)),
       devNull(context, 1),
 
       // intToFloat: input -> B
@@ -113,15 +116,19 @@ namespace LOFAR
       // bandPass: A -> B
       devBandPassCorrectionWeights(
         context,
-        factories.correctBandPass.bufferSize(
-          DelayAndBandPassKernel::BAND_PASS_CORRECTION_WEIGHTS)),
-      correctBandPassBuffers(
-        devA, devB, devNull, devNull, devNull, devBandPassCorrectionWeights),
-      correctBandPassKernel(
-        factories.correctBandPass.create(queue, correctBandPassBuffers)),
+        factories.bandPassCorrection.bufferSize(
+          BandPassCorrectionKernel::BAND_PASS_CORRECTION_WEIGHTS)),
+      bandPassCorrectionBuffers(
+        devA, devB, devBandPassCorrectionWeights),
+      bandPassCorrectionKernel(
+        factories.bandPassCorrection.create(queue, bandPassCorrectionBuffers)),
 
       //**************************************************************
       //coherent stokes
+      outputComplexVoltages(
+        ps.settings.beamFormer.coherentSettings.type == STOKES_XXYY),
+      coherentStokesPPF(ps.settings.beamFormer.coherentSettings.nrChannels > 1),
+
       // beamForm: B -> A
       // TODO: support >1 SAP
       devBeamFormerDelays(
@@ -130,89 +137,128 @@ namespace LOFAR
       beamFormerBuffers(devB, devA, devBeamFormerDelays),
       beamFormerKernel(factories.beamFormer.create(queue, beamFormerBuffers)),
 
-      // transpose after beamforming: A -> B
-      transposeBuffers(devA, devB),
+      // transpose after beamforming: A -> C/D
+      //
+      // Output buffer: 
+      // 1ch: CS: C, CV: D
+      // PPF: CS: D, CV: C
+      transposeBuffers(
+        devA, outputComplexVoltages ^ coherentStokesPPF ? devD : devC),
       transposeKernel(factories.transpose.create(queue, transposeBuffers)),
 
-      // inverse FFT: B -> B
+      // inverse FFT: C/D -> C/D (in-place) = transposeBuffers.output
       inverseFFT(
         queue,
         BEAM_FORMER_NR_CHANNELS,
         (ps.settings.beamFormer.maxNrTABsPerSAP() * NR_POLARIZATIONS *
          ps.nrSamplesPerSubband() / BEAM_FORMER_NR_CHANNELS),
-        false, devB),
-
-      // FIR filter: B -> A
+        false, transposeBuffers.output),
+
+      // FIR filter: D/C -> C/D
+      //
+      // Input buffer:
+      // 1ch: CS: -, CV: - (no FIR will be done)
+      // PPF: CS: D, CV: C = transposeBuffers.output
+      //
+      // Output buffer:
+      // 1ch: CS: -, CV: - (no FIR will be done)
+      // PPF: CS: C, CV: D = transposeBuffers.input
       devFilterWeights(
         context,
         factories.firFilter.bufferSize(FIR_FilterKernel::FILTER_WEIGHTS)),
       devFilterHistoryData(
         context,
         factories.firFilter.bufferSize(FIR_FilterKernel::HISTORY_DATA)),
-      firFilterBuffers(devB, devA, devFilterWeights, devFilterHistoryData),
+      firFilterBuffers(
+        transposeBuffers.output, transposeBuffers.input, 
+        devFilterWeights, devFilterHistoryData),
       firFilterKernel(factories.firFilter.create(queue, firFilterBuffers)),
 
-      // final FFT: A -> A
+      // final FFT: C/D -> C/D (in-place) = firFilterBuffers.output
       finalFFT(
         queue,
         ps.settings.beamFormer.coherentSettings.nrChannels,
         (ps.settings.beamFormer.maxNrTABsPerSAP() *
          NR_POLARIZATIONS * ps.nrSamplesPerSubband() /
          ps.settings.beamFormer.coherentSettings.nrChannels),
-        true, devA),
+        true, firFilterBuffers.output),
 
-      // coherentStokes: 1ch: A -> B, Nch: B -> A
+      // coherentStokes: C -> D
+      //
+      // 1ch: input comes from inverseFFT in C
+      // Nch: input comes from finalFFT in C
       coherentStokesBuffers(
-          ps.settings.beamFormer.coherentSettings.nrChannels > 1 ? devA : devB,
-          ps.settings.beamFormer.coherentSettings.nrChannels > 1 ? devB : devA),
+        devC,
+        devD),
       coherentStokesKernel(
         factories.coherentStokes.create(queue, coherentStokesBuffers)),
 
-      // result buffer
-      devResult(ps.settings.beamFormer.coherentSettings.nrChannels > 1 ? devB : devA),
       //**************************************************************
       //incoherent stokes
-      // TODO: Add a transpose
-      // inverse FFT: C -> C
-      incoherentInverseFFT(queue, BEAM_FORMER_NR_CHANNELS,
-                 NR_POLARIZATIONS * 
-                 ps.nrSamplesPerSubband() / BEAM_FORMER_NR_CHANNELS, false, devC),
-
-      // FIR filter: C -> D
-      // TODO: provide history samples separately
-      // TODO: do a FIR for each individual TAB!!
-      devIncoherentFilterWeights(context,
-           factories.incoherentFirFilter.bufferSize(FIR_FilterKernel::FILTER_WEIGHTS)),
-      devIncoherentFilterHistoryData(context,
-           factories.incoherentFirFilter.bufferSize(FIR_FilterKernel::HISTORY_DATA)),
-      incoherentFirFilterBuffers(devC, devD,
-              devIncoherentFilterWeights, devIncoherentFilterHistoryData),
-      incoherentFirFilterKernel(
-          factories.incoherentFirFilter.create(
-                queue, incoherentFirFilterBuffers)),
+      incoherentStokesPPF(
+        ps.settings.beamFormer.incoherentSettings.nrChannels > 1),
 
-      // final FFT: D -> D
-      incoherentFinalFFT(queue, ps.settings.beamFormer.incoherentSettings.nrChannels,
-                                NR_POLARIZATIONS * ps.nrSamplesPerSubband() / 
-                                ps.settings.beamFormer.incoherentSettings.nrChannels, true, devD),
+      // Transpose: B -> A
+      incoherentTransposeBuffers(devB, devA),
 
-      // incoherentstokes kernel
+      incoherentTranspose(
+        factories.incoherentStokesTranspose.create(
+          queue, incoherentTransposeBuffers)),
+
+      // inverse FFT: A -> A
+      incoherentInverseFFT(
+        queue, BEAM_FORMER_NR_CHANNELS,
+        (ps.nrStations() * NR_POLARIZATIONS * 
+         ps.nrSamplesPerSubband() / BEAM_FORMER_NR_CHANNELS),
+        false, devA),
+
+      // FIR filter: A -> B
+      devIncoherentFilterWeights(
+        context,
+        factories.incoherentFirFilter.bufferSize(
+          FIR_FilterKernel::FILTER_WEIGHTS)),
+
+      devIncoherentFilterHistoryData(
+        context,
+        factories.incoherentFirFilter.bufferSize(
+          FIR_FilterKernel::HISTORY_DATA)),
+
+      incoherentFirFilterBuffers(
+        devA, devB, devIncoherentFilterWeights, devIncoherentFilterHistoryData),
+
+      incoherentFirFilterKernel(
+        factories.incoherentFirFilter.create(
+          queue, incoherentFirFilterBuffers)),
+
+      // final FFT: B -> B
+      incoherentFinalFFT(
+        queue, ps.settings.beamFormer.incoherentSettings.nrChannels,
+        (ps.nrStations() * NR_POLARIZATIONS * ps.nrSamplesPerSubband() / 
+         ps.settings.beamFormer.incoherentSettings.nrChannels),
+        true, devB),
+
+      // incoherentstokes kernel: A/B -> E
+      //
+      // 1ch: input comes from incoherentInverseFFT in A
+      // Nch: input comes from incoherentFinalFFT in B
       incoherentStokesBuffers(
-          ps.settings.beamFormer.incoherentSettings.nrChannels > 1 ? devD : devC,
-          ps.settings.beamFormer.incoherentSettings.nrChannels > 1 ? devC : devD),
-      incoherentStokesKernel(
-          factories.incoherentStokes.create(queue, incoherentStokesBuffers)),
+        incoherentStokesPPF ? devB : devA,
+        devE),
 
-      devIncoherentStokes(ps.settings.beamFormer.incoherentSettings.nrChannels > 1 ? devC : devD)
+      incoherentStokesKernel(
+        factories.incoherentStokes.create(queue, incoherentStokesBuffers))
     {
-      // initialize history data
+      // initialize history data for both coherent and incoherent stokes.
       devFilterHistoryData.set(0);
+      devIncoherentFilterHistoryData.set(0);
 
       // TODO For now we only allow pure coherent and incoherent runs
       // count the number of coherent and incoherent saps
       size_t nrCoherent = 0;
       size_t nrIncoherent = 0;
-      for (size_t idx_sap = 0; idx_sap < ps.settings.beamFormer.SAPs.size(); ++idx_sap)
+      for (size_t idx_sap = 0; 
+           idx_sap < ps.settings.beamFormer.SAPs.size();
+           ++idx_sap)
       {
         if (ps.settings.beamFormer.SAPs[idx_sap].nrIncoherent != 0)
           nrIncoherent++;
@@ -223,12 +269,17 @@ namespace LOFAR
       // raise exception if the parset contained an incorrect configuration
       if (nrCoherent != 0 && nrIncoherent != 0)
         THROW(GPUProcException, 
-           "Parset contained both incoherent and coherent stokes SAPS. This is not supported");
+              "Parset contained both incoherent and coherent stokes SAPS. "
+              "This is not supported");
 
       if (nrCoherent)
         coherentBeamformer = true;
       else
         coherentBeamformer = false;
+
+      LOG_INFO_STR("Running "
+                   << (coherentBeamformer ? "a coherent" : "an incoherent")
+                   << " Stokes beamformer pipeline");
       
       // put enough objects in the outputPool to operate
       for (size_t i = 0; i < nrOutputElements(); ++i)
@@ -240,16 +291,18 @@ namespace LOFAR
           new BeamFormedData(
             (ps.settings.beamFormer.maxNrTABsPerSAP() *
              ps.settings.beamFormer.coherentSettings.nrStokes),
-                ps.settings.beamFormer.coherentSettings.nrChannels,
+            ps.settings.beamFormer.coherentSettings.nrChannels,
             ps.settings.beamFormer.coherentSettings.nrSamples(
               ps.nrSamplesPerSubband()),
-                context));
+            context));
         else
-          outputPool.free.append(new BeamFormedData(
-                    ps.settings.beamFormer.incoherentSettings.nrStokes,
-                ps.settings.beamFormer.incoherentSettings.nrChannels,
-                ps.settings.beamFormer.incoherentSettings.nrSamples(ps.nrSamplesPerSubband()),
-                context));
+          outputPool.free.append(
+            new BeamFormedData(
+              ps.settings.beamFormer.incoherentSettings.nrStokes,
+              ps.settings.beamFormer.incoherentSettings.nrChannels,
+              ps.settings.beamFormer.incoherentSettings.nrSamples(
+                ps.nrSamplesPerSubband()),
+              context));
       }
 
       //// CPU timers are set by CorrelatorPipeline
@@ -283,25 +336,23 @@ namespace LOFAR
     incoherentFirFilterKernel(context),
     incoherentFinalFFT(context),
     incoherentStokes(context),
+    incoherentStokesTranspose(context),
     samples(context),
     visibilities(context),
-    copyBuffers(context),
     incoherentOutput(context)
     {
     }
 
     void BeamFormerSubbandProc::Counters::printStats()
     {     
-
       // Print the individual counter stats: mean and stDev
-      LOG_INFO_STR("**** BeamFormerSubbandProc GPU mean and stDev ****" << endl <<
+      LOG_INFO_STR(
+        "**** BeamFormerSubbandProc GPU mean and stDev ****" << endl <<
         std::setw(20) << "(intToFloat)" << intToFloat.stats << endl <<
         std::setw(20) << "(firstFFT)" << firstFFT.stats << endl <<
         std::setw(20) << "(delayBp)" << delayBp.stats << endl <<
         std::setw(20) << "(secondFFT)" << secondFFT.stats << endl <<
-        std::setw(20) << "(correctBandpass)" << correctBandpass.stats << endl <<        
-
-
+        std::setw(20) << "(correctBandpass)" << correctBandpass.stats << endl <<
         std::setw(20) << "(beamformer)" << beamformer.stats << endl <<
         std::setw(20) << "(transpose)" << transpose.stats << endl <<
         std::setw(20) << "(inverseFFT)" << inverseFFT.stats << endl <<
@@ -310,21 +361,19 @@ namespace LOFAR
         std::setw(20) << "(coherentStokes)" << coherentStokes.stats << endl <<
         std::setw(20) << "(samples)" << samples.stats << endl <<       
         std::setw(20) << "(visibilities)" << visibilities.stats << endl <<
-
-        std::setw(20) << "(copyBuffers)" << copyBuffers.stats << endl <<
         std::setw(20) << "(incoherentOutput )" << incoherentOutput.stats << endl <<
         std::setw(20) << "(incoherentInverseFFT)" << incoherentInverseFFT.stats << endl <<
         std::setw(20) << "(incoherentFirFilterKernel)" << incoherentFirFilterKernel.stats << endl <<
         std::setw(20) << "(incoherentFinalFFT)" << incoherentFinalFFT.stats << endl <<
-        std::setw(20) << "(incoherentStokes)" <<  incoherentStokes.stats << endl );
-
+        std::setw(20) << "(incoherentStokes)" << incoherentStokes.stats << endl <<
+        std::setw(20) << "(incoherentStokesTranspose)" << incoherentStokesTranspose.stats << endl);
     }
 
     void BeamFormerSubbandProc::processSubband(SubbandProcInputData &input,
       StreamableData &_output)
     {
       BeamFormedData &output = dynamic_cast<BeamFormedData&>(_output);
-      BeamFormedData &incoherentOutput = static_cast<BeamFormedData&>(_output);
+      BeamFormedData &incoherentOutput = dynamic_cast<BeamFormedData&>(_output);
 
       size_t block = input.blockID.block;
       unsigned subband = input.blockID.globalSubbandIdx;
@@ -358,19 +407,22 @@ namespace LOFAR
       intToFloatKernel->enqueue(input.blockID, counters.intToFloat);
 
       firstFFT.enqueue(input.blockID, counters.firstFFT);
+      dumpBuffer(devB, "firstFFT.output.dat");
+
       delayCompensationKernel->enqueue(
         input.blockID, counters.delayBp,
         ps.settings.subbands[subband].centralFrequency,
         ps.settings.subbands[subband].SAP);
+      dumpBuffer(delayCompensationBuffers.output, 
+                 "delayCompensation.output.dat");
 
       secondFFT.enqueue(input.blockID, counters.secondFFT);
-      correctBandPassKernel->enqueue(
-        input.blockID, counters.correctBandpass,
-        ps.settings.subbands[subband].centralFrequency,
-        ps.settings.subbands[subband].SAP);
+      dumpBuffer(devA, "secondFFT.output.dat");
 
-      // TODO: To allow the copy of data to new buffer we need a sync here?  
-      queue.copyBuffer(devC, devB, counters.copyBuffers, true);
+      bandPassCorrectionKernel->enqueue(
+        input.blockID, counters.correctBandpass);
+      dumpBuffer(bandPassCorrectionBuffers.output,
+                 "bandPassCorrection.output.dat");
 
       // ********************************************************************
       // coherent stokes kernels
@@ -384,40 +436,59 @@ namespace LOFAR
 
         inverseFFT.enqueue(input.blockID, counters.inverseFFT);
 
-        if (ps.settings.beamFormer.coherentSettings.nrChannels > 1) 
+        if (coherentStokesPPF) 
         {
           firFilterKernel->enqueue(input.blockID, 
             counters.firFilterKernel,
             input.blockID.subbandProcSubbandIdx);
           finalFFT.enqueue(input.blockID, counters.finalFFT);
         }
-        
-        coherentStokesKernel->enqueue(input.blockID, counters.coherentStokes);
+
+        if (!outputComplexVoltages)
+        {
+          coherentStokesKernel->enqueue(input.blockID, counters.coherentStokes);
+        }
       }
       else
       {
         // ********************************************************************
         // incoherent stokes kernels
-        incoherentInverseFFT.enqueue(input.blockID, counters.incoherentInverseFFT);
+        incoherentTranspose->enqueue(
+          input.blockID, counters.incoherentStokesTranspose);
 
-        if (ps.settings.beamFormer.incoherentSettings.nrChannels > 1) 
+        incoherentInverseFFT.enqueue(
+          input.blockID, counters.incoherentInverseFFT);
+        dumpBuffer(devA, "inverseFFT.output.dat");
+
+        if (incoherentStokesPPF) 
         {
-          incoherentFirFilterKernel->enqueue(input.blockID, counters.incoherentFirFilterKernel,
+          incoherentFirFilterKernel->enqueue(
+            input.blockID, counters.incoherentFirFilterKernel,
             input.blockID.subbandProcSubbandIdx);
-          incoherentFinalFFT.enqueue(input.blockID, counters.incoherentFinalFFT);
+
+          incoherentFinalFFT.enqueue(
+            input.blockID, counters.incoherentFinalFFT);
+          dumpBuffer(devB, "finalFFT.output.dat");
         }
 
-        incoherentStokesKernel->enqueue(input.blockID, counters.incoherentStokes);
+        incoherentStokesKernel->enqueue(
+          input.blockID, counters.incoherentStokes);
+
         // TODO: Propagate flags
       }
+
       queue.synchronize();
 
+      // Output in devD and devE, by design.
       if (coherentBeamformer)
-        queue.readBuffer(output, devResult, counters.visibilities, true);
+        queue.readBuffer(
+          output, devD, counters.visibilities, true);
       else
-        queue.readBuffer(incoherentOutput, devIncoherentStokes, counters.incoherentOutput, true);
+        queue.readBuffer(
+          incoherentOutput, devE, 
+          counters.incoherentOutput, true);
 
-            // ************************************************
+      // ************************************************
       // Perform performance statistics if needed
       if (gpuProfiling)
       {
@@ -431,10 +502,9 @@ namespace LOFAR
         counters.correctBandpass.logTime();
 
         counters.samples.logTime();
-        counters.copyBuffers.logTime();
         if (coherentBeamformer)
         {
-          if (ps.settings.beamFormer.coherentSettings.nrChannels > 1) 
+          if (coherentStokesPPF) 
           {
             counters.firFilterKernel.logTime();
             counters.finalFFT.logTime();
@@ -443,13 +513,19 @@ namespace LOFAR
           counters.beamformer.logTime();
           counters.transpose.logTime();
           counters.inverseFFT.logTime();
-          counters.coherentStokes.logTime();
+
+          if (!outputComplexVoltages)
+          {
+            counters.coherentStokes.logTime();
+          }
+
           counters.visibilities.logTime();
         }
         else
         {
+          counters.incoherentStokesTranspose.logTime();
           counters.incoherentInverseFFT.logTime();
-          if (ps.settings.beamFormer.incoherentSettings.nrChannels > 1) 
+          if (incoherentStokesPPF) 
           {
             counters.incoherentFirFilterKernel.logTime();
             counters.incoherentFinalFFT.logTime();
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h
index 5a3a4e7c2be1b37a1f1c312cd0e2132b793fde1c..58f113bca48e9724c294008de5a5dd107a33d680 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h
+++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h
@@ -35,10 +35,12 @@
 #include <GPUProc/Kernels/IntToFloatKernel.h>
 #include <GPUProc/Kernels/FFT_Kernel.h>
 #include <GPUProc/Kernels/DelayAndBandPassKernel.h>
+#include <GPUProc/Kernels/BandPassCorrectionKernel.h>
 #include <GPUProc/Kernels/BeamFormerKernel.h>
 #include <GPUProc/Kernels/BeamFormerTransposeKernel.h>
 #include <GPUProc/Kernels/CoherentStokesKernel.h>
 #include <GPUProc/Kernels/IncoherentStokesKernel.h>
+#include <GPUProc/Kernels/IncoherentStokesTransposeKernel.h>
 #include <GPUProc/Kernels/FIR_FilterKernel.h>
 
 #include "SubbandProc.h"
@@ -105,12 +107,12 @@ namespace LOFAR
         PerformanceCounter incoherentFirFilterKernel;
         PerformanceCounter incoherentFinalFFT;
         PerformanceCounter incoherentStokes;
+        PerformanceCounter incoherentStokesTranspose;
 
 
         // gpu transfer counters
         PerformanceCounter samples;
         PerformanceCounter visibilities;
-        PerformanceCounter copyBuffers;
         PerformanceCounter incoherentOutput;
         // Print the mean and std of each performance counter on the logger
         void printStats();
@@ -129,10 +131,14 @@ namespace LOFAR
       // in the InputData class
       SubbandProcInputData::DeviceBuffers devInput;
 
+      // @{
+      // Device memory buffers. These buffers are used interleaved.
       gpu::DeviceMemory devA;
       gpu::DeviceMemory devB;
       gpu::DeviceMemory devC;
-      gpu::DeviceMemory devD;     
+      gpu::DeviceMemory devD;
+      gpu::DeviceMemory devE;
+      // @}
 
       // NULL placeholder for unused DeviceMemory parameters
       gpu::DeviceMemory devNull;
@@ -141,39 +147,44 @@ namespace LOFAR
        * Kernels
        */
 
-      // int -> float
+      // Int -> Float conversion
       IntToFloatKernel::Buffers intToFloatBuffers;
       std::auto_ptr<IntToFloatKernel> intToFloatKernel;
 
-      // first FFT
+      // First (64 points) FFT
       FFT_Kernel firstFFT;
 
-      // delay compensation
+      // Delay compensation
       DelayAndBandPassKernel::Buffers delayCompensationBuffers;
       std::auto_ptr<DelayAndBandPassKernel> delayCompensationKernel;
 
-      // second FFT
+      // Second (64 points) FFT
       FFT_Kernel secondFFT;
 
-      // bandpass correction
+      // Bandpass correction and tranpose
       gpu::DeviceMemory devBandPassCorrectionWeights;
-      DelayAndBandPassKernel::Buffers correctBandPassBuffers;
-      std::auto_ptr<DelayAndBandPassKernel> correctBandPassKernel;
+      BandPassCorrectionKernel::Buffers bandPassCorrectionBuffers;
+      std::auto_ptr<BandPassCorrectionKernel> bandPassCorrectionKernel;
 
       // *****************************************************************
       //  Objects needed to produce Coherent stokes output
       // beam former
+
+      const bool outputComplexVoltages;
+      const bool coherentStokesPPF;
+
       gpu::DeviceMemory devBeamFormerDelays;
       BeamFormerKernel::Buffers beamFormerBuffers;
       std::auto_ptr<BeamFormerKernel> beamFormerKernel;
 
+      // Transpose 
       BeamFormerTransposeKernel::Buffers transposeBuffers;
       std::auto_ptr<BeamFormerTransposeKernel> transposeKernel;
 
-      // inverse FFT
+      // inverse (4k points) FFT
       FFT_Kernel inverseFFT;
 
-      // PPF
+      // Poly-phase filter (FIR + FFT)
       gpu::DeviceMemory devFilterWeights;
       gpu::DeviceMemory devFilterHistoryData;
       FIR_FilterKernel::Buffers firFilterBuffers;
@@ -184,32 +195,34 @@ namespace LOFAR
       CoherentStokesKernel::Buffers coherentStokesBuffers;
       std::auto_ptr<CoherentStokesKernel> coherentStokesKernel;
 
-      // end result
-      gpu::DeviceMemory &devResult;
-
       // *****************************************************************
       //  Objects needed to produce incoherent stokes output
+
+      const bool incoherentStokesPPF;
+
+      // Transpose 
+      IncoherentStokesTransposeKernel::Buffers incoherentTransposeBuffers;
+      std::auto_ptr<IncoherentStokesTransposeKernel> incoherentTranspose;
+
+      // Inverse (4k points) FFT
       FFT_Kernel incoherentInverseFFT;
 
-      //// PPF
+      // Poly-phase filter (FIR + FFT)
       gpu::DeviceMemory devIncoherentFilterWeights;
       gpu::DeviceMemory devIncoherentFilterHistoryData;
       FIR_FilterKernel::Buffers incoherentFirFilterBuffers;
       std::auto_ptr<FIR_FilterKernel> incoherentFirFilterKernel;
       FFT_Kernel incoherentFinalFFT;
 
-      //// Incoherent Stokes
+      // Incoherent Stokes
       IncoherentStokesKernel::Buffers incoherentStokesBuffers;
       std::auto_ptr<IncoherentStokesKernel> incoherentStokesKernel;
 
-      //output for Incoherent stokes 
-      gpu::DeviceMemory &devIncoherentStokes;
-
       bool coherentBeamformer; // TODO temporary hack to allow typing of subband proc
     };
 
   }
-      }
+}
 
 #endif
 
diff --git a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc
index a19670dc636714ee00c0a21bef5e6de2a8d99b43..316660401c922fce92913dc402183d2e82bc5370 100644
--- a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc
+++ b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc
@@ -1,4 +1,5 @@
-//# tCorrelatorWorKQueueProcessSb.cc: test CorrelatorSubbandProc::processSubband()
+//# tBeamFormerSubbandProcProcessSb: test of Beamformer subband processor.
+//#
 //# Copyright (C) 2013  ASTRON (Netherlands Institute for Radio Astronomy)
 //# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands
 //#
@@ -29,14 +30,21 @@
 #include <GPUProc/SubbandProcs/BeamFormerSubbandProc.h>
 #include <GPUProc/SubbandProcs/BeamFormerFactories.h>
 
+#include "../fpequals.h"
+
 using namespace std;
 using namespace LOFAR::Cobalt;
 using namespace LOFAR::TYPES;
 
+const unsigned nrChannel1 = 64;
+const unsigned nrChannel2 = 64;
+
 template<typename T> T inputSignal(size_t t)
 {
-  double freq = 1.0 / 2.0; // in samples
-  double amp = 255.0;
+  size_t nrBits = sizeof(T) / 2 * 8;
+  double freq = 1.0 / 4.0; // in samples
+  // double freq = (2 * 64.0 + 17.0) / 4096.0; // in samples
+  double amp = (1 << (nrBits - 1)) - 1;
 
   double angle = (double)t * 2.0 * M_PI * freq;
 
@@ -63,38 +71,93 @@ int main() {
 
   Parset ps("tBeamFormerSubbandProcProcessSb.parset");
 
-  // Create very simple kernel programs, with predictable output. Skip as much as possible.
-  // Nr of channels/sb from the parset is 1, so the PPF will not even run.
-  // Parset also has turned of delay compensation and bandpass correction
-  // (but that kernel will run to convert int to float and to transform the data order).
+  // Input array sizes
+  const size_t nrBeams = ps.nrBeams();
+  const size_t nrStations = ps.nrStations();
+  const size_t nrPolarisations = ps.settings.nrPolarisations;
+  const size_t maxNrTABsPerSAP = ps.settings.beamFormer.maxNrTABsPerSAP();
+  const size_t nrSamplesPerSubband = ps.nrSamplesPerSubband();
+  const size_t nrBitsPerSample = ps.settings.nrBitsPerSample;
+  const size_t nrBytesPerComplexSample = ps.nrBytesPerComplexSample();
+
+  // We only support 8-bit or 16-bit input samples
+  ASSERT(nrBitsPerSample == 8 || nrBitsPerSample == 16);
+
+  // TODO: Amplitude is now calculated at two places. Dangerous!
+  const size_t amplitude = (1 << (nrBitsPerSample - 1)) - 1;
+  const size_t scaleFactor = nrBitsPerSample == 16 ? 1 : 16;
+
+  LOG_INFO_STR(
+    "Input info:" <<
+    "\n  nrBeams = " << nrBeams <<
+    "\n  nrStations = " << nrStations <<
+    "\n  nrPolarisations = " << nrPolarisations <<
+    "\n  maxNrTABsPerSAP = " << maxNrTABsPerSAP <<
+    "\n  nrSamplesPerSubband = " << nrSamplesPerSubband <<
+    "\n  nrBitsPerSample = " << nrBitsPerSample <<
+    "\n  nrBytesPerComplexSample = " << nrBytesPerComplexSample);
+
+  // Output array sizes
+  const size_t nrStokes = ps.settings.beamFormer.incoherentSettings.nrStokes;
+  const size_t nrChannels = 
+    ps.settings.beamFormer.incoherentSettings.nrChannels;
+  const size_t nrSamples = 
+    ps.settings.beamFormer.incoherentSettings.nrSamples(
+      ps.settings.nrSamplesPerSubband());
+
+  LOG_INFO_STR(
+    "Output info:" <<
+    "\n  nrStokes = " << nrStokes <<
+    "\n  nrChannels = " << nrChannels <<
+    "\n  nrSamples = " << nrSamples <<
+    "\n  scaleFactor = " << scaleFactor);
+
+  // Create very simple kernel programs, with predictable output. Skip as much
+  // as possible. Nr of channels/sb from the parset is 1, so the PPF will not
+  // even run. Parset also has turned of delay compensation and bandpass
+  // correction (but that kernel will run to convert int to float and to
+  // transform the data order).
 
   BeamFormerFactories factories(ps);
   BeamFormerSubbandProc bwq(ps, ctx, factories);
 
-  SubbandProcInputData in(ps.nrBeams(), ps.nrStations(), ps.settings.nrPolarisations,
-                          ps.settings.beamFormer.maxNrTABsPerSAP(), 
-                          ps.nrSamplesPerSubband(), ps.nrBytesPerComplexSample(), ctx);
+  SubbandProcInputData in(
+    nrBeams, nrStations, nrPolarisations, maxNrTABsPerSAP, 
+    nrSamplesPerSubband, nrBytesPerComplexSample, ctx);
 
   // Initialize synthetic input to input signal
-  for (size_t st = 0; st < ps.nrStations(); st++)
-    for (size_t i = 0; i < ps.nrSamplesPerSubband(); i++)
-      for (size_t pol = 0; pol < NR_POLARIZATIONS; pol++)
+  for (size_t st = 0; st < nrStations; st++)
+    for (size_t i = 0; i < nrSamplesPerSubband; i++)
+      for (size_t pol = 0; pol < nrPolarisations; pol++)
       {
-        if (ps.nrBytesPerComplexSample() == 4) { // 16 bit mode
-          *(i16complex*)&in.inputSamples[st][i][pol][0] = inputSignal<i16complex>(i);
-        } else if (ps.nrBytesPerComplexSample() == 2) { // 8 bit mode
-          *(i8complex*)&in.inputSamples[st][i][pol][0] = inputSignal<i8complex>(i);
-        } else {
-          cerr << "Error: number of bits per sample must be 8, or 16" << endl;
-          exit(1);
+        switch(nrBitsPerSample) {
+        case 8:
+          reinterpret_cast<i8complex&>(in.inputSamples[st][i][pol][0]) =
+            inputSignal<i8complex>(i);
+          break;
+        case 16:
+          reinterpret_cast<i16complex&>(in.inputSamples[st][i][pol][0]) = 
+            inputSignal<i16complex>(i);
+          break;
+        default:
+          break;
         }
       }
 
-  // Initialize subbands partitioning administration (struct BlockID). We only do the 1st block of whatever.
-  in.blockID.block = 0;            // Block number: 0 .. inf
-  in.blockID.globalSubbandIdx = 0; // Subband index in the observation: [0, ps.nrSubbands())
-  in.blockID.localSubbandIdx = 0;  // Subband index for this pipeline/workqueue: [0, subbandIndices.size())
-  in.blockID.subbandProcSubbandIdx = 0; // Subband index for this SubbandProc
+  // Initialize subbands partitioning administration (struct BlockID). We only
+  // do the 1st block of whatever.
+
+  // Block number: 0 .. inf
+  in.blockID.block = 0;
+
+ // Subband index in the observation: [0, ps.nrSubbands())
+  in.blockID.globalSubbandIdx = 0;
+
+  // Subband index for this pipeline/workqueue: [0, subbandIndices.size())
+  in.blockID.localSubbandIdx = 0;
+
+  // Subband index for this SubbandProc
+  in.blockID.subbandProcSubbandIdx = 0;
 
   // Initialize delays. We skip delay compensation, but init anyway,
   // so we won't copy uninitialized data to the device.
@@ -107,10 +170,7 @@ int main() {
   for (size_t i = 0; i < in.tabDelays.num_elements(); i++)
     in.tabDelays.get<float>()[i] = 0.0f;
 
-  BeamFormedData out(ps.settings.beamFormer.maxNrTABsPerSAP() * ps.settings.beamFormer.coherentSettings.nrStokes,
-                     ps.settings.beamFormer.coherentSettings.nrChannels,
-                     ps.settings.beamFormer.coherentSettings.nrSamples(ps.settings.nrSamplesPerSubband()),
-                     ctx);
+  BeamFormedData out(maxNrTABsPerSAP * nrStokes, nrChannels, nrSamples, ctx);
 
   for (size_t i = 0; i < out.num_elements(); i++)
     out.get<float>()[i] = 42.0f;
@@ -124,39 +184,42 @@ int main() {
   cout << "Output: " << endl;
 
   // Output verification
-  // The int2float conversion scales its output to the same amplitude as in 16 bit mode.
-  // For 8 bit mode, that is a factor 256.
-  // Since we inserted all (1, 1) vals, for 8 bit mode this means that the correlator
-  // outputs 256*256. It then sums over nrSamplesPerSb values.
-#if 0
-  unsigned scale = 1*1;
-  if (ps.nrBitsPerSample() == 8)
-    scale = 256*256;
-#endif
-  bool unexpValueFound = false;
-  for (size_t b = 0; b < ps.settings.beamFormer.maxNrTABsPerSAP() * ps.settings.beamFormer.coherentSettings.nrStokes; b++)
-    for (size_t t = 0; t < ps.settings.beamFormer.coherentSettings.nrSamples(ps.settings.nrSamplesPerSubband()); t++)
-      for (size_t c = 0; c < ps.settings.beamFormer.coherentSettings.nrChannels; c++)
-        {
-          float v = out[b][t][c];
-// disable output validation until we've verified the beamformer pipeline!
-#if 0
-          if (v != 4.0f)
-          {
-            unexpValueFound = true;
-            cout << '*'; // indicate error in output
-          }
-#endif
-          cout << v << " ";
-        }
-  cout << endl;
 
-  if (unexpValueFound)
-  {
-    cerr << "Error: Found unexpected output value(s)" << endl;
-    return 1;
+  // We can calculate the expected output values, since we're supplying a
+  // complex sine/cosine input signal. We only have Stokes-I, so the output
+  // should be: (nrStation * amp * scaleFactor * nrChannel1 * nrChannel2)^2
+  // - amp is set to the maximum possible value for the bit-mode:
+  //   i.e. 127 for 8-bit and 32767 for 16-bit mode
+  // - scaleFactor is the scaleFactor applied by the IntToFloat kernel. 
+  //   It is 16 for 8-bit mode and 1 for 16-bit mode.
+  // Hence, each output sample should be: 
+  // - for 16-bit input: (2 * 32767 * 1 * 64 * 64)^2 = 72053196058525696
+  // - for 8-bit input: (2 * 127 * 16 * 64 * 64)^2 = 1082398867456
+
+  float outVal;
+  switch(nrBitsPerSample) {
+  case 8:
+    outVal = 
+      nrStations * amplitude * scaleFactor * nrChannel1 * nrChannel2 *
+      nrStations * amplitude * scaleFactor * nrChannel1 * nrChannel2; 
+    break;
+  case 16:
+    outVal = 
+      nrStations * amplitude * scaleFactor * nrChannel1 * nrChannel2 *
+      nrStations * amplitude * scaleFactor * nrChannel1 * nrChannel2; 
+    break;
+  default:
+    break;
   }
-
+  cout << "outVal = " << outVal << endl;
+
+  for (size_t s = 0; s < nrStokes; s++)
+    for (size_t t = 0; t < nrSamples; t++)
+      for (size_t c = 0; c < nrChannels; c++)
+        ASSERTSTR(fpEquals(out[s][t][c], outVal), 
+                  "out[" << s << "][" << t << "][" << c << "] = " << 
+                  out[s][t][c] << "; outVal = " << outVal);
+  
   return 0;
 }
 
diff --git a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.parset b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.parset
index 36d890a142dd345a2a8f7df6f45179ee1374174d..e64611e615b91e354ae007be39d4d1fd389886ba 100644
--- a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.parset
+++ b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.parset
@@ -8,21 +8,30 @@ Observation.VirtualInstrument.stationList = [CS002]
 Observation.antennaSet = HBA_DUAL
 Observation.nrBeams                      = 1
 Observation.Beam[0].subbandList	         = [300]
-Observation.Beam[0].nrTiedArrayBeams     = 2
+Observation.Beam[0].nrTiedArrayBeams     = 1
 Observation.Beam[0].TiedArrayBeam[0].angle1 = 0
 Observation.Beam[0].TiedArrayBeam[0].angle2 = 0
-Observation.Beam[0].TiedArrayBeam[0].coherent = T
-Observation.Beam[0].TiedArrayBeam[1].angle1 = 0
-Observation.Beam[0].TiedArrayBeam[1].angle2 = 0
-Observation.Beam[0].TiedArrayBeam[1].coherent = T
+Observation.Beam[0].TiedArrayBeam[0].coherent = F
 
 OLAP.CNProc_CoherentStokes.which	 = I # IQUV
 OLAP.CNProc_CoherentStokes.timeIntegrationFactor = 1
 OLAP.CNProc_CoherentStokes.channelsPerSubband = 1
+OLAP.CNProc_IncoherentStokes.which	 = I # IQUV
+OLAP.CNProc_IncoherentStokes.timeIntegrationFactor = 1
+OLAP.CNProc_IncoherentStokes.channelsPerSubband = 1
 Observation.rspBoardList                 = [0]
 Observation.rspSlotList                  = [0]
 
 Observation.DataProducts.Output_Beamformed.enabled=true
-Observation.DataProducts.Output_Beamformed.filenames=[tab0.raw,tab1.raw]
-Observation.DataProducts.Output_Beamformed.locations=[2*:.]
+Observation.DataProducts.Output_Beamformed.filenames=[tab0.raw]
+Observation.DataProducts.Output_Beamformed.locations=[1*:.]
 
+Cobalt.Kernels.BeamFormerKernel.dumpOutput = true
+Cobalt.Kernels.BeamFormerTransposeKernel.dumpOutput = true
+Cobalt.Kernels.CoherentStokesKernel.dumpOutput = true
+Cobalt.Kernels.DelayAndBandPassKernel.dumpOutput = true
+Cobalt.Kernels.FFT_Kernel.dumpOutput = true
+Cobalt.Kernels.FIR_FilterKernel.dumpOutput = true
+Cobalt.Kernels.IncoherentStokesKernel.dumpOutput = true
+Cobalt.Kernels.IncoherentStokesTransposeKernel.dumpOutput = true
+Cobalt.Kernels.IntToFloatKernel.dumpOutput = true
diff --git a/RTCP/Cobalt/GPUProc/test/cuda/tBandPassCorrection.cc b/RTCP/Cobalt/GPUProc/test/cuda/tBandPassCorrection.cc
index 5625c13993b976ba2e6e3b885dbc504151da32d2..126caca50e33f56b5f6d7bd4160c440dcb49b8ad 100644
--- a/RTCP/Cobalt/GPUProc/test/cuda/tBandPassCorrection.cc
+++ b/RTCP/Cobalt/GPUProc/test/cuda/tBandPassCorrection.cc
@@ -127,6 +127,7 @@ CompileDefinitions getDefaultCompileDefinitions()
     boost::lexical_cast<string>(NR_POLARIZATIONS);
   defs["NR_BITS_PER_SAMPLE"] =
     boost::lexical_cast<string>(NR_BITS_PER_SAMPLE);
+  defs["DO_BANDPASS_CORRECTION"] = "1";
 
   return defs;
 }