diff --git a/RTCP/Cobalt/GPUProc/doc/pipeline-buffers.txt b/RTCP/Cobalt/GPUProc/doc/pipeline-buffers.txt
index 347929e4e1f1424c800351af09d694074594fa1f..07312c924071f2234805705cfcdce9ab57f6b3fa 100644
--- a/RTCP/Cobalt/GPUProc/doc/pipeline-buffers.txt
+++ b/RTCP/Cobalt/GPUProc/doc/pipeline-buffers.txt
@@ -57,6 +57,8 @@ Correlator
    V
 (output)
 
+If 1ch, IntToFloat (A->E) is used instead of FIR+FFT.
+
 (BF) Preprocessing [A -> B, trashes A]
 -----------------------------------
 NB: The key Cobalt.BeamFormer.stationList can be used to select a subset of antenna fields in the observation
diff --git a/RTCP/Cobalt/GPUProc/share/gpu/kernels/DelayAndBandPass.cu b/RTCP/Cobalt/GPUProc/share/gpu/kernels/DelayAndBandPass.cu
index 62ef0e89e40a45fbd729e955005375e73a1d8121..3ae289a1258685063d07fcc77493c790ae049ee9 100644
--- a/RTCP/Cobalt/GPUProc/share/gpu/kernels/DelayAndBandPass.cu
+++ b/RTCP/Cobalt/GPUProc/share/gpu/kernels/DelayAndBandPass.cu
@@ -65,31 +65,7 @@ typedef  fcomplex(*OutputDataType)[NR_STATIONS][NR_CHANNELS][NR_SAMPLES_PER_CHAN
 typedef  fcomplex(*OutputDataType)[NR_STATIONS][NR_POLARIZATIONS][NR_CHANNELS][NR_SAMPLES_PER_CHANNEL];
 #endif
 
-//# TODO: Unify #dims in input type to 4: [NR_SAMPLES_PER_SUBBAND] -> [NR_SAMPLES_PER_CHANNEL][NR_CHANNELS] (see kernel test)
-//#       It is technically incorrect, but different dims for the same input type is a real pain to use/supply.
-//#       Also unify order of #chn, #sampl to [NR_SAMPLES_PER_CHANNEL][NR_CHANNELS]
-#ifdef INPUT_IS_STATIONDATA
-#  if NR_BITS_PER_SAMPLE == 16
-typedef  short_complex rawSampleType;
-typedef  short_complex(*InputDataType)[NR_STATIONS][NR_SAMPLES_PER_SUBBAND][NR_POLARIZATIONS];
-#define REAL(sample) sample.x
-#define IMAG(sample) sample.y
-#  elif NR_BITS_PER_SAMPLE == 8
-typedef  char_complex  rawSampleType;
-typedef  char_complex(*InputDataType)[NR_STATIONS][NR_SAMPLES_PER_SUBBAND][NR_POLARIZATIONS];
-#define REAL(sample) sample.x
-#define IMAG(sample) sample.y
-#  elif NR_BITS_PER_SAMPLE == 4
-typedef  signed char   rawSampleType;
-typedef  signed char   (*InputDataType)[NR_STATIONS][NR_SAMPLES_PER_SUBBAND][NR_POLARIZATIONS];
-#define REAL(sample) extractRI(sample, false)
-#define IMAG(sample) extractRI(sample, true)
-#  else
-#    error unsupported NR_BITS_PER_SAMPLE
-#  endif
-#else
 typedef  fcomplex(*InputDataType)[NR_STATIONS][NR_POLARIZATIONS][NR_SAMPLES_PER_CHANNEL][NR_CHANNELS];
-#endif
 typedef  const double(*DelaysType)[NR_SAPS][NR_DELAYS][NR_POLARIZATIONS]; // 2 Polarizations; in seconds
 typedef  const double2(*Phase0sType)[NR_STATIONS]; // 2 Polarizations; in radians
 typedef  const float(*BandPassFactorsType)[NR_CHANNELS];
@@ -124,9 +100,7 @@ inline __device__ fcomplex sincos_d2f(double phi)
 *                                 of ::complex (2 complex polarizations)
 * @param[in]  filteredDataPtr     pointer to input data; this can either be a
 *                                 4D array [station][polarization][sample][channel][complex]
-*                                 of ::fcomplex, or a 2D array [station][subband][complex]
-*                                 of ::short_complex2 or ::char_complex2,
-*                                 depending on the value of @c NR_CHANNELS
+*                                 of ::fcomplex
 * @param[in]  subbandFrequency    center freqency of the subband
 * @param[in]  beam                index number of the beam
 * @param[in]  delaysAtBeginPtr    pointer to delay data of ::DelaysType,
@@ -251,17 +225,8 @@ extern "C" {
 
     for (unsigned time = timeStart; time < NR_SAMPLES_PER_CHANNEL; time += timeInc)
     {
-#ifdef INPUT_IS_STATIONDATA
-      const rawSampleType sampleXraw = (*inputData)[station][time][0];
-      fcomplex sampleX = make_float2(convertIntToFloat(REAL(sampleXraw)),
-                                     convertIntToFloat(IMAG(sampleXraw)));
-      const rawSampleType sampleYraw = (*inputData)[station][time][1];
-      fcomplex sampleY = make_float2(convertIntToFloat(REAL(sampleYraw)),
-                                     convertIntToFloat(IMAG(sampleYraw)));
-#else
       fcomplex sampleX = (*inputData)[station][0][time][channel];
       fcomplex sampleY = (*inputData)[station][1][time][channel];
-#endif
 
 #if defined DELAY_COMPENSATION    
       // Offset of this sample between begin and end.
diff --git a/RTCP/Cobalt/GPUProc/share/gpu/kernels/IntToFloat.cu b/RTCP/Cobalt/GPUProc/share/gpu/kernels/IntToFloat.cu
index b8e002d8ee5299e1a90a58e72e1fc22b82be0cb8..f0a8fb035c35727cad3470df3cdd702eb4dd2af4 100644
--- a/RTCP/Cobalt/GPUProc/share/gpu/kernels/IntToFloat.cu
+++ b/RTCP/Cobalt/GPUProc/share/gpu/kernels/IntToFloat.cu
@@ -67,6 +67,8 @@ typedef float2     (*ConvertedDataType)[NR_OUTPUT_STATIONS][NR_POLARIZATIONS][NR
  *
  * Optional preprocessor symbols:
  * - DO_FFTSHIFT, if an fft-shift is to be performed as well
+ * - DO_STATIONSUBSET, if the stationIndices input array is to be used to select a subset
+ *                     of stations.
  *
  * Execution configuration:
  * - Use a 1D thread block. No restrictions.
@@ -82,8 +84,13 @@ __global__ void intToFloat(void *convertedDataPtr,
   ConvertedDataType convertedData = (ConvertedDataType)convertedDataPtr;
   SampledDataType   sampledData   = (SampledDataType)  sampledDataPtr;
 
+#ifdef DO_STATIONSUBSET
   uint station_in  = stationIndices[blockIdx.y];
   uint station_out = blockIdx.y;
+#else
+  uint station_in  = blockIdx.y;
+  uint station_out = blockIdx.y;
+#endif
 
 #ifdef DO_FFTSHIFT
   // Multiplication factor: 1 for even samples, -1 for odd samples
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h b/RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h
index 5efc4d02212d4a0a6e7d5eae8ec049a431c19912..9fc75474dd12c2915322dc8f775ed1e15a27a0f6 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h
+++ b/RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h
@@ -99,8 +99,12 @@ namespace LOFAR
       {
         // Since we use overlapping input/output buffers, their size
         // could be larger than we need.
-        ASSERT(buffers.input.size() >= bufferSize(T::INPUT_DATA));
-        ASSERT(buffers.output.size() >= bufferSize(T::OUTPUT_DATA));
+        ASSERTSTR(buffers.input.size() >= bufferSize(T::INPUT_DATA),
+          "Require " << bufferSize(T::INPUT_DATA) << " bytes for input, "
+          "but buffer is only " << buffers.input.size() << " bytes.");
+        ASSERTSTR(buffers.output.size() >= bufferSize(T::OUTPUT_DATA),
+          "Require " << bufferSize(T::OUTPUT_DATA) << " bytes for output, "
+          "but buffer is only " << buffers.output.size() << " bytes.");
 
         return new T(
           stream, createModule(stream.getContext(), 
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.cc
index 05cbe311d80d9568bb0046f8bee28c3eed442eb8..6205b45ad6c2e935adaf8f71e52e622bacac7103 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.cc
@@ -50,10 +50,6 @@ namespace LOFAR
       delayIndices(nrStations),
       nrDelays(ps.settings.antennaFieldNames.size()),
       nrBitsPerSample(ps.settings.nrBitsPerSample),
-      inputIsStationData(correlator && ps.settings.correlator.nrChannels == 1
-                                    ? true
-                                    : false),
-
       nrChannels(correlator ? ps.settings.correlator.nrChannels
                             : ps.settings.beamFormer.nrDelayCompensationChannels),
       nrSamplesPerChannel(ps.settings.blockSize / nrChannels),
@@ -94,9 +90,7 @@ namespace LOFAR
 
 
     unsigned DelayAndBandPassKernel::Parameters::nrBytesPerComplexSample() const {
-      return inputIsStationData
-               ? 2 * nrBitsPerSample / 8
-               : sizeof(std::complex<float>);
+      return sizeof(std::complex<float>);
     }
 
 
@@ -213,9 +207,6 @@ namespace LOFAR
       defs["NR_BITS_PER_SAMPLE"] =
         lexical_cast<string>(itsParameters.nrBitsPerSample);
 
-      if (itsParameters.inputIsStationData)
-        defs["INPUT_IS_STATIONDATA"] = "1";
-
       defs["NR_CHANNELS"] = lexical_cast<string>(itsParameters.nrChannels);
       defs["NR_SAMPLES_PER_CHANNEL"] = 
         lexical_cast<string>(itsParameters.nrSamplesPerChannel);
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.h b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.h
index d5bcd7bb673b5fa8eb789edab75c6abf9b9a1957..a2f051c6f0aef458ac27e3d96d3b58e9d50fb314 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.h
+++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.h
@@ -60,7 +60,6 @@ namespace LOFAR
         std::vector<unsigned> delayIndices;
         unsigned nrDelays;
         unsigned nrBitsPerSample;
-        bool inputIsStationData;
 
         unsigned nrChannels;
         unsigned nrSamplesPerChannel;
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/IntToFloatKernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/IntToFloatKernel.cc
index e29f6af6e5cf7bd86832eb27c1d59fa40bf72b80..57717698695df1aa0c066546f7b9f642ee10204b 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/IntToFloatKernel.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/IntToFloatKernel.cc
@@ -45,17 +45,16 @@ namespace LOFAR
     string IntToFloatKernel::theirSourceFile = "IntToFloat.cu";
     string IntToFloatKernel::theirFunction = "intToFloat";
 
-    IntToFloatKernel::Parameters::Parameters(const Parset& ps) :
+    IntToFloatKernel::Parameters::Parameters(const Parset& ps, bool fftShift, bool beamFormerStationSubset) :
       Kernel::Parameters("intToFloat"),
       nrInputStations(ps.settings.antennaFields.size()),
-      stationIndices(ObservationSettings::AntennaFieldName::indices(ps.settings.beamFormer.antennaFieldNames, ps.settings.antennaFieldNames)),
+      stationIndices(beamFormerStationSubset ? ObservationSettings::AntennaFieldName::indices(ps.settings.beamFormer.antennaFieldNames, ps.settings.antennaFieldNames) : vector<unsigned>()),
       nrBitsPerSample(ps.settings.nrBitsPerSample),
 
       nrSamplesPerSubband(ps.settings.blockSize),
-      fftShift(ps.settings.beamFormer.nrDelayCompensationChannels > 1)
+      fftShift(fftShift),
+      doStationSubset(beamFormerStationSubset)
     {
-      ASSERTSTR(ps.settings.beamFormer.enabled, "IntToFloatKernel::Parameters assumes it will be used in the beamFormer, but the beamFormer is not enabled.");
-
       dumpBuffers = 
         ps.getBool("Cobalt.Kernels.IntToFloatKernel.dumpOutput", false);
       dumpFilePattern = 
@@ -133,6 +132,8 @@ namespace LOFAR
 
       if (itsParameters.fftShift)
         defs["DO_FFTSHIFT"] = "1";
+      if (itsParameters.doStationSubset)
+        defs["DO_STATIONSUBSET"] = "1";
 
       return defs;
     }
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/IntToFloatKernel.h b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/IntToFloatKernel.h
index 3c212b974f6475565936841e9d0f127afef0aad7..434d6f31b097ee17b94aa685120b47f61e76e5a6 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/IntToFloatKernel.h
+++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/IntToFloatKernel.h
@@ -51,7 +51,7 @@ namespace LOFAR
       // IntToFloatKernel class.
       struct Parameters : Kernel::Parameters
       {
-        Parameters(const Parset& ps);
+        Parameters(const Parset& ps, bool fftShift, bool beamFormerStationSubset);
         unsigned nrInputStations;
         std::vector<unsigned> stationIndices; // input station nr for ewch output station
 
@@ -61,10 +61,11 @@ namespace LOFAR
         unsigned nrSamplesPerSubband;
 
         bool fftShift;
+        bool doStationSubset;
 
         size_t bufferSize(BufferType bufferType) const;
 
-        unsigned nrOutputStations() const { return stationIndices.size(); }
+        unsigned nrOutputStations() const { return doStationSubset ? stationIndices.size() : nrInputStations; }
       };
 
       IntToFloatKernel(const gpu::Stream &stream,
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/ZeroingKernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/ZeroingKernel.cc
index 4d0d9a1e8cab696c808f26a988f3d628f840c9ce..cb828e4eb085a3aa533e2715ed22362fff90adf2 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/ZeroingKernel.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/ZeroingKernel.cc
@@ -108,6 +108,7 @@ namespace LOFAR
       // marshall flags to GPU host buffer
       computeMaskTimer.start();
       for(unsigned station = 0; station < nrSTABs; ++station) {
+        LOG_DEBUG_STR("Flags for block " << blockId << ", station " << station << ": " << channelFlags[station]);
         channelFlags[station].toByteset(hostMask.get<char>() + station * nrSamplesPerChannel, nrSamplesPerChannel);
       }
       computeMaskTimer.stop();
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerPreprocessingStep.cc b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerPreprocessingStep.cc
index e85db40772492a8a63dfd4eabc20818ef7389670..a9aac05e5e1c927a5f538b3b92381ff5498701fc 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerPreprocessingStep.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerPreprocessingStep.cc
@@ -37,7 +37,10 @@ namespace LOFAR
   namespace Cobalt
   {
     BeamFormerPreprocessingStep::Factories::Factories(const Parset &ps) :
-        intToFloat(ps),
+        intToFloat(IntToFloatKernel::Parameters(
+          ps, 
+          ps.settings.beamFormer.nrDelayCompensationChannels > 1,
+          true)),
 
         firstFFT(FFT_Kernel::Parameters(
           ps.settings.beamFormer.nrDelayCompensationChannels,
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/CorrelatorStep.cc b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/CorrelatorStep.cc
index b58796880974d8c4665ccf843e7c61bb5ba37269..e4d68345a01252163e97a7c743e6f0f88552087c 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/CorrelatorStep.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/CorrelatorStep.cc
@@ -61,12 +61,15 @@ namespace LOFAR
             std::sqrt((double)ps.settings.correlator.nrChannels),
             "FIR (correlator)"))
           : NULL),
-      zeroing(ps.settings.correlator.nrChannels > 1
-          ? new KernelFactory<ZeroingKernel>(ZeroingKernel::Parameters(ps,
+
+      intToFloat(ps.settings.correlator.nrChannels == 1
+          ? new KernelFactory<IntToFloatKernel>(IntToFloatKernel::Parameters(ps, false, false))
+          : NULL),
+
+      zeroing(ZeroingKernel::Parameters(ps,
             ps.settings.antennaFields.size(),
             ps.settings.correlator.nrChannels,
-            "Zeroing (correlator)"))
-          : NULL),
+            "Zeroing (correlator)")),
 
       delayAndBandPass(DelayAndBandPassKernel::Parameters(ps, true)),
 
@@ -257,10 +260,8 @@ namespace LOFAR
       :
       ProcessStep(parset, i_queue),
       correlatorPPF(ps.settings.correlator.nrChannels > 1),
-      devE(context, correlatorPPF
-                    ? std::max(factories.correlator.bufferSize(CorrelatorKernel::INPUT_DATA),
-                               factories.correlator.bufferSize(CorrelatorKernel::OUTPUT_DATA))
-                    : factories.correlator.bufferSize(CorrelatorKernel::OUTPUT_DATA)),
+      devE(context, std::max(factories.correlator.bufferSize(CorrelatorKernel::INPUT_DATA),
+                             factories.correlator.bufferSize(CorrelatorKernel::OUTPUT_DATA))),
       outputCounter(context, "output (correlator)"),
       integratedData(nrSubbandsPerSubbandProc)
     {
@@ -273,14 +274,16 @@ namespace LOFAR
 
         // FFT: B -> E
         fftKernel = factories.fft->create(queue, *devB, devE);
-
-        // Zeroing: E -> E
-        zeroingKernel = factories.zeroing->create(queue, devE, devE);
+      } else {
+        // intToFloat: A -> E
+        intToFloatKernel = factories.intToFloat->create(queue, *devA, devE);
       }
 
-      // Delay and Bandpass: A/E -> B
-      delayAndBandPassKernel = std::auto_ptr<DelayAndBandPassKernel>(factories.delayAndBandPass.create(queue, 
-        correlatorPPF ? devE : *devA, *devB));
+      // Zeroing: E -> E
+      zeroingKernel = factories.zeroing.create(queue, devE, devE);
+
+      // Delay and Bandpass: E -> B
+      delayAndBandPassKernel = std::auto_ptr<DelayAndBandPassKernel>(factories.delayAndBandPass.create(queue, devE, *devB));
 
       // Correlator: B -> E
       correlatorKernel = std::auto_ptr<CorrelatorKernel>(factories.correlator.create(queue,
@@ -333,6 +336,11 @@ namespace LOFAR
 
         // Zero the output of each FFT that had flagged input samples
         zeroingKernel->enqueue(input.blockID, flagsPerChannel);
+      } else {
+        intToFloatKernel->enqueue(input.blockID);
+
+        // Zero the flagged samples
+        zeroingKernel->enqueue(input.blockID, input.inputFlags);
       }
 
       // Even if we skip delay compensation and bandpass correction (rare), run
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/CorrelatorStep.h b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/CorrelatorStep.h
index 048fb6f5fb5a114a812c8658b865ba76f4e85278..052c850cb21e3aa47c3d54880464b40db00224d7 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/CorrelatorStep.h
+++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/CorrelatorStep.h
@@ -45,6 +45,7 @@
 #include <GPUProc/Kernels/DelayAndBandPassKernel.h>
 #include <GPUProc/Kernels/FIR_FilterKernel.h>
 #include <GPUProc/Kernels/FFT_Kernel.h>
+#include <GPUProc/Kernels/IntToFloatKernel.h>
 #include <GPUProc/Kernels/ZeroingKernel.h>
 #include <GPUProc/Kernels/CorrelatorKernel.h>
 
@@ -61,7 +62,9 @@ namespace LOFAR
 
         SmartPtr< KernelFactory<FFT_Kernel> > fft;
         SmartPtr< KernelFactory<FIR_FilterKernel> > firFilter;
-        SmartPtr< KernelFactory<ZeroingKernel> > zeroing;
+        SmartPtr< KernelFactory<IntToFloatKernel> > intToFloat;
+
+        KernelFactory<ZeroingKernel> zeroing;
 
         KernelFactory<DelayAndBandPassKernel> delayAndBandPass;
 
@@ -148,6 +151,9 @@ namespace LOFAR
       // FFT
       SmartPtr<FFT_Kernel> fftKernel;
 
+      // IntToFloat (in case of no FFT)
+      SmartPtr<IntToFloatKernel> intToFloatKernel;
+
       // Zeroing
       SmartPtr<ZeroingKernel> zeroingKernel;
 
diff --git a/RTCP/Cobalt/GPUProc/test/Kernels/tIntToFloatKernel.cc b/RTCP/Cobalt/GPUProc/test/Kernels/tIntToFloatKernel.cc
index 1736f9c0aee34b7547d224421ed7239b46a285eb..b7f6e4a8fa45f127f191c7055428d617a2fb4e5c 100644
--- a/RTCP/Cobalt/GPUProc/test/Kernels/tIntToFloatKernel.cc
+++ b/RTCP/Cobalt/GPUProc/test/Kernels/tIntToFloatKernel.cc
@@ -49,7 +49,7 @@ int main() {
   gpu::Stream stream(ctx);
 
   Parset ps("tIntToFloatKernel.in_parset");
-  KernelFactory<IntToFloatKernel> factory(ps);
+  KernelFactory<IntToFloatKernel> factory(IntToFloatKernel::Parameters(ps, true, true));
 
   size_t nSampledData = factory.bufferSize(IntToFloatKernel::INPUT_DATA) / sizeof(char);
   size_t sizeSampledData = nSampledData * sizeof(char);
diff --git a/RTCP/Cobalt/GPUProc/test/SubbandProcs/CMakeLists.txt b/RTCP/Cobalt/GPUProc/test/SubbandProcs/CMakeLists.txt
index c906aeb43a7312c84ad702ba7c63d033cc73a09c..4a693506fbf7af9f111e17e629077a78c26d2de3 100644
--- a/RTCP/Cobalt/GPUProc/test/SubbandProcs/CMakeLists.txt
+++ b/RTCP/Cobalt/GPUProc/test/SubbandProcs/CMakeLists.txt
@@ -5,14 +5,13 @@ include(LofarCTest)
 if(UNITTEST++_FOUND)
   lofar_add_test(tCorrelatorSubbandProc tCorrelatorSubbandProc.cc)
   lofar_add_test(tCorrelatorStep tCorrelatorStep.cc)
+  lofar_add_test(tCorrelatorSubbandProcProcessSb tCorrelatorSubbandProcProcessSb.cc)
 endif()
 
 # This test is instable. Added to issue tracker: https://support.astron.nl/lofar_issuetracker/issues/5807
 # Understand the proble
 lofar_add_test(tBeamFormerSubbandProcProcessSb tBeamFormerSubbandProcProcessSb.cc)
 
-lofar_add_test(tCorrelatorSubbandProcProcessSb
-  tCorrelatorSubbandProcProcessSb.cc)
 lofar_add_test(tCoherentStokesBeamFormerSubbandProcProcessSb
   tCoherentStokesBeamFormerSubbandProcProcessSb.cc ../Kernels/KernelTestHelpers.cc)
 lofar_add_test(tFlysEyeBeamFormerSubbandProcProcessSb
diff --git a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCorrelatorSubbandProcProcessSb.cc b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCorrelatorSubbandProcProcessSb.cc
index bd14f401529520869f2da3a006c2cef9266c9ed9..21a4cc8eeeb5baa1e34d9fe243ffd30c04cbf5ac 100644
--- a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCorrelatorSubbandProcProcessSb.cc
+++ b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCorrelatorSubbandProcProcessSb.cc
@@ -29,148 +29,240 @@
 #include <GPUProc/SubbandProcs/KernelFactories.h>
 #include <GPUProc/SubbandProcs/SubbandProc.h>
 
+#include <UnitTest++.h>
+
 using namespace std;
 using namespace LOFAR;
 using namespace LOFAR::Cobalt;
 
+struct SubbandProcWrapper {
+  // Create very simple kernel programs, with predictable output. Skip as much
+  // as possible. Nr of channels/sb from the parset is 1, so the PPF will not
+  // even run.  Parset also has turned of delay compensation and bandpass
+  // correction (but that kernel will run to convert int to float and to
+  // transform the data order).
+
+  gpu::Device device;
+  gpu::Context ctx;
+
+  Parset ps;
+  KernelFactories factories;
+  SubbandProc cwq;
+  SubbandProcInputData in;
+  SubbandProcOutputData out;
+
+  fcomplex inputValue;
+
+  SubbandProcWrapper(Parset &ps):
+    device(0),
+    ctx(device),
+    ps(ps),
+    factories(ps, 1),
+    cwq(ps, ctx, factories),
+    in(ps.settings.SAPs.size(),
+       ps.settings.antennaFields.size(),
+       ps.settings.nrPolarisations,
+       ps.settings.beamFormer.maxNrTABsPerSAP(),
+       ps.settings.blockSize,
+       ps.nrBytesPerComplexSample(),
+       ctx),
+    out(ps, ctx),
+
+    inputValue(1,1)
+  {
+    // Input info
+    const size_t nrBeams = ps.settings.SAPs.size();
+    const size_t nrStations = ps.settings.antennaFields.size();
+    const size_t nrPolarisations = ps.settings.nrPolarisations;
+    const size_t maxNrTABsPerSAP = ps.settings.beamFormer.maxNrTABsPerSAP();
+    const size_t nrSamplesPerIntegration = ps.settings.correlator.nrSamplesPerIntegration();
+    const size_t nrSamplesPerSubband = ps.settings.blockSize;
+    const size_t nrBitsPerSample = ps.settings.nrBitsPerSample;
+    const size_t nrBytesPerComplexSample = ps.nrBytesPerComplexSample();
+
+    // We only support 8-bit or 16-bit input samples
+    ASSERT(nrBitsPerSample == 8 || nrBitsPerSample == 16);
+
+    // Output info
+    const size_t nrBaselines = ps.nrBaselines();
+    const size_t nrBlocksPerIntegration = 
+      ps.settings.correlator.nrBlocksPerIntegration;
+    const size_t nrChannelsPerSubband = ps.settings.correlator.nrChannels;
+    const size_t integrationSteps = ps.settings.correlator.nrSamplesPerIntegration();
+
+    LOG_INFO_STR(
+      "\nInput info:" <<
+      "\n  nrBeams = " << nrBeams <<
+      "\n  nrStations = " << nrStations <<
+      "\n  nrPolarisations = " << nrPolarisations <<
+      "\n  maxNrTABsPerSAP = " << maxNrTABsPerSAP <<
+      "\n  nrSamplesPerIntegration = " << nrSamplesPerIntegration <<
+      "\n  nrSamplesPerSubband = " << nrSamplesPerSubband <<
+      "\n  nrBitsPerSample = " << nrBitsPerSample <<
+      "\n  nrBytesPerComplexSample = " << nrBytesPerComplexSample << 
+      "\n  ----------------------------" <<
+      "\n  Total bytes = " << in.inputSamples.size());
+
+    LOG_INFO_STR(
+      "\nOutput info:" <<
+      "\n  nrBaselines = " << nrBaselines <<
+      "\n  nrBlockPerIntegration = " << nrBlocksPerIntegration << 
+      "\n  nrChannelsPerSubband = " << nrChannelsPerSubband <<
+      "\n  integrationSteps = " << integrationSteps <<
+      "\n  ----------------------------" <<
+      "\n  Total bytes = " << out.correlatedData.subblocks[0]->visibilities.size());
+
+    // Initialize synthetic input to all (1, 1).
+    for (size_t st = 0; st < nrStations; st++)
+      for (size_t i = 0; i < nrSamplesPerSubband; i++)
+        for (size_t pol = 0; pol < nrPolarisations; pol++)
+          setInputValue(st, i, pol, inputValue);
+
+    // Initialize subbands partitioning administration (struct BlockID). We only
+    // do the 1st block of whatever.
+    in.blockID.block = 0;
+    in.blockID.globalSubbandIdx = 0;
+    in.blockID.localSubbandIdx = 0;
+    in.blockID.subbandProcSubbandIdx = 0;
+
+    // Initialize delays. We skip delay compensation, but init anyway,
+    // so we won't copy uninitialized data to the device.
+    for (size_t i = 0; i < in.delaysAtBegin.size(); i++)
+      in.delaysAtBegin.get<float>()[i] = 0.0f;
+    for (size_t i = 0; i < in.delaysAfterEnd.size(); i++)
+      in.delaysAfterEnd.get<float>()[i] = 0.0f;
+    for (size_t i = 0; i < in.phase0s.size(); i++)
+      in.phase0s.get<float>()[i] = 0.0f;
+  }
+
+  void setInputValue(unsigned station, unsigned t, unsigned pol, fcomplex value) {
+    const size_t nrBitsPerSample = ps.settings.nrBitsPerSample;
+
+    switch(nrBitsPerSample) {
+    case 8:
+      reinterpret_cast<i8complex&>(in.inputSamples[station][t][pol][0]) = 
+        i8complex(value);
+      break;
+    case 16:
+      reinterpret_cast<i16complex&>(in.inputSamples[station][t][pol][0]) = 
+        i16complex(value);
+      break;
+    }
+  }
+
+  void process() {
+    const ssize_t nrBlocksPerIntegration = 
+      ps.settings.correlator.nrBlocksPerIntegration;
+
+    ssize_t block(0);
+
+    LOG_INFO("Processing ...");
+    for (block = -1; block < nrBlocksPerIntegration; block++) {
+      LOG_DEBUG_STR("Processing block #" << block);
+
+      in.blockID.block = block;
+      cwq.processSubband(in, out);
+      if (block >= 0)
+        cwq.postprocessSubband(out);
+    }
+    ASSERT(block == nrBlocksPerIntegration);
+    ASSERT(out.emit_correlatedData);
+  }
+
+  fcomplex outputValue() const {
+    // The output is the correlation-product of two inputs (with identical
+    // `inputValue`).
+
+    const size_t scaleFactor = ps.settings.nrBitsPerSample == 16 ? 1 : 16;
+
+    return norm(inputValue) * scaleFactor * scaleFactor;
+  }
+
+  void verifyOutput() const {
+    // we don't process the FFT in our reference calculations yet
+    ASSERT(ps.settings.correlator.nrChannels == 1);
+
+    LOG_INFO("Verifying output ...");
+    for (size_t b = 0; b < ps.nrBaselines(); b++)
+      for (size_t c = 0; c < ps.settings.correlator.nrChannels; c++)
+        for (size_t pol0 = 0; pol0 < ps.settings.nrPolarisations; pol0++)
+          for (size_t pol1 = 0; pol1 < ps.settings.nrPolarisations; pol1++)
+            ASSERTSTR(fpEquals(out.correlatedData.subblocks[0]->visibilities[b][c][pol0][pol1], outputValue()),
+                      "out[" << b << "][" << c << "][" << pol0 << 
+                      "][" << pol1 << "] = " << out.correlatedData.subblocks[0]->visibilities[b][c][pol0][pol1] << 
+                      "; outputValue = " << outputValue());
+  }
+};
+
+// Test the output on clean data
+TEST(output_noflags) {
+  Parset  ps("tCorrelatorSubbandProcProcessSb.parset");
+  SubbandProcWrapper wrapper(ps);
+
+  wrapper.process();
+  wrapper.verifyOutput();
+}
+
+// Test the output if there is a flagged sample
+TEST(output_flags) {
+  Parset  ps("tCorrelatorSubbandProcProcessSb.parset");
+  SubbandProcWrapper wrapper(ps);
+
+  // Flag one sample
+  wrapper.in.inputFlags[0].include(13);
+  // Replace it with an extreme value to know whether it's actually skipped
+  wrapper.setInputValue(0, 13, 0, fcomplex(100,100));
+  wrapper.setInputValue(0, 13, 1, fcomplex(100,100));
+
+  // process
+  wrapper.process();
+  wrapper.verifyOutput();
+}
+
+// Test the final weights after FFT if there is flagged data
+TEST(weights_flags_64ch) {
+  // Override nr channels to 64
+  Parset  ps("tCorrelatorSubbandProcProcessSb.parset");
+  ps.replace("Cobalt.Correlator.nrChannelsPerSubband", "64");
+  ps.updateSettings();
+  SubbandProcWrapper wrapper(ps);
+
+  // Flag one sample
+  wrapper.in.inputFlags[0].include(13);
+
+  // process
+  wrapper.process();
+
+  // We reuse the flagged sample in all of the blocks, so
+  // we actually lose "nrBlocks" samples.
+  const unsigned nrBlocks = ps.settings.correlator.nrBlocksPerIntegration;
+  const unsigned nrValidSamples = wrapper.ps.settings.correlator.nrSamplesPerIntegration();
+  const unsigned nrValidSamplesFlagged =
+    nrValidSamples
+    // we lose "NR_TAPS" samples per block for each flagged sample
+    - nrBlocks * NR_TAPS;
+
+  LOG_INFO("Verifying output weights ...");
+  for (size_t b = 0; b < wrapper.ps.nrBaselines(); b++)
+    for (size_t c = 1; c < wrapper.ps.settings.correlator.nrChannels; c++) {
+      // baseline 0 and 1 contain station 0 with a flagged sample.
+      unsigned expected = b < 2 ? nrValidSamplesFlagged : nrValidSamples;
+      ASSERTSTR(wrapper.out.correlatedData.subblocks[0]->getNrValidSamples(b, c) == expected,
+                "nrValidSamples[" << b << "][" << c << "] = "
+                << wrapper.out.correlatedData.subblocks[0]->getNrValidSamples(b, c)
+                << "; expected " << expected);
+    }
+}
+
 int main() {
   INIT_LOGGER("tCorrelatorSubbandProcProcessSb");
 
   try {
     gpu::Platform pf;
-    cout << "Detected " << pf.size() << " CUDA devices" << endl;
+    return UnitTest::RunAllTests() > 0;
   } catch (gpu::CUDAException& e) {
     cerr << e.what() << endl;
     return 3;
   }
-
-  gpu::Device device(0);
-  vector<gpu::Device> devices(1, device);
-  gpu::Context ctx(device);
-
-  Parset ps("tCorrelatorSubbandProcProcessSb.parset");
-
-  // Input info
-  const size_t nrBeams = ps.settings.SAPs.size();
-  const size_t nrStations = ps.settings.antennaFields.size();
-  const size_t nrPolarisations = ps.settings.nrPolarisations;
-  const size_t maxNrTABsPerSAP = ps.settings.beamFormer.maxNrTABsPerSAP();
-  const size_t nrSamplesPerChannel = ps.settings.correlator.nrSamplesPerIntegration();
-  const size_t nrSamplesPerSubband = ps.settings.blockSize;
-  const size_t nrBitsPerSample = ps.settings.nrBitsPerSample;
-  const size_t nrBytesPerComplexSample = ps.nrBytesPerComplexSample();
-  const fcomplex inputValue(1,1);
-
-  // We only support 8-bit or 16-bit input samples
-  ASSERT(nrBitsPerSample == 8 || nrBitsPerSample == 16);
-
-  // Output info
-  const size_t nrBaselines = nrStations * (nrStations + 1) / 2;
-  const size_t nrBlocksPerIntegration = 
-    ps.settings.correlator.nrBlocksPerIntegration;
-  const size_t nrChannelsPerSubband = ps.settings.correlator.nrChannels;
-  const size_t integrationSteps = ps.settings.correlator.nrSamplesPerIntegration();
-  const size_t scaleFactor = nrBitsPerSample == 16 ? 1 : 16;
-
-  // The output is the correlation-product of two inputs (with identical
-  // `inputValue`).
-  const fcomplex outputValue = 
-    norm(inputValue) * scaleFactor * scaleFactor;
-
-  // Create very simple kernel programs, with predictable output. Skip as much
-  // as possible. Nr of channels/sb from the parset is 1, so the PPF will not
-  // even run.  Parset also has turned of delay compensation and bandpass
-  // correction (but that kernel will run to convert int to float and to
-  // transform the data order).
-
-  KernelFactories factories(ps, 1);
-  SubbandProc cwq(ps, ctx, factories);
-
-  SubbandProcInputData in(
-    nrBeams, nrStations, nrPolarisations, maxNrTABsPerSAP,
-    nrSamplesPerSubband, nrBytesPerComplexSample, ctx);
-
-  SubbandProcOutputData out(ps, ctx);
-
-  LOG_INFO_STR(
-    "\nInput info:" <<
-    "\n  nrBeams = " << nrBeams <<
-    "\n  nrStations = " << nrStations <<
-    "\n  nrPolarisations = " << nrPolarisations <<
-    "\n  maxNrTABsPerSAP = " << maxNrTABsPerSAP <<
-    "\n  nrSamplesPerChannel = " << nrSamplesPerChannel <<
-    "\n  nrSamplesPerSubband = " << nrSamplesPerSubband <<
-    "\n  nrBitsPerSample = " << nrBitsPerSample <<
-    "\n  nrBytesPerComplexSample = " << nrBytesPerComplexSample << 
-    "\n  inputValue = " << inputValue <<
-    "\n  ----------------------------" <<
-    "\n  Total bytes = " << in.inputSamples.size());
-
-  LOG_INFO_STR(
-    "\nOutput info:" <<
-    "\n  nrBaselines = " << nrBaselines <<
-    "\n  nrBlockPerIntegration = " << nrBlocksPerIntegration << 
-    "\n  nrChannelsPerSubband = " << nrChannelsPerSubband <<
-    "\n  integrationSteps = " << integrationSteps <<
-    "\n  scaleFactor = " << scaleFactor << 
-    "\n  outputValue = " << outputValue <<
-    "\n  ----------------------------" <<
-    "\n  Total bytes = " << out.correlatedData.subblocks[0]->visibilities.size());
-
-  // Initialize synthetic input to all (1, 1).
-  for (size_t st = 0; st < nrStations; st++)
-    for (size_t i = 0; i < nrSamplesPerSubband; i++)
-      for (size_t pol = 0; pol < nrPolarisations; pol++)
-      {
-        switch(nrBitsPerSample) {
-        case 8:
-          reinterpret_cast<i8complex&>(in.inputSamples[st][i][pol][0]) = 
-            i8complex(inputValue);
-          break;
-        case 16:
-          reinterpret_cast<i16complex&>(in.inputSamples[st][i][pol][0]) = 
-            i16complex(inputValue);
-          break;
-        }
-      }
-
-  // Initialize subbands partitioning administration (struct BlockID). We only
-  // do the 1st block of whatever.
-  in.blockID.block = 0;
-  in.blockID.globalSubbandIdx = 0;
-  in.blockID.localSubbandIdx = 0;
-  in.blockID.subbandProcSubbandIdx = 0;
-
-  // Initialize delays. We skip delay compensation, but init anyway,
-  // so we won't copy uninitialized data to the device.
-  for (size_t i = 0; i < in.delaysAtBegin.size(); i++)
-    in.delaysAtBegin.get<float>()[i] = 0.0f;
-  for (size_t i = 0; i < in.delaysAfterEnd.size(); i++)
-    in.delaysAfterEnd.get<float>()[i] = 0.0f;
-  for (size_t i = 0; i < in.phase0s.size(); i++)
-    in.phase0s.get<float>()[i] = 0.0f;
-
-  size_t block(0);
-
-  LOG_INFO("Processing ...");
-  for (block = 0; block < nrBlocksPerIntegration && !out.emit_correlatedData; block++) {
-    LOG_DEBUG_STR("Processing block #" << block);
-    cwq.processSubband(in, out);
-    cwq.postprocessSubband(out);
-  }
-  ASSERT(block == nrBlocksPerIntegration);
-
-  LOG_INFO("Verifying output ...");
-  for (size_t b = 0; b < nrBaselines; b++)
-    for (size_t c = 0; c < nrChannelsPerSubband; c++)
-      for (size_t pol0 = 0; pol0 < nrPolarisations; pol0++)
-        for (size_t pol1 = 0; pol1 < nrPolarisations; pol1++)
-          ASSERTSTR(fpEquals(out.correlatedData.subblocks[0]->visibilities[b][c][pol0][pol1], outputValue),
-                    "out[" << b << "][" << c << "][" << pol0 << 
-                    "][" << pol1 << "] = " << out.correlatedData.subblocks[0]->visibilities[b][c][pol0][pol1] << 
-                    "; outputValue = " << outputValue);
-
-  LOG_INFO("Test OK");
-  return 0;
 }
 
diff --git a/RTCP/Cobalt/GPUProc/test/cuda/tDelayAndBandPass.cc b/RTCP/Cobalt/GPUProc/test/cuda/tDelayAndBandPass.cc
index 2272df30b5d09e962de11fa975ca7ff0bd2a3a2e..f78dab5af26fdbd43884b80b0dadefefbaa0b571 100644
--- a/RTCP/Cobalt/GPUProc/test/cuda/tDelayAndBandPass.cc
+++ b/RTCP/Cobalt/GPUProc/test/cuda/tDelayAndBandPass.cc
@@ -59,7 +59,6 @@ const unsigned NR_POLARIZATIONS = 2;
 
 const unsigned NR_SAPS = 8;
 const double SUBBAND_BANDWIDTH = 0.0 * NR_CHANNELS;
-const bool INPUT_IS_STATIONDATA = false;
 const bool BANDPASS_CORRECTION = true;
 const bool DELAY_COMPENSATION = false;
 const bool DO_TRANSPOSE = true;
@@ -150,8 +149,6 @@ CompileDefinitions getDefaultCompileDefinitions()
     boost::lexical_cast<string>(NR_STATIONS);
   defs["NR_DELAYS"] =
     boost::lexical_cast<string>(NR_DELAYS);
-  if (INPUT_IS_STATIONDATA)
-    defs["INPUT_IS_STATIONDATA"] = "1";
 
   defs["NR_CHANNELS"] =
     boost::lexical_cast<string>(NR_CHANNELS);
@@ -178,9 +175,7 @@ CompileDefinitions getDefaultCompileDefinitions()
   return defs;
 }
 
-// T is an LCS i*complex type, or complex<float> when #chnl > 1.
 // It is the value type of the data input array.
-template <typename T>
 vector<fcomplex> runTest(const CompileDefinitions& compileDefs,
                          double subbandFrequency,
                          unsigned beam,
@@ -192,15 +187,7 @@ vector<fcomplex> runTest(const CompileDefinitions& compileDefs,
   gpu::Context ctx(stream->getContext());
 
   boost::scoped_ptr<MultiDimArrayHostBuffer<fcomplex, 4> > outputData;
-  boost::scoped_ptr<MultiDimArrayHostBuffer<T,        4> > inputData;
-
-  if (compileDefs.find("INPUT_IS_STATIONDATA") == compileDefs.end()) {
-    // If input does not come from station, we'll read fcomplex.
-    ASSERT(sizeof(T) == sizeof(fcomplex));
-  } else {
-    // If input does NOT come from station, NR_BITS_PER_SAMPLE needs to match T.
-    ASSERT(boost::lexical_cast<unsigned>(compileDefs.at("NR_BITS_PER_SAMPLE")) == 8 * sizeof(T) / 2);
-  }
+  boost::scoped_ptr<MultiDimArrayHostBuffer<fcomplex, 4> > inputData;
 
   if(compileDefs.find("DO_TRANSPOSE") != compileDefs.end())
     outputData.reset(
@@ -225,7 +212,7 @@ vector<fcomplex> runTest(const CompileDefinitions& compileDefs,
   unsigned nchnl = boost::lexical_cast<unsigned>(cit->second);
   if (nchnl == 1) // integer input data (FIR+FFT skipped)
     inputData.reset(
-      new MultiDimArrayHostBuffer<T, 4>(boost::extents
+      new MultiDimArrayHostBuffer<fcomplex, 4>(boost::extents
                                         [NR_STATIONS]
                                         [NR_SAMPLES_PER_CHANNEL]
                                         [NR_CHANNELS]
@@ -317,7 +304,7 @@ TEST(BandPass)
   // The input samples are all ones. After correction, multiply with 2.
   // The first and the last complex values are retrieved. They should be scaled
   // with the bandPassFactor == 2
-  vector<fcomplex> results(runTest<fcomplex>(
+  vector<fcomplex> results(runTest(
                              defs,
                              0.0, // sb freq
                              0U,  // beam
@@ -342,7 +329,7 @@ TEST(Phase0s)
   defs["DELAY_COMPENSATION"] = "1";
   defs["SUBBAND_BANDWIDTH"] = "1.0";
 
-  vector<fcomplex> results(runTest<fcomplex>(
+  vector<fcomplex> results(runTest(
                              defs,
                              1.0,    // sb freq
                              0U,     // beam
@@ -371,7 +358,7 @@ SUITE(DelayCompensation)
     defs["DELAY_COMPENSATION"] = "1";
     defs["SUBBAND_BANDWIDTH"] = "1.0";
 
-    vector<fcomplex> results(runTest<fcomplex>(
+    vector<fcomplex> results(runTest(
                                defs,
                                1.0,    // sb freq
                                0U,     // beam
@@ -468,7 +455,7 @@ SUITE(DelayCompensation)
     defs["DELAY_COMPENSATION"] = "1";
     defs["SUBBAND_BANDWIDTH"] = "1.0";
 
-    vector<fcomplex> results(runTest<fcomplex>(
+    vector<fcomplex> results(runTest(
                                defs,
                                1.0,    // sb freq
                                0U,     // beam
@@ -549,7 +536,7 @@ TEST(AllAtOnce)
   defs["DELAY_COMPENSATION"] = "1";
   defs["SUBBAND_BANDWIDTH"] = "1.0";
 
-  vector<fcomplex> results(runTest<fcomplex>(
+  vector<fcomplex> results(runTest(
                              defs,
                              1.0,    // sb freq
                              0U,     // beam