From f73e08e3e4720044b38f2921127dd4c70dacca5d Mon Sep 17 00:00:00 2001
From: Alexander van Amesfoort <amesfoort@astron.nl>
Date: Tue, 4 Jun 2013 17:33:28 +0000
Subject: [PATCH] Task #4338: MultiDimArrayHostBuffer.h, 1st version,
 CorrelatorWorkQueue partially adapted

---
 RTCP/Cobalt/GPUProc/src/BandPass.cc           |  9 +-
 RTCP/Cobalt/GPUProc/src/FilterBank.h          |  4 -
 .../{Buffers.h => MultiDimArrayHostBuffer.h}  | 12 +--
 .../src/cuda/MultiDimArrayHostBuffer.h        | 62 ++++++++++++
 .../GPUProc/src/cuda/Pipelines/Pipeline.cc    |  1 -
 .../src/cuda/WorkQueues/BeamFormerWorkQueue.h |  2 +-
 .../cuda/WorkQueues/CorrelatorWorkQueue.cc    | 99 ++++++++-----------
 .../src/cuda/WorkQueues/CorrelatorWorkQueue.h | 87 +++++++++-------
 .../src/cuda/WorkQueues/UHEP_WorkQueue.h      |  2 +-
 .../{Buffers.h => MultiDimArrayHostBuffer.h}  |  6 +-
 .../GPUProc/src/opencl/Pipelines/Pipeline.cc  |  5 -
 .../opencl/WorkQueues/BeamFormerWorkQueue.h   |  2 +-
 .../opencl/WorkQueues/CorrelatorWorkQueue.h   |  2 +-
 .../src/opencl/WorkQueues/UHEP_WorkQueue.h    |  2 +-
 14 files changed, 170 insertions(+), 125 deletions(-)
 rename RTCP/Cobalt/GPUProc/src/{Buffers.h => MultiDimArrayHostBuffer.h} (80%)
 create mode 100644 RTCP/Cobalt/GPUProc/src/cuda/MultiDimArrayHostBuffer.h
 rename RTCP/Cobalt/GPUProc/src/opencl/{Buffers.h => MultiDimArrayHostBuffer.h} (97%)

diff --git a/RTCP/Cobalt/GPUProc/src/BandPass.cc b/RTCP/Cobalt/GPUProc/src/BandPass.cc
index db79484649a..c42e4b74983 100644
--- a/RTCP/Cobalt/GPUProc/src/BandPass.cc
+++ b/RTCP/Cobalt/GPUProc/src/BandPass.cc
@@ -2130,7 +2130,7 @@ namespace LOFAR
       const std::complex<float> l = out[(i - 3 * nrChannels / 2) % fftSize];
       const std::complex<float> r = out[i + nrChannels / 2];
 
-      factors[i] = pow(2, 25) / sqrt(abs(m * m + l * l + r * r));
+      factors[i] = std::pow(2, 25) / std::sqrt(std::abs(m * m + l * l + r * r));
     }
   }
 
@@ -2140,11 +2140,12 @@ namespace LOFAR
 } // namespace LOFAR
 
 
-#if 0
+#ifdef TEST_COMPUTE_BANDPASS_CORRECTION_FACTORS
 int main()
 {
-  std::vector<float> factors(4096);
-  BandPass::computeCorrectionFactors(&factors[0], 4096);
+  unsigned nrChannelsPerSb = 4096;
+  std::vector<float> factors(nrChannelsPerSb);
+  BandPass::computeCorrectionFactors(&factors[0], nrChannelsPerSb);
   return 0;
 }
 #endif
diff --git a/RTCP/Cobalt/GPUProc/src/FilterBank.h b/RTCP/Cobalt/GPUProc/src/FilterBank.h
index 4e9b057b219..b4f186ff863 100644
--- a/RTCP/Cobalt/GPUProc/src/FilterBank.h
+++ b/RTCP/Cobalt/GPUProc/src/FilterBank.h
@@ -23,10 +23,6 @@
 
 #define USE_ORIGINAL_FILTER 0
 
-#if 0 || !defined HAVE_BGP
-#define FIR_C_IMPLEMENTATION
-#endif
-
 #include <boost/multi_array.hpp>
 
 namespace LOFAR
diff --git a/RTCP/Cobalt/GPUProc/src/Buffers.h b/RTCP/Cobalt/GPUProc/src/MultiDimArrayHostBuffer.h
similarity index 80%
rename from RTCP/Cobalt/GPUProc/src/Buffers.h
rename to RTCP/Cobalt/GPUProc/src/MultiDimArrayHostBuffer.h
index 315e490f93b..dee99d41fa5 100644
--- a/RTCP/Cobalt/GPUProc/src/Buffers.h
+++ b/RTCP/Cobalt/GPUProc/src/MultiDimArrayHostBuffer.h
@@ -1,4 +1,4 @@
-//# Buffers.h
+//# MultiDimArrayHostBuffer.h
 //#
 //# Copyright (C) 2013  ASTRON (Netherlands Institute for Radio Astronomy)
 //# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands
@@ -20,19 +20,19 @@
 //# $Id$
 
 // \file
-// Support for our GPU processing buffer types.
+// Support for our multi-dim array-ed GPU host buffer.
 
-#ifndef LOFAR_GPUPROC_BUFFERS_H
-#define LOFAR_GPUPROC_BUFFERS_H
+#ifndef LOFAR_GPUPROC_MULTI_DIM_ARRAY_HOST_BUFFER_H
+#define LOFAR_GPUPROC_MULTI_DIM_ARRAY_HOST_BUFFER_H
 
 #if defined (USE_CUDA) && defined (USE_OPENCL)
 # error "Either CUDA or OpenCL must be enabled, not both"
 #endif
 
 #if defined (USE_CUDA)
-# include "cuda/Buffers.h"
+# include "cuda/MultiDimArrayHostBuffer.h"
 #elif defined (USE_OPENCL)
-# include "opencl/Buffers.h"
+# include "opencl/MultiDimArrayHostBuffer.h"
 #else
 # error "Either CUDA or OpenCL must be enabled, not neither"
 #endif
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/MultiDimArrayHostBuffer.h b/RTCP/Cobalt/GPUProc/src/cuda/MultiDimArrayHostBuffer.h
new file mode 100644
index 00000000000..c2fa3409c35
--- /dev/null
+++ b/RTCP/Cobalt/GPUProc/src/cuda/MultiDimArrayHostBuffer.h
@@ -0,0 +1,62 @@
+//# MultiDimArrayHostBuffer.h
+//# Copyright (C) 2012-2013  ASTRON (Netherlands Institute for Radio Astronomy)
+//# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands
+//#
+//# This file is part of the LOFAR software suite.
+//# The LOFAR software suite is free software: you can redistribute it and/or
+//# modify it under the terms of the GNU General Public License as published
+//# by the Free Software Foundation, either version 3 of the License, or
+//# (at your option) any later version.
+//#
+//# The LOFAR software suite is distributed in the hope that it will be useful,
+//# but WITHOUT ANY WARRANTY; without even the implied warranty of
+//# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//# GNU General Public License for more details.
+//#
+//# You should have received a copy of the GNU General Public License along
+//# with the LOFAR software suite. If not, see <http://www.gnu.org/licenses/>.
+//#
+//# $Id$
+
+#ifndef LOFAR_GPUPROC_CUDA_MULTI_DIM_ARRAY_HOST_BUFFER_H
+#define LOFAR_GPUPROC_CUDA_MULTI_DIM_ARRAY_HOST_BUFFER_H
+
+#include <CoInterface/MultiDimArray.h>
+
+#include "gpu_wrapper.h"
+
+namespace LOFAR
+{
+  namespace Cobalt
+  {
+
+    // A MultiDimArray whose elements live in a gpu::HostMemory allocation.
+    // Note: Elements are not constructed/destructed.
+    template <typename T, unsigned DIM>
+    class MultiDimArrayHostBuffer : public gpu::HostMemory,
+                                    public MultiDimArray<T, DIM>
+    {
+    public:
+      template <typename ExtentList>
+      MultiDimArrayHostBuffer(const ExtentList &extents, gpu::Context &context,
+                              unsigned int flags)
+      :
+        // size() cannot be used here: HostMemory is not constructed yet. Derive the
+        // byte count from extents instead (the null base is never dereferenced).
+        HostMemory(context, boost::multi_array_ref<T, DIM>(static_cast<T *>(0), extents).num_elements() * sizeof(T), flags),
+        MultiDimArray<T, DIM>(extents, gpu::HostMemory::get<T>(), false) {}
+
+      using HostMemory::size;
+
+    private:
+      MultiDimArrayHostBuffer(); // don't use
+      MultiDimArrayHostBuffer(const MultiDimArrayHostBuffer<T, DIM> &rhs); // don't use
+      MultiDimArrayHostBuffer<T, DIM> &operator=(const MultiDimArrayHostBuffer<T, DIM> &rhs); // don't use
+      using MultiDimArray<T, DIM>::resize; // don't use
+    };
+
+  } // namespace Cobalt
+} // namespace LOFAR
+
+#endif
+
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Pipelines/Pipeline.cc b/RTCP/Cobalt/GPUProc/src/cuda/Pipelines/Pipeline.cc
index 0a6d1bfe490..5065e76f421 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/Pipelines/Pipeline.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/Pipelines/Pipeline.cc
@@ -25,7 +25,6 @@
 #include <Common/LofarLogger.h>
 #include <Common/lofar_iomanip.h>
 
-#include <GPUProc/Buffers.h>
 #include <GPUProc/gpu_utils.h>
 
 namespace LOFAR
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/WorkQueues/BeamFormerWorkQueue.h b/RTCP/Cobalt/GPUProc/src/cuda/WorkQueues/BeamFormerWorkQueue.h
index cc8d02bc4b0..8f117daac2b 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/WorkQueues/BeamFormerWorkQueue.h
+++ b/RTCP/Cobalt/GPUProc/src/cuda/WorkQueues/BeamFormerWorkQueue.h
@@ -26,7 +26,7 @@
 #include <Common/LofarLogger.h>
 #include <CoInterface/Parset.h>
 
-#include <GPUProc/Buffers.h>
+#include <GPUProc/MultiDimArrayHostBuffer.h>
 #include <GPUProc/BandPass.h>
 #include <GPUProc/Pipelines/BeamFormerPipeline.h>
 
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/WorkQueues/CorrelatorWorkQueue.cc b/RTCP/Cobalt/GPUProc/src/cuda/WorkQueues/CorrelatorWorkQueue.cc
index 9aee86578bb..4f217501aa5 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/WorkQueues/CorrelatorWorkQueue.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/WorkQueues/CorrelatorWorkQueue.cc
@@ -55,65 +55,40 @@ namespace LOFAR
       WorkQueue( parset, context ),
       prevBlock(-1),
       prevSAP(-1),
-      devInput(ps.nrBeams(),
-                ps.nrStations(),
-                NR_POLARIZATIONS,
-                ps.nrHistorySamples() + ps.nrSamplesPerSubband(),
-                ps.nrBytesPerComplexSample(),
-                queue,
-
-                // reserve enough space in inputSamples for the output of
-                // the delayAndBandPassKernel.
-                ps.nrStations() * NR_POLARIZATIONS * ps.nrSamplesPerSubband() * sizeof(std::complex<float>)),
-      devFilteredData(queue,
-                      CL_MEM_READ_WRITE,
-
-                      // reserve enough space for the output of the
-                      // firFilterKernel,
-                      std::max(ps.nrStations() * NR_POLARIZATIONS * ps.nrSamplesPerSubband() * sizeof(std::complex<float>),
-                      // and the correlatorKernel.
-                      ps.nrBaselines() * ps.nrChannelsPerSubband() * NR_POLARIZATIONS * NR_POLARIZATIONS * sizeof(std::complex<float>))),
-      devFIRweights(queue,
-                    CL_MEM_READ_ONLY,
-                    ps.nrChannelsPerSubband() * NR_TAPS * sizeof(float)),
-      firFilterKernel(ps,
-                      queue,
-                      programs.firFilterProgram,
-                      devFilteredData,
-                      devInput.inputSamples,
-                      devFIRweights),
-      fftKernel(ps,
-                context,
-                devFilteredData),
-      bandPassCorrectionWeights(boost::extents[ps.nrChannelsPerSubband()],
-                                queue,
-                                CL_MEM_WRITE_ONLY,
-                                CL_MEM_READ_ONLY),
-      delayAndBandPassKernel(ps,
-                             programs.delayAndBandPassProgram,
+      // TODO: have the Kernel classes be able to provide input and output buffer sizes to use below
+      devInput(ps.nrBeams(), ps.nrStations(), NR_POLARIZATIONS,
+               ps.nrHistorySamples() + ps.nrSamplesPerSubband(),
+               ps.nrBytesPerComplexSample(), context,
+
+               // reserve enough space in inputSamples for the output of the delayAndBandPassKernel.
+               ps.nrStations() * NR_POLARIZATIONS * ps.nrSamplesPerSubband() * sizeof(std::complex<float>)),
+      // reserve enough space for the output of the firFilterKernel,
+      devFilteredData(context, std::max(
+                        ps.nrStations() * NR_POLARIZATIONS * ps.nrSamplesPerSubband() * sizeof(std::complex<float>),
+                        // and the correlatorKernel.
+                        ps.nrBaselines() * ps.nrChannelsPerSubband() * NR_POLARIZATIONS * NR_POLARIZATIONS * sizeof(std::complex<float>)
+                      )),
+      devFIRweights(context, ps.nrChannelsPerSubband() * NR_TAPS * sizeof(float)),
+      devBandPassCorrectionWeights(context, ps.nrChannelsPerSubband() * sizeof(float)),
+
+      firFilterKernel(ps, queue, programs.firFilterProgram,
+                      devFilteredData, devInput.inputSamples, devFIRweights),
+      fftKernel(ps, context, devFilteredData),
+      delayAndBandPassKernel(ps, programs.delayAndBandPassProgram,
                              devInput.inputSamples,
                              devFilteredData,
                              devInput.delaysAtBegin,
                              devInput.delaysAfterEnd,
                              devInput.phaseOffsets,
-                             bandPassCorrectionWeights),
+                             devBandPassCorrectionWeights),
 #if defined USE_NEW_CORRELATOR
-      correlateTriangleKernel(ps,
-                              queue,
-                              programs.correlatorProgram,
-                              devFilteredData,
-                              devInput.inputSamples),
-      correlateRectangleKernel(ps,
-                              queue,
-                              programs.correlatorProgram, 
-                              devFilteredData, 
-                              devInput.inputSamples)
+      correlateTriangleKernel(ps, queue, programs.correlatorProgram,
+                              devFilteredData, devInput.inputSamples),
+      correlateRectangleKernel(ps, queue, programs.correlatorProgram, 
+                              devFilteredData, devInput.inputSamples)
 #else
-      correlatorKernel(ps,
-                       queue, 
-                       programs.correlatorProgram, 
-                       devFilteredData, 
-                       devInput.inputSamples)
+      correlatorKernel(ps, queue, programs.correlatorProgram, 
+                              devFilteredData, devInput.inputSamples)
 #endif
     {
       // put enough objects in the inputPool to operate
@@ -141,7 +116,6 @@ namespace LOFAR
                 ps.nrStations(),
                 ps.nrChannelsPerSubband(),
                 ps.integrationSteps(),
-                devFilteredData,
                 *this));
       }
 
@@ -172,16 +146,21 @@ namespace LOFAR
       addTimer("GPU - compute");
       addTimer("GPU - wait");
 
-      // Copy the FIR filter weights to the device in two steps (TODO: make FilterBank supply the right buffer, or do like BandPassCorrectionWeights below)
-      size_t fbBytes = filterBank.getWeights().num_elements() * sizeof(float);
-      gpu::HostMemory fbBuffer(context, fbBytes);
-      std::memcpy(fbBuffer.get<void>(), filterBank.getWeights().origin(), fbBytes);
-      queue.writeBuffer(devFIRweights, fbBuffer, true);
+      // Copy the FIR filter and bandpass weights to the device.
+      // Note that these constant weights are now (unnecessarily) stored on the
+      // device for every workqueue. A single copy per device could be used, but
+      // first verify that the device platform still allows workqueue overlap.
+      size_t firWeightsSize = filterBank.getWeights().num_elements() * sizeof(float);
+      gpu::HostMemory firWeights(context, firWeightsSize);
+      std::memcpy(firWeights.get<void>(), filterBank.getWeights().origin(), firWeightsSize);
+      queue.writeBuffer(devFIRweights, firWeights, true);
 
       if (ps.correctBandPass())
       {
-        BandPass::computeCorrectionFactors(bandPassCorrectionWeights.origin(), ps.nrChannelsPerSubband());
-        bandPassCorrectionWeights.hostToDevice(true);
+        gpu::HostMemory bpWeights(context, ps.nrChannelsPerSubband() * sizeof(float));
+        BandPass::computeCorrectionFactors(bpWeights.get<float>(),
+                                           ps.nrChannelsPerSubband());
+        queue.writeBuffer(devBandPassCorrectionWeights, bpWeights, true);
       }
     }
 
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/WorkQueues/CorrelatorWorkQueue.h b/RTCP/Cobalt/GPUProc/src/cuda/WorkQueues/CorrelatorWorkQueue.h
index b7c9b54b191..af86ee52a39 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/WorkQueues/CorrelatorWorkQueue.h
+++ b/RTCP/Cobalt/GPUProc/src/cuda/WorkQueues/CorrelatorWorkQueue.h
@@ -33,7 +33,7 @@
 #include <CoInterface/SubbandMetaData.h>
 
 #include <GPUProc/global_defines.h>
-#include <GPUProc/Buffers.h>
+#include <GPUProc/MultiDimArrayHostBuffer.h>
 #include <GPUProc/FilterBank.h>
 #include <GPUProc/Pipelines/CorrelatorPipelinePrograms.h>
 #include <GPUProc/Kernels/FIR_FilterKernel.h>
@@ -99,24 +99,29 @@ namespace LOFAR
     };
 
     // A CorrelatedData object tied to a HostBuffer and WorkQueue. Such links
-    // are needed for performance -- the visibilities are stored in a buffer
-    // directly linked to the GPU output buffer.
-    class CorrelatedDataHostBuffer: public MultiArrayHostBuffer<fcomplex, 4>, public CorrelatedData
+    // are needed because, after the visibilities have been written to storage,
+    // we need to remember the queue in order to recycle the buffer.
+    class CorrelatedDataHostBuffer: public MultiDimArrayHostBuffer<fcomplex, 4>,
+                                    public CorrelatedData
     {
     public:
-      CorrelatedDataHostBuffer(unsigned nrStations, unsigned nrChannels, unsigned maxNrValidSamples, DeviceBuffer &deviceBuffer, CorrelatorWorkQueue &queue) 
+      CorrelatedDataHostBuffer(unsigned nrStations, unsigned nrChannels,
+                               unsigned maxNrValidSamples, CorrelatorWorkQueue &workQueue)
       :
-        MultiArrayHostBuffer<fcomplex, 4>(boost::extents[nrStations * (nrStations + 1) / 2][nrChannels][NR_POLARIZATIONS][NR_POLARIZATIONS], CL_MEM_WRITE_ONLY, deviceBuffer),
-        CorrelatedData(nrStations, nrChannels, maxNrValidSamples, this->origin(), this->num_elements(), heapAllocator, 1),
-        queue(queue)
+        // Buffer base first: CorrelatedData is handed its origin()/num_elements().
+        // NOTE(review): MultiDimArrayHostBuffer's ctor also takes a gpu::Context -- confirm.
+        MultiDimArrayHostBuffer<fcomplex, 4>(boost::extents[nrStations * (nrStations + 1) / 2][nrChannels][NR_POLARIZATIONS][NR_POLARIZATIONS], 0),
+        CorrelatedData(nrStations, nrChannels, maxNrValidSamples, this->origin(),
+                       this->num_elements(), heapAllocator, 1),
+        workQueue(workQueue)
       {
       }
 
-      // Annotation required, as we'll loose track of the exact order
+      // Annotation required, as we'll lose track of the exact order
       size_t block;
       unsigned subband;
 
-      CorrelatorWorkQueue &queue;
+      CorrelatorWorkQueue &workQueue;
 
     private:
       CorrelatedDataHostBuffer();
@@ -134,24 +139,26 @@ namespace LOFAR
     {
     public:
 
-      // The set of GPU buffers to link our HostBuffers to.
+      // The set of GPU buffers to link our host buffers to.
+      // Device buffers may be reused between different pairs of kernels,
+      // since device memory size is a concern. Use inputSamplesMinSize
+      // to specify a minimum derived from other uses apart from input.
       struct DeviceBuffers
       {
-        DeviceBuffer delaysAtBegin;
-        DeviceBuffer delaysAfterEnd;
-        DeviceBuffer phaseOffsets;
-        DeviceBuffer inputSamples;
+        gpu::DeviceMemory delaysAtBegin;
+        gpu::DeviceMemory delaysAfterEnd;
+        gpu::DeviceMemory phaseOffsets;
+        gpu::DeviceMemory inputSamples;
 
         DeviceBuffers(size_t n_beams, size_t n_stations, size_t n_polarizations,
-                         size_t n_samples, size_t bytes_per_complex_sample,
-                         gpu::Stream &queue,
-                         size_t inputSamplesMinSize = 0,
-                         cl_mem_flags deviceBufferFlags = CL_MEM_READ_ONLY)
+                      size_t n_samples, size_t bytes_per_complex_sample,
+                      gpu::Context &context, size_t inputSamplesMinSize = 0)
         :
-          delaysAtBegin(queue, deviceBufferFlags, n_beams * n_stations * n_polarizations * sizeof(float)),
-          delaysAfterEnd(queue, deviceBufferFlags, n_beams * n_stations * n_polarizations * sizeof(float)),
-          phaseOffsets(queue, deviceBufferFlags, n_stations * n_polarizations * sizeof(float)),
-          inputSamples(queue, CL_MEM_READ_WRITE, std::max(inputSamplesMinSize, n_stations * n_samples * n_polarizations * bytes_per_complex_sample))
+          delaysAtBegin (context, n_beams * n_stations * n_polarizations * sizeof(float)),
+          delaysAfterEnd(context, n_beams * n_stations * n_polarizations * sizeof(float)),
+          phaseOffsets  (context,           n_stations * n_polarizations * sizeof(float)),
+          inputSamples  (context, std::max(inputSamplesMinSize,
+                                n_samples * n_stations * n_polarizations * bytes_per_complex_sample))
         {
         }
       };
@@ -162,26 +169,30 @@ namespace LOFAR
       // Relevant subband
       unsigned subband;
 
-      MultiArrayHostBuffer<float, 3> delaysAtBegin; //!< Whole sample delays at the start of the workitem      
-      MultiArrayHostBuffer<float, 3> delaysAfterEnd;//!< Whole sample delays at the end of the workitem      
-      MultiArrayHostBuffer<float, 2> phaseOffsets;  //!< Remainder of delays
+      //! Whole sample delays at the start of the workitem
+      MultiDimArrayHostBuffer<float, 3> delaysAtBegin;
+
+      //! Whole sample delays at the end of the workitem
+      MultiDimArrayHostBuffer<float, 3> delaysAfterEnd;
+
+      //! Remainder of delays
+      MultiDimArrayHostBuffer<float, 2> phaseOffsets;
 
       // inputdata with flagged data set to zero
-      MultiArrayHostBuffer<char, 4> inputSamples;
+      MultiDimArrayHostBuffer<char, 4> inputSamples;
 
       // The input flags
-      MultiDimArray<SparseSet<unsigned>,1> inputFlags;
+      MultiDimArray<SparseSet<unsigned>, 1> inputFlags;
 
       // Create the inputData object we need shared host/device memory on the supplied devicequeue
       WorkQueueInputData(size_t n_beams, size_t n_stations, size_t n_polarizations,
                          size_t n_samples, size_t bytes_per_complex_sample,
-                         DeviceBuffers &deviceBuffers,
-                         cl_mem_flags hostBufferFlags = CL_MEM_WRITE_ONLY)
+                         unsigned int hostBufferFlags = 0)
         :
-        delaysAtBegin(boost::extents[n_beams][n_stations][n_polarizations], hostBufferFlags, deviceBuffers.delaysAtBegin),
-        delaysAfterEnd(boost::extents[n_beams][n_stations][n_polarizations], hostBufferFlags, deviceBuffers.delaysAfterEnd),
-        phaseOffsets(boost::extents[n_stations][n_polarizations], hostBufferFlags, deviceBuffers.phaseOffsets),
-        inputSamples(boost::extents[n_stations][n_samples][n_polarizations][bytes_per_complex_sample], hostBufferFlags, deviceBuffers.inputSamples), // TODO: The size of the buffer is NOT validated
+        delaysAtBegin(boost::extents[n_beams][n_stations][n_polarizations], hostBufferFlags),
+        delaysAfterEnd(boost::extents[n_beams][n_stations][n_polarizations], hostBufferFlags),
+        phaseOffsets(boost::extents[n_stations][n_polarizations], hostBufferFlags),
+        inputSamples(boost::extents[n_stations][n_samples][n_polarizations][bytes_per_complex_sample], hostBufferFlags), // TODO: The size of the buffer is NOT validated
         inputFlags(boost::extents[n_stations])
       {
       }
@@ -251,7 +262,7 @@ namespace LOFAR
       // in the InputData class
       WorkQueueInputData::DeviceBuffers devInput;
 
-      DeviceBuffer devFilteredData;
+      gpu::DeviceMemory devFilteredData;
 
     public:
       // A pool of input data, to allow items to be filled and
@@ -263,11 +274,13 @@ namespace LOFAR
       Pool<CorrelatedDataHostBuffer> outputPool;
 
     private:
+      // Constant input buffers for the kernels
+      gpu::DeviceMemory devFIRweights;
+      gpu::DeviceMemory devBandPassCorrectionWeights;
+
       // Compiled kernels
-      DeviceBuffer devFIRweights;
       FIR_FilterKernel firFilterKernel;
       Filter_FFT_Kernel fftKernel;
-      MultiArraySharedBuffer<float, 1> bandPassCorrectionWeights;
       DelayAndBandPassKernel delayAndBandPassKernel;
 #if defined USE_NEW_CORRELATOR
       CorrelateTriangleKernel correlateTriangleKernel;
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/WorkQueues/UHEP_WorkQueue.h b/RTCP/Cobalt/GPUProc/src/cuda/WorkQueues/UHEP_WorkQueue.h
index ec9e95809d2..2abeb81523d 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/WorkQueues/UHEP_WorkQueue.h
+++ b/RTCP/Cobalt/GPUProc/src/cuda/WorkQueues/UHEP_WorkQueue.h
@@ -27,7 +27,7 @@
 #include <CoInterface/Parset.h>
 
 #include <GPUProc/global_defines.h>
-#include <GPUProc/Buffers.h>
+#include <GPUProc/MultiDimArrayHostBuffer.h>
 #include <GPUProc/Pipelines/UHEP_Pipeline.h>
 #include <GPUProc/Kernels/UHEP_TriggerKernel.h>
 #include "WorkQueue.h"
diff --git a/RTCP/Cobalt/GPUProc/src/opencl/Buffers.h b/RTCP/Cobalt/GPUProc/src/opencl/MultiDimArrayHostBuffer.h
similarity index 97%
rename from RTCP/Cobalt/GPUProc/src/opencl/Buffers.h
rename to RTCP/Cobalt/GPUProc/src/opencl/MultiDimArrayHostBuffer.h
index 191340906ac..41d48185c8c 100644
--- a/RTCP/Cobalt/GPUProc/src/opencl/Buffers.h
+++ b/RTCP/Cobalt/GPUProc/src/opencl/MultiDimArrayHostBuffer.h
@@ -1,4 +1,4 @@
-//# Buffers.h
+//# MultiDimArrayHostBuffer.h
 //# Copyright (C) 2012-2013  ASTRON (Netherlands Institute for Radio Astronomy)
 //# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands
 //#
@@ -18,8 +18,8 @@
 //#
 //# $Id$
 
-#ifndef LOFAR_GPUPROC_OPENCL_BUFFERS_H
-#define LOFAR_GPUPROC_OPENCL_BUFFERS_H
+#ifndef LOFAR_GPUPROC_OPENCL_MULTI_DIM_ARRAY_HOST_BUFFER_H
+#define LOFAR_GPUPROC_OPENCL_MULTI_DIM_ARRAY_HOST_BUFFER_H
 
 #include <CoInterface/Allocator.h>
 #include <CoInterface/MultiDimArray.h>
diff --git a/RTCP/Cobalt/GPUProc/src/opencl/Pipelines/Pipeline.cc b/RTCP/Cobalt/GPUProc/src/opencl/Pipelines/Pipeline.cc
index 376287aed29..dadb9d7483d 100644
--- a/RTCP/Cobalt/GPUProc/src/opencl/Pipelines/Pipeline.cc
+++ b/RTCP/Cobalt/GPUProc/src/opencl/Pipelines/Pipeline.cc
@@ -27,11 +27,6 @@
 
 #include <GPUProc/gpu_utils.h>
 
-#if 0
-#include <boost/format.hpp>
-using boost::format;
-#endif
-
 namespace LOFAR
 {
   namespace Cobalt
diff --git a/RTCP/Cobalt/GPUProc/src/opencl/WorkQueues/BeamFormerWorkQueue.h b/RTCP/Cobalt/GPUProc/src/opencl/WorkQueues/BeamFormerWorkQueue.h
index 6c07062aea9..aa0dbe0a5ae 100644
--- a/RTCP/Cobalt/GPUProc/src/opencl/WorkQueues/BeamFormerWorkQueue.h
+++ b/RTCP/Cobalt/GPUProc/src/opencl/WorkQueues/BeamFormerWorkQueue.h
@@ -26,7 +26,7 @@
 #include <Common/LofarLogger.h>
 #include <CoInterface/Parset.h>
 
-#include <GPUProc/Buffers.h>
+#include <GPUProc/MultiDimArrayHostBuffer.h>
 #include <GPUProc/BandPass.h>
 #include <GPUProc/Pipelines/BeamFormerPipeline.h>
 
diff --git a/RTCP/Cobalt/GPUProc/src/opencl/WorkQueues/CorrelatorWorkQueue.h b/RTCP/Cobalt/GPUProc/src/opencl/WorkQueues/CorrelatorWorkQueue.h
index d034bf4ebf9..3c507d4ec5c 100644
--- a/RTCP/Cobalt/GPUProc/src/opencl/WorkQueues/CorrelatorWorkQueue.h
+++ b/RTCP/Cobalt/GPUProc/src/opencl/WorkQueues/CorrelatorWorkQueue.h
@@ -33,7 +33,7 @@
 #include <CoInterface/SubbandMetaData.h>
 
 #include <GPUProc/global_defines.h>
-#include <GPUProc/Buffers.h>
+#include <GPUProc/MultiDimArrayHostBuffer.h>
 #include <GPUProc/FilterBank.h>
 #include <GPUProc/Pipelines/CorrelatorPipelinePrograms.h>
 #include <GPUProc/Kernels/FIR_FilterKernel.h>
diff --git a/RTCP/Cobalt/GPUProc/src/opencl/WorkQueues/UHEP_WorkQueue.h b/RTCP/Cobalt/GPUProc/src/opencl/WorkQueues/UHEP_WorkQueue.h
index 11298670e2c..46c00cde1c3 100644
--- a/RTCP/Cobalt/GPUProc/src/opencl/WorkQueues/UHEP_WorkQueue.h
+++ b/RTCP/Cobalt/GPUProc/src/opencl/WorkQueues/UHEP_WorkQueue.h
@@ -27,7 +27,7 @@
 #include <CoInterface/Parset.h>
 
 #include <GPUProc/global_defines.h>
-#include <GPUProc/Buffers.h>
+#include <GPUProc/MultiDimArrayHostBuffer.h>
 #include <GPUProc/Pipelines/UHEP_Pipeline.h>
 #include <GPUProc/Kernels/UHEP_TriggerKernel.h>
 #include "WorkQueue.h"
-- 
GitLab