From 0f7e584913fe015013db51a3e29b22d733f2e664 Mon Sep 17 00:00:00 2001
From: Jorrit Schaap <schaap@astron.nl>
Date: Fri, 15 Mar 2019 07:50:49 +0000
Subject: [PATCH] COB-60: moved remaining cuda files up one level to the
 one-and-only implementation dir.

---
 .../CobaltTest/test/tManyPartTABOutput.cc     |   2 +-
 .../CobaltTest/test/tMultiPartTABOutput.cc    |   2 +-
 RTCP/Cobalt/GPUProc/src/CMakeLists.txt        |  14 +-
 .../GPUProc/src/{cuda => }/KernelFactory.cc   |   0
 RTCP/Cobalt/GPUProc/src/KernelFactory.h       | 133 +++-
 .../GPUProc/src/MultiDimArrayHostBuffer.h     |  53 +-
 .../src/{cuda => }/PerformanceCounter.cc      |   0
 RTCP/Cobalt/GPUProc/src/PerformanceCounter.h  |  56 +-
 RTCP/Cobalt/GPUProc/src/cuda/CMakeLists.txt   |   7 -
 RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h  | 140 ----
 .../src/cuda/MultiDimArrayHostBuffer.h        |  62 --
 .../GPUProc/src/cuda/PerformanceCounter.h     |  65 --
 RTCP/Cobalt/GPUProc/src/cuda/gpu_incl.h       |  33 -
 RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.h    | 627 ------------------
 .../GPUProc/src/{cuda => }/cuda_config.h.in   |   0
 RTCP/Cobalt/GPUProc/src/gpu_incl.h            |  12 +-
 .../GPUProc/src/{cuda => }/gpu_utils.cc       |   0
 .../GPUProc/src/{cuda => }/gpu_wrapper.cc     |   0
 RTCP/Cobalt/GPUProc/src/gpu_wrapper.h         | 601 ++++++++++++++++-
 .../GPUProc/src/{cuda => }/gpu_wrapper.tcc    |   0
 .../test/SubbandProcs/tCorrelatorStep.cc      |   2 +-
 RTCP/Cobalt/GPUProc/test/cuda/tFFT_leakage.cc |   2 +-
 RTCP/Cobalt/GPUProc/test/cuda/tGPUWrapper.cc  |   2 +-
 23 files changed, 794 insertions(+), 1019 deletions(-)
 rename RTCP/Cobalt/GPUProc/src/{cuda => }/KernelFactory.cc (100%)
 rename RTCP/Cobalt/GPUProc/src/{cuda => }/PerformanceCounter.cc (100%)
 delete mode 100644 RTCP/Cobalt/GPUProc/src/cuda/CMakeLists.txt
 delete mode 100644 RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h
 delete mode 100644 RTCP/Cobalt/GPUProc/src/cuda/MultiDimArrayHostBuffer.h
 delete mode 100644 RTCP/Cobalt/GPUProc/src/cuda/PerformanceCounter.h
 delete mode 100644 RTCP/Cobalt/GPUProc/src/cuda/gpu_incl.h
 delete mode 100644 RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.h
 rename RTCP/Cobalt/GPUProc/src/{cuda => }/cuda_config.h.in (100%)
 rename RTCP/Cobalt/GPUProc/src/{cuda => }/gpu_utils.cc (100%)
 rename RTCP/Cobalt/GPUProc/src/{cuda => }/gpu_wrapper.cc (100%)
 rename RTCP/Cobalt/GPUProc/src/{cuda => }/gpu_wrapper.tcc (100%)

diff --git a/RTCP/Cobalt/CobaltTest/test/tManyPartTABOutput.cc b/RTCP/Cobalt/CobaltTest/test/tManyPartTABOutput.cc
index d310005bdb5..3cd86fb6639 100644
--- a/RTCP/Cobalt/CobaltTest/test/tManyPartTABOutput.cc
+++ b/RTCP/Cobalt/CobaltTest/test/tManyPartTABOutput.cc
@@ -30,7 +30,7 @@
 #include <Common/LofarLogger.h>
 #include <CoInterface/Parset.h>
 #include <GPUProc/Pipelines/Pipeline.h>
-#include <GPUProc/cuda/SubbandProcs/SubbandProcOutputData.h>
+#include <GPUProc/SubbandProcs/SubbandProcOutputData.h>
 #include <GPUProc/Station/StationInput.h>
 #include <GPUProc/Storage/StorageProcesses.h>
 
diff --git a/RTCP/Cobalt/CobaltTest/test/tMultiPartTABOutput.cc b/RTCP/Cobalt/CobaltTest/test/tMultiPartTABOutput.cc
index f7cb9a1464c..9177c52d59d 100644
--- a/RTCP/Cobalt/CobaltTest/test/tMultiPartTABOutput.cc
+++ b/RTCP/Cobalt/CobaltTest/test/tMultiPartTABOutput.cc
@@ -30,7 +30,7 @@
 #include <Common/LofarLogger.h>
 #include <CoInterface/Parset.h>
 #include <GPUProc/Pipelines/Pipeline.h>
-#include <GPUProc/cuda/SubbandProcs/SubbandProcOutputData.h>
+#include <GPUProc/SubbandProcs/SubbandProcOutputData.h>
 #include <GPUProc/Station/StationInput.h>
 #include <GPUProc/Storage/StorageProcesses.h>
 
diff --git a/RTCP/Cobalt/GPUProc/src/CMakeLists.txt b/RTCP/Cobalt/GPUProc/src/CMakeLists.txt
index 8b52611b0b5..a73efffb36a 100644
--- a/RTCP/Cobalt/GPUProc/src/CMakeLists.txt
+++ b/RTCP/Cobalt/GPUProc/src/CMakeLists.txt
@@ -7,6 +7,11 @@ execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink
   ${CMAKE_CURRENT_SOURCE_DIR}
   ${CMAKE_BINARY_DIR}/include/${PACKAGE_NAME})
 
+configure_file(
+  "${CMAKE_CURRENT_SOURCE_DIR}/cuda_config.h.in"
+  "${CMAKE_BINARY_DIR}/include/cuda_config.h"  # internal, no need to install
+)
+
 set(_gpuproc_sources
   #Package__Version.cc
   BandPass.cc
@@ -26,10 +31,10 @@ set(_gpuproc_sources
 )
 
 list(APPEND _gpuproc_sources
-  cuda/gpu_wrapper.cc
-  cuda/gpu_utils.cc
-  cuda/KernelFactory.cc
-  cuda/PerformanceCounter.cc
+  gpu_wrapper.cc
+  gpu_utils.cc
+  KernelFactory.cc
+  PerformanceCounter.cc
   Kernels/Kernel.cc
   Kernels/BeamFormerKernel.cc
   Kernels/BeamFormerTransposeKernel.cc
@@ -67,7 +72,6 @@ list(APPEND _gpuproc_sources
 
 #    SubbandProcs/UHEP_SubbandProc.cc
 )
-add_subdirectory(cuda)
 
 lofar_add_library(gpuproc ${_gpuproc_sources})
 if(CUDA_cufft_LIBRARY)
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.cc b/RTCP/Cobalt/GPUProc/src/KernelFactory.cc
similarity index 100%
rename from RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.cc
rename to RTCP/Cobalt/GPUProc/src/KernelFactory.cc
diff --git a/RTCP/Cobalt/GPUProc/src/KernelFactory.h b/RTCP/Cobalt/GPUProc/src/KernelFactory.h
index 8a3bcef789d..9fc75474dd1 100644
--- a/RTCP/Cobalt/GPUProc/src/KernelFactory.h
+++ b/RTCP/Cobalt/GPUProc/src/KernelFactory.h
@@ -1,6 +1,6 @@
-//# KernelFactory.h: Factory for Kernel objects.
+//# KernelFactory.h
 //#
-//# Copyright (C) 2013  ASTRON (Netherlands Institute for Radio Astronomy)
+//# Copyright (C) 2012-2013  ASTRON (Netherlands Institute for Radio Astronomy)
 //# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands
 //#
 //# This file is part of the LOFAR software suite.
@@ -19,23 +19,122 @@
 //#
 //# $Id$
 
-// \file
-// Factory for Kernel objects.
+#ifndef LOFAR_GPUPROC_CUDA_KERNELFACTORY_H
+#define LOFAR_GPUPROC_CUDA_KERNELFACTORY_H
 
-#ifndef LOFAR_GPUPROC_KERNELFACTORY_H
-#define LOFAR_GPUPROC_KERNELFACTORY_H
+#include <string>
+#include <CoInterface/Parset.h>
+#include <GPUProc/Kernels/Kernel.h>
+#include <GPUProc/gpu_wrapper.h>
+#include <GPUProc/gpu_utils.h>
 
-#if defined (USE_CUDA) && defined (USE_OPENCL)
-# error "Either CUDA or OpenCL must be enabled, not both"
-#endif
+namespace LOFAR
+{
+  namespace Cobalt
+  {
+    // Abstract base class of the templated KernelFactory class.
+    class KernelFactoryBase
+    {
+    public:
+      // Pure virtual destructor, because this is an abstract base class.
+      virtual ~KernelFactoryBase() = 0;
 
-#if defined (USE_CUDA)
-# include "cuda/KernelFactory.h"
-#elif defined (USE_OPENCL)
-# include "opencl/KernelFactory.h"
-#else
-# error "Either CUDA or OpenCL must be enabled, not neither"
-#endif
+    protected:
+      // Return compile definitions to use when creating PTX code for any
+      // Kernel.
+      CompileDefinitions
+      compileDefinitions(const Kernel::Parameters& param) const;
 
-#endif
+      // Return compile flags to use when creating PTX code for any Kernel.
+      CompileFlags
+      compileFlags(const Kernel::Parameters& param) const;
+    };
+
+    // Declaration of a generic factory class. For each concrete Kernel class
+    // (e.g. FIR_FilterKernel), a specialization must exist of the constructor
+    // and of the bufferSize() method.
+    template<typename T> class KernelFactory : public KernelFactoryBase
+    {
+    public:
+      // typedef typename T::Parameters Parameters;
+      typedef typename T::BufferType BufferType;
+      typedef typename T::Buffers Buffers;
+
+      // Construct a factory for creating Kernel objects of type \c T, using the
+      // settings provided by \a params.
+      KernelFactory(const typename T::Parameters &params) :
+        itsParameters(params),
+        itsPTX(_createPTX())
+      {
+      }
+
+      // Create a new Kernel object of type \c T.
+      T* create(const gpu::Stream& stream,
+                gpu::DeviceMemory &inputBuffer,
+                gpu::DeviceMemory &outputBuffer) const
+      {
+        const typename T::Buffers buffers(inputBuffer, outputBuffer);
+
+        return create(stream, buffers);
+      }
+
+      // Return required buffer size for \a bufferType
+      size_t bufferSize(BufferType bufferType) const
+      {
+        return itsParameters.bufferSize(bufferType);
+      }
+
+    private:
+      // Used by the constructors to construct the PTX from the other
+      // members.
+      std::string _createPTX() const {
+        return createPTX(T::theirSourceFile,
+                           compileDefinitions(),
+                           compileFlags());
+      }
 
+      // Create a new Kernel object of type \c T.
+      T* create(const gpu::Stream& stream,
+                const typename T::Buffers& buffers) const
+      {
+        // Since we use overlapping input/output buffers, their size
+        // could be larger than we need.
+        ASSERTSTR(buffers.input.size() >= bufferSize(T::INPUT_DATA),
+          "Require " << bufferSize(T::INPUT_DATA) << " bytes for input, "
+          "but buffer is only " << buffers.input.size() << " bytes.");
+        ASSERTSTR(buffers.output.size() >= bufferSize(T::OUTPUT_DATA),
+          "Require " << bufferSize(T::OUTPUT_DATA) << " bytes for output, "
+          "but buffer is only " << buffers.output.size() << " bytes.");
+
+        return new T(
+          stream, createModule(stream.getContext(), 
+                               T::theirSourceFile,
+                               itsPTX), 
+          buffers, itsParameters);
+      }
+
+      // Return compile definitions to use when creating PTX code for kernels of
+      // type \c T, using the parameters stored in \c itsParameters.
+      CompileDefinitions compileDefinitions() const {
+        return KernelFactoryBase::compileDefinitions(itsParameters);
+      }
+
+      // Return compile flags to use when creating PTX code for kernels of type
+      // \c T.
+      CompileFlags compileFlags() const {
+        return KernelFactoryBase::compileFlags(itsParameters);
+      }
+
+      // Additional parameters needed to create a Kernel object of type \c T.
+      typename T::Parameters itsParameters;
+
+      // PTX code, generated for kernels of type \c T, using information in the
+      // Parset that was passed to the constructor.
+      std::string itsPTX;
+    };
+
+  } // namespace Cobalt
+
+} // namespace LOFAR
+
+#endif
diff --git a/RTCP/Cobalt/GPUProc/src/MultiDimArrayHostBuffer.h b/RTCP/Cobalt/GPUProc/src/MultiDimArrayHostBuffer.h
index dee99d41fa5..dafb5ba201a 100644
--- a/RTCP/Cobalt/GPUProc/src/MultiDimArrayHostBuffer.h
+++ b/RTCP/Cobalt/GPUProc/src/MultiDimArrayHostBuffer.h
@@ -1,6 +1,5 @@
 //# MultiDimArrayHostBuffer.h
-//#
-//# Copyright (C) 2013  ASTRON (Netherlands Institute for Radio Astronomy)
+//# Copyright (C) 2012-2013  ASTRON (Netherlands Institute for Radio Astronomy)
 //# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands
 //#
 //# This file is part of the LOFAR software suite.
@@ -19,23 +18,45 @@
 //#
 //# $Id$
 
-// \file
-// Support for our multi-dim array-ed GPU host buffer.
+#ifndef LOFAR_GPUPROC_CUDA_MULTI_DIM_ARRAY_HOST_BUFFER_H
+#define LOFAR_GPUPROC_CUDA_MULTI_DIM_ARRAY_HOST_BUFFER_H
 
-#ifndef LOFAR_GPUPROC_MULTI_DIM_ARRAY_HOST_BUFFER_H
-#define LOFAR_GPUPROC_MULTI_DIM_ARRAY_HOST_BUFFER_H
+#include <CoInterface/MultiDimArray.h>
 
-#if defined (USE_CUDA) && defined (USE_OPENCL)
-# error "Either CUDA or OpenCL must be enabled, not both"
-#endif
+#include "gpu_wrapper.h"
 
-#if defined (USE_CUDA)
-# include "cuda/MultiDimArrayHostBuffer.h"
-#elif defined (USE_OPENCL)
-# include "opencl/MultiDimArrayHostBuffer.h"
-#else
-# error "Either CUDA or OpenCL must be enabled, not neither"
-#endif
+namespace LOFAR
+{
+  namespace Cobalt
+  {
+
+    // A MultiDimArray allocated as a HostBuffer
+    // Note: Elements are not constructed/destructed.
+    template <typename T, unsigned DIM>
+    class MultiDimArrayHostBuffer : public gpu::HostMemory,
+                                    public MultiDimArray<T, DIM>
+    {
+    public:
+      template <typename ExtentList>
+      MultiDimArrayHostBuffer(const ExtentList &extents, const gpu::Context &context,
+                              unsigned int flags = 0)
+      :
+        HostMemory(context, MultiDimArray<T, DIM>::nrElements(extents) * sizeof(T), flags),
+        MultiDimArray<T, DIM>(extents, gpu::HostMemory::get<T>(), false)
+      {
+      }
+
+      using HostMemory::size;
+
+    private:
+      MultiDimArrayHostBuffer(); // don't use
+      MultiDimArrayHostBuffer(const MultiDimArrayHostBuffer<T, DIM> &rhs); // don't use
+      MultiDimArrayHostBuffer<T, DIM> &operator=(const MultiDimArrayHostBuffer<T, DIM> &rhs); // don't use
+      using MultiDimArray<T, DIM>::resize; // don't use
+    };
+
+  } // namespace Cobalt
+} // namespace LOFAR
 
 #endif
 
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/PerformanceCounter.cc b/RTCP/Cobalt/GPUProc/src/PerformanceCounter.cc
similarity index 100%
rename from RTCP/Cobalt/GPUProc/src/cuda/PerformanceCounter.cc
rename to RTCP/Cobalt/GPUProc/src/PerformanceCounter.cc
diff --git a/RTCP/Cobalt/GPUProc/src/PerformanceCounter.h b/RTCP/Cobalt/GPUProc/src/PerformanceCounter.h
index d7bca1ca404..5748b5f4d19 100644
--- a/RTCP/Cobalt/GPUProc/src/PerformanceCounter.h
+++ b/RTCP/Cobalt/GPUProc/src/PerformanceCounter.h
@@ -1,6 +1,5 @@
 //# PerformanceCounter.h
-//#
-//# Copyright (C) 2013  ASTRON (Netherlands Institute for Radio Astronomy)
+//# Copyright (C) 2012-2013  ASTRON (Netherlands Institute for Radio Astronomy)
 //# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands
 //#
 //# This file is part of the LOFAR software suite.
@@ -19,23 +18,48 @@
 //#
 //# $Id$
 
-// \file
-// Support GPU kernel performance timing.
+#ifndef LOFAR_GPUPROC_CUDA_PERFORMANCECOUNTER_H
+#define LOFAR_GPUPROC_CUDA_PERFORMANCECOUNTER_H
 
-#ifndef LOFAR_PERFORMANCE_COUNTER_H
-#define LOFAR_PERFORMANCE_COUNTER_H
 
-#if defined (USE_CUDA) && defined (USE_OPENCL)
-# error "Either CUDA or OpenCL must be enabled, not both"
-#endif
+#include <GPUProc/gpu_wrapper.h>
+#include <CoInterface/RunningStatistics.h>
 
-#if defined (USE_CUDA)
-# include "cuda/PerformanceCounter.h"
-#elif defined (USE_OPENCL)
-# include "opencl/PerformanceCounter.h"
-#else
-# error "Either CUDA or OpenCL must be enabled, not neither"
-#endif
+namespace LOFAR
+{
+  namespace Cobalt
+  {
+    class PerformanceCounter
+    {
+    public:
+      PerformanceCounter(const gpu::Context &context, const std::string &name);
+      ~PerformanceCounter();
+
+      void recordStart(const gpu::Stream &stream);
+      void recordStop(const gpu::Stream &stream);
+
+      // Warning: user must make sure that the counter is not running!
+      RunningStatistics getStats() { logTime(); return stats; }
+
+    private:
+      const std::string name;
+
+      // Public event: it needs to be inserted into a stream.
+      // @{
+      gpu::Event start;
+      gpu::Event stop;
+      // @}
+
+      // Whether we have posted events that still need to be
+      // processed in logTime()
+      bool recording;
+
+      RunningStatistics stats;
+
+      void logTime();
+    };
+  }
+}
 
 #endif
 
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/CMakeLists.txt b/RTCP/Cobalt/GPUProc/src/cuda/CMakeLists.txt
deleted file mode 100644
index 17eac6967b6..00000000000
--- a/RTCP/Cobalt/GPUProc/src/cuda/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-# $Id$
-
-configure_file(
-  "${CMAKE_CURRENT_SOURCE_DIR}/cuda_config.h.in"
-  "${CMAKE_BINARY_DIR}/include/cuda_config.h"  # internal, no need to install
-)
-
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h b/RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h
deleted file mode 100644
index 9fc75474dd1..00000000000
--- a/RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h
+++ /dev/null
@@ -1,140 +0,0 @@
-//# KernelFactory.h
-//#
-//# Copyright (C) 2012-2013  ASTRON (Netherlands Institute for Radio Astronomy)
-//# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands
-//#
-//# This file is part of the LOFAR software suite.
-//# The LOFAR software suite is free software: you can redistribute it and/or
-//# modify it under the terms of the GNU General Public License as published
-//# by the Free Software Foundation, either version 3 of the License, or
-//# (at your option) any later version.
-//#
-//# The LOFAR software suite is distributed in the hope that it will be useful,
-//# but WITHOUT ANY WARRANTY; without even the implied warranty of
-//# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-//# GNU General Public License for more details.
-//#
-//# You should have received a copy of the GNU General Public License along
-//# with the LOFAR software suite. If not, see <http://www.gnu.org/licenses/>.
-//#
-//# $Id$
-
-#ifndef LOFAR_GPUPROC_CUDA_KERNELFACTORY_H
-#define LOFAR_GPUPROC_CUDA_KERNELFACTORY_H
-
-#include <string>
-#include <CoInterface/Parset.h>
-#include <GPUProc/Kernels/Kernel.h>
-#include <GPUProc/gpu_wrapper.h>
-#include <GPUProc/gpu_utils.h>
-
-namespace LOFAR
-{
-  namespace Cobalt
-  {
-    // Abstract base class of the templated KernelFactory class.
-    class KernelFactoryBase
-    {
-    public:
-      // Pure virtual destructor, because this is an abstract base class.
-      virtual ~KernelFactoryBase() = 0;
-
-    protected:
-      // Return compile definitions to use when creating PTX code for any
-      // Kernel.
-      CompileDefinitions
-      compileDefinitions(const Kernel::Parameters& param) const;
-
-      // Return compile flags to use when creating PTX code for any Kernel.
-      CompileFlags
-      compileFlags(const Kernel::Parameters& param) const;
-    };
-
-    // Declaration of a generic factory class. For each concrete Kernel class
-    // (e.g. FIR_FilterKernel), a specialization must exist of the constructor
-    // and of the bufferSize() method.
-    template<typename T> class KernelFactory : public KernelFactoryBase
-    {
-    public:
-      // typedef typename T::Parameters Parameters;
-      typedef typename T::BufferType BufferType;
-      typedef typename T::Buffers Buffers;
-
-      // Construct a factory for creating Kernel objects of type \c T, using the
-      // settings provided by \a params.
-      KernelFactory(const typename T::Parameters &params) :
-        itsParameters(params),
-        itsPTX(_createPTX())
-      {
-      }
-
-      // Create a new Kernel object of type \c T.
-      T* create(const gpu::Stream& stream,
-                gpu::DeviceMemory &inputBuffer,
-                gpu::DeviceMemory &outputBuffer) const
-      {
-        const typename T::Buffers buffers(inputBuffer, outputBuffer);
-
-        return create(stream, buffers);
-      }
-
-      // Return required buffer size for \a bufferType
-      size_t bufferSize(BufferType bufferType) const
-      {
-        return itsParameters.bufferSize(bufferType);
-      }
-
-    private:
-      // Used by the constructors to construct the PTX from the other
-      // members.
-      std::string _createPTX() const {
-        return createPTX(T::theirSourceFile,
-                           compileDefinitions(),
-                           compileFlags());
-      }
-
-      // Create a new Kernel object of type \c T.
-      T* create(const gpu::Stream& stream,
-                const typename T::Buffers& buffers) const
-      {
-        // Since we use overlapping input/output buffers, their size
-        // could be larger than we need.
-        ASSERTSTR(buffers.input.size() >= bufferSize(T::INPUT_DATA),
-          "Require " << bufferSize(T::INPUT_DATA) << " bytes for input, "
-          "but buffer is only " << buffers.input.size() << " bytes.");
-        ASSERTSTR(buffers.output.size() >= bufferSize(T::OUTPUT_DATA),
-          "Require " << bufferSize(T::OUTPUT_DATA) << " bytes for output, "
-          "but buffer is only " << buffers.output.size() << " bytes.");
-
-        return new T(
-          stream, createModule(stream.getContext(), 
-                               T::theirSourceFile,
-                               itsPTX), 
-          buffers, itsParameters);
-      }
-
-      // Return compile definitions to use when creating PTX code for kernels of
-      // type \c T, using the parameters stored in \c itsParameters.
-      CompileDefinitions compileDefinitions() const {
-        return KernelFactoryBase::compileDefinitions(itsParameters);
-      }
-
-      // Return compile flags to use when creating PTX code for kernels of type
-      // \c T.
-      CompileFlags compileFlags() const {
-        return KernelFactoryBase::compileFlags(itsParameters);
-      }
-
-      // Additional parameters needed to create a Kernel object of type \c T.
-      typename T::Parameters itsParameters;
-
-      // PTX code, generated for kernels of type \c T, using information in the
-      // Parset that was passed to the constructor.
-      std::string itsPTX;
-    };
-
-  } // namespace Cobalt
-
-} // namespace LOFAR
-
-#endif
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/MultiDimArrayHostBuffer.h b/RTCP/Cobalt/GPUProc/src/cuda/MultiDimArrayHostBuffer.h
deleted file mode 100644
index dafb5ba201a..00000000000
--- a/RTCP/Cobalt/GPUProc/src/cuda/MultiDimArrayHostBuffer.h
+++ /dev/null
@@ -1,62 +0,0 @@
-//# MultiDimArrayHostBuffer.h
-//# Copyright (C) 2012-2013  ASTRON (Netherlands Institute for Radio Astronomy)
-//# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands
-//#
-//# This file is part of the LOFAR software suite.
-//# The LOFAR software suite is free software: you can redistribute it and/or
-//# modify it under the terms of the GNU General Public License as published
-//# by the Free Software Foundation, either version 3 of the License, or
-//# (at your option) any later version.
-//#
-//# The LOFAR software suite is distributed in the hope that it will be useful,
-//# but WITHOUT ANY WARRANTY; without even the implied warranty of
-//# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-//# GNU General Public License for more details.
-//#
-//# You should have received a copy of the GNU General Public License along
-//# with the LOFAR software suite. If not, see <http://www.gnu.org/licenses/>.
-//#
-//# $Id$
-
-#ifndef LOFAR_GPUPROC_CUDA_MULTI_DIM_ARRAY_HOST_BUFFER_H
-#define LOFAR_GPUPROC_CUDA_MULTI_DIM_ARRAY_HOST_BUFFER_H
-
-#include <CoInterface/MultiDimArray.h>
-
-#include "gpu_wrapper.h"
-
-namespace LOFAR
-{
-  namespace Cobalt
-  {
-
-    // A MultiDimArray allocated as a HostBuffer
-    // Note: Elements are not constructed/destructed.
-    template <typename T, unsigned DIM>
-    class MultiDimArrayHostBuffer : public gpu::HostMemory,
-                                    public MultiDimArray<T, DIM>
-    {
-    public:
-      template <typename ExtentList>
-      MultiDimArrayHostBuffer(const ExtentList &extents, const gpu::Context &context,
-                              unsigned int flags = 0)
-      :
-        HostMemory(context, MultiDimArray<T, DIM>::nrElements(extents) * sizeof(T), flags),
-        MultiDimArray<T, DIM>(extents, gpu::HostMemory::get<T>(), false)
-      {
-      }
-
-      using HostMemory::size;
-
-    private:
-      MultiDimArrayHostBuffer(); // don't use
-      MultiDimArrayHostBuffer(const MultiDimArrayHostBuffer<T, DIM> &rhs); // don't use
-      MultiDimArrayHostBuffer<T, DIM> &operator=(const MultiDimArrayHostBuffer<T, DIM> &rhs); // don't use
-      using MultiDimArray<T, DIM>::resize; // don't use
-    };
-
-  } // namespace Cobalt
-} // namespace LOFAR
-
-#endif
-
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/PerformanceCounter.h b/RTCP/Cobalt/GPUProc/src/cuda/PerformanceCounter.h
deleted file mode 100644
index 5748b5f4d19..00000000000
--- a/RTCP/Cobalt/GPUProc/src/cuda/PerformanceCounter.h
+++ /dev/null
@@ -1,65 +0,0 @@
-//# PerformanceCounter.h
-//# Copyright (C) 2012-2013  ASTRON (Netherlands Institute for Radio Astronomy)
-//# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands
-//#
-//# This file is part of the LOFAR software suite.
-//# The LOFAR software suite is free software: you can redistribute it and/or
-//# modify it under the terms of the GNU General Public License as published
-//# by the Free Software Foundation, either version 3 of the License, or
-//# (at your option) any later version.
-//#
-//# The LOFAR software suite is distributed in the hope that it will be useful,
-//# but WITHOUT ANY WARRANTY; without even the implied warranty of
-//# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-//# GNU General Public License for more details.
-//#
-//# You should have received a copy of the GNU General Public License along
-//# with the LOFAR software suite. If not, see <http://www.gnu.org/licenses/>.
-//#
-//# $Id$
-
-#ifndef LOFAR_GPUPROC_CUDA_PERFORMANCECOUNTER_H
-#define LOFAR_GPUPROC_CUDA_PERFORMANCECOUNTER_H
-
-
-#include <GPUProc/gpu_wrapper.h>
-#include <CoInterface/RunningStatistics.h>
-
-namespace LOFAR
-{
-  namespace Cobalt
-  {
-    class PerformanceCounter
-    {
-    public:
-      PerformanceCounter(const gpu::Context &context, const std::string &name);
-      ~PerformanceCounter();
-
-      void recordStart(const gpu::Stream &stream);
-      void recordStop(const gpu::Stream &stream);
-
-      // Warning: user must make sure that the counter is not running!
-      RunningStatistics getStats() { logTime(); return stats; }
-
-    private:
-      const std::string name;
-
-      // Public event: it needs to be inserted into a stream.
-      // @{
-      gpu::Event start;
-      gpu::Event stop;
-      // @}
-
-      // Whether we have posted events that still need to be
-      // processed in logTime()
-      bool recording;
-
-      RunningStatistics stats;
-
-      void logTime();
-    };
-  }
-}
-
-#endif
-
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/gpu_incl.h b/RTCP/Cobalt/GPUProc/src/cuda/gpu_incl.h
deleted file mode 100644
index 7e56db7b5b4..00000000000
--- a/RTCP/Cobalt/GPUProc/src/cuda/gpu_incl.h
+++ /dev/null
@@ -1,33 +0,0 @@
-//# gpu_incl.h: portable CUDA header to mirror OpenCL sources
-//# Copyright (C) 2012-2013  ASTRON (Netherlands Institute for Radio Astronomy)
-//# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands
-//#
-//# This file is part of the LOFAR software suite.
-//# The LOFAR software suite is free software: you can redistribute it and/or
-//# modify it under the terms of the GNU General Public License as published
-//# by the Free Software Foundation, either version 3 of the License, or
-//# (at your option) any later version.
-//#
-//# The LOFAR software suite is distributed in the hope that it will be useful,
-//# but WITHOUT ANY WARRANTY; without even the implied warranty of
-//# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-//# GNU General Public License for more details.
-//#
-//# You should have received a copy of the GNU General Public License along
-//# with the LOFAR software suite. If not, see <http://www.gnu.org/licenses/>.
-//#
-//# $Id$
-
-#ifndef LOFAR_GPUPROC_CUDA_GPU_INCL_H
-#define LOFAR_GPUPROC_CUDA_GPU_INCL_H
-
-// Pointless in itself; to mirror the OpenCL sources
-// Note: nvcc automatically includes cuda.h, but for most code we don't need it.
-
-// CUDA include option(s)
-//<none>
-
-#include <cuda.h>
-
-#endif
-
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.h b/RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.h
deleted file mode 100644
index 831a5e05993..00000000000
--- a/RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.h
+++ /dev/null
@@ -1,627 +0,0 @@
-//# gpu_wrapper.h: CUDA-specific wrapper classes for GPU types.
-//#
-//# Copyright (C) 2013  ASTRON (Netherlands Institute for Radio Astronomy)
-//# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands
-//#
-//# This file is part of the LOFAR software suite.
-//# The LOFAR software suite is free software: you can redistribute it and/or
-//# modify it under the terms of the GNU General Public License as published
-//# by the Free Software Foundation, either version 3 of the License, or
-//# (at your option) any later version.
-//#
-//# The LOFAR software suite is distributed in the hope that it will be useful,
-//# but WITHOUT ANY WARRANTY; without even the implied warranty of
-//# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-//# GNU General Public License for more details.
-//#
-//# You should have received a copy of the GNU General Public License along
-//# with the LOFAR software suite. If not, see <http://www.gnu.org/licenses/>.
-//#
-//# $Id$
-
-#ifndef LOFAR_GPUPROC_CUDA_GPU_WRAPPER_H
-#define LOFAR_GPUPROC_CUDA_GPU_WRAPPER_H
-
-// \file cuda/gpu_wrapper.h
-// C++ wrappers for CUDA akin the OpenCL C++ wrappers.
-// Uses the "Pimpl" idiom for resource managing classes (i.e. that need to
-// control copying having a non-trivial destructor. For more info on Pimpl, see
-// http://www.boost.org/doc/libs/release/libs/smart_ptr/sp_techniques.html#pimpl
-// Not Pimpl-ed are class Platform, Device, and Function.
-// These are also passed by value.
-
-#include <cstddef>
-#include <string>
-#include <vector>
-#include <map>
-#include <iosfwd>
-
-#include <boost/shared_ptr.hpp>
-#include "gpu_incl.h" // ideally, this goes into the .cc, but too much leakage
-#include <cufft.h>
-
-#include <GPUProc/gpu_wrapper.h> // GPUException
-
-#if CUDA_VERSION < 4020
-typedef int CUsharedconfig;
-#endif
-
-namespace LOFAR
-{
-  namespace Cobalt
-  {
-    class PerformanceCounter;
-    namespace gpu
-    {
-
-      // Exception class for CUDA errors.
-      EXCEPTION_CLASS(CUDAException, GPUException);
-
-      // Return the cuFFT error string associated with \a errcode.
-      std::string cufftErrorMessage(cufftResult errcode);
-
-      // Return the CUDA error string associated with \a errcode.
-      std::string errorMessage(CUresult errcode);
-
-
-      // Struct representing a CUDA Grid, which is similar to the @c dim3 type
-      // in the CUDA Runtime API.
-      struct Grid
-      {
-        Grid(unsigned int x_ = 1, unsigned int y_ = 1, unsigned int z_ = 1);
-        unsigned int x;
-        unsigned int y;
-        unsigned int z;
-        friend std::ostream& operator<<(std::ostream& os, const Grid& grid);
-      };
-
-      // Struct representing a CUDA Block, which is similar to the @c dim3 type
-      // in the CUDA Runtime API.
-      //
-      // @invariant x > 0, y > 0, z > 0
-      struct Block
-      {
-        Block(unsigned int x_ = 1, unsigned int y_ = 1, unsigned int z_ = 1);
-        unsigned int x;
-        unsigned int y;
-        unsigned int z;
-        friend std::ostream& operator<<(std::ostream& os, const Block& block);
-      };
-
-      // Struct containing kernel launch configuration.
-      struct ExecConfig
-      {
-        ExecConfig(Grid gr = Grid(), Block bl = Block(), size_t dynShMem = 0);
-        Grid   grid;
-        Block  block;
-        size_t dynSharedMemSize;
-        friend std::ostream& operator<<(std::ostream& os,
-                                        const ExecConfig& execConfig);
-      };
-
-
-      // Forward declaration needed by Platform::devices.
-      class Device;
-
-      // This class is not strictly needed, because in CUDA there's only one
-      // platform, but it hides the CUDA calls and makes it similar to OpenCL.
-      class Platform
-      {
-      public:
-        // Initialize the CUDA platform.
-        // \param flags must be 0 (at least up till CUDA 5.0).
-        Platform(unsigned int flags = 0);
-
-        // The CUDA version (e.g. 5.0 -> 5000).
-        int version() const;
-
-        // Returns the number of devices in the CUDA platform.
-        size_t size() const;
-
-        // Returns a vector of all devices in the CUDA platform.
-        std::vector<Device> devices() const;
-
-        // Returns the name of the CUDA platform. (currently, "NVIDIA CUDA")
-        std::string getName() const;
-
-        // Return the maximum number of threads per block, that
-        // is supported by all devices on the platform.
-        // 
-        // Hardware dependent.
-        // - Returns at least 512 (except for ancient hardware)
-        // - Returns 1024 for K10 (= Cobalt hardware)
-        unsigned getMaxThreadsPerBlock() const;
-      };
-
-      // Wrap a CUDA Device.
-      class Device
-      {
-      public:
-        // Create a device.
-        // \param ordinal is the device number; 
-        //        valid range: [0, Platform.size()-1]
-        Device(int ordinal = 0);
-
-        // Order Devices by PCI ID (used in std::sort)
-        bool operator<(const Device &other) const;
-
-        // Return the name of the device in human readable form.
-        std::string getName() const;
-
-        // Return the compute capability (major)
-        unsigned getComputeCapabilityMajor() const;
-
-        // Return the compute capability (minor)
-        unsigned getComputeCapabilityMinor() const;
-
-        // Return the total amount of global memory, in bytes
-        size_t getTotalGlobalMem() const;
-
-        // Return the maximum amount of shared memory per block
-        size_t getBlockSharedMem() const;
-
-        // Return the total amount of constant memory
-        size_t getTotalConstMem() const;
-
-        // Return the PCI ID (bus:device) of this GPU
-        std::string pciId() const;
-
-        // Return the maximum number of threads per block
-        // 
-        // Hardware dependent.
-        // - Returns at least 512 (except for ancient hardware)
-        // - Returns 1024 for K10 (= Cobalt hardware)
-        unsigned getMaxThreadsPerBlock() const;
-
-        // Return the maximum dimensions of a block of threads.
-        struct Block getMaxBlockDims() const;
-
-        // Return the maximum dimensions of a grid of blocks.
-        struct Grid getMaxGridDims() const;
-
-        // Return the number of multi-processors.
-        unsigned getMultiProcessorCount() const;
-
-        // Return the maximum number of threads that can be
-        // resident on a multi-processor.
-        unsigned getMaxThreadsPerMultiProcessor() const;
-
-        // Return information on a specific \a attribute.
-        // \param attribute CUDA device attribute
-        int getAttribute(CUdevice_attribute attribute) const;
-
-      private:
-        // Context needs access to our \c _device to create a context.
-        friend class Context;
-
-        // The CUDA device.
-        CUdevice _device;
-      };
-
-
-      // Wrap a CUDA Context. Since this class manages a resource (a CUDA
-      // context), it uses the pimpl idiom in combination with a reference
-      // counted pointer to make it copyable.
-      //
-      // We do not tie any context to any thread by default -- all contexts
-      // are `floating', and are to be tied to a thread only by pushing them
-      // as the current context, performing operation(s), and popping them
-      // from the current context stack. The pushing and popping is automated
-      // in the ScopedCurrentContext class.
-      class Context
-      {
-      public:
-        // Create a new CUDA context and associate it with the calling thread.
-        // In other words, \c setCurrent() is implied.
-        //
-        // Flags:
-
-        //    CU_CTX_SCHED_AUTO:
-        //        The default value if the flags parameter is zero, uses a
-        //        heuristic based on the number of active CUDA contexts in the
-        //        process C and the number of logical processors in the system P.
-        //        If C > P, then CUDA will yield to other OS threads when waiting
-        //        for the GPU, otherwise CUDA will not yield while waiting for
-        //        results and actively spin on the processor.
-        //    CU_CTX_SCHED_SPIN:
-        //        Instruct CUDA to actively spin when waiting for results from the GPU.
-        //        This can decrease latency when waiting for the GPU, but may lower
-        //        the performance of CPU threads if they are performing work in parallel
-        //        with the CUDA thread.
-        //    CU_CTX_SCHED_YIELD:
-        //        Instruct CUDA to yield its thread when waiting for results from the GPU.
-        //        This can increase latency when waiting for the GPU, but can increase
-        //        the performance of CPU threads performing work in parallel with the GPU.
-        //    CU_CTX_SCHED_BLOCKING_SYNC:
-        //        Instruct CUDA to block the CPU thread on a synchronization primitive
-        //        when waiting for the GPU to finish work.
-        Context(const Device &device, unsigned int flags = CU_CTX_SCHED_YIELD);
-
-        // Returns the device associated to this context.
-        Device getDevice() const;
-
-        // Set the cache configuration for kernel launches in this context.
-        void setCacheConfig(CUfunc_cache config) const;
-
-        // Set the shared memory configuration for kernel launches in this context.
-        void setSharedMemConfig(CUsharedconfig config) const;
-
-      private:
-        // Non-copyable implementation class.
-        class Impl;
-
-        // Reference counted pointer to the implementation class.
-        boost::shared_ptr<Impl> _impl;
-
-        friend class ScopedCurrentContext;
-      };
-
-
-      // Make a certain context the current one for a certain scope.
-      class ScopedCurrentContext
-      {
-      public:
-        ScopedCurrentContext( const Context &context );
-        ~ScopedCurrentContext();
-
-      private:
-        const Context &_context;
-      };
-
-
-      // Wrap CUDA Host Memory. This is the equivalent of a OpenCL Buffer. CUDA
-      // distinguishes between between host- and device memory, OpenCL does not.
-      class HostMemory
-      {
-      public:
-        // Allocate \a size bytes of host memory.
-        // \param context CUDA context associated with this HostMemory object.
-        // \param size number of bytes to allocate
-        // \param flags affect allocation
-        // \note To create pinned memory, we need to set
-        // \code
-        // flags = CU_MEMHOSTALLOC_PORTABLE
-        // \endcode
-        // \note For input buffers we may consider setting
-        // \code
-        // flags = CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_WRITECOMBINED
-        // \endcode
-        // Please refer to the documentation of the function \c cuMemHostAlloc()
-        // in the CUDA Driver API for details.
-        HostMemory(const Context &context, size_t size, unsigned int flags = 0);
-
-        // Return a pointer to the actual memory.
-        // \warning The returned pointer shall not have a lifetime beyond the
-        // lifetime of this object (actually the last copy).
-        template <typename T>
-        T *get() const;
-
-        // Return the size of this memory block.
-        size_t size() const;
-
-      private:
-        // Get a void pointer to the actual memory from our Impl class. This
-        // method is only used by our templated get() method.
-        void* getPtr() const;
-
-        // Non-copyable implementation class.
-        class Impl;
-
-        // Reference counted pointer to the implementation class.
-        boost::shared_ptr<Impl> _impl;
-      };
-
-
-      // Wrap CUDA Device Memory. This is the equivalent of an OpenCL
-      // Buffer. CUDA distinguishes between between host- and device memory,
-      // OpenCL does not.
-      class DeviceMemory
-      {
-      public:
-        // Allocate \a size bytes of device memory.
-        DeviceMemory(const Context &context, size_t size);
-
-        // Return a device pointer as a handle to the memory.
-        void *get() const;
-
-        // Fill the first \a n bytes of memory with the constant byte \a uc.
-        // \param uc Constant byte value to put into memory
-        // \param n  Number of bytes to set. Defaults to the complete block.
-        //           If \a n is larger than the current memory block size, then
-        //           the complete block will be set to \a uc.
-        void set(unsigned char uc, size_t n = (size_t)-1) const;
-
-        // Return the size of this memory block.
-        size_t size() const;
-
-        // Fetch the contents of this buffer in a new HostMemory buffer.
-        HostMemory fetch() const;
-
-      private:
-        // Function needs access to our device ptr location to set this as a kernel arg.
-        friend class Function;
-
-        // Non-copyable implementation class.
-        class Impl;
-
-        // Reference counted pointer to the implementation class.
-        boost::shared_ptr<Impl> _impl;
-      };
-
-
-      // Wrap a CUDA Module. This is the equivalent of a OpenCL Program.
-      class Module
-      {
-      public:
-        typedef std::map<CUjit_option, void*> optionmap_t;
-
-        Module(); // TODO: tmp, as long as CorrelatorPipelinePrograms needs a default init
-
-        // Load the module in the file \a fname into the given \a context. The
-        // file should be a \e cubin file or a \e ptx file as output by \c nvcc.
-        // \param context CUDA context associated with this Module object.
-        // \param fname name of a module file
-        // \note For details, please refer to the documentation of \c
-        // cuModuleLoad in the CUDA Driver API.
-        Module(const Context &context, const std::string &fname);
-
-        // Load the module pointed to by \a image into the given \a context. The
-        // pointer may point to a null-terminated string containing \e cubin or
-        // \e ptx code.
-        // \param context CUDA context associated with this Module object.
-        // \param image pointer to a module image in memory
-        // \note For details, please refer to the documentation of \c
-        // cuModuleLoadData in the CUDA Driver API.
-        Module(const Context &context, const void *image);
-
-        // Load the module pointed to by \a image into the given \a context. The
-        // pointer may point to a null-terminated string containing \e cubin or
-        // \e ptx code.
-        // \param context CUDA context associated with this Module object.
-        // \param image pointer to a module image in memory
-        // \param options map of \c CUjit_option items, with their associated
-        // values.
-        // \note All values are cast to void*, so if an option requires
-        // an unsigned int as value, the unsigned int's value itself is cast to void*!
-        // \note For details, please refer to the documentation of \c
-        // cuModuleLoadDataEx in the CUDA Driver API.
-        Module(const Context &context, const void *image, optionmap_t &options);
-
-        // Return the Context in which this Module was created.
-        Context getContext() const;
-
-      private:
-        // Function needs access to our module to create a function.
-        friend class Function;
-
-        // Non-copyable implementation class.
-        class Impl;
-
-        // Reference counted pointer to the implementation class.
-        boost::shared_ptr<Impl> _impl;
-      };
-
-      // Wrap a CUDA Device Function. This is the equivalent of an OpenCL
-      // Program.
-      class Function
-      {
-      public:
-        // Construct a function object by looking up the function \a name in the
-        // module \a module.
-        Function(const Module &module, const std::string &name);
-
-        // Return the name of the function.
-        std::string name() const;
-
-        // Set kernel immediate argument number \a index to \a val.
-        // \a val must outlive kernel execution.
-        // Not for device memory objects (be it as DeviceMemory or as void *).
-        template <typename T>
-        void setArg(size_t index, const T &val);
-
-        // Set kernel DeviceMemory object argument number \a index to \a mem.
-        // \a mem must outlive kernel execution.
-        void setArg(size_t index, const DeviceMemory &mem);
-
-        // Set pointer to kernel device memory object (as void *) number \a index
-        // to \a val. \a *val must outlive kernel execution.
-        // Note: Prefer to use setArg() passing a DeviceMemory ref over this overload.
-        void setArg(size_t index, const void **val);
-
-        // Return information about a function.
-        // \note For details on valid values for \a attribute, please refer to
-        // the documentation of cuFuncGetAttribute in the CUDA Driver API.
-        int getAttribute(CUfunction_attribute attribute) const;
-
-        // Set the shared memory configuration for a device function.
-        // \note For details on valid values for \a config, please refer to the
-        // documentation of cuFuncSetSharedMemConfig in the CUDA Driver API.
-        void setSharedMemConfig(CUsharedconfig config) const;
-
-      protected:
-        const Context _context;
-
-      private:
-        // Keep the Module alive, because Function actually wraps a pointer
-        // to a function within the Module.
-        const Module _module;
-
-        // The name of the function, for error reporting purposes
-        const std::string _name;
-
-        // Stream needs access to our CUDA function to launch a kernel.
-        friend class Stream;
-
-        // CUDA function.
-        CUfunction _function;
-
-        // Function arguments as set.
-        std::vector<const void *> _kernelArgs;
-
-        // Helper function to modify _kernelArgs.
-        void doSetArg(size_t index, const void *argp);
-
-        // Do not use. To guard against passing pointers.
-        // Note that even device void * cannot be passed, because we need its
-        // address with a life time longer than this formal parameter.
-        template<typename T>
-        void setArg(size_t index, const T *&); // intentionally not impl.
-
-        // Do not use. To guard against passing HostMemory references to kernels.
-        void setArg(size_t index, const HostMemory &); // intentionally not impl.
-
-        // Do not use. To guard against passing HostMemory pointers to kernels.
-        void setArg(size_t index, const HostMemory *); // intentionally not impl.
-      };
-
-      // Wrap a CUDA Event. This is the equivalent of an OpenCL Event.
-      class Event
-      {
-      public:
-        // Construct a CUDA event. This class manages a resource (a CUDA event)
-        // and is therefore implemented using the pimpl idiom, using a reference
-        // counted pointer to a non-copyable implementation class.
-        // \note For details on valid values for \a flags, please refer to the
-        // documentation of cuEventCreate in the CUDA Driver API.
-        Event(const Context &context, unsigned int flags = CU_EVENT_DEFAULT);
-
-        // Return the elapsed time in milliseconds between this event and the \a
-        // second event.
-        float elapsedTime(Event &second) const;
-
-        // Wait until all work preceding this event in the same stream has
-        // completed.
-        void wait();
-
-      private:
-        // Stream needs access to our CUDA event to wait for and record events.
-        friend class Stream;
-
-        // Non-copyable implementation class.
-        class Impl;
-
-        // Reference counted pointer to the implementation class.
-        boost::shared_ptr<Impl> _impl;
-      };
-
-
-      // Wrap a CUDA Stream. This is the equivalent of an OpenCL
-      // CommandQueue. This class manages a resource (a CUDA stream) and is
-      // therefore implemented using the pimpl idiom, using a reference counted
-      // pointer to a non-copyable implementation class.
-      class Stream
-      {
-      public:
-        // Create a stream.
-        // \param flags must be 0 for CUDA < 5.0
-        // \param context CUDA context associated with this Stream object.
-        // \note For details on valid values for \a flags, please refer to the
-        // documentation of \c cuStreamCreate in the CUDA Driver API.
-        explicit Stream(const Context &context, unsigned int flags = 0);  // named CU_STREAM_DEFAULT (0) since CUDA 5.0
-
-        // Transfer data from host memory \a hostMem to device memory \a devMem.
-        // \param devMem Device memory that will be copied to.
-        // \param hostMem Host memory that will be copied from.
-        // \param synchronous Indicates whether the transfer must be done
-        //        synchronously or asynchronously.
-        void writeBuffer(const DeviceMemory &devMem, const HostMemory &hostMem,
-                         bool synchronous = false) const;
-
-        // Transfer data from host memory \a hostMem to device memory \a devMem.
-        // When gpuProfiling is enabled this transfer is synchronous
-        // \param devMem Device memory that will be copied to.
-        // \param hostMem Host memory that will be copied from.
-        // \param counter PerformanceCounter that will receive transfer duration
-        // if  gpuProfiling is enabled
-        // \param synchronous Indicates whether the transfer must be done
-        //        synchronously or asynchronously. Default == false
-        void writeBuffer(const DeviceMemory &devMem, const HostMemory &hostMem,
-                         PerformanceCounter &counter, bool synchronous = false) const;
-
-        // Transfer data from device memory \a devMem to host memory \a hostMem.
-        // \param hostMem Host memory that will be copied to.
-        // \param devMem Device memory that will be copied from.
-        // \param synchronous Indicates whether the transfer must be done
-        //        synchronously or asynchronously.
-        void readBuffer(const HostMemory &hostMem, const DeviceMemory &devMem,
-                        bool synchronous = false) const;
-
-        // Transfer data from device memory \a devMem to host memory \a hostMem.
-        // When gpuProfiling is enabled this transfer is synchronous
-        // \param hostMem Host memory that will be copied to.
-        // \param devMem Device memory that will be copied from.
-        // \param counter PerformanceCounter that will receive transfer duration
-        // if  gpuProfiling is enabled
-        // \param synchronous Indicates whether the transfer must be done
-        //        synchronously or asynchronously. Default == false
-        void readBuffer(const HostMemory &hostMem, const DeviceMemory &devMem,
-                        PerformanceCounter &counter, bool synchronous = false) const;
-
-        // Transfer data from device memory \a devSource to device memory \a devTarget.
-        // \param devTarget Device memory that will be copied to.
-        // \param devSource Device memory that will be copied from.
-        // \param synchronous Indicates whether the transfer must be done
-        //        synchronously or asynchronously.
-        void copyBuffer(const DeviceMemory &devTarget, const DeviceMemory &devSource,
-                        bool synchronous = false) const;
-
-        // Transfer data from device memory \a devSource to device memory \a devTarget.
-        // When gpuProfiling is enabled this transfer is synchronous
-        // \param devTarget Device memory that will be copied to.
-        // \param devSource Device memory that will be copied from.
-        // \param counter PerformanceCounter that will receive transfer duration
-        //        if gpuProfiling is enabled
-        // \param synchronous Indicates whether the transfer must be done
-        //        synchronously or asynchronously. Defaults to \c false
-        //        (asynchronously).
-        void copyBuffer(const DeviceMemory &devTarget, const DeviceMemory &devSource,
-                        PerformanceCounter &counter, bool synchronous = false) const;
-
-        // Launch a CUDA function.
-        // \param function object containing the function to launch
-        // \param grid Grid size (in terms of blocks (not threads (OpenCL)))
-        // \param block Block (thread group) size
-        void launchKernel(const Function &function,
-                          const Grid &grid, const Block &block) const;
-
-        // Check if all operations on this stream have completed.
-        // \return true if all completed, or false otherwise.
-        bool query() const;
-
-        // Wait until a this stream's tasks are completed.
-        void synchronize() const;
-
-        // Let this stream wait on the event \a event.
-        void waitEvent(const Event &event) const;
-
-        // Record the event \a event for this stream.
-        void recordEvent(const Event &event) const;
-
-        // Return the underlying CUDA stream. TODO: try to get rid of CUstream here: FFT thing to here or make it friend
-        CUstream get() const;
-
-        // Returns the context associated with the underlying CUDA stream.
-        Context getContext() const; // TODO: consider using this in the SubbandProcs (now has Stream and Context stored)
-
-        // Return whether this stream mandates synchronous behaviour
-        bool isSynchronous() const;
-
-      private:
-        // Non-copyable implementation class.
-        class Impl;
-
-        // Reference counted pointer to the implementation class.
-        boost::shared_ptr<Impl> _impl;
-
-        // Force synchronous transfers and kernel launches
-        bool force_synchronous;
-      };
-
-    } // namespace gpu
-  } // namespace Cobalt
-} // namespace LOFAR
-
-#include "gpu_wrapper.tcc"
-
-#endif
-
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/cuda_config.h.in b/RTCP/Cobalt/GPUProc/src/cuda_config.h.in
similarity index 100%
rename from RTCP/Cobalt/GPUProc/src/cuda/cuda_config.h.in
rename to RTCP/Cobalt/GPUProc/src/cuda_config.h.in
diff --git a/RTCP/Cobalt/GPUProc/src/gpu_incl.h b/RTCP/Cobalt/GPUProc/src/gpu_incl.h
index 4884cbd3c55..efb6c8185bb 100644
--- a/RTCP/Cobalt/GPUProc/src/gpu_incl.h
+++ b/RTCP/Cobalt/GPUProc/src/gpu_incl.h
@@ -25,17 +25,7 @@
 #ifndef LOFAR_GPUPROC_GPU_INCL_H
 #define LOFAR_GPUPROC_GPU_INCL_H
 
-#if defined (USE_CUDA) && defined (USE_OPENCL)
-# error "Either CUDA or OpenCL must be enabled, not both"
-#endif
-
-#if defined (USE_CUDA)
-# include "cuda/gpu_incl.h"
-#elif defined (USE_OPENCL)
-# include "opencl/gpu_incl.h"
-#else
-# error "Either CUDA or OpenCL must be enabled, not neither"
-#endif
+#include <cuda.h>
 
 #endif
 
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/gpu_utils.cc b/RTCP/Cobalt/GPUProc/src/gpu_utils.cc
similarity index 100%
rename from RTCP/Cobalt/GPUProc/src/cuda/gpu_utils.cc
rename to RTCP/Cobalt/GPUProc/src/gpu_utils.cc
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.cc b/RTCP/Cobalt/GPUProc/src/gpu_wrapper.cc
similarity index 100%
rename from RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.cc
rename to RTCP/Cobalt/GPUProc/src/gpu_wrapper.cc
diff --git a/RTCP/Cobalt/GPUProc/src/gpu_wrapper.h b/RTCP/Cobalt/GPUProc/src/gpu_wrapper.h
index 640e89ccbc5..6f19a4f10ee 100644
--- a/RTCP/Cobalt/GPUProc/src/gpu_wrapper.h
+++ b/RTCP/Cobalt/GPUProc/src/gpu_wrapper.h
@@ -19,41 +19,612 @@
 //#
 //# $Id$
 
-// \file
-// Wrapper classes for GPU types.
+// \file gpu_wrapper.h
+// C++ wrappers for CUDA akin to the OpenCL C++ wrappers.
+// Uses the "Pimpl" idiom for resource managing classes (i.e. classes that need
+// to control copying and have a non-trivial destructor). For more info, see
+// http://www.boost.org/doc/libs/release/libs/smart_ptr/sp_techniques.html#pimpl
+// Not Pimpl-ed are class Platform, Device, and Function.
+// These are also passed by value.
 
 #ifndef LOFAR_GPUPROC_GPU_WRAPPER_H
 #define LOFAR_GPUPROC_GPU_WRAPPER_H
 
-#if defined (USE_CUDA) && defined (USE_OPENCL)
-# error "Either CUDA or OpenCL must be enabled, not both"
-#endif
+#include <cstddef>
+#include <string>
+#include <vector>
+#include <map>
+#include <iosfwd>
+
+#include <boost/shared_ptr.hpp>
+#include "gpu_incl.h" // ideally, this goes into the .cc, but too much leakage
+#include <cufft.h>
 
 #include <Common/Exception.h>
+// (self-include removed: the old cuda/gpu_wrapper.h pulled GPUException from <GPUProc/gpu_wrapper.h>; it is declared below now)
+
+#if CUDA_VERSION < 4020
+typedef int CUsharedconfig;
+#endif
 
 namespace LOFAR
 {
   namespace Cobalt
   {
+    class PerformanceCounter;
+
     namespace gpu
     {
       // Exception class for GPU errors.
       EXCEPTION_CLASS(GPUException, LOFAR::Exception);
 
-    } // namespace gpu
+      // Exception class for CUDA errors.
+      EXCEPTION_CLASS(CUDAException, GPUException);
 
-  } // namespace Cobalt
+      // Return the cuFFT error string associated with \a errcode.
+      std::string cufftErrorMessage(cufftResult errcode);
 
-} // namespace LOFAR
+      // Return the CUDA error string associated with \a errcode.
+      std::string errorMessage(CUresult errcode);
 
+      // Struct representing a CUDA Grid, which is similar to the @c dim3 type
+      // in the CUDA Runtime API.
+      struct Grid
+      {
+        Grid(unsigned int x_ = 1, unsigned int y_ = 1, unsigned int z_ = 1);
+        unsigned int x;
+        unsigned int y;
+        unsigned int z;
+        friend std::ostream& operator<<(std::ostream& os, const Grid& grid);
+      };
 
-#if defined (USE_CUDA)
-# include "cuda/gpu_wrapper.h"
-#elif defined (USE_OPENCL)
-# include "opencl/gpu_wrapper.h"
-#else
-# error "Either CUDA or OpenCL must be enabled, not neither"
-#endif
+      // Struct representing a CUDA Block, which is similar to the @c dim3 type
+      // in the CUDA Runtime API.
+      //
+      // @invariant x > 0, y > 0, z > 0
+      struct Block
+      {
+        Block(unsigned int x_ = 1, unsigned int y_ = 1, unsigned int z_ = 1);
+        unsigned int x;
+        unsigned int y;
+        unsigned int z;
+        friend std::ostream& operator<<(std::ostream& os, const Block& block);
+      };
+
+      // Struct containing kernel launch configuration.
+      struct ExecConfig
+      {
+        ExecConfig(Grid gr = Grid(), Block bl = Block(), size_t dynShMem = 0);
+        Grid   grid;
+        Block  block;
+        size_t dynSharedMemSize;
+        friend std::ostream& operator<<(std::ostream& os,
+                                        const ExecConfig& execConfig);
+      };
+
+
+      // Forward declaration needed by Platform::devices.
+      class Device;
+
+      // This class is not strictly needed, because in CUDA there's only one
+      // platform, but it hides the CUDA calls and makes it similar to OpenCL.
+      class Platform
+      {
+      public:
+        // Initialize the CUDA platform.
+        // \param flags must be 0 (at least up till CUDA 5.0).
+        Platform(unsigned int flags = 0);
+
+        // The CUDA version (e.g. 5.0 -> 5000).
+        int version() const;
+
+        // Returns the number of devices in the CUDA platform.
+        size_t size() const;
+
+        // Returns a vector of all devices in the CUDA platform.
+        std::vector<Device> devices() const;
+
+        // Returns the name of the CUDA platform. (currently, "NVIDIA CUDA")
+        std::string getName() const;
+
+        // Return the maximum number of threads per block, that
+        // is supported by all devices on the platform.
+        //
+        // Hardware dependent.
+        // - Returns at least 512 (except for ancient hardware)
+        // - Returns 1024 for K10 (= Cobalt hardware)
+        unsigned getMaxThreadsPerBlock() const;
+      };
+
+      // Wrap a CUDA Device.
+      class Device
+      {
+      public:
+        // Create a device.
+        // \param ordinal is the device number;
+        //        valid range: [0, Platform.size()-1]
+        Device(int ordinal = 0);
+
+        // Order Devices by PCI ID (used in std::sort)
+        bool operator<(const Device &other) const;
+
+        // Return the name of the device in human readable form.
+        std::string getName() const;
+
+        // Return the compute capability (major)
+        unsigned getComputeCapabilityMajor() const;
+
+        // Return the compute capability (minor)
+        unsigned getComputeCapabilityMinor() const;
+
+        // Return the total amount of global memory, in bytes
+        size_t getTotalGlobalMem() const;
+
+        // Return the maximum amount of shared memory per block
+        size_t getBlockSharedMem() const;
+
+        // Return the total amount of constant memory
+        size_t getTotalConstMem() const;
+
+        // Return the PCI ID (bus:device) of this GPU
+        std::string pciId() const;
+
+        // Return the maximum number of threads per block
+        //
+        // Hardware dependent.
+        // - Returns at least 512 (except for ancient hardware)
+        // - Returns 1024 for K10 (= Cobalt hardware)
+        unsigned getMaxThreadsPerBlock() const;
+
+        // Return the maximum dimensions of a block of threads.
+        struct Block getMaxBlockDims() const;
+
+        // Return the maximum dimensions of a grid of blocks.
+        struct Grid getMaxGridDims() const;
+
+        // Return the number of multi-processors.
+        unsigned getMultiProcessorCount() const;
+
+        // Return the maximum number of threads that can be
+        // resident on a multi-processor.
+        unsigned getMaxThreadsPerMultiProcessor() const;
+
+        // Return information on a specific \a attribute.
+        // \param attribute CUDA device attribute
+        int getAttribute(CUdevice_attribute attribute) const;
+
+      private:
+        // Context needs access to our \c _device to create a context.
+        friend class Context;
+
+        // The CUDA device.
+        CUdevice _device;
+      };
+
+
+      // Wrap a CUDA Context. Since this class manages a resource (a CUDA
+      // context), it uses the pimpl idiom in combination with a reference
+      // counted pointer to make it copyable.
+      //
+      // We do not tie any context to any thread by default -- all contexts
+      // are `floating', and are to be tied to a thread only by pushing them
+      // as the current context, performing operation(s), and popping them
+      // from the current context stack. The pushing and popping is automated
+      // in the ScopedCurrentContext class.
+      class Context
+      {
+      public:
+        // Create a new CUDA context and associate it with the calling thread.
+        // In other words, \c setCurrent() is implied.
+        //
+        // Flags:
+        //
+        //    CU_CTX_SCHED_AUTO:
+        //        The default value if the flags parameter is zero, uses a
+        //        heuristic based on the number of active CUDA contexts in the
+        //        process C and the number of logical processors in the system P.
+        //        If C > P, then CUDA will yield to other OS threads when waiting
+        //        for the GPU, otherwise CUDA will not yield while waiting for
+        //        results and actively spin on the processor.
+        //    CU_CTX_SCHED_SPIN:
+        //        Instruct CUDA to actively spin when waiting for results from the GPU.
+        //        This can decrease latency when waiting for the GPU, but may lower
+        //        the performance of CPU threads if they are performing work in parallel
+        //        with the CUDA thread.
+        //    CU_CTX_SCHED_YIELD:
+        //        Instruct CUDA to yield its thread when waiting for results from the GPU.
+        //        This can increase latency when waiting for the GPU, but can increase
+        //        the performance of CPU threads performing work in parallel with the GPU.
+        //    CU_CTX_SCHED_BLOCKING_SYNC:
+        //        Instruct CUDA to block the CPU thread on a synchronization primitive
+        //        when waiting for the GPU to finish work.
+        Context(const Device &device, unsigned int flags = CU_CTX_SCHED_YIELD);
+
+        // Returns the device associated to this context.
+        Device getDevice() const;
+
+        // Set the cache configuration for kernel launches in this context.
+        void setCacheConfig(CUfunc_cache config) const;
+
+        // Set the shared memory configuration for kernel launches in this context.
+        void setSharedMemConfig(CUsharedconfig config) const;
+
+      private:
+        // Non-copyable implementation class.
+        class Impl;
+
+        // Reference counted pointer to the implementation class.
+        boost::shared_ptr<Impl> _impl;
+
+        friend class ScopedCurrentContext;
+      };
+
+
+      // Make a certain context the current one for a certain scope.
+      class ScopedCurrentContext
+      {
+      public:
+        ScopedCurrentContext( const Context &context );
+        ~ScopedCurrentContext();
+
+      private:
+        const Context &_context;
+      };
+
+
+      // Wrap CUDA Host Memory. This is the equivalent of an OpenCL Buffer.
+      // CUDA distinguishes between host- and device memory, OpenCL does not.
+      class HostMemory
+      {
+      public:
+        // Allocate \a size bytes of host memory.
+        // \param context CUDA context associated with this HostMemory object.
+        // \param size number of bytes to allocate
+        // \param flags affect allocation
+        // \note To create pinned memory, we need to set
+        // \code
+        // flags = CU_MEMHOSTALLOC_PORTABLE
+        // \endcode
+        // \note For input buffers we may consider setting
+        // \code
+        // flags = CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_WRITECOMBINED
+        // \endcode
+        // Please refer to the documentation of the function \c cuMemHostAlloc()
+        // in the CUDA Driver API for details.
+        HostMemory(const Context &context, size_t size, unsigned int flags = 0);
+
+        // Return a pointer to the actual memory.
+        // \warning The returned pointer shall not have a lifetime beyond the
+        // lifetime of this object (actually the last copy).
+        template <typename T>
+        T *get() const;
+
+        // Return the size of this memory block.
+        size_t size() const;
+
+      private:
+        // Get a void pointer to the actual memory from our Impl class. This
+        // method is only used by our templated get() method.
+        void* getPtr() const;
+
+        // Non-copyable implementation class.
+        class Impl;
+
+        // Reference counted pointer to the implementation class.
+        boost::shared_ptr<Impl> _impl;
+      };
+
+
+      // Wrap CUDA Device Memory. This is the equivalent of an OpenCL
+      // Buffer. CUDA distinguishes between host- and device memory,
+      // OpenCL does not.
+      class DeviceMemory
+      {
+      public:
+        // Allocate \a size bytes of device memory.
+        DeviceMemory(const Context &context, size_t size);
+
+        // Return a device pointer as a handle to the memory.
+        void *get() const;
+
+        // Fill the first \a n bytes of memory with the constant byte \a uc.
+        // \param uc Constant byte value to put into memory
+        // \param n  Number of bytes to set. Defaults to the complete block.
+        //           If \a n is larger than the current memory block size, then
+        //           the complete block will be set to \a uc.
+        void set(unsigned char uc, size_t n = (size_t)-1) const;
+
+        // Return the size of this memory block.
+        size_t size() const;
+
+        // Fetch the contents of this buffer in a new HostMemory buffer.
+        HostMemory fetch() const;
+
+      private:
+        // Function needs access to our device ptr location to set this as a kernel arg.
+        friend class Function;
+
+        // Non-copyable implementation class.
+        class Impl;
+
+        // Reference counted pointer to the implementation class.
+        boost::shared_ptr<Impl> _impl;
+      };
+
+
+      // Wrap a CUDA Module. This is the equivalent of a OpenCL Program.
+      class Module
+      {
+      public:
+        typedef std::map<CUjit_option, void*> optionmap_t;
+
+        Module(); // TODO: tmp, as long as CorrelatorPipelinePrograms needs a default init
+
+        // Load the module in the file \a fname into the given \a context. The
+        // file should be a \e cubin file or a \e ptx file as output by \c nvcc.
+        // \param context CUDA context associated with this Module object.
+        // \param fname name of a module file
+        // \note For details, please refer to the documentation of \c
+        // cuModuleLoad in the CUDA Driver API.
+        Module(const Context &context, const std::string &fname);
+
+        // Load the module pointed to by \a image into the given \a context. The
+        // pointer may point to a null-terminated string containing \e cubin or
+        // \e ptx code.
+        // \param context CUDA context associated with this Module object.
+        // \param image pointer to a module image in memory
+        // \note For details, please refer to the documentation of \c
+        // cuModuleLoadData in the CUDA Driver API.
+        Module(const Context &context, const void *image);
+
+        // Load the module pointed to by \a image into the given \a context. The
+        // pointer may point to a null-terminated string containing \e cubin or
+        // \e ptx code.
+        // \param context CUDA context associated with this Module object.
+        // \param image pointer to a module image in memory
+        // \param options map of \c CUjit_option items, with their associated
+        // values.
+        // \note All values are cast to void*, so if an option requires
+        // an unsigned int as value, the unsigned int's value itself is cast to void*!
+        // \note For details, please refer to the documentation of \c
+        // cuModuleLoadDataEx in the CUDA Driver API.
+        Module(const Context &context, const void *image, optionmap_t &options);
+
+        // Return the Context in which this Module was created.
+        Context getContext() const;
+
+      private:
+        // Function needs access to our module to create a function.
+        friend class Function;
+
+        // Non-copyable implementation class.
+        class Impl;
+
+        // Reference counted pointer to the implementation class.
+        boost::shared_ptr<Impl> _impl;
+      };
+
+      // Wrap a CUDA Device Function. This is the equivalent of an OpenCL
+      // Kernel.
+      class Function
+      {
+      public:
+        // Construct a function object by looking up the function \a name in the
+        // module \a module.
+        Function(const Module &module, const std::string &name);
+
+        // Return the name of the function.
+        std::string name() const;
+
+        // Set kernel immediate argument number \a index to \a val.
+        // \a val must outlive kernel execution.
+        // Not for device memory objects (be it as DeviceMemory or as void *).
+        template <typename T>
+        void setArg(size_t index, const T &val);
+
+        // Set kernel DeviceMemory object argument number \a index to \a mem.
+        // \a mem must outlive kernel execution.
+        void setArg(size_t index, const DeviceMemory &mem);
+
+        // Set pointer to kernel device memory object (as void *) number \a index
+        // to \a val. \a *val must outlive kernel execution.
+        // Note: Prefer to use setArg() passing a DeviceMemory ref over this overload.
+        void setArg(size_t index, const void **val);
+
+        // Return information about a function.
+        // \note For details on valid values for \a attribute, please refer to
+        // the documentation of cuFuncGetAttribute in the CUDA Driver API.
+        int getAttribute(CUfunction_attribute attribute) const;
+
+        // Set the shared memory configuration for a device function.
+        // \note For details on valid values for \a config, please refer to the
+        // documentation of cuFuncSetSharedMemConfig in the CUDA Driver API.
+        void setSharedMemConfig(CUsharedconfig config) const;
+
+      protected:
+        const Context _context;
+
+      private:
+        // Keep the Module alive, because Function actually wraps a pointer
+        // to a function within the Module.
+        const Module _module;
+
+        // The name of the function, for error reporting purposes
+        const std::string _name;
+
+        // Stream needs access to our CUDA function to launch a kernel.
+        friend class Stream;
+
+        // CUDA function.
+        CUfunction _function;
+
+        // Function arguments as set.
+        std::vector<const void *> _kernelArgs;
+
+        // Helper function to modify _kernelArgs.
+        void doSetArg(size_t index, const void *argp);
+
+        // Do not use. To guard against passing pointers.
+        // Note that even device void * cannot be passed, because we need its
+        // address with a life time longer than this formal parameter.
+        template<typename T>
+        void setArg(size_t index, const T *&); // intentionally not impl.
+
+        // Do not use. To guard against passing HostMemory references to kernels.
+        void setArg(size_t index, const HostMemory &); // intentionally not impl.
+
+        // Do not use. To guard against passing HostMemory pointers to kernels.
+        void setArg(size_t index, const HostMemory *); // intentionally not impl.
+      };
+
+      // Wrap a CUDA Event. This is the equivalent of an OpenCL Event.
+      class Event
+      {
+      public:
+        // Construct a CUDA event. This class manages a resource (a CUDA event)
+        // and is therefore implemented using the pimpl idiom, using a reference
+        // counted pointer to a non-copyable implementation class.
+        // \note For details on valid values for \a flags, please refer to the
+        // documentation of cuEventCreate in the CUDA Driver API.
+        Event(const Context &context, unsigned int flags = CU_EVENT_DEFAULT);
+
+        // Return the elapsed time in milliseconds between this event and the \a
+        // second event.
+        float elapsedTime(Event &second) const;
+
+        // Wait until all work preceding this event in the same stream has
+        // completed.
+        void wait();
+
+      private:
+        // Stream needs access to our CUDA event to wait for and record events.
+        friend class Stream;
+
+        // Non-copyable implementation class.
+        class Impl;
+
+        // Reference counted pointer to the implementation class.
+        boost::shared_ptr<Impl> _impl;
+      };
+
+
+      // Wrap a CUDA Stream. This is the equivalent of an OpenCL
+      // CommandQueue. This class manages a resource (a CUDA stream) and is
+      // therefore implemented using the pimpl idiom, using a reference counted
+      // pointer to a non-copyable implementation class.
+      class Stream
+      {
+      public:
+        // Create a stream.
+        // \param flags must be 0 for CUDA < 5.0
+        // \param context CUDA context associated with this Stream object.
+        // \note For details on valid values for \a flags, please refer to the
+        // documentation of \c cuStreamCreate in the CUDA Driver API.
+        explicit Stream(const Context &context, unsigned int flags = 0);  // named CU_STREAM_DEFAULT (0) since CUDA 5.0
+
+        // Transfer data from host memory \a hostMem to device memory \a devMem.
+        // \param devMem Device memory that will be copied to.
+        // \param hostMem Host memory that will be copied from.
+        // \param synchronous Indicates whether the transfer must be done
+        //        synchronously or asynchronously.
+        void writeBuffer(const DeviceMemory &devMem, const HostMemory &hostMem,
+                         bool synchronous = false) const;
+
+        // Transfer data from host memory \a hostMem to device memory \a devMem.
+        // When gpuProfiling is enabled this transfer is synchronous
+        // \param devMem Device memory that will be copied to.
+        // \param hostMem Host memory that will be copied from.
+        // \param counter PerformanceCounter that will receive transfer duration
+        //        if gpuProfiling is enabled
+        // \param synchronous Indicates whether the transfer must be done
+        //        synchronously or asynchronously. Default == false
+        void writeBuffer(const DeviceMemory &devMem, const HostMemory &hostMem,
+                         PerformanceCounter &counter, bool synchronous = false) const;
+
+        // Transfer data from device memory \a devMem to host memory \a hostMem.
+        // \param hostMem Host memory that will be copied to.
+        // \param devMem Device memory that will be copied from.
+        // \param synchronous Indicates whether the transfer must be done
+        //        synchronously or asynchronously.
+        void readBuffer(const HostMemory &hostMem, const DeviceMemory &devMem,
+                        bool synchronous = false) const;
+
+        // Transfer data from device memory \a devMem to host memory \a hostMem.
+        // When gpuProfiling is enabled this transfer is synchronous
+        // \param hostMem Host memory that will be copied to.
+        // \param devMem Device memory that will be copied from.
+        // \param counter PerformanceCounter that will receive transfer duration
+        //        if gpuProfiling is enabled
+        // \param synchronous Indicates whether the transfer must be done
+        //        synchronously or asynchronously. Default == false
+        void readBuffer(const HostMemory &hostMem, const DeviceMemory &devMem,
+                        PerformanceCounter &counter, bool synchronous = false) const;
+
+        // Transfer data from device memory \a devSource to device memory \a devTarget.
+        // \param devTarget Device memory that will be copied to.
+        // \param devSource Device memory that will be copied from.
+        // \param synchronous Indicates whether the transfer must be done
+        //        synchronously or asynchronously.
+        void copyBuffer(const DeviceMemory &devTarget, const DeviceMemory &devSource,
+                        bool synchronous = false) const;
+
+        // Transfer data from device memory \a devSource to device memory \a devTarget.
+        // When gpuProfiling is enabled this transfer is synchronous
+        // \param devTarget Device memory that will be copied to.
+        // \param devSource Device memory that will be copied from.
+        // \param counter PerformanceCounter that will receive transfer duration
+        //        if gpuProfiling is enabled
+        // \param synchronous Indicates whether the transfer must be done
+        //        synchronously or asynchronously. Defaults to \c false
+        //        (asynchronously).
+        void copyBuffer(const DeviceMemory &devTarget, const DeviceMemory &devSource,
+                        PerformanceCounter &counter, bool synchronous = false) const;
+
+        // Launch a CUDA function.
+        // \param function object containing the function to launch
+        // \param grid Grid size (in terms of blocks (not threads (OpenCL)))
+        // \param block Block (thread group) size
+        void launchKernel(const Function &function,
+                          const Grid &grid, const Block &block) const;
+
+        // Check if all operations on this stream have completed.
+        // \return true if all completed, or false otherwise.
+        bool query() const;
+
+        // Wait until this stream's tasks are completed.
+        void synchronize() const;
+
+        // Let this stream wait on the event \a event.
+        void waitEvent(const Event &event) const;
+
+        // Record the event \a event for this stream.
+        void recordEvent(const Event &event) const;
+
+        // Return the underlying CUDA stream. TODO: try to get rid of CUstream here: FFT thing to here or make it friend
+        CUstream get() const;
+
+        // Returns the context associated with the underlying CUDA stream.
+        Context getContext() const; // TODO: consider using this in the SubbandProcs (now has Stream and Context stored)
+
+        // Return whether this stream mandates synchronous behaviour
+        bool isSynchronous() const;
+
+      private:
+        // Non-copyable implementation class.
+        class Impl;
+
+        // Reference counted pointer to the implementation class.
+        boost::shared_ptr<Impl> _impl;
+
+        // Force synchronous transfers and kernel launches
+        bool force_synchronous;
+      };
+
+    } // namespace gpu
+  } // namespace Cobalt
+} // namespace LOFAR
+
+#include "gpu_wrapper.tcc"
 
 #endif
 
diff --git a/RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.tcc b/RTCP/Cobalt/GPUProc/src/gpu_wrapper.tcc
similarity index 100%
rename from RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.tcc
rename to RTCP/Cobalt/GPUProc/src/gpu_wrapper.tcc
diff --git a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCorrelatorStep.cc b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCorrelatorStep.cc
index 6f78be3eb99..eb1850b2e75 100644
--- a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCorrelatorStep.cc
+++ b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCorrelatorStep.cc
@@ -22,7 +22,7 @@
 
 #include <CoInterface/BudgetTimer.h>
 #include <GPUProc/SubbandProcs/CorrelatorStep.h>
-#include <GPUProc/cuda/gpu_wrapper.h>
+#include <GPUProc/gpu_wrapper.h>
 
 #include <UnitTest++.h>
 #include <iostream>
diff --git a/RTCP/Cobalt/GPUProc/test/cuda/tFFT_leakage.cc b/RTCP/Cobalt/GPUProc/test/cuda/tFFT_leakage.cc
index 9571fe3438d..319ab116ac1 100644
--- a/RTCP/Cobalt/GPUProc/test/cuda/tFFT_leakage.cc
+++ b/RTCP/Cobalt/GPUProc/test/cuda/tFFT_leakage.cc
@@ -37,7 +37,7 @@
 #include <CoInterface/Parset.h>
 #include <GPUProc/FilterBank.h>
 #include <GPUProc/SubbandProcs/CorrelatorSubbandProc.h>
-#include <GPUProc/cuda/Pipelines/Pipeline.h>
+#include <GPUProc/Pipelines/Pipeline.h>
 #include <GPUProc/gpu_utils.h>
 #include <GPUProc/gpu_wrapper.h>
 
diff --git a/RTCP/Cobalt/GPUProc/test/cuda/tGPUWrapper.cc b/RTCP/Cobalt/GPUProc/test/cuda/tGPUWrapper.cc
index e0d1b940506..1e1bb0f2ca3 100644
--- a/RTCP/Cobalt/GPUProc/test/cuda/tGPUWrapper.cc
+++ b/RTCP/Cobalt/GPUProc/test/cuda/tGPUWrapper.cc
@@ -28,7 +28,7 @@
 
 #include <Common/LofarLogger.h>
 #include <GPUProc/gpu_wrapper.h>
-#include <GPUProc/cuda/PerformanceCounter.h>
+#include <GPUProc/PerformanceCounter.h>
 #include <UnitTest++.h>
 
 using namespace std;
-- 
GitLab