From 0f7e584913fe015013db51a3e29b22d733f2e664 Mon Sep 17 00:00:00 2001 From: Jorrit Schaap <schaap@astron.nl> Date: Fri, 15 Mar 2019 07:50:49 +0000 Subject: [PATCH] COB-60: moved remaining cuda files up one level to the one-and-only implementation dir. --- .../CobaltTest/test/tManyPartTABOutput.cc | 2 +- .../CobaltTest/test/tMultiPartTABOutput.cc | 2 +- RTCP/Cobalt/GPUProc/src/CMakeLists.txt | 14 +- .../GPUProc/src/{cuda => }/KernelFactory.cc | 0 RTCP/Cobalt/GPUProc/src/KernelFactory.h | 133 +++- .../GPUProc/src/MultiDimArrayHostBuffer.h | 53 +- .../src/{cuda => }/PerformanceCounter.cc | 0 RTCP/Cobalt/GPUProc/src/PerformanceCounter.h | 56 +- RTCP/Cobalt/GPUProc/src/cuda/CMakeLists.txt | 7 - RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h | 140 ---- .../src/cuda/MultiDimArrayHostBuffer.h | 62 -- .../GPUProc/src/cuda/PerformanceCounter.h | 65 -- RTCP/Cobalt/GPUProc/src/cuda/gpu_incl.h | 33 - RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.h | 627 ------------------ .../GPUProc/src/{cuda => }/cuda_config.h.in | 0 RTCP/Cobalt/GPUProc/src/gpu_incl.h | 12 +- .../GPUProc/src/{cuda => }/gpu_utils.cc | 0 .../GPUProc/src/{cuda => }/gpu_wrapper.cc | 0 RTCP/Cobalt/GPUProc/src/gpu_wrapper.h | 601 ++++++++++++++++- .../GPUProc/src/{cuda => }/gpu_wrapper.tcc | 0 .../test/SubbandProcs/tCorrelatorStep.cc | 2 +- RTCP/Cobalt/GPUProc/test/cuda/tFFT_leakage.cc | 2 +- RTCP/Cobalt/GPUProc/test/cuda/tGPUWrapper.cc | 2 +- 23 files changed, 794 insertions(+), 1019 deletions(-) rename RTCP/Cobalt/GPUProc/src/{cuda => }/KernelFactory.cc (100%) rename RTCP/Cobalt/GPUProc/src/{cuda => }/PerformanceCounter.cc (100%) delete mode 100644 RTCP/Cobalt/GPUProc/src/cuda/CMakeLists.txt delete mode 100644 RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h delete mode 100644 RTCP/Cobalt/GPUProc/src/cuda/MultiDimArrayHostBuffer.h delete mode 100644 RTCP/Cobalt/GPUProc/src/cuda/PerformanceCounter.h delete mode 100644 RTCP/Cobalt/GPUProc/src/cuda/gpu_incl.h delete mode 100644 RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.h
rename RTCP/Cobalt/GPUProc/src/{cuda => }/cuda_config.h.in (100%) rename RTCP/Cobalt/GPUProc/src/{cuda => }/gpu_utils.cc (100%) rename RTCP/Cobalt/GPUProc/src/{cuda => }/gpu_wrapper.cc (100%) rename RTCP/Cobalt/GPUProc/src/{cuda => }/gpu_wrapper.tcc (100%) diff --git a/RTCP/Cobalt/CobaltTest/test/tManyPartTABOutput.cc b/RTCP/Cobalt/CobaltTest/test/tManyPartTABOutput.cc index d310005bdb5..3cd86fb6639 100644 --- a/RTCP/Cobalt/CobaltTest/test/tManyPartTABOutput.cc +++ b/RTCP/Cobalt/CobaltTest/test/tManyPartTABOutput.cc @@ -30,7 +30,7 @@ #include <Common/LofarLogger.h> #include <CoInterface/Parset.h> #include <GPUProc/Pipelines/Pipeline.h> -#include <GPUProc/cuda/SubbandProcs/SubbandProcOutputData.h> +#include <GPUProc/SubbandProcs/SubbandProcOutputData.h> #include <GPUProc/Station/StationInput.h> #include <GPUProc/Storage/StorageProcesses.h> diff --git a/RTCP/Cobalt/CobaltTest/test/tMultiPartTABOutput.cc b/RTCP/Cobalt/CobaltTest/test/tMultiPartTABOutput.cc index f7cb9a1464c..9177c52d59d 100644 --- a/RTCP/Cobalt/CobaltTest/test/tMultiPartTABOutput.cc +++ b/RTCP/Cobalt/CobaltTest/test/tMultiPartTABOutput.cc @@ -30,7 +30,7 @@ #include <Common/LofarLogger.h> #include <CoInterface/Parset.h> #include <GPUProc/Pipelines/Pipeline.h> -#include <GPUProc/cuda/SubbandProcs/SubbandProcOutputData.h> +#include <GPUProc/SubbandProcs/SubbandProcOutputData.h> #include <GPUProc/Station/StationInput.h> #include <GPUProc/Storage/StorageProcesses.h> diff --git a/RTCP/Cobalt/GPUProc/src/CMakeLists.txt b/RTCP/Cobalt/GPUProc/src/CMakeLists.txt index 8b52611b0b5..a73efffb36a 100644 --- a/RTCP/Cobalt/GPUProc/src/CMakeLists.txt +++ b/RTCP/Cobalt/GPUProc/src/CMakeLists.txt @@ -7,6 +7,11 @@ execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_BINARY_DIR}/include/${PACKAGE_NAME}) +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/cuda_config.h.in" + "${CMAKE_BINARY_DIR}/include/cuda_config.h" # internal, no need to install +) + set(_gpuproc_sources 
#Package__Version.cc BandPass.cc @@ -26,10 +31,10 @@ set(_gpuproc_sources ) list(APPEND _gpuproc_sources - cuda/gpu_wrapper.cc - cuda/gpu_utils.cc - cuda/KernelFactory.cc - cuda/PerformanceCounter.cc + gpu_wrapper.cc + gpu_utils.cc + KernelFactory.cc + PerformanceCounter.cc Kernels/Kernel.cc Kernels/BeamFormerKernel.cc Kernels/BeamFormerTransposeKernel.cc @@ -67,7 +72,6 @@ list(APPEND _gpuproc_sources # SubbandProcs/UHEP_SubbandProc.cc ) -add_subdirectory(cuda) lofar_add_library(gpuproc ${_gpuproc_sources}) if(CUDA_cufft_LIBRARY) diff --git a/RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.cc b/RTCP/Cobalt/GPUProc/src/KernelFactory.cc similarity index 100% rename from RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.cc rename to RTCP/Cobalt/GPUProc/src/KernelFactory.cc diff --git a/RTCP/Cobalt/GPUProc/src/KernelFactory.h b/RTCP/Cobalt/GPUProc/src/KernelFactory.h index 8a3bcef789d..9fc75474dd1 100644 --- a/RTCP/Cobalt/GPUProc/src/KernelFactory.h +++ b/RTCP/Cobalt/GPUProc/src/KernelFactory.h @@ -1,6 +1,6 @@ -//# KernelFactory.h: Factory for Kernel objects. +//# KernelFactory.h //# -//# Copyright (C) 2013 ASTRON (Netherlands Institute for Radio Astronomy) +//# Copyright (C) 2012-2013 ASTRON (Netherlands Institute for Radio Astronomy) //# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands //# //# This file is part of the LOFAR software suite. @@ -19,23 +19,122 @@ //# //# $Id$ -// \file -// Factory for Kernel objects. +#ifndef LOFAR_GPUPROC_CUDA_KERNELFACTORY_H +#define LOFAR_GPUPROC_CUDA_KERNELFACTORY_H -#ifndef LOFAR_GPUPROC_KERNELFACTORY_H -#define LOFAR_GPUPROC_KERNELFACTORY_H +#include <string> +#include <CoInterface/Parset.h> +#include <GPUProc/Kernels/Kernel.h> +#include <GPUProc/gpu_wrapper.h> +#include <GPUProc/gpu_utils.h> -#if defined (USE_CUDA) && defined (USE_OPENCL) -# error "Either CUDA or OpenCL must be enabled, not both" -#endif +namespace LOFAR +{ + namespace Cobalt + { + // Abstract base class of the templated KernelFactory class. 
+ class KernelFactoryBase + { + public: + // Pure virtual destructor, because this is an abstract base class. + virtual ~KernelFactoryBase() = 0; -#if defined (USE_CUDA) -# include "cuda/KernelFactory.h" -#elif defined (USE_OPENCL) -# include "opencl/KernelFactory.h" -#else -# error "Either CUDA or OpenCL must be enabled, not neither" -#endif + protected: + // Return compile definitions to use when creating PTX code for any + // Kernel. + CompileDefinitions + compileDefinitions(const Kernel::Parameters& param) const; -#endif + // Return compile flags to use when creating PTX code for any Kernel. + CompileFlags + compileFlags(const Kernel::Parameters& param) const; + }; + + // Declaration of a generic factory class. For each concrete Kernel class + // (e.g. FIR_FilterKernel), a specialization must exist of the constructor + // and of the bufferSize() method. + template<typename T> class KernelFactory : public KernelFactoryBase + { + public: + // typedef typename T::Parameters Parameters; + typedef typename T::BufferType BufferType; + typedef typename T::Buffers Buffers; + + // Construct a factory for creating Kernel objects of type \c T, using the + // settings provided by \a params. + KernelFactory(const typename T::Parameters ¶ms) : + itsParameters(params), + itsPTX(_createPTX()) + { + } + + // Create a new Kernel object of type \c T. + T* create(const gpu::Stream& stream, + gpu::DeviceMemory &inputBuffer, + gpu::DeviceMemory &outputBuffer) const + { + const typename T::Buffers buffers(inputBuffer, outputBuffer); + + return create(stream, buffers); + } + + // Return required buffer size for \a bufferType + size_t bufferSize(BufferType bufferType) const + { + return itsParameters.bufferSize(bufferType); + } + + private: + // Used by the constructors to construct the PTX from the other + // members. + std::string _createPTX() const { + return createPTX(T::theirSourceFile, + compileDefinitions(), + compileFlags()); + } + // Create a new Kernel object of type \c T. 
+ T* create(const gpu::Stream& stream, + const typename T::Buffers& buffers) const + { + // Since we use overlapping input/output buffers, their size + // could be larger than we need. + ASSERTSTR(buffers.input.size() >= bufferSize(T::INPUT_DATA), + "Require " << bufferSize(T::INPUT_DATA) << " bytes for input, " + "but buffer is only " << buffers.input.size() << " bytes."); + ASSERTSTR(buffers.output.size() >= bufferSize(T::OUTPUT_DATA), + "Require " << bufferSize(T::OUTPUT_DATA) << " bytes for output, " + "but buffer is only " << buffers.output.size() << " bytes."); + + return new T( + stream, createModule(stream.getContext(), + T::theirSourceFile, + itsPTX), + buffers, itsParameters); + } + + // Return compile definitions to use when creating PTX code for kernels of + // type \c T, using the parameters stored in \c itsParameters. + CompileDefinitions compileDefinitions() const { + return KernelFactoryBase::compileDefinitions(itsParameters); + } + + // Return compile flags to use when creating PTX code for kernels of type + // \c T. + CompileFlags compileFlags() const { + return KernelFactoryBase::compileFlags(itsParameters); + } + + // Additional parameters needed to create a Kernel object of type \c T. + typename T::Parameters itsParameters; + + // PTX code, generated for kernels of type \c T, using information in the + // Parset that was passed to the constructor. + std::string itsPTX; + }; + + } // namespace Cobalt + +} // namespace LOFAR + +#endif diff --git a/RTCP/Cobalt/GPUProc/src/MultiDimArrayHostBuffer.h b/RTCP/Cobalt/GPUProc/src/MultiDimArrayHostBuffer.h index dee99d41fa5..dafb5ba201a 100644 --- a/RTCP/Cobalt/GPUProc/src/MultiDimArrayHostBuffer.h +++ b/RTCP/Cobalt/GPUProc/src/MultiDimArrayHostBuffer.h @@ -1,6 +1,5 @@ //# MultiDimArrayHostBuffer.h -//# -//# Copyright (C) 2013 ASTRON (Netherlands Institute for Radio Astronomy) +//# Copyright (C) 2012-2013 ASTRON (Netherlands Institute for Radio Astronomy) //# P.O. 
Box 2, 7990 AA Dwingeloo, The Netherlands //# //# This file is part of the LOFAR software suite. @@ -19,23 +18,45 @@ //# //# $Id$ -// \file -// Support for our multi-dim array-ed GPU host buffer. +#ifndef LOFAR_GPUPROC_CUDA_MULTI_DIM_ARRAY_HOST_BUFFER_H +#define LOFAR_GPUPROC_CUDA_MULTI_DIM_ARRAY_HOST_BUFFER_H -#ifndef LOFAR_GPUPROC_MULTI_DIM_ARRAY_HOST_BUFFER_H -#define LOFAR_GPUPROC_MULTI_DIM_ARRAY_HOST_BUFFER_H +#include <CoInterface/MultiDimArray.h> -#if defined (USE_CUDA) && defined (USE_OPENCL) -# error "Either CUDA or OpenCL must be enabled, not both" -#endif +#include "gpu_wrapper.h" -#if defined (USE_CUDA) -# include "cuda/MultiDimArrayHostBuffer.h" -#elif defined (USE_OPENCL) -# include "opencl/MultiDimArrayHostBuffer.h" -#else -# error "Either CUDA or OpenCL must be enabled, not neither" -#endif +namespace LOFAR +{ + namespace Cobalt + { + + // A MultiDimArray allocated as a HostBuffer + // Note: Elements are not constructed/destructed. + template <typename T, unsigned DIM> + class MultiDimArrayHostBuffer : public gpu::HostMemory, + public MultiDimArray<T, DIM> + { + public: + template <typename ExtentList> + MultiDimArrayHostBuffer(const ExtentList &extents, const gpu::Context &context, + unsigned int flags = 0) + : + HostMemory(context, MultiDimArray<T, DIM>::nrElements(extents) * sizeof(T), flags), + MultiDimArray<T, DIM>(extents, gpu::HostMemory::get<T>(), false) + { + } + + using HostMemory::size; + + private: + MultiDimArrayHostBuffer(); // don't use + MultiDimArrayHostBuffer(const MultiDimArrayHostBuffer<T, DIM> &rhs); // don't use + MultiDimArrayHostBuffer<T, DIM> &operator=(const MultiDimArrayHostBuffer<T, DIM> &rhs); // don't use + using MultiDimArray<T, DIM>::resize; // don't use + }; + + } // namespace Cobalt +} // namespace LOFAR #endif diff --git a/RTCP/Cobalt/GPUProc/src/cuda/PerformanceCounter.cc b/RTCP/Cobalt/GPUProc/src/PerformanceCounter.cc similarity index 100% rename from RTCP/Cobalt/GPUProc/src/cuda/PerformanceCounter.cc rename to 
RTCP/Cobalt/GPUProc/src/PerformanceCounter.cc diff --git a/RTCP/Cobalt/GPUProc/src/PerformanceCounter.h b/RTCP/Cobalt/GPUProc/src/PerformanceCounter.h index d7bca1ca404..5748b5f4d19 100644 --- a/RTCP/Cobalt/GPUProc/src/PerformanceCounter.h +++ b/RTCP/Cobalt/GPUProc/src/PerformanceCounter.h @@ -1,6 +1,5 @@ //# PerformanceCounter.h -//# -//# Copyright (C) 2013 ASTRON (Netherlands Institute for Radio Astronomy) +//# Copyright (C) 2012-2013 ASTRON (Netherlands Institute for Radio Astronomy) //# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands //# //# This file is part of the LOFAR software suite. @@ -19,23 +18,48 @@ //# //# $Id$ -// \file -// Support GPU kernel performance timing. +#ifndef LOFAR_GPUPROC_CUDA_PERFORMANCECOUNTER_H +#define LOFAR_GPUPROC_CUDA_PERFORMANCECOUNTER_H -#ifndef LOFAR_PERFORMANCE_COUNTER_H -#define LOFAR_PERFORMANCE_COUNTER_H -#if defined (USE_CUDA) && defined (USE_OPENCL) -# error "Either CUDA or OpenCL must be enabled, not both" -#endif +#include <GPUProc/gpu_wrapper.h> +#include <CoInterface/RunningStatistics.h> -#if defined (USE_CUDA) -# include "cuda/PerformanceCounter.h" -#elif defined (USE_OPENCL) -# include "opencl/PerformanceCounter.h" -#else -# error "Either CUDA or OpenCL must be enabled, not neither" -#endif +namespace LOFAR +{ + namespace Cobalt + { + class PerformanceCounter + { + public: + PerformanceCounter(const gpu::Context &context, const std::string &name); + ~PerformanceCounter(); + + void recordStart(const gpu::Stream &stream); + void recordStop(const gpu::Stream &stream); + + // Warning: user must make sure that the counter is not running! + RunningStatistics getStats() { logTime(); return stats; } + + private: + const std::string name; + + // Public event: it needs to be inserted into a stream. 
+ // @{ + gpu::Event start; + gpu::Event stop; + // @} + + // Whether we have posted events that still need to be + // processed in logTime() + bool recording; + + RunningStatistics stats; + + void logTime(); + }; + } +} #endif diff --git a/RTCP/Cobalt/GPUProc/src/cuda/CMakeLists.txt b/RTCP/Cobalt/GPUProc/src/cuda/CMakeLists.txt deleted file mode 100644 index 17eac6967b6..00000000000 --- a/RTCP/Cobalt/GPUProc/src/cuda/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -# $Id$ - -configure_file( - "${CMAKE_CURRENT_SOURCE_DIR}/cuda_config.h.in" - "${CMAKE_BINARY_DIR}/include/cuda_config.h" # internal, no need to install -) - diff --git a/RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h b/RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h deleted file mode 100644 index 9fc75474dd1..00000000000 --- a/RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h +++ /dev/null @@ -1,140 +0,0 @@ -//# KernelFactory.h -//# -//# Copyright (C) 2012-2013 ASTRON (Netherlands Institute for Radio Astronomy) -//# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands -//# -//# This file is part of the LOFAR software suite. -//# The LOFAR software suite is free software: you can redistribute it and/or -//# modify it under the terms of the GNU General Public License as published -//# by the Free Software Foundation, either version 3 of the License, or -//# (at your option) any later version. -//# -//# The LOFAR software suite is distributed in the hope that it will be useful, -//# but WITHOUT ANY WARRANTY; without even the implied warranty of -//# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -//# GNU General Public License for more details. -//# -//# You should have received a copy of the GNU General Public License along -//# with the LOFAR software suite. If not, see <http://www.gnu.org/licenses/>. 
-//# -//# $Id$ - -#ifndef LOFAR_GPUPROC_CUDA_KERNELFACTORY_H -#define LOFAR_GPUPROC_CUDA_KERNELFACTORY_H - -#include <string> -#include <CoInterface/Parset.h> -#include <GPUProc/Kernels/Kernel.h> -#include <GPUProc/gpu_wrapper.h> -#include <GPUProc/gpu_utils.h> - -namespace LOFAR -{ - namespace Cobalt - { - // Abstract base class of the templated KernelFactory class. - class KernelFactoryBase - { - public: - // Pure virtual destructor, because this is an abstract base class. - virtual ~KernelFactoryBase() = 0; - - protected: - // Return compile definitions to use when creating PTX code for any - // Kernel. - CompileDefinitions - compileDefinitions(const Kernel::Parameters& param) const; - - // Return compile flags to use when creating PTX code for any Kernel. - CompileFlags - compileFlags(const Kernel::Parameters& param) const; - }; - - // Declaration of a generic factory class. For each concrete Kernel class - // (e.g. FIR_FilterKernel), a specialization must exist of the constructor - // and of the bufferSize() method. - template<typename T> class KernelFactory : public KernelFactoryBase - { - public: - // typedef typename T::Parameters Parameters; - typedef typename T::BufferType BufferType; - typedef typename T::Buffers Buffers; - - // Construct a factory for creating Kernel objects of type \c T, using the - // settings provided by \a params. - KernelFactory(const typename T::Parameters ¶ms) : - itsParameters(params), - itsPTX(_createPTX()) - { - } - - // Create a new Kernel object of type \c T. - T* create(const gpu::Stream& stream, - gpu::DeviceMemory &inputBuffer, - gpu::DeviceMemory &outputBuffer) const - { - const typename T::Buffers buffers(inputBuffer, outputBuffer); - - return create(stream, buffers); - } - - // Return required buffer size for \a bufferType - size_t bufferSize(BufferType bufferType) const - { - return itsParameters.bufferSize(bufferType); - } - - private: - // Used by the constructors to construct the PTX from the other - // members. 
- std::string _createPTX() const { - return createPTX(T::theirSourceFile, - compileDefinitions(), - compileFlags()); - } - - // Create a new Kernel object of type \c T. - T* create(const gpu::Stream& stream, - const typename T::Buffers& buffers) const - { - // Since we use overlapping input/output buffers, their size - // could be larger than we need. - ASSERTSTR(buffers.input.size() >= bufferSize(T::INPUT_DATA), - "Require " << bufferSize(T::INPUT_DATA) << " bytes for input, " - "but buffer is only " << buffers.input.size() << " bytes."); - ASSERTSTR(buffers.output.size() >= bufferSize(T::OUTPUT_DATA), - "Require " << bufferSize(T::OUTPUT_DATA) << " bytes for output, " - "but buffer is only " << buffers.output.size() << " bytes."); - - return new T( - stream, createModule(stream.getContext(), - T::theirSourceFile, - itsPTX), - buffers, itsParameters); - } - - // Return compile definitions to use when creating PTX code for kernels of - // type \c T, using the parameters stored in \c itsParameters. - CompileDefinitions compileDefinitions() const { - return KernelFactoryBase::compileDefinitions(itsParameters); - } - - // Return compile flags to use when creating PTX code for kernels of type - // \c T. - CompileFlags compileFlags() const { - return KernelFactoryBase::compileFlags(itsParameters); - } - - // Additional parameters needed to create a Kernel object of type \c T. - typename T::Parameters itsParameters; - - // PTX code, generated for kernels of type \c T, using information in the - // Parset that was passed to the constructor. 
- std::string itsPTX; - }; - - } // namespace Cobalt - -} // namespace LOFAR - -#endif diff --git a/RTCP/Cobalt/GPUProc/src/cuda/MultiDimArrayHostBuffer.h b/RTCP/Cobalt/GPUProc/src/cuda/MultiDimArrayHostBuffer.h deleted file mode 100644 index dafb5ba201a..00000000000 --- a/RTCP/Cobalt/GPUProc/src/cuda/MultiDimArrayHostBuffer.h +++ /dev/null @@ -1,62 +0,0 @@ -//# MultiDimArrayHostBuffer.h -//# Copyright (C) 2012-2013 ASTRON (Netherlands Institute for Radio Astronomy) -//# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands -//# -//# This file is part of the LOFAR software suite. -//# The LOFAR software suite is free software: you can redistribute it and/or -//# modify it under the terms of the GNU General Public License as published -//# by the Free Software Foundation, either version 3 of the License, or -//# (at your option) any later version. -//# -//# The LOFAR software suite is distributed in the hope that it will be useful, -//# but WITHOUT ANY WARRANTY; without even the implied warranty of -//# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -//# GNU General Public License for more details. -//# -//# You should have received a copy of the GNU General Public License along -//# with the LOFAR software suite. If not, see <http://www.gnu.org/licenses/>. -//# -//# $Id$ - -#ifndef LOFAR_GPUPROC_CUDA_MULTI_DIM_ARRAY_HOST_BUFFER_H -#define LOFAR_GPUPROC_CUDA_MULTI_DIM_ARRAY_HOST_BUFFER_H - -#include <CoInterface/MultiDimArray.h> - -#include "gpu_wrapper.h" - -namespace LOFAR -{ - namespace Cobalt - { - - // A MultiDimArray allocated as a HostBuffer - // Note: Elements are not constructed/destructed. 
- template <typename T, unsigned DIM> - class MultiDimArrayHostBuffer : public gpu::HostMemory, - public MultiDimArray<T, DIM> - { - public: - template <typename ExtentList> - MultiDimArrayHostBuffer(const ExtentList &extents, const gpu::Context &context, - unsigned int flags = 0) - : - HostMemory(context, MultiDimArray<T, DIM>::nrElements(extents) * sizeof(T), flags), - MultiDimArray<T, DIM>(extents, gpu::HostMemory::get<T>(), false) - { - } - - using HostMemory::size; - - private: - MultiDimArrayHostBuffer(); // don't use - MultiDimArrayHostBuffer(const MultiDimArrayHostBuffer<T, DIM> &rhs); // don't use - MultiDimArrayHostBuffer<T, DIM> &operator=(const MultiDimArrayHostBuffer<T, DIM> &rhs); // don't use - using MultiDimArray<T, DIM>::resize; // don't use - }; - - } // namespace Cobalt -} // namespace LOFAR - -#endif - diff --git a/RTCP/Cobalt/GPUProc/src/cuda/PerformanceCounter.h b/RTCP/Cobalt/GPUProc/src/cuda/PerformanceCounter.h deleted file mode 100644 index 5748b5f4d19..00000000000 --- a/RTCP/Cobalt/GPUProc/src/cuda/PerformanceCounter.h +++ /dev/null @@ -1,65 +0,0 @@ -//# PerformanceCounter.h -//# Copyright (C) 2012-2013 ASTRON (Netherlands Institute for Radio Astronomy) -//# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands -//# -//# This file is part of the LOFAR software suite. -//# The LOFAR software suite is free software: you can redistribute it and/or -//# modify it under the terms of the GNU General Public License as published -//# by the Free Software Foundation, either version 3 of the License, or -//# (at your option) any later version. -//# -//# The LOFAR software suite is distributed in the hope that it will be useful, -//# but WITHOUT ANY WARRANTY; without even the implied warranty of -//# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -//# GNU General Public License for more details. -//# -//# You should have received a copy of the GNU General Public License along -//# with the LOFAR software suite. 
If not, see <http://www.gnu.org/licenses/>. -//# -//# $Id$ - -#ifndef LOFAR_GPUPROC_CUDA_PERFORMANCECOUNTER_H -#define LOFAR_GPUPROC_CUDA_PERFORMANCECOUNTER_H - - -#include <GPUProc/gpu_wrapper.h> -#include <CoInterface/RunningStatistics.h> - -namespace LOFAR -{ - namespace Cobalt - { - class PerformanceCounter - { - public: - PerformanceCounter(const gpu::Context &context, const std::string &name); - ~PerformanceCounter(); - - void recordStart(const gpu::Stream &stream); - void recordStop(const gpu::Stream &stream); - - // Warning: user must make sure that the counter is not running! - RunningStatistics getStats() { logTime(); return stats; } - - private: - const std::string name; - - // Public event: it needs to be inserted into a stream. - // @{ - gpu::Event start; - gpu::Event stop; - // @} - - // Whether we have posted events that still need to be - // processed in logTime() - bool recording; - - RunningStatistics stats; - - void logTime(); - }; - } -} - -#endif - diff --git a/RTCP/Cobalt/GPUProc/src/cuda/gpu_incl.h b/RTCP/Cobalt/GPUProc/src/cuda/gpu_incl.h deleted file mode 100644 index 7e56db7b5b4..00000000000 --- a/RTCP/Cobalt/GPUProc/src/cuda/gpu_incl.h +++ /dev/null @@ -1,33 +0,0 @@ -//# gpu_incl.h: portable CUDA header to mirror OpenCL sources -//# Copyright (C) 2012-2013 ASTRON (Netherlands Institute for Radio Astronomy) -//# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands -//# -//# This file is part of the LOFAR software suite. -//# The LOFAR software suite is free software: you can redistribute it and/or -//# modify it under the terms of the GNU General Public License as published -//# by the Free Software Foundation, either version 3 of the License, or -//# (at your option) any later version. -//# -//# The LOFAR software suite is distributed in the hope that it will be useful, -//# but WITHOUT ANY WARRANTY; without even the implied warranty of -//# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -//# GNU General Public License for more details. -//# -//# You should have received a copy of the GNU General Public License along -//# with the LOFAR software suite. If not, see <http://www.gnu.org/licenses/>. -//# -//# $Id$ - -#ifndef LOFAR_GPUPROC_CUDA_GPU_INCL_H -#define LOFAR_GPUPROC_CUDA_GPU_INCL_H - -// Pointless in itself; to mirror the OpenCL sources -// Note: nvcc automatically includes cuda.h, but for most code we don't need it. - -// CUDA include option(s) -//<none> - -#include <cuda.h> - -#endif - diff --git a/RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.h b/RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.h deleted file mode 100644 index 831a5e05993..00000000000 --- a/RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.h +++ /dev/null @@ -1,627 +0,0 @@ -//# gpu_wrapper.h: CUDA-specific wrapper classes for GPU types. -//# -//# Copyright (C) 2013 ASTRON (Netherlands Institute for Radio Astronomy) -//# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands -//# -//# This file is part of the LOFAR software suite. -//# The LOFAR software suite is free software: you can redistribute it and/or -//# modify it under the terms of the GNU General Public License as published -//# by the Free Software Foundation, either version 3 of the License, or -//# (at your option) any later version. -//# -//# The LOFAR software suite is distributed in the hope that it will be useful, -//# but WITHOUT ANY WARRANTY; without even the implied warranty of -//# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -//# GNU General Public License for more details. -//# -//# You should have received a copy of the GNU General Public License along -//# with the LOFAR software suite. If not, see <http://www.gnu.org/licenses/>. -//# -//# $Id$ - -#ifndef LOFAR_GPUPROC_CUDA_GPU_WRAPPER_H -#define LOFAR_GPUPROC_CUDA_GPU_WRAPPER_H - -// \file cuda/gpu_wrapper.h -// C++ wrappers for CUDA akin the OpenCL C++ wrappers. -// Uses the "Pimpl" idiom for resource managing classes (i.e. 
that need to -// control copying having a non-trivial destructor. For more info on Pimpl, see -// http://www.boost.org/doc/libs/release/libs/smart_ptr/sp_techniques.html#pimpl -// Not Pimpl-ed are class Platform, Device, and Function. -// These are also passed by value. - -#include <cstddef> -#include <string> -#include <vector> -#include <map> -#include <iosfwd> - -#include <boost/shared_ptr.hpp> -#include "gpu_incl.h" // ideally, this goes into the .cc, but too much leakage -#include <cufft.h> - -#include <GPUProc/gpu_wrapper.h> // GPUException - -#if CUDA_VERSION < 4020 -typedef int CUsharedconfig; -#endif - -namespace LOFAR -{ - namespace Cobalt - { - class PerformanceCounter; - namespace gpu - { - - // Exception class for CUDA errors. - EXCEPTION_CLASS(CUDAException, GPUException); - - // Return the cuFFT error string associated with \a errcode. - std::string cufftErrorMessage(cufftResult errcode); - - // Return the CUDA error string associated with \a errcode. - std::string errorMessage(CUresult errcode); - - - // Struct representing a CUDA Grid, which is similar to the @c dim3 type - // in the CUDA Runtime API. - struct Grid - { - Grid(unsigned int x_ = 1, unsigned int y_ = 1, unsigned int z_ = 1); - unsigned int x; - unsigned int y; - unsigned int z; - friend std::ostream& operator<<(std::ostream& os, const Grid& grid); - }; - - // Struct representing a CUDA Block, which is similar to the @c dim3 type - // in the CUDA Runtime API. - // - // @invariant x > 0, y > 0, z > 0 - struct Block - { - Block(unsigned int x_ = 1, unsigned int y_ = 1, unsigned int z_ = 1); - unsigned int x; - unsigned int y; - unsigned int z; - friend std::ostream& operator<<(std::ostream& os, const Block& block); - }; - - // Struct containing kernel launch configuration. 
- struct ExecConfig - { - ExecConfig(Grid gr = Grid(), Block bl = Block(), size_t dynShMem = 0); - Grid grid; - Block block; - size_t dynSharedMemSize; - friend std::ostream& operator<<(std::ostream& os, - const ExecConfig& execConfig); - }; - - - // Forward declaration needed by Platform::devices. - class Device; - - // This class is not strictly needed, because in CUDA there's only one - // platform, but it hides the CUDA calls and makes it similar to OpenCL. - class Platform - { - public: - // Initialize the CUDA platform. - // \param flags must be 0 (at least up till CUDA 5.0). - Platform(unsigned int flags = 0); - - // The CUDA version (e.g. 5.0 -> 5000). - int version() const; - - // Returns the number of devices in the CUDA platform. - size_t size() const; - - // Returns a vector of all devices in the CUDA platform. - std::vector<Device> devices() const; - - // Returns the name of the CUDA platform. (currently, "NVIDIA CUDA") - std::string getName() const; - - // Return the maximum number of threads per block, that - // is supported by all devices on the platform. - // - // Hardware dependent. - // - Returns at least 512 (except for ancient hardware) - // - Returns 1024 for K10 (= Cobalt hardware) - unsigned getMaxThreadsPerBlock() const; - }; - - // Wrap a CUDA Device. - class Device - { - public: - // Create a device. - // \param ordinal is the device number; - // valid range: [0, Platform.size()-1] - Device(int ordinal = 0); - - // Order Devices by PCI ID (used in std::sort) - bool operator<(const Device &other) const; - - // Return the name of the device in human readable form. 
- std::string getName() const; - - // Return the compute capability (major) - unsigned getComputeCapabilityMajor() const; - - // Return the compute capability (minor) - unsigned getComputeCapabilityMinor() const; - - // Return the total amount of global memory, in bytes - size_t getTotalGlobalMem() const; - - // Return the maximum amount of shared memory per block - size_t getBlockSharedMem() const; - - // Return the total amount of constant memory - size_t getTotalConstMem() const; - - // Return the PCI ID (bus:device) of this GPU - std::string pciId() const; - - // Return the maximum number of threads per block - // - // Hardware dependent. - // - Returns at least 512 (except for ancient hardware) - // - Returns 1024 for K10 (= Cobalt hardware) - unsigned getMaxThreadsPerBlock() const; - - // Return the maximum dimensions of a block of threads. - struct Block getMaxBlockDims() const; - - // Return the maximum dimensions of a grid of blocks. - struct Grid getMaxGridDims() const; - - // Return the number of multi-processors. - unsigned getMultiProcessorCount() const; - - // Return the maximum number of threads that can be - // resident on a multi-processor. - unsigned getMaxThreadsPerMultiProcessor() const; - - // Return information on a specific \a attribute. - // \param attribute CUDA device attribute - int getAttribute(CUdevice_attribute attribute) const; - - private: - // Context needs access to our \c _device to create a context. - friend class Context; - - // The CUDA device. - CUdevice _device; - }; - - - // Wrap a CUDA Context. Since this class manages a resource (a CUDA - // context), it uses the pimpl idiom in combination with a reference - // counted pointer to make it copyable. - // - // We do not tie any context to any thread by default -- all contexts - // are `floating', and are to be tied to a thread only by pushing them - // as the current context, performing operation(s), and popping them - // from the current context stack. 
The pushing and popping is automated - // in the ScopedCurrentContext class. - class Context - { - public: - // Create a new CUDA context and associate it with the calling thread. - // In other words, \c setCurrent() is implied. - // - // Flags: - - // CU_CTX_SCHED_AUTO: - // The default value if the flags parameter is zero, uses a - // heuristic based on the number of active CUDA contexts in the - // process C and the number of logical processors in the system P. - // If C > P, then CUDA will yield to other OS threads when waiting - // for the GPU, otherwise CUDA will not yield while waiting for - // results and actively spin on the processor. - // CU_CTX_SCHED_SPIN: - // Instruct CUDA to actively spin when waiting for results from the GPU. - // This can decrease latency when waiting for the GPU, but may lower - // the performance of CPU threads if they are performing work in parallel - // with the CUDA thread. - // CU_CTX_SCHED_YIELD: - // Instruct CUDA to yield its thread when waiting for results from the GPU. - // This can increase latency when waiting for the GPU, but can increase - // the performance of CPU threads performing work in parallel with the GPU. - // CU_CTX_SCHED_BLOCKING_SYNC: - // Instruct CUDA to block the CPU thread on a synchronization primitive - // when waiting for the GPU to finish work. - Context(const Device &device, unsigned int flags = CU_CTX_SCHED_YIELD); - - // Returns the device associated to this context. - Device getDevice() const; - - // Set the cache configuration for kernel launches in this context. - void setCacheConfig(CUfunc_cache config) const; - - // Set the shared memory configuration for kernel launches in this context. - void setSharedMemConfig(CUsharedconfig config) const; - - private: - // Non-copyable implementation class. - class Impl; - - // Reference counted pointer to the implementation class. 
- boost::shared_ptr<Impl> _impl; - - friend class ScopedCurrentContext; - }; - - - // Make a certain context the current one for a certain scope. - class ScopedCurrentContext - { - public: - ScopedCurrentContext( const Context &context ); - ~ScopedCurrentContext(); - - private: - const Context &_context; - }; - - - // Wrap CUDA Host Memory. This is the equivalent of a OpenCL Buffer. CUDA - // distinguishes between between host- and device memory, OpenCL does not. - class HostMemory - { - public: - // Allocate \a size bytes of host memory. - // \param context CUDA context associated with this HostMemory object. - // \param size number of bytes to allocate - // \param flags affect allocation - // \note To create pinned memory, we need to set - // \code - // flags = CU_MEMHOSTALLOC_PORTABLE - // \endcode - // \note For input buffers we may consider setting - // \code - // flags = CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_WRITECOMBINED - // \endcode - // Please refer to the documentation of the function \c cuMemHostAlloc() - // in the CUDA Driver API for details. - HostMemory(const Context &context, size_t size, unsigned int flags = 0); - - // Return a pointer to the actual memory. - // \warning The returned pointer shall not have a lifetime beyond the - // lifetime of this object (actually the last copy). - template <typename T> - T *get() const; - - // Return the size of this memory block. - size_t size() const; - - private: - // Get a void pointer to the actual memory from our Impl class. This - // method is only used by our templated get() method. - void* getPtr() const; - - // Non-copyable implementation class. - class Impl; - - // Reference counted pointer to the implementation class. - boost::shared_ptr<Impl> _impl; - }; - - - // Wrap CUDA Device Memory. This is the equivalent of an OpenCL - // Buffer. CUDA distinguishes between between host- and device memory, - // OpenCL does not. 
- class DeviceMemory - { - public: - // Allocate \a size bytes of device memory. - DeviceMemory(const Context &context, size_t size); - - // Return a device pointer as a handle to the memory. - void *get() const; - - // Fill the first \a n bytes of memory with the constant byte \a uc. - // \param uc Constant byte value to put into memory - // \param n Number of bytes to set. Defaults to the complete block. - // If \a n is larger than the current memory block size, then - // the complete block will be set to \a uc. - void set(unsigned char uc, size_t n = (size_t)-1) const; - - // Return the size of this memory block. - size_t size() const; - - // Fetch the contents of this buffer in a new HostMemory buffer. - HostMemory fetch() const; - - private: - // Function needs access to our device ptr location to set this as a kernel arg. - friend class Function; - - // Non-copyable implementation class. - class Impl; - - // Reference counted pointer to the implementation class. - boost::shared_ptr<Impl> _impl; - }; - - - // Wrap a CUDA Module. This is the equivalent of a OpenCL Program. - class Module - { - public: - typedef std::map<CUjit_option, void*> optionmap_t; - - Module(); // TODO: tmp, as long as CorrelatorPipelinePrograms needs a default init - - // Load the module in the file \a fname into the given \a context. The - // file should be a \e cubin file or a \e ptx file as output by \c nvcc. - // \param context CUDA context associated with this Module object. - // \param fname name of a module file - // \note For details, please refer to the documentation of \c - // cuModuleLoad in the CUDA Driver API. - Module(const Context &context, const std::string &fname); - - // Load the module pointed to by \a image into the given \a context. The - // pointer may point to a null-terminated string containing \e cubin or - // \e ptx code. - // \param context CUDA context associated with this Module object. 
- // \param image pointer to a module image in memory - // \note For details, please refer to the documentation of \c - // cuModuleLoadData in the CUDA Driver API. - Module(const Context &context, const void *image); - - // Load the module pointed to by \a image into the given \a context. The - // pointer may point to a null-terminated string containing \e cubin or - // \e ptx code. - // \param context CUDA context associated with this Module object. - // \param image pointer to a module image in memory - // \param options map of \c CUjit_option items, with their associated - // values. - // \note All values are cast to void*, so if an option requires - // an unsigned int as value, the unsigned int's value itself is cast to void*! - // \note For details, please refer to the documentation of \c - // cuModuleLoadDataEx in the CUDA Driver API. - Module(const Context &context, const void *image, optionmap_t &options); - - // Return the Context in which this Module was created. - Context getContext() const; - - private: - // Function needs access to our module to create a function. - friend class Function; - - // Non-copyable implementation class. - class Impl; - - // Reference counted pointer to the implementation class. - boost::shared_ptr<Impl> _impl; - }; - - // Wrap a CUDA Device Function. This is the equivalent of an OpenCL - // Program. - class Function - { - public: - // Construct a function object by looking up the function \a name in the - // module \a module. - Function(const Module &module, const std::string &name); - - // Return the name of the function. - std::string name() const; - - // Set kernel immediate argument number \a index to \a val. - // \a val must outlive kernel execution. - // Not for device memory objects (be it as DeviceMemory or as void *). - template <typename T> - void setArg(size_t index, const T &val); - - // Set kernel DeviceMemory object argument number \a index to \a mem. - // \a mem must outlive kernel execution. 
- void setArg(size_t index, const DeviceMemory &mem); - - // Set pointer to kernel device memory object (as void *) number \a index - // to \a val. \a *val must outlive kernel execution. - // Note: Prefer to use setArg() passing a DeviceMemory ref over this overload. - void setArg(size_t index, const void **val); - - // Return information about a function. - // \note For details on valid values for \a attribute, please refer to - // the documentation of cuFuncGetAttribute in the CUDA Driver API. - int getAttribute(CUfunction_attribute attribute) const; - - // Set the shared memory configuration for a device function. - // \note For details on valid values for \a config, please refer to the - // documentation of cuFuncSetSharedMemConfig in the CUDA Driver API. - void setSharedMemConfig(CUsharedconfig config) const; - - protected: - const Context _context; - - private: - // Keep the Module alive, because Function actually wraps a pointer - // to a function within the Module. - const Module _module; - - // The name of the function, for error reporting purposes - const std::string _name; - - // Stream needs access to our CUDA function to launch a kernel. - friend class Stream; - - // CUDA function. - CUfunction _function; - - // Function arguments as set. - std::vector<const void *> _kernelArgs; - - // Helper function to modify _kernelArgs. - void doSetArg(size_t index, const void *argp); - - // Do not use. To guard against passing pointers. - // Note that even device void * cannot be passed, because we need its - // address with a life time longer than this formal parameter. - template<typename T> - void setArg(size_t index, const T *&); // intentionally not impl. - - // Do not use. To guard against passing HostMemory references to kernels. - void setArg(size_t index, const HostMemory &); // intentionally not impl. - - // Do not use. To guard against passing HostMemory pointers to kernels. - void setArg(size_t index, const HostMemory *); // intentionally not impl. 
- }; - - // Wrap a CUDA Event. This is the equivalent of an OpenCL Event. - class Event - { - public: - // Construct a CUDA event. This class manages a resource (a CUDA event) - // and is therefore implemented using the pimpl idiom, using a reference - // counted pointer to a non-copyable implementation class. - // \note For details on valid values for \a flags, please refer to the - // documentation of cuEventCreate in the CUDA Driver API. - Event(const Context &context, unsigned int flags = CU_EVENT_DEFAULT); - - // Return the elapsed time in milliseconds between this event and the \a - // second event. - float elapsedTime(Event &second) const; - - // Wait until all work preceding this event in the same stream has - // completed. - void wait(); - - private: - // Stream needs access to our CUDA event to wait for and record events. - friend class Stream; - - // Non-copyable implementation class. - class Impl; - - // Reference counted pointer to the implementation class. - boost::shared_ptr<Impl> _impl; - }; - - - // Wrap a CUDA Stream. This is the equivalent of an OpenCL - // CommandQueue. This class manages a resource (a CUDA stream) and is - // therefore implemented using the pimpl idiom, using a reference counted - // pointer to a non-copyable implementation class. - class Stream - { - public: - // Create a stream. - // \param flags must be 0 for CUDA < 5.0 - // \param context CUDA context associated with this Stream object. - // \note For details on valid values for \a flags, please refer to the - // documentation of \c cuStreamCreate in the CUDA Driver API. - explicit Stream(const Context &context, unsigned int flags = 0); // named CU_STREAM_DEFAULT (0) since CUDA 5.0 - - // Transfer data from host memory \a hostMem to device memory \a devMem. - // \param devMem Device memory that will be copied to. - // \param hostMem Host memory that will be copied from. - // \param synchronous Indicates whether the transfer must be done - // synchronously or asynchronously. 
- void writeBuffer(const DeviceMemory &devMem, const HostMemory &hostMem, - bool synchronous = false) const; - - // Transfer data from host memory \a hostMem to device memory \a devMem. - // When gpuProfiling is enabled this transfer is synchronous - // \param devMem Device memory that will be copied to. - // \param hostMem Host memory that will be copied from. - // \param counter PerformanceCounter that will receive transfer duration - // if gpuProfiling is enabled - // \param synchronous Indicates whether the transfer must be done - // synchronously or asynchronously. Default == false - void writeBuffer(const DeviceMemory &devMem, const HostMemory &hostMem, - PerformanceCounter &counter, bool synchronous = false) const; - - // Transfer data from device memory \a devMem to host memory \a hostMem. - // \param hostMem Host memory that will be copied to. - // \param devMem Device memory that will be copied from. - // \param synchronous Indicates whether the transfer must be done - // synchronously or asynchronously. - void readBuffer(const HostMemory &hostMem, const DeviceMemory &devMem, - bool synchronous = false) const; - - // Transfer data from device memory \a devMem to host memory \a hostMem. - // When gpuProfiling is enabled this transfer is synchronous - // \param hostMem Host memory that will be copied to. - // \param devMem Device memory that will be copied from. - // \param counter PerformanceCounter that will receive transfer duration - // if gpuProfiling is enabled - // \param synchronous Indicates whether the transfer must be done - // synchronously or asynchronously. Default == false - void readBuffer(const HostMemory &hostMem, const DeviceMemory &devMem, - PerformanceCounter &counter, bool synchronous = false) const; - - // Transfer data from device memory \a devSource to device memory \a devTarget. - // \param devTarget Device memory that will be copied to. - // \param devSource Device memory that will be copied from. 
- // \param synchronous Indicates whether the transfer must be done - // synchronously or asynchronously. - void copyBuffer(const DeviceMemory &devTarget, const DeviceMemory &devSource, - bool synchronous = false) const; - - // Transfer data from device memory \a devSource to device memory \a devTarget. - // When gpuProfiling is enabled this transfer is synchronous - // \param devTarget Device memory that will be copied to. - // \param devSource Device memory that will be copied from. - // \param counter PerformanceCounter that will receive transfer duration - // if gpuProfiling is enabled - // \param synchronous Indicates whether the transfer must be done - // synchronously or asynchronously. Defaults to \c false - // (asynchronously). - void copyBuffer(const DeviceMemory &devTarget, const DeviceMemory &devSource, - PerformanceCounter &counter, bool synchronous = false) const; - - // Launch a CUDA function. - // \param function object containing the function to launch - // \param grid Grid size (in terms of blocks (not threads (OpenCL))) - // \param block Block (thread group) size - void launchKernel(const Function &function, - const Grid &grid, const Block &block) const; - - // Check if all operations on this stream have completed. - // \return true if all completed, or false otherwise. - bool query() const; - - // Wait until a this stream's tasks are completed. - void synchronize() const; - - // Let this stream wait on the event \a event. - void waitEvent(const Event &event) const; - - // Record the event \a event for this stream. - void recordEvent(const Event &event) const; - - // Return the underlying CUDA stream. TODO: try to get rid of CUstream here: FFT thing to here or make it friend - CUstream get() const; - - // Returns the context associated with the underlying CUDA stream. 
- Context getContext() const; // TODO: consider using this in the SubbandProcs (now has Stream and Context stored) - - // Return whether this stream mandates synchronous behaviour - bool isSynchronous() const; - - private: - // Non-copyable implementation class. - class Impl; - - // Reference counted pointer to the implementation class. - boost::shared_ptr<Impl> _impl; - - // Force synchronous transfers and kernel launches - bool force_synchronous; - }; - - } // namespace gpu - } // namespace Cobalt -} // namespace LOFAR - -#include "gpu_wrapper.tcc" - -#endif - diff --git a/RTCP/Cobalt/GPUProc/src/cuda/cuda_config.h.in b/RTCP/Cobalt/GPUProc/src/cuda_config.h.in similarity index 100% rename from RTCP/Cobalt/GPUProc/src/cuda/cuda_config.h.in rename to RTCP/Cobalt/GPUProc/src/cuda_config.h.in diff --git a/RTCP/Cobalt/GPUProc/src/gpu_incl.h b/RTCP/Cobalt/GPUProc/src/gpu_incl.h index 4884cbd3c55..efb6c8185bb 100644 --- a/RTCP/Cobalt/GPUProc/src/gpu_incl.h +++ b/RTCP/Cobalt/GPUProc/src/gpu_incl.h @@ -25,17 +25,7 @@ #ifndef LOFAR_GPUPROC_GPU_INCL_H #define LOFAR_GPUPROC_GPU_INCL_H -#if defined (USE_CUDA) && defined (USE_OPENCL) -# error "Either CUDA or OpenCL must be enabled, not both" -#endif - -#if defined (USE_CUDA) -# include "cuda/gpu_incl.h" -#elif defined (USE_OPENCL) -# include "opencl/gpu_incl.h" -#else -# error "Either CUDA or OpenCL must be enabled, not neither" -#endif +#include <cuda.h> #endif diff --git a/RTCP/Cobalt/GPUProc/src/cuda/gpu_utils.cc b/RTCP/Cobalt/GPUProc/src/gpu_utils.cc similarity index 100% rename from RTCP/Cobalt/GPUProc/src/cuda/gpu_utils.cc rename to RTCP/Cobalt/GPUProc/src/gpu_utils.cc diff --git a/RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.cc b/RTCP/Cobalt/GPUProc/src/gpu_wrapper.cc similarity index 100% rename from RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.cc rename to RTCP/Cobalt/GPUProc/src/gpu_wrapper.cc diff --git a/RTCP/Cobalt/GPUProc/src/gpu_wrapper.h b/RTCP/Cobalt/GPUProc/src/gpu_wrapper.h index 640e89ccbc5..6f19a4f10ee 100644 --- 
a/RTCP/Cobalt/GPUProc/src/gpu_wrapper.h +++ b/RTCP/Cobalt/GPUProc/src/gpu_wrapper.h @@ -19,41 +19,612 @@ //# //# $Id$ -// \file -// Wrapper classes for GPU types. +// \file gpu_wrapper.h +// C++ wrappers for CUDA akin to the OpenCL C++ wrappers. +// Uses the "Pimpl" idiom for resource managing classes (i.e. that need to +// control copying and have a non-trivial destructor). For more info on Pimpl, see +// http://www.boost.org/doc/libs/release/libs/smart_ptr/sp_techniques.html#pimpl +// Not Pimpl-ed are class Platform, Device, and Function. +// These are also passed by value. #ifndef LOFAR_GPUPROC_GPU_WRAPPER_H #define LOFAR_GPUPROC_GPU_WRAPPER_H -#if defined (USE_CUDA) && defined (USE_OPENCL) -# error "Either CUDA or OpenCL must be enabled, not both" -#endif +#include <cstddef> +#include <string> +#include <vector> +#include <map> +#include <iosfwd> + +#include <boost/shared_ptr.hpp> +#include "gpu_incl.h" // ideally, this goes into the .cc, but too much leakage +#include <cufft.h> #include <Common/Exception.h> +#include <GPUProc/gpu_wrapper.h> // GPUException + +#if CUDA_VERSION < 4020 +typedef int CUsharedconfig; +#endif namespace LOFAR { namespace Cobalt { + class PerformanceCounter; + namespace gpu { // Exception class for GPU errors. EXCEPTION_CLASS(GPUException, LOFAR::Exception); - } // namespace gpu + // Exception class for CUDA errors. + EXCEPTION_CLASS(CUDAException, GPUException); - } // namespace Cobalt + // Return the cuFFT error string associated with \a errcode. + std::string cufftErrorMessage(cufftResult errcode); -} // namespace LOFAR + // Return the CUDA error string associated with \a errcode. + std::string errorMessage(CUresult errcode); + // Struct representing a CUDA Grid, which is similar to the @c dim3 type + // in the CUDA Runtime API.
+ struct Grid + { + Grid(unsigned int x_ = 1, unsigned int y_ = 1, unsigned int z_ = 1); + unsigned int x; + unsigned int y; + unsigned int z; + friend std::ostream& operator<<(std::ostream& os, const Grid& grid); + }; -#if defined (USE_CUDA) -# include "cuda/gpu_wrapper.h" -#elif defined (USE_OPENCL) -# include "opencl/gpu_wrapper.h" -#else -# error "Either CUDA or OpenCL must be enabled, not neither" -#endif + // Struct representing a CUDA Block, which is similar to the @c dim3 type + // in the CUDA Runtime API. + // + // @invariant x > 0, y > 0, z > 0 + struct Block + { + Block(unsigned int x_ = 1, unsigned int y_ = 1, unsigned int z_ = 1); + unsigned int x; + unsigned int y; + unsigned int z; + friend std::ostream& operator<<(std::ostream& os, const Block& block); + }; + + // Struct containing kernel launch configuration. + struct ExecConfig + { + ExecConfig(Grid gr = Grid(), Block bl = Block(), size_t dynShMem = 0); + Grid grid; + Block block; + size_t dynSharedMemSize; + friend std::ostream& operator<<(std::ostream& os, + const ExecConfig& execConfig); + }; + + + // Forward declaration needed by Platform::devices. + class Device; + + // This class is not strictly needed, because in CUDA there's only one + // platform, but it hides the CUDA calls and makes it similar to OpenCL. + class Platform + { + public: + // Initialize the CUDA platform. + // \param flags must be 0 (at least up till CUDA 5.0). + Platform(unsigned int flags = 0); + + // The CUDA version (e.g. 5.0 -> 5000). + int version() const; + + // Returns the number of devices in the CUDA platform. + size_t size() const; + + // Returns a vector of all devices in the CUDA platform. + std::vector<Device> devices() const; + + // Returns the name of the CUDA platform. (currently, "NVIDIA CUDA") + std::string getName() const; + + // Return the maximum number of threads per block, that + // is supported by all devices on the platform. + // + // Hardware dependent. 
+ // - Returns at least 512 (except for ancient hardware) + // - Returns 1024 for K10 (= Cobalt hardware) + unsigned getMaxThreadsPerBlock() const; + }; + + // Wrap a CUDA Device. + class Device + { + public: + // Create a device. + // \param ordinal is the device number; + // valid range: [0, Platform.size()-1] + Device(int ordinal = 0); + + // Order Devices by PCI ID (used in std::sort) + bool operator<(const Device &other) const; + + // Return the name of the device in human readable form. + std::string getName() const; + + // Return the compute capability (major) + unsigned getComputeCapabilityMajor() const; + + // Return the compute capability (minor) + unsigned getComputeCapabilityMinor() const; + + // Return the total amount of global memory, in bytes + size_t getTotalGlobalMem() const; + + // Return the maximum amount of shared memory per block + size_t getBlockSharedMem() const; + + // Return the total amount of constant memory + size_t getTotalConstMem() const; + + // Return the PCI ID (bus:device) of this GPU + std::string pciId() const; + + // Return the maximum number of threads per block + // + // Hardware dependent. + // - Returns at least 512 (except for ancient hardware) + // - Returns 1024 for K10 (= Cobalt hardware) + unsigned getMaxThreadsPerBlock() const; + + // Return the maximum dimensions of a block of threads. + struct Block getMaxBlockDims() const; + + // Return the maximum dimensions of a grid of blocks. + struct Grid getMaxGridDims() const; + + // Return the number of multi-processors. + unsigned getMultiProcessorCount() const; + + // Return the maximum number of threads that can be + // resident on a multi-processor. + unsigned getMaxThreadsPerMultiProcessor() const; + + // Return information on a specific \a attribute. + // \param attribute CUDA device attribute + int getAttribute(CUdevice_attribute attribute) const; + + private: + // Context needs access to our \c _device to create a context. 
+ friend class Context; + + // The CUDA device. + CUdevice _device; + }; + + + // Wrap a CUDA Context. Since this class manages a resource (a CUDA + // context), it uses the pimpl idiom in combination with a reference + // counted pointer to make it copyable. + // + // We do not tie any context to any thread by default -- all contexts + // are `floating', and are to be tied to a thread only by pushing them + // as the current context, performing operation(s), and popping them + // from the current context stack. The pushing and popping is automated + // in the ScopedCurrentContext class. + class Context + { + public: + // Create a new CUDA context and associate it with the calling thread. + // In other words, \c setCurrent() is implied. + // + // Flags: + + // CU_CTX_SCHED_AUTO: + // The default value if the flags parameter is zero, uses a + // heuristic based on the number of active CUDA contexts in the + // process C and the number of logical processors in the system P. + // If C > P, then CUDA will yield to other OS threads when waiting + // for the GPU, otherwise CUDA will not yield while waiting for + // results and actively spin on the processor. + // CU_CTX_SCHED_SPIN: + // Instruct CUDA to actively spin when waiting for results from the GPU. + // This can decrease latency when waiting for the GPU, but may lower + // the performance of CPU threads if they are performing work in parallel + // with the CUDA thread. + // CU_CTX_SCHED_YIELD: + // Instruct CUDA to yield its thread when waiting for results from the GPU. + // This can increase latency when waiting for the GPU, but can increase + // the performance of CPU threads performing work in parallel with the GPU. + // CU_CTX_SCHED_BLOCKING_SYNC: + // Instruct CUDA to block the CPU thread on a synchronization primitive + // when waiting for the GPU to finish work. + Context(const Device &device, unsigned int flags = CU_CTX_SCHED_YIELD); + + // Returns the device associated to this context. 
+ Device getDevice() const; + + // Set the cache configuration for kernel launches in this context. + void setCacheConfig(CUfunc_cache config) const; + + // Set the shared memory configuration for kernel launches in this context. + void setSharedMemConfig(CUsharedconfig config) const; + + private: + // Non-copyable implementation class. + class Impl; + + // Reference counted pointer to the implementation class. + boost::shared_ptr<Impl> _impl; + + friend class ScopedCurrentContext; + }; + + + // Make a certain context the current one for a certain scope. + class ScopedCurrentContext + { + public: + ScopedCurrentContext( const Context &context ); + ~ScopedCurrentContext(); + + private: + const Context &_context; + }; + + + // Wrap CUDA Host Memory. This is the equivalent of an OpenCL Buffer. CUDA + // distinguishes between host- and device memory, OpenCL does not. + class HostMemory + { + public: + // Allocate \a size bytes of host memory. + // \param context CUDA context associated with this HostMemory object. + // \param size number of bytes to allocate + // \param flags affect allocation + // \note To create pinned memory, we need to set + // \code + // flags = CU_MEMHOSTALLOC_PORTABLE + // \endcode + // \note For input buffers we may consider setting + // \code + // flags = CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_WRITECOMBINED + // \endcode + // Please refer to the documentation of the function \c cuMemHostAlloc() + // in the CUDA Driver API for details. + HostMemory(const Context &context, size_t size, unsigned int flags = 0); + + // Return a pointer to the actual memory. + // \warning The returned pointer shall not have a lifetime beyond the + // lifetime of this object (actually the last copy). + template <typename T> + T *get() const; + + // Return the size of this memory block. + size_t size() const; + + private: + // Get a void pointer to the actual memory from our Impl class. This + // method is only used by our templated get() method.
+ void* getPtr() const; + + // Non-copyable implementation class. + class Impl; + + // Reference counted pointer to the implementation class. + boost::shared_ptr<Impl> _impl; + }; + + + // Wrap CUDA Device Memory. This is the equivalent of an OpenCL + // Buffer. CUDA distinguishes between host- and device memory, + // OpenCL does not. + class DeviceMemory + { + public: + // Allocate \a size bytes of device memory. + DeviceMemory(const Context &context, size_t size); + + // Return a device pointer as a handle to the memory. + void *get() const; + + // Fill the first \a n bytes of memory with the constant byte \a uc. + // \param uc Constant byte value to put into memory + // \param n Number of bytes to set. Defaults to the complete block. + // If \a n is larger than the current memory block size, then + // the complete block will be set to \a uc. + void set(unsigned char uc, size_t n = (size_t)-1) const; + + // Return the size of this memory block. + size_t size() const; + + // Fetch the contents of this buffer in a new HostMemory buffer. + HostMemory fetch() const; + + private: + // Function needs access to our device ptr location to set this as a kernel arg. + friend class Function; + + // Non-copyable implementation class. + class Impl; + + // Reference counted pointer to the implementation class. + boost::shared_ptr<Impl> _impl; + }; + + + // Wrap a CUDA Module. This is the equivalent of an OpenCL Program. + class Module + { + public: + typedef std::map<CUjit_option, void*> optionmap_t; + + Module(); // TODO: tmp, as long as CorrelatorPipelinePrograms needs a default init + + // Load the module in the file \a fname into the given \a context. The + // file should be a \e cubin file or a \e ptx file as output by \c nvcc. + // \param context CUDA context associated with this Module object. + // \param fname name of a module file + // \note For details, please refer to the documentation of \c + // cuModuleLoad in the CUDA Driver API.
+ Module(const Context &context, const std::string &fname); + + // Load the module pointed to by \a image into the given \a context. The + // pointer may point to a null-terminated string containing \e cubin or + // \e ptx code. + // \param context CUDA context associated with this Module object. + // \param image pointer to a module image in memory + // \note For details, please refer to the documentation of \c + // cuModuleLoadData in the CUDA Driver API. + Module(const Context &context, const void *image); + + // Load the module pointed to by \a image into the given \a context. The + // pointer may point to a null-terminated string containing \e cubin or + // \e ptx code. + // \param context CUDA context associated with this Module object. + // \param image pointer to a module image in memory + // \param options map of \c CUjit_option items, with their associated + // values. + // \note All values are cast to void*, so if an option requires + // an unsigned int as value, the unsigned int's value itself is cast to void*! + // \note For details, please refer to the documentation of \c + // cuModuleLoadDataEx in the CUDA Driver API. + Module(const Context &context, const void *image, optionmap_t &options); + + // Return the Context in which this Module was created. + Context getContext() const; + + private: + // Function needs access to our module to create a function. + friend class Function; + + // Non-copyable implementation class. + class Impl; + + // Reference counted pointer to the implementation class. + boost::shared_ptr<Impl> _impl; + }; + + // Wrap a CUDA Device Function. This is the equivalent of an OpenCL + // Kernel. + class Function + { + public: + // Construct a function object by looking up the function \a name in the + // module \a module. + Function(const Module &module, const std::string &name); + + // Return the name of the function. + std::string name() const; + + // Set kernel immediate argument number \a index to \a val.
+ // \a val must outlive kernel execution. + // Not for device memory objects (be it as DeviceMemory or as void *). + template <typename T> + void setArg(size_t index, const T &val); + + // Set kernel DeviceMemory object argument number \a index to \a mem. + // \a mem must outlive kernel execution. + void setArg(size_t index, const DeviceMemory &mem); + + // Set pointer to kernel device memory object (as void *) number \a index + // to \a val. \a *val must outlive kernel execution. + // Note: Prefer to use setArg() passing a DeviceMemory ref over this overload. + void setArg(size_t index, const void **val); + + // Return information about a function. + // \note For details on valid values for \a attribute, please refer to + // the documentation of cuFuncGetAttribute in the CUDA Driver API. + int getAttribute(CUfunction_attribute attribute) const; + + // Set the shared memory configuration for a device function. + // \note For details on valid values for \a config, please refer to the + // documentation of cuFuncSetSharedMemConfig in the CUDA Driver API. + void setSharedMemConfig(CUsharedconfig config) const; + + protected: + const Context _context; + + private: + // Keep the Module alive, because Function actually wraps a pointer + // to a function within the Module. + const Module _module; + + // The name of the function, for error reporting purposes + const std::string _name; + + // Stream needs access to our CUDA function to launch a kernel. + friend class Stream; + + // CUDA function. + CUfunction _function; + + // Function arguments as set. + std::vector<const void *> _kernelArgs; + + // Helper function to modify _kernelArgs. + void doSetArg(size_t index, const void *argp); + + // Do not use. To guard against passing pointers. + // Note that even device void * cannot be passed, because we need its + // address with a life time longer than this formal parameter. + template<typename T> + void setArg(size_t index, const T *&); // intentionally not impl. 
+ + // Do not use. To guard against passing HostMemory references to kernels. + void setArg(size_t index, const HostMemory &); // intentionally not impl. + + // Do not use. To guard against passing HostMemory pointers to kernels. + void setArg(size_t index, const HostMemory *); // intentionally not impl. + }; + + // Wrap a CUDA Event. This is the equivalent of an OpenCL Event. + class Event + { + public: + // Construct a CUDA event. This class manages a resource (a CUDA event) + // and is therefore implemented using the pimpl idiom, using a reference + // counted pointer to a non-copyable implementation class. + // \note For details on valid values for \a flags, please refer to the + // documentation of cuEventCreate in the CUDA Driver API. + Event(const Context &context, unsigned int flags = CU_EVENT_DEFAULT); + + // Return the elapsed time in milliseconds between this event and the \a + // second event. + float elapsedTime(Event &second) const; + + // Wait until all work preceding this event in the same stream has + // completed. + void wait(); + + private: + // Stream needs access to our CUDA event to wait for and record events. + friend class Stream; + + // Non-copyable implementation class. + class Impl; + + // Reference counted pointer to the implementation class. + boost::shared_ptr<Impl> _impl; + }; + + + // Wrap a CUDA Stream. This is the equivalent of an OpenCL + // CommandQueue. This class manages a resource (a CUDA stream) and is + // therefore implemented using the pimpl idiom, using a reference counted + // pointer to a non-copyable implementation class. + class Stream + { + public: + // Create a stream. + // \param flags must be 0 for CUDA < 5.0 + // \param context CUDA context associated with this Stream object. + // \note For details on valid values for \a flags, please refer to the + // documentation of \c cuStreamCreate in the CUDA Driver API. 
+ explicit Stream(const Context &context, unsigned int flags = 0); // named CU_STREAM_DEFAULT (0) since CUDA 5.0 + + // Transfer data from host memory \a hostMem to device memory \a devMem. + // \param devMem Device memory that will be copied to. + // \param hostMem Host memory that will be copied from. + // \param synchronous Indicates whether the transfer must be done + // synchronously or asynchronously. + void writeBuffer(const DeviceMemory &devMem, const HostMemory &hostMem, + bool synchronous = false) const; + + // Transfer data from host memory \a hostMem to device memory \a devMem. + // When gpuProfiling is enabled this transfer is synchronous + // \param devMem Device memory that will be copied to. + // \param hostMem Host memory that will be copied from. + // \param counter PerformanceCounter that will receive transfer duration + // if gpuProfiling is enabled + // \param synchronous Indicates whether the transfer must be done + // synchronously or asynchronously. Default == false + void writeBuffer(const DeviceMemory &devMem, const HostMemory &hostMem, + PerformanceCounter &counter, bool synchronous = false) const; + + // Transfer data from device memory \a devMem to host memory \a hostMem. + // \param hostMem Host memory that will be copied to. + // \param devMem Device memory that will be copied from. + // \param synchronous Indicates whether the transfer must be done + // synchronously or asynchronously. + void readBuffer(const HostMemory &hostMem, const DeviceMemory &devMem, + bool synchronous = false) const; + + // Transfer data from device memory \a devMem to host memory \a hostMem. + // When gpuProfiling is enabled this transfer is synchronous + // \param hostMem Host memory that will be copied to. + // \param devMem Device memory that will be copied from. 
+ // \param counter PerformanceCounter that will receive transfer duration + // if gpuProfiling is enabled + // \param synchronous Indicates whether the transfer must be done + // synchronously or asynchronously. Default == false + void readBuffer(const HostMemory &hostMem, const DeviceMemory &devMem, + PerformanceCounter &counter, bool synchronous = false) const; + + // Transfer data from device memory \a devSource to device memory \a devTarget. + // \param devTarget Device memory that will be copied to. + // \param devSource Device memory that will be copied from. + // \param synchronous Indicates whether the transfer must be done + // synchronously or asynchronously. + void copyBuffer(const DeviceMemory &devTarget, const DeviceMemory &devSource, + bool synchronous = false) const; + + // Transfer data from device memory \a devSource to device memory \a devTarget. + // When gpuProfiling is enabled this transfer is synchronous + // \param devTarget Device memory that will be copied to. + // \param devSource Device memory that will be copied from. + // \param counter PerformanceCounter that will receive transfer duration + // if gpuProfiling is enabled + // \param synchronous Indicates whether the transfer must be done + // synchronously or asynchronously. Defaults to \c false + // (asynchronously). + void copyBuffer(const DeviceMemory &devTarget, const DeviceMemory &devSource, + PerformanceCounter &counter, bool synchronous = false) const; + + // Launch a CUDA function. + // \param function object containing the function to launch + // \param grid Grid size (in terms of blocks (not threads (OpenCL))) + // \param block Block (thread group) size + void launchKernel(const Function &function, + const Grid &grid, const Block &block) const; + + // Check if all operations on this stream have completed. + // \return true if all completed, or false otherwise. + bool query() const; + + // Wait until a this stream's tasks are completed. 
+ void synchronize() const; + + // Let this stream wait on the event \a event. + void waitEvent(const Event &event) const; + + // Record the event \a event for this stream. + void recordEvent(const Event &event) const; + + // Return the underlying CUDA stream. TODO: try to get rid of CUstream here: FFT thing to here or make it friend + CUstream get() const; + + // Returns the context associated with the underlying CUDA stream. + Context getContext() const; // TODO: consider using this in the SubbandProcs (now has Stream and Context stored) + + // Return whether this stream mandates synchronous behaviour + bool isSynchronous() const; + + private: + // Non-copyable implementation class. + class Impl; + + // Reference counted pointer to the implementation class. + boost::shared_ptr<Impl> _impl; + + // Force synchronous transfers and kernel launches + bool force_synchronous; + }; + + } // namespace gpu + } // namespace Cobalt +} // namespace LOFAR + +#include "gpu_wrapper.tcc" #endif diff --git a/RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.tcc b/RTCP/Cobalt/GPUProc/src/gpu_wrapper.tcc similarity index 100% rename from RTCP/Cobalt/GPUProc/src/cuda/gpu_wrapper.tcc rename to RTCP/Cobalt/GPUProc/src/gpu_wrapper.tcc diff --git a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCorrelatorStep.cc b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCorrelatorStep.cc index 6f78be3eb99..eb1850b2e75 100644 --- a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCorrelatorStep.cc +++ b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCorrelatorStep.cc @@ -22,7 +22,7 @@ #include <CoInterface/BudgetTimer.h> #include <GPUProc/SubbandProcs/CorrelatorStep.h> -#include <GPUProc/cuda/gpu_wrapper.h> +#include <GPUProc/gpu_wrapper.h> #include <UnitTest++.h> #include <iostream> diff --git a/RTCP/Cobalt/GPUProc/test/cuda/tFFT_leakage.cc b/RTCP/Cobalt/GPUProc/test/cuda/tFFT_leakage.cc index 9571fe3438d..319ab116ac1 100644 --- a/RTCP/Cobalt/GPUProc/test/cuda/tFFT_leakage.cc +++ b/RTCP/Cobalt/GPUProc/test/cuda/tFFT_leakage.cc @@ -37,7 
+37,7 @@ #include <CoInterface/Parset.h> #include <GPUProc/FilterBank.h> #include <GPUProc/SubbandProcs/CorrelatorSubbandProc.h> -#include <GPUProc/cuda/Pipelines/Pipeline.h> +#include <GPUProc/Pipelines/Pipeline.h> #include <GPUProc/gpu_utils.h> #include <GPUProc/gpu_wrapper.h> diff --git a/RTCP/Cobalt/GPUProc/test/cuda/tGPUWrapper.cc b/RTCP/Cobalt/GPUProc/test/cuda/tGPUWrapper.cc index e0d1b940506..1e1bb0f2ca3 100644 --- a/RTCP/Cobalt/GPUProc/test/cuda/tGPUWrapper.cc +++ b/RTCP/Cobalt/GPUProc/test/cuda/tGPUWrapper.cc @@ -28,7 +28,7 @@ #include <Common/LofarLogger.h> #include <GPUProc/gpu_wrapper.h> -#include <GPUProc/cuda/PerformanceCounter.h> +#include <GPUProc/PerformanceCounter.h> #include <UnitTest++.h> using namespace std; -- GitLab