diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..5c2de547ee7275e615821173bf63de8982dd7207
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "external/cuda-wrappers"]
+	path = external/cuda-wrappers
+	url = https://github.com/nlesc-recruit/CUDA-wrappers.git
diff --git a/Makefile b/Makefile
index 04e5ea223dd5825fa92002528b73ee50fdf05a27..e7f090f1a39230054cc5b819bc3168a58a939cf3 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,6 @@ CUDA=			$(shell dirname `dirname \`which nvcc\``)
 #CUDA=			/usr/local/cuda
 CUDA_INCLUDE=		$(shell dirname `find $(CUDA) -name cuda.h`)
 CUDA_LIBDIR=		$(shell dirname `find $(CUDA) -name libcuda.so`|head -n1)
-NVRTC_INCLUDE=		$(shell dirname `find $(CUDA) -name nvrtc.h`)
 NVRTC_LIBDIR=		$(shell dirname `find $(CUDA) -name libnvrtc.so`|head -n1)
 #POWER_SENSOR=		$(HOME)/projects/libpowersensor-master/build
 ARCH=			$(shell arch)
@@ -11,16 +10,14 @@ CC=			gcc
 CXX=			g++ #-Wno-deprecated-declarations
 NVCC=			nvcc
 INCLUDES=		-I.
-INCLUDES+=		-I$(CUDA_INCLUDE) -I$(NVRTC_INCLUDE)
+INCLUDES+=		-I$(CUDA_INCLUDE)
 #INCLUDES+=		-I$(POWER_SENSOR)/include
 CXXFLAGS+=		-std=c++11 -O3 -g -fpic -fopenmp $(INCLUDES) -DNDEBUG
 NVCCFLAGS=		$(INCLUDES)
 
 #CXXFLAGS+=		-march=core-avx2 -mcmodel=medium
 
-LIBTCC_SOURCES=		util/cu.cc\
-			util/nvrtc.cc\
-			libtcc/CorrelatorKernel.cc\
+LIBTCC_SOURCES=		libtcc/CorrelatorKernel.cc\
 			libtcc/Correlator.cc\
 			libtcc/Kernel.cc
 
@@ -53,8 +50,14 @@ EXECUTABLES=		test/SimpleExample/SimpleExample\
 			test/CorrelatorTest/CorrelatorTest\
 			test/OpenCLCorrelatorTest/OpenCLCorrelatorTest
 
+CUDA_WRAPPERS_DIR=       external/cuda-wrappers
+CUDA_WRAPPERS_LIB=        $(CUDA_WRAPPERS_DIR)/libcu.so
+CUDA_WRAPPERS_INCLUDE=    $(CUDA_WRAPPERS_DIR)/cu
+#LIBTCC_OBJECTS+=         $(CUDA_WRAPPERS_LIB)
+
 LIBRARIES=		-L$(CUDA_LIBDIR) -lcuda\
-			-L$(NVRTC_LIBDIR) -lnvrtc #\
+			$(CUDA_WRAPPERS_LIB) \
+			-L$(NVRTC_LIBDIR) -lnvrtc
 			#-L$(POWER_SENSOR)/lib -lpowersensor #-lnvidia-ml
 
 
@@ -82,13 +85,17 @@ all::			$(EXECUTABLES)
 clean::
 			$(RM) $(OBJECTS) $(SHARED_OBJECTS) $(DEPENDENCIES) $(EXECUTABLES)
 
+$(CUDA_WRAPPERS_LIB):	# built in-tree by the submodule's CMake project; $(MAKE) keeps the jobserver/flags, and CPATH gets no empty (= cwd) element when unset
+			cd $(CUDA_WRAPPERS_DIR) && cmake .
+			cd $(CUDA_WRAPPERS_DIR) && CPATH=$(if $(CPATH),$(CPATH):)$(CUDA_INCLUDE) $(MAKE)
+
 libtcc/TCCorrelator.o:	libtcc/TCCorrelator.cu	# CUDA code embedded in object file
 			ld -r -b binary -o $@ $<
 
 libtcc/TCCorrelator.d:
 			-
 
-libtcc/libtcc.so.$(VERSION):		$(LIBTCC_OBJECTS)
+libtcc/libtcc.so.$(VERSION):		$(LIBTCC_OBJECTS) $(CUDA_WRAPPERS_LIB)
 			$(CXX) -shared -o $@ $^ $(LIBRARIES)
 
 test/SimpleExample/SimpleExample:		$(SIMPLE_EXAMPLE_OBJECTS) libtcc/libtcc.so
diff --git a/README.md b/README.md
index 8f020f0e8ac7c8f847ab88590dba43a3a101ec1f..362d4e09392bf4e17b3666de817d8ae6124d8242 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,8 @@ appear in _Astronomy and Astrophysics_ soon.
 
 ## Brief overview on how to use the Tensor-Core Correlator library:
 
+Clone the repository together with its submodules (`git clone --recursive`; in an existing clone, run `git submodule update --init`)
+
 Build the library (just type `make`)
 
 Include `libtcc/Correlator.h`, and link with `libtcc/libtcc.so`.
diff --git a/external/cuda-wrappers b/external/cuda-wrappers
new file mode 160000
index 0000000000000000000000000000000000000000..884fbc7c69617f0b4fbc6696435272a488f49716
--- /dev/null
+++ b/external/cuda-wrappers
@@ -0,0 +1 @@
+Subproject commit 884fbc7c69617f0b4fbc6696435272a488f49716
diff --git a/libtcc/Correlator.h b/libtcc/Correlator.h
index 5b9cac6744a1d4f194634e967f30dbf67fe10aaa..0a14267b1d7549e320663485d5213c98d5a71c8f 100644
--- a/libtcc/Correlator.h
+++ b/libtcc/Correlator.h
@@ -2,8 +2,8 @@
 #define TCC_CORRELATOR_H
 
 #include "libtcc/CorrelatorKernel.h"
-#include "util/cu.h"
-#include "util/nvrtc.h"
+#include "external/cuda-wrappers/cu/cu.h"
+#include "external/cuda-wrappers/cu/nvrtc.h"
 
 #include <string>
 
diff --git a/libtcc/Kernel.h b/libtcc/Kernel.h
index 63329ee32a91a50e6c261b5f5f48e97bfbe533df..a5d195c21f32e0a30b4f6cfc1a61478e28326dd0 100644
--- a/libtcc/Kernel.h
+++ b/libtcc/Kernel.h
@@ -1,7 +1,7 @@
 #if !defined TCC_KERNEL_H
 #define TCC_KERNEL_H
 
-#include "util/cu.h"
+#include "external/cuda-wrappers/cu/cu.h"
 
 #include <stdint.h>
 
diff --git a/test/Common/Record.h b/test/Common/Record.h
index 2704748115ff90eaeb62c158377fbb2d00df655d..fa65629c19ef398f97bbd9fcdd0011e14ec62143 100644
--- a/test/Common/Record.h
+++ b/test/Common/Record.h
@@ -3,7 +3,7 @@
 
 #include "test/Common/Config.h"
 
-#include "util/cu.h"
+#include "external/cuda-wrappers/cu/cu.h"
 
 #if defined MEASURE_POWER
 #include <powersensor/NVMLPowerSensor.h>
diff --git a/test/Common/UnitTest.h b/test/Common/UnitTest.h
index dce8b476d3a008b0c265d245cf640a5f10e47ff8..be3c090f487fae9f0cc757fb196beef8cba25b32 100644
--- a/test/Common/UnitTest.h
+++ b/test/Common/UnitTest.h
@@ -2,7 +2,7 @@
 #define UNIT_TEST_H
 
 #include "test/Common/Record.h"
-#include "util/cu.h"
+#include "external/cuda-wrappers/cu/cu.h"
 
 #if defined MEASURE_POWER
 #include <powersensor/NVMLPowerSensor.h>
diff --git a/test/CorrelatorTest/CorrelatorTest.cc b/test/CorrelatorTest/CorrelatorTest.cc
index 2735a36378a9545b678b78b089623571e107d66c..33b33d01381d53f22fbbaee48bce2f47b73d6211 100644
--- a/test/CorrelatorTest/CorrelatorTest.cc
+++ b/test/CorrelatorTest/CorrelatorTest.cc
@@ -2,7 +2,7 @@
 #include "test/Common/Record.h"
 #include "test/CorrelatorTest/CorrelatorTest.h"
 #include "util/ExceptionPropagator.h"
-#include "util/nvrtc.h"
+#include "external/cuda-wrappers/cu/nvrtc.h"
 
 #include <cstdlib>
 #include <cstring>
diff --git a/util/cu.cc b/util/cu.cc
deleted file mode 100644
index 82113bedfe3a17f0c66e300214ca9cb197cb4e35..0000000000000000000000000000000000000000
--- a/util/cu.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-#include "cu.h"
-
-#include <iostream>
-#include <sstream>
-
-
-namespace cu {
-
-const char *Error::what() const noexcept
-{
-  const char *str;
-  return cuGetErrorString(_result, &str) != CUDA_ERROR_INVALID_VALUE ? str : "unknown error";
-}
-
-
-Context Device::primaryCtxRetain()
-{
-  CUcontext context;
-  checkCudaCall(cuDevicePrimaryCtxRetain(&context, _obj));
-  return Context(context, *this);
-}
-
-
-void Source::compile(const char *output_file_name, const char *compiler_options)
-{
-  std::stringstream command_line;
-  command_line << "nvcc -cubin " << compiler_options << " -o " << output_file_name << ' ' << input_file_name;
-//#pragma omp critical (clog)
-  //std::clog << command_line.str() << std::endl;
-
-  int retval = system(command_line.str().c_str());
-
-  if (WEXITSTATUS(retval) != 0)
-    throw Error(CUDA_ERROR_INVALID_SOURCE);
-}
-
-}
diff --git a/util/cu.h b/util/cu.h
deleted file mode 100644
index 72f6cf575a7512b097539202a623b0f089ed951a..0000000000000000000000000000000000000000
--- a/util/cu.h
+++ /dev/null
@@ -1,674 +0,0 @@
-#if !defined CU_WRAPPER_H
-#define CU_WRAPPER_H
-
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#include <exception>
-#include <fstream>
-#include <memory>
-#include <string>
-#include <vector>
-
-
-namespace cu {
-  class Error : public std::exception {
-    public:
-      Error(CUresult result)
-      :
-        _result(result)
-      {
-      }
-
-      virtual const char *what() const noexcept;
-
-      operator CUresult () const
-      {
-	return _result;
-      }
-
-    private:
-      CUresult _result;
-  };
-
-
-  inline void checkCudaCall(CUresult result)
-  {
-    if (result != CUDA_SUCCESS)
-      throw Error(result);
-  }
-
-
-  inline void init(unsigned flags = 0)
-  {
-    checkCudaCall(cuInit(flags));
-  }
-
-
-  inline int driverGetVersion()
-  {
-    int version;
-    checkCudaCall(cuDriverGetVersion(&version));
-    return version;
-  }
-
-  inline void memcpyHtoD(CUdeviceptr dst, const void *src, size_t size)
-  {
-    checkCudaCall(cuMemcpyHtoD(dst, src, size));
-  }
-
-  class Context;
-  class Stream;
-
-  template <typename T> class Wrapper
-  {
-    public:
-      // conversion to C-style T
-
-      operator T () const
-      {
-	return _obj;
-      }
-
-      operator T ()
-      {
-	return _obj;
-      }
-
-      bool operator == (const Wrapper<T> &other)
-      {
-	return _obj == other._obj;
-      }
-
-      bool operator != (const Wrapper<T> &other)
-      {
-	return _obj != other._obj;
-      }
-
-    protected:
-      Wrapper<T>()
-      {
-      }
-
-      Wrapper<T>(const Wrapper<T> &other)
-      :
-	_obj(other._obj),
-	manager(other.manager)
-      {
-      }
-
-      Wrapper<T>(Wrapper<T> &&other)
-      :
-	_obj(other._obj),
-	manager(std::move(other.manager))
-      {
-	other._obj = 0;
-      }
-
-      Wrapper<T>(T &obj)
-      :
-        _obj(obj)
-      {
-      }
-
-      T _obj;
-      std::shared_ptr<T> manager;
-  };
-  
-  class Device : public Wrapper<CUdevice>
-  {
-    public:
-      // Device Management
-
-      Device(int ordinal)
-      {
-	checkCudaCall(cuDeviceGet(&_obj, ordinal));
-      }
-
-      int getAttribute(CUdevice_attribute attribute) const
-      {
-	int value;
-	checkCudaCall(cuDeviceGetAttribute(&value, attribute, _obj));
-	return value;
-      }
-
-      template <CUdevice_attribute attribute> int getAttribute() const
-      {
-	return getAttribute(attribute);
-      }
-
-      static int getCount()
-      {
-	int nrDevices;
-	checkCudaCall(cuDeviceGetCount(&nrDevices));
-	return nrDevices;
-      }
-
-      std::string getName() const
-      {
-	char name[64];
-	checkCudaCall(cuDeviceGetName(name, sizeof name, _obj));
-	return std::string(name);
-      }
-
-      size_t totalMem() const
-      {
-	size_t size;
-	checkCudaCall(cuDeviceTotalMem(&size, _obj));
-	return size;
-      }
-
-
-      // Primary Context Management
-
-      std::pair<unsigned, bool> primaryCtxGetState() const
-      {
-	unsigned flags;
-	int active;
-	checkCudaCall(cuDevicePrimaryCtxGetState(_obj, &flags, &active));
-	return std::pair<unsigned, bool>(flags, active);
-      }
-
-      // void primaryCtxRelease() not available; it is released on destruction of the Context returned by Device::primaryContextRetain()
-
-      void primaryCtxReset()
-      {
-	checkCudaCall(cuDevicePrimaryCtxReset(_obj));
-      }
-
-      Context primaryCtxRetain(); // retain this context until the primary context can be released
-
-      void primaryCtxSetFlags(unsigned flags)
-      {
-	checkCudaCall(cuDevicePrimaryCtxSetFlags(_obj, flags));
-      }
-  };
-
-
-  class Context : public Wrapper<CUcontext>
-  {
-    public:
-      // Context Management
-
-      Context(int flags, Device &device)
-      :
-        _primaryContext(false)
-      {
-	checkCudaCall(cuCtxCreate(&_obj, flags, device));
-	manager = std::shared_ptr<CUcontext>(new CUcontext(_obj), [] (CUcontext *ptr) { if (*ptr) cuCtxDestroy(*ptr); delete ptr; });
-      }
-
-      Context(CUcontext context)
-      :
-	Wrapper<CUcontext>(context),
-	_primaryContext(false)
-      {
-      }
-
-      unsigned getApiVersion() const
-      {
-	unsigned version;
-	checkCudaCall(cuCtxGetApiVersion(_obj, &version));
-	return version;
-      }
-
-      static CUfunc_cache getCacheConfig()
-      {
-	CUfunc_cache config;
-	checkCudaCall(cuCtxGetCacheConfig(&config));
-	return config;
-      }
-
-      static void setCacheConfig(CUfunc_cache config)
-      {
-	checkCudaCall(cuCtxSetCacheConfig(config));
-      }
-
-      static Context getCurrent()
-      {
-	CUcontext context;
-	checkCudaCall(cuCtxGetCurrent(&context));
-	return std::move(Context(context));
-      }
-
-      void setCurrent() const
-      {
-	checkCudaCall(cuCtxSetCurrent(_obj));
-      }
-
-      void pushCurrent()
-      {
-	checkCudaCall(cuCtxPushCurrent(_obj));
-      }
-
-      static Context popCurrent()
-      {
-	CUcontext context;
-	checkCudaCall(cuCtxPopCurrent(&context));
-	return Context(context);
-      }
-
-      void setSharedMemConfig(CUsharedconfig config)
-      {
-	checkCudaCall(cuCtxSetSharedMemConfig(config));
-      }
-
-      static Device getDevice()
-      {
-	CUdevice device;
-	checkCudaCall(cuCtxGetDevice(&device));
-	return Device(device); // FIXME: ~Device()
-      }
-
-      static size_t getLimit(CUlimit limit)
-      {
-	size_t value;
-	checkCudaCall(cuCtxGetLimit(&value, limit));
-	return value;
-      }
-
-      template <CUlimit limit> static size_t getLimit()
-      {
-	return getLimit(limit);
-      }
-
-      static void setLimit(CUlimit limit, size_t value)
-      {
-	checkCudaCall(cuCtxSetLimit(limit, value));
-      }
-
-      template <CUlimit limit> static void setLimit(size_t value)
-      {
-	setLimit(limit, value);
-      }
-
-      static void synchronize()
-      {
-	checkCudaCall(cuCtxSynchronize());
-      }
-
-    private:
-      friend class Device;
-      Context(CUcontext context, Device &device)
-      :
-	Wrapper<CUcontext>(context),
-	_primaryContext(true)
-      {
-      }
-
-      bool _primaryContext;
-  };
-
-
-  class HostMemory : public Wrapper<void *>
-  {
-    public:
-      HostMemory(size_t size, int flags = 0)
-      {
-	checkCudaCall(cuMemHostAlloc(&_obj, size, flags));
-	manager = std::shared_ptr<void *>(new (void *)(_obj), [] (void **ptr) { cuMemFreeHost(*ptr); delete ptr; });
-      }
-
-      template <typename T> operator T * ()
-      {
-	return static_cast<T *>(_obj);
-      }
-  };
-
-
-  class DeviceMemory : public Wrapper<CUdeviceptr>
-  {
-    public:
-      DeviceMemory(size_t size)
-      {
-	checkCudaCall(cuMemAlloc(&_obj, size));
-	manager = std::shared_ptr<CUdeviceptr>(new CUdeviceptr(_obj), [] (CUdeviceptr *ptr) { cuMemFree(*ptr); delete ptr; });
-      }
-
-      DeviceMemory(CUdeviceptr ptr)
-      :
-        Wrapper(ptr)
-      {
-      }
-
-      DeviceMemory(const HostMemory &hostMemory)
-      {
-	checkCudaCall(cuMemHostGetDevicePointer(&_obj, hostMemory, 0));
-      }
-
-      const void *parameter() const // used to construct parameter list for launchKernel();
-      {
-	return &_obj;
-      }
-  };
-
-
-  class Array : public Wrapper<CUarray>
-  {
-    public:
-      Array(unsigned width, CUarray_format format, unsigned numChannels)
-      {
-	Array(width, 0, format, numChannels);
-	manager = std::shared_ptr<CUarray>(new CUarray(_obj), [] (CUarray *ptr) { cuArrayDestroy(*ptr); delete ptr; });
-      }
-
-      Array(unsigned width, unsigned height, CUarray_format format, unsigned numChannels)
-      {
-	CUDA_ARRAY_DESCRIPTOR descriptor;
-	descriptor.Width       = width;
-	descriptor.Height      = height;
-	descriptor.Format      = format;
-	descriptor.NumChannels = numChannels;
-	checkCudaCall(cuArrayCreate(&_obj, &descriptor));
-	manager = std::shared_ptr<CUarray>(new CUarray(_obj), [] (CUarray *ptr) { cuArrayDestroy(*ptr); delete ptr; });
-      }
-
-      Array(unsigned width, unsigned height, unsigned depth, CUarray_format format, unsigned numChannels)
-      {
-	CUDA_ARRAY3D_DESCRIPTOR descriptor;
-	descriptor.Width       = width;
-	descriptor.Height      = height;
-	descriptor.Depth       = depth;
-	descriptor.Format      = format;
-	descriptor.NumChannels = numChannels;
-	descriptor.Flags       = 0;
-	checkCudaCall(cuArray3DCreate(&_obj, &descriptor));
-	manager = std::shared_ptr<CUarray>(new CUarray(_obj), [] (CUarray *ptr) { cuArrayDestroy(*ptr); delete ptr; });
-      }
-
-      Array(CUarray &array)
-      :
-        Wrapper(array)
-      {
-      }
-  };
-
-
-  class Source
-  {
-    public:
-      Source(const char *input_file_name)
-      :
-        input_file_name(input_file_name)
-      {
-      }
-
-      void compile(const char *ptx_name, const char *compile_options = 0);
-
-    private:
-      const char *input_file_name;
-  };
-
-
-  class Module : public Wrapper<CUmodule>
-  {
-    public:
-      Module(const char *file_name)
-      {
-#if defined TEGRA_QUIRKS // cuModuleLoad broken on Jetson TX1
-	std::ifstream file(file_name);
-	std::string program((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
-	checkCudaCall(cuModuleLoadData(&_obj, program.c_str()));
-#else
-	checkCudaCall(cuModuleLoad(&_obj, file_name));
-#endif
-	manager = std::shared_ptr<CUmodule>(new CUmodule(_obj), [] (CUmodule *ptr) { cuModuleUnload(*ptr); delete ptr; });
-      }
-
-      Module(const void *data)
-      {
-	checkCudaCall(cuModuleLoadData(&_obj, data));
-	manager = std::shared_ptr<CUmodule>(new CUmodule(_obj), [] (CUmodule *ptr) { cuModuleUnload(*ptr); delete ptr; });
-      }
-
-      Module(CUmodule &module)
-      :
-        Wrapper(module)
-      {
-      }
-
-#if 0
-      TexRef getTexRef(const char *name) const
-      {
-	CUtexref texref;
-	checkCudaCall(cuModuleGetTexRef(&texref, _obj, name));
-	return TexRef(texref);
-      }
-#endif
-
-      CUdeviceptr getGlobal(const char *name) const
-      {
-	CUdeviceptr deviceptr;
-	checkCudaCall(cuModuleGetGlobal(&deviceptr, nullptr, _obj, name));
-	return deviceptr;
-      }
-  };
-
-
-  class Function : public Wrapper<CUfunction>
-  {
-    public:
-      Function(const Module &module, const char *name)
-      {
-	checkCudaCall(cuModuleGetFunction(&_obj, module, name));
-      }
-
-      Function(CUfunction &function)
-      :
-	Wrapper(function)
-      {
-      }
-
-      int getAttribute(CUfunction_attribute attribute)
-      {
-	int value;
-	checkCudaCall(cuFuncGetAttribute(&value, attribute, _obj));
-	return value;
-      }
-
-      void setCacheConfig(CUfunc_cache config)
-      {
-	checkCudaCall(cuFuncSetCacheConfig(_obj, config));
-      }
-  };
-
-
-  class Event : public Wrapper<CUevent>
-  {
-    public:
-      Event(int flags = CU_EVENT_DEFAULT)
-      {
-	checkCudaCall(cuEventCreate(&_obj, flags));
-	manager = std::shared_ptr<CUevent>(new CUevent(_obj), [] (CUevent *ptr) { cuEventDestroy(*ptr); delete ptr; });
-      }
-
-      Event(CUevent &event)
-      :
-	Wrapper(event)
-      {
-      }
-
-      float elapsedTime(const Event &start) const
-      {
-	float ms;
-	checkCudaCall(cuEventElapsedTime(&ms, start, _obj));
-	return ms;
-      }
-
-      void query() const
-      {
-	checkCudaCall(cuEventQuery(_obj)); // unsuccessful result throws cu::Error
-      }
-
-      void record()
-      {
-	checkCudaCall(cuEventRecord(_obj, 0));
-      }
-
-      void record(Stream &);
-
-      void synchronize()
-      {
-	checkCudaCall(cuEventSynchronize(_obj));
-      }
-  };
-
-
-  class Stream : public Wrapper<CUstream>
-  {
-    friend class Event;
-
-    public:
-      Stream(int flags = CU_STREAM_DEFAULT)
-      {
-	checkCudaCall(cuStreamCreate(&_obj, flags));
-	manager = std::shared_ptr<CUstream>(new CUstream(_obj), [] (CUstream *ptr) { cuStreamDestroy(*ptr); delete ptr; });
-      }
-
-      Stream(CUstream stream)
-      :
-	Wrapper<CUstream>(stream)
-      {
-      }
-
-      void memcpyHtoDAsync(CUdeviceptr devPtr, const void *hostPtr, size_t size)
-      {
-	checkCudaCall(cuMemcpyHtoDAsync(devPtr, hostPtr, size, _obj));
-      }
-
-      void memcpyDtoHAsync(void *hostPtr, CUdeviceptr devPtr, size_t size)
-      {
-	checkCudaCall(cuMemcpyDtoHAsync(hostPtr, devPtr, size, _obj));
-      }
-
-      void launchKernel(Function &function, unsigned gridX, unsigned gridY, unsigned gridZ, unsigned blockX, unsigned blockY, unsigned blockZ, unsigned sharedMemBytes, const std::vector<const void *> &parameters)
-      {
-	checkCudaCall(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, sharedMemBytes, _obj, const_cast<void **>(&parameters[0]), 0));
-      }
-
-#if CUDART_VERSION >= 9000
-      void launchCooperativeKernel(Function &function, unsigned gridX, unsigned gridY, unsigned gridZ, unsigned blockX, unsigned blockY, unsigned blockZ, unsigned sharedMemBytes, const std::vector<const void *> &parameters)
-      {
-	checkCudaCall(cuLaunchCooperativeKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, sharedMemBytes, _obj, const_cast<void **>(&parameters[0])));
-      }
-#endif
-
-      void query()
-      {
-	checkCudaCall(cuStreamQuery(_obj)); // unsuccessful result throws cu::Error
-      }
-
-      void synchronize()
-      {
-	checkCudaCall(cuStreamSynchronize(_obj));
-      }
-
-      void wait(Event &event)
-      {
-	checkCudaCall(cuStreamWaitEvent(_obj, event, 0));
-      }
-
-      void addCallback(CUstreamCallback callback, void *userData, int flags = 0)
-      {
-	checkCudaCall(cuStreamAddCallback(_obj, callback, userData, flags));
-      }
-
-      void record(Event &event)
-      {
-	checkCudaCall(cuEventRecord(event, _obj));
-      }
-
-      void batchMemOp(unsigned count, CUstreamBatchMemOpParams *paramArray, unsigned flags)
-      {
-	checkCudaCall(cuStreamBatchMemOp(_obj, count, paramArray, flags));
-      }
-
-      void waitValue32(CUdeviceptr addr, cuuint32_t value, unsigned flags) const
-      {
-	checkCudaCall(cuStreamWaitValue32(_obj, addr, value, flags));
-      }
-
-      void writeValue32(CUdeviceptr addr, cuuint32_t value, unsigned flags)
-      {
-	checkCudaCall(cuStreamWriteValue32(_obj, addr, value, flags));
-      }
-  };
-
-#if 0
-  class Graph : public Wrapper<CUgraph>
-  {
-    public:
-      class GraphNode : public Wrapper<CUgraphNode>
-      {
-      };
-
-      class ExecKernelNode : public GraphNode
-      {
-      };
-
-      class KernelNodeParams : public Wrapper<CUDA_KERNEL_NODE_PARAMS>
-      {
-	public:
-	  KernelNodeParams(const Function &function,
-			   unsigned gridDimX, unsigned gridDimY, unsigned gridDimZ,
-			   unsigned blockDimX, unsigned blockDimY, unsigned blockDimZ,
-			   unsigned sharedMemBytes,
-			   const std::vector<const void *> &kernelParams)
-	  {
-	    _obj.func	   = function;
-	    _obj.blockDimX = blockDimX;
-	    _obj.blockDimY = blockDimY;
-	    _obj.blockDimZ = blockDimZ;
-	    _obj.gridDimX  = gridDimX;
-	    _obj.gridDimY  = gridDimY;
-	    _obj.gridDimZ  = gridDimZ;
-	    _obj.sharedMemBytes = sharedMemBytes;
-	    _obj.kernelParams = const_cast<void **>(kernelParams.data());
-	    _obj.extra	   = nullptr;
-	  }
-      };
-
-      class Exec : public Wrapper<CUgraphExec>
-      {
-	public:
-	  void launch(Stream &stream)
-	  {
-	    checkCudaCall(cuGraphLaunch(_obj, stream));
-	  }
-      };
-
-      Graph(unsigned flags = 0)
-      {
-	checkCudaCall(cuGraphCreate(&_obj, flags));
-	manager = std::shared_ptr<CUgraphNode>(new CUgraphNode(_obj), [] (CUgraphNode *ptr) { cuGraphDestroy(*ptr); delete ptr; });
-      }
-
-      Graph(CUgraph &graph)
-      :
-	Wrapper(graph)
-      {
-      }
-
-      ExecKernelNode addKernelNode(/* std::vector<GraphNode> dependencies, */ const KernelNodeParams &kernelArgs)
-      {
-	ExecKernelNode node;
-	checkCudaCall(cuGraphAddKernelNode(& (CUgraphNode &) node, _obj, nullptr, 0, & (const CUDA_KERNEL_NODE_PARAMS &) kernelArgs));
-	return node;
-      }
-
-      Exec instantiate()
-      {
-	Exec exec;
-	checkCudaCall(cuGraphInstantiate(& (CUgraphExec &) exec, _obj, nullptr, nullptr, 0));
-	return exec;
-      }
-  };
-#endif
-
-
-  inline void Event::record(Stream &stream)
-  {
-    checkCudaCall(cuEventRecord(_obj, stream._obj));
-  }
-}
-
-#endif
diff --git a/util/nvrtc.cc b/util/nvrtc.cc
deleted file mode 100644
index b6ac88599a78ba2cad86136e2013674d52b30943..0000000000000000000000000000000000000000
--- a/util/nvrtc.cc
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "nvrtc.h"
-
-
-namespace nvrtc {
-
-const char *Error::what() const noexcept
-{
-  return nvrtcGetErrorString(_result);
-}
-
-}
diff --git a/util/nvrtc.h b/util/nvrtc.h
deleted file mode 100644
index f429a737bb5f8bdd05fb2c798bb970c50ad4b924..0000000000000000000000000000000000000000
--- a/util/nvrtc.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#if !defined NVRTC_H
-#define NVRTC_H
-
-#include <cuda.h>
-#include <nvrtc.h>
-
-#include <algorithm>
-#include <exception>
-#include <fstream>
-#include <iterator>
-#include <string>
-#include <vector>
-
-
-namespace nvrtc {
-  class Error : public std::exception {
-    public:
-      Error(nvrtcResult result)
-	:
-	  _result(result)
-	{
-	}
-
-	virtual const char *what() const noexcept;
-
-	operator nvrtcResult () const
-	{
-	  return _result;
-	}
-
-      private:
-	nvrtcResult _result;
-  };
-
-
-  inline void checkNvrtcCall(nvrtcResult result)
-  {
-    if (result != NVRTC_SUCCESS)
-      throw Error(result);
-  }
-
-
-  class Program {
-    public:
-      Program(const std::string &src, const std::string &name, int numHeaders = 0, const char *headers[] = nullptr, const char *includeNames[] = nullptr) // TODO: use std::vector<std::string>
-      {
-	checkNvrtcCall(nvrtcCreateProgram(&program, src.c_str(), name.c_str(), numHeaders, headers, includeNames));
-      }
-
-      Program(const std::string &filename)
-      {
-	std::ifstream ifs(filename);
-	std::string source(std::istreambuf_iterator<char>{ifs}, {});
-	checkNvrtcCall(nvrtcCreateProgram(&program, source.c_str(), filename.c_str(), 0, nullptr, nullptr));
-      }
-
-      ~Program()
-      {
-	checkNvrtcCall(nvrtcDestroyProgram(&program));
-      }
-
-      void compile(const std::vector<std::string> &options)
-      {
-	std::vector<const char *> c_options;
-	std::transform(options.begin(), options.end(), std::back_inserter(c_options), [] (const std::string &option) { return option.c_str();});
-	checkNvrtcCall(nvrtcCompileProgram(program, c_options.size(), c_options.data()));
-      }
-
-      std::string getPTX()
-      {
-	size_t      size;
-	std::string ptx;
-
-	checkNvrtcCall(nvrtcGetPTXSize(program, &size));
-	ptx.resize(size);
-	checkNvrtcCall(nvrtcGetPTX(program, &ptx[0]));
-	return ptx;
-      }
-
-#if CUDA_VERSION >= 11020
-      std::vector<char> getCUBIN()
-      {
-	size_t      size;
-	std::vector<char> cubin;
-
-	checkNvrtcCall(nvrtcGetCUBINSize(program, &size));
-	cubin.resize(size);
-	checkNvrtcCall(nvrtcGetCUBIN(program, &cubin[0]));
-	return cubin;
-      }
-#endif
-
-      std::string getLog()
-      {
-	size_t      size;
-	std::string log;
-
-	checkNvrtcCall(nvrtcGetProgramLogSize(program, &size));
-	log.resize(size);
-	checkNvrtcCall(nvrtcGetProgramLog(program, &log[0]));
-	return log;
-      }
-
-    private:
-      nvrtcProgram program;
-  };
-}
-
-#endif