diff --git a/Makefile b/Makefile
index 04e5ea223dd5825fa92002528b73ee50fdf05a27..8cfbb920b3b2b24370a6f18dbb30303b6a9224c2 100644
--- a/Makefile
+++ b/Makefile
@@ -18,8 +18,7 @@ NVCCFLAGS=		$(INCLUDES)
 
 #CXXFLAGS+=		-march=core-avx2 -mcmodel=medium
 
-LIBTCC_SOURCES=		util/cu.cc\
-			util/nvrtc.cc\
+LIBTCC_SOURCES=		util/nvrtc.cc\
 			libtcc/CorrelatorKernel.cc\
 			libtcc/Correlator.cc\
 			libtcc/Kernel.cc
@@ -53,8 +52,14 @@ EXECUTABLES=		test/SimpleExample/SimpleExample\
 			test/CorrelatorTest/CorrelatorTest\
 			test/OpenCLCorrelatorTest/OpenCLCorrelatorTest
 
+CUDA_WRAPPERS_DIR=       external/cuda-wrappers
+CUDA_WRAPERS_LIB=        ${CUDA_WRAPPERS_DIR}/libcu.so
+CUDA_WRAPERS_INCLUDE=    ${CUDA_WRAPPERS_DIR}/cu
+#LIBTCC_OBJECTS+=         ${CUDA_WRAPERS_LIB}
+
 LIBRARIES=		-L$(CUDA_LIBDIR) -lcuda\
-			-L$(NVRTC_LIBDIR) -lnvrtc #\
+			-L$(NVRTC_LIBDIR) -lnvrtc \
+			${CUDA_WRAPERS_LIB}
 			#-L$(POWER_SENSOR)/lib -lpowersensor #-lnvidia-ml
 
 
@@ -82,6 +87,10 @@ all::			$(EXECUTABLES)
 clean::
 			$(RM) $(OBJECTS) $(SHARED_OBJECTS) $(DEPENDENCIES) $(EXECUTABLES)
 
+${CUDA_WRAPERS_LIB}:	# configure and build cuda-wrappers in its own source tree via CMake
+			cd ${CUDA_WRAPPERS_DIR} && cmake .
+			cd ${CUDA_WRAPPERS_DIR} && CPATH=${CPATH}:${CUDA_INCLUDE} $(MAKE)
+
 libtcc/TCCorrelator.o:	libtcc/TCCorrelator.cu	# CUDA code embedded in object file
 			ld -r -b binary -o $@ $<
 
diff --git a/libtcc/Correlator.h b/libtcc/Correlator.h
index 5b9cac6744a1d4f194634e967f30dbf67fe10aaa..ba76b3590c7c75a289ba794c36a6d0d49fed1481 100644
--- a/libtcc/Correlator.h
+++ b/libtcc/Correlator.h
@@ -2,7 +2,7 @@
 #define TCC_CORRELATOR_H
 
 #include "libtcc/CorrelatorKernel.h"
-#include "util/cu.h"
+#include "external/cuda-wrappers/cu/cu.h"
 #include "util/nvrtc.h"
 
 #include <string>
diff --git a/libtcc/Kernel.h b/libtcc/Kernel.h
index 63329ee32a91a50e6c261b5f5f48e97bfbe533df..a5d195c21f32e0a30b4f6cfc1a61478e28326dd0 100644
--- a/libtcc/Kernel.h
+++ b/libtcc/Kernel.h
@@ -1,7 +1,7 @@
 #if !defined TCC_KERNEL_H
 #define TCC_KERNEL_H
 
-#include "util/cu.h"
+#include "external/cuda-wrappers/cu/cu.h"
 
 #include <stdint.h>
 
diff --git a/test/Common/Record.h b/test/Common/Record.h
index 2704748115ff90eaeb62c158377fbb2d00df655d..fa65629c19ef398f97bbd9fcdd0011e14ec62143 100644
--- a/test/Common/Record.h
+++ b/test/Common/Record.h
@@ -3,7 +3,7 @@
 
 #include "test/Common/Config.h"
 
-#include "util/cu.h"
+#include "external/cuda-wrappers/cu/cu.h"
 
 #if defined MEASURE_POWER
 #include <powersensor/NVMLPowerSensor.h>
diff --git a/test/Common/UnitTest.h b/test/Common/UnitTest.h
index dce8b476d3a008b0c265d245cf640a5f10e47ff8..be3c090f487fae9f0cc757fb196beef8cba25b32 100644
--- a/test/Common/UnitTest.h
+++ b/test/Common/UnitTest.h
@@ -2,7 +2,7 @@
 #define UNIT_TEST_H
 
 #include "test/Common/Record.h"
-#include "util/cu.h"
+#include "external/cuda-wrappers/cu/cu.h"
 
 #if defined MEASURE_POWER
 #include <powersensor/NVMLPowerSensor.h>
diff --git a/util/cu.cc b/util/cu.cc
deleted file mode 100644
index 82113bedfe3a17f0c66e300214ca9cb197cb4e35..0000000000000000000000000000000000000000
--- a/util/cu.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-#include "cu.h"
-
-#include <iostream>
-#include <sstream>
-
-
-namespace cu {
-
-const char *Error::what() const noexcept
-{
-  const char *str;
-  return cuGetErrorString(_result, &str) != CUDA_ERROR_INVALID_VALUE ? str : "unknown error";
-}
-
-
-Context Device::primaryCtxRetain()
-{
-  CUcontext context;
-  checkCudaCall(cuDevicePrimaryCtxRetain(&context, _obj));
-  return Context(context, *this);
-}
-
-
-void Source::compile(const char *output_file_name, const char *compiler_options)
-{
-  std::stringstream command_line;
-  command_line << "nvcc -cubin " << compiler_options << " -o " << output_file_name << ' ' << input_file_name;
-//#pragma omp critical (clog)
-  //std::clog << command_line.str() << std::endl;
-
-  int retval = system(command_line.str().c_str());
-
-  if (WEXITSTATUS(retval) != 0)
-    throw Error(CUDA_ERROR_INVALID_SOURCE);
-}
-
-}
diff --git a/util/cu.h b/util/cu.h
deleted file mode 100644
index 72f6cf575a7512b097539202a623b0f089ed951a..0000000000000000000000000000000000000000
--- a/util/cu.h
+++ /dev/null
@@ -1,674 +0,0 @@
-#if !defined CU_WRAPPER_H
-#define CU_WRAPPER_H
-
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#include <exception>
-#include <fstream>
-#include <memory>
-#include <string>
-#include <vector>
-
-
-namespace cu {
-  class Error : public std::exception {
-    public:
-      Error(CUresult result)
-      :
-        _result(result)
-      {
-      }
-
-      virtual const char *what() const noexcept;
-
-      operator CUresult () const
-      {
-	return _result;
-      }
-
-    private:
-      CUresult _result;
-  };
-
-
-  inline void checkCudaCall(CUresult result)
-  {
-    if (result != CUDA_SUCCESS)
-      throw Error(result);
-  }
-
-
-  inline void init(unsigned flags = 0)
-  {
-    checkCudaCall(cuInit(flags));
-  }
-
-
-  inline int driverGetVersion()
-  {
-    int version;
-    checkCudaCall(cuDriverGetVersion(&version));
-    return version;
-  }
-
-  inline void memcpyHtoD(CUdeviceptr dst, const void *src, size_t size)
-  {
-    checkCudaCall(cuMemcpyHtoD(dst, src, size));
-  }
-
-  class Context;
-  class Stream;
-
-  template <typename T> class Wrapper
-  {
-    public:
-      // conversion to C-style T
-
-      operator T () const
-      {
-	return _obj;
-      }
-
-      operator T ()
-      {
-	return _obj;
-      }
-
-      bool operator == (const Wrapper<T> &other)
-      {
-	return _obj == other._obj;
-      }
-
-      bool operator != (const Wrapper<T> &other)
-      {
-	return _obj != other._obj;
-      }
-
-    protected:
-      Wrapper<T>()
-      {
-      }
-
-      Wrapper<T>(const Wrapper<T> &other)
-      :
-	_obj(other._obj),
-	manager(other.manager)
-      {
-      }
-
-      Wrapper<T>(Wrapper<T> &&other)
-      :
-	_obj(other._obj),
-	manager(std::move(other.manager))
-      {
-	other._obj = 0;
-      }
-
-      Wrapper<T>(T &obj)
-      :
-        _obj(obj)
-      {
-      }
-
-      T _obj;
-      std::shared_ptr<T> manager;
-  };
-  
-  class Device : public Wrapper<CUdevice>
-  {
-    public:
-      // Device Management
-
-      Device(int ordinal)
-      {
-	checkCudaCall(cuDeviceGet(&_obj, ordinal));
-      }
-
-      int getAttribute(CUdevice_attribute attribute) const
-      {
-	int value;
-	checkCudaCall(cuDeviceGetAttribute(&value, attribute, _obj));
-	return value;
-      }
-
-      template <CUdevice_attribute attribute> int getAttribute() const
-      {
-	return getAttribute(attribute);
-      }
-
-      static int getCount()
-      {
-	int nrDevices;
-	checkCudaCall(cuDeviceGetCount(&nrDevices));
-	return nrDevices;
-      }
-
-      std::string getName() const
-      {
-	char name[64];
-	checkCudaCall(cuDeviceGetName(name, sizeof name, _obj));
-	return std::string(name);
-      }
-
-      size_t totalMem() const
-      {
-	size_t size;
-	checkCudaCall(cuDeviceTotalMem(&size, _obj));
-	return size;
-      }
-
-
-      // Primary Context Management
-
-      std::pair<unsigned, bool> primaryCtxGetState() const
-      {
-	unsigned flags;
-	int active;
-	checkCudaCall(cuDevicePrimaryCtxGetState(_obj, &flags, &active));
-	return std::pair<unsigned, bool>(flags, active);
-      }
-
-      // void primaryCtxRelease() not available; it is released on destruction of the Context returned by Device::primaryContextRetain()
-
-      void primaryCtxReset()
-      {
-	checkCudaCall(cuDevicePrimaryCtxReset(_obj));
-      }
-
-      Context primaryCtxRetain(); // retain this context until the primary context can be released
-
-      void primaryCtxSetFlags(unsigned flags)
-      {
-	checkCudaCall(cuDevicePrimaryCtxSetFlags(_obj, flags));
-      }
-  };
-
-
-  class Context : public Wrapper<CUcontext>
-  {
-    public:
-      // Context Management
-
-      Context(int flags, Device &device)
-      :
-        _primaryContext(false)
-      {
-	checkCudaCall(cuCtxCreate(&_obj, flags, device));
-	manager = std::shared_ptr<CUcontext>(new CUcontext(_obj), [] (CUcontext *ptr) { if (*ptr) cuCtxDestroy(*ptr); delete ptr; });
-      }
-
-      Context(CUcontext context)
-      :
-	Wrapper<CUcontext>(context),
-	_primaryContext(false)
-      {
-      }
-
-      unsigned getApiVersion() const
-      {
-	unsigned version;
-	checkCudaCall(cuCtxGetApiVersion(_obj, &version));
-	return version;
-      }
-
-      static CUfunc_cache getCacheConfig()
-      {
-	CUfunc_cache config;
-	checkCudaCall(cuCtxGetCacheConfig(&config));
-	return config;
-      }
-
-      static void setCacheConfig(CUfunc_cache config)
-      {
-	checkCudaCall(cuCtxSetCacheConfig(config));
-      }
-
-      static Context getCurrent()
-      {
-	CUcontext context;
-	checkCudaCall(cuCtxGetCurrent(&context));
-	return std::move(Context(context));
-      }
-
-      void setCurrent() const
-      {
-	checkCudaCall(cuCtxSetCurrent(_obj));
-      }
-
-      void pushCurrent()
-      {
-	checkCudaCall(cuCtxPushCurrent(_obj));
-      }
-
-      static Context popCurrent()
-      {
-	CUcontext context;
-	checkCudaCall(cuCtxPopCurrent(&context));
-	return Context(context);
-      }
-
-      void setSharedMemConfig(CUsharedconfig config)
-      {
-	checkCudaCall(cuCtxSetSharedMemConfig(config));
-      }
-
-      static Device getDevice()
-      {
-	CUdevice device;
-	checkCudaCall(cuCtxGetDevice(&device));
-	return Device(device); // FIXME: ~Device()
-      }
-
-      static size_t getLimit(CUlimit limit)
-      {
-	size_t value;
-	checkCudaCall(cuCtxGetLimit(&value, limit));
-	return value;
-      }
-
-      template <CUlimit limit> static size_t getLimit()
-      {
-	return getLimit(limit);
-      }
-
-      static void setLimit(CUlimit limit, size_t value)
-      {
-	checkCudaCall(cuCtxSetLimit(limit, value));
-      }
-
-      template <CUlimit limit> static void setLimit(size_t value)
-      {
-	setLimit(limit, value);
-      }
-
-      static void synchronize()
-      {
-	checkCudaCall(cuCtxSynchronize());
-      }
-
-    private:
-      friend class Device;
-      Context(CUcontext context, Device &device)
-      :
-	Wrapper<CUcontext>(context),
-	_primaryContext(true)
-      {
-      }
-
-      bool _primaryContext;
-  };
-
-
-  class HostMemory : public Wrapper<void *>
-  {
-    public:
-      HostMemory(size_t size, int flags = 0)
-      {
-	checkCudaCall(cuMemHostAlloc(&_obj, size, flags));
-	manager = std::shared_ptr<void *>(new (void *)(_obj), [] (void **ptr) { cuMemFreeHost(*ptr); delete ptr; });
-      }
-
-      template <typename T> operator T * ()
-      {
-	return static_cast<T *>(_obj);
-      }
-  };
-
-
-  class DeviceMemory : public Wrapper<CUdeviceptr>
-  {
-    public:
-      DeviceMemory(size_t size)
-      {
-	checkCudaCall(cuMemAlloc(&_obj, size));
-	manager = std::shared_ptr<CUdeviceptr>(new CUdeviceptr(_obj), [] (CUdeviceptr *ptr) { cuMemFree(*ptr); delete ptr; });
-      }
-
-      DeviceMemory(CUdeviceptr ptr)
-      :
-        Wrapper(ptr)
-      {
-      }
-
-      DeviceMemory(const HostMemory &hostMemory)
-      {
-	checkCudaCall(cuMemHostGetDevicePointer(&_obj, hostMemory, 0));
-      }
-
-      const void *parameter() const // used to construct parameter list for launchKernel();
-      {
-	return &_obj;
-      }
-  };
-
-
-  class Array : public Wrapper<CUarray>
-  {
-    public:
-      Array(unsigned width, CUarray_format format, unsigned numChannels)
-      {
-	Array(width, 0, format, numChannels);
-	manager = std::shared_ptr<CUarray>(new CUarray(_obj), [] (CUarray *ptr) { cuArrayDestroy(*ptr); delete ptr; });
-      }
-
-      Array(unsigned width, unsigned height, CUarray_format format, unsigned numChannels)
-      {
-	CUDA_ARRAY_DESCRIPTOR descriptor;
-	descriptor.Width       = width;
-	descriptor.Height      = height;
-	descriptor.Format      = format;
-	descriptor.NumChannels = numChannels;
-	checkCudaCall(cuArrayCreate(&_obj, &descriptor));
-	manager = std::shared_ptr<CUarray>(new CUarray(_obj), [] (CUarray *ptr) { cuArrayDestroy(*ptr); delete ptr; });
-      }
-
-      Array(unsigned width, unsigned height, unsigned depth, CUarray_format format, unsigned numChannels)
-      {
-	CUDA_ARRAY3D_DESCRIPTOR descriptor;
-	descriptor.Width       = width;
-	descriptor.Height      = height;
-	descriptor.Depth       = depth;
-	descriptor.Format      = format;
-	descriptor.NumChannels = numChannels;
-	descriptor.Flags       = 0;
-	checkCudaCall(cuArray3DCreate(&_obj, &descriptor));
-	manager = std::shared_ptr<CUarray>(new CUarray(_obj), [] (CUarray *ptr) { cuArrayDestroy(*ptr); delete ptr; });
-      }
-
-      Array(CUarray &array)
-      :
-        Wrapper(array)
-      {
-      }
-  };
-
-
-  class Source
-  {
-    public:
-      Source(const char *input_file_name)
-      :
-        input_file_name(input_file_name)
-      {
-      }
-
-      void compile(const char *ptx_name, const char *compile_options = 0);
-
-    private:
-      const char *input_file_name;
-  };
-
-
-  class Module : public Wrapper<CUmodule>
-  {
-    public:
-      Module(const char *file_name)
-      {
-#if defined TEGRA_QUIRKS // cuModuleLoad broken on Jetson TX1
-	std::ifstream file(file_name);
-	std::string program((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
-	checkCudaCall(cuModuleLoadData(&_obj, program.c_str()));
-#else
-	checkCudaCall(cuModuleLoad(&_obj, file_name));
-#endif
-	manager = std::shared_ptr<CUmodule>(new CUmodule(_obj), [] (CUmodule *ptr) { cuModuleUnload(*ptr); delete ptr; });
-      }
-
-      Module(const void *data)
-      {
-	checkCudaCall(cuModuleLoadData(&_obj, data));
-	manager = std::shared_ptr<CUmodule>(new CUmodule(_obj), [] (CUmodule *ptr) { cuModuleUnload(*ptr); delete ptr; });
-      }
-
-      Module(CUmodule &module)
-      :
-        Wrapper(module)
-      {
-      }
-
-#if 0
-      TexRef getTexRef(const char *name) const
-      {
-	CUtexref texref;
-	checkCudaCall(cuModuleGetTexRef(&texref, _obj, name));
-	return TexRef(texref);
-      }
-#endif
-
-      CUdeviceptr getGlobal(const char *name) const
-      {
-	CUdeviceptr deviceptr;
-	checkCudaCall(cuModuleGetGlobal(&deviceptr, nullptr, _obj, name));
-	return deviceptr;
-      }
-  };
-
-
-  class Function : public Wrapper<CUfunction>
-  {
-    public:
-      Function(const Module &module, const char *name)
-      {
-	checkCudaCall(cuModuleGetFunction(&_obj, module, name));
-      }
-
-      Function(CUfunction &function)
-      :
-	Wrapper(function)
-      {
-      }
-
-      int getAttribute(CUfunction_attribute attribute)
-      {
-	int value;
-	checkCudaCall(cuFuncGetAttribute(&value, attribute, _obj));
-	return value;
-      }
-
-      void setCacheConfig(CUfunc_cache config)
-      {
-	checkCudaCall(cuFuncSetCacheConfig(_obj, config));
-      }
-  };
-
-
-  class Event : public Wrapper<CUevent>
-  {
-    public:
-      Event(int flags = CU_EVENT_DEFAULT)
-      {
-	checkCudaCall(cuEventCreate(&_obj, flags));
-	manager = std::shared_ptr<CUevent>(new CUevent(_obj), [] (CUevent *ptr) { cuEventDestroy(*ptr); delete ptr; });
-      }
-
-      Event(CUevent &event)
-      :
-	Wrapper(event)
-      {
-      }
-
-      float elapsedTime(const Event &start) const
-      {
-	float ms;
-	checkCudaCall(cuEventElapsedTime(&ms, start, _obj));
-	return ms;
-      }
-
-      void query() const
-      {
-	checkCudaCall(cuEventQuery(_obj)); // unsuccessful result throws cu::Error
-      }
-
-      void record()
-      {
-	checkCudaCall(cuEventRecord(_obj, 0));
-      }
-
-      void record(Stream &);
-
-      void synchronize()
-      {
-	checkCudaCall(cuEventSynchronize(_obj));
-      }
-  };
-
-
-  class Stream : public Wrapper<CUstream>
-  {
-    friend class Event;
-
-    public:
-      Stream(int flags = CU_STREAM_DEFAULT)
-      {
-	checkCudaCall(cuStreamCreate(&_obj, flags));
-	manager = std::shared_ptr<CUstream>(new CUstream(_obj), [] (CUstream *ptr) { cuStreamDestroy(*ptr); delete ptr; });
-      }
-
-      Stream(CUstream stream)
-      :
-	Wrapper<CUstream>(stream)
-      {
-      }
-
-      void memcpyHtoDAsync(CUdeviceptr devPtr, const void *hostPtr, size_t size)
-      {
-	checkCudaCall(cuMemcpyHtoDAsync(devPtr, hostPtr, size, _obj));
-      }
-
-      void memcpyDtoHAsync(void *hostPtr, CUdeviceptr devPtr, size_t size)
-      {
-	checkCudaCall(cuMemcpyDtoHAsync(hostPtr, devPtr, size, _obj));
-      }
-
-      void launchKernel(Function &function, unsigned gridX, unsigned gridY, unsigned gridZ, unsigned blockX, unsigned blockY, unsigned blockZ, unsigned sharedMemBytes, const std::vector<const void *> &parameters)
-      {
-	checkCudaCall(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, sharedMemBytes, _obj, const_cast<void **>(&parameters[0]), 0));
-      }
-
-#if CUDART_VERSION >= 9000
-      void launchCooperativeKernel(Function &function, unsigned gridX, unsigned gridY, unsigned gridZ, unsigned blockX, unsigned blockY, unsigned blockZ, unsigned sharedMemBytes, const std::vector<const void *> &parameters)
-      {
-	checkCudaCall(cuLaunchCooperativeKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, sharedMemBytes, _obj, const_cast<void **>(&parameters[0])));
-      }
-#endif
-
-      void query()
-      {
-	checkCudaCall(cuStreamQuery(_obj)); // unsuccessful result throws cu::Error
-      }
-
-      void synchronize()
-      {
-	checkCudaCall(cuStreamSynchronize(_obj));
-      }
-
-      void wait(Event &event)
-      {
-	checkCudaCall(cuStreamWaitEvent(_obj, event, 0));
-      }
-
-      void addCallback(CUstreamCallback callback, void *userData, int flags = 0)
-      {
-	checkCudaCall(cuStreamAddCallback(_obj, callback, userData, flags));
-      }
-
-      void record(Event &event)
-      {
-	checkCudaCall(cuEventRecord(event, _obj));
-      }
-
-      void batchMemOp(unsigned count, CUstreamBatchMemOpParams *paramArray, unsigned flags)
-      {
-	checkCudaCall(cuStreamBatchMemOp(_obj, count, paramArray, flags));
-      }
-
-      void waitValue32(CUdeviceptr addr, cuuint32_t value, unsigned flags) const
-      {
-	checkCudaCall(cuStreamWaitValue32(_obj, addr, value, flags));
-      }
-
-      void writeValue32(CUdeviceptr addr, cuuint32_t value, unsigned flags)
-      {
-	checkCudaCall(cuStreamWriteValue32(_obj, addr, value, flags));
-      }
-  };
-
-#if 0
-  class Graph : public Wrapper<CUgraph>
-  {
-    public:
-      class GraphNode : public Wrapper<CUgraphNode>
-      {
-      };
-
-      class ExecKernelNode : public GraphNode
-      {
-      };
-
-      class KernelNodeParams : public Wrapper<CUDA_KERNEL_NODE_PARAMS>
-      {
-	public:
-	  KernelNodeParams(const Function &function,
-			   unsigned gridDimX, unsigned gridDimY, unsigned gridDimZ,
-			   unsigned blockDimX, unsigned blockDimY, unsigned blockDimZ,
-			   unsigned sharedMemBytes,
-			   const std::vector<const void *> &kernelParams)
-	  {
-	    _obj.func	   = function;
-	    _obj.blockDimX = blockDimX;
-	    _obj.blockDimY = blockDimY;
-	    _obj.blockDimZ = blockDimZ;
-	    _obj.gridDimX  = gridDimX;
-	    _obj.gridDimY  = gridDimY;
-	    _obj.gridDimZ  = gridDimZ;
-	    _obj.sharedMemBytes = sharedMemBytes;
-	    _obj.kernelParams = const_cast<void **>(kernelParams.data());
-	    _obj.extra	   = nullptr;
-	  }
-      };
-
-      class Exec : public Wrapper<CUgraphExec>
-      {
-	public:
-	  void launch(Stream &stream)
-	  {
-	    checkCudaCall(cuGraphLaunch(_obj, stream));
-	  }
-      };
-
-      Graph(unsigned flags = 0)
-      {
-	checkCudaCall(cuGraphCreate(&_obj, flags));
-	manager = std::shared_ptr<CUgraphNode>(new CUgraphNode(_obj), [] (CUgraphNode *ptr) { cuGraphDestroy(*ptr); delete ptr; });
-      }
-
-      Graph(CUgraph &graph)
-      :
-	Wrapper(graph)
-      {
-      }
-
-      ExecKernelNode addKernelNode(/* std::vector<GraphNode> dependencies, */ const KernelNodeParams &kernelArgs)
-      {
-	ExecKernelNode node;
-	checkCudaCall(cuGraphAddKernelNode(& (CUgraphNode &) node, _obj, nullptr, 0, & (const CUDA_KERNEL_NODE_PARAMS &) kernelArgs));
-	return node;
-      }
-
-      Exec instantiate()
-      {
-	Exec exec;
-	checkCudaCall(cuGraphInstantiate(& (CUgraphExec &) exec, _obj, nullptr, nullptr, 0));
-	return exec;
-      }
-  };
-#endif
-
-
-  inline void Event::record(Stream &stream)
-  {
-    checkCudaCall(cuEventRecord(_obj, stream._obj));
-  }
-}
-
-#endif