From 7a9dd8b2bd10c9f5b3386c38f6377a1c1094ae21 Mon Sep 17 00:00:00 2001 From: John Romein <romein@astron.nl> Date: Mon, 30 Aug 2021 17:31:29 +0200 Subject: [PATCH] Initial commit. --- Makefile | 102 +++ libtcc/Correlator.cc | 117 +++ libtcc/Correlator.h | 40 + libtcc/CorrelatorKernel.cc | 43 + libtcc/CorrelatorKernel.h | 35 + libtcc/Kernel.cc | 13 + libtcc/Kernel.h | 23 + libtcc/TCCorrelator.cu | 596 +++++++++++++ libtcc/TCCorrelator.cu.o | Bin 0 -> 26544 bytes libtcc/TCCorrelator.cu.orig | 587 +++++++++++++ test/Common/ComplexInt4.h | 21 + test/Common/Config.h | 22 + test/Common/Record.cc | 35 + test/Common/Record.h | 33 + test/Common/UnitTest.cc | 60 ++ test/Common/UnitTest.h | 30 + test/CorrelatorTest/CorrelatorTest.cc | 239 ++++++ test/CorrelatorTest/CorrelatorTest.h | 61 ++ test/CorrelatorTest/Options.cc | 89 ++ test/CorrelatorTest/Options.h | 45 + .../OpenCLCorrelatorTest.cc | 450 ++++++++++ test/SimpleExample/SimpleExample.cu | 77 ++ util/ExceptionPropagator.h | 59 ++ util/cu.cc | 37 + util/cu.h | 786 ++++++++++++++++++ util/multi_array.h | 63 ++ util/nvrtc.cc | 11 + util/nvrtc.h | 109 +++ 28 files changed, 3783 insertions(+) create mode 100644 Makefile create mode 100644 libtcc/Correlator.cc create mode 100644 libtcc/Correlator.h create mode 100644 libtcc/CorrelatorKernel.cc create mode 100644 libtcc/CorrelatorKernel.h create mode 100644 libtcc/Kernel.cc create mode 100644 libtcc/Kernel.h create mode 100644 libtcc/TCCorrelator.cu create mode 100644 libtcc/TCCorrelator.cu.o create mode 100644 libtcc/TCCorrelator.cu.orig create mode 100644 test/Common/ComplexInt4.h create mode 100644 test/Common/Config.h create mode 100644 test/Common/Record.cc create mode 100644 test/Common/Record.h create mode 100644 test/Common/UnitTest.cc create mode 100644 test/Common/UnitTest.h create mode 100644 test/CorrelatorTest/CorrelatorTest.cc create mode 100644 test/CorrelatorTest/CorrelatorTest.h create mode 100644 test/CorrelatorTest/Options.cc create mode 100644 
test/CorrelatorTest/Options.h create mode 100644 test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc create mode 100644 test/SimpleExample/SimpleExample.cu create mode 100644 util/ExceptionPropagator.h create mode 100644 util/cu.cc create mode 100644 util/cu.h create mode 100644 util/multi_array.h create mode 100644 util/nvrtc.cc create mode 100644 util/nvrtc.h diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..0ce2042 --- /dev/null +++ b/Makefile @@ -0,0 +1,102 @@ +VERSION= 0.5 +#CUDA= /cm/shared/package/cuda100/toolkit/10.0.130 +CUDA= $(shell dirname `dirname \`which nvcc\``) +#CUDA= /usr/local/cuda +#POWER_SENSOR= $(HOME)/projects/libpowersensor-master/build +ARCH= $(shell arch) +CC= gcc +CXX= g++ #-Wno-deprecated-declarations +NVCC= nvcc +INCLUDES= -I. +#INCLUDES+= -I$(CUDA)/include +#INCLUDES+= -I$(POWER_SENSOR)/include +CXXFLAGS+= -std=c++11 -O3 -g -fpic -fopenmp $(INCLUDES) -DNDEBUG +NVCCFLAGS= $(INCLUDES) + +#CXXFLAGS+= -march=core-avx2 -mcmodel=medium + +LIBTCC_SOURCES= util/cu.cc\ + util/nvrtc.cc\ + libtcc/CorrelatorKernel.cc\ + libtcc/Correlator.cc\ + libtcc/Kernel.cc + + +CORRELATOR_TEST_SOURCES=test/CorrelatorTest/CorrelatorTest.cc\ + test/CorrelatorTest/Options.cc\ + test/Common/Record.cc\ + test/Common/UnitTest.cc + +OPENCL_TEST_SOURCES= test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc + +SIMPLE_EXAMPLE_SOURCES= test/SimpleExample/SimpleExample.cu + + +LIBTCC_OBJECTS= $(LIBTCC_SOURCES:%.cc=%.o) libtcc/TCCorrelator.o +SIMPLE_EXAMPLE_OBJECTS= $(SIMPLE_EXAMPLE_SOURCES:%.cu=%.o) +CORRELATOR_TEST_OBJECTS=$(CORRELATOR_TEST_SOURCES:%.cc=%.o) +OPENCL_TEST_OBJECTS= $(OPENCL_TEST_SOURCES:%.cc=%.o) + +OBJECTS= $(LIBTCC_OBJECTS)\ + $(SIMPLE_EXAMPLE_OBJECTS)\ + $(CORRELATOR_TEST_OBJECTS)\ + $(OPENCL_TEST_OBJECTS) + +SHARED_OBJECTS= libtcc/libtcc.so libtcc/libtcc.so.$(VERSION) + +DEPENDENCIES= $(OBJECTS:%.o=%.d) + +EXECUTABLES= test/SimpleExample/SimpleExample\ + test/CorrelatorTest/CorrelatorTest\ + test/OpenCLCorrelatorTest/OpenCLCorrelatorTest 
+ +LIBRARIES= -L$(CUDA)/lib64 \ + -L$(CUDA)/lib64/stubs -lcuda -lnvrtc #\ + #-L$(POWER_SENSOR)/lib -lpowersensor #-lnvidia-ml + + +%.d: %.cc + -$(CXX) $(CXXFLAGS) -MM -MT $@ -MT ${@:%.d=%.o} -MT ${@:%.d=%.s} $< -o $@ + +%.d: %.cu + -$(CXX) -x c++ $(CXXFLAGS) -MM -MT $@ -MT ${@:%.d=%.o} -MT ${@:%.d=%.s} $< -o $@ + +%.o: %.cc + $(CXX) $(CXXFLAGS) -o $@ -c $< + +%.o: %.cu + $(NVCC) $(NVCCFLAGS) -o $@ -c $< + +%.s: %.cc + $(CXX) $(CXXFLAGS) -o $@ -S $< + +%.so: %.so.$(VERSION) + rm -f $@ + ln -s $(@F).$(VERSION) $@ + +all:: $(EXECUTABLES) + +clean:: + $(RM) $(OBJECTS) $(SHARED_OBJECTS) $(DEPENDENCIES) $(EXECUTABLES) + +libtcc/TCCorrelator.o: libtcc/TCCorrelator.cu # CUDA code embedded in object file + ld -r -b binary -o $@ $< + +libtcc/TCCorrelator.d: + - + +libtcc/libtcc.so.$(VERSION): $(LIBTCC_OBJECTS) + $(CXX) -shared -o $@ -Wl,-soname=$@ $^ $(LIBRARIES) + +test/SimpleExample/SimpleExample: $(SIMPLE_EXAMPLE_OBJECTS) libtcc/libtcc.so + $(NVCC) $(NVCCFLAGS) -o $@ $(SIMPLE_EXAMPLE_OBJECTS) -Xlinker -rpath=. -Llibtcc -ltcc $(LIBRARIES) + +test/CorrelatorTest/CorrelatorTest: $(CORRELATOR_TEST_OBJECTS) libtcc/libtcc.so + $(CXX) $(CXXFLAGS) -o $@ $(CORRELATOR_TEST_OBJECTS) -Wl,-rpath=. 
-Llibtcc -ltcc $(LIBRARIES) + +test/OpenCLCorrelatorTest/OpenCLCorrelatorTest: $(OPENCL_TEST_OBJECTS) + $(CXX) $(CXXFLAGS) -o $@ $(OPENCL_TEST_OBJECTS) -L$(CUDA)/lib64 -lOpenCL + +ifeq (0, $(words $(findstring $(MAKECMDGOALS), clean))) +-include $(DEPENDENCIES) +endif diff --git a/libtcc/Correlator.cc b/libtcc/Correlator.cc new file mode 100644 index 0000000..a4e5b7d --- /dev/null +++ b/libtcc/Correlator.cc @@ -0,0 +1,117 @@ +#include "libtcc/Correlator.h" + +#include <iostream> + +#define GNU_SOURCE +#include <link.h> + + +extern const char _binary_libtcc_TCCorrelator_cu_start, _binary_libtcc_TCCorrelator_cu_end; + +namespace tcc { + +std::string Correlator::findNVRTCincludePath() const +{ + std::string path; + + if (dl_iterate_phdr([] (struct dl_phdr_info *info, size_t, void *arg) -> int + { + std::string &path = *static_cast<std::string *>(arg); + path = info->dlpi_name; + return path.find("libnvrtc.so") != std::string::npos; + }, &path)) + { + path.erase(path.find_last_of("/")); // remove library name + path.erase(path.find_last_of("/")); // remove /lib64 + path += "/include"; + } + + return path; +} + + +Correlator::Correlator(unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations, + unsigned nrReceiversPerBlock + ) +: + correlatorModule(compileModule(nrBits, nrReceivers, nrChannels, nrSamplesPerChannel, nrPolarizations, nrReceiversPerBlock)), + correlatorKernel(correlatorModule, nrBits, nrReceivers, nrChannels, nrSamplesPerChannel, nrPolarizations, nrReceiversPerBlock) +{ +} + + +cu::Module Correlator::compileModule(unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations, + unsigned nrReceiversPerBlock + ) +{ + cu::Device device(cu::Context::getCurrent().getDevice()); + int capability = 10 * device.getAttribute<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>() + 
device.getAttribute<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>(); + + std::vector<std::string> options = + { + "-I" + findNVRTCincludePath(), + "-std=c++11", + "-arch=compute_" + std::to_string(capability), + "-lineinfo", + "-DNR_BITS=" + std::to_string(nrBits), + "-DNR_RECEIVERS=" + std::to_string(nrReceivers), + "-DNR_CHANNELS=" + std::to_string(nrChannels), + "-DNR_SAMPLES_PER_CHANNEL=" + std::to_string(nrSamplesPerChannel), + "-DNR_POLARIZATIONS=" + std::to_string(nrPolarizations), + "-DNR_RECEIVERS_PER_BLOCK=" + std::to_string(nrReceiversPerBlock), + }; + + //std::for_each(options.begin(), options.end(), [] (const std::string &e) { std::cout << e << ' '; }); std::cout << std::endl; + +#if 0 + nvrtc::Program program("tcc/TCCorrelator.cu"); +#else + // embed the CUDA source code in libtcc.so, so that it need not be installed separately + // for runtime compilation + // copy into std::string for '\0' termination + std::string source(&_binary_libtcc_TCCorrelator_cu_start, &_binary_libtcc_TCCorrelator_cu_end); + nvrtc::Program program(source, "TCCorrelator.cu"); +#endif + + try { + program.compile(options); + } catch (nvrtc::Error &error) { + std::cerr << program.getLog(); + throw; + } + + //std::ofstream cubin("out.ptx"); + //cubin << program.getPTX().data(); + return cu::Module((void *) program.getPTX().data()); +} + + +void Correlator::launchAsync(cu::Stream &stream, cu::DeviceMemory &visibilities, cu::DeviceMemory &samples) +{ + correlatorKernel.launchAsync(stream, visibilities, samples); +} + + +void Correlator::launchAsync(CUstream stream, CUdeviceptr visibilities, CUdeviceptr samples) +{ + cu::Stream _stream(stream); + cu::DeviceMemory _visibilities(visibilities); + cu::DeviceMemory _samples(samples); + correlatorKernel.launchAsync(_stream, _visibilities, _samples); +} + + +uint64_t Correlator::FLOPS() const +{ + return correlatorKernel.FLOPS(); +} + +} diff --git a/libtcc/Correlator.h b/libtcc/Correlator.h new file mode 100644 index 0000000..723361f --- 
/dev/null +++ b/libtcc/Correlator.h @@ -0,0 +1,40 @@ +#if !defined CORRELATOR_H +#define CORRELATOR_H + +#include "libtcc/CorrelatorKernel.h" +#include "util/cu.h" +#include "util/nvrtc.h" + + +namespace tcc { + class Correlator { + public: + Correlator(unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations = 2, + unsigned nrReceiversPerBlock = 64 + ); // throw (cu::Error, nvrtc::Error) + + void launchAsync(cu::Stream &, cu::DeviceMemory &isibilities, cu::DeviceMemory &samples); // throw (cu::Error) + void launchAsync(CUstream, CUdeviceptr visibilities, CUdeviceptr samples); // throw (cu::Error) + + uint64_t FLOPS() const; + + private: + std::string findNVRTCincludePath() const; + cu::Module compileModule(unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations, + unsigned nrReceiversPerBlock + ); + + cu::Module correlatorModule; + CorrelatorKernel correlatorKernel; + }; +} + +#endif diff --git a/libtcc/CorrelatorKernel.cc b/libtcc/CorrelatorKernel.cc new file mode 100644 index 0000000..1572811 --- /dev/null +++ b/libtcc/CorrelatorKernel.cc @@ -0,0 +1,43 @@ +#include "libtcc/CorrelatorKernel.h" + + +namespace tcc { + +CorrelatorKernel::CorrelatorKernel(cu::Module &module, + unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations, + unsigned nrReceiversPerBlock + ) +: + Kernel(module, "correlate"), + nrBits(nrBits), + nrReceivers(nrReceivers), + nrChannels(nrChannels), + nrSamplesPerChannel(nrSamplesPerChannel), + nrPolarizations(nrPolarizations), + nrReceiversPerBlock(nrReceiversPerBlock) +{ + unsigned blocksPerDim = (nrReceivers + nrReceiversPerBlock - 1) / nrReceiversPerBlock; + nrThreadBlocksPerChannel = nrReceiversPerBlock == 64 ? 
blocksPerDim * blocksPerDim : blocksPerDim * (blocksPerDim + 1) / 2; +} + + +void CorrelatorKernel::launchAsync(cu::Stream &stream, cu::DeviceMemory &deviceVisibilities, cu::DeviceMemory &deviceSamples) +{ + std::vector<const void *> parameters = { deviceVisibilities.parameter(), deviceSamples.parameter() }; + stream.launchKernel(function, + nrThreadBlocksPerChannel, nrChannels, 1, + 32, 2, 2, + 0, parameters); +} + + +uint64_t CorrelatorKernel::FLOPS() const +{ + return 8ULL * nrReceivers * nrReceivers / 2 * nrPolarizations * nrPolarizations * nrChannels * nrSamplesPerChannel; +} + +} diff --git a/libtcc/CorrelatorKernel.h b/libtcc/CorrelatorKernel.h new file mode 100644 index 0000000..e28a2fd --- /dev/null +++ b/libtcc/CorrelatorKernel.h @@ -0,0 +1,35 @@ +#if !defined CORRELATOR_KERNEL_H +#define CORRELATOR_KERNEL_H + +#include "libtcc/Kernel.h" + + +namespace tcc { + class CorrelatorKernel : public Kernel + { + public: + CorrelatorKernel(cu::Module &module, + unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations = 2, + unsigned nrReceiversPerBlock = 64 + ); + + void launchAsync(cu::Stream &, cu::DeviceMemory &deviceVisibilities, cu::DeviceMemory &deviceSamples); + + virtual uint64_t FLOPS() const; + + private: + const unsigned nrBits; + const unsigned nrReceivers; + const unsigned nrChannels; + const unsigned nrSamplesPerChannel; + const unsigned nrPolarizations; + const unsigned nrReceiversPerBlock; + unsigned nrThreadBlocksPerChannel; + }; +} + +#endif diff --git a/libtcc/Kernel.cc b/libtcc/Kernel.cc new file mode 100644 index 0000000..0f3c1af --- /dev/null +++ b/libtcc/Kernel.cc @@ -0,0 +1,13 @@ +#include "libtcc/Kernel.h" + + +namespace tcc { + +Kernel::Kernel(cu::Module &module, const char *name) +: + module(module), + function(module, name) +{ +} + +} diff --git a/libtcc/Kernel.h b/libtcc/Kernel.h new file mode 100644 index 0000000..f6c9ab4 --- /dev/null +++ b/libtcc/Kernel.h @@ -0,0 
+1,23 @@ +#if !defined KERNEL_H +#define KERNEL_H + +#include "util/cu.h" + +#include <stdint.h> + + +namespace tcc { + class Kernel + { + public: + Kernel(cu::Module &, const char *name); + + virtual uint64_t FLOPS() const = 0; + + protected: + cu::Module &module; + cu::Function function; + }; +} + +#endif diff --git a/libtcc/TCCorrelator.cu b/libtcc/TCCorrelator.cu new file mode 100644 index 0000000..0836edc --- /dev/null +++ b/libtcc/TCCorrelator.cu @@ -0,0 +1,596 @@ +#if 1000 * __CUDACC_VER_MAJOR__ + __CUDACC_VER_MINOR__ >= 11001 && __CUDA_ARCH__ >= 800 +#define ASYNC_COPIES +#endif + +#if defined ASYNC_COPIES +#include <cooperative_groups/memcpy_async.h> +#endif + +#include <mma.h> + +#define NR_BASELINES (NR_RECEIVERS * (NR_RECEIVERS + 1) / 2) +#define ALIGN(A,N) (((A)+(N)-1)/(N)*(N)) + +#define NR_TIMES_PER_BLOCK (128 / (NR_BITS)) +#define NR_RECEIVERS_PER_TCM_X ((NR_BITS) == 4 ? 2 : 4) +#define NR_RECEIVERS_PER_TCM_Y ((NR_BITS) == 4 ? 4 : 8) + +#define COMPLEX 2 + +#if __CUDA_ARCH__ < (NR_BITS == 4 ? 730 : NR_BITS == 8 ? 720 : NR_BITS == 16 ? 
700 : 0) +#error this architecture has no suitable tensor cores +#endif + +#if __CUDA_ARCH__ != 700 && __CUDA_ARCH__ != 720 && __CUDA_ARCH__ != 750 && __CUDA_ARCH__ != 800 && __CUDA_ARCH__ != 860 +#define PORTABLE // unknown architecture -> write visibilities in portable way (via shared memory) +#endif + +#if NR_RECEIVERS_PER_BLOCK != 32 && NR_RECEIVERS_PER_BLOCK != 48 && NR_RECEIVERS_PER_BLOCK != 64 +#error unsupported NR_RECEIVERS_PER_BLOCK +#endif + +#if NR_SAMPLES_PER_CHANNEL % NR_TIMES_PER_BLOCK != 0 +#error NR_SAMPLES_PER_CHANNEL should be a multiple of NR_TIMES_PER_BLOCK +#endif + +#define MIN(A,B) ((A)<(B)?(A):(B)) + + +using namespace nvcuda::wmma; + +#if NR_BITS == 4 +typedef char Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef int2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#elif NR_BITS == 8 +typedef char2 Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef int2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#elif NR_BITS == 16 +typedef __half2 Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef float2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#endif + + +#if NR_BITS == 4 +typedef fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major> Afrag; +typedef fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major> Bfrag; +typedef fragment<accumulator, 8, 8, 32, int> Sum; +#elif NR_BITS == 8 +typedef fragment<matrix_a, 16, 16, 16, signed char, row_major> Afrag; +typedef fragment<matrix_b, 16, 16, 16, signed char, col_major> Bfrag; +typedef fragment<accumulator, 16, 16, 16, int> Sum; +#elif NR_BITS == 16 +typedef fragment<matrix_a, 16, 16, 16, __half, row_major> Afrag; +typedef fragment<matrix_b, 16, 16, 16, 
__half, col_major> Bfrag; +typedef fragment<accumulator, 16, 16, 16, float> Sum; +#endif + + +#if NR_BITS == 4 +typedef int2 ScratchSpace[4][NR_POLARIZATIONS][2][NR_POLARIZATIONS]; +#elif NR_BITS == 8 +typedef int2 ScratchSpace[8][NR_POLARIZATIONS][4][NR_POLARIZATIONS]; +#elif NR_BITS == 16 +typedef float2 ScratchSpace[8][NR_POLARIZATIONS][4][NR_POLARIZATIONS]; +#endif + + +__device__ inline int conj_perm(int v) +{ +#if NR_BITS == 4 + //return ((v & 0x0F0F0F0F) << 4) | (__vnegss4(v >> 4) & 0x0F0F0F0F); + return ((v & 0x0F0F0F0F) << 4) | ((0xF0F0F0F0 - ((v >> 4) & 0x0F0F0F0F)) & 0x0F0F0F0F); +#elif NR_BITS == 8 + //return __byte_perm(v, __vnegss4(v), 0x2705); + return __byte_perm(v, 0x00FF00FF - (v & 0xFF00FF00), 0x2705); +#elif NR_BITS == 16 + return __byte_perm(v ^ 0x80000000, v, 0x1032); +#endif +} + + +__device__ inline int2 conj_perm(int2 v) +{ + return make_int2(conj_perm(v.x), conj_perm(v.y)); +} + + +__device__ inline int4 conj_perm(int4 v) +{ + return make_int4(conj_perm(v.x), conj_perm(v.y), conj_perm(v.z), conj_perm(v.w)); +} + + +#if defined ASYNC_COPIES +#define READ_AHEAD MIN(2, NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) +#define NR_SHARED_BUFFERS MIN(4, NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) +#else +#define READ_AHEAD 1 +#define NR_SHARED_BUFFERS 2 +#endif + + +template <unsigned nrReceiversPerBlock = NR_RECEIVERS_PER_BLOCK> struct SharedData +{ +#if NR_BITS == 4 + typedef char Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][1]; + typedef char Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 16][1]; +#elif NR_BITS == 8 + typedef signed char Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][COMPLEX]; + typedef signed char Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 8][COMPLEX]; +#elif NR_BITS == 16 + typedef __half 
Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][COMPLEX]; + typedef __half Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 4][COMPLEX]; +#endif +}; + + +template <typename T> struct FetchData +{ + __device__ FetchData(unsigned loadRecv, unsigned loadPol, unsigned loadTime) + : + loadRecv(loadRecv), loadPol(loadPol), loadTime(loadTime), data({0}) + { + } + + __device__ void load(const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS) + //data = * (T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime]; + memcpy(&data, &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime], sizeof(T)); + } + + template <typename SharedData> __device__ void storeA(SharedData samples) const + { + //* ((T *) &samples[loadRecv][loadPol][loadTime][0]) = data; + memcpy(&samples[loadRecv][loadPol][loadTime][0], &data, sizeof(T)); + } + + template <typename SharedData> __device__ void storeB(SharedData samples) const + { + //* ((T *) &samples[loadRecv][loadPol][0][loadTime][0]) = data; + //* ((T *) &samples[loadRecv][loadPol][1][loadTime][0]) = conj_perm(data); + T tmp = conj_perm(data); + memcpy(&samples[loadRecv][loadPol][0][loadTime][0], &data, sizeof(T)); + memcpy(&samples[loadRecv][loadPol][1][loadTime][0], &tmp, sizeof(T)); + } + +#if defined ASYNC_COPIES + template <typename Asamples> __device__ void copyAsyncA(nvcuda::experimental::pipeline &pipe, Asamples dest, const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS) + nvcuda::experimental::memcpy_async(* (T *) &dest[loadRecv][loadPol][loadTime][0], * (const T *) &samples[channel][time][firstReceiver + 
loadRecv][loadPol][loadTime], pipe); + } + + template<typename Bsamples> __device__ void copyAsyncB(nvcuda::experimental::pipeline &pipe, Bsamples dest, const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS) + nvcuda::experimental::memcpy_async(* (T *) &dest[loadRecv][loadPol][0][loadTime][0], * (const T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime], pipe); + } + + template<typename Bsamples> __device__ void fixB(Bsamples bSamples) + { + //* ((T *) &bSamples[loadRecv][loadPol][1][loadTime][0]) = conj_perm(* ((T *) &bSamples[loadRecv][loadPol][0][loadTime][0])); + T tmp; + memcpy(&tmp, &bSamples[loadRecv][loadPol][0][loadTime][0], sizeof(T)); + tmp = conj_perm(tmp); + memcpy(&bSamples[loadRecv][loadPol][1][loadTime][0], &tmp, sizeof(T)); + } +#endif + + unsigned loadRecv, loadPol, loadTime; + T data; +}; + + +__device__ inline int2 make_complex(int real, int imag) +{ + return make_int2(real, imag); +} + + +__device__ inline float2 make_complex(float real, float imag) +{ + return make_float2(real, imag); +} + + +template <typename T> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned baseline, unsigned recvY, unsigned recvX, unsigned tcY, unsigned tcX, unsigned polY, unsigned polX, bool skipCheckY, bool skipCheckX, T sumR, T sumI) +{ + if ((skipCheckX || recvX + tcX <= recvY + tcY) && (skipCheckY || recvY + tcY < NR_RECEIVERS)) + visibilities[channel][baseline + tcY * recvY + tcY * (tcY + 1) / 2 + tcX][polY][polX] = make_complex(sumR, sumI); +} + + +__device__ inline void storeVisibilities(Visibilities visibilities, unsigned channel, unsigned firstReceiverY, unsigned firstReceiverX, unsigned recvYoffset, unsigned recvXoffset, unsigned y, unsigned x, bool skipCheckY, bool skipCheckX, const Sum &sum, ScratchSpace scratchSpace[], unsigned warp) +{ 
+#if defined PORTABLE + store_matrix_sync(&scratchSpace[warp][0][0][0][0].x, sum, NR_BITS == 4 ? 8 : 16, mem_row_major); + __syncwarp(); + +#if 0 + if (threadIdx.x == 0) + for (unsigned _y = 0; _y < 8; _y ++) + for (unsigned pol_y = 0; pol_y < NR_POLARIZATIONS; pol_y ++) + for (unsigned _x = 0; _x < 4; _x ++) + for (unsigned pol_x = 0; pol_x < NR_POLARIZATIONS; pol_x ++) + if (scratchSpace[warp][_y][pol_y][_x][pol_x],x != 0 || scratchSpace[warp][_y][pol_y][_x][pol_x].y != 0) + printf("firstY=%u firstX=%u warp=%u y=%u x=%u _y=%u pol_y=%u _x=%u pol_x=%u val=(%f,%f)\n", firstReceiverY, firstReceiverX, warp, y, x, _y, pol_y, _x, pol_x, scratchSpace[warp][_y][pol_y][_x][pol_x].x, scratchSpace[warp][_y][pol_y][_x][pol_x].y); +#endif + +#if NR_BITS == 4 + unsigned _y = threadIdx.x >> 3; + unsigned _x = (threadIdx.x >> 2) & 1; + unsigned polY = (threadIdx.x >> 1) & 1; + unsigned polX = threadIdx.x & 1; +#elif NR_BITS == 8 || NR_BITS == 16 + unsigned _y = threadIdx.x >> 2; + unsigned _x = threadIdx.x & 3; +#endif + + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + _y; + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + _x; + unsigned baseline = (recvY * (recvY + 1) / 2) + recvX; + + if ((skipCheckX || recvX <= recvY) && (skipCheckY || recvY < NR_RECEIVERS)) +#if NR_BITS == 4 + visibilities[channel][baseline][polY][polX] = scratchSpace[warp][_y][polY][_x][polX]; +#elif NR_BITS == 8 || NR_BITS == 16 + for (unsigned polY = 0; polY < NR_POLARIZATIONS; polY ++) + for (unsigned polX = 0; polX < NR_POLARIZATIONS; polX ++) + visibilities[channel][baseline][polY][polX] = scratchSpace[warp][_y][polY][_x][polX]; +#endif +#else +#if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16) + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 2) + (threadIdx.x & 4); + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 2) & 2); + 
unsigned polY = threadIdx.x & 1; + unsigned polX = (threadIdx.x >> 1) & 1; +#elif (__CUDA_ARCH__ == 720 && NR_BITS == 8) || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 860 + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 3); + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 1) & 1); + unsigned polY = (threadIdx.x >> 2) & 1; + unsigned polX = threadIdx.x & 1; +#endif + + unsigned baseline = (recvY * (recvY + 1) / 2) + recvX; + +#if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16) + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 1, polY, polX, skipCheckY, skipCheckX, sum.x[4], sum.x[5]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 1, 0, polY, polX, skipCheckY, skipCheckX, sum.x[2], sum.x[3]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 1, 1, polY, polX, skipCheckY, skipCheckX, sum.x[6], sum.x[7]); +#elif (__CUDA_ARCH__ == 720 && NR_BITS == 8) || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 860 + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); +#if NR_BITS == 8 || NR_BITS == 16 + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 2, polY, polX, skipCheckY, skipCheckX, sum.x[4], sum.x[5]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 4, 0, polY, polX, skipCheckY, skipCheckX, sum.x[2], sum.x[3]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 4, 2, polY, polX, skipCheckY, skipCheckX, sum.x[6], sum.x[7]); +#endif +#endif +#endif +} + + +#define NR_WARPS 4 + +#if NR_RECEIVERS_PER_BLOCK == 64 + +template <bool fullTriangle> __device__ void doCorrelateTriangle(Visibilities visibilities, const Samples samples, 
unsigned firstReceiver, unsigned warp, unsigned tid, SharedData<>::Bsamples &bSamples, ScratchSpace scratchSpace[NR_WARPS]) +{ + const unsigned nrFragmentsX = NR_BITS == 4 ? 12 : 6; + const unsigned nrFragmentsY = nrFragmentsX / 2; + Sum sum[nrFragmentsX * nrFragmentsY]; + + for (auto &s : sum) + fill_fragment(s, 0); + + unsigned channel = blockIdx.y; + + const uchar2 offsets[] = { + make_uchar2( 0, 0), + make_uchar2( 0, 16), + make_uchar2( 0, 40), + make_uchar2(24, 40), + }; + + unsigned recvXoffset = offsets[warp].x; + unsigned recvYoffset = offsets[warp].y; + + FetchData<int4> tmp0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + FetchData<int4> tmp1((tid >> 2) + NR_RECEIVERS_PER_BLOCK / 2, (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + +#if defined ASYNC_COPIES + using namespace nvcuda::experimental; + pipeline pipe; + + for (unsigned majorTime = 0; majorTime < READ_AHEAD; majorTime ++) { + unsigned fetchBuffer = majorTime; + + tmp0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiver, fullTriangle); + tmp1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiver, fullTriangle); + + pipe.commit(); + } +#else + tmp0.load(samples, channel, 0, firstReceiver, fullTriangle); + tmp1.load(samples, channel, 0, firstReceiver, fullTriangle); +#endif + + for (unsigned majorTime = 0; majorTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; majorTime ++) { + unsigned buffer = majorTime % NR_SHARED_BUFFERS; + +#if !defined ASYNC_COPIES + tmp0.storeB(bSamples[buffer]); + tmp1.storeB(bSamples[buffer]); +#endif + + unsigned majorReadTime = majorTime + READ_AHEAD; + + if (majorReadTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) { +#if defined ASYNC_COPIES + unsigned fetchBuffer = (buffer + READ_AHEAD) % NR_SHARED_BUFFERS; + + tmp0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiver, fullTriangle); + tmp1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, 
firstReceiver, fullTriangle); +#else + tmp0.load(samples, channel, majorReadTime, firstReceiver, fullTriangle); + tmp1.load(samples, channel, majorReadTime, firstReceiver, fullTriangle); +#endif + } + +#if defined ASYNC_COPIES + pipe.commit(); + pipe.wait_prior<READ_AHEAD>(); + + tmp0.fixB(bSamples[buffer]); + tmp1.fixB(bSamples[buffer]); +#endif + + __syncthreads(); + +#pragma unroll + for (unsigned minorTime = 0; minorTime < NR_TIMES_PER_BLOCK; minorTime += ((NR_BITS) == 4 ? 16 : 8)) { + Afrag aFrag[nrFragmentsY]; + Bfrag bFrag[nrFragmentsX]; + + if (warp != 0) { + for (unsigned y = 0; y < nrFragmentsY; y ++) + load_matrix_sync(aFrag[y], &bSamples[buffer][recvYoffset + NR_RECEIVERS_PER_TCM_Y * y][0][0][minorTime][0], sizeof(bSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < nrFragmentsX; x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][recvXoffset + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0, i = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++, i ++) + mma_sync(sum[i], aFrag[y], bFrag[x], sum[i]); + } else { + for (unsigned z = 0, i = 0; z < 3; z ++) { + for (unsigned y = 0; y < (NR_BITS == 4 ? 4 : 2); y ++) + load_matrix_sync(aFrag[y], &bSamples[buffer][/*recvYoffset*/ 24 * z + NR_RECEIVERS_PER_TCM_Y * y][0][0][minorTime][0], sizeof(bSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < (NR_BITS == 4 ? 8 : 4); x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][/*recvXoffset*/ 24 * z + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0; y < (NR_BITS == 4 ? 
4 : 2); y ++) + for (unsigned x = 0; x < 2 + 2 * y; x ++, i ++) + mma_sync(sum[i], aFrag[y], bFrag[x], sum[i]); + } + } + } + } + +#if defined PORTABLE + __syncthreads(); +#endif + + if (warp != 0) + for (unsigned y = 0, i = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++, i ++) + storeVisibilities(visibilities, channel, firstReceiver, firstReceiver, recvYoffset, recvXoffset, y, x, fullTriangle, x < 2 * y + (NR_BITS == 4 ? 8 : 4), sum[i], scratchSpace, warp); + else + for (unsigned z = 0, i = 0; z < 3; z ++) + for (unsigned y = 0; y < (NR_BITS == 4 ? 4 : 2); y ++) + for (unsigned x = 0; x < 2 * y + 2; x ++, i ++) + storeVisibilities(visibilities, channel, firstReceiver, firstReceiver, 24 * z, 24 * z, y, x, fullTriangle, x < 2 * y, sum[i], scratchSpace, warp); +} + +#endif + + +template <unsigned nrFragmentsY, bool skipLoadYcheck, bool skipLoadXcheck, bool skipStoreYcheck, bool skipStoreXcheck> __device__ void doCorrelateRectangle(Visibilities visibilities, const Samples samples, unsigned firstReceiverY, unsigned firstReceiverX, SharedData<>::Asamples &aSamples, SharedData<NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK>::Bsamples &bSamples, ScratchSpace scratchSpace[NR_WARPS]) +{ + const unsigned nrFragmentsX = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_X / 2 / (NR_RECEIVERS_PER_BLOCK == 64 ? 
2 : 1); + + Sum sum[nrFragmentsY][nrFragmentsX]; + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + fill_fragment(sum[y][x], 0); + + unsigned tid = warpSize * (blockDim.y * threadIdx.z + threadIdx.y) + threadIdx.x; + unsigned channel = blockIdx.y; + + unsigned recvXoffset = nrFragmentsX * NR_RECEIVERS_PER_TCM_X * threadIdx.y; + unsigned recvYoffset = nrFragmentsY * NR_RECEIVERS_PER_TCM_Y * threadIdx.z; + + FetchData<int4> tmpY0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + FetchData<int4> tmpX0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); +#if NR_RECEIVERS_PER_BLOCK == 48 + FetchData<int2> tmpY1((tid >> 3) + 32, (tid >> 2) & 1, 32 / NR_BITS * (tid & 3)); + FetchData<int2> tmpX1((tid >> 3) + 32, (tid >> 2) & 1, 32 / NR_BITS * (tid & 3)); +#elif NR_RECEIVERS_PER_BLOCK == 64 + FetchData<int4> tmpY1((tid >> 2) + 32, (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); +#endif + +#if defined ASYNC_COPIES + using namespace nvcuda::experimental; + pipeline pipe; + + for (unsigned majorTime = 0; majorTime < READ_AHEAD; majorTime ++) { + unsigned fetchBuffer = majorTime; + + tmpY0.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiverX, skipLoadXcheck); +#endif + + pipe.commit(); + } +#else + tmpY0.load(samples, channel, 0, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.load(samples, channel, 0, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.load(samples, channel, 0, firstReceiverX, 
skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.load(samples, channel, 0, firstReceiverX, skipLoadXcheck); +#endif +#endif + + for (unsigned majorTime = 0; majorTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; majorTime ++) { + unsigned buffer = majorTime % NR_SHARED_BUFFERS; + +#if !defined ASYNC_COPIES + tmpY0.storeA(aSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.storeA(aSamples[buffer]); +#endif + tmpX0.storeB(bSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.storeB(bSamples[buffer]); +#endif +#endif + + unsigned majorReadTime = majorTime + READ_AHEAD; + + if (majorReadTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) { +#if defined ASYNC_COPIES + unsigned fetchBuffer = (buffer + READ_AHEAD) % NR_SHARED_BUFFERS; + + tmpY0.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#endif +#else + tmpY0.load(samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.load(samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.load(samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.load(samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#endif +#endif + } + +#if defined ASYNC_COPIES + pipe.commit(); + pipe.wait_prior<READ_AHEAD>(); + + tmpX0.fixB(bSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 + 
tmpX1.fixB(bSamples[buffer]); +#endif +#endif + + __syncthreads(); + +#pragma unroll + for (unsigned minorTime = 0; minorTime < NR_TIMES_PER_BLOCK; minorTime += ((NR_BITS) == 4 ? 16 : 8)) { + Afrag aFrag[nrFragmentsY]; + Bfrag bFrag[nrFragmentsX]; + + for (unsigned y = 0; y < nrFragmentsY; y ++) + load_matrix_sync(aFrag[y], &aSamples[buffer][recvYoffset + NR_RECEIVERS_PER_TCM_Y * y][0][minorTime][0], sizeof(aSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < nrFragmentsX; x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][recvXoffset + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + mma_sync(sum[y][x], aFrag[y], bFrag[x], sum[y][x]); + } + } + +#if 0 + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + for (unsigned i = 0; i < sum[0][0].num_storage_elements; i ++) + if (sum[y][x].x[i] != 0) +#if NR_BITS == 4 || NR_BITS == 8 + printf("blockIdx=(%d,%d,%d) tid=%u y=%u x=%u i=%u v=%d\n", blockIdx.x, blockIdx.y, blockIdx.z, tid, y, x, i, sum[y][x].x[i]); +#else + printf("blockIdx=(%d,%d,%d) tid=%u y=%u x=%u i=%u v=%f\n", blockIdx.x, blockIdx.y, blockIdx.z, tid, y, x, i, sum[y][x].x[i]); +#endif +#endif + +#if defined PORTABLE + __syncthreads(); +#endif + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + storeVisibilities(visibilities, channel, firstReceiverY, firstReceiverX, recvYoffset, recvXoffset, y, x, skipStoreYcheck, skipStoreXcheck, sum[y][x], scratchSpace, tid / warpSize); +} + + +extern "C" __global__ +__launch_bounds__(NR_WARPS * 32, NR_RECEIVERS_PER_BLOCK == 32 ? 
4 : 2) +void correlate(Visibilities visibilities, const Samples samples) +{ + const unsigned nrFragmentsY = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_Y / 2; + + unsigned block = blockIdx.x; + +#if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 + unsigned blockY = (unsigned) (sqrtf(8 * block + 1) - .99999f) / 2; + unsigned blockX = block - blockY * (blockY + 1) / 2; + unsigned firstReceiverX = blockX * NR_RECEIVERS_PER_BLOCK; +#elif NR_RECEIVERS_PER_BLOCK == 64 + unsigned blockY = (unsigned) sqrtf(block); + unsigned blockX = block - blockY * blockY; + unsigned firstReceiverX = blockX * (NR_RECEIVERS_PER_BLOCK / 2); +#endif + unsigned firstReceiverY = blockY * NR_RECEIVERS_PER_BLOCK; + + union shared { + struct { + SharedData<>::Asamples aSamples; + SharedData<NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK>::Bsamples bSamples; + } rectangle; + struct { + SharedData<>::Bsamples samples; + } triangle; + ScratchSpace scratchSpace[NR_WARPS]; + }; + + // the following hack is necessary to run the correlator in the OpenCL environment, + // as the maximum local memory size is 48K - 16 bytes. Due to padding in bSamples, + // the last 16 bytes are not used, so allocate 16 fewer bytes. 
+ __shared__ char rawbuffer[sizeof(union shared) - 16] __attribute__((aligned(16))); + union shared &u = (union shared &) rawbuffer; + + if (firstReceiverX == firstReceiverY) +#if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 + doCorrelateRectangle<nrFragmentsY, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, false>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +#elif NR_RECEIVERS_PER_BLOCK == 64 + if (NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK != 0 && (NR_RECEIVERS < NR_RECEIVERS_PER_BLOCK || firstReceiverX >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK * NR_RECEIVERS_PER_BLOCK)) + doCorrelateTriangle<false>(visibilities, samples, firstReceiverX, 2 * threadIdx.z + threadIdx.y, 64 * threadIdx.z + 32 * threadIdx.y + threadIdx.x, u.triangle.samples, u.scratchSpace); + else + doCorrelateTriangle<true>(visibilities, samples, firstReceiverX, 2 * threadIdx.z + threadIdx.y, 64 * threadIdx.z + 32 * threadIdx.y + threadIdx.x, u.triangle.samples, u.scratchSpace); +#endif +#if NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK != 0 + else if (NR_RECEIVERS < NR_RECEIVERS_PER_BLOCK || firstReceiverY >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK * NR_RECEIVERS_PER_BLOCK) + doCorrelateRectangle<(NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK + 2 * NR_RECEIVERS_PER_TCM_Y - 1) / NR_RECEIVERS_PER_TCM_Y / 2, false, true, NR_RECEIVERS % (2 * NR_RECEIVERS_PER_TCM_Y) == 0, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +#endif + else + doCorrelateRectangle<nrFragmentsY, true, true, true, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +} diff --git a/libtcc/TCCorrelator.cu.o b/libtcc/TCCorrelator.cu.o new file mode 100644 index 0000000000000000000000000000000000000000..943a2da3a75846584b95962314b91f4161f2bc5a 
GIT binary patch literal 26544 zcmb<-^>JfjWMqH=Mg}_u1P><4z)+BeU^{@B4h*aeEDXw-X$po01_laR3i0vIp)QWj z&hcTcLGiwhUj9Mx@e0~#!k&I$VLMv|Lqh`tLj^T8sDgOMAZL&Gcm+FK1q%ZMF6ETe zw9LFz1;^k>Kj(O7{{T<dU@qm<yp+r|E-vNFyyTqHlvD+q<ox`C)S|?a%(B$@^rHOI zf?~Z4JG}Dxxv9Cy1(orM#g%!<AjMoDFF<TdL9-5FO=(J^enDnIYEEWesvQ=0<mM(q z%z_xC;1?9{<QVMg<LT!b%*m<Y7Ze}l>g?(n<{A{NprwGy*H$ppRM1y2(nNUL$J5<U z!%@dilao_JL&H&1Tf<LN*HBYm!%tI7!%tI_3$D~JC_co~*EKjkz%?k|$;aQ>n^VKk z$U;FMY>AU+NU$ctTx4f}HH0|(#z$~!fZPLCs$grYV4`5JV5DHBU_wA&Bo=)p3RVgh zNUn4C_YLrIjo{?uGy=t18d_Z2z#R#**xcAa!3rFDo*}^?K??<Y1#=@bK|?c;r~ydS zK$A;3wWuh+NTDPnvsfXqC^;juBsICDv?x^}Be7T^FJGa!G_xczDJNB-BsH%%zepiD zzbLgBDGt$HrD&^QZeW0!GC|@-IK)k{i(B9@$HEMe!UOz+LL8lZTov^76-x86^YY8{ zP`#mRr%+y$S(2(ymRX#cl$n!Rl9^hpkeR1YkY5DxSb1Wlf<{?pqC#;-Vo_>}LT+kq zeo-YTWMKIv4HTyEBnXasaAHwVv{f)R0{ImqZNTMCEbz;lnZTl;G_Sa{0OYFF6dd-U z1iN2Qe6S-Zfk0BPvxlRfpR138Do8I{q6WFs0A>vieZ?91r8y}INvR5n3b~~@C7A^| zsS5dEpQD+Ea0N8G`g(%0o0FyjD6`pUIBD8zIBHsHIDv8+S7~u(Ub;eFVs2`2L1J>M zLS9*NX-cA%Re5f1qBSI0gR(m`!J2TDR2HPBq^2n(XCxLWC@3fdC*~I9q!vf}1;s<X z7#s`YVDq~^7QcfP!{ZLj3-I@G4DyU}4Ds~$gBXV9!&qy$Gcxl^j6lu^LrOg$>rvbS zFRXAFk7}VcmvU-OCOBk4#RfQ^!tDcj8)V^#@V22D+~4u>8HqV*Mgtw)X*v0cpa`dO zbVCXfa9xQtv8NR!rst;SmDuDamK0@H#3$+~Sb&kSk&Z%YML}v&CP+Fl$I7apC^b2= zI5R)b%Bt8zN1-UcJU%xuE5FE2!4YH%BISa3AWbkclfVXA=qM00GdVv8W~LL!6f9;Y zCMTEXmgXdu<QE|uo0(T)2g)sYad2ra-YfvJ64~L2Itqqn2wa?*o|l>eE+0_5k7_63 zz`$uHlJ`-~Bq}hFjU^&Dum=WG#vl+HkbHwDAi!QAEi_<e;tU9|nIwk>IB(%k9?;Mr zlwF{Oa&U4{Vo7pFFsNRMHo;c*8etK{oA$7oY=OlPtkz*M895>$MGK0fNirH5D)I3t zsb!hTsqyg&nR%cVwL)fIi9&LIURHcTYEiBRh*ze`RgGENC@ARb7p0bz7Ud~uXp||a zDHv23xIv(%f{l%WiKar0f<}CNSzc;-aj}U;nSz}iNDxJ#HJ5?{VM8<wDxfACDCmN1 z!(x;ssuQpTp9L3^o8#k?Doau!o-ET*Kys(1j)Fmjk-34XCX(yWlp9nS7`VBCFvvj= z-$VEY2FUs`>SjYTgpX0oRESeBsIV}A03C%gkRuEYjEyuw9sx&IEf*IT*61=qjV~hw zaD>4FHa9UlH6D}{H4xESrdOe<gQTrYuToRfnhU2DCP->C^GZx`SYblI3REkr(40^X za{}&OAheAb<m%`W@95#`=mIW06ciLdtwtjqP>YS&rW{I_D%is@$kip@Db&pk)GY*= zW<siIsX4`|@V+3X3k~6Npf)qAeMazBFjq-xZb43BNveWPX&$t4$}0*=O-{`$OD!r6 
zNG)>8$xqH!umyP)-VjA}BJ323ONvU9OB8~^-4B<<l0@v87rkW+3P{IdMDrLmL}H_{ zI09QqhS>*>jW&!0Wh87CI#F&Rv|9^y3tGclL0iGl3}QV-J_MBoFc(3C8Pr%-pivOO zT!|bI$kx(4AS_~|5pKW;4REOh3l2zy4N4CR^b8JUYiS-FCaA#yDPOI*xRA0RC^$eJ zO@$D6o^neqNzQ;~DFp?jvKTI*0nfBK`H3k(smWzJ3MfJW`8jAJA(^?Unp_GBR$QPU zhiTJ*F*J1)K+J&r9I#M8ehyRwqz^0vVrc3pq$HLkYE&E4g3PMsQc$Sn;zDs_S$<{; z*eFmLU0ed|8!14uAd;(+GZORiQjy(Rl9`)|q#!M`sJH}HCg~_7<>%)p6lZ4^_~a+1 zIA^3{6i<+01T3W$sJChWbFhL!W}1QqsyQ_^3MiH-XoI}~3Rndjlp+vh3Mc?U!JuFZ z8rTa_(9%>;gL*F->Z#ahkY8e>ahVz$4GnDw0~EI4(hD4#h_V@;I1t6G9dd#I2X=8u zeo?BU20{retTjQkX>kdcf&wU_6hMyEKy@w15g<=OToI6;17U!j6KxQysbC9^Z%}bU zw#%HT<uU`@jw8}<hL|QG+A5$J1vkZNAp<;+HUQfAuz~`X$bl8+m{F6QUr^}?8g+Km zfDIv`b&R0{%nE9tp=KR;L8y?DT3iBd#th#qggr!%$Lck-6rc$f6dd3zgD17>fD(9e zeqM2j0%|I!AW!NjfZ_#~ED*781J9YTIs{MTI}sWAFr!8zACzSXM?PACFxcWhEwjQ& z10JVI(7{nq1%h52!{o4~aJ*%fmVyQ}gWxj)Eu28zA5gswn_5#qZ90H*7`!n6VSwrb zP#q8gs`)?|R90GZ)q+|wNX-*O&)!G@+#5{J2MvQ)fP42vsfjr{;9h@bZelv9tB<AU z2vr19fUB1X9*{vZA1tN-(F|c@w*xF^qybR~RsyjFOPL;mlAobILsYl$q1H+bI1@BQ zgFIh@)RKXgKu97<iN)ZVMI@o3)a0^AG=2mUza$x1wj>!@t{^`LS+XEMCjzO#1a2%v zq6<XmD1;~!m*xgRX-`O`fLcf3<`Gy|ghEY?0?1_%3fc-K$q@=RwqR}~m>a1HnkPc2 zj)bX(N~1TPK#5-gWl{^#V1k7R#9#$2WNSbrdr30LP{_O~SWQH1G|1avG9p&N7B$U5 zJXTzq>xpx~0ZVjdrWTVKp(rgq<hVeQK#ns|s7B_er4^@^AjL9BGy+4k5=lh`dPHK2 zSx7w?T$-z(R$Q8^qkueur%;SM1{sTFba`S?0Vow9kBq=3E4dWF4b6DyXg|1!Rzoof zq&qg+0D<%>bU+CVJi-B+H3H2wTPRq8N5^whbK~K20pLnDJ|1K)$RG_(=tP?V7o^2r zl2MeJnBtjIp;rNK&nqaT<rgU+dPDJ*3bqOc)(Y{J3N{KBU|L%nlIYNt7Ubu^R705H zc0S6am$d>!24MucA@LPZ`zjP{6iloY;wu!86{9N$83)w?VdAnA-55v+U<vH_N^tTA zQSlWJW<{({1$gchloyHBqE`vl0t+fo84tn*MVWaeX&Or4G#qKGS_<hVMSxhKNCMH7 zAff_9#Dke2pMe?iU_lTU%qUCDvDHvb(@{;+jLB2dftUpA-9@7DBXkr%=Iba_>L^s` zD8yIlK;lA2A-)2_1SKuxWDHUs8x0Bwkd2kG(TMN>CqrV@Rw711u?<GUd)e`opa4=( zP_R`%PMLOg3dW$S5}~dFu1*6@p%G{-*$_oFs6Yd&w^cw_ZHQHM1XMM$(P|2YpwV3P z=~_^FK$^${55^#ziPJkqC=QHAcn8%=V~~~L0S+#t1_h`ziv&5*7A?LNv?1j?q@+{O zM(JdO#~DEjRU#F%6e>XriYieY0WuZraWqpwbre_*zo`)lS_%~)Q!7wRg;zkf3K}3& 
zK$b%p@bx-SXGd6rMm=!Wl(0GyckKwy`0$Ym(0U!zxpfopC=GIJ0<)q7XZJ`j39kNd z=k-W<Zit8lMH4Y8154Qz2`TKrX%y@M%+fLvsr*ANnn6txxc&%8!vIr%1d@J24F#~z z$?`uqnJPmD(lgQ50@*^=0@Z+Ll|akeV9V4%5|G6}D7gq!X2P=(B;9~Z1#p=OFW;aE z2U_0~O79xznco<c`HeIcv{8!%H3buJwTWUs6><iK{otYsG)aV%x54QElDAQd0*oRJ z>>>r+MH*zFSOc<}5N~i>Xo5;HOogTfSj8<2aEP0MS`{dvMuk!eBh<jLW=x+Lf>aNv zb|J+OdV&H+D9(t&h)*niMG}ikP?Hif-XQTqOw1dafifDn2@3CVRcauYkUF5{6_62S znA>z9l?;T6&`~e|BT%q_8xavY;PGT|M<)`A4H7TS)vJg$hy^!b^(vwbW5Gd7x&4M< zn<K#_$e!%X0^IhS!0k7siv5Np`rim{zcID!C&~Y2aQn?c_76}wL3yMbfTNqpNJmR_ zpp1i5;aGqQIwU^08ArJbjYx4JwUedE04GZm9R-s7kC7~)4GLrmw8jCpqTMejKHM=V zAXvc!GPQtX;jOK$f|&{O+#q<yE3GsqC!{DdF)uwQ6=hHgJbaOo@0?#$lnR>4hbh80 zfB_vAfXu(c=82F708kn&NaCR06%w~3Gerk!>fgrB$_hTvsRkR0#5d>w3z1mRXoCV| zo(eGxnOEcnU0YZj0a=28JbGXVTB>daYPv&w1Fy!Bw1JwcNZbg}VtbGd(0~Z2$pXd2 zrMb~4%Cr=axRJ5oW<Izbn^;<sub@_}U<De`0M$ea3Tc@+Iq|TSZW_fp3I?Dd3#0}T zbktA5Rv`(rm;y9(Q3*B?>Tb|V03!uRU!^!2G=c>g7zU4>mV#w9KvkB4fu;_)KY%V_ zXof?=1e*>c6CDMp1Zd<I$)TWDD|F0B!4~cyP@Ko=q4tqLDq(F{bd?al!RM%LKnr;6 z6iRXn3^X)KGE+e7vWzsL0gsi^QGls~G-h-Z%uE!(%cH^3uLY4-Q!s>#U}Le#5ZNXy zeM-mzcYRQ9C&4Q4H~^06KwK-uk*E1U2@f=l1YY+BVt^9?I9b8dE_gXCXq+9=M?!FI z6cB5CtQ8RA+S;1XX<fJ(X`pojPNivSsYT$?(5(C-kcnX16u=Ro2cM91(f~O`M*)^u zqmeX&DqTnkfz+ge+l)3+qoaUaK!F=cV7m<IVHY@8fIOm?oS&PUSppg~giLLK7D<8K zVW0<^<ko=B&m(!rfUrkOatjQ}){ba06CKl_6<b(#M}V^c+M*)@Ng)X%6+jk8pf0z9 zj<71?NZZix12-<5G~g)(Vm7Ef2Fk_YFv6__5ei7<4LB_Yr6#6;k|%OfRnSIG?VvFh z$jB$E2JqMe(d+LNAd5<I`Vq-pNO=I%TZMQO#Z8(xf)NxF(BPsli@}10aOwov2@ZWa z*$5dh0oe-mDUOVV5?Lg8gBr#`lN-p9_?K}gfG74*iVdhxd17Wsd_hrWevu8L<h28p zEa1`xRG;aAr*`pW2wV!F?gN_ynd^XbwTr>#D;I#8J&6jXc}4j-IbcO-Wk+Tnq)dm_ z5^yd!OJEieaCvQ81<WmfhGq(&y?d~FL;<oHLm?5=_(E<Lf!aZ!5LHkBFP~RPLRSD9 zRRp^hl3PIy2gr1SLNymBst=$OG@$tz<d&8-XqZ=9lZ#UUyzmNTk_qCT%2*u*c+)Z( zk~L$ak=r9!>U;PYGHgyN7ZxHQ0Ud?n%&OG<G(^z`o4e9f&{6<xsD$>jG_AQf5dahv z=-z_P>VT$!kh~RP4H{?0;VnogR^ap&tU-puTi`)#kS{<JZbW(uyrc`=XXsu5$B>Rf zCM1dBi73>F#Tx=3<3JN*oS?AF%}oT)@@art-<h#G&;$pHU__We<e<$71%+A#&}c1Y 
za;gHk5@b4f+OJB%M#0!xp$asJ3{s0eh7I-_D5+uYLIrJOHPQsh!QBZekBCl)`dUay zQA=OJ$V5R4<Pbdb8YD&@5or?LzesTc+wTjSI)zQanrI@WPb?V^5gV8p58_{F8YSpo zDkfHRKav)wIMb|=g0_MYXtoIwB%mY(PY4PMq$OF-T4-Sip+O{CWrUbhSHP$R5LE<d zehE>Qg3UyAKiKaer-D;IuB-~mbLgRllrk{OE0Bp`&x4zF@X=P#+6Wxut!TprXk4V6 z0Ue2j^m$>kyT}~~9R*Ms3XTcTWCyeWMoqQgG1+KP$$>oO1(`7hml&Y30!wh=E-b)` z(ZdKFx}Xw)U}=FGD@b7r^M?^es-jZRLW&t3XmLXzKna8!D2+l-TR>W$4)0wer4!@@ zC!odHk;$Od*htG#K!OqIg2AA5${0$)q7b!sN4J7flS{y3U{o94!na5Vv6vb(wgp?@ ztDu$$UB!no`h{;ePr)9v*&ckp0erd_JiQGbilF(p85TPsBPys8pmq{?j0}2+1gb39 zsh})q2C6wgqi=?gkuK0^8mOX1P8y&sJfQX=Xftdvv|)&vKalbjvPMu&1C=_U;tEv* zQiej(0B>A@G+>OzmF7lQ#zuo$SLma1prI&`7F+NNli<v%RM6@&@CcntX0BePf)>(( zjw;YfGx)-zO3(x>q`?jv%tIX-!#Uaq&q1Ja5L#m@*dh-!f~rT1L<ETwWXE7Gf<O$# zAq5O*-2ztAKoSVoSAoZcuneL_VvN^OJ}?(SnVHZz5FE+U1hmTv#Un=GamPr+aG)_L z=0JPMVZ(lq`A^Wka!7ta9bq)aGQtQkGlDWRAv0JQv4fPuxD*snykdgx74RwmWUml0 zSPE-XfWi^SK**39$c!{Vj4nDtM>1hWGub1V&>3J*-36(oKr6u@%W`lgJm_Q`4tX=k zGM<v$f=Hy{P}({eQiFpExRTt02&8bL+K4Z7)*2K}kb0YlZ~`~5Kn_9*2iiIa5)NQ{ z@edG35<NmpqXYs;9-!VP5u?;t-2m|`cvLQeXy1?-jgUwr(zRIafVdV`B%!yu!J~)J zabW}GQEu=M7qO$h$QrTClt6}$P)44?M-Y)dJRWJF2X4wcYQW1Mh~c2|aZquKyR0{X zP303)df>4H5}t@04_1nGyq=JIz_|qy!FUXT*oTNeqycnr<iSR;QG$2K5A0K3<-o!P zZActzj;2Pn11=9JuXsptHY90)k|Nl0%Bvq7mSZhf2!t6paYLL%aV12GvmjxDHhF@( z(m}~W^iC=`igl9R1o1O+Iv~SjiX)j?c0zmuE$~2|0*#bprYYcPNkiw6kY}W!LWHN5 zA`EcOEs;=o6PSI1`UIRIFoqwwhQTBh>hJ-9@di#%GQl;@k6x*hHsOSQS_W<=?%5`A z)S$PF2n>T@`UBtiJ@Qxt&1aZEc?R7#gkl9X4hi`kZHOK^^n;i=L<9wByb3H09w`Pb z76D=K5+Bf-a8zpvI1=P9R1E~&3DSTb5YUOXOa&Xz(lXF&o<Xc$UTJPTs0o&so*JK; zlL{V8vsTCiO>A*-g60L_-q5Rv&Wwf3wt_u@GGbm+1DbyVk8Rl6Dp-J~S7BSazyn^M zDHXOFswp~Pq^VGnnF89!1>U^{-m{en;+EN}rhs>B!L)$(Q^DDlI*8S6RXQLeV588P zknv`4V1r!=>L-GS<~bD<$Z|#+^_+pcR1+L-u)>_UsTtH*r%f^e#eXWvvoBb7(t+|i zbfggL{0rzT6!0)SWY`0;5FRW5>E9#8qYld6QE+rC=)(t6p*w9;D@sy}@)VSul@zK$ zXNjff<R>NO#K&{R$LA!L<|SvuC*_ysr4+}<gJvZmD`U05Cp_ZrEEyXq*egILX}Q1) zUz1_WUP+!0AhhHZw00DSL%=JsP`hiOVcJN@@>1lDZ{Ylayz;XGy4VVb$+qA_c_H}~ 
z)dKj~Ab6M)K9~%)0OVBo?pI9(jpD+hk~9qq1uaOP1f5K!si3Q%X9)sn;3KC&ixE)` z0WGlwo2H-((FPhP)&Pqjc3h)qLz&-$YsWf@4R$@!o--VN1CNxUIt$Hj5U+t<1unT! z6~i5Y><6&oNFrShN{ZMV2Rfx2<bH60fz;JS2?LN@A+{-Kfl6E~27yQ0<KrR6HOI#* zl;&mT=RsF+LeElzto~I%TUZUBe1f$bKpp~BBuH~qxDyq4%ElNpXM<f0@(dJgK@-G+ zTF~AZ$dnsI0Iw_I%fF!uEg@QJ6-uBh*CBk|yU^ha!a)ja6-pK2;}c6tiZYW*OH$+G zH8c`)z$d6`7@BE<#^#}W!cih20yIU1A`+<yF50pC9HiRV2>$>sILj$0AT6?oPea-u z&$xhhxWgkIvPKZjMNIdBtTdpV+O))+;#51dJyFp3!0bt)gcRsp0=**G6ssOAsp%+` zB8kG%2S^lo<{eV6<L!Qf^EYI57;<dj4k<-j1@QUmDA^LSYZP0+q95QD0lHKG#U?C8 zH@IxVnPD{{tNc-7AGXE9hJ>Jw&;c(jL7I3&ol^v_az~!#1I;-agH{e8w1D<NBX|{{ z2!LfQJ$M9{>Y+snsE`1+w?GW+{wOIbB{IbfoHwA|RrLNZ{<OoT09i2xYAd6p1Mo01 zF-a(r3P}jukHcDepr&i^;4?NyK^7$;djMRTKpGvokXs~h$bxq{fYYgt0w|g>${h_* z8pEzx6IxP%^bJ&j0<Cc22^ecIq87p+=fXUTU=NH3K>NRHxxm+^;31f?+`Iz1i$ws7 zxH1C+g8;+F|2nCt2D*U785z*U8RC;N^Ad|H<4cm0<3pTb`^55#;*(3`i%SxVO7N@5 ztV+eJAT=+AfkCghGPfi#i9xTp1avb6gI;k4j0HL*3e~0L@uV3Tz@fp6rX6x84FiV1 zLE(q26vSsj7J#w~py5~n<%7%zspEk1C7?7X0|Ns{4$6g7FQEEU;35zP$UX=QLJH#0 Q?*qMGM-ie7PNM4v0D^MdDF6Tf literal 0 HcmV?d00001 diff --git a/libtcc/TCCorrelator.cu.orig b/libtcc/TCCorrelator.cu.orig new file mode 100644 index 0000000..ddf921c --- /dev/null +++ b/libtcc/TCCorrelator.cu.orig @@ -0,0 +1,587 @@ +#if 1000 * __CUDACC_VER_MAJOR__ + __CUDACC_VER_MINOR__ >= 11001 && __CUDA_ARCH__ >= 800 +#define ASYNC_COPIES +#endif + +#if defined ASYNC_COPIES +#include <cooperative_groups/memcpy_async.h> +#endif + +#include <mma.h> + +#define NR_BASELINES (NR_RECEIVERS * (NR_RECEIVERS + 1) / 2) +#define ALIGN(A,N) (((A)+(N)-1)/(N)*(N)) + +#define NR_TIMES_PER_BLOCK (128 / (NR_BITS)) +#define NR_RECEIVERS_PER_TCM_X ((NR_BITS) == 4 ? 2 : 4) +#define NR_RECEIVERS_PER_TCM_Y ((NR_BITS) == 4 ? 4 : 8) + +#define COMPLEX 2 + +#if __CUDA_ARCH__ < (NR_BITS == 4 ? 730 : NR_BITS == 8 ? 720 : NR_BITS == 16 ? 
700 : 0) +#error this architecture has no suitable tensor cores +#endif + +#if __CUDA_ARCH__ != 700 && __CUDA_ARCH__ != 720 && __CUDA_ARCH__ != 750 && __CUDA_ARCH__ != 800 && __CUDA_ARCH__ != 860 +#define PORTABLE // unknown architecture -> write visibilities in portable way (via shared memory) +#endif + +#if NR_RECEIVERS_PER_BLOCK != 32 && NR_RECEIVERS_PER_BLOCK != 48 && NR_RECEIVERS_PER_BLOCK != 64 +#error unsupported NR_RECEIVERS_PER_BLOCK +#endif + +#if NR_SAMPLES_PER_CHANNEL % NR_TIMES_PER_BLOCK != 0 +#error NR_SAMPLES_PER_CHANNEL should be a multiple of NR_TIMES_PER_BLOCK +#endif + +#define MIN(A,B) ((A)<(B)?(A):(B)) + + +using namespace nvcuda::wmma; + +#if NR_BITS == 4 +typedef char Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef int2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#elif NR_BITS == 8 +typedef char2 Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef int2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#elif NR_BITS == 16 +typedef __half2 Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef float2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#endif + + +#if NR_BITS == 4 +typedef fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major> Afrag; +typedef fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major> Bfrag; +typedef fragment<accumulator, 8, 8, 32, int> Sum; +#elif NR_BITS == 8 +typedef fragment<matrix_a, 16, 16, 16, signed char, row_major> Afrag; +typedef fragment<matrix_b, 16, 16, 16, signed char, col_major> Bfrag; +typedef fragment<accumulator, 16, 16, 16, int> Sum; +#elif NR_BITS == 16 +typedef fragment<matrix_a, 16, 16, 16, __half, row_major> Afrag; +typedef fragment<matrix_b, 16, 16, 16, 
__half, col_major> Bfrag; +typedef fragment<accumulator, 16, 16, 16, float> Sum; +#endif + + +#if NR_BITS == 4 +typedef int2 ScratchSpace[4][NR_POLARIZATIONS][2][NR_POLARIZATIONS]; +#elif NR_BITS == 8 +typedef int2 ScratchSpace[8][NR_POLARIZATIONS][4][NR_POLARIZATIONS]; +#elif NR_BITS == 16 +typedef float2 ScratchSpace[8][NR_POLARIZATIONS][4][NR_POLARIZATIONS]; +#endif + + +__device__ inline int conj_perm(int v) +{ +#if NR_BITS == 4 + //return ((v & 0x0F0F0F0F) << 4) | (__vnegss4(v >> 4) & 0x0F0F0F0F); + return ((v & 0x0F0F0F0F) << 4) | ((0xF0F0F0F0 - ((v >> 4) & 0x0F0F0F0F)) & 0x0F0F0F0F); +#elif NR_BITS == 8 + //return __byte_perm(v, __vnegss4(v), 0x2705); + return __byte_perm(v, 0x00FF00FF - (v & 0xFF00FF00), 0x2705); +#elif NR_BITS == 16 + return __byte_perm(v ^ 0x80000000, v, 0x1032); +#endif +} + + +__device__ inline int2 conj_perm(int2 v) +{ + return make_int2(conj_perm(v.x), conj_perm(v.y)); +} + + +__device__ inline int4 conj_perm(int4 v) +{ + return make_int4(conj_perm(v.x), conj_perm(v.y), conj_perm(v.z), conj_perm(v.w)); +} + + +#if defined ASYNC_COPIES +#define READ_AHEAD MIN(2, NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) +#define NR_SHARED_BUFFERS MIN(4, NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) +#else +#define READ_AHEAD 1 +#define NR_SHARED_BUFFERS 2 +#endif + + +template <unsigned nrReceiversPerBlock = NR_RECEIVERS_PER_BLOCK> struct SharedData +{ +#if NR_BITS == 4 + typedef char Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][1]; + typedef char Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 16][1]; +#elif NR_BITS == 8 + typedef signed char Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][COMPLEX]; + typedef signed char Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 8][COMPLEX]; +#elif NR_BITS == 16 + typedef __half 
Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][COMPLEX]; + typedef __half Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 4][COMPLEX]; +#endif +}; + + +template <typename T> struct FetchData +{ + __device__ FetchData(unsigned loadRecv, unsigned loadPol, unsigned loadTime) + : + loadRecv(loadRecv), loadPol(loadPol), loadTime(loadTime), data({0}) + { + } + + __device__ void load(const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS) + data = * (T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime]; + } + + template <typename SharedData> __device__ void storeA(SharedData samples) const + { + * ((T *) &samples[loadRecv][loadPol][loadTime][0]) = data; + } + + template <typename SharedData> __device__ void storeB(SharedData samples) const + { + * ((T *) &samples[loadRecv][loadPol][0][loadTime][0]) = data; + * ((T *) &samples[loadRecv][loadPol][1][loadTime][0]) = conj_perm(data); + } + +#if defined ASYNC_COPIES + template <typename Asamples> __device__ void copyAsyncA(nvcuda::experimental::pipeline &pipe, Asamples dest, const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS) + nvcuda::experimental::memcpy_async(* (T *) &dest[loadRecv][loadPol][loadTime][0], * (const T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime], pipe); + } + + template<typename Bsamples> __device__ void copyAsyncB(nvcuda::experimental::pipeline &pipe, Bsamples dest, const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < 
NR_RECEIVERS) + nvcuda::experimental::memcpy_async(* (T *) &dest[loadRecv][loadPol][0][loadTime][0], * (const T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime], pipe); + } + + template<typename Bsamples> __device__ void fixB(Bsamples bSamples) + { + * ((T *) &bSamples[loadRecv][loadPol][1][loadTime][0]) = conj_perm(* ((T *) &bSamples[loadRecv][loadPol][0][loadTime][0])); + } +#endif + + unsigned loadRecv, loadPol, loadTime; + T data; +}; + + +__device__ inline int2 make_complex(int real, int imag) +{ + return make_int2(real, imag); +} + + +__device__ inline float2 make_complex(float real, float imag) +{ + return make_float2(real, imag); +} + + +template <typename T> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned baseline, unsigned recvY, unsigned recvX, unsigned tcY, unsigned tcX, unsigned polY, unsigned polX, bool skipCheckY, bool skipCheckX, T sumR, T sumI) +{ + if ((skipCheckX || recvX + tcX <= recvY + tcY) && (skipCheckY || recvY + tcY < NR_RECEIVERS)) + visibilities[channel][baseline + tcY * recvY + tcY * (tcY + 1) / 2 + tcX][polY][polX] = make_complex(sumR, sumI); +} + + +__device__ inline void storeVisibilities(Visibilities visibilities, unsigned channel, unsigned firstReceiverY, unsigned firstReceiverX, unsigned recvYoffset, unsigned recvXoffset, unsigned y, unsigned x, bool skipCheckY, bool skipCheckX, const Sum &sum, ScratchSpace scratchSpace[], unsigned warp) +{ +#if defined PORTABLE + store_matrix_sync(&scratchSpace[warp][0][0][0][0].x, sum, NR_BITS == 4 ? 
8 : 16, mem_row_major); + __syncwarp(); + +#if 0 + if (threadIdx.x == 0) + for (unsigned _y = 0; _y < 8; _y ++) + for (unsigned pol_y = 0; pol_y < NR_POLARIZATIONS; pol_y ++) + for (unsigned _x = 0; _x < 4; _x ++) + for (unsigned pol_x = 0; pol_x < NR_POLARIZATIONS; pol_x ++) + if (scratchSpace[warp][_y][pol_y][_x][pol_x],x != 0 || scratchSpace[warp][_y][pol_y][_x][pol_x].y != 0) + printf("firstY=%u firstX=%u warp=%u y=%u x=%u _y=%u pol_y=%u _x=%u pol_x=%u val=(%f,%f)\n", firstReceiverY, firstReceiverX, warp, y, x, _y, pol_y, _x, pol_x, scratchSpace[warp][_y][pol_y][_x][pol_x].x, scratchSpace[warp][_y][pol_y][_x][pol_x].y); +#endif + +#if NR_BITS == 4 + unsigned _y = threadIdx.x >> 3; + unsigned _x = (threadIdx.x >> 2) & 1; + unsigned polY = (threadIdx.x >> 1) & 1; + unsigned polX = threadIdx.x & 1; +#elif NR_BITS == 8 || NR_BITS == 16 + unsigned _y = threadIdx.x >> 2; + unsigned _x = threadIdx.x & 3; +#endif + + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + _y; + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + _x; + unsigned baseline = (recvY * (recvY + 1) / 2) + recvX; + + if ((skipCheckX || recvX <= recvY) && (skipCheckY || recvY < NR_RECEIVERS)) +#if NR_BITS == 4 + visibilities[channel][baseline][polY][polX] = scratchSpace[warp][_y][polY][_x][polX]; +#elif NR_BITS == 8 || NR_BITS == 16 + for (unsigned polY = 0; polY < NR_POLARIZATIONS; polY ++) + for (unsigned polX = 0; polX < NR_POLARIZATIONS; polX ++) + visibilities[channel][baseline][polY][polX] = scratchSpace[warp][_y][polY][_x][polX]; +#endif +#else +#if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16) + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 2) + (threadIdx.x & 4); + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 2) & 2); + unsigned polY = threadIdx.x & 1; + unsigned polX = (threadIdx.x >> 1) & 1; +#elif (__CUDA_ARCH__ == 720 
&& NR_BITS == 8) || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 860 + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 3); + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 1) & 1); + unsigned polY = (threadIdx.x >> 2) & 1; + unsigned polX = threadIdx.x & 1; +#endif + + unsigned baseline = (recvY * (recvY + 1) / 2) + recvX; + +#if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16) + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 1, polY, polX, skipCheckY, skipCheckX, sum.x[4], sum.x[5]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 1, 0, polY, polX, skipCheckY, skipCheckX, sum.x[2], sum.x[3]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 1, 1, polY, polX, skipCheckY, skipCheckX, sum.x[6], sum.x[7]); +#elif (__CUDA_ARCH__ == 720 && NR_BITS == 8) || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 860 + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); +#if NR_BITS == 8 || NR_BITS == 16 + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 2, polY, polX, skipCheckY, skipCheckX, sum.x[4], sum.x[5]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 4, 0, polY, polX, skipCheckY, skipCheckX, sum.x[2], sum.x[3]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 4, 2, polY, polX, skipCheckY, skipCheckX, sum.x[6], sum.x[7]); +#endif +#endif +#endif +} + + +#define NR_WARPS 4 + +#if NR_RECEIVERS_PER_BLOCK == 64 + +template <bool fullTriangle> __device__ void doCorrelateTriangle(Visibilities visibilities, const Samples samples, unsigned firstReceiver, unsigned warp, unsigned tid, SharedData<>::Bsamples &bSamples, ScratchSpace 
scratchSpace[NR_WARPS]) +{ + const unsigned nrFragmentsX = NR_BITS == 4 ? 12 : 6; + const unsigned nrFragmentsY = nrFragmentsX / 2; + Sum sum[nrFragmentsX * nrFragmentsY]; + + for (auto &s : sum) + fill_fragment(s, 0); + + unsigned channel = blockIdx.y; + + const uchar2 offsets[] = { + make_uchar2( 0, 0), + make_uchar2( 0, 16), + make_uchar2( 0, 40), + make_uchar2(24, 40), + }; + + unsigned recvXoffset = offsets[warp].x; + unsigned recvYoffset = offsets[warp].y; + + FetchData<int4> tmp0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + FetchData<int4> tmp1((tid >> 2) + NR_RECEIVERS_PER_BLOCK / 2, (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + +#if defined ASYNC_COPIES + using namespace nvcuda::experimental; + pipeline pipe; + + for (unsigned majorTime = 0; majorTime < READ_AHEAD; majorTime ++) { + unsigned fetchBuffer = majorTime; + + tmp0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiver, fullTriangle); + tmp1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiver, fullTriangle); + + pipe.commit(); + } +#else + tmp0.load(samples, channel, 0, firstReceiver, fullTriangle); + tmp1.load(samples, channel, 0, firstReceiver, fullTriangle); +#endif + + for (unsigned majorTime = 0; majorTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; majorTime ++) { + unsigned buffer = majorTime % NR_SHARED_BUFFERS; + +#if !defined ASYNC_COPIES + tmp0.storeB(bSamples[buffer]); + tmp1.storeB(bSamples[buffer]); +#endif + + unsigned majorReadTime = majorTime + READ_AHEAD; + + if (majorReadTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) { +#if defined ASYNC_COPIES + unsigned fetchBuffer = (buffer + READ_AHEAD) % NR_SHARED_BUFFERS; + + tmp0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiver, fullTriangle); + tmp1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiver, fullTriangle); +#else + tmp0.load(samples, channel, majorReadTime, firstReceiver, 
fullTriangle); + tmp1.load(samples, channel, majorReadTime, firstReceiver, fullTriangle); +#endif + } + +#if defined ASYNC_COPIES + pipe.commit(); + pipe.wait_prior<READ_AHEAD>(); + + tmp0.fixB(bSamples[buffer]); + tmp1.fixB(bSamples[buffer]); +#endif + + __syncthreads(); + +#pragma unroll + for (unsigned minorTime = 0; minorTime < NR_TIMES_PER_BLOCK; minorTime += ((NR_BITS) == 4 ? 16 : 8)) { + Afrag aFrag[nrFragmentsY]; + Bfrag bFrag[nrFragmentsX]; + + if (warp != 0) { + for (unsigned y = 0; y < nrFragmentsY; y ++) + load_matrix_sync(aFrag[y], &bSamples[buffer][recvYoffset + NR_RECEIVERS_PER_TCM_Y * y][0][0][minorTime][0], sizeof(bSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < nrFragmentsX; x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][recvXoffset + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0, i = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++, i ++) + mma_sync(sum[i], aFrag[y], bFrag[x], sum[i]); + } else { + for (unsigned z = 0, i = 0; z < 3; z ++) { + for (unsigned y = 0; y < (NR_BITS == 4 ? 4 : 2); y ++) + load_matrix_sync(aFrag[y], &bSamples[buffer][/*recvYoffset*/ 24 * z + NR_RECEIVERS_PER_TCM_Y * y][0][0][minorTime][0], sizeof(bSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < (NR_BITS == 4 ? 8 : 4); x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][/*recvXoffset*/ 24 * z + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0; y < (NR_BITS == 4 ? 
4 : 2); y ++) + for (unsigned x = 0; x < 2 + 2 * y; x ++, i ++) + mma_sync(sum[i], aFrag[y], bFrag[x], sum[i]); + } + } + } + } + +#if defined PORTABLE + __syncthreads(); +#endif + + if (warp != 0) + for (unsigned y = 0, i = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++, i ++) + storeVisibilities(visibilities, channel, firstReceiver, firstReceiver, recvYoffset, recvXoffset, y, x, fullTriangle, x < 2 * y + (NR_BITS == 4 ? 8 : 4), sum[i], scratchSpace, warp); + else + for (unsigned z = 0, i = 0; z < 3; z ++) + for (unsigned y = 0; y < (NR_BITS == 4 ? 4 : 2); y ++) + for (unsigned x = 0; x < 2 * y + 2; x ++, i ++) + storeVisibilities(visibilities, channel, firstReceiver, firstReceiver, 24 * z, 24 * z, y, x, fullTriangle, x < 2 * y, sum[i], scratchSpace, warp); +} + +#endif + + +template <unsigned nrFragmentsY, bool skipLoadYcheck, bool skipLoadXcheck, bool skipStoreYcheck, bool skipStoreXcheck> __device__ void doCorrelateRectangle(Visibilities visibilities, const Samples samples, unsigned firstReceiverY, unsigned firstReceiverX, SharedData<>::Asamples &aSamples, SharedData<NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK>::Bsamples &bSamples, ScratchSpace scratchSpace[NR_WARPS]) +{ + const unsigned nrFragmentsX = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_X / 2 / (NR_RECEIVERS_PER_BLOCK == 64 ? 
2 : 1); + + Sum sum[nrFragmentsY][nrFragmentsX]; + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + fill_fragment(sum[y][x], 0); + + unsigned tid = warpSize * (blockDim.y * threadIdx.z + threadIdx.y) + threadIdx.x; + unsigned channel = blockIdx.y; + + unsigned recvXoffset = nrFragmentsX * NR_RECEIVERS_PER_TCM_X * threadIdx.y; + unsigned recvYoffset = nrFragmentsY * NR_RECEIVERS_PER_TCM_Y * threadIdx.z; + + FetchData<int4> tmpY0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + FetchData<int4> tmpX0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); +#if NR_RECEIVERS_PER_BLOCK == 48 + FetchData<int2> tmpY1((tid >> 3) + 32, (tid >> 2) & 1, 32 / NR_BITS * (tid & 3)); + FetchData<int2> tmpX1((tid >> 3) + 32, (tid >> 2) & 1, 32 / NR_BITS * (tid & 3)); +#elif NR_RECEIVERS_PER_BLOCK == 64 + FetchData<int4> tmpY1((tid >> 2) + 32, (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); +#endif + +#if defined ASYNC_COPIES + using namespace nvcuda::experimental; + pipeline pipe; + + for (unsigned majorTime = 0; majorTime < READ_AHEAD; majorTime ++) { + unsigned fetchBuffer = majorTime; + + tmpY0.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiverX, skipLoadXcheck); +#endif + + pipe.commit(); + } +#else + tmpY0.load(samples, channel, 0, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.load(samples, channel, 0, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.load(samples, channel, 0, firstReceiverX, 
skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.load(samples, channel, 0, firstReceiverX, skipLoadXcheck); +#endif +#endif + + for (unsigned majorTime = 0; majorTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; majorTime ++) { + unsigned buffer = majorTime % NR_SHARED_BUFFERS; + +#if !defined ASYNC_COPIES + tmpY0.storeA(aSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.storeA(aSamples[buffer]); +#endif + tmpX0.storeB(bSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.storeB(bSamples[buffer]); +#endif +#endif + + unsigned majorReadTime = majorTime + READ_AHEAD; + + if (majorReadTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) { +#if defined ASYNC_COPIES + unsigned fetchBuffer = (buffer + READ_AHEAD) % NR_SHARED_BUFFERS; + + tmpY0.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#endif +#else + tmpY0.load(samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.load(samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.load(samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.load(samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#endif +#endif + } + +#if defined ASYNC_COPIES + pipe.commit(); + pipe.wait_prior<READ_AHEAD>(); + + tmpX0.fixB(bSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 + 
tmpX1.fixB(bSamples[buffer]); +#endif +#endif + + __syncthreads(); + +#pragma unroll + for (unsigned minorTime = 0; minorTime < NR_TIMES_PER_BLOCK; minorTime += ((NR_BITS) == 4 ? 16 : 8)) { + Afrag aFrag[nrFragmentsY]; + Bfrag bFrag[nrFragmentsX]; + + for (unsigned y = 0; y < nrFragmentsY; y ++) + load_matrix_sync(aFrag[y], &aSamples[buffer][recvYoffset + NR_RECEIVERS_PER_TCM_Y * y][0][minorTime][0], sizeof(aSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < nrFragmentsX; x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][recvXoffset + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + mma_sync(sum[y][x], aFrag[y], bFrag[x], sum[y][x]); + } + } + +#if 0 + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + for (unsigned i = 0; i < sum[0][0].num_storage_elements; i ++) + if (sum[y][x].x[i] != 0) +#if NR_BITS == 4 || NR_BITS == 8 + printf("blockIdx=(%d,%d,%d) tid=%u y=%u x=%u i=%u v=%d\n", blockIdx.x, blockIdx.y, blockIdx.z, tid, y, x, i, sum[y][x].x[i]); +#else + printf("blockIdx=(%d,%d,%d) tid=%u y=%u x=%u i=%u v=%f\n", blockIdx.x, blockIdx.y, blockIdx.z, tid, y, x, i, sum[y][x].x[i]); +#endif +#endif + +#if defined PORTABLE + __syncthreads(); +#endif + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + storeVisibilities(visibilities, channel, firstReceiverY, firstReceiverX, recvYoffset, recvXoffset, y, x, skipStoreYcheck, skipStoreXcheck, sum[y][x], scratchSpace, tid / warpSize); +} + + +extern "C" __global__ +__launch_bounds__(NR_WARPS * 32, NR_RECEIVERS_PER_BLOCK == 32 ? 
4 : 2) +void correlate(Visibilities visibilities, const Samples samples) +{ + const unsigned nrFragmentsY = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_Y / 2; + + unsigned block = blockIdx.x; + +#if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 + unsigned blockY = (unsigned) (sqrtf(8 * block + 1) - .99999f) / 2; + unsigned blockX = block - blockY * (blockY + 1) / 2; + unsigned firstReceiverX = blockX * NR_RECEIVERS_PER_BLOCK; +#elif NR_RECEIVERS_PER_BLOCK == 64 + unsigned blockY = (unsigned) sqrtf(block); + unsigned blockX = block - blockY * blockY; + unsigned firstReceiverX = blockX * (NR_RECEIVERS_PER_BLOCK / 2); +#endif + unsigned firstReceiverY = blockY * NR_RECEIVERS_PER_BLOCK; + + union shared { + struct { + SharedData<>::Asamples aSamples; + SharedData<NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK>::Bsamples bSamples; + } rectangle; + struct { + SharedData<>::Bsamples samples; + } triangle; + ScratchSpace scratchSpace[NR_WARPS]; + }; + + // the following hack is necessary to run the correlator in the OpenCL environment, + // as the maximum local memory size is 48K - 16 bytes. Due to padding in bSamples, + // the last 16 bytes are not used, so allocate 16 fewer bytes. 
+ __shared__ char rawbuffer[sizeof(union shared) - 16] __attribute__((aligned(16))); + union shared &u = (union shared &) rawbuffer; + + if (firstReceiverX == firstReceiverY) +#if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 + doCorrelateRectangle<nrFragmentsY, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, false>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +#elif NR_RECEIVERS_PER_BLOCK == 64 + if (NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK != 0 && (NR_RECEIVERS < NR_RECEIVERS_PER_BLOCK || firstReceiverX >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK * NR_RECEIVERS_PER_BLOCK)) + doCorrelateTriangle<false>(visibilities, samples, firstReceiverX, 2 * threadIdx.z + threadIdx.y, 64 * threadIdx.z + 32 * threadIdx.y + threadIdx.x, u.triangle.samples, u.scratchSpace); + else + doCorrelateTriangle<true>(visibilities, samples, firstReceiverX, 2 * threadIdx.z + threadIdx.y, 64 * threadIdx.z + 32 * threadIdx.y + threadIdx.x, u.triangle.samples, u.scratchSpace); +#endif +#if NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK != 0 + else if (NR_RECEIVERS < NR_RECEIVERS_PER_BLOCK || firstReceiverY >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK * NR_RECEIVERS_PER_BLOCK) + doCorrelateRectangle<(NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK + 2 * NR_RECEIVERS_PER_TCM_Y - 1) / NR_RECEIVERS_PER_TCM_Y / 2, false, true, NR_RECEIVERS % (2 * NR_RECEIVERS_PER_TCM_Y) == 0, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +#endif + else + doCorrelateRectangle<nrFragmentsY, true, true, true, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +} diff --git a/test/Common/ComplexInt4.h b/test/Common/ComplexInt4.h new file mode 100644 index 0000000..e448428 --- /dev/null +++ b/test/Common/ComplexInt4.h @@ -0,0 +1,21 @@ 
+#if !defined COMPLEX_INT4_T +#define COMPLEX_INT4_T + +#include <complex> +#include <limits> + +class complex_int4_t +{ + public: + complex_int4_t() {} + complex_int4_t(int real, int imag) { value = (imag << 4) | (real & 0xF); } + complex_int4_t operator = (const complex_int4_t &other) { value = other.value; return *this; } + int real() const { return value << (std::numeric_limits<int>::digits - 4) >> (std::numeric_limits<int>::digits - 4); } + int imag() const { return value << (std::numeric_limits<int>::digits - 8) >> (std::numeric_limits<int>::digits - 4); } + operator std::complex<int> () const { return std::complex<int>(real(), imag()); } + + private: + char value; +}; + +#endif diff --git a/test/Common/Config.h b/test/Common/Config.h new file mode 100644 index 0000000..0ecedb4 --- /dev/null +++ b/test/Common/Config.h @@ -0,0 +1,22 @@ +#if !defined CONFIG_H +#define CONFIG_H + +#if defined __ARM_ARCH +#define UNIFIED_MEMORY // assume this is a Jetson Xavier +#endif + +#undef MEASURE_POWER + +#define POL_X 0 +#define POL_Y 1 +#define NR_POLARIZATIONS 2 + +#if !defined NR_TIMES +#define NR_TIMES 768 +#endif + +#define REAL 0 +#define IMAG 1 +#define COMPLEX 2 + +#endif diff --git a/test/Common/Record.cc b/test/Common/Record.cc new file mode 100644 index 0000000..8171486 --- /dev/null +++ b/test/Common/Record.cc @@ -0,0 +1,35 @@ +#include "test/Common/Record.h" + + +#if defined MEASURE_POWER +Record::Record(powersensor::PowerSensor &powerSensor) +: + powerSensor(powerSensor) +{ +} +#endif + + +#if defined MEASURE_POWER +void Record::getPower(CUstream, CUresult, void *userData) +{ + Record *record = (Record *) userData; + record->state = record->powerSensor.read(); +} + +#endif + + +void Record::enqueue(cu::Stream &stream) +{ + stream.record(event); // if this is omitted, the callback takes ~100 ms ?! 
+ +#if defined MEASURE_POWER +#if !defined TEGRA_QUIRKS + stream.addCallback(&Record::getPower, this); // does not work well on Tegra +#else + stream.synchronize(); + state = powerSensor.read(); +#endif +#endif +} diff --git a/test/Common/Record.h b/test/Common/Record.h new file mode 100644 index 0000000..2704748 --- /dev/null +++ b/test/Common/Record.h @@ -0,0 +1,33 @@ +#if !defined RECORD_H +#define RECORD_H + +#include "test/Common/Config.h" + +#include "util/cu.h" + +#if defined MEASURE_POWER +#include <powersensor/NVMLPowerSensor.h> +#endif + + +struct Record +{ + public: +#if defined MEASURE_POWER + Record(powersensor::PowerSensor &); +#endif + + void enqueue(cu::Stream &); + + mutable cu::Event event; + +#if defined MEASURE_POWER + powersensor::PowerSensor &powerSensor; + powersensor::State state; + + private: + static void getPower(CUstream, CUresult, void *userData); +#endif +}; + +#endif diff --git a/test/Common/UnitTest.cc b/test/Common/UnitTest.cc new file mode 100644 index 0000000..c05fb61 --- /dev/null +++ b/test/Common/UnitTest.cc @@ -0,0 +1,60 @@ +#include "test/Common/UnitTest.h" + +#include <iostream> + + +UnitTest::UnitTest(unsigned deviceNumber) +: + device(deviceNumber), + context(CU_CTX_SCHED_BLOCKING_SYNC, device) +#if defined MEASURE_POWER +, powerSensor(powersensor::nvml::NVMLPowerSensor::create(0)) +#endif +{ +#pragma omp critical (clog) + std::clog << "running test on " << device.getName() << std::endl; + +#if 0 && defined MEASURE_POWER + powerSensor->dump("/tmp/sensor_readings"); +#endif +} + + +UnitTest::~UnitTest() +{ +#if defined MEASURE_POWER + delete powerSensor; +#endif +} + + +void UnitTest::report(const char *name, const Record &startRecord, const Record &stopRecord, uint64_t FLOPS, uint64_t bytes) +{ +#if defined MEASURE_POWER + //powerSensor->mark(startRecord.state, name); + + double Watt = powersensor::PowerSensor::Watt(startRecord.state, stopRecord.state); +#endif + + double runtime = 
stopRecord.event.elapsedTime(startRecord.event) * 1e-3; + +#pragma omp critical (cout) + { + std::cout << name << ": " << runtime << " s"; + + if (FLOPS != 0) + std::cout << ", " << FLOPS / runtime * 1e-12 << " TOPS"; + + if (bytes != 0) + std::cout << ", " << bytes / runtime * 1e-9 << " GB/s"; + +#if defined MEASURE_POWER + std::cout << ", " << Watt << " W"; + + if (FLOPS != 0) + std::cout << ", " << FLOPS / runtime / Watt * 1e-9 << " GOPS/W"; +#endif + + std::cout << std::endl; + } +} diff --git a/test/Common/UnitTest.h b/test/Common/UnitTest.h new file mode 100644 index 0000000..dce8b47 --- /dev/null +++ b/test/Common/UnitTest.h @@ -0,0 +1,30 @@ +#if !defined UNIT_TEST_H +#define UNIT_TEST_H + +#include "test/Common/Record.h" +#include "util/cu.h" + +#if defined MEASURE_POWER +#include <powersensor/NVMLPowerSensor.h> +#endif + + +class UnitTest +{ + public: + UnitTest(unsigned deviceNumber); + ~UnitTest(); + + protected: + void report(const char *name, const Record &startRecord, const Record &stopRecord, uint64_t FLOPS = 0, uint64_t bytes = 0); + + cu::Device device; + cu::Context context; + cu::Stream stream; + +#if defined MEASURE_POWER + powersensor::PowerSensor *powerSensor; +#endif +}; + +#endif diff --git a/test/CorrelatorTest/CorrelatorTest.cc b/test/CorrelatorTest/CorrelatorTest.cc new file mode 100644 index 0000000..23bc7c1 --- /dev/null +++ b/test/CorrelatorTest/CorrelatorTest.cc @@ -0,0 +1,239 @@ +#include "test/Common/ComplexInt4.h" +#include "test/Common/Record.h" +#include "test/CorrelatorTest/CorrelatorTest.h" +#include "util/ExceptionPropagator.h" +#include "util/nvrtc.h" + +#include <cstdlib> +#include <cstring> +#include <iostream> + +#define GNU_SOURCE +#include <link.h> +#include <omp.h> + + +CorrelatorTest::CorrelatorTest(const Options &options) +: + UnitTest(options.deviceNumber), + options(options), + correlator(options.nrBits, options.nrReceivers, options.nrChannels, options.nrSamplesPerChannel, options.nrPolarizations, 
options.nrReceiversPerBlock) +{ +#if defined MEASURE_POWER + Record start(*powerSensor), stop(*powerSensor); +#else + Record start, stop; +#endif + + start.enqueue(stream); + + switch (options.nrBits) { + case 4 : doTest<complex_int4_t, std::complex<int32_t>>(); + break; + + case 8 : doTest<std::complex<int8_t>, std::complex<int32_t>>(); + break; + + case 16 : doTest<std::complex<__half>, std::complex<float>>(); + break; + } + + stop.enqueue(stream); + stream.synchronize(); + + report("total ", start, stop, options.innerRepeatCount * options.outerRepeatCount * correlator.FLOPS()); +} + + +template <typename SampleType, typename VisibilityType> void CorrelatorTest::doTest() +{ + omp_set_nested(1); + + ExceptionPropagator ep; + +#pragma omp parallel num_threads(2) + ep([&] () { + context.setCurrent(); + + multi_array::extent<5> samplesExtent(multi_array::extents[options.nrChannels][options.nrSamplesPerChannel / options.nrTimesPerBlock][options.nrReceivers][options.nrPolarizations][options.nrTimesPerBlock]); + multi_array::extent<4> visibilitiesExtent(multi_array::extents[options.nrChannels][options.nrBaselines()][options.nrPolarizations][options.nrPolarizations]); + + cu::HostMemory hostSamples(sizeof(SampleType) * samplesExtent.size, CU_MEMHOSTALLOC_WRITECOMBINED); + cu::HostMemory hostVisibilities(sizeof(VisibilityType) * visibilitiesExtent.size); + +#if defined UNIFIED_MEMORY + cu::DeviceMemory deviceSamples(hostSamples); + cu::DeviceMemory deviceVisibilities(hostVisibilities); +#else + cu::DeviceMemory deviceSamples(sizeof(SampleType) * samplesExtent.size); + cu::DeviceMemory deviceVisibilities(sizeof(VisibilityType) * visibilitiesExtent.size); + + cu::Stream hostToDeviceStream, deviceToHostStream; +#endif + + multi_array::array_ref<SampleType, 5> samplesRef(* (SampleType *) hostSamples, samplesExtent); + multi_array::array_ref<VisibilityType, 4> visibilitiesRef(* (VisibilityType *) hostVisibilities, visibilitiesExtent); + + 
setTestPattern<SampleType>(samplesRef); + +#pragma omp for schedule(dynamic), ordered + for (int i = 0; i < options.outerRepeatCount; i ++) + if (!ep) + ep([&] () { +#if !defined UNIFIED_MEMORY + cu::Event inputDataTransferred, executeFinished, executeFinished2; +#endif + +#if defined MEASURE_POWER + Record hostToDeviceRecordStart(*powerSensor), hostToDeviceRecordStop(*powerSensor); + Record computeRecordStart(*powerSensor), computeRecordStop(*powerSensor); + Record deviceToHostRecordStart(*powerSensor), deviceToHostRecordStop(*powerSensor); +#else + Record hostToDeviceRecordStart, hostToDeviceRecordStop; + Record computeRecordStart, computeRecordStop; + Record deviceToHostRecordStart, deviceToHostRecordStop; +#endif + +#pragma omp critical (GPU) // TODO: use multiple locks when using multiple GPUs + ep([&] () { +#if !defined UNIFIED_MEMORY + hostToDeviceRecordStart.enqueue(hostToDeviceStream); + hostToDeviceStream.memcpyHtoDAsync(deviceSamples, hostSamples, samplesRef.bytesize()); + hostToDeviceRecordStop.enqueue(hostToDeviceStream); + + stream.waitEvent(hostToDeviceRecordStop.event); +#endif + + computeRecordStart.enqueue(stream); + + for (unsigned j = 0; j < options.innerRepeatCount; j ++) + correlator.launchAsync(stream, deviceVisibilities, deviceSamples); + + computeRecordStop.enqueue(stream); + +#if !defined UNIFIED_MEMORY + deviceToHostStream.waitEvent(computeRecordStop.event); + deviceToHostRecordStart.enqueue(deviceToHostStream); + deviceToHostStream.memcpyDtoHAsync(hostVisibilities, deviceVisibilities, visibilitiesRef.bytesize()); + deviceToHostRecordStop.enqueue(deviceToHostStream); +#endif + }); + +#if !defined UNIFIED_MEMORY + deviceToHostStream.synchronize(); +#else + stream.synchronize(); +#endif + + if (i == options.outerRepeatCount - 1 && options.verifyOutput) + verifyOutput<SampleType, VisibilityType>(samplesRef, visibilitiesRef); + +#if !defined UNIFIED_MEMORY + report("host-to-device ", hostToDeviceRecordStart, hostToDeviceRecordStop, 0, 
samplesRef.bytesize()); +#endif + + report("correlate-total ", computeRecordStart, computeRecordStop, options.innerRepeatCount * correlator.FLOPS()); + +#if !defined UNIFIED_MEMORY + report("device-to-host ", deviceToHostRecordStart, deviceToHostRecordStop, 0, visibilitiesRef.bytesize()); +#endif + }); + }); +} + + +template<typename SampleType> void CorrelatorTest::setTestPattern(const multi_array::array_ref<SampleType, 5> &samples) +{ +#if 0 + memset(samples.begin(), 0, samples.bytesize()); + + unsigned channel = options.nrChannels / 3; + unsigned time = options.nrSamplesPerChannel / 5; + unsigned recv0 = options.nrReceivers > 174 ? 174 : options.nrReceivers / 3; + unsigned recv1 = options.nrReceivers > 418 ? 418 : options.nrReceivers / 2; + + samples[channel][time / options.nrTimesPerBlock][recv0][POL_X][time % options.nrTimesPerBlock] = SampleType(2.0, 3.0); + samples[channel][time / options.nrTimesPerBlock][recv1][POL_X][time % options.nrTimesPerBlock] = SampleType(4.0, 5.0); +#else + SampleType randomValues[7777]; // use a limited set of random numbers to save time + + for (unsigned i = 0; i < 7777; i ++) + randomValues[i] = randomValue<SampleType>(); + + unsigned i = 0; + + for (SampleType &sample : samples) + sample = randomValues[i ++ % 7777U]; +#endif +} + + +template<typename SampleType, typename VisibilityType> void CorrelatorTest::verifyOutput(const multi_array::array_ref<SampleType, 5> &samples, const multi_array::array_ref<VisibilityType, 4> &visibilities) const +{ + std::atomic<int> count(0); + ExceptionPropagator ep; + +#if 1 + std::cout << "verifying ..." 
<< std::endl; + +#pragma omp parallel for schedule (dynamic) + for (unsigned channel = 0; channel < options.nrChannels; channel ++) + ep([&] () { + multi_array::array<VisibilityType, 3> sum(multi_array::extents[options.nrBaselines()][options.nrPolarizations][options.nrPolarizations]); + + memset(sum.begin(), 0, sum.bytesize()); + + for (unsigned major_time = 0; major_time < options.nrSamplesPerChannel / options.nrTimesPerBlock; major_time ++) { + multi_array::array_ref<SampleType, 3> ref = samples[channel][major_time]; + + for (unsigned recv1 = 0, baseline = 0; recv1 < options.nrReceivers; recv1 ++) + for (unsigned recv0 = 0; recv0 <= recv1; recv0 ++, baseline ++) + for (unsigned minor_time = 0; minor_time < options.nrTimesPerBlock; minor_time ++) + for (unsigned pol0 = 0; pol0 < options.nrPolarizations; pol0 ++) + for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++) { + SampleType sample0 = ref[recv0][pol0][minor_time]; + SampleType sample1 = ref[recv1][pol1][minor_time]; + + sum[baseline][pol1][pol0] += VisibilityType(sample1.real(), sample1.imag()) * conj(VisibilityType(sample0.real(), sample0.imag())); + } + } + + for (unsigned baseline = 0; baseline < options.nrBaselines(); baseline ++) + for (unsigned pol0 = 0; pol0 < options.nrPolarizations; pol0 ++) + for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++) + if (!approximates(visibilities[channel][baseline][pol1][pol0], sum[baseline][pol1][pol0]) && ++ count < 100) +#pragma omp critical (cout) + ep([&] () { + std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol1 << "][" << pol0 << "], expected " << sum[baseline][pol1][pol0] << ", got " << visibilities[channel][baseline][pol1][pol0] << std::endl; + }); + }); + + std::cout << "#errors = " << count << std::endl; +#else + for (unsigned channel = 0; channel < options.nrChannels; channel ++) + for (unsigned baseline = 0; baseline < options.nrBaselines(); baseline ++) + for (unsigned pol0 = 0; pol0 < 
options.nrPolarizations; pol0 ++) + for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++) + if (visibilities[channel][baseline][pol0][pol1] != (VisibilityType)(0, 0)) + if (count ++ < 10) + std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol0 << "][" << pol1 << "] = " << visibilities[channel][baseline][pol0][pol1] << std::endl; +#endif +} + + +int main(int argc, char *argv[]) +{ + try { + cu::init(); + Options options(argc, argv); + CorrelatorTest test(options); + } catch (cu::Error &error) { + std::cerr << "cu::Error: " << error.what() << std::endl; + } catch (nvrtc::Error &error) { + std::cerr << "nvrtc::Error: " << error.what() << std::endl; + } catch (Options::Error &error) { + std::cerr << "Options::Error: " << error.what() << std::endl; + } + + return 0; +} diff --git a/test/CorrelatorTest/CorrelatorTest.h b/test/CorrelatorTest/CorrelatorTest.h new file mode 100644 index 0000000..7e2032c --- /dev/null +++ b/test/CorrelatorTest/CorrelatorTest.h @@ -0,0 +1,61 @@ +#if !defined CORRELATOR_TEST_H +#define CORRELATOR_TEST_H + +#include "test/Common/ComplexInt4.h" +#include "test/Common/UnitTest.h" +#include "test/CorrelatorTest/Options.h" +#include "libtcc/Correlator.h" +#include "util/multi_array.h" + +#include <cuda_fp16.h> + + +class CorrelatorTest : public UnitTest +{ + public: + CorrelatorTest(const Options &); + + private: + template<typename SampleType, typename VisibilityType> void doTest(); + template<typename SampleType> void setTestPattern(const multi_array::array_ref<SampleType, 5> &samples); + template<typename SampleType, typename VisibilityType> void verifyOutput(const multi_array::array_ref<SampleType, 5> &samples, const multi_array::array_ref<VisibilityType, 4> &visibilities) const; + + template<typename SampleType> static SampleType randomValue(); + template<typename VisibilityType> bool approximates(const VisibilityType &a, const VisibilityType &b) const; + + tcc::Correlator correlator; + Options options; +}; + + 
+template<> complex_int4_t CorrelatorTest::randomValue<complex_int4_t>() +{ + return complex_int4_t(16 * drand48() - 8, 16 * drand48() - 8); +} + + +template<> std::complex<int8_t> CorrelatorTest::randomValue<std::complex<int8_t>>() +{ + return std::complex<int8_t>(256 * drand48() - 128, 256 * drand48() - 128); +} + + +template<> std::complex<__half> CorrelatorTest::randomValue<std::complex<__half>>() +{ + return std::complex<__half>(drand48() - .5, drand48() - .5); +} + +template <typename VisibilityType> bool CorrelatorTest::approximates(const VisibilityType &a, const VisibilityType &b) const +{ + return a == b; +} + + +template <> bool CorrelatorTest::approximates(const std::complex<float> &a, const std::complex<float> &b) const +{ + float absolute = abs(a - b), relative = abs(a / b); + + return (relative > .999 && relative < 1.001) || absolute < .0001 * options.nrSamplesPerChannel; +} + +#endif diff --git a/test/CorrelatorTest/Options.cc b/test/CorrelatorTest/Options.cc new file mode 100644 index 0000000..2e27f1f --- /dev/null +++ b/test/CorrelatorTest/Options.cc @@ -0,0 +1,89 @@ +#include "test/CorrelatorTest/Options.h" + +#include <cstdlib> +#include <iostream> + +#include <unistd.h> + + +Options::Options(int argc, char *argv[]) +: + nrBits(8), + nrChannels(480), + nrReceivers(576), + nrReceiversPerBlock(64), + nrSamplesPerChannel(3072), + nrTimesPerBlock(128 / nrBits), + innerRepeatCount(1), outerRepeatCount(1), + deviceNumber(0), + verifyOutput(true) +{ + opterr = 0; + + for (int opt; (opt = getopt(argc, argv, "b:c:d:hn:N:r:R:t:V:")) >= 0;) + switch (opt) { + case 'b' : nrBits = atoi(optarg); + break; + + case 'c' : nrChannels = atoi(optarg); + break; + + case 'd' : deviceNumber = atoi(optarg); + break; + + case 'h' : std::cout << usage(argv[0]) << std::endl; + exit(0); + + case 'n' : nrReceivers = atoi(optarg); + break; + + case 'N' : nrReceiversPerBlock = atoi(optarg); + break; + + case 'r' : innerRepeatCount = atoi(optarg); + break; + + case 'R' : 
outerRepeatCount = atoi(optarg); + break; + + case 't' : nrSamplesPerChannel = atoi(optarg); + break; + + case 'V' : verifyOutput = atoi(optarg); + break; + + default : throw Error(usage(argv[0])); + } + + if (nrBits != 4 && nrBits != 8 && nrBits != 16) + throw Error("nrBits must be 4, 8, or 16"); + + if (nrChannels == 0) + throw Error("nrChannels must be > 0"); + + if (nrReceivers == 0) + throw Error("nrReceivers must be > 0"); + + if (nrReceiversPerBlock != 32 && nrReceiversPerBlock != 48 && nrReceiversPerBlock != 64) + throw Error("nrReceiversPerBlock must be 32, 48, or 64"); + + if (nrSamplesPerChannel == 0) + throw Error("nrSamplesPerChannel must be > 0"); + + nrTimesPerBlock = 128 / nrBits; + + if (nrSamplesPerChannel % nrTimesPerBlock != 0) + throw Error("nrSamplesPerChannel must be a multiple of " + std::to_string(nrTimesPerBlock)); +} + + +std::string Options::usage(const std::string &execName) +{ + return "usage: " + execName + " [-b nrBits] [-c nrChannels] [-n nrReceivers] [-N nrReceiversPerBlock] [-r innerRepeatCount] [-R outerRepeatCount] [-t nrSamplesPerChannel] [-V verifyOutput]"; +} + + +const char *Options::Error::what() const noexcept +{ + return msg.c_str(); +} diff --git a/test/CorrelatorTest/Options.h b/test/CorrelatorTest/Options.h new file mode 100644 index 0000000..9e51307 --- /dev/null +++ b/test/CorrelatorTest/Options.h @@ -0,0 +1,45 @@ +#if !defined OPTIONS_H +#define OPTIONS_H + +#include <exception> +#include <string> + + +class Options +{ + public: + class Error : public std::exception { + public: + Error(const std::string &msg) + : + msg(msg) + { + } + + virtual const char *what() const noexcept; + + private: + std::string msg; + }; + + Options(int argc, char *argv[]); + + unsigned nrBaselines() const { return nrReceivers * (nrReceivers + 1) / 2; } + + unsigned nrBits; + unsigned nrChannels; + unsigned nrReceivers; + unsigned nrReceiversPerBlock; + unsigned nrSamplesPerChannel; + unsigned nrTimesPerBlock; + unsigned innerRepeatCount, 
outerRepeatCount; + unsigned deviceNumber; + bool verifyOutput; + + static const unsigned nrPolarizations = 2; + + private: + static std::string usage(const std::string &execName); +}; + +#endif diff --git a/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc b/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc new file mode 100644 index 0000000..d28321f --- /dev/null +++ b/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc @@ -0,0 +1,450 @@ +#define NR_BITS 8 +#define NR_CHANNELS 480 +#define NR_POLARIZATIONS 2 +#define NR_SAMPLES_PER_CHANNEL 3072 +#define NR_RECEIVERS 576 +#define NR_BASELINES ((NR_RECEIVERS) * ((NR_RECEIVERS) + 1) / 2) +#define NR_RECEIVERS_PER_BLOCK 64 +#define NR_TIMES_PER_BLOCK (128 / (NR_BITS)) + + +#include "test/Common/ComplexInt4.h" + +#include <complex> +#include <exception> +//#include <fstream> +#include <iostream> +#include <sstream> +#include <string> +#include <vector> + +#include <omp.h> +#include <sys/types.h> + +#define __CL_ENABLE_EXCEPTIONS +#include <CL/cl.hpp> +#include <CL/cl_ext.h> + + +#if NR_BITS == 4 +typedef complex_int4_t Sample; +typedef std::complex<int32_t> Visibility; +#elif NR_BITS == 8 +typedef std::complex<int8_t> Sample; +typedef std::complex<int32_t> Visibility; +#elif NR_BITS == 16 +typedef std::complex<__half> Sample; +typedef std::complex<float> Visibility; +#endif + +typedef Sample Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef Visibility Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; + + +std::string errorMessage(cl_int error) +{ + switch (error) { + case CL_SUCCESS: return "Success!"; + case CL_DEVICE_NOT_FOUND: return "Device not found."; + case CL_DEVICE_NOT_AVAILABLE: return "Device not available"; + case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure"; + case CL_OUT_OF_RESOURCES: return "Out of 
resources"; + case CL_OUT_OF_HOST_MEMORY: return "Out of host memory"; + case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available"; + case CL_MEM_COPY_OVERLAP: return "Memory copy overlap"; + case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported"; + case CL_BUILD_PROGRAM_FAILURE: return "Program build failure"; + case CL_MAP_FAILURE: return "Map failure"; + case CL_INVALID_VALUE: return "Invalid value"; + case CL_INVALID_DEVICE_TYPE: return "Invalid device type"; + case CL_INVALID_PLATFORM: return "Invalid platform"; + case CL_INVALID_DEVICE: return "Invalid device"; + case CL_INVALID_CONTEXT: return "Invalid context"; + case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties"; + case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue"; + case CL_INVALID_HOST_PTR: return "Invalid host pointer"; + case CL_INVALID_MEM_OBJECT: return "Invalid memory object"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor"; + case CL_INVALID_IMAGE_SIZE: return "Invalid image size"; + case CL_INVALID_SAMPLER: return "Invalid sampler"; + case CL_INVALID_BINARY: return "Invalid binary"; + case CL_INVALID_BUILD_OPTIONS: return "Invalid build options"; + case CL_INVALID_PROGRAM: return "Invalid program"; + case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable"; + case CL_INVALID_KERNEL_NAME: return "Invalid kernel name"; + case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition"; + case CL_INVALID_KERNEL: return "Invalid kernel"; + case CL_INVALID_ARG_INDEX: return "Invalid argument index"; + case CL_INVALID_ARG_VALUE: return "Invalid argument value"; + case CL_INVALID_ARG_SIZE: return "Invalid argument size"; + case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments"; + case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension"; + case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group 
size"; + case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size"; + case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset"; + case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list"; + case CL_INVALID_EVENT: return "Invalid event"; + case CL_INVALID_OPERATION: return "Invalid operation"; + case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object"; + case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size"; + case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level"; + case CL_INVALID_GLOBAL_WORK_SIZE: return "Invalid global work size"; +#if defined CL_INVALID_PROPERTY + case CL_INVALID_PROPERTY: return "Invalid property"; +#endif +#if defined CL_INVALID_IMAGE_DESCRIPTOR + case CL_INVALID_IMAGE_DESCRIPTOR: return "Invalid image descriptor"; +#endif +#if defined CL_INVALID_COMPILER_OPTIONS + case CL_INVALID_COMPILER_OPTIONS: return "Invalid compiler options"; +#endif +#if defined CL_INVALID_LINKER_OPTIONS + case CL_INVALID_LINKER_OPTIONS: return "Invalid linker options"; +#endif +#if defined CL_INVALID_DEVICE_PARTITION_COUNT + case CL_INVALID_DEVICE_PARTITION_COUNT: return "Invalid device partition count"; +#endif + default: std::stringstream str; + str << "Unknown (" << error << ')'; + return str.str(); + } +} + + +void createContext(cl::Context &context, std::vector<cl::Device> &devices) +{ + const char *platformName = getenv("PLATFORM"); + +#if defined __linux__ + if (platformName == 0) +#endif + //platformName = "Intel(R) FPGA SDK for OpenCL(TM)"; + //platformName = Intel(R) OpenCL"; + platformName = "NVIDIA CUDA"; + //platformName = "AMD Accelerated Parallel Processing"; + + cl_device_type type = CL_DEVICE_TYPE_DEFAULT; + + const char *deviceType = getenv("TYPE"); + + if (deviceType != 0) { + if (strcmp(deviceType, "GPU") == 0) + type = CL_DEVICE_TYPE_GPU; + else if (strcmp(deviceType, "CPU") == 0) + type = CL_DEVICE_TYPE_CPU; + else if (strcmp(deviceType, "ACCELERATOR") == 0) + type = CL_DEVICE_TYPE_ACCELERATOR; + else + std::cerr << 
"Unrecognized device type: " << deviceType; + } + + const char *deviceName = getenv("DEVICE"); + + std::vector<cl::Platform> platforms; + cl::Platform::get(&platforms); + + for (cl::Platform &platform : platforms) { + std::clog << "Platform name: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl; + std::clog << "Platform version: " << platform.getInfo<CL_PLATFORM_VERSION>() << std::endl; + std::clog << "Platform extensions: " << platform.getInfo<CL_PLATFORM_EXTENSIONS>() << std::endl; + } + + for (cl::Platform &platform : platforms) { + if (strcmp(platform.getInfo<CL_PLATFORM_NAME>().c_str(), platformName) == 0) { + platform.getDevices(type, &devices); + + for (cl::Device &device : devices) { + std::clog << "Device: " << device.getInfo<CL_DEVICE_NAME>() << ", " + "capability: " << device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV>() << '.' << device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV>() +//#if defined CL_DEVICE_TOPOLOGY_AMD +// ", PCIe bus 0x" << std::hex << std::setiosflags (std::ios::uppercase) << std::setfill('0') << std::setw(2) << (device.getInfo<CL_DEVICE_TOPOLOGY_AMD>().pcie.bus & 255) << std::dec +//#endif + << std::endl; + } + + context = cl::Context(devices); + return; + } + } + + std::cerr << "Platform not found: \"" << platformName << '"' << std::endl; + exit(1); +} + + +cl::Program createProgramFromSources(cl::Context &context, std::vector<cl::Device> &devices, const std::string &sources, const char *args) +{ + std::stringstream cmd; + cmd << "#include \"" << sources << '"' << std::endl; +#if defined CL_VERSION_1_2 + cl::Program program(context, cmd.str()); +#else + std::string str = cmd.str(); + cl::Program::Sources src(1, std::make_pair(str.c_str(), str.length())); + cl::Program program(context, src); +#endif + + try { + program.build(devices, args); + std::string msg; + program.getBuildInfo(devices[0], CL_PROGRAM_BUILD_LOG, &msg); + + std::clog << msg << std::endl; + } catch (cl::Error &error) { + if (strcmp(error.what(), 
"clBuildProgram") == 0) { + std::string msg; + program.getBuildInfo(devices[0], CL_PROGRAM_BUILD_LOG, &msg); + + std::cerr << msg << std::endl; + exit(1); + } else { + throw; + } + } + +#if 0 + std::vector<size_t> binarySizes = program.getInfo<CL_PROGRAM_BINARY_SIZES>(); + std::vector<char *> binaries = program.getInfo<CL_PROGRAM_BINARIES>(); + + for (unsigned i = 0; i < binaries.size(); i++) { + std::stringstream filename; + filename << sources << '-' << i << ".ptx"; + std::ofstream(filename.str().c_str(), std::ofstream::binary).write(binaries[i], binarySizes[i]); + } + + for (unsigned b = 0; b < binaries.size(); b++) + delete [] binaries[b]; +#endif + + return program; +} + + +cl::Program createProgramFromBinaries(cl::Context &context, std::vector<cl::Device> &devices) +{ +#if 1 + cl::Program::Binaries binaries(devices.size()); + std::vector<std::string> ptx_code(devices.size()); + + for (unsigned device = 0; device < devices.size(); device ++) { + unsigned compute_capability = 10 * devices[device].getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV>() + devices[device].getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV>(); + std::stringstream command; + + command << "nvcc" + << " -ptx" + << " -arch=compute_" << compute_capability + << " -code=sm_" << compute_capability + << " -DNR_BITS=" << NR_BITS + << " -DNR_RECEIVERS=" << NR_RECEIVERS + << " -DNR_CHANNELS=" << NR_CHANNELS + << " -DNR_SAMPLES_PER_CHANNEL=" << NR_SAMPLES_PER_CHANNEL + << " -DNR_POLARIZATIONS=" << NR_POLARIZATIONS + << " -DNR_RECEIVERS_PER_BLOCK=" << NR_RECEIVERS_PER_BLOCK + << " -o -" + << " libtcc/TCCorrelator.cu" + << "|sed -e s/.param\\ .[a-zA-Z0-9]*/\\&\\ .ptr\\ .global/"; + + std::clog << "executing: " << command.str() << std::endl; + FILE *pipe = popen(command.str().c_str(), "r"); + + while (!feof(pipe)) { + char buffer[1024]; + + if (fgets(buffer, 1024, pipe) != NULL) + ptx_code[device] += buffer; + } + + binaries[device] = (std::make_pair(ptx_code[device].c_str(), ptx_code[device].length())); + 
pclose(pipe); + } + +#else + std::ifstream ifs(name, std::ios::in | std::ios::binary); + std::string str((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>()); + + cl::Program::Binaries binaries(devices.size(), std::make_pair(str.c_str(), str.length())); +#endif + + cl::Program program(context, devices, binaries); + program.build(); + + return program; +} + + +void setTestPattern(cl::CommandQueue &queue, cl::Buffer &samplesBuffer) +{ + Samples &samples = * (Samples *) queue.enqueueMapBuffer(samplesBuffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(Samples)); + +#if 0 + memset(samples, 0, sizeof(Samples)); + + unsigned channel = NR_CHANNELS / 3; + unsigned time = NR_SAMPLES_PER_CHANNEL / 5; + unsigned recv0 = NR_RECEIVERS > 174 ? 174 : NR_RECEIVERS / 3; + unsigned recv1 = NR_RECEIVERS > 418 ? 418 : NR_RECEIVERS / 2; + +#if NR_BITS == 4 + samples[channel][time / NR_TIMES_PER_BLOCK][recv0][POL_X][time % NR_TIMES_PER_BLOCK] = complex_int4_t(2, 3); + samples[channel][time / NR_TIMES_PER_BLOCK][recv1][POL_X][time % NR_TIMES_PER_BLOCK] = complex_int4_t(4, 5); +#elif NR_BITS == 8 + samples[channel][time / NR_TIMES_PER_BLOCK][recv0][POL_X][time % NR_TIMES_PER_BLOCK] = std::complex<int8_t>(2, 3); + samples[channel][time / NR_TIMES_PER_BLOCK][recv1][POL_X][time % NR_TIMES_PER_BLOCK] = std::complex<int8_t>(4, 5); +#elif NR_BITS == 16 + samples[channel][time / NR_TIMES_PER_BLOCK][recv0][POL_X][time % NR_TIMES_PER_BLOCK] = std::complex<__half>(2.0f, 3.0f); + samples[channel][time / NR_TIMES_PER_BLOCK][recv1][POL_X][time % NR_TIMES_PER_BLOCK] = std::complex<__half>(4.0f, 5.0f); +#endif +#else + Sample randomValues[7777]; + + for (unsigned i = 0; i < 7777; i ++) +#if NR_BITS == 4 + randomValues[i] = complex_int4_t(16 * drand48() - 8, 16 * drand48() - 8); +#elif NR_BITS == 8 + randomValues[i] = std::complex<int8_t>(256 * drand48() - 128, 256 * drand48() - 128); +#elif NR_BITS == 16 + randomValues[i] = std::complex<__half>(drand48() - .5, drand48() - .5); +#endif + +#pragma 
omp parallel for schedule (dynamic) + for (unsigned channel = 0; channel < NR_CHANNELS; channel ++) + for (unsigned time_major = 0, i = channel; time_major < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; time_major ++) + for (unsigned receiver = 0; receiver < NR_RECEIVERS; receiver ++) + for (unsigned polarization = 0; polarization < NR_POLARIZATIONS; polarization ++) + for (unsigned time_minor = 0; time_minor < NR_TIMES_PER_BLOCK; time_minor ++, i ++) + samples[channel][time_major][receiver][polarization][time_minor] = randomValues[i % 7777U]; +#endif + + queue.enqueueUnmapMemObject(samplesBuffer, samples); +} + + +void checkTestPattern(cl::CommandQueue &queue, cl::Buffer &visibilitiesBuffer, cl::Buffer &samplesBuffer) +{ + Visibilities &visibilities = * (Visibilities *) queue.enqueueMapBuffer(visibilitiesBuffer, CL_TRUE, CL_MAP_READ, 0, sizeof(Visibilities)); + + std::atomic<int> count(0); + +#if 0 + Samples &samples = * (Samples *) queue.enqueueMapBuffer(samplesBuffer, CL_TRUE, CL_MAP_READ, 0, sizeof(Samples)); + + std::cout << "checking ..." 
<< std::endl; + +//#pragma omp parallel for schedule (dynamic) + for (unsigned channel = 0; channel < NR_CHANNELS; channel ++) { +//#pragma omp critical (cout) + //std::cout << "channel = " << channel << std::endl; + +#if NR_BITS == 4 || NR_BITS == 8 + typedef std::complex<int32_t> Sum[NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#elif NR_BITS == 16 + typedef std::complex<float> Sum[NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#endif + + Sum &sum = *new Sum[1]; + memset(sum, 0, sizeof sum); + + for (unsigned major_time = 0; major_time < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; major_time ++) + for (unsigned recv1 = 0, baseline = 0; recv1 < NR_RECEIVERS; recv1 ++) + for (unsigned recv0 = 0; recv0 <= recv1; recv0 ++, baseline ++) + for (unsigned minor_time = 0; minor_time < NR_TIMES_PER_BLOCK; minor_time ++) + for (unsigned pol0 = 0; pol0 < NR_POLARIZATIONS; pol0 ++) + for (unsigned pol1 = 0; pol1 < NR_POLARIZATIONS; pol1 ++) +#if NR_BITS == 4 || NR_BITS == 8 + sum[baseline][pol1][pol0] += (std::complex<int32_t>) samples[channel][major_time][recv1][pol1][minor_time] * conj((std::complex<int32_t>) samples[channel][major_time][recv0][pol0][minor_time]); +#elif NR_BITS == 16 + sum[baseline][pol1][pol0] += samples[channel][major_time][recv1][pol1][minor_time] * conj(samples[channel][major_time][recv0][pol0][minor_time]); +#endif + + for (unsigned baseline = 0; baseline < NR_BASELINES; baseline ++) + for (unsigned pol0 = 0; pol0 < NR_POLARIZATIONS; pol0 ++) + for (unsigned pol1 = 0; pol1 < NR_POLARIZATIONS; pol1 ++) +#if NR_BITS == 4 || NR_BITS == 8 + if (visibilities[channel][baseline][pol1][pol0] != sum[baseline][pol1][pol0] && ++ count < 10) +#pragma omp critical (cout) + std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol1 << "][" << pol0 << "], expected " << sum[baseline][pol1][pol0] << ", got " << visibilities[channel][baseline][pol1][pol0] << std::endl; +#elif NR_BITS == 16 + float absolute = 
abs(visibilities[channel][baseline][pol1][pol0] - sum[baseline][pol1][pol0]); + float relative = abs(visibilities[channel][baseline][pol1][pol0] / sum[baseline][pol1][pol0]); + + if ((relative < .999 || relative > 1.001) && absolute > .0001 * NR_SAMPLES_PER_CHANNEL && ++ count < 10) + std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol1 << "][" << pol0 << "], expected " << sum[baseline][pol1][pol0] << ", got " << visibilities[channel][baseline][pol1][pol0] << ", relative = " << relative << ", absolute = " << absolute << std::endl; +#endif + delete &sum; + } + + std::cout << "#errors = " << count << std::endl; + + queue.enqueueUnmapMemObject(samplesBuffer, samples); +#else + for (unsigned channel = 0; channel < NR_CHANNELS; channel ++) + for (unsigned baseline = 0; baseline < NR_BASELINES; baseline ++) + for (unsigned pol0 = 0; pol0 < NR_POLARIZATIONS; pol0 ++) + for (unsigned pol1 = 0; pol1 < NR_POLARIZATIONS; pol1 ++) +#if NR_BITS == 4 || NR_BITS == 8 + if (visibilities[channel][baseline][pol0][pol1] != std::complex<int32_t>(0, 0)) +#elif NR_BITS == 16 + if (visibilities[channel][baseline][pol0][pol1] != std::complex<float>(0.0f, 0.0f)) +#endif + if (count ++ < 10) +#pragma omp critical (cout) + std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol0 << "][" << pol1 << "] = " << visibilities[channel][baseline][pol0][pol1] << std::endl; +#endif + + queue.enqueueUnmapMemObject(visibilitiesBuffer, visibilities); +} + + +int main() +{ + try { + cl::Context context; + std::vector<cl::Device> devices; + createContext(context, devices); + + cl::CommandQueue queue(context, devices[0], CL_QUEUE_PROFILING_ENABLE); + + cl::Buffer samples(context, CL_MEM_READ_ONLY /*| CL_MEM_HOST_WRITE_ONLY*/, sizeof(Samples)); + cl::Buffer visibilities(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, sizeof(Visibilities)); + + setTestPattern(queue, samples); + + cl::Program program(createProgramFromBinaries(context, devices)); + 
cl::Kernel correlatorKernel(program, "correlate"); + correlatorKernel.setArg(0, visibilities); + correlatorKernel.setArg(1, samples); + + cl::Event event; + +#if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 + unsigned nrBlocks = ((NR_RECEIVERS + NR_RECEIVERS_PER_BLOCK - 1) / NR_RECEIVERS_PER_BLOCK) * ((NR_RECEIVERS + NR_RECEIVERS_PER_BLOCK - 1) / NR_RECEIVERS_PER_BLOCK + 1) / 2; +#elif NR_RECEIVERS_PER_BLOCK == 64 + unsigned nrBlocks = ((NR_RECEIVERS + NR_RECEIVERS_PER_BLOCK - 1) / NR_RECEIVERS_PER_BLOCK) * ((NR_RECEIVERS + NR_RECEIVERS_PER_BLOCK - 1) / NR_RECEIVERS_PER_BLOCK); +#endif + + cl::NDRange globalWorkSize(32 * nrBlocks, 2 * NR_CHANNELS, 2 * 1); + cl::NDRange localWorkSize(32, 2, 2); + +for (int i = 0; i < 100; i ++) + queue.enqueueNDRangeKernel(correlatorKernel, cl::NullRange, globalWorkSize, localWorkSize, 0, &event); + queue.finish(); + + double runtime = (event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_START>()) * 1e-9; + unsigned long long nrOperations = 8LLU * (NR_RECEIVERS * NR_RECEIVERS + 1) / 2 * NR_POLARIZATIONS * NR_POLARIZATIONS * NR_CHANNELS * NR_SAMPLES_PER_CHANNEL; + double TFLOPS = nrOperations / runtime * 1e-12; + std::clog << "runtime = " << runtime << "s, TFLOPS = " << TFLOPS << std::endl; + + checkTestPattern(queue, visibilities, samples); + } catch (cl::Error &error) { + std::cerr << "caught cl::Error: " << error.what() << ": " << errorMessage(error.err()) << std::endl; + } catch (std::exception &error) { + std::cerr << "caught std::exception: " << error.what() << std::endl; + } + + return 0; +} diff --git a/test/SimpleExample/SimpleExample.cu b/test/SimpleExample/SimpleExample.cu new file mode 100644 index 0000000..4fa52dd --- /dev/null +++ b/test/SimpleExample/SimpleExample.cu @@ -0,0 +1,77 @@ +#define NR_BITS 8 +#define NR_CHANNELS 480 +#define NR_POLARIZATIONS 2 +#define NR_SAMPLES_PER_CHANNEL 3072 +#define NR_RECEIVERS 576 +#define NR_BASELINES ((NR_RECEIVERS) * 
((NR_RECEIVERS) + 1) / 2) +#define NR_RECEIVERS_PER_BLOCK 64 +#define NR_TIMES_PER_BLOCK (128 / (NR_BITS)) + + +#include "test/Common/ComplexInt4.h" +#include "libtcc/Correlator.h" + +#include <complex> +#include <iostream> + +#include <cuda.h> +#if NR_BITS == 16 +#include <cuda_fp16.h> +#endif + + +inline void checkCudaCall(cudaError_t error) +{ + if (error != cudaSuccess) { + std::cerr << "error " << error << std::endl; + exit(1); + } +} + + +#if NR_BITS == 4 +typedef complex_int4_t Sample; +typedef std::complex<int32_t> Visibility; +#elif NR_BITS == 8 +typedef std::complex<int8_t> Sample; +typedef std::complex<int32_t> Visibility; +#elif NR_BITS == 16 +typedef std::complex<__half> Sample; +typedef std::complex<float> Visibility; +#endif + +typedef Sample Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef Visibility Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; + + +int main() +{ + try { + checkCudaCall(cudaSetDevice(0)); // combine the CUDA runtime API and CUDA driver API + checkCudaCall(cudaFree(0)); + + tcc::Correlator correlator(NR_BITS, NR_RECEIVERS, NR_CHANNELS, NR_SAMPLES_PER_CHANNEL, NR_POLARIZATIONS, NR_RECEIVERS_PER_BLOCK); + + cudaStream_t stream; + Samples *samples; + Visibilities *visibilities; + + checkCudaCall(cudaStreamCreate(&stream)); + checkCudaCall(cudaMallocManaged(&samples, sizeof(Samples))); + checkCudaCall(cudaMallocManaged(&visibilities, sizeof(Visibilities))); + + (*samples)[NR_CHANNELS / 3][NR_SAMPLES_PER_CHANNEL / 5 / NR_TIMES_PER_BLOCK][174][0][NR_SAMPLES_PER_CHANNEL / 5 % NR_TIMES_PER_BLOCK] = Sample(2, 3); + (*samples)[NR_CHANNELS / 3][NR_SAMPLES_PER_CHANNEL / 5 / NR_TIMES_PER_BLOCK][418][0][NR_SAMPLES_PER_CHANNEL / 5 % NR_TIMES_PER_BLOCK] = Sample(4, 5); + + correlator.launchAsync((CUstream) stream, (CUdeviceptr) visibilities, (CUdeviceptr) samples); + checkCudaCall(cudaDeviceSynchronize()); + + std::cout << 
((*visibilities)[160][87745][0][0] == Visibility(23, -2) ? "success" : "failed") << std::endl; + + checkCudaCall(cudaFree(visibilities)); + checkCudaCall(cudaFree(samples)); + checkCudaCall(cudaStreamDestroy(stream)); + } catch (std::exception &error) { + std::cerr << error.what() << std::endl; + } +} diff --git a/util/ExceptionPropagator.h b/util/ExceptionPropagator.h new file mode 100644 index 0000000..942a28e --- /dev/null +++ b/util/ExceptionPropagator.h @@ -0,0 +1,59 @@ +#if !defined EXCEPTION_PROPAGATOR_H +#define EXCEPTION_PROPAGATOR_H + +// Experimental class to propagate exceptions around OpenMP pragmas (and other +// constructs) that cannot cope with exceptions. Exceptions thrown inside a lambda +// function that is given to the () operator are propagated and rethrown when the +// progagation object is destroyed. Example use case: +// +// ExceptionPropagator ep; +// +// #pragma omp parallel for +// for (int i = 0; i < 10000; i ++) +// if (!ep) // finish loop ASAP if exception is pending +// ep([&] () { +// throw 42; // continue with code that might throw exceptions +// }); +// +// // exception is rethrown at scope exit of ep + +#include <atomic> +#include <exception> + + +class ExceptionPropagator +{ + public: + ExceptionPropagator() + : + propagateException(ATOMIC_FLAG_INIT) + { + } + + ~ExceptionPropagator() noexcept(false) + { + if (exception != nullptr && !std::uncaught_exception()) + std::rethrow_exception(exception); + } + + template <typename T> void operator () (const T &func) + { + try { + func(); + } catch (...) 
{ + if (!atomic_flag_test_and_set(&propagateException)) + exception = std::current_exception(); + } + } + + operator bool () const // returns true iff exception pending + { + return exception != nullptr; + } + + private: + std::atomic_flag propagateException; + std::exception_ptr exception; +}; + +#endif diff --git a/util/cu.cc b/util/cu.cc new file mode 100644 index 0000000..82113be --- /dev/null +++ b/util/cu.cc @@ -0,0 +1,37 @@ +#include "cu.h" + +#include <iostream> +#include <sstream> + + +namespace cu { + +const char *Error::what() const noexcept +{ + const char *str; + return cuGetErrorString(_result, &str) != CUDA_ERROR_INVALID_VALUE ? str : "unknown error"; +} + + +Context Device::primaryCtxRetain() +{ + CUcontext context; + checkCudaCall(cuDevicePrimaryCtxRetain(&context, _obj)); + return Context(context, *this); +} + + +void Source::compile(const char *output_file_name, const char *compiler_options) +{ + std::stringstream command_line; + command_line << "nvcc -cubin " << compiler_options << " -o " << output_file_name << ' ' << input_file_name; +//#pragma omp critical (clog) + //std::clog << command_line.str() << std::endl; + + int retval = system(command_line.str().c_str()); + + if (WEXITSTATUS(retval) != 0) + throw Error(CUDA_ERROR_INVALID_SOURCE); +} + +} diff --git a/util/cu.h b/util/cu.h new file mode 100644 index 0000000..46dfdea --- /dev/null +++ b/util/cu.h @@ -0,0 +1,786 @@ +#if !defined CU_WRAPPER_H +#define CU_WRAPPER_H + +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <exception> +#include <fstream> +#include <string> +#include <vector> + + +namespace cu { + class Error : public std::exception { + public: + Error(CUresult result) + : + _result(result) + { + } + + virtual const char *what() const noexcept; + + operator CUresult () const + { + return _result; + } + + private: + CUresult _result; + }; + + + inline void checkCudaCall(CUresult result) + { + if (result != CUDA_SUCCESS) + throw Error(result); + } + + + inline void 
init(unsigned flags = 0) + { + checkCudaCall(cuInit(flags)); + } + + + inline int driverGetVersion() + { + int version; + checkCudaCall(cuDriverGetVersion(&version)); + return version; + } + + inline void memcpyHtoD(CUdeviceptr dst, const void *src, size_t size) + { + checkCudaCall(cuMemcpyHtoD(dst, src, size)); + } + + class Context; + class Stream; + + template <typename T> class Wrapper + { + public: + // Wrapper<T>(Wrapper<T> &) = delete; // disallow copies + + // conversion to C-style T + + operator const T & () const + { + return _obj; + } + + operator T & () + { + return _obj; + } + +#if 0 // makes no sense if object is not copyable + bool operator == (const Wrapper<T> &other) + { + return _obj == other._obj; + } + + bool operator != (const Wrapper<T> &other) + { + return _obj != other._obj; + } +#endif + + protected: + Wrapper<T>() + : + hasOwnership(true) + { + } + + Wrapper<T>(T &obj) + : + _obj(obj), + hasOwnership(false) + { + } + + bool hasOwnership; + T _obj; + }; + + class Device : public Wrapper<CUdevice> + { + public: + // Device Management + + Device(int ordinal) + { + checkCudaCall(cuDeviceGet(&_obj, ordinal)); + } + + int getAttribute(CUdevice_attribute attribute) const + { + int value; + checkCudaCall(cuDeviceGetAttribute(&value, attribute, _obj)); + return value; + } + + template <CUdevice_attribute attribute> int getAttribute() const + { + return getAttribute(attribute); + } + + static int getCount() + { + int nrDevices; + checkCudaCall(cuDeviceGetCount(&nrDevices)); + return nrDevices; + } + + std::string getName() const + { + char name[64]; + checkCudaCall(cuDeviceGetName(name, sizeof name, _obj)); + return std::string(name); + } + + size_t totalMem() const + { + size_t size; + checkCudaCall(cuDeviceTotalMem(&size, _obj)); + return size; + } + + + // Primary Context Management + + std::pair<unsigned, bool> primaryCtxGetState() const + { + unsigned flags; + int active; + checkCudaCall(cuDevicePrimaryCtxGetState(_obj, &flags, &active)); + 
return std::pair<unsigned, bool>(flags, active); + } + + // void primaryCtxRelease() not available; it is released on destruction of the Context returned by Device::primaryContextRetain() + + void primaryCtxReset() + { + checkCudaCall(cuDevicePrimaryCtxReset(_obj)); + } + + Context primaryCtxRetain(); // retain this context until the primary context can be released + + void primaryCtxSetFlags(unsigned flags) + { + checkCudaCall(cuDevicePrimaryCtxSetFlags(_obj, flags)); + } + }; + + + class Context : public Wrapper<CUcontext> + { + public: + // Context Management + + Context(int flags, Device &device) + : + _primaryContext(false) + { + checkCudaCall(cuCtxCreate(&_obj, flags, device)); + } + + Context(CUcontext context) + : + Wrapper<CUcontext>(context), + _primaryContext(false) + { + } + + ~Context() + { + if (hasOwnership) + checkCudaCall(cuCtxDestroy(_obj)); + //else + //checkCudaCall(cuDevicePrimaryCtxRelease(getDevice())); // FIXME + } + + unsigned getApiVersion() const + { + unsigned version; + checkCudaCall(cuCtxGetApiVersion(_obj, &version)); + return version; + } + + static CUfunc_cache getCacheConfig() + { + CUfunc_cache config; + checkCudaCall(cuCtxGetCacheConfig(&config)); + return config; + } + + static void setCacheConfig(CUfunc_cache config) + { + checkCudaCall(cuCtxSetCacheConfig(config)); + } + + static Context getCurrent() + { + CUcontext context; + checkCudaCall(cuCtxGetCurrent(&context)); + return Context(context); + } + + void setCurrent() const + { + checkCudaCall(cuCtxSetCurrent(_obj)); + } + + void pushCurrent() + { + checkCudaCall(cuCtxPushCurrent(_obj)); + } + + static Context popCurrent() + { + CUcontext context; + checkCudaCall(cuCtxPopCurrent(&context)); + return Context(context); + } + + void setSharedMemConfig(CUsharedconfig config) + { + checkCudaCall(cuCtxSetSharedMemConfig(config)); + } + + static Device getDevice() + { + CUdevice device; + checkCudaCall(cuCtxGetDevice(&device)); + return Device(device); // FIXME: ~Device() + } + + 
static size_t getLimit(CUlimit limit) + { + size_t value; + checkCudaCall(cuCtxGetLimit(&value, limit)); + return value; + } + + template <CUlimit limit> static size_t getLimit() + { + return getLimit(limit); + } + + static void setLimit(CUlimit limit, size_t value) + { + checkCudaCall(cuCtxSetLimit(limit, value)); + } + + template <CUlimit limit> static void setLimit(size_t value) + { + setLimit(limit, value); + } + + static void synchronize() + { + checkCudaCall(cuCtxSynchronize()); + } + + private: + friend class Device; + Context(CUcontext context, Device &device) + : + Wrapper<CUcontext>(context), + _primaryContext(true) + { + } + + bool _primaryContext; + }; + + + class HostMemory : public Wrapper<void *> + { + public: + HostMemory(size_t size, int flags = 0) + { + checkCudaCall(cuMemHostAlloc(&_obj, size, flags)); + } + + ~HostMemory() + { + checkCudaCall(cuMemFreeHost(_obj)); + } + + template <typename T> operator T * () + { + return static_cast<T *>(_obj); + } + }; + + + class DeviceMemory : public Wrapper<CUdeviceptr> + { + public: + DeviceMemory(size_t size) + { + checkCudaCall(cuMemAlloc(&_obj, size)); + } + + DeviceMemory(CUdeviceptr ptr) + : + Wrapper(ptr) + { + } + + DeviceMemory(HostMemory &hostMemory) + { + hasOwnership = false; + checkCudaCall(cuMemHostGetDevicePointer(&_obj, hostMemory, 0)); + } + + ~DeviceMemory() + { + if (hasOwnership) + checkCudaCall(cuMemFree(_obj)); + } + + const void *parameter() const // used to construct parameter list for launchKernel(); + { + return &_obj; + } + + template <typename T> operator T * () const + { + return (T *) _obj; + } + + template <typename T> operator const T * () const + { + return (const T *) _obj; + } + }; + + + class Array : public Wrapper<CUarray> + { + public: + Array(unsigned width, CUarray_format format, unsigned numChannels) + { + Array(width, 0, format, numChannels); + } + + Array(unsigned width, unsigned height, CUarray_format format, unsigned numChannels) + { + CUDA_ARRAY_DESCRIPTOR 
descriptor; + descriptor.Width = width; + descriptor.Height = height; + descriptor.Format = format; + descriptor.NumChannels = numChannels; + checkCudaCall(cuArrayCreate(&_obj, &descriptor)); + } + + Array(unsigned width, unsigned height, unsigned depth, CUarray_format format, unsigned numChannels) + { + CUDA_ARRAY3D_DESCRIPTOR descriptor; + descriptor.Width = width; + descriptor.Height = height; + descriptor.Depth = depth; + descriptor.Format = format; + descriptor.NumChannels = numChannels; + descriptor.Flags = 0; + checkCudaCall(cuArray3DCreate(&_obj, &descriptor)); + } + + Array(CUarray &array) + : + Wrapper(array) + { + } + + ~Array() + { + if (hasOwnership) + checkCudaCall(cuArrayDestroy(_obj)); + } + }; + + +#if 0 + class TexObject : public Wrapper<CUtexObject> { + public: + TexObject(const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc) + { + checkCudaCall(cuTexObjectCreate(&_obj, pResDesc, pTexDesc, pResViewDesc)); + } + + TexObject(CUtexObject &obj) + : + Wrapper<CUtexObject>(obj) + { + } + + ~TexObject() + { + if (hasOwnership) + checkCudaCall(cuTexObjectDestroy(_obj)); + } + + + }; +#endif + + + class TexRef : public Wrapper<CUtexref> { + public: + TexRef(CUtexref texref) + : + Wrapper<CUtexref>(texref) + { + } + +#if 0 // deprecated + void setAddress(size_t &byte_offset, DeviceMemory &memory, size_t size) + { + checkCudaCall(cuTexRefSetAddress(&byte_offset, _obj, memory, size)); + } + + void setArray(Array &array, unsigned flags = CU_TRSA_OVERRIDE_FORMAT) + { + checkCudaCall(cuTexRefSetArray(_obj, array, flags)); + } + + void setAddressMode(int dim, CUaddress_mode am) + { + checkCudaCall(cuTexRefSetAddressMode(_obj, dim, am)); + } + + void setFilterMode(CUfilter_mode fm) + { + checkCudaCall(cuTexRefSetFilterMode(_obj, fm)); + } + + void setFlags(int flags) + { + checkCudaCall(cuTexRefSetFlags(_obj, flags)); + } + + void setFormat(CUarray_format fmt, int numPackedComponents) + { + 
checkCudaCall(cuTexRefSetFormat(_obj, fmt, numPackedComponents)); + } +#endif + }; + + + class Source + { + public: + Source(const char *input_file_name) + : + input_file_name(input_file_name) + { + } + + void compile(const char *ptx_name, const char *compile_options = 0); + + private: + const char *input_file_name; + }; + + + class Module : public Wrapper<CUmodule> + { + public: + Module(const char *file_name) + { +#if defined TEGRA_QUIRKS // cuModuleLoad broken on Jetson TX1 + std::ifstream file(file_name); + std::string program((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>()); + checkCudaCall(cuModuleLoadData(&_obj, program.c_str())); +#else + checkCudaCall(cuModuleLoad(&_obj, file_name)); +#endif + } + + Module(const void *data) + { + checkCudaCall(cuModuleLoadData(&_obj, data)); + } + + Module(CUmodule &module) + : + Wrapper(module) + { + } + + ~Module() + { + if (hasOwnership) + checkCudaCall(cuModuleUnload(_obj)); + } + + TexRef getTexRef(const char *name) const + { + CUtexref texref; + checkCudaCall(cuModuleGetTexRef(&texref, _obj, name)); + return TexRef(texref); + } + + CUdeviceptr getGlobal(const char *name) const + { + CUdeviceptr deviceptr; + checkCudaCall(cuModuleGetGlobal(&deviceptr, nullptr, _obj, name)); + return deviceptr; + } + }; + + + class Function : public Wrapper<CUfunction> + { + public: + Function(Module &module, const char *name) + { + checkCudaCall(cuModuleGetFunction(&_obj, module, name)); + } + + Function(CUfunction &function) + : + Wrapper(function) + { + } + + int getAttribute(CUfunction_attribute attribute) + { + int value; + checkCudaCall(cuFuncGetAttribute(&value, attribute, _obj)); + return value; + } + + void setCacheConfig(CUfunc_cache config) + { + checkCudaCall(cuFuncSetCacheConfig(_obj, config)); + } + +#if 0 + void paramSetTexRef(TexRef &texref) + { + checkCudaCall(cuParamSetTexRef(_obj, CU_PARAM_TR_DEFAULT, texref)); + } +#endif + }; + + + class Event : public Wrapper<CUevent> + { + public: + 
Event(int flags = CU_EVENT_DEFAULT) + { + checkCudaCall(cuEventCreate(&_obj, flags)); + } + + Event(CUevent &event) + : + Wrapper(event) + { + } + + ~Event() + { + if (hasOwnership) + checkCudaCall(cuEventDestroy(_obj)); + } + + float elapsedTime(Event &second) const + { + float ms; + checkCudaCall(cuEventElapsedTime(&ms, second, _obj)); + return ms; + } + + void query() const + { + checkCudaCall(cuEventQuery(_obj)); // unsuccessful result throws cu::Error + } + + void record() + { + checkCudaCall(cuEventRecord(_obj, 0)); + } + + void record(Stream &); + + void synchronize() + { + checkCudaCall(cuEventSynchronize(_obj)); + } + }; + + + class Stream : public Wrapper<CUstream> + { + friend class Event; + + public: + Stream(int flags = CU_STREAM_DEFAULT) + { + checkCudaCall(cuStreamCreate(&_obj, flags)); + } + + Stream(CUstream stream) + : + Wrapper<CUstream>(stream) + { + } + + ~Stream() + { + if (hasOwnership) + checkCudaCall(cuStreamDestroy(_obj)); + } + + void memcpyHtoDAsync(CUdeviceptr devPtr, const void *hostPtr, size_t size) + { + checkCudaCall(cuMemcpyHtoDAsync(devPtr, hostPtr, size, _obj)); + } + + void memcpyDtoHAsync(void *hostPtr, CUdeviceptr devPtr, size_t size) + { + checkCudaCall(cuMemcpyDtoHAsync(hostPtr, devPtr, size, _obj)); + } + + void launchKernel(Function &function, unsigned gridX, unsigned gridY, unsigned gridZ, unsigned blockX, unsigned blockY, unsigned blockZ, unsigned sharedMemBytes, const std::vector<const void *> &parameters) + { + checkCudaCall(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, sharedMemBytes, _obj, const_cast<void **>(&parameters[0]), 0)); + } + +#if CUDART_VERSION >= 9000 + void launchCooperativeKernel(Function &function, unsigned gridX, unsigned gridY, unsigned gridZ, unsigned blockX, unsigned blockY, unsigned blockZ, unsigned sharedMemBytes, const std::vector<const void *> &parameters) + { + checkCudaCall(cuLaunchCooperativeKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, sharedMemBytes, _obj, 
const_cast<void **>(¶meters[0]))); + } +#endif + + void query() + { + checkCudaCall(cuStreamQuery(_obj)); // unsuccessful result throws cu::Error + } + + void synchronize() + { + checkCudaCall(cuStreamSynchronize(_obj)); + } + + void waitEvent(Event &event) + { + checkCudaCall(cuStreamWaitEvent(_obj, event, 0)); + } + + void addCallback(CUstreamCallback callback, void *userData, int flags = 0) + { + checkCudaCall(cuStreamAddCallback(_obj, callback, userData, flags)); + } + + void record(Event &event) + { + checkCudaCall(cuEventRecord(event, _obj)); + } + + void batchMemOp(unsigned count, CUstreamBatchMemOpParams *paramArray, unsigned flags) + { + checkCudaCall(cuStreamBatchMemOp(_obj, count, paramArray, flags)); + } + + void waitValue32(CUdeviceptr addr, cuuint32_t value, unsigned flags) const + { + checkCudaCall(cuStreamWaitValue32(_obj, addr, value, flags)); + } + + void writeValue32(CUdeviceptr addr, cuuint32_t value, unsigned flags) + { + checkCudaCall(cuStreamWriteValue32(_obj, addr, value, flags)); + } + }; + +#if 1 + class Graph : public Wrapper<CUgraph> + { + public: + class GraphNode : public Wrapper<CUgraphNode> + { + }; + + class ExecKernelNode : public GraphNode + { + }; + + class KernelNodeParams : public Wrapper<CUDA_KERNEL_NODE_PARAMS> + { + public: + KernelNodeParams(const Function &function, + unsigned gridDimX, unsigned gridDimY, unsigned gridDimZ, + unsigned blockDimX, unsigned blockDimY, unsigned blockDimZ, + unsigned sharedMemBytes, + const std::vector<const void *> &kernelParams) + { + _obj.func = function; + _obj.blockDimX = blockDimX; + _obj.blockDimY = blockDimY; + _obj.blockDimZ = blockDimZ; + _obj.gridDimX = gridDimX; + _obj.gridDimY = gridDimY; + _obj.gridDimZ = gridDimZ; + _obj.sharedMemBytes = sharedMemBytes; + _obj.kernelParams = const_cast<void **>(kernelParams.data()); + _obj.extra = nullptr; + } + }; + + class Exec : public Wrapper<CUgraphExec> + { + public: + void launch(Stream &stream) + { + checkCudaCall(cuGraphLaunch(_obj, 
stream)); + } + }; + + Graph(unsigned flags = 0) + { + checkCudaCall(cuGraphCreate(&_obj, flags)); + } + + Graph(CUgraph &graph) + : + Wrapper(graph) + { + } + + ~Graph() + { + if (hasOwnership) + checkCudaCall(cuGraphDestroy(_obj)); + } + + ExecKernelNode addKernelNode(/* std::vector<GraphNode> dependencies, */ const KernelNodeParams &kernelArgs) + { + ExecKernelNode node; + checkCudaCall(cuGraphAddKernelNode(& (CUgraphNode &) node, _obj, nullptr, 0, & (const CUDA_KERNEL_NODE_PARAMS &) kernelArgs)); + return node; + } + + Exec instantiate() + { + Exec exec; + checkCudaCall(cuGraphInstantiate(& (CUgraphExec &) exec, _obj, nullptr, nullptr, 0)); + return exec; + } + }; +#endif + + + inline void Event::record(Stream &stream) + { + checkCudaCall(cuEventRecord(_obj, stream._obj)); + } +} + +#endif diff --git a/util/multi_array.h b/util/multi_array.h new file mode 100644 index 0000000..1db0ea5 --- /dev/null +++ b/util/multi_array.h @@ -0,0 +1,63 @@ +#if !defined MULTI_ARRAY_H +#define MULTI_ARRAY_H + +// Experimental class that provides multi-dimensional arrays, similar to +// boost::multi_array[_ref], but much simpler. 
// Make the header self-contained: it uses std::size_t, std::vector and
// std::enable_if, but previously relied on its includer for the headers.
#include <cstddef>
#include <type_traits>
#include <vector>

namespace multi_array {
  // extent<dim> describes a dim-dimensional index space.  `size` is the
  // TOTAL number of elements of all dimensions accumulated so far (row-major
  // product); `next_extent` describes the inner dim-1 dimensions.  Extents
  // are built up one dimension at a time: extents[3][4] is an extent<2>
  // with size 12 whose next_extent has size 4.
  template <std::size_t dim> class extent {
    public:
      extent(std::size_t size, const extent<dim - 1> &rhs) : size(rhs.size * size), next_extent(extent<dim - 1>(size, rhs.next_extent)) {}
      extent<dim + 1> operator [] (std::size_t size) const { return extent<dim + 1>(size, *this); }

      const std::size_t size;              // total element count so far
      const extent<dim - 1> next_extent;   // extents of the inner dimensions
  };


  // Recursion terminator: a zero-dimensional extent of size 1.
  template <> class extent<0> {
    public:
      extent() {}
      template <typename T> extent(std::size_t, T) {}
      extent<1> operator [] (std::size_t size) const { return extent<1>(size, *this); }

      static const std::size_t size = 1;
      // The original declared "static const struct {} next_extent;" without
      // ever defining it; since it is passed BY VALUE to the constructor
      // above, it is odr-used and the missing definition is an ODR
      // violation (link error).  A named empty tag with an in-class
      // constexpr definition (implicitly inline since C++17) fixes that
      // without changing any observable behavior.
      struct terminator {};
      static constexpr terminator next_extent {};
  };


  // Starting point for building extents, e.g. extents[3][4].
  // (namespace-scope const => internal linkage, so safe in a header.)
  const static extent<0> extents;


  // Non-owning view of a dim-dimensional row-major array starting at `ref`.
  // operator[] peels off one dimension; for dim == 1 it yields T&.
  template<typename T, std::size_t dim> class array_ref {
    public:
      array_ref(T &ref, const extent<dim> &extents) : ref(ref), extents(extents) {}
      template <std::size_t d = dim> typename std::enable_if<d == 1, T &>::type operator [] (std::size_t index) const { return (&ref)[index]; }
      template <std::size_t d = dim> typename std::enable_if<d != 1, array_ref<T, dim - 1>>::type operator [] (std::size_t index) const { return array_ref<T, dim - 1>((&ref)[index * extents.next_extent.size], extents.next_extent); }
      T *begin() const { return &ref; }
      T *end() const { return &ref + extents.size; }
      std::size_t bytesize() const { return sizeof(T) * extents.size; }

      const extent<dim> extents;

    private:
      T &ref;
  };


  namespace detail {
    // Holds the storage; a separate base class so it is constructed before
    // the array_ref base of `array` below.
    template <typename T> class array_base {
      protected:
        array_base(std::size_t size) : vec(size) {}
        std::vector<T> vec;
    };
  }


  // Owning dim-dimensional array: storage (array_base) + view (array_ref).
  // NOTE(review): a zero-sized extent would make vec.data() null and the
  // dereference below UB — assumed not to occur; confirm with callers.
  template<typename T, std::size_t dim> class array : private detail::array_base<T>, public array_ref<T, dim> {
    public:
      array(const extent<dim> &extents) : detail::array_base<T>(extents.size), array_ref<T, dim>(*detail::array_base<T>::vec.data(), extents) {}
  };
}

#endif

// diff --git a/util/nvrtc.cc b/util/nvrtc.cc
// new file mode 100644 index 0000000..b6ac885 --- /dev/null +++ b/util/nvrtc.cc @@
-0,0 +1,11 @@ +#include "nvrtc.h" + + +namespace nvrtc { + +const char *Error::what() const noexcept +{ + return nvrtcGetErrorString(_result); +} + +} diff --git a/util/nvrtc.h b/util/nvrtc.h new file mode 100644 index 0000000..f429a73 --- /dev/null +++ b/util/nvrtc.h @@ -0,0 +1,109 @@ +#if !defined NVRTC_H +#define NVRTC_H + +#include <cuda.h> +#include <nvrtc.h> + +#include <algorithm> +#include <exception> +#include <fstream> +#include <iterator> +#include <string> +#include <vector> + + +namespace nvrtc { + class Error : public std::exception { + public: + Error(nvrtcResult result) + : + _result(result) + { + } + + virtual const char *what() const noexcept; + + operator nvrtcResult () const + { + return _result; + } + + private: + nvrtcResult _result; + }; + + + inline void checkNvrtcCall(nvrtcResult result) + { + if (result != NVRTC_SUCCESS) + throw Error(result); + } + + + class Program { + public: + Program(const std::string &src, const std::string &name, int numHeaders = 0, const char *headers[] = nullptr, const char *includeNames[] = nullptr) // TODO: use std::vector<std::string> + { + checkNvrtcCall(nvrtcCreateProgram(&program, src.c_str(), name.c_str(), numHeaders, headers, includeNames)); + } + + Program(const std::string &filename) + { + std::ifstream ifs(filename); + std::string source(std::istreambuf_iterator<char>{ifs}, {}); + checkNvrtcCall(nvrtcCreateProgram(&program, source.c_str(), filename.c_str(), 0, nullptr, nullptr)); + } + + ~Program() + { + checkNvrtcCall(nvrtcDestroyProgram(&program)); + } + + void compile(const std::vector<std::string> &options) + { + std::vector<const char *> c_options; + std::transform(options.begin(), options.end(), std::back_inserter(c_options), [] (const std::string &option) { return option.c_str();}); + checkNvrtcCall(nvrtcCompileProgram(program, c_options.size(), c_options.data())); + } + + std::string getPTX() + { + size_t size; + std::string ptx; + + checkNvrtcCall(nvrtcGetPTXSize(program, &size)); + 
ptx.resize(size); + checkNvrtcCall(nvrtcGetPTX(program, &ptx[0])); + return ptx; + } + +#if CUDA_VERSION >= 11020 + std::vector<char> getCUBIN() + { + size_t size; + std::vector<char> cubin; + + checkNvrtcCall(nvrtcGetCUBINSize(program, &size)); + cubin.resize(size); + checkNvrtcCall(nvrtcGetCUBIN(program, &cubin[0])); + return cubin; + } +#endif + + std::string getLog() + { + size_t size; + std::string log; + + checkNvrtcCall(nvrtcGetProgramLogSize(program, &size)); + log.resize(size); + checkNvrtcCall(nvrtcGetProgramLog(program, &log[0])); + return log; + } + + private: + nvrtcProgram program; + }; +} + +#endif -- GitLab