From 7a9dd8b2bd10c9f5b3386c38f6377a1c1094ae21 Mon Sep 17 00:00:00 2001 From: John Romein <romein@astron.nl> Date: Mon, 30 Aug 2021 17:31:29 +0200 Subject: [PATCH] Initial commit. --- Makefile | 102 +++ libtcc/Correlator.cc | 117 +++ libtcc/Correlator.h | 40 + libtcc/CorrelatorKernel.cc | 43 + libtcc/CorrelatorKernel.h | 35 + libtcc/Kernel.cc | 13 + libtcc/Kernel.h | 23 + libtcc/TCCorrelator.cu | 596 +++++++++++++ libtcc/TCCorrelator.cu.o | Bin 0 -> 26544 bytes libtcc/TCCorrelator.cu.orig | 587 +++++++++++++ test/Common/ComplexInt4.h | 21 + test/Common/Config.h | 22 + test/Common/Record.cc | 35 + test/Common/Record.h | 33 + test/Common/UnitTest.cc | 60 ++ test/Common/UnitTest.h | 30 + test/CorrelatorTest/CorrelatorTest.cc | 239 ++++++ test/CorrelatorTest/CorrelatorTest.h | 61 ++ test/CorrelatorTest/Options.cc | 89 ++ test/CorrelatorTest/Options.h | 45 + .../OpenCLCorrelatorTest.cc | 450 ++++++++++ test/SimpleExample/SimpleExample.cu | 77 ++ util/ExceptionPropagator.h | 59 ++ util/cu.cc | 37 + util/cu.h | 786 ++++++++++++++++++ util/multi_array.h | 63 ++ util/nvrtc.cc | 11 + util/nvrtc.h | 109 +++ 28 files changed, 3783 insertions(+) create mode 100644 Makefile create mode 100644 libtcc/Correlator.cc create mode 100644 libtcc/Correlator.h create mode 100644 libtcc/CorrelatorKernel.cc create mode 100644 libtcc/CorrelatorKernel.h create mode 100644 libtcc/Kernel.cc create mode 100644 libtcc/Kernel.h create mode 100644 libtcc/TCCorrelator.cu create mode 100644 libtcc/TCCorrelator.cu.o create mode 100644 libtcc/TCCorrelator.cu.orig create mode 100644 test/Common/ComplexInt4.h create mode 100644 test/Common/Config.h create mode 100644 test/Common/Record.cc create mode 100644 test/Common/Record.h create mode 100644 test/Common/UnitTest.cc create mode 100644 test/Common/UnitTest.h create mode 100644 test/CorrelatorTest/CorrelatorTest.cc create mode 100644 test/CorrelatorTest/CorrelatorTest.h create mode 100644 test/CorrelatorTest/Options.cc create mode 100644 
test/CorrelatorTest/Options.h create mode 100644 test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc create mode 100644 test/SimpleExample/SimpleExample.cu create mode 100644 util/ExceptionPropagator.h create mode 100644 util/cu.cc create mode 100644 util/cu.h create mode 100644 util/multi_array.h create mode 100644 util/nvrtc.cc create mode 100644 util/nvrtc.h diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..0ce2042 --- /dev/null +++ b/Makefile @@ -0,0 +1,102 @@ +VERSION= 0.5 +#CUDA= /cm/shared/package/cuda100/toolkit/10.0.130 +CUDA= $(shell dirname `dirname \`which nvcc\``) +#CUDA= /usr/local/cuda +#POWER_SENSOR= $(HOME)/projects/libpowersensor-master/build +ARCH= $(shell arch) +CC= gcc +CXX= g++ #-Wno-deprecated-declarations +NVCC= nvcc +INCLUDES= -I. +#INCLUDES+= -I$(CUDA)/include +#INCLUDES+= -I$(POWER_SENSOR)/include +CXXFLAGS+= -std=c++11 -O3 -g -fpic -fopenmp $(INCLUDES) -DNDEBUG +NVCCFLAGS= $(INCLUDES) + +#CXXFLAGS+= -march=core-avx2 -mcmodel=medium + +LIBTCC_SOURCES= util/cu.cc\ + util/nvrtc.cc\ + libtcc/CorrelatorKernel.cc\ + libtcc/Correlator.cc\ + libtcc/Kernel.cc + + +CORRELATOR_TEST_SOURCES=test/CorrelatorTest/CorrelatorTest.cc\ + test/CorrelatorTest/Options.cc\ + test/Common/Record.cc\ + test/Common/UnitTest.cc + +OPENCL_TEST_SOURCES= test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc + +SIMPLE_EXAMPLE_SOURCES= test/SimpleExample/SimpleExample.cu + + +LIBTCC_OBJECTS= $(LIBTCC_SOURCES:%.cc=%.o) libtcc/TCCorrelator.o +SIMPLE_EXAMPLE_OBJECTS= $(SIMPLE_EXAMPLE_SOURCES:%.cu=%.o) +CORRELATOR_TEST_OBJECTS=$(CORRELATOR_TEST_SOURCES:%.cc=%.o) +OPENCL_TEST_OBJECTS= $(OPENCL_TEST_SOURCES:%.cc=%.o) + +OBJECTS= $(LIBTCC_OBJECTS)\ + $(SIMPLE_EXAMPLE_OBJECTS)\ + $(CORRELATOR_TEST_OBJECTS)\ + $(OPENCL_TEST_OBJECTS) + +SHARED_OBJECTS= libtcc/libtcc.so libtcc/libtcc.so.$(VERSION) + +DEPENDENCIES= $(OBJECTS:%.o=%.d) + +EXECUTABLES= test/SimpleExample/SimpleExample\ + test/CorrelatorTest/CorrelatorTest\ + test/OpenCLCorrelatorTest/OpenCLCorrelatorTest 
+ +LIBRARIES= -L$(CUDA)/lib64 \ + -L$(CUDA)/lib64/stubs -lcuda -lnvrtc #\ + #-L$(POWER_SENSOR)/lib -lpowersensor #-lnvidia-ml + + +%.d: %.cc + -$(CXX) $(CXXFLAGS) -MM -MT $@ -MT ${@:%.d=%.o} -MT ${@:%.d=%.s} $< -o $@ + +%.d: %.cu + -$(CXX) -x c++ $(CXXFLAGS) -MM -MT $@ -MT ${@:%.d=%.o} -MT ${@:%.d=%.s} $< -o $@ + +%.o: %.cc + $(CXX) $(CXXFLAGS) -o $@ -c $< + +%.o: %.cu + $(NVCC) $(NVCCFLAGS) -o $@ -c $< + +%.s: %.cc + $(CXX) $(CXXFLAGS) -o $@ -S $< + +%.so: %.so.$(VERSION) + rm -f $@ + ln -s $(@F).$(VERSION) $@ + +all:: $(EXECUTABLES) + +clean:: + $(RM) $(OBJECTS) $(SHARED_OBJECTS) $(DEPENDENCIES) $(EXECUTABLES) + +libtcc/TCCorrelator.o: libtcc/TCCorrelator.cu # CUDA code embedded in object file + ld -r -b binary -o $@ $< + +libtcc/TCCorrelator.d: + - + +libtcc/libtcc.so.$(VERSION): $(LIBTCC_OBJECTS) + $(CXX) -shared -o $@ -Wl,-soname=$@ $^ $(LIBRARIES) + +test/SimpleExample/SimpleExample: $(SIMPLE_EXAMPLE_OBJECTS) libtcc/libtcc.so + $(NVCC) $(NVCCFLAGS) -o $@ $(SIMPLE_EXAMPLE_OBJECTS) -Xlinker -rpath=. -Llibtcc -ltcc $(LIBRARIES) + +test/CorrelatorTest/CorrelatorTest: $(CORRELATOR_TEST_OBJECTS) libtcc/libtcc.so + $(CXX) $(CXXFLAGS) -o $@ $(CORRELATOR_TEST_OBJECTS) -Wl,-rpath=. 
-Llibtcc -ltcc $(LIBRARIES) + +test/OpenCLCorrelatorTest/OpenCLCorrelatorTest: $(OPENCL_TEST_OBJECTS) + $(CXX) $(CXXFLAGS) -o $@ $(OPENCL_TEST_OBJECTS) -L$(CUDA)/lib64 -lOpenCL + +ifeq (0, $(words $(findstring $(MAKECMDGOALS), clean))) +-include $(DEPENDENCIES) +endif diff --git a/libtcc/Correlator.cc b/libtcc/Correlator.cc new file mode 100644 index 0000000..a4e5b7d --- /dev/null +++ b/libtcc/Correlator.cc @@ -0,0 +1,117 @@ +#include "libtcc/Correlator.h" + +#include <iostream> + +#define GNU_SOURCE +#include <link.h> + + +extern const char _binary_libtcc_TCCorrelator_cu_start, _binary_libtcc_TCCorrelator_cu_end; + +namespace tcc { + +std::string Correlator::findNVRTCincludePath() const +{ + std::string path; + + if (dl_iterate_phdr([] (struct dl_phdr_info *info, size_t, void *arg) -> int + { + std::string &path = *static_cast<std::string *>(arg); + path = info->dlpi_name; + return path.find("libnvrtc.so") != std::string::npos; + }, &path)) + { + path.erase(path.find_last_of("/")); // remove library name + path.erase(path.find_last_of("/")); // remove /lib64 + path += "/include"; + } + + return path; +} + + +Correlator::Correlator(unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations, + unsigned nrReceiversPerBlock + ) +: + correlatorModule(compileModule(nrBits, nrReceivers, nrChannels, nrSamplesPerChannel, nrPolarizations, nrReceiversPerBlock)), + correlatorKernel(correlatorModule, nrBits, nrReceivers, nrChannels, nrSamplesPerChannel, nrPolarizations, nrReceiversPerBlock) +{ +} + + +cu::Module Correlator::compileModule(unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations, + unsigned nrReceiversPerBlock + ) +{ + cu::Device device(cu::Context::getCurrent().getDevice()); + int capability = 10 * device.getAttribute<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>() + 
device.getAttribute<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>(); + + std::vector<std::string> options = + { + "-I" + findNVRTCincludePath(), + "-std=c++11", + "-arch=compute_" + std::to_string(capability), + "-lineinfo", + "-DNR_BITS=" + std::to_string(nrBits), + "-DNR_RECEIVERS=" + std::to_string(nrReceivers), + "-DNR_CHANNELS=" + std::to_string(nrChannels), + "-DNR_SAMPLES_PER_CHANNEL=" + std::to_string(nrSamplesPerChannel), + "-DNR_POLARIZATIONS=" + std::to_string(nrPolarizations), + "-DNR_RECEIVERS_PER_BLOCK=" + std::to_string(nrReceiversPerBlock), + }; + + //std::for_each(options.begin(), options.end(), [] (const std::string &e) { std::cout << e << ' '; }); std::cout << std::endl; + +#if 0 + nvrtc::Program program("tcc/TCCorrelator.cu"); +#else + // embed the CUDA source code in libtcc.so, so that it need not be installed separately + // for runtime compilation + // copy into std::string for '\0' termination + std::string source(&_binary_libtcc_TCCorrelator_cu_start, &_binary_libtcc_TCCorrelator_cu_end); + nvrtc::Program program(source, "TCCorrelator.cu"); +#endif + + try { + program.compile(options); + } catch (nvrtc::Error &error) { + std::cerr << program.getLog(); + throw; + } + + //std::ofstream cubin("out.ptx"); + //cubin << program.getPTX().data(); + return cu::Module((void *) program.getPTX().data()); +} + + +void Correlator::launchAsync(cu::Stream &stream, cu::DeviceMemory &visibilities, cu::DeviceMemory &samples) +{ + correlatorKernel.launchAsync(stream, visibilities, samples); +} + + +void Correlator::launchAsync(CUstream stream, CUdeviceptr visibilities, CUdeviceptr samples) +{ + cu::Stream _stream(stream); + cu::DeviceMemory _visibilities(visibilities); + cu::DeviceMemory _samples(samples); + correlatorKernel.launchAsync(_stream, _visibilities, _samples); +} + + +uint64_t Correlator::FLOPS() const +{ + return correlatorKernel.FLOPS(); +} + +} diff --git a/libtcc/Correlator.h b/libtcc/Correlator.h new file mode 100644 index 0000000..723361f --- 
/dev/null +++ b/libtcc/Correlator.h @@ -0,0 +1,40 @@ +#if !defined CORRELATOR_H +#define CORRELATOR_H + +#include "libtcc/CorrelatorKernel.h" +#include "util/cu.h" +#include "util/nvrtc.h" + + +namespace tcc { + class Correlator { + public: + Correlator(unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations = 2, + unsigned nrReceiversPerBlock = 64 + ); // throw (cu::Error, nvrtc::Error) + + void launchAsync(cu::Stream &, cu::DeviceMemory &isibilities, cu::DeviceMemory &samples); // throw (cu::Error) + void launchAsync(CUstream, CUdeviceptr visibilities, CUdeviceptr samples); // throw (cu::Error) + + uint64_t FLOPS() const; + + private: + std::string findNVRTCincludePath() const; + cu::Module compileModule(unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations, + unsigned nrReceiversPerBlock + ); + + cu::Module correlatorModule; + CorrelatorKernel correlatorKernel; + }; +} + +#endif diff --git a/libtcc/CorrelatorKernel.cc b/libtcc/CorrelatorKernel.cc new file mode 100644 index 0000000..1572811 --- /dev/null +++ b/libtcc/CorrelatorKernel.cc @@ -0,0 +1,43 @@ +#include "libtcc/CorrelatorKernel.h" + + +namespace tcc { + +CorrelatorKernel::CorrelatorKernel(cu::Module &module, + unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations, + unsigned nrReceiversPerBlock + ) +: + Kernel(module, "correlate"), + nrBits(nrBits), + nrReceivers(nrReceivers), + nrChannels(nrChannels), + nrSamplesPerChannel(nrSamplesPerChannel), + nrPolarizations(nrPolarizations), + nrReceiversPerBlock(nrReceiversPerBlock) +{ + unsigned blocksPerDim = (nrReceivers + nrReceiversPerBlock - 1) / nrReceiversPerBlock; + nrThreadBlocksPerChannel = nrReceiversPerBlock == 64 ? 
blocksPerDim * blocksPerDim : blocksPerDim * (blocksPerDim + 1) / 2; +} + + +void CorrelatorKernel::launchAsync(cu::Stream &stream, cu::DeviceMemory &deviceVisibilities, cu::DeviceMemory &deviceSamples) +{ + std::vector<const void *> parameters = { deviceVisibilities.parameter(), deviceSamples.parameter() }; + stream.launchKernel(function, + nrThreadBlocksPerChannel, nrChannels, 1, + 32, 2, 2, + 0, parameters); +} + + +uint64_t CorrelatorKernel::FLOPS() const +{ + return 8ULL * nrReceivers * nrReceivers / 2 * nrPolarizations * nrPolarizations * nrChannels * nrSamplesPerChannel; +} + +} diff --git a/libtcc/CorrelatorKernel.h b/libtcc/CorrelatorKernel.h new file mode 100644 index 0000000..e28a2fd --- /dev/null +++ b/libtcc/CorrelatorKernel.h @@ -0,0 +1,35 @@ +#if !defined CORRELATOR_KERNEL_H +#define CORRELATOR_KERNEL_H + +#include "libtcc/Kernel.h" + + +namespace tcc { + class CorrelatorKernel : public Kernel + { + public: + CorrelatorKernel(cu::Module &module, + unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations = 2, + unsigned nrReceiversPerBlock = 64 + ); + + void launchAsync(cu::Stream &, cu::DeviceMemory &deviceVisibilities, cu::DeviceMemory &deviceSamples); + + virtual uint64_t FLOPS() const; + + private: + const unsigned nrBits; + const unsigned nrReceivers; + const unsigned nrChannels; + const unsigned nrSamplesPerChannel; + const unsigned nrPolarizations; + const unsigned nrReceiversPerBlock; + unsigned nrThreadBlocksPerChannel; + }; +} + +#endif diff --git a/libtcc/Kernel.cc b/libtcc/Kernel.cc new file mode 100644 index 0000000..0f3c1af --- /dev/null +++ b/libtcc/Kernel.cc @@ -0,0 +1,13 @@ +#include "libtcc/Kernel.h" + + +namespace tcc { + +Kernel::Kernel(cu::Module &module, const char *name) +: + module(module), + function(module, name) +{ +} + +} diff --git a/libtcc/Kernel.h b/libtcc/Kernel.h new file mode 100644 index 0000000..f6c9ab4 --- /dev/null +++ b/libtcc/Kernel.h @@ -0,0 
+1,23 @@ +#if !defined KERNEL_H +#define KERNEL_H + +#include "util/cu.h" + +#include <stdint.h> + + +namespace tcc { + class Kernel + { + public: + Kernel(cu::Module &, const char *name); + + virtual uint64_t FLOPS() const = 0; + + protected: + cu::Module &module; + cu::Function function; + }; +} + +#endif diff --git a/libtcc/TCCorrelator.cu b/libtcc/TCCorrelator.cu new file mode 100644 index 0000000..0836edc --- /dev/null +++ b/libtcc/TCCorrelator.cu @@ -0,0 +1,596 @@ +#if 1000 * __CUDACC_VER_MAJOR__ + __CUDACC_VER_MINOR__ >= 11001 && __CUDA_ARCH__ >= 800 +#define ASYNC_COPIES +#endif + +#if defined ASYNC_COPIES +#include <cooperative_groups/memcpy_async.h> +#endif + +#include <mma.h> + +#define NR_BASELINES (NR_RECEIVERS * (NR_RECEIVERS + 1) / 2) +#define ALIGN(A,N) (((A)+(N)-1)/(N)*(N)) + +#define NR_TIMES_PER_BLOCK (128 / (NR_BITS)) +#define NR_RECEIVERS_PER_TCM_X ((NR_BITS) == 4 ? 2 : 4) +#define NR_RECEIVERS_PER_TCM_Y ((NR_BITS) == 4 ? 4 : 8) + +#define COMPLEX 2 + +#if __CUDA_ARCH__ < (NR_BITS == 4 ? 730 : NR_BITS == 8 ? 720 : NR_BITS == 16 ? 
700 : 0) +#error this architecture has no suitable tensor cores +#endif + +#if __CUDA_ARCH__ != 700 && __CUDA_ARCH__ != 720 && __CUDA_ARCH__ != 750 && __CUDA_ARCH__ != 800 && __CUDA_ARCH__ != 860 +#define PORTABLE // unknown architecture -> write visibilities in portable way (via shared memory) +#endif + +#if NR_RECEIVERS_PER_BLOCK != 32 && NR_RECEIVERS_PER_BLOCK != 48 && NR_RECEIVERS_PER_BLOCK != 64 +#error unsupported NR_RECEIVERS_PER_BLOCK +#endif + +#if NR_SAMPLES_PER_CHANNEL % NR_TIMES_PER_BLOCK != 0 +#error NR_SAMPLES_PER_CHANNEL should be a multiple of NR_TIMES_PER_BLOCK +#endif + +#define MIN(A,B) ((A)<(B)?(A):(B)) + + +using namespace nvcuda::wmma; + +#if NR_BITS == 4 +typedef char Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef int2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#elif NR_BITS == 8 +typedef char2 Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef int2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#elif NR_BITS == 16 +typedef __half2 Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef float2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#endif + + +#if NR_BITS == 4 +typedef fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major> Afrag; +typedef fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major> Bfrag; +typedef fragment<accumulator, 8, 8, 32, int> Sum; +#elif NR_BITS == 8 +typedef fragment<matrix_a, 16, 16, 16, signed char, row_major> Afrag; +typedef fragment<matrix_b, 16, 16, 16, signed char, col_major> Bfrag; +typedef fragment<accumulator, 16, 16, 16, int> Sum; +#elif NR_BITS == 16 +typedef fragment<matrix_a, 16, 16, 16, __half, row_major> Afrag; +typedef fragment<matrix_b, 16, 16, 16, 
__half, col_major> Bfrag; +typedef fragment<accumulator, 16, 16, 16, float> Sum; +#endif + + +#if NR_BITS == 4 +typedef int2 ScratchSpace[4][NR_POLARIZATIONS][2][NR_POLARIZATIONS]; +#elif NR_BITS == 8 +typedef int2 ScratchSpace[8][NR_POLARIZATIONS][4][NR_POLARIZATIONS]; +#elif NR_BITS == 16 +typedef float2 ScratchSpace[8][NR_POLARIZATIONS][4][NR_POLARIZATIONS]; +#endif + + +__device__ inline int conj_perm(int v) +{ +#if NR_BITS == 4 + //return ((v & 0x0F0F0F0F) << 4) | (__vnegss4(v >> 4) & 0x0F0F0F0F); + return ((v & 0x0F0F0F0F) << 4) | ((0xF0F0F0F0 - ((v >> 4) & 0x0F0F0F0F)) & 0x0F0F0F0F); +#elif NR_BITS == 8 + //return __byte_perm(v, __vnegss4(v), 0x2705); + return __byte_perm(v, 0x00FF00FF - (v & 0xFF00FF00), 0x2705); +#elif NR_BITS == 16 + return __byte_perm(v ^ 0x80000000, v, 0x1032); +#endif +} + + +__device__ inline int2 conj_perm(int2 v) +{ + return make_int2(conj_perm(v.x), conj_perm(v.y)); +} + + +__device__ inline int4 conj_perm(int4 v) +{ + return make_int4(conj_perm(v.x), conj_perm(v.y), conj_perm(v.z), conj_perm(v.w)); +} + + +#if defined ASYNC_COPIES +#define READ_AHEAD MIN(2, NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) +#define NR_SHARED_BUFFERS MIN(4, NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) +#else +#define READ_AHEAD 1 +#define NR_SHARED_BUFFERS 2 +#endif + + +template <unsigned nrReceiversPerBlock = NR_RECEIVERS_PER_BLOCK> struct SharedData +{ +#if NR_BITS == 4 + typedef char Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][1]; + typedef char Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 16][1]; +#elif NR_BITS == 8 + typedef signed char Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][COMPLEX]; + typedef signed char Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 8][COMPLEX]; +#elif NR_BITS == 16 + typedef __half 
Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][COMPLEX]; + typedef __half Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 4][COMPLEX]; +#endif +}; + + +template <typename T> struct FetchData +{ + __device__ FetchData(unsigned loadRecv, unsigned loadPol, unsigned loadTime) + : + loadRecv(loadRecv), loadPol(loadPol), loadTime(loadTime), data({0}) + { + } + + __device__ void load(const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS) + //data = * (T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime]; + memcpy(&data, &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime], sizeof(T)); + } + + template <typename SharedData> __device__ void storeA(SharedData samples) const + { + //* ((T *) &samples[loadRecv][loadPol][loadTime][0]) = data; + memcpy(&samples[loadRecv][loadPol][loadTime][0], &data, sizeof(T)); + } + + template <typename SharedData> __device__ void storeB(SharedData samples) const + { + //* ((T *) &samples[loadRecv][loadPol][0][loadTime][0]) = data; + //* ((T *) &samples[loadRecv][loadPol][1][loadTime][0]) = conj_perm(data); + T tmp = conj_perm(data); + memcpy(&samples[loadRecv][loadPol][0][loadTime][0], &data, sizeof(T)); + memcpy(&samples[loadRecv][loadPol][1][loadTime][0], &tmp, sizeof(T)); + } + +#if defined ASYNC_COPIES + template <typename Asamples> __device__ void copyAsyncA(nvcuda::experimental::pipeline &pipe, Asamples dest, const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS) + nvcuda::experimental::memcpy_async(* (T *) &dest[loadRecv][loadPol][loadTime][0], * (const T *) &samples[channel][time][firstReceiver + 
loadRecv][loadPol][loadTime], pipe); + } + + template<typename Bsamples> __device__ void copyAsyncB(nvcuda::experimental::pipeline &pipe, Bsamples dest, const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS) + nvcuda::experimental::memcpy_async(* (T *) &dest[loadRecv][loadPol][0][loadTime][0], * (const T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime], pipe); + } + + template<typename Bsamples> __device__ void fixB(Bsamples bSamples) + { + //* ((T *) &bSamples[loadRecv][loadPol][1][loadTime][0]) = conj_perm(* ((T *) &bSamples[loadRecv][loadPol][0][loadTime][0])); + T tmp; + memcpy(&tmp, &bSamples[loadRecv][loadPol][0][loadTime][0], sizeof(T)); + tmp = conj_perm(tmp); + memcpy(&bSamples[loadRecv][loadPol][1][loadTime][0], &tmp, sizeof(T)); + } +#endif + + unsigned loadRecv, loadPol, loadTime; + T data; +}; + + +__device__ inline int2 make_complex(int real, int imag) +{ + return make_int2(real, imag); +} + + +__device__ inline float2 make_complex(float real, float imag) +{ + return make_float2(real, imag); +} + + +template <typename T> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned baseline, unsigned recvY, unsigned recvX, unsigned tcY, unsigned tcX, unsigned polY, unsigned polX, bool skipCheckY, bool skipCheckX, T sumR, T sumI) +{ + if ((skipCheckX || recvX + tcX <= recvY + tcY) && (skipCheckY || recvY + tcY < NR_RECEIVERS)) + visibilities[channel][baseline + tcY * recvY + tcY * (tcY + 1) / 2 + tcX][polY][polX] = make_complex(sumR, sumI); +} + + +__device__ inline void storeVisibilities(Visibilities visibilities, unsigned channel, unsigned firstReceiverY, unsigned firstReceiverX, unsigned recvYoffset, unsigned recvXoffset, unsigned y, unsigned x, bool skipCheckY, bool skipCheckX, const Sum &sum, ScratchSpace scratchSpace[], unsigned warp) +{ 
+#if defined PORTABLE + store_matrix_sync(&scratchSpace[warp][0][0][0][0].x, sum, NR_BITS == 4 ? 8 : 16, mem_row_major); + __syncwarp(); + +#if 0 + if (threadIdx.x == 0) + for (unsigned _y = 0; _y < 8; _y ++) + for (unsigned pol_y = 0; pol_y < NR_POLARIZATIONS; pol_y ++) + for (unsigned _x = 0; _x < 4; _x ++) + for (unsigned pol_x = 0; pol_x < NR_POLARIZATIONS; pol_x ++) + if (scratchSpace[warp][_y][pol_y][_x][pol_x],x != 0 || scratchSpace[warp][_y][pol_y][_x][pol_x].y != 0) + printf("firstY=%u firstX=%u warp=%u y=%u x=%u _y=%u pol_y=%u _x=%u pol_x=%u val=(%f,%f)\n", firstReceiverY, firstReceiverX, warp, y, x, _y, pol_y, _x, pol_x, scratchSpace[warp][_y][pol_y][_x][pol_x].x, scratchSpace[warp][_y][pol_y][_x][pol_x].y); +#endif + +#if NR_BITS == 4 + unsigned _y = threadIdx.x >> 3; + unsigned _x = (threadIdx.x >> 2) & 1; + unsigned polY = (threadIdx.x >> 1) & 1; + unsigned polX = threadIdx.x & 1; +#elif NR_BITS == 8 || NR_BITS == 16 + unsigned _y = threadIdx.x >> 2; + unsigned _x = threadIdx.x & 3; +#endif + + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + _y; + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + _x; + unsigned baseline = (recvY * (recvY + 1) / 2) + recvX; + + if ((skipCheckX || recvX <= recvY) && (skipCheckY || recvY < NR_RECEIVERS)) +#if NR_BITS == 4 + visibilities[channel][baseline][polY][polX] = scratchSpace[warp][_y][polY][_x][polX]; +#elif NR_BITS == 8 || NR_BITS == 16 + for (unsigned polY = 0; polY < NR_POLARIZATIONS; polY ++) + for (unsigned polX = 0; polX < NR_POLARIZATIONS; polX ++) + visibilities[channel][baseline][polY][polX] = scratchSpace[warp][_y][polY][_x][polX]; +#endif +#else +#if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16) + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 2) + (threadIdx.x & 4); + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 2) & 2); + 
unsigned polY = threadIdx.x & 1; + unsigned polX = (threadIdx.x >> 1) & 1; +#elif (__CUDA_ARCH__ == 720 && NR_BITS == 8) || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 860 + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 3); + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 1) & 1); + unsigned polY = (threadIdx.x >> 2) & 1; + unsigned polX = threadIdx.x & 1; +#endif + + unsigned baseline = (recvY * (recvY + 1) / 2) + recvX; + +#if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16) + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 1, polY, polX, skipCheckY, skipCheckX, sum.x[4], sum.x[5]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 1, 0, polY, polX, skipCheckY, skipCheckX, sum.x[2], sum.x[3]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 1, 1, polY, polX, skipCheckY, skipCheckX, sum.x[6], sum.x[7]); +#elif (__CUDA_ARCH__ == 720 && NR_BITS == 8) || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 860 + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); +#if NR_BITS == 8 || NR_BITS == 16 + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 2, polY, polX, skipCheckY, skipCheckX, sum.x[4], sum.x[5]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 4, 0, polY, polX, skipCheckY, skipCheckX, sum.x[2], sum.x[3]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 4, 2, polY, polX, skipCheckY, skipCheckX, sum.x[6], sum.x[7]); +#endif +#endif +#endif +} + + +#define NR_WARPS 4 + +#if NR_RECEIVERS_PER_BLOCK == 64 + +template <bool fullTriangle> __device__ void doCorrelateTriangle(Visibilities visibilities, const Samples samples, 
unsigned firstReceiver, unsigned warp, unsigned tid, SharedData<>::Bsamples &bSamples, ScratchSpace scratchSpace[NR_WARPS]) +{ + const unsigned nrFragmentsX = NR_BITS == 4 ? 12 : 6; + const unsigned nrFragmentsY = nrFragmentsX / 2; + Sum sum[nrFragmentsX * nrFragmentsY]; + + for (auto &s : sum) + fill_fragment(s, 0); + + unsigned channel = blockIdx.y; + + const uchar2 offsets[] = { + make_uchar2( 0, 0), + make_uchar2( 0, 16), + make_uchar2( 0, 40), + make_uchar2(24, 40), + }; + + unsigned recvXoffset = offsets[warp].x; + unsigned recvYoffset = offsets[warp].y; + + FetchData<int4> tmp0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + FetchData<int4> tmp1((tid >> 2) + NR_RECEIVERS_PER_BLOCK / 2, (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + +#if defined ASYNC_COPIES + using namespace nvcuda::experimental; + pipeline pipe; + + for (unsigned majorTime = 0; majorTime < READ_AHEAD; majorTime ++) { + unsigned fetchBuffer = majorTime; + + tmp0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiver, fullTriangle); + tmp1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiver, fullTriangle); + + pipe.commit(); + } +#else + tmp0.load(samples, channel, 0, firstReceiver, fullTriangle); + tmp1.load(samples, channel, 0, firstReceiver, fullTriangle); +#endif + + for (unsigned majorTime = 0; majorTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; majorTime ++) { + unsigned buffer = majorTime % NR_SHARED_BUFFERS; + +#if !defined ASYNC_COPIES + tmp0.storeB(bSamples[buffer]); + tmp1.storeB(bSamples[buffer]); +#endif + + unsigned majorReadTime = majorTime + READ_AHEAD; + + if (majorReadTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) { +#if defined ASYNC_COPIES + unsigned fetchBuffer = (buffer + READ_AHEAD) % NR_SHARED_BUFFERS; + + tmp0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiver, fullTriangle); + tmp1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, 
firstReceiver, fullTriangle); +#else + tmp0.load(samples, channel, majorReadTime, firstReceiver, fullTriangle); + tmp1.load(samples, channel, majorReadTime, firstReceiver, fullTriangle); +#endif + } + +#if defined ASYNC_COPIES + pipe.commit(); + pipe.wait_prior<READ_AHEAD>(); + + tmp0.fixB(bSamples[buffer]); + tmp1.fixB(bSamples[buffer]); +#endif + + __syncthreads(); + +#pragma unroll + for (unsigned minorTime = 0; minorTime < NR_TIMES_PER_BLOCK; minorTime += ((NR_BITS) == 4 ? 16 : 8)) { + Afrag aFrag[nrFragmentsY]; + Bfrag bFrag[nrFragmentsX]; + + if (warp != 0) { + for (unsigned y = 0; y < nrFragmentsY; y ++) + load_matrix_sync(aFrag[y], &bSamples[buffer][recvYoffset + NR_RECEIVERS_PER_TCM_Y * y][0][0][minorTime][0], sizeof(bSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < nrFragmentsX; x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][recvXoffset + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0, i = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++, i ++) + mma_sync(sum[i], aFrag[y], bFrag[x], sum[i]); + } else { + for (unsigned z = 0, i = 0; z < 3; z ++) { + for (unsigned y = 0; y < (NR_BITS == 4 ? 4 : 2); y ++) + load_matrix_sync(aFrag[y], &bSamples[buffer][/*recvYoffset*/ 24 * z + NR_RECEIVERS_PER_TCM_Y * y][0][0][minorTime][0], sizeof(bSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < (NR_BITS == 4 ? 8 : 4); x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][/*recvXoffset*/ 24 * z + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0; y < (NR_BITS == 4 ? 
4 : 2); y ++) + for (unsigned x = 0; x < 2 + 2 * y; x ++, i ++) + mma_sync(sum[i], aFrag[y], bFrag[x], sum[i]); + } + } + } + } + +#if defined PORTABLE + __syncthreads(); +#endif + + if (warp != 0) + for (unsigned y = 0, i = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++, i ++) + storeVisibilities(visibilities, channel, firstReceiver, firstReceiver, recvYoffset, recvXoffset, y, x, fullTriangle, x < 2 * y + (NR_BITS == 4 ? 8 : 4), sum[i], scratchSpace, warp); + else + for (unsigned z = 0, i = 0; z < 3; z ++) + for (unsigned y = 0; y < (NR_BITS == 4 ? 4 : 2); y ++) + for (unsigned x = 0; x < 2 * y + 2; x ++, i ++) + storeVisibilities(visibilities, channel, firstReceiver, firstReceiver, 24 * z, 24 * z, y, x, fullTriangle, x < 2 * y, sum[i], scratchSpace, warp); +} + +#endif + + +template <unsigned nrFragmentsY, bool skipLoadYcheck, bool skipLoadXcheck, bool skipStoreYcheck, bool skipStoreXcheck> __device__ void doCorrelateRectangle(Visibilities visibilities, const Samples samples, unsigned firstReceiverY, unsigned firstReceiverX, SharedData<>::Asamples &aSamples, SharedData<NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK>::Bsamples &bSamples, ScratchSpace scratchSpace[NR_WARPS]) +{ + const unsigned nrFragmentsX = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_X / 2 / (NR_RECEIVERS_PER_BLOCK == 64 ? 
2 : 1); + + Sum sum[nrFragmentsY][nrFragmentsX]; + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + fill_fragment(sum[y][x], 0); + + unsigned tid = warpSize * (blockDim.y * threadIdx.z + threadIdx.y) + threadIdx.x; + unsigned channel = blockIdx.y; + + unsigned recvXoffset = nrFragmentsX * NR_RECEIVERS_PER_TCM_X * threadIdx.y; + unsigned recvYoffset = nrFragmentsY * NR_RECEIVERS_PER_TCM_Y * threadIdx.z; + + FetchData<int4> tmpY0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + FetchData<int4> tmpX0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); +#if NR_RECEIVERS_PER_BLOCK == 48 + FetchData<int2> tmpY1((tid >> 3) + 32, (tid >> 2) & 1, 32 / NR_BITS * (tid & 3)); + FetchData<int2> tmpX1((tid >> 3) + 32, (tid >> 2) & 1, 32 / NR_BITS * (tid & 3)); +#elif NR_RECEIVERS_PER_BLOCK == 64 + FetchData<int4> tmpY1((tid >> 2) + 32, (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); +#endif + +#if defined ASYNC_COPIES + using namespace nvcuda::experimental; + pipeline pipe; + + for (unsigned majorTime = 0; majorTime < READ_AHEAD; majorTime ++) { + unsigned fetchBuffer = majorTime; + + tmpY0.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiverX, skipLoadXcheck); +#endif + + pipe.commit(); + } +#else + tmpY0.load(samples, channel, 0, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.load(samples, channel, 0, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.load(samples, channel, 0, firstReceiverX, 
skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.load(samples, channel, 0, firstReceiverX, skipLoadXcheck); +#endif +#endif + + for (unsigned majorTime = 0; majorTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; majorTime ++) { + unsigned buffer = majorTime % NR_SHARED_BUFFERS; + +#if !defined ASYNC_COPIES + tmpY0.storeA(aSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.storeA(aSamples[buffer]); +#endif + tmpX0.storeB(bSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.storeB(bSamples[buffer]); +#endif +#endif + + unsigned majorReadTime = majorTime + READ_AHEAD; + + if (majorReadTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) { +#if defined ASYNC_COPIES + unsigned fetchBuffer = (buffer + READ_AHEAD) % NR_SHARED_BUFFERS; + + tmpY0.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#endif +#else + tmpY0.load(samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.load(samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.load(samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.load(samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#endif +#endif + } + +#if defined ASYNC_COPIES + pipe.commit(); + pipe.wait_prior<READ_AHEAD>(); + + tmpX0.fixB(bSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 + 
tmpX1.fixB(bSamples[buffer]); +#endif +#endif + + __syncthreads(); + +#pragma unroll + for (unsigned minorTime = 0; minorTime < NR_TIMES_PER_BLOCK; minorTime += ((NR_BITS) == 4 ? 16 : 8)) { + Afrag aFrag[nrFragmentsY]; + Bfrag bFrag[nrFragmentsX]; + + for (unsigned y = 0; y < nrFragmentsY; y ++) + load_matrix_sync(aFrag[y], &aSamples[buffer][recvYoffset + NR_RECEIVERS_PER_TCM_Y * y][0][minorTime][0], sizeof(aSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < nrFragmentsX; x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][recvXoffset + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + mma_sync(sum[y][x], aFrag[y], bFrag[x], sum[y][x]); + } + } + +#if 0 + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + for (unsigned i = 0; i < sum[0][0].num_storage_elements; i ++) + if (sum[y][x].x[i] != 0) +#if NR_BITS == 4 || NR_BITS == 8 + printf("blockIdx=(%d,%d,%d) tid=%u y=%u x=%u i=%u v=%d\n", blockIdx.x, blockIdx.y, blockIdx.z, tid, y, x, i, sum[y][x].x[i]); +#else + printf("blockIdx=(%d,%d,%d) tid=%u y=%u x=%u i=%u v=%f\n", blockIdx.x, blockIdx.y, blockIdx.z, tid, y, x, i, sum[y][x].x[i]); +#endif +#endif + +#if defined PORTABLE + __syncthreads(); +#endif + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + storeVisibilities(visibilities, channel, firstReceiverY, firstReceiverX, recvYoffset, recvXoffset, y, x, skipStoreYcheck, skipStoreXcheck, sum[y][x], scratchSpace, tid / warpSize); +} + + +extern "C" __global__ +__launch_bounds__(NR_WARPS * 32, NR_RECEIVERS_PER_BLOCK == 32 ? 
4 : 2) +void correlate(Visibilities visibilities, const Samples samples) +{ + const unsigned nrFragmentsY = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_Y / 2; + + unsigned block = blockIdx.x; + +#if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 + unsigned blockY = (unsigned) (sqrtf(8 * block + 1) - .99999f) / 2; + unsigned blockX = block - blockY * (blockY + 1) / 2; + unsigned firstReceiverX = blockX * NR_RECEIVERS_PER_BLOCK; +#elif NR_RECEIVERS_PER_BLOCK == 64 + unsigned blockY = (unsigned) sqrtf(block); + unsigned blockX = block - blockY * blockY; + unsigned firstReceiverX = blockX * (NR_RECEIVERS_PER_BLOCK / 2); +#endif + unsigned firstReceiverY = blockY * NR_RECEIVERS_PER_BLOCK; + + union shared { + struct { + SharedData<>::Asamples aSamples; + SharedData<NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK>::Bsamples bSamples; + } rectangle; + struct { + SharedData<>::Bsamples samples; + } triangle; + ScratchSpace scratchSpace[NR_WARPS]; + }; + + // the following hack is necessary to run the correlator in the OpenCL environment, + // as the maximum local memory size is 48K - 16 bytes. Due to padding in bSamples, + // the last 16 bytes are not used, so allocate 16 fewer bytes. 
+ __shared__ char rawbuffer[sizeof(union shared) - 16] __attribute__((aligned(16))); + union shared &u = (union shared &) rawbuffer; + + if (firstReceiverX == firstReceiverY) +#if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 + doCorrelateRectangle<nrFragmentsY, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, false>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +#elif NR_RECEIVERS_PER_BLOCK == 64 + if (NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK != 0 && (NR_RECEIVERS < NR_RECEIVERS_PER_BLOCK || firstReceiverX >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK * NR_RECEIVERS_PER_BLOCK)) + doCorrelateTriangle<false>(visibilities, samples, firstReceiverX, 2 * threadIdx.z + threadIdx.y, 64 * threadIdx.z + 32 * threadIdx.y + threadIdx.x, u.triangle.samples, u.scratchSpace); + else + doCorrelateTriangle<true>(visibilities, samples, firstReceiverX, 2 * threadIdx.z + threadIdx.y, 64 * threadIdx.z + 32 * threadIdx.y + threadIdx.x, u.triangle.samples, u.scratchSpace); +#endif +#if NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK != 0 + else if (NR_RECEIVERS < NR_RECEIVERS_PER_BLOCK || firstReceiverY >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK * NR_RECEIVERS_PER_BLOCK) + doCorrelateRectangle<(NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK + 2 * NR_RECEIVERS_PER_TCM_Y - 1) / NR_RECEIVERS_PER_TCM_Y / 2, false, true, NR_RECEIVERS % (2 * NR_RECEIVERS_PER_TCM_Y) == 0, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +#endif + else + doCorrelateRectangle<nrFragmentsY, true, true, true, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +} diff --git a/libtcc/TCCorrelator.cu.o b/libtcc/TCCorrelator.cu.o new file mode 100644 index 0000000000000000000000000000000000000000..943a2da3a75846584b95962314b91f4161f2bc5a 
GIT binary patch literal 26544 zcmb<-^>JfjWMqH=Mg}_u1P><4z)+BeU^{@B4h*aeEDXw-X$po01_laR3i0vIp)QWj z&hcTcLGiwhUj9Mx@e0~#!k&I$VLMv|Lqh`tLj^T8sDgOMAZL&Gcm+FK1q%ZMF6ETe zw9LFz1;^k>Kj(O7{{T<dU@qm<yp+r|E-vNFyyTqHlvD+q<ox`C)S|?a%(B$@^rHOI zf?~Z4JG}Dxxv9Cy1(orM#g%!<AjMoDFF<TdL9-5FO=(J^enDnIYEEWesvQ=0<mM(q z%z_xC;1?9{<QVMg<LT!b%*m<Y7Ze}l>g?(n<{A{NprwGy*H$ppRM1y2(nNUL$J5<U z!%@dilao_JL&H&1Tf<LN*HBYm!%tI7!%tI_3$D~JC_co~*EKjkz%?k|$;aQ>n^VKk z$U;FMY>AU+NU$ctTx4f}HH0|(#z$~!fZPLCs$grYV4`5JV5DHBU_wA&Bo=)p3RVgh zNUn4C_YLrIjo{?uGy=t18d_Z2z#R#**xcAa!3rFDo*}^?K??<Y1#=@bK|?c;r~ydS zK$A;3wWuh+NTDPnvsfXqC^;juBsICDv?x^}Be7T^FJGa!G_xczDJNB-BsH%%zepiD zzbLgBDGt$HrD&^QZeW0!GC|@-IK)k{i(B9@$HEMe!UOz+LL8lZTov^76-x86^YY8{ zP`#mRr%+y$S(2(ymRX#cl$n!Rl9^hpkeR1YkY5DxSb1Wlf<{?pqC#;-Vo_>}LT+kq zeo-YTWMKIv4HTyEBnXasaAHwVv{f)R0{ImqZNTMCEbz;lnZTl;G_Sa{0OYFF6dd-U z1iN2Qe6S-Zfk0BPvxlRfpR138Do8I{q6WFs0A>vieZ?91r8y}INvR5n3b~~@C7A^| zsS5dEpQD+Ea0N8G`g(%0o0FyjD6`pUIBD8zIBHsHIDv8+S7~u(Ub;eFVs2`2L1J>M zLS9*NX-cA%Re5f1qBSI0gR(m`!J2TDR2HPBq^2n(XCxLWC@3fdC*~I9q!vf}1;s<X z7#s`YVDq~^7QcfP!{ZLj3-I@G4DyU}4Ds~$gBXV9!&qy$Gcxl^j6lu^LrOg$>rvbS zFRXAFk7}VcmvU-OCOBk4#RfQ^!tDcj8)V^#@V22D+~4u>8HqV*Mgtw)X*v0cpa`dO zbVCXfa9xQtv8NR!rst;SmDuDamK0@H#3$+~Sb&kSk&Z%YML}v&CP+Fl$I7apC^b2= zI5R)b%Bt8zN1-UcJU%xuE5FE2!4YH%BISa3AWbkclfVXA=qM00GdVv8W~LL!6f9;Y zCMTEXmgXdu<QE|uo0(T)2g)sYad2ra-YfvJ64~L2Itqqn2wa?*o|l>eE+0_5k7_63 zz`$uHlJ`-~Bq}hFjU^&Dum=WG#vl+HkbHwDAi!QAEi_<e;tU9|nIwk>IB(%k9?;Mr zlwF{Oa&U4{Vo7pFFsNRMHo;c*8etK{oA$7oY=OlPtkz*M895>$MGK0fNirH5D)I3t zsb!hTsqyg&nR%cVwL)fIi9&LIURHcTYEiBRh*ze`RgGENC@ARb7p0bz7Ud~uXp||a zDHv23xIv(%f{l%WiKar0f<}CNSzc;-aj}U;nSz}iNDxJ#HJ5?{VM8<wDxfACDCmN1 z!(x;ssuQpTp9L3^o8#k?Doau!o-ET*Kys(1j)Fmjk-34XCX(yWlp9nS7`VBCFvvj= z-$VEY2FUs`>SjYTgpX0oRESeBsIV}A03C%gkRuEYjEyuw9sx&IEf*IT*61=qjV~hw zaD>4FHa9UlH6D}{H4xESrdOe<gQTrYuToRfnhU2DCP->C^GZx`SYblI3REkr(40^X za{}&OAheAb<m%`W@95#`=mIW06ciLdtwtjqP>YS&rW{I_D%is@$kip@Db&pk)GY*= zW<siIsX4`|@V+3X3k~6Npf)qAeMazBFjq-xZb43BNveWPX&$t4$}0*=O-{`$OD!r6 
zNG)>8$xqH!umyP)-VjA}BJ323ONvU9OB8~^-4B<<l0@v87rkW+3P{IdMDrLmL}H_{ zI09QqhS>*>jW&!0Wh87CI#F&Rv|9^y3tGclL0iGl3}QV-J_MBoFc(3C8Pr%-pivOO zT!|bI$kx(4AS_~|5pKW;4REOh3l2zy4N4CR^b8JUYiS-FCaA#yDPOI*xRA0RC^$eJ zO@$D6o^neqNzQ;~DFp?jvKTI*0nfBK`H3k(smWzJ3MfJW`8jAJA(^?Unp_GBR$QPU zhiTJ*F*J1)K+J&r9I#M8ehyRwqz^0vVrc3pq$HLkYE&E4g3PMsQc$Sn;zDs_S$<{; z*eFmLU0ed|8!14uAd;(+GZORiQjy(Rl9`)|q#!M`sJH}HCg~_7<>%)p6lZ4^_~a+1 zIA^3{6i<+01T3W$sJChWbFhL!W}1QqsyQ_^3MiH-XoI}~3Rndjlp+vh3Mc?U!JuFZ z8rTa_(9%>;gL*F->Z#ahkY8e>ahVz$4GnDw0~EI4(hD4#h_V@;I1t6G9dd#I2X=8u zeo?BU20{retTjQkX>kdcf&wU_6hMyEKy@w15g<=OToI6;17U!j6KxQysbC9^Z%}bU zw#%HT<uU`@jw8}<hL|QG+A5$J1vkZNAp<;+HUQfAuz~`X$bl8+m{F6QUr^}?8g+Km zfDIv`b&R0{%nE9tp=KR;L8y?DT3iBd#th#qggr!%$Lck-6rc$f6dd3zgD17>fD(9e zeqM2j0%|I!AW!NjfZ_#~ED*781J9YTIs{MTI}sWAFr!8zACzSXM?PACFxcWhEwjQ& z10JVI(7{nq1%h52!{o4~aJ*%fmVyQ}gWxj)Eu28zA5gswn_5#qZ90H*7`!n6VSwrb zP#q8gs`)?|R90GZ)q+|wNX-*O&)!G@+#5{J2MvQ)fP42vsfjr{;9h@bZelv9tB<AU z2vr19fUB1X9*{vZA1tN-(F|c@w*xF^qybR~RsyjFOPL;mlAobILsYl$q1H+bI1@BQ zgFIh@)RKXgKu97<iN)ZVMI@o3)a0^AG=2mUza$x1wj>!@t{^`LS+XEMCjzO#1a2%v zq6<XmD1;~!m*xgRX-`O`fLcf3<`Gy|ghEY?0?1_%3fc-K$q@=RwqR}~m>a1HnkPc2 zj)bX(N~1TPK#5-gWl{^#V1k7R#9#$2WNSbrdr30LP{_O~SWQH1G|1avG9p&N7B$U5 zJXTzq>xpx~0ZVjdrWTVKp(rgq<hVeQK#ns|s7B_er4^@^AjL9BGy+4k5=lh`dPHK2 zSx7w?T$-z(R$Q8^qkueur%;SM1{sTFba`S?0Vow9kBq=3E4dWF4b6DyXg|1!Rzoof zq&qg+0D<%>bU+CVJi-B+H3H2wTPRq8N5^whbK~K20pLnDJ|1K)$RG_(=tP?V7o^2r zl2MeJnBtjIp;rNK&nqaT<rgU+dPDJ*3bqOc)(Y{J3N{KBU|L%nlIYNt7Ubu^R705H zc0S6am$d>!24MucA@LPZ`zjP{6iloY;wu!86{9N$83)w?VdAnA-55v+U<vH_N^tTA zQSlWJW<{({1$gchloyHBqE`vl0t+fo84tn*MVWaeX&Or4G#qKGS_<hVMSxhKNCMH7 zAff_9#Dke2pMe?iU_lTU%qUCDvDHvb(@{;+jLB2dftUpA-9@7DBXkr%=Iba_>L^s` zD8yIlK;lA2A-)2_1SKuxWDHUs8x0Bwkd2kG(TMN>CqrV@Rw711u?<GUd)e`opa4=( zP_R`%PMLOg3dW$S5}~dFu1*6@p%G{-*$_oFs6Yd&w^cw_ZHQHM1XMM$(P|2YpwV3P z=~_^FK$^${55^#ziPJkqC=QHAcn8%=V~~~L0S+#t1_h`ziv&5*7A?LNv?1j?q@+{O zM(JdO#~DEjRU#F%6e>XriYieY0WuZraWqpwbre_*zo`)lS_%~)Q!7wRg;zkf3K}3& 
zK$b%p@bx-SXGd6rMm=!Wl(0GyckKwy`0$Ym(0U!zxpfopC=GIJ0<)q7XZJ`j39kNd z=k-W<Zit8lMH4Y8154Qz2`TKrX%y@M%+fLvsr*ANnn6txxc&%8!vIr%1d@J24F#~z z$?`uqnJPmD(lgQ50@*^=0@Z+Ll|akeV9V4%5|G6}D7gq!X2P=(B;9~Z1#p=OFW;aE z2U_0~O79xznco<c`HeIcv{8!%H3buJwTWUs6><iK{otYsG)aV%x54QElDAQd0*oRJ z>>>r+MH*zFSOc<}5N~i>Xo5;HOogTfSj8<2aEP0MS`{dvMuk!eBh<jLW=x+Lf>aNv zb|J+OdV&H+D9(t&h)*niMG}ikP?Hif-XQTqOw1dafifDn2@3CVRcauYkUF5{6_62S znA>z9l?;T6&`~e|BT%q_8xavY;PGT|M<)`A4H7TS)vJg$hy^!b^(vwbW5Gd7x&4M< zn<K#_$e!%X0^IhS!0k7siv5Np`rim{zcID!C&~Y2aQn?c_76}wL3yMbfTNqpNJmR_ zpp1i5;aGqQIwU^08ArJbjYx4JwUedE04GZm9R-s7kC7~)4GLrmw8jCpqTMejKHM=V zAXvc!GPQtX;jOK$f|&{O+#q<yE3GsqC!{DdF)uwQ6=hHgJbaOo@0?#$lnR>4hbh80 zfB_vAfXu(c=82F708kn&NaCR06%w~3Gerk!>fgrB$_hTvsRkR0#5d>w3z1mRXoCV| zo(eGxnOEcnU0YZj0a=28JbGXVTB>daYPv&w1Fy!Bw1JwcNZbg}VtbGd(0~Z2$pXd2 zrMb~4%Cr=axRJ5oW<Izbn^;<sub@_}U<De`0M$ea3Tc@+Iq|TSZW_fp3I?Dd3#0}T zbktA5Rv`(rm;y9(Q3*B?>Tb|V03!uRU!^!2G=c>g7zU4>mV#w9KvkB4fu;_)KY%V_ zXof?=1e*>c6CDMp1Zd<I$)TWDD|F0B!4~cyP@Ko=q4tqLDq(F{bd?al!RM%LKnr;6 z6iRXn3^X)KGE+e7vWzsL0gsi^QGls~G-h-Z%uE!(%cH^3uLY4-Q!s>#U}Le#5ZNXy zeM-mzcYRQ9C&4Q4H~^06KwK-uk*E1U2@f=l1YY+BVt^9?I9b8dE_gXCXq+9=M?!FI z6cB5CtQ8RA+S;1XX<fJ(X`pojPNivSsYT$?(5(C-kcnX16u=Ro2cM91(f~O`M*)^u zqmeX&DqTnkfz+ge+l)3+qoaUaK!F=cV7m<IVHY@8fIOm?oS&PUSppg~giLLK7D<8K zVW0<^<ko=B&m(!rfUrkOatjQ}){ba06CKl_6<b(#M}V^c+M*)@Ng)X%6+jk8pf0z9 zj<71?NZZix12-<5G~g)(Vm7Ef2Fk_YFv6__5ei7<4LB_Yr6#6;k|%OfRnSIG?VvFh z$jB$E2JqMe(d+LNAd5<I`Vq-pNO=I%TZMQO#Z8(xf)NxF(BPsli@}10aOwov2@ZWa z*$5dh0oe-mDUOVV5?Lg8gBr#`lN-p9_?K}gfG74*iVdhxd17Wsd_hrWevu8L<h28p zEa1`xRG;aAr*`pW2wV!F?gN_ynd^XbwTr>#D;I#8J&6jXc}4j-IbcO-Wk+Tnq)dm_ z5^yd!OJEieaCvQ81<WmfhGq(&y?d~FL;<oHLm?5=_(E<Lf!aZ!5LHkBFP~RPLRSD9 zRRp^hl3PIy2gr1SLNymBst=$OG@$tz<d&8-XqZ=9lZ#UUyzmNTk_qCT%2*u*c+)Z( zk~L$ak=r9!>U;PYGHgyN7ZxHQ0Ud?n%&OG<G(^z`o4e9f&{6<xsD$>jG_AQf5dahv z=-z_P>VT$!kh~RP4H{?0;VnogR^ap&tU-puTi`)#kS{<JZbW(uyrc`=XXsu5$B>Rf zCM1dBi73>F#Tx=3<3JN*oS?AF%}oT)@@art-<h#G&;$pHU__We<e<$71%+A#&}c1Y 
za;gHk5@b4f+OJB%M#0!xp$asJ3{s0eh7I-_D5+uYLIrJOHPQsh!QBZekBCl)`dUay zQA=OJ$V5R4<Pbdb8YD&@5or?LzesTc+wTjSI)zQanrI@WPb?V^5gV8p58_{F8YSpo zDkfHRKav)wIMb|=g0_MYXtoIwB%mY(PY4PMq$OF-T4-Sip+O{CWrUbhSHP$R5LE<d zehE>Qg3UyAKiKaer-D;IuB-~mbLgRllrk{OE0Bp`&x4zF@X=P#+6Wxut!TprXk4V6 z0Ue2j^m$>kyT}~~9R*Ms3XTcTWCyeWMoqQgG1+KP$$>oO1(`7hml&Y30!wh=E-b)` z(ZdKFx}Xw)U}=FGD@b7r^M?^es-jZRLW&t3XmLXzKna8!D2+l-TR>W$4)0wer4!@@ zC!odHk;$Od*htG#K!OqIg2AA5${0$)q7b!sN4J7flS{y3U{o94!na5Vv6vb(wgp?@ ztDu$$UB!no`h{;ePr)9v*&ckp0erd_JiQGbilF(p85TPsBPys8pmq{?j0}2+1gb39 zsh})q2C6wgqi=?gkuK0^8mOX1P8y&sJfQX=Xftdvv|)&vKalbjvPMu&1C=_U;tEv* zQiej(0B>A@G+>OzmF7lQ#zuo$SLma1prI&`7F+NNli<v%RM6@&@CcntX0BePf)>(( zjw;YfGx)-zO3(x>q`?jv%tIX-!#Uaq&q1Ja5L#m@*dh-!f~rT1L<ETwWXE7Gf<O$# zAq5O*-2ztAKoSVoSAoZcuneL_VvN^OJ}?(SnVHZz5FE+U1hmTv#Un=GamPr+aG)_L z=0JPMVZ(lq`A^Wka!7ta9bq)aGQtQkGlDWRAv0JQv4fPuxD*snykdgx74RwmWUml0 zSPE-XfWi^SK**39$c!{Vj4nDtM>1hWGub1V&>3J*-36(oKr6u@%W`lgJm_Q`4tX=k zGM<v$f=Hy{P}({eQiFpExRTt02&8bL+K4Z7)*2K}kb0YlZ~`~5Kn_9*2iiIa5)NQ{ z@edG35<NmpqXYs;9-!VP5u?;t-2m|`cvLQeXy1?-jgUwr(zRIafVdV`B%!yu!J~)J zabW}GQEu=M7qO$h$QrTClt6}$P)44?M-Y)dJRWJF2X4wcYQW1Mh~c2|aZquKyR0{X zP303)df>4H5}t@04_1nGyq=JIz_|qy!FUXT*oTNeqycnr<iSR;QG$2K5A0K3<-o!P zZActzj;2Pn11=9JuXsptHY90)k|Nl0%Bvq7mSZhf2!t6paYLL%aV12GvmjxDHhF@( z(m}~W^iC=`igl9R1o1O+Iv~SjiX)j?c0zmuE$~2|0*#bprYYcPNkiw6kY}W!LWHN5 zA`EcOEs;=o6PSI1`UIRIFoqwwhQTBh>hJ-9@di#%GQl;@k6x*hHsOSQS_W<=?%5`A z)S$PF2n>T@`UBtiJ@Qxt&1aZEc?R7#gkl9X4hi`kZHOK^^n;i=L<9wByb3H09w`Pb z76D=K5+Bf-a8zpvI1=P9R1E~&3DSTb5YUOXOa&Xz(lXF&o<Xc$UTJPTs0o&so*JK; zlL{V8vsTCiO>A*-g60L_-q5Rv&Wwf3wt_u@GGbm+1DbyVk8Rl6Dp-J~S7BSazyn^M zDHXOFswp~Pq^VGnnF89!1>U^{-m{en;+EN}rhs>B!L)$(Q^DDlI*8S6RXQLeV588P zknv`4V1r!=>L-GS<~bD<$Z|#+^_+pcR1+L-u)>_UsTtH*r%f^e#eXWvvoBb7(t+|i zbfggL{0rzT6!0)SWY`0;5FRW5>E9#8qYld6QE+rC=)(t6p*w9;D@sy}@)VSul@zK$ zXNjff<R>NO#K&{R$LA!L<|SvuC*_ysr4+}<gJvZmD`U05Cp_ZrEEyXq*egILX}Q1) zUz1_WUP+!0AhhHZw00DSL%=JsP`hiOVcJN@@>1lDZ{Ylayz;XGy4VVb$+qA_c_H}~ 
z)dKj~Ab6M)K9~%)0OVBo?pI9(jpD+hk~9qq1uaOP1f5K!si3Q%X9)sn;3KC&ixE)` z0WGlwo2H-((FPhP)&Pqjc3h)qLz&-$YsWf@4R$@!o--VN1CNxUIt$Hj5U+t<1unT! z6~i5Y><6&oNFrShN{ZMV2Rfx2<bH60fz;JS2?LN@A+{-Kfl6E~27yQ0<KrR6HOI#* zl;&mT=RsF+LeElzto~I%TUZUBe1f$bKpp~BBuH~qxDyq4%ElNpXM<f0@(dJgK@-G+ zTF~AZ$dnsI0Iw_I%fF!uEg@QJ6-uBh*CBk|yU^ha!a)ja6-pK2;}c6tiZYW*OH$+G zH8c`)z$d6`7@BE<#^#}W!cih20yIU1A`+<yF50pC9HiRV2>$>sILj$0AT6?oPea-u z&$xhhxWgkIvPKZjMNIdBtTdpV+O))+;#51dJyFp3!0bt)gcRsp0=**G6ssOAsp%+` zB8kG%2S^lo<{eV6<L!Qf^EYI57;<dj4k<-j1@QUmDA^LSYZP0+q95QD0lHKG#U?C8 zH@IxVnPD{{tNc-7AGXE9hJ>Jw&;c(jL7I3&ol^v_az~!#1I;-agH{e8w1D<NBX|{{ z2!LfQJ$M9{>Y+snsE`1+w?GW+{wOIbB{IbfoHwA|RrLNZ{<OoT09i2xYAd6p1Mo01 zF-a(r3P}jukHcDepr&i^;4?NyK^7$;djMRTKpGvokXs~h$bxq{fYYgt0w|g>${h_* z8pEzx6IxP%^bJ&j0<Cc22^ecIq87p+=fXUTU=NH3K>NRHxxm+^;31f?+`Iz1i$ws7 zxH1C+g8;+F|2nCt2D*U785z*U8RC;N^Ad|H<4cm0<3pTb`^55#;*(3`i%SxVO7N@5 ztV+eJAT=+AfkCghGPfi#i9xTp1avb6gI;k4j0HL*3e~0L@uV3Tz@fp6rX6x84FiV1 zLE(q26vSsj7J#w~py5~n<%7%zspEk1C7?7X0|Ns{4$6g7FQEEU;35zP$UX=QLJH#0 Q?*qMGM-ie7PNM4v0D^MdDF6Tf literal 0 HcmV?d00001 diff --git a/libtcc/TCCorrelator.cu.orig b/libtcc/TCCorrelator.cu.orig new file mode 100644 index 0000000..ddf921c --- /dev/null +++ b/libtcc/TCCorrelator.cu.orig @@ -0,0 +1,587 @@ +#if 1000 * __CUDACC_VER_MAJOR__ + __CUDACC_VER_MINOR__ >= 11001 && __CUDA_ARCH__ >= 800 +#define ASYNC_COPIES +#endif + +#if defined ASYNC_COPIES +#include <cooperative_groups/memcpy_async.h> +#endif + +#include <mma.h> + +#define NR_BASELINES (NR_RECEIVERS * (NR_RECEIVERS + 1) / 2) +#define ALIGN(A,N) (((A)+(N)-1)/(N)*(N)) + +#define NR_TIMES_PER_BLOCK (128 / (NR_BITS)) +#define NR_RECEIVERS_PER_TCM_X ((NR_BITS) == 4 ? 2 : 4) +#define NR_RECEIVERS_PER_TCM_Y ((NR_BITS) == 4 ? 4 : 8) + +#define COMPLEX 2 + +#if __CUDA_ARCH__ < (NR_BITS == 4 ? 730 : NR_BITS == 8 ? 720 : NR_BITS == 16 ? 
700 : 0) +#error this architecture has no suitable tensor cores +#endif + +#if __CUDA_ARCH__ != 700 && __CUDA_ARCH__ != 720 && __CUDA_ARCH__ != 750 && __CUDA_ARCH__ != 800 && __CUDA_ARCH__ != 860 +#define PORTABLE // unknown architecture -> write visibilities in portable way (via shared memory) +#endif + +#if NR_RECEIVERS_PER_BLOCK != 32 && NR_RECEIVERS_PER_BLOCK != 48 && NR_RECEIVERS_PER_BLOCK != 64 +#error unsupported NR_RECEIVERS_PER_BLOCK +#endif + +#if NR_SAMPLES_PER_CHANNEL % NR_TIMES_PER_BLOCK != 0 +#error NR_SAMPLES_PER_CHANNEL should be a multiple of NR_TIMES_PER_BLOCK +#endif + +#define MIN(A,B) ((A)<(B)?(A):(B)) + + +using namespace nvcuda::wmma; + +#if NR_BITS == 4 +typedef char Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef int2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#elif NR_BITS == 8 +typedef char2 Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef int2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#elif NR_BITS == 16 +typedef __half2 Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef float2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#endif + + +#if NR_BITS == 4 +typedef fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major> Afrag; +typedef fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major> Bfrag; +typedef fragment<accumulator, 8, 8, 32, int> Sum; +#elif NR_BITS == 8 +typedef fragment<matrix_a, 16, 16, 16, signed char, row_major> Afrag; +typedef fragment<matrix_b, 16, 16, 16, signed char, col_major> Bfrag; +typedef fragment<accumulator, 16, 16, 16, int> Sum; +#elif NR_BITS == 16 +typedef fragment<matrix_a, 16, 16, 16, __half, row_major> Afrag; +typedef fragment<matrix_b, 16, 16, 16, 
__half, col_major> Bfrag; +typedef fragment<accumulator, 16, 16, 16, float> Sum; +#endif + + +#if NR_BITS == 4 +typedef int2 ScratchSpace[4][NR_POLARIZATIONS][2][NR_POLARIZATIONS]; +#elif NR_BITS == 8 +typedef int2 ScratchSpace[8][NR_POLARIZATIONS][4][NR_POLARIZATIONS]; +#elif NR_BITS == 16 +typedef float2 ScratchSpace[8][NR_POLARIZATIONS][4][NR_POLARIZATIONS]; +#endif + + +__device__ inline int conj_perm(int v) +{ +#if NR_BITS == 4 + //return ((v & 0x0F0F0F0F) << 4) | (__vnegss4(v >> 4) & 0x0F0F0F0F); + return ((v & 0x0F0F0F0F) << 4) | ((0xF0F0F0F0 - ((v >> 4) & 0x0F0F0F0F)) & 0x0F0F0F0F); +#elif NR_BITS == 8 + //return __byte_perm(v, __vnegss4(v), 0x2705); + return __byte_perm(v, 0x00FF00FF - (v & 0xFF00FF00), 0x2705); +#elif NR_BITS == 16 + return __byte_perm(v ^ 0x80000000, v, 0x1032); +#endif +} + + +__device__ inline int2 conj_perm(int2 v) +{ + return make_int2(conj_perm(v.x), conj_perm(v.y)); +} + + +__device__ inline int4 conj_perm(int4 v) +{ + return make_int4(conj_perm(v.x), conj_perm(v.y), conj_perm(v.z), conj_perm(v.w)); +} + + +#if defined ASYNC_COPIES +#define READ_AHEAD MIN(2, NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) +#define NR_SHARED_BUFFERS MIN(4, NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) +#else +#define READ_AHEAD 1 +#define NR_SHARED_BUFFERS 2 +#endif + + +template <unsigned nrReceiversPerBlock = NR_RECEIVERS_PER_BLOCK> struct SharedData +{ +#if NR_BITS == 4 + typedef char Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][1]; + typedef char Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 16][1]; +#elif NR_BITS == 8 + typedef signed char Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][COMPLEX]; + typedef signed char Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 8][COMPLEX]; +#elif NR_BITS == 16 + typedef __half 
Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][COMPLEX]; + typedef __half Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 4][COMPLEX]; +#endif +}; + + +template <typename T> struct FetchData +{ + __device__ FetchData(unsigned loadRecv, unsigned loadPol, unsigned loadTime) + : + loadRecv(loadRecv), loadPol(loadPol), loadTime(loadTime), data({0}) + { + } + + __device__ void load(const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS) + data = * (T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime]; + } + + template <typename SharedData> __device__ void storeA(SharedData samples) const + { + * ((T *) &samples[loadRecv][loadPol][loadTime][0]) = data; + } + + template <typename SharedData> __device__ void storeB(SharedData samples) const + { + * ((T *) &samples[loadRecv][loadPol][0][loadTime][0]) = data; + * ((T *) &samples[loadRecv][loadPol][1][loadTime][0]) = conj_perm(data); + } + +#if defined ASYNC_COPIES + template <typename Asamples> __device__ void copyAsyncA(nvcuda::experimental::pipeline &pipe, Asamples dest, const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS) + nvcuda::experimental::memcpy_async(* (T *) &dest[loadRecv][loadPol][loadTime][0], * (const T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime], pipe); + } + + template<typename Bsamples> __device__ void copyAsyncB(nvcuda::experimental::pipeline &pipe, Bsamples dest, const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < 
NR_RECEIVERS) + nvcuda::experimental::memcpy_async(* (T *) &dest[loadRecv][loadPol][0][loadTime][0], * (const T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime], pipe); + } + + template<typename Bsamples> __device__ void fixB(Bsamples bSamples) + { + * ((T *) &bSamples[loadRecv][loadPol][1][loadTime][0]) = conj_perm(* ((T *) &bSamples[loadRecv][loadPol][0][loadTime][0])); + } +#endif + + unsigned loadRecv, loadPol, loadTime; + T data; +}; + + +__device__ inline int2 make_complex(int real, int imag) +{ + return make_int2(real, imag); +} + + +__device__ inline float2 make_complex(float real, float imag) +{ + return make_float2(real, imag); +} + + +template <typename T> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned baseline, unsigned recvY, unsigned recvX, unsigned tcY, unsigned tcX, unsigned polY, unsigned polX, bool skipCheckY, bool skipCheckX, T sumR, T sumI) +{ + if ((skipCheckX || recvX + tcX <= recvY + tcY) && (skipCheckY || recvY + tcY < NR_RECEIVERS)) + visibilities[channel][baseline + tcY * recvY + tcY * (tcY + 1) / 2 + tcX][polY][polX] = make_complex(sumR, sumI); +} + + +__device__ inline void storeVisibilities(Visibilities visibilities, unsigned channel, unsigned firstReceiverY, unsigned firstReceiverX, unsigned recvYoffset, unsigned recvXoffset, unsigned y, unsigned x, bool skipCheckY, bool skipCheckX, const Sum &sum, ScratchSpace scratchSpace[], unsigned warp) +{ +#if defined PORTABLE + store_matrix_sync(&scratchSpace[warp][0][0][0][0].x, sum, NR_BITS == 4 ? 
8 : 16, mem_row_major); + __syncwarp(); + +#if 0 + if (threadIdx.x == 0) + for (unsigned _y = 0; _y < 8; _y ++) + for (unsigned pol_y = 0; pol_y < NR_POLARIZATIONS; pol_y ++) + for (unsigned _x = 0; _x < 4; _x ++) + for (unsigned pol_x = 0; pol_x < NR_POLARIZATIONS; pol_x ++) + if (scratchSpace[warp][_y][pol_y][_x][pol_x],x != 0 || scratchSpace[warp][_y][pol_y][_x][pol_x].y != 0) + printf("firstY=%u firstX=%u warp=%u y=%u x=%u _y=%u pol_y=%u _x=%u pol_x=%u val=(%f,%f)\n", firstReceiverY, firstReceiverX, warp, y, x, _y, pol_y, _x, pol_x, scratchSpace[warp][_y][pol_y][_x][pol_x].x, scratchSpace[warp][_y][pol_y][_x][pol_x].y); +#endif + +#if NR_BITS == 4 + unsigned _y = threadIdx.x >> 3; + unsigned _x = (threadIdx.x >> 2) & 1; + unsigned polY = (threadIdx.x >> 1) & 1; + unsigned polX = threadIdx.x & 1; +#elif NR_BITS == 8 || NR_BITS == 16 + unsigned _y = threadIdx.x >> 2; + unsigned _x = threadIdx.x & 3; +#endif + + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + _y; + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + _x; + unsigned baseline = (recvY * (recvY + 1) / 2) + recvX; + + if ((skipCheckX || recvX <= recvY) && (skipCheckY || recvY < NR_RECEIVERS)) +#if NR_BITS == 4 + visibilities[channel][baseline][polY][polX] = scratchSpace[warp][_y][polY][_x][polX]; +#elif NR_BITS == 8 || NR_BITS == 16 + for (unsigned polY = 0; polY < NR_POLARIZATIONS; polY ++) + for (unsigned polX = 0; polX < NR_POLARIZATIONS; polX ++) + visibilities[channel][baseline][polY][polX] = scratchSpace[warp][_y][polY][_x][polX]; +#endif +#else +#if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16) + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 2) + (threadIdx.x & 4); + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 2) & 2); + unsigned polY = threadIdx.x & 1; + unsigned polX = (threadIdx.x >> 1) & 1; +#elif (__CUDA_ARCH__ == 720 
&& NR_BITS == 8) || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 860 + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 3); + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 1) & 1); + unsigned polY = (threadIdx.x >> 2) & 1; + unsigned polX = threadIdx.x & 1; +#endif + + unsigned baseline = (recvY * (recvY + 1) / 2) + recvX; + +#if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16) + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 1, polY, polX, skipCheckY, skipCheckX, sum.x[4], sum.x[5]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 1, 0, polY, polX, skipCheckY, skipCheckX, sum.x[2], sum.x[3]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 1, 1, polY, polX, skipCheckY, skipCheckX, sum.x[6], sum.x[7]); +#elif (__CUDA_ARCH__ == 720 && NR_BITS == 8) || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 860 + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); +#if NR_BITS == 8 || NR_BITS == 16 + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 2, polY, polX, skipCheckY, skipCheckX, sum.x[4], sum.x[5]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 4, 0, polY, polX, skipCheckY, skipCheckX, sum.x[2], sum.x[3]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 4, 2, polY, polX, skipCheckY, skipCheckX, sum.x[6], sum.x[7]); +#endif +#endif +#endif +} + + +#define NR_WARPS 4 + +#if NR_RECEIVERS_PER_BLOCK == 64 + +template <bool fullTriangle> __device__ void doCorrelateTriangle(Visibilities visibilities, const Samples samples, unsigned firstReceiver, unsigned warp, unsigned tid, SharedData<>::Bsamples &bSamples, ScratchSpace 
scratchSpace[NR_WARPS]) +{ + const unsigned nrFragmentsX = NR_BITS == 4 ? 12 : 6; + const unsigned nrFragmentsY = nrFragmentsX / 2; + Sum sum[nrFragmentsX * nrFragmentsY]; + + for (auto &s : sum) + fill_fragment(s, 0); + + unsigned channel = blockIdx.y; + + const uchar2 offsets[] = { + make_uchar2( 0, 0), + make_uchar2( 0, 16), + make_uchar2( 0, 40), + make_uchar2(24, 40), + }; + + unsigned recvXoffset = offsets[warp].x; + unsigned recvYoffset = offsets[warp].y; + + FetchData<int4> tmp0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + FetchData<int4> tmp1((tid >> 2) + NR_RECEIVERS_PER_BLOCK / 2, (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + +#if defined ASYNC_COPIES + using namespace nvcuda::experimental; + pipeline pipe; + + for (unsigned majorTime = 0; majorTime < READ_AHEAD; majorTime ++) { + unsigned fetchBuffer = majorTime; + + tmp0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiver, fullTriangle); + tmp1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiver, fullTriangle); + + pipe.commit(); + } +#else + tmp0.load(samples, channel, 0, firstReceiver, fullTriangle); + tmp1.load(samples, channel, 0, firstReceiver, fullTriangle); +#endif + + for (unsigned majorTime = 0; majorTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; majorTime ++) { + unsigned buffer = majorTime % NR_SHARED_BUFFERS; + +#if !defined ASYNC_COPIES + tmp0.storeB(bSamples[buffer]); + tmp1.storeB(bSamples[buffer]); +#endif + + unsigned majorReadTime = majorTime + READ_AHEAD; + + if (majorReadTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) { +#if defined ASYNC_COPIES + unsigned fetchBuffer = (buffer + READ_AHEAD) % NR_SHARED_BUFFERS; + + tmp0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiver, fullTriangle); + tmp1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiver, fullTriangle); +#else + tmp0.load(samples, channel, majorReadTime, firstReceiver, 
fullTriangle); + tmp1.load(samples, channel, majorReadTime, firstReceiver, fullTriangle); +#endif + } + +#if defined ASYNC_COPIES + pipe.commit(); + pipe.wait_prior<READ_AHEAD>(); + + tmp0.fixB(bSamples[buffer]); + tmp1.fixB(bSamples[buffer]); +#endif + + __syncthreads(); + +#pragma unroll + for (unsigned minorTime = 0; minorTime < NR_TIMES_PER_BLOCK; minorTime += ((NR_BITS) == 4 ? 16 : 8)) { + Afrag aFrag[nrFragmentsY]; + Bfrag bFrag[nrFragmentsX]; + + if (warp != 0) { + for (unsigned y = 0; y < nrFragmentsY; y ++) + load_matrix_sync(aFrag[y], &bSamples[buffer][recvYoffset + NR_RECEIVERS_PER_TCM_Y * y][0][0][minorTime][0], sizeof(bSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < nrFragmentsX; x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][recvXoffset + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0, i = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++, i ++) + mma_sync(sum[i], aFrag[y], bFrag[x], sum[i]); + } else { + for (unsigned z = 0, i = 0; z < 3; z ++) { + for (unsigned y = 0; y < (NR_BITS == 4 ? 4 : 2); y ++) + load_matrix_sync(aFrag[y], &bSamples[buffer][/*recvYoffset*/ 24 * z + NR_RECEIVERS_PER_TCM_Y * y][0][0][minorTime][0], sizeof(bSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < (NR_BITS == 4 ? 8 : 4); x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][/*recvXoffset*/ 24 * z + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0; y < (NR_BITS == 4 ? 
4 : 2); y ++) + for (unsigned x = 0; x < 2 + 2 * y; x ++, i ++) + mma_sync(sum[i], aFrag[y], bFrag[x], sum[i]); + } + } + } + } + +#if defined PORTABLE + __syncthreads(); +#endif + + if (warp != 0) + for (unsigned y = 0, i = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++, i ++) + storeVisibilities(visibilities, channel, firstReceiver, firstReceiver, recvYoffset, recvXoffset, y, x, fullTriangle, x < 2 * y + (NR_BITS == 4 ? 8 : 4), sum[i], scratchSpace, warp); + else + for (unsigned z = 0, i = 0; z < 3; z ++) + for (unsigned y = 0; y < (NR_BITS == 4 ? 4 : 2); y ++) + for (unsigned x = 0; x < 2 * y + 2; x ++, i ++) + storeVisibilities(visibilities, channel, firstReceiver, firstReceiver, 24 * z, 24 * z, y, x, fullTriangle, x < 2 * y, sum[i], scratchSpace, warp); +} + +#endif + + +template <unsigned nrFragmentsY, bool skipLoadYcheck, bool skipLoadXcheck, bool skipStoreYcheck, bool skipStoreXcheck> __device__ void doCorrelateRectangle(Visibilities visibilities, const Samples samples, unsigned firstReceiverY, unsigned firstReceiverX, SharedData<>::Asamples &aSamples, SharedData<NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK>::Bsamples &bSamples, ScratchSpace scratchSpace[NR_WARPS]) +{ + const unsigned nrFragmentsX = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_X / 2 / (NR_RECEIVERS_PER_BLOCK == 64 ? 
2 : 1); + + Sum sum[nrFragmentsY][nrFragmentsX]; + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + fill_fragment(sum[y][x], 0); + + unsigned tid = warpSize * (blockDim.y * threadIdx.z + threadIdx.y) + threadIdx.x; + unsigned channel = blockIdx.y; + + unsigned recvXoffset = nrFragmentsX * NR_RECEIVERS_PER_TCM_X * threadIdx.y; + unsigned recvYoffset = nrFragmentsY * NR_RECEIVERS_PER_TCM_Y * threadIdx.z; + + FetchData<int4> tmpY0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + FetchData<int4> tmpX0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); +#if NR_RECEIVERS_PER_BLOCK == 48 + FetchData<int2> tmpY1((tid >> 3) + 32, (tid >> 2) & 1, 32 / NR_BITS * (tid & 3)); + FetchData<int2> tmpX1((tid >> 3) + 32, (tid >> 2) & 1, 32 / NR_BITS * (tid & 3)); +#elif NR_RECEIVERS_PER_BLOCK == 64 + FetchData<int4> tmpY1((tid >> 2) + 32, (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); +#endif + +#if defined ASYNC_COPIES + using namespace nvcuda::experimental; + pipeline pipe; + + for (unsigned majorTime = 0; majorTime < READ_AHEAD; majorTime ++) { + unsigned fetchBuffer = majorTime; + + tmpY0.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiverX, skipLoadXcheck); +#endif + + pipe.commit(); + } +#else + tmpY0.load(samples, channel, 0, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.load(samples, channel, 0, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.load(samples, channel, 0, firstReceiverX, 
skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.load(samples, channel, 0, firstReceiverX, skipLoadXcheck); +#endif +#endif + + for (unsigned majorTime = 0; majorTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; majorTime ++) { + unsigned buffer = majorTime % NR_SHARED_BUFFERS; + +#if !defined ASYNC_COPIES + tmpY0.storeA(aSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.storeA(aSamples[buffer]); +#endif + tmpX0.storeB(bSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.storeB(bSamples[buffer]); +#endif +#endif + + unsigned majorReadTime = majorTime + READ_AHEAD; + + if (majorReadTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) { +#if defined ASYNC_COPIES + unsigned fetchBuffer = (buffer + READ_AHEAD) % NR_SHARED_BUFFERS; + + tmpY0.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#endif +#else + tmpY0.load(samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.load(samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.load(samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.load(samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#endif +#endif + } + +#if defined ASYNC_COPIES + pipe.commit(); + pipe.wait_prior<READ_AHEAD>(); + + tmpX0.fixB(bSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 + 
tmpX1.fixB(bSamples[buffer]); +#endif +#endif + + __syncthreads(); + +#pragma unroll + for (unsigned minorTime = 0; minorTime < NR_TIMES_PER_BLOCK; minorTime += ((NR_BITS) == 4 ? 16 : 8)) { + Afrag aFrag[nrFragmentsY]; + Bfrag bFrag[nrFragmentsX]; + + for (unsigned y = 0; y < nrFragmentsY; y ++) + load_matrix_sync(aFrag[y], &aSamples[buffer][recvYoffset + NR_RECEIVERS_PER_TCM_Y * y][0][minorTime][0], sizeof(aSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < nrFragmentsX; x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][recvXoffset + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + mma_sync(sum[y][x], aFrag[y], bFrag[x], sum[y][x]); + } + } + +#if 0 + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + for (unsigned i = 0; i < sum[0][0].num_storage_elements; i ++) + if (sum[y][x].x[i] != 0) +#if NR_BITS == 4 || NR_BITS == 8 + printf("blockIdx=(%d,%d,%d) tid=%u y=%u x=%u i=%u v=%d\n", blockIdx.x, blockIdx.y, blockIdx.z, tid, y, x, i, sum[y][x].x[i]); +#else + printf("blockIdx=(%d,%d,%d) tid=%u y=%u x=%u i=%u v=%f\n", blockIdx.x, blockIdx.y, blockIdx.z, tid, y, x, i, sum[y][x].x[i]); +#endif +#endif + +#if defined PORTABLE + __syncthreads(); +#endif + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + storeVisibilities(visibilities, channel, firstReceiverY, firstReceiverX, recvYoffset, recvXoffset, y, x, skipStoreYcheck, skipStoreXcheck, sum[y][x], scratchSpace, tid / warpSize); +} + + +extern "C" __global__ +__launch_bounds__(NR_WARPS * 32, NR_RECEIVERS_PER_BLOCK == 32 ? 
4 : 2) +void correlate(Visibilities visibilities, const Samples samples) +{ + const unsigned nrFragmentsY = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_Y / 2; + + unsigned block = blockIdx.x; + +#if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 + unsigned blockY = (unsigned) (sqrtf(8 * block + 1) - .99999f) / 2; + unsigned blockX = block - blockY * (blockY + 1) / 2; + unsigned firstReceiverX = blockX * NR_RECEIVERS_PER_BLOCK; +#elif NR_RECEIVERS_PER_BLOCK == 64 + unsigned blockY = (unsigned) sqrtf(block); + unsigned blockX = block - blockY * blockY; + unsigned firstReceiverX = blockX * (NR_RECEIVERS_PER_BLOCK / 2); +#endif + unsigned firstReceiverY = blockY * NR_RECEIVERS_PER_BLOCK; + + union shared { + struct { + SharedData<>::Asamples aSamples; + SharedData<NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK>::Bsamples bSamples; + } rectangle; + struct { + SharedData<>::Bsamples samples; + } triangle; + ScratchSpace scratchSpace[NR_WARPS]; + }; + + // the following hack is necessary to run the correlator in the OpenCL environment, + // as the maximum local memory size is 48K - 16 bytes. Due to padding in bSamples, + // the last 16 bytes are not used, so allocate 16 fewer bytes. 
+ __shared__ char rawbuffer[sizeof(union shared) - 16] __attribute__((aligned(16))); + union shared &u = (union shared &) rawbuffer; + + if (firstReceiverX == firstReceiverY) +#if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 + doCorrelateRectangle<nrFragmentsY, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, false>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +#elif NR_RECEIVERS_PER_BLOCK == 64 + if (NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK != 0 && (NR_RECEIVERS < NR_RECEIVERS_PER_BLOCK || firstReceiverX >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK * NR_RECEIVERS_PER_BLOCK)) + doCorrelateTriangle<false>(visibilities, samples, firstReceiverX, 2 * threadIdx.z + threadIdx.y, 64 * threadIdx.z + 32 * threadIdx.y + threadIdx.x, u.triangle.samples, u.scratchSpace); + else + doCorrelateTriangle<true>(visibilities, samples, firstReceiverX, 2 * threadIdx.z + threadIdx.y, 64 * threadIdx.z + 32 * threadIdx.y + threadIdx.x, u.triangle.samples, u.scratchSpace); +#endif +#if NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK != 0 + else if (NR_RECEIVERS < NR_RECEIVERS_PER_BLOCK || firstReceiverY >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK * NR_RECEIVERS_PER_BLOCK) + doCorrelateRectangle<(NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK + 2 * NR_RECEIVERS_PER_TCM_Y - 1) / NR_RECEIVERS_PER_TCM_Y / 2, false, true, NR_RECEIVERS % (2 * NR_RECEIVERS_PER_TCM_Y) == 0, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +#endif + else + doCorrelateRectangle<nrFragmentsY, true, true, true, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +} diff --git a/test/Common/ComplexInt4.h b/test/Common/ComplexInt4.h new file mode 100644 index 0000000..e448428 --- /dev/null +++ b/test/Common/ComplexInt4.h @@ -0,0 +1,21 @@ 
+#if !defined COMPLEX_INT4_T +#define COMPLEX_INT4_T + +#include <complex> +#include <limits> + +class complex_int4_t +{ + public: + complex_int4_t() {} + complex_int4_t(int real, int imag) { value = (imag << 4) | (real & 0xF); } + complex_int4_t operator = (const complex_int4_t &other) { value = other.value; return *this; } + int real() const { return value << (std::numeric_limits<int>::digits - 4) >> (std::numeric_limits<int>::digits - 4); } + int imag() const { return value << (std::numeric_limits<int>::digits - 8) >> (std::numeric_limits<int>::digits - 4); } + operator std::complex<int> () const { return std::complex<int>(real(), imag()); } + + private: + char value; +}; + +#endif diff --git a/test/Common/Config.h b/test/Common/Config.h new file mode 100644 index 0000000..0ecedb4 --- /dev/null +++ b/test/Common/Config.h @@ -0,0 +1,22 @@ +#if !defined CONFIG_H +#define CONFIG_H + +#if defined __ARM_ARCH +#define UNIFIED_MEMORY // assume this is a Jetson Xavier +#endif + +#undef MEASURE_POWER + +#define POL_X 0 +#define POL_Y 1 +#define NR_POLARIZATIONS 2 + +#if !defined NR_TIMES +#define NR_TIMES 768 +#endif + +#define REAL 0 +#define IMAG 1 +#define COMPLEX 2 + +#endif diff --git a/test/Common/Record.cc b/test/Common/Record.cc new file mode 100644 index 0000000..8171486 --- /dev/null +++ b/test/Common/Record.cc @@ -0,0 +1,35 @@ +#include "test/Common/Record.h" + + +#if defined MEASURE_POWER +Record::Record(powersensor::PowerSensor &powerSensor) +: + powerSensor(powerSensor) +{ +} +#endif + + +#if defined MEASURE_POWER +void Record::getPower(CUstream, CUresult, void *userData) +{ + Record *record = (Record *) userData; + record->state = record->powerSensor.read(); +} + +#endif + + +void Record::enqueue(cu::Stream &stream) +{ + stream.record(event); // if this is omitted, the callback takes ~100 ms ?! 
+ +#if defined MEASURE_POWER +#if !defined TEGRA_QUIRKS + stream.addCallback(&Record::getPower, this); // does not work well on Tegra +#else + stream.synchronize(); + state = powerSensor.read(); +#endif +#endif +} diff --git a/test/Common/Record.h b/test/Common/Record.h new file mode 100644 index 0000000..2704748 --- /dev/null +++ b/test/Common/Record.h @@ -0,0 +1,33 @@ +#if !defined RECORD_H +#define RECORD_H + +#include "test/Common/Config.h" + +#include "util/cu.h" + +#if defined MEASURE_POWER +#include <powersensor/NVMLPowerSensor.h> +#endif + + +struct Record +{ + public: +#if defined MEASURE_POWER + Record(powersensor::PowerSensor &); +#endif + + void enqueue(cu::Stream &); + + mutable cu::Event event; + +#if defined MEASURE_POWER + powersensor::PowerSensor &powerSensor; + powersensor::State state; + + private: + static void getPower(CUstream, CUresult, void *userData); +#endif +}; + +#endif diff --git a/test/Common/UnitTest.cc b/test/Common/UnitTest.cc new file mode 100644 index 0000000..c05fb61 --- /dev/null +++ b/test/Common/UnitTest.cc @@ -0,0 +1,60 @@ +#include "test/Common/UnitTest.h" + +#include <iostream> + + +UnitTest::UnitTest(unsigned deviceNumber) +: + device(deviceNumber), + context(CU_CTX_SCHED_BLOCKING_SYNC, device) +#if defined MEASURE_POWER +, powerSensor(powersensor::nvml::NVMLPowerSensor::create(0)) +#endif +{ +#pragma omp critical (clog) + std::clog << "running test on " << device.getName() << std::endl; + +#if 0 && defined MEASURE_POWER + powerSensor->dump("/tmp/sensor_readings"); +#endif +} + + +UnitTest::~UnitTest() +{ +#if defined MEASURE_POWER + delete powerSensor; +#endif +} + + +void UnitTest::report(const char *name, const Record &startRecord, const Record &stopRecord, uint64_t FLOPS, uint64_t bytes) +{ +#if defined MEASURE_POWER + //powerSensor->mark(startRecord.state, name); + + double Watt = powersensor::PowerSensor::Watt(startRecord.state, stopRecord.state); +#endif + + double runtime = 
stopRecord.event.elapsedTime(startRecord.event) * 1e-3; + +#pragma omp critical (cout) + { + std::cout << name << ": " << runtime << " s"; + + if (FLOPS != 0) + std::cout << ", " << FLOPS / runtime * 1e-12 << " TOPS"; + + if (bytes != 0) + std::cout << ", " << bytes / runtime * 1e-9 << " GB/s"; + +#if defined MEASURE_POWER + std::cout << ", " << Watt << " W"; + + if (FLOPS != 0) + std::cout << ", " << FLOPS / runtime / Watt * 1e-9 << " GOPS/W"; +#endif + + std::cout << std::endl; + } +} diff --git a/test/Common/UnitTest.h b/test/Common/UnitTest.h new file mode 100644 index 0000000..dce8b47 --- /dev/null +++ b/test/Common/UnitTest.h @@ -0,0 +1,30 @@ +#if !defined UNIT_TEST_H +#define UNIT_TEST_H + +#include "test/Common/Record.h" +#include "util/cu.h" + +#if defined MEASURE_POWER +#include <powersensor/NVMLPowerSensor.h> +#endif + + +class UnitTest +{ + public: + UnitTest(unsigned deviceNumber); + ~UnitTest(); + + protected: + void report(const char *name, const Record &startRecord, const Record &stopRecord, uint64_t FLOPS = 0, uint64_t bytes = 0); + + cu::Device device; + cu::Context context; + cu::Stream stream; + +#if defined MEASURE_POWER + powersensor::PowerSensor *powerSensor; +#endif +}; + +#endif diff --git a/test/CorrelatorTest/CorrelatorTest.cc b/test/CorrelatorTest/CorrelatorTest.cc new file mode 100644 index 0000000..23bc7c1 --- /dev/null +++ b/test/CorrelatorTest/CorrelatorTest.cc @@ -0,0 +1,239 @@ +#include "test/Common/ComplexInt4.h" +#include "test/Common/Record.h" +#include "test/CorrelatorTest/CorrelatorTest.h" +#include "util/ExceptionPropagator.h" +#include "util/nvrtc.h" + +#include <cstdlib> +#include <cstring> +#include <iostream> + +#define GNU_SOURCE +#include <link.h> +#include <omp.h> + + +CorrelatorTest::CorrelatorTest(const Options &options) +: + UnitTest(options.deviceNumber), + options(options), + correlator(options.nrBits, options.nrReceivers, options.nrChannels, options.nrSamplesPerChannel, options.nrPolarizations, 
options.nrReceiversPerBlock) +{ +#if defined MEASURE_POWER + Record start(*powerSensor), stop(*powerSensor); +#else + Record start, stop; +#endif + + start.enqueue(stream); + + switch (options.nrBits) { + case 4 : doTest<complex_int4_t, std::complex<int32_t>>(); + break; + + case 8 : doTest<std::complex<int8_t>, std::complex<int32_t>>(); + break; + + case 16 : doTest<std::complex<__half>, std::complex<float>>(); + break; + } + + stop.enqueue(stream); + stream.synchronize(); + + report("total ", start, stop, options.innerRepeatCount * options.outerRepeatCount * correlator.FLOPS()); +} + + +template <typename SampleType, typename VisibilityType> void CorrelatorTest::doTest() +{ + omp_set_nested(1); + + ExceptionPropagator ep; + +#pragma omp parallel num_threads(2) + ep([&] () { + context.setCurrent(); + + multi_array::extent<5> samplesExtent(multi_array::extents[options.nrChannels][options.nrSamplesPerChannel / options.nrTimesPerBlock][options.nrReceivers][options.nrPolarizations][options.nrTimesPerBlock]); + multi_array::extent<4> visibilitiesExtent(multi_array::extents[options.nrChannels][options.nrBaselines()][options.nrPolarizations][options.nrPolarizations]); + + cu::HostMemory hostSamples(sizeof(SampleType) * samplesExtent.size, CU_MEMHOSTALLOC_WRITECOMBINED); + cu::HostMemory hostVisibilities(sizeof(VisibilityType) * visibilitiesExtent.size); + +#if defined UNIFIED_MEMORY + cu::DeviceMemory deviceSamples(hostSamples); + cu::DeviceMemory deviceVisibilities(hostVisibilities); +#else + cu::DeviceMemory deviceSamples(sizeof(SampleType) * samplesExtent.size); + cu::DeviceMemory deviceVisibilities(sizeof(VisibilityType) * visibilitiesExtent.size); + + cu::Stream hostToDeviceStream, deviceToHostStream; +#endif + + multi_array::array_ref<SampleType, 5> samplesRef(* (SampleType *) hostSamples, samplesExtent); + multi_array::array_ref<VisibilityType, 4> visibilitiesRef(* (VisibilityType *) hostVisibilities, visibilitiesExtent); + + 
setTestPattern<SampleType>(samplesRef); + +#pragma omp for schedule(dynamic), ordered + for (int i = 0; i < options.outerRepeatCount; i ++) + if (!ep) + ep([&] () { +#if !defined UNIFIED_MEMORY + cu::Event inputDataTransferred, executeFinished, executeFinished2; +#endif + +#if defined MEASURE_POWER + Record hostToDeviceRecordStart(*powerSensor), hostToDeviceRecordStop(*powerSensor); + Record computeRecordStart(*powerSensor), computeRecordStop(*powerSensor); + Record deviceToHostRecordStart(*powerSensor), deviceToHostRecordStop(*powerSensor); +#else + Record hostToDeviceRecordStart, hostToDeviceRecordStop; + Record computeRecordStart, computeRecordStop; + Record deviceToHostRecordStart, deviceToHostRecordStop; +#endif + +#pragma omp critical (GPU) // TODO: use multiple locks when using multiple GPUs + ep([&] () { +#if !defined UNIFIED_MEMORY + hostToDeviceRecordStart.enqueue(hostToDeviceStream); + hostToDeviceStream.memcpyHtoDAsync(deviceSamples, hostSamples, samplesRef.bytesize()); + hostToDeviceRecordStop.enqueue(hostToDeviceStream); + + stream.waitEvent(hostToDeviceRecordStop.event); +#endif + + computeRecordStart.enqueue(stream); + + for (unsigned j = 0; j < options.innerRepeatCount; j ++) + correlator.launchAsync(stream, deviceVisibilities, deviceSamples); + + computeRecordStop.enqueue(stream); + +#if !defined UNIFIED_MEMORY + deviceToHostStream.waitEvent(computeRecordStop.event); + deviceToHostRecordStart.enqueue(deviceToHostStream); + deviceToHostStream.memcpyDtoHAsync(hostVisibilities, deviceVisibilities, visibilitiesRef.bytesize()); + deviceToHostRecordStop.enqueue(deviceToHostStream); +#endif + }); + +#if !defined UNIFIED_MEMORY + deviceToHostStream.synchronize(); +#else + stream.synchronize(); +#endif + + if (i == options.outerRepeatCount - 1 && options.verifyOutput) + verifyOutput<SampleType, VisibilityType>(samplesRef, visibilitiesRef); + +#if !defined UNIFIED_MEMORY + report("host-to-device ", hostToDeviceRecordStart, hostToDeviceRecordStop, 0, 
samplesRef.bytesize()); +#endif + + report("correlate-total ", computeRecordStart, computeRecordStop, options.innerRepeatCount * correlator.FLOPS()); + +#if !defined UNIFIED_MEMORY + report("device-to-host ", deviceToHostRecordStart, deviceToHostRecordStop, 0, visibilitiesRef.bytesize()); +#endif + }); + }); +} + + +template<typename SampleType> void CorrelatorTest::setTestPattern(const multi_array::array_ref<SampleType, 5> &samples) +{ +#if 0 + memset(samples.begin(), 0, samples.bytesize()); + + unsigned channel = options.nrChannels / 3; + unsigned time = options.nrSamplesPerChannel / 5; + unsigned recv0 = options.nrReceivers > 174 ? 174 : options.nrReceivers / 3; + unsigned recv1 = options.nrReceivers > 418 ? 418 : options.nrReceivers / 2; + + samples[channel][time / options.nrTimesPerBlock][recv0][POL_X][time % options.nrTimesPerBlock] = SampleType(2.0, 3.0); + samples[channel][time / options.nrTimesPerBlock][recv1][POL_X][time % options.nrTimesPerBlock] = SampleType(4.0, 5.0); +#else + SampleType randomValues[7777]; // use a limited set of random numbers to save time + + for (unsigned i = 0; i < 7777; i ++) + randomValues[i] = randomValue<SampleType>(); + + unsigned i = 0; + + for (SampleType &sample : samples) + sample = randomValues[i ++ % 7777U]; +#endif +} + + +template<typename SampleType, typename VisibilityType> void CorrelatorTest::verifyOutput(const multi_array::array_ref<SampleType, 5> &samples, const multi_array::array_ref<VisibilityType, 4> &visibilities) const +{ + std::atomic<int> count(0); + ExceptionPropagator ep; + +#if 1 + std::cout << "verifying ..." 
<< std::endl; + +#pragma omp parallel for schedule (dynamic) + for (unsigned channel = 0; channel < options.nrChannels; channel ++) + ep([&] () { + multi_array::array<VisibilityType, 3> sum(multi_array::extents[options.nrBaselines()][options.nrPolarizations][options.nrPolarizations]); + + memset(sum.begin(), 0, sum.bytesize()); + + for (unsigned major_time = 0; major_time < options.nrSamplesPerChannel / options.nrTimesPerBlock; major_time ++) { + multi_array::array_ref<SampleType, 3> ref = samples[channel][major_time]; + + for (unsigned recv1 = 0, baseline = 0; recv1 < options.nrReceivers; recv1 ++) + for (unsigned recv0 = 0; recv0 <= recv1; recv0 ++, baseline ++) + for (unsigned minor_time = 0; minor_time < options.nrTimesPerBlock; minor_time ++) + for (unsigned pol0 = 0; pol0 < options.nrPolarizations; pol0 ++) + for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++) { + SampleType sample0 = ref[recv0][pol0][minor_time]; + SampleType sample1 = ref[recv1][pol1][minor_time]; + + sum[baseline][pol1][pol0] += VisibilityType(sample1.real(), sample1.imag()) * conj(VisibilityType(sample0.real(), sample0.imag())); + } + } + + for (unsigned baseline = 0; baseline < options.nrBaselines(); baseline ++) + for (unsigned pol0 = 0; pol0 < options.nrPolarizations; pol0 ++) + for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++) + if (!approximates(visibilities[channel][baseline][pol1][pol0], sum[baseline][pol1][pol0]) && ++ count < 100) +#pragma omp critical (cout) + ep([&] () { + std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol1 << "][" << pol0 << "], expected " << sum[baseline][pol1][pol0] << ", got " << visibilities[channel][baseline][pol1][pol0] << std::endl; + }); + }); + + std::cout << "#errors = " << count << std::endl; +#else + for (unsigned channel = 0; channel < options.nrChannels; channel ++) + for (unsigned baseline = 0; baseline < options.nrBaselines(); baseline ++) + for (unsigned pol0 = 0; pol0 < 
options.nrPolarizations; pol0 ++) + for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++) + if (visibilities[channel][baseline][pol0][pol1] != (VisibilityType)(0, 0)) + if (count ++ < 10) + std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol0 << "][" << pol1 << "] = " << visibilities[channel][baseline][pol0][pol1] << std::endl; +#endif +} + + +int main(int argc, char *argv[]) +{ + try { + cu::init(); + Options options(argc, argv); + CorrelatorTest test(options); + } catch (cu::Error &error) { + std::cerr << "cu::Error: " << error.what() << std::endl; + } catch (nvrtc::Error &error) { + std::cerr << "nvrtc::Error: " << error.what() << std::endl; + } catch (Options::Error &error) { + std::cerr << "Options::Error: " << error.what() << std::endl; + } + + return 0; +} diff --git a/test/CorrelatorTest/CorrelatorTest.h b/test/CorrelatorTest/CorrelatorTest.h new file mode 100644 index 0000000..7e2032c --- /dev/null +++ b/test/CorrelatorTest/CorrelatorTest.h @@ -0,0 +1,61 @@ +#if !defined CORRELATOR_TEST_H +#define CORRELATOR_TEST_H + +#include "test/Common/ComplexInt4.h" +#include "test/Common/UnitTest.h" +#include "test/CorrelatorTest/Options.h" +#include "libtcc/Correlator.h" +#include "util/multi_array.h" + +#include <cuda_fp16.h> + + +class CorrelatorTest : public UnitTest +{ + public: + CorrelatorTest(const Options &); + + private: + template<typename SampleType, typename VisibilityType> void doTest(); + template<typename SampleType> void setTestPattern(const multi_array::array_ref<SampleType, 5> &samples); + template<typename SampleType, typename VisibilityType> void verifyOutput(const multi_array::array_ref<SampleType, 5> &samples, const multi_array::array_ref<VisibilityType, 4> &visibilities) const; + + template<typename SampleType> static SampleType randomValue(); + template<typename VisibilityType> bool approximates(const VisibilityType &a, const VisibilityType &b) const; + + tcc::Correlator correlator; + Options options; +}; + + 
+template<> complex_int4_t CorrelatorTest::randomValue<complex_int4_t>() +{ + return complex_int4_t(16 * drand48() - 8, 16 * drand48() - 8); +} + + +template<> std::complex<int8_t> CorrelatorTest::randomValue<std::complex<int8_t>>() +{ + return std::complex<int8_t>(256 * drand48() - 128, 256 * drand48() - 128); +} + + +template<> std::complex<__half> CorrelatorTest::randomValue<std::complex<__half>>() +{ + return std::complex<__half>(drand48() - .5, drand48() - .5); +} + +template <typename VisibilityType> bool CorrelatorTest::approximates(const VisibilityType &a, const VisibilityType &b) const +{ + return a == b; +} + + +template <> bool CorrelatorTest::approximates(const std::complex<float> &a, const std::complex<float> &b) const +{ + float absolute = abs(a - b), relative = abs(a / b); + + return (relative > .999 && relative < 1.001) || absolute < .0001 * options.nrSamplesPerChannel; +} + +#endif diff --git a/test/CorrelatorTest/Options.cc b/test/CorrelatorTest/Options.cc new file mode 100644 index 0000000..2e27f1f --- /dev/null +++ b/test/CorrelatorTest/Options.cc @@ -0,0 +1,89 @@ +#include "test/CorrelatorTest/Options.h" + +#include <cstdlib> +#include <iostream> + +#include <unistd.h> + + +Options::Options(int argc, char *argv[]) +: + nrBits(8), + nrChannels(480), + nrReceivers(576), + nrReceiversPerBlock(64), + nrSamplesPerChannel(3072), + nrTimesPerBlock(128 / nrBits), + innerRepeatCount(1), outerRepeatCount(1), + deviceNumber(0), + verifyOutput(true) +{ + opterr = 0; + + for (int opt; (opt = getopt(argc, argv, "b:c:d:hn:N:r:R:t:V:")) >= 0;) + switch (opt) { + case 'b' : nrBits = atoi(optarg); + break; + + case 'c' : nrChannels = atoi(optarg); + break; + + case 'd' : deviceNumber = atoi(optarg); + break; + + case 'h' : std::cout << usage(argv[0]) << std::endl; + exit(0); + + case 'n' : nrReceivers = atoi(optarg); + break; + + case 'N' : nrReceiversPerBlock = atoi(optarg); + break; + + case 'r' : innerRepeatCount = atoi(optarg); + break; + + case 'R' : 
outerRepeatCount = atoi(optarg); + break; + + case 't' : nrSamplesPerChannel = atoi(optarg); + break; + + case 'V' : verifyOutput = atoi(optarg); + break; + + default : throw Error(usage(argv[0])); + } + + if (nrBits != 4 && nrBits != 8 && nrBits != 16) + throw Error("nrBits must be 4, 8, or 16"); + + if (nrChannels == 0) + throw Error("nrChannels must be > 0"); + + if (nrReceivers == 0) + throw Error("nrReceivers must be > 0"); + + if (nrReceiversPerBlock != 32 && nrReceiversPerBlock != 48 && nrReceiversPerBlock != 64) + throw Error("nrReceiversPerBlock must be 32, 48, or 64"); + + if (nrSamplesPerChannel == 0) + throw Error("nrSamplesPerChannel must be > 0"); + + nrTimesPerBlock = 128 / nrBits; + + if (nrSamplesPerChannel % nrTimesPerBlock != 0) + throw Error("nrSamplesPerChannel must be a multiple of " + std::to_string(nrTimesPerBlock)); +} + + +std::string Options::usage(const std::string &execName) +{ + return "usage: " + execName + " [-b nrBits] [-c nrChannels] [-n nrReceivers] [-N nrReceiversPerBlock] [-r innerRepeatCount] [-R outerRepeatCount] [-t nrSamplesPerChannel] [-V verifyOutput]"; +} + + +const char *Options::Error::what() const noexcept +{ + return msg.c_str(); +} diff --git a/test/CorrelatorTest/Options.h b/test/CorrelatorTest/Options.h new file mode 100644 index 0000000..9e51307 --- /dev/null +++ b/test/CorrelatorTest/Options.h @@ -0,0 +1,45 @@ +#if !defined OPTIONS_H +#define OPTIONS_H + +#include <exception> +#include <string> + + +class Options +{ + public: + class Error : public std::exception { + public: + Error(const std::string &msg) + : + msg(msg) + { + } + + virtual const char *what() const noexcept; + + private: + std::string msg; + }; + + Options(int argc, char *argv[]); + + unsigned nrBaselines() const { return nrReceivers * (nrReceivers + 1) / 2; } + + unsigned nrBits; + unsigned nrChannels; + unsigned nrReceivers; + unsigned nrReceiversPerBlock; + unsigned nrSamplesPerChannel; + unsigned nrTimesPerBlock; + unsigned innerRepeatCount, 
outerRepeatCount; + unsigned deviceNumber; + bool verifyOutput; + + static const unsigned nrPolarizations = 2; + + private: + static std::string usage(const std::string &execName); +}; + +#endif diff --git a/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc b/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc new file mode 100644 index 0000000..d28321f --- /dev/null +++ b/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc @@ -0,0 +1,450 @@ +#define NR_BITS 8 +#define NR_CHANNELS 480 +#define NR_POLARIZATIONS 2 +#define NR_SAMPLES_PER_CHANNEL 3072 +#define NR_RECEIVERS 576 +#define NR_BASELINES ((NR_RECEIVERS) * ((NR_RECEIVERS) + 1) / 2) +#define NR_RECEIVERS_PER_BLOCK 64 +#define NR_TIMES_PER_BLOCK (128 / (NR_BITS)) + + +#include "test/Common/ComplexInt4.h" + +#include <complex> +#include <exception> +//#include <fstream> +#include <iostream> +#include <sstream> +#include <string> +#include <vector> + +#include <omp.h> +#include <sys/types.h> + +#define __CL_ENABLE_EXCEPTIONS +#include <CL/cl.hpp> +#include <CL/cl_ext.h> + + +#if NR_BITS == 4 +typedef complex_int4_t Sample; +typedef std::complex<int32_t> Visibility; +#elif NR_BITS == 8 +typedef std::complex<int8_t> Sample; +typedef std::complex<int32_t> Visibility; +#elif NR_BITS == 16 +typedef std::complex<__half> Sample; +typedef std::complex<float> Visibility; +#endif + +typedef Sample Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef Visibility Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; + + +std::string errorMessage(cl_int error) +{ + switch (error) { + case CL_SUCCESS: return "Success!"; + case CL_DEVICE_NOT_FOUND: return "Device not found."; + case CL_DEVICE_NOT_AVAILABLE: return "Device not available"; + case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure"; + case CL_OUT_OF_RESOURCES: return "Out of 
resources"; + case CL_OUT_OF_HOST_MEMORY: return "Out of host memory"; + case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available"; + case CL_MEM_COPY_OVERLAP: return "Memory copy overlap"; + case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported"; + case CL_BUILD_PROGRAM_FAILURE: return "Program build failure"; + case CL_MAP_FAILURE: return "Map failure"; + case CL_INVALID_VALUE: return "Invalid value"; + case CL_INVALID_DEVICE_TYPE: return "Invalid device type"; + case CL_INVALID_PLATFORM: return "Invalid platform"; + case CL_INVALID_DEVICE: return "Invalid device"; + case CL_INVALID_CONTEXT: return "Invalid context"; + case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties"; + case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue"; + case CL_INVALID_HOST_PTR: return "Invalid host pointer"; + case CL_INVALID_MEM_OBJECT: return "Invalid memory object"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor"; + case CL_INVALID_IMAGE_SIZE: return "Invalid image size"; + case CL_INVALID_SAMPLER: return "Invalid sampler"; + case CL_INVALID_BINARY: return "Invalid binary"; + case CL_INVALID_BUILD_OPTIONS: return "Invalid build options"; + case CL_INVALID_PROGRAM: return "Invalid program"; + case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable"; + case CL_INVALID_KERNEL_NAME: return "Invalid kernel name"; + case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition"; + case CL_INVALID_KERNEL: return "Invalid kernel"; + case CL_INVALID_ARG_INDEX: return "Invalid argument index"; + case CL_INVALID_ARG_VALUE: return "Invalid argument value"; + case CL_INVALID_ARG_SIZE: return "Invalid argument size"; + case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments"; + case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension"; + case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group 
size"; + case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size"; + case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset"; + case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list"; + case CL_INVALID_EVENT: return "Invalid event"; + case CL_INVALID_OPERATION: return "Invalid operation"; + case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object"; + case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size"; + case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level"; + case CL_INVALID_GLOBAL_WORK_SIZE: return "Invalid global work size"; +#if defined CL_INVALID_PROPERTY + case CL_INVALID_PROPERTY: return "Invalid property"; +#endif +#if defined CL_INVALID_IMAGE_DESCRIPTOR + case CL_INVALID_IMAGE_DESCRIPTOR: return "Invalid image descriptor"; +#endif +#if defined CL_INVALID_COMPILER_OPTIONS + case CL_INVALID_COMPILER_OPTIONS: return "Invalid compiler options"; +#endif +#if defined CL_INVALID_LINKER_OPTIONS + case CL_INVALID_LINKER_OPTIONS: return "Invalid linker options"; +#endif +#if defined CL_INVALID_DEVICE_PARTITION_COUNT + case CL_INVALID_DEVICE_PARTITION_COUNT: return "Invalid device partition count"; +#endif + default: std::stringstream str; + str << "Unknown (" << error << ')'; + return str.str(); + } +} + + +void createContext(cl::Context &context, std::vector<cl::Device> &devices) +{ + const char *platformName = getenv("PLATFORM"); + +#if defined __linux__ + if (platformName == 0) +#endif + //platformName = "Intel(R) FPGA SDK for OpenCL(TM)"; + //platformName = Intel(R) OpenCL"; + platformName = "NVIDIA CUDA"; + //platformName = "AMD Accelerated Parallel Processing"; + + cl_device_type type = CL_DEVICE_TYPE_DEFAULT; + + const char *deviceType = getenv("TYPE"); + + if (deviceType != 0) { + if (strcmp(deviceType, "GPU") == 0) + type = CL_DEVICE_TYPE_GPU; + else if (strcmp(deviceType, "CPU") == 0) + type = CL_DEVICE_TYPE_CPU; + else if (strcmp(deviceType, "ACCELERATOR") == 0) + type = CL_DEVICE_TYPE_ACCELERATOR; + else + std::cerr << 
"Unrecognized device type: " << deviceType; + } + + const char *deviceName = getenv("DEVICE"); + + std::vector<cl::Platform> platforms; + cl::Platform::get(&platforms); + + for (cl::Platform &platform : platforms) { + std::clog << "Platform name: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl; + std::clog << "Platform version: " << platform.getInfo<CL_PLATFORM_VERSION>() << std::endl; + std::clog << "Platform extensions: " << platform.getInfo<CL_PLATFORM_EXTENSIONS>() << std::endl; + } + + for (cl::Platform &platform : platforms) { + if (strcmp(platform.getInfo<CL_PLATFORM_NAME>().c_str(), platformName) == 0) { + platform.getDevices(type, &devices); + + for (cl::Device &device : devices) { + std::clog << "Device: " << device.getInfo<CL_DEVICE_NAME>() << ", " + "capability: " << device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV>() << '.' << device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV>() +//#if defined CL_DEVICE_TOPOLOGY_AMD +// ", PCIe bus 0x" << std::hex << std::setiosflags (std::ios::uppercase) << std::setfill('0') << std::setw(2) << (device.getInfo<CL_DEVICE_TOPOLOGY_AMD>().pcie.bus & 255) << std::dec +//#endif + << std::endl; + } + + context = cl::Context(devices); + return; + } + } + + std::cerr << "Platform not found: \"" << platformName << '"' << std::endl; + exit(1); +} + + +cl::Program createProgramFromSources(cl::Context &context, std::vector<cl::Device> &devices, const std::string &sources, const char *args) +{ + std::stringstream cmd; + cmd << "#include \"" << sources << '"' << std::endl; +#if defined CL_VERSION_1_2 + cl::Program program(context, cmd.str()); +#else + std::string str = cmd.str(); + cl::Program::Sources src(1, std::make_pair(str.c_str(), str.length())); + cl::Program program(context, src); +#endif + + try { + program.build(devices, args); + std::string msg; + program.getBuildInfo(devices[0], CL_PROGRAM_BUILD_LOG, &msg); + + std::clog << msg << std::endl; + } catch (cl::Error &error) { + if (strcmp(error.what(), 
"clBuildProgram") == 0) { + std::string msg; + program.getBuildInfo(devices[0], CL_PROGRAM_BUILD_LOG, &msg); + + std::cerr << msg << std::endl; + exit(1); + } else { + throw; + } + } + +#if 0 + std::vector<size_t> binarySizes = program.getInfo<CL_PROGRAM_BINARY_SIZES>(); + std::vector<char *> binaries = program.getInfo<CL_PROGRAM_BINARIES>(); + + for (unsigned i = 0; i < binaries.size(); i++) { + std::stringstream filename; + filename << sources << '-' << i << ".ptx"; + std::ofstream(filename.str().c_str(), std::ofstream::binary).write(binaries[i], binarySizes[i]); + } + + for (unsigned b = 0; b < binaries.size(); b++) + delete [] binaries[b]; +#endif + + return program; +} + + +cl::Program createProgramFromBinaries(cl::Context &context, std::vector<cl::Device> &devices) +{ +#if 1 + cl::Program::Binaries binaries(devices.size()); + std::vector<std::string> ptx_code(devices.size()); + + for (unsigned device = 0; device < devices.size(); device ++) { + unsigned compute_capability = 10 * devices[device].getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV>() + devices[device].getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV>(); + std::stringstream command; + + command << "nvcc" + << " -ptx" + << " -arch=compute_" << compute_capability + << " -code=sm_" << compute_capability + << " -DNR_BITS=" << NR_BITS + << " -DNR_RECEIVERS=" << NR_RECEIVERS + << " -DNR_CHANNELS=" << NR_CHANNELS + << " -DNR_SAMPLES_PER_CHANNEL=" << NR_SAMPLES_PER_CHANNEL + << " -DNR_POLARIZATIONS=" << NR_POLARIZATIONS + << " -DNR_RECEIVERS_PER_BLOCK=" << NR_RECEIVERS_PER_BLOCK + << " -o -" + << " libtcc/TCCorrelator.cu" + << "|sed -e s/.param\\ .[a-zA-Z0-9]*/\\&\\ .ptr\\ .global/"; + + std::clog << "executing: " << command.str() << std::endl; + FILE *pipe = popen(command.str().c_str(), "r"); + + while (!feof(pipe)) { + char buffer[1024]; + + if (fgets(buffer, 1024, pipe) != NULL) + ptx_code[device] += buffer; + } + + binaries[device] = (std::make_pair(ptx_code[device].c_str(), ptx_code[device].length())); + 
pclose(pipe); + } + +#else + std::ifstream ifs(name, std::ios::in | std::ios::binary); + std::string str((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>()); + + cl::Program::Binaries binaries(devices.size(), std::make_pair(str.c_str(), str.length())); +#endif + + cl::Program program(context, devices, binaries); + program.build(); + + return program; +} + + +void setTestPattern(cl::CommandQueue &queue, cl::Buffer &samplesBuffer) +{ + Samples &samples = * (Samples *) queue.enqueueMapBuffer(samplesBuffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(Samples)); + +#if 0 + memset(samples, 0, sizeof(Samples)); + + unsigned channel = NR_CHANNELS / 3; + unsigned time = NR_SAMPLES_PER_CHANNEL / 5; + unsigned recv0 = NR_RECEIVERS > 174 ? 174 : NR_RECEIVERS / 3; + unsigned recv1 = NR_RECEIVERS > 418 ? 418 : NR_RECEIVERS / 2; + +#if NR_BITS == 4 + samples[channel][time / NR_TIMES_PER_BLOCK][recv0][POL_X][time % NR_TIMES_PER_BLOCK] = complex_int4_t(2, 3); + samples[channel][time / NR_TIMES_PER_BLOCK][recv1][POL_X][time % NR_TIMES_PER_BLOCK] = complex_int4_t(4, 5); +#elif NR_BITS == 8 + samples[channel][time / NR_TIMES_PER_BLOCK][recv0][POL_X][time % NR_TIMES_PER_BLOCK] = std::complex<int8_t>(2, 3); + samples[channel][time / NR_TIMES_PER_BLOCK][recv1][POL_X][time % NR_TIMES_PER_BLOCK] = std::complex<int8_t>(4, 5); +#elif NR_BITS == 16 + samples[channel][time / NR_TIMES_PER_BLOCK][recv0][POL_X][time % NR_TIMES_PER_BLOCK] = std::complex<__half>(2.0f, 3.0f); + samples[channel][time / NR_TIMES_PER_BLOCK][recv1][POL_X][time % NR_TIMES_PER_BLOCK] = std::complex<__half>(4.0f, 5.0f); +#endif +#else + Sample randomValues[7777]; + + for (unsigned i = 0; i < 7777; i ++) +#if NR_BITS == 4 + randomValues[i] = complex_int4_t(16 * drand48() - 8, 16 * drand48() - 8); +#elif NR_BITS == 8 + randomValues[i] = std::complex<int8_t>(256 * drand48() - 128, 256 * drand48() - 128); +#elif NR_BITS == 16 + randomValues[i] = std::complex<__half>(drand48() - .5, drand48() - .5); +#endif + +#pragma 
omp parallel for schedule (dynamic) + for (unsigned channel = 0; channel < NR_CHANNELS; channel ++) + for (unsigned time_major = 0, i = channel; time_major < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; time_major ++) + for (unsigned receiver = 0; receiver < NR_RECEIVERS; receiver ++) + for (unsigned polarization = 0; polarization < NR_POLARIZATIONS; polarization ++) + for (unsigned time_minor = 0; time_minor < NR_TIMES_PER_BLOCK; time_minor ++, i ++) + samples[channel][time_major][receiver][polarization][time_minor] = randomValues[i % 7777U]; +#endif + + queue.enqueueUnmapMemObject(samplesBuffer, samples); +} + + +void checkTestPattern(cl::CommandQueue &queue, cl::Buffer &visibilitiesBuffer, cl::Buffer &samplesBuffer) +{ + Visibilities &visibilities = * (Visibilities *) queue.enqueueMapBuffer(visibilitiesBuffer, CL_TRUE, CL_MAP_READ, 0, sizeof(Visibilities)); + + std::atomic<int> count(0); + +#if 0 + Samples &samples = * (Samples *) queue.enqueueMapBuffer(samplesBuffer, CL_TRUE, CL_MAP_READ, 0, sizeof(Samples)); + + std::cout << "checking ..." 
<< std::endl; + +//#pragma omp parallel for schedule (dynamic) + for (unsigned channel = 0; channel < NR_CHANNELS; channel ++) { +//#pragma omp critical (cout) + //std::cout << "channel = " << channel << std::endl; + +#if NR_BITS == 4 || NR_BITS == 8 + typedef std::complex<int32_t> Sum[NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#elif NR_BITS == 16 + typedef std::complex<float> Sum[NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#endif + + Sum &sum = *new Sum[1]; + memset(sum, 0, sizeof sum); + + for (unsigned major_time = 0; major_time < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; major_time ++) + for (unsigned recv1 = 0, baseline = 0; recv1 < NR_RECEIVERS; recv1 ++) + for (unsigned recv0 = 0; recv0 <= recv1; recv0 ++, baseline ++) + for (unsigned minor_time = 0; minor_time < NR_TIMES_PER_BLOCK; minor_time ++) + for (unsigned pol0 = 0; pol0 < NR_POLARIZATIONS; pol0 ++) + for (unsigned pol1 = 0; pol1 < NR_POLARIZATIONS; pol1 ++) +#if NR_BITS == 4 || NR_BITS == 8 + sum[baseline][pol1][pol0] += (std::complex<int32_t>) samples[channel][major_time][recv1][pol1][minor_time] * conj((std::complex<int32_t>) samples[channel][major_time][recv0][pol0][minor_time]); +#elif NR_BITS == 16 + sum[baseline][pol1][pol0] += samples[channel][major_time][recv1][pol1][minor_time] * conj(samples[channel][major_time][recv0][pol0][minor_time]); +#endif + + for (unsigned baseline = 0; baseline < NR_BASELINES; baseline ++) + for (unsigned pol0 = 0; pol0 < NR_POLARIZATIONS; pol0 ++) + for (unsigned pol1 = 0; pol1 < NR_POLARIZATIONS; pol1 ++) +#if NR_BITS == 4 || NR_BITS == 8 + if (visibilities[channel][baseline][pol1][pol0] != sum[baseline][pol1][pol0] && ++ count < 10) +#pragma omp critical (cout) + std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol1 << "][" << pol0 << "], expected " << sum[baseline][pol1][pol0] << ", got " << visibilities[channel][baseline][pol1][pol0] << std::endl; +#elif NR_BITS == 16 + float absolute = 
abs(visibilities[channel][baseline][pol1][pol0] - sum[baseline][pol1][pol0]); + float relative = abs(visibilities[channel][baseline][pol1][pol0] / sum[baseline][pol1][pol0]); + + if ((relative < .999 || relative > 1.001) && absolute > .0001 * NR_SAMPLES_PER_CHANNEL && ++ count < 10) + std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol1 << "][" << pol0 << "], expected " << sum[baseline][pol1][pol0] << ", got " << visibilities[channel][baseline][pol1][pol0] << ", relative = " << relative << ", absolute = " << absolute << std::endl; +#endif + delete &sum; + } + + std::cout << "#errors = " << count << std::endl; + + queue.enqueueUnmapMemObject(samplesBuffer, samples); +#else + for (unsigned channel = 0; channel < NR_CHANNELS; channel ++) + for (unsigned baseline = 0; baseline < NR_BASELINES; baseline ++) + for (unsigned pol0 = 0; pol0 < NR_POLARIZATIONS; pol0 ++) + for (unsigned pol1 = 0; pol1 < NR_POLARIZATIONS; pol1 ++) +#if NR_BITS == 4 || NR_BITS == 8 + if (visibilities[channel][baseline][pol0][pol1] != std::complex<int32_t>(0, 0)) +#elif NR_BITS == 16 + if (visibilities[channel][baseline][pol0][pol1] != std::complex<float>(0.0f, 0.0f)) +#endif + if (count ++ < 10) +#pragma omp critical (cout) + std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol0 << "][" << pol1 << "] = " << visibilities[channel][baseline][pol0][pol1] << std::endl; +#endif + + queue.enqueueUnmapMemObject(visibilitiesBuffer, visibilities); +} + + +int main() +{ + try { + cl::Context context; + std::vector<cl::Device> devices; + createContext(context, devices); + + cl::CommandQueue queue(context, devices[0], CL_QUEUE_PROFILING_ENABLE); + + cl::Buffer samples(context, CL_MEM_READ_ONLY /*| CL_MEM_HOST_WRITE_ONLY*/, sizeof(Samples)); + cl::Buffer visibilities(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, sizeof(Visibilities)); + + setTestPattern(queue, samples); + + cl::Program program(createProgramFromBinaries(context, devices)); + 
cl::Kernel correlatorKernel(program, "correlate"); + correlatorKernel.setArg(0, visibilities); + correlatorKernel.setArg(1, samples); + + cl::Event event; + +#if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 + unsigned nrBlocks = ((NR_RECEIVERS + NR_RECEIVERS_PER_BLOCK - 1) / NR_RECEIVERS_PER_BLOCK) * ((NR_RECEIVERS + NR_RECEIVERS_PER_BLOCK - 1) / NR_RECEIVERS_PER_BLOCK + 1) / 2; +#elif NR_RECEIVERS_PER_BLOCK == 64 + unsigned nrBlocks = ((NR_RECEIVERS + NR_RECEIVERS_PER_BLOCK - 1) / NR_RECEIVERS_PER_BLOCK) * ((NR_RECEIVERS + NR_RECEIVERS_PER_BLOCK - 1) / NR_RECEIVERS_PER_BLOCK); +#endif + + cl::NDRange globalWorkSize(32 * nrBlocks, 2 * NR_CHANNELS, 2 * 1); + cl::NDRange localWorkSize(32, 2, 2); + +for (int i = 0; i < 100; i ++) + queue.enqueueNDRangeKernel(correlatorKernel, cl::NullRange, globalWorkSize, localWorkSize, 0, &event); + queue.finish(); + + double runtime = (event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_START>()) * 1e-9; + unsigned long long nrOperations = 8LLU * (NR_RECEIVERS * NR_RECEIVERS + 1) / 2 * NR_POLARIZATIONS * NR_POLARIZATIONS * NR_CHANNELS * NR_SAMPLES_PER_CHANNEL; + double TFLOPS = nrOperations / runtime * 1e-12; + std::clog << "runtime = " << runtime << "s, TFLOPS = " << TFLOPS << std::endl; + + checkTestPattern(queue, visibilities, samples); + } catch (cl::Error &error) { + std::cerr << "caught cl::Error: " << error.what() << ": " << errorMessage(error.err()) << std::endl; + } catch (std::exception &error) { + std::cerr << "caught std::exception: " << error.what() << std::endl; + } + + return 0; +} diff --git a/test/SimpleExample/SimpleExample.cu b/test/SimpleExample/SimpleExample.cu new file mode 100644 index 0000000..4fa52dd --- /dev/null +++ b/test/SimpleExample/SimpleExample.cu @@ -0,0 +1,77 @@ +#define NR_BITS 8 +#define NR_CHANNELS 480 +#define NR_POLARIZATIONS 2 +#define NR_SAMPLES_PER_CHANNEL 3072 +#define NR_RECEIVERS 576 +#define NR_BASELINES ((NR_RECEIVERS) * 
((NR_RECEIVERS) + 1) / 2) +#define NR_RECEIVERS_PER_BLOCK 64 +#define NR_TIMES_PER_BLOCK (128 / (NR_BITS)) + + +#include "test/Common/ComplexInt4.h" +#include "libtcc/Correlator.h" + +#include <complex> +#include <iostream> + +#include <cuda.h> +#if NR_BITS == 16 +#include <cuda_fp16.h> +#endif + + +inline void checkCudaCall(cudaError_t error) +{ + if (error != cudaSuccess) { + std::cerr << "error " << error << std::endl; + exit(1); + } +} + + +#if NR_BITS == 4 +typedef complex_int4_t Sample; +typedef std::complex<int32_t> Visibility; +#elif NR_BITS == 8 +typedef std::complex<int8_t> Sample; +typedef std::complex<int32_t> Visibility; +#elif NR_BITS == 16 +typedef std::complex<__half> Sample; +typedef std::complex<float> Visibility; +#endif + +typedef Sample Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef Visibility Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; + + +int main() +{ + try { + checkCudaCall(cudaSetDevice(0)); // combine the CUDA runtime API and CUDA driver API + checkCudaCall(cudaFree(0)); + + tcc::Correlator correlator(NR_BITS, NR_RECEIVERS, NR_CHANNELS, NR_SAMPLES_PER_CHANNEL, NR_POLARIZATIONS, NR_RECEIVERS_PER_BLOCK); + + cudaStream_t stream; + Samples *samples; + Visibilities *visibilities; + + checkCudaCall(cudaStreamCreate(&stream)); + checkCudaCall(cudaMallocManaged(&samples, sizeof(Samples))); + checkCudaCall(cudaMallocManaged(&visibilities, sizeof(Visibilities))); + + (*samples)[NR_CHANNELS / 3][NR_SAMPLES_PER_CHANNEL / 5 / NR_TIMES_PER_BLOCK][174][0][NR_SAMPLES_PER_CHANNEL / 5 % NR_TIMES_PER_BLOCK] = Sample(2, 3); + (*samples)[NR_CHANNELS / 3][NR_SAMPLES_PER_CHANNEL / 5 / NR_TIMES_PER_BLOCK][418][0][NR_SAMPLES_PER_CHANNEL / 5 % NR_TIMES_PER_BLOCK] = Sample(4, 5); + + correlator.launchAsync((CUstream) stream, (CUdeviceptr) visibilities, (CUdeviceptr) samples); + checkCudaCall(cudaDeviceSynchronize()); + + std::cout << 
((*visibilities)[160][87745][0][0] == Visibility(23, -2) ? "success" : "failed") << std::endl; + + checkCudaCall(cudaFree(visibilities)); + checkCudaCall(cudaFree(samples)); + checkCudaCall(cudaStreamDestroy(stream)); + } catch (std::exception &error) { + std::cerr << error.what() << std::endl; + } +} diff --git a/util/ExceptionPropagator.h b/util/ExceptionPropagator.h new file mode 100644 index 0000000..942a28e --- /dev/null +++ b/util/ExceptionPropagator.h @@ -0,0 +1,59 @@ +#if !defined EXCEPTION_PROPAGATOR_H +#define EXCEPTION_PROPAGATOR_H + +// Experimental class to propagate exceptions around OpenMP pragmas (and other +// constructs) that cannot cope with exceptions. Exceptions thrown inside a lambda +// function that is given to the () operator are propagated and rethrown when the +// progagation object is destroyed. Example use case: +// +// ExceptionPropagator ep; +// +// #pragma omp parallel for +// for (int i = 0; i < 10000; i ++) +// if (!ep) // finish loop ASAP if exception is pending +// ep([&] () { +// throw 42; // continue with code that might throw exceptions +// }); +// +// // exception is rethrown at scope exit of ep + +#include <atomic> +#include <exception> + + +class ExceptionPropagator +{ + public: + ExceptionPropagator() + : + propagateException(ATOMIC_FLAG_INIT) + { + } + + ~ExceptionPropagator() noexcept(false) + { + if (exception != nullptr && !std::uncaught_exception()) + std::rethrow_exception(exception); + } + + template <typename T> void operator () (const T &func) + { + try { + func(); + } catch (...) 
{ + if (!atomic_flag_test_and_set(&propagateException)) + exception = std::current_exception(); + } + } + + operator bool () const // returns true iff exception pending + { + return exception != nullptr; + } + + private: + std::atomic_flag propagateException; + std::exception_ptr exception; +}; + +#endif diff --git a/util/cu.cc b/util/cu.cc new file mode 100644 index 0000000..82113be --- /dev/null +++ b/util/cu.cc @@ -0,0 +1,37 @@ +#include "cu.h" + +#include <iostream> +#include <sstream> + + +namespace cu { + +const char *Error::what() const noexcept +{ + const char *str; + return cuGetErrorString(_result, &str) != CUDA_ERROR_INVALID_VALUE ? str : "unknown error"; +} + + +Context Device::primaryCtxRetain() +{ + CUcontext context; + checkCudaCall(cuDevicePrimaryCtxRetain(&context, _obj)); + return Context(context, *this); +} + + +void Source::compile(const char *output_file_name, const char *compiler_options) +{ + std::stringstream command_line; + command_line << "nvcc -cubin " << compiler_options << " -o " << output_file_name << ' ' << input_file_name; +//#pragma omp critical (clog) + //std::clog << command_line.str() << std::endl; + + int retval = system(command_line.str().c_str()); + + if (WEXITSTATUS(retval) != 0) + throw Error(CUDA_ERROR_INVALID_SOURCE); +} + +} diff --git a/util/cu.h b/util/cu.h new file mode 100644 index 0000000..46dfdea --- /dev/null +++ b/util/cu.h @@ -0,0 +1,786 @@ +#if !defined CU_WRAPPER_H +#define CU_WRAPPER_H + +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <exception> +#include <fstream> +#include <string> +#include <vector> + + +namespace cu { + class Error : public std::exception { + public: + Error(CUresult result) + : + _result(result) + { + } + + virtual const char *what() const noexcept; + + operator CUresult () const + { + return _result; + } + + private: + CUresult _result; + }; + + + inline void checkCudaCall(CUresult result) + { + if (result != CUDA_SUCCESS) + throw Error(result); + } + + + inline void 
init(unsigned flags = 0) + { + checkCudaCall(cuInit(flags)); + } + + + inline int driverGetVersion() + { + int version; + checkCudaCall(cuDriverGetVersion(&version)); + return version; + } + + inline void memcpyHtoD(CUdeviceptr dst, const void *src, size_t size) + { + checkCudaCall(cuMemcpyHtoD(dst, src, size)); + } + + class Context; + class Stream; + + template <typename T> class Wrapper + { + public: + // Wrapper<T>(Wrapper<T> &) = delete; // disallow copies + + // conversion to C-style T + + operator const T & () const + { + return _obj; + } + + operator T & () + { + return _obj; + } + +#if 0 // makes no sense if object is not copyable + bool operator == (const Wrapper<T> &other) + { + return _obj == other._obj; + } + + bool operator != (const Wrapper<T> &other) + { + return _obj != other._obj; + } +#endif + + protected: + Wrapper<T>() + : + hasOwnership(true) + { + } + + Wrapper<T>(T &obj) + : + _obj(obj), + hasOwnership(false) + { + } + + bool hasOwnership; + T _obj; + }; + + class Device : public Wrapper<CUdevice> + { + public: + // Device Management + + Device(int ordinal) + { + checkCudaCall(cuDeviceGet(&_obj, ordinal)); + } + + int getAttribute(CUdevice_attribute attribute) const + { + int value; + checkCudaCall(cuDeviceGetAttribute(&value, attribute, _obj)); + return value; + } + + template <CUdevice_attribute attribute> int getAttribute() const + { + return getAttribute(attribute); + } + + static int getCount() + { + int nrDevices; + checkCudaCall(cuDeviceGetCount(&nrDevices)); + return nrDevices; + } + + std::string getName() const + { + char name[64]; + checkCudaCall(cuDeviceGetName(name, sizeof name, _obj)); + return std::string(name); + } + + size_t totalMem() const + { + size_t size; + checkCudaCall(cuDeviceTotalMem(&size, _obj)); + return size; + } + + + // Primary Context Management + + std::pair<unsigned, bool> primaryCtxGetState() const + { + unsigned flags; + int active; + checkCudaCall(cuDevicePrimaryCtxGetState(_obj, &flags, &active)); + 
return std::pair<unsigned, bool>(flags, active); + } + + // void primaryCtxRelease() not available; it is released on destruction of the Context returned by Device::primaryContextRetain() + + void primaryCtxReset() + { + checkCudaCall(cuDevicePrimaryCtxReset(_obj)); + } + + Context primaryCtxRetain(); // retain this context until the primary context can be released + + void primaryCtxSetFlags(unsigned flags) + { + checkCudaCall(cuDevicePrimaryCtxSetFlags(_obj, flags)); + } + }; + + + class Context : public Wrapper<CUcontext> + { + public: + // Context Management + + Context(int flags, Device &device) + : + _primaryContext(false) + { + checkCudaCall(cuCtxCreate(&_obj, flags, device)); + } + + Context(CUcontext context) + : + Wrapper<CUcontext>(context), + _primaryContext(false) + { + } + + ~Context() + { + if (hasOwnership) + checkCudaCall(cuCtxDestroy(_obj)); + //else + //checkCudaCall(cuDevicePrimaryCtxRelease(getDevice())); // FIXME + } + + unsigned getApiVersion() const + { + unsigned version; + checkCudaCall(cuCtxGetApiVersion(_obj, &version)); + return version; + } + + static CUfunc_cache getCacheConfig() + { + CUfunc_cache config; + checkCudaCall(cuCtxGetCacheConfig(&config)); + return config; + } + + static void setCacheConfig(CUfunc_cache config) + { + checkCudaCall(cuCtxSetCacheConfig(config)); + } + + static Context getCurrent() + { + CUcontext context; + checkCudaCall(cuCtxGetCurrent(&context)); + return Context(context); + } + + void setCurrent() const + { + checkCudaCall(cuCtxSetCurrent(_obj)); + } + + void pushCurrent() + { + checkCudaCall(cuCtxPushCurrent(_obj)); + } + + static Context popCurrent() + { + CUcontext context; + checkCudaCall(cuCtxPopCurrent(&context)); + return Context(context); + } + + void setSharedMemConfig(CUsharedconfig config) + { + checkCudaCall(cuCtxSetSharedMemConfig(config)); + } + + static Device getDevice() + { + CUdevice device; + checkCudaCall(cuCtxGetDevice(&device)); + return Device(device); // FIXME: ~Device() + } + + 
static size_t getLimit(CUlimit limit) + { + size_t value; + checkCudaCall(cuCtxGetLimit(&value, limit)); + return value; + } + + template <CUlimit limit> static size_t getLimit() + { + return getLimit(limit); + } + + static void setLimit(CUlimit limit, size_t value) + { + checkCudaCall(cuCtxSetLimit(limit, value)); + } + + template <CUlimit limit> static void setLimit(size_t value) + { + setLimit(limit, value); + } + + static void synchronize() + { + checkCudaCall(cuCtxSynchronize()); + } + + private: + friend class Device; + Context(CUcontext context, Device &device) + : + Wrapper<CUcontext>(context), + _primaryContext(true) + { + } + + bool _primaryContext; + }; + + + class HostMemory : public Wrapper<void *> + { + public: + HostMemory(size_t size, int flags = 0) + { + checkCudaCall(cuMemHostAlloc(&_obj, size, flags)); + } + + ~HostMemory() + { + checkCudaCall(cuMemFreeHost(_obj)); + } + + template <typename T> operator T * () + { + return static_cast<T *>(_obj); + } + }; + + + class DeviceMemory : public Wrapper<CUdeviceptr> + { + public: + DeviceMemory(size_t size) + { + checkCudaCall(cuMemAlloc(&_obj, size)); + } + + DeviceMemory(CUdeviceptr ptr) + : + Wrapper(ptr) + { + } + + DeviceMemory(HostMemory &hostMemory) + { + hasOwnership = false; + checkCudaCall(cuMemHostGetDevicePointer(&_obj, hostMemory, 0)); + } + + ~DeviceMemory() + { + if (hasOwnership) + checkCudaCall(cuMemFree(_obj)); + } + + const void *parameter() const // used to construct parameter list for launchKernel(); + { + return &_obj; + } + + template <typename T> operator T * () const + { + return (T *) _obj; + } + + template <typename T> operator const T * () const + { + return (const T *) _obj; + } + }; + + + class Array : public Wrapper<CUarray> + { + public: + Array(unsigned width, CUarray_format format, unsigned numChannels) + { + Array(width, 0, format, numChannels); + } + + Array(unsigned width, unsigned height, CUarray_format format, unsigned numChannels) + { + CUDA_ARRAY_DESCRIPTOR 
descriptor; + descriptor.Width = width; + descriptor.Height = height; + descriptor.Format = format; + descriptor.NumChannels = numChannels; + checkCudaCall(cuArrayCreate(&_obj, &descriptor)); + } + + Array(unsigned width, unsigned height, unsigned depth, CUarray_format format, unsigned numChannels) + { + CUDA_ARRAY3D_DESCRIPTOR descriptor; + descriptor.Width = width; + descriptor.Height = height; + descriptor.Depth = depth; + descriptor.Format = format; + descriptor.NumChannels = numChannels; + descriptor.Flags = 0; + checkCudaCall(cuArray3DCreate(&_obj, &descriptor)); + } + + Array(CUarray &array) + : + Wrapper(array) + { + } + + ~Array() + { + if (hasOwnership) + checkCudaCall(cuArrayDestroy(_obj)); + } + }; + + +#if 0 + class TexObject : public Wrapper<CUtexObject> { + public: + TexObject(const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc) + { + checkCudaCall(cuTexObjectCreate(&_obj, pResDesc, pTexDesc, pResViewDesc)); + } + + TexObject(CUtexObject &obj) + : + Wrapper<CUtexObject>(obj) + { + } + + ~TexObject() + { + if (hasOwnership) + checkCudaCall(cuTexObjectDestroy(_obj)); + } + + + }; +#endif + + + class TexRef : public Wrapper<CUtexref> { + public: + TexRef(CUtexref texref) + : + Wrapper<CUtexref>(texref) + { + } + +#if 0 // deprecated + void setAddress(size_t &byte_offset, DeviceMemory &memory, size_t size) + { + checkCudaCall(cuTexRefSetAddress(&byte_offset, _obj, memory, size)); + } + + void setArray(Array &array, unsigned flags = CU_TRSA_OVERRIDE_FORMAT) + { + checkCudaCall(cuTexRefSetArray(_obj, array, flags)); + } + + void setAddressMode(int dim, CUaddress_mode am) + { + checkCudaCall(cuTexRefSetAddressMode(_obj, dim, am)); + } + + void setFilterMode(CUfilter_mode fm) + { + checkCudaCall(cuTexRefSetFilterMode(_obj, fm)); + } + + void setFlags(int flags) + { + checkCudaCall(cuTexRefSetFlags(_obj, flags)); + } + + void setFormat(CUarray_format fmt, int numPackedComponents) + { + 
checkCudaCall(cuTexRefSetFormat(_obj, fmt, numPackedComponents)); + } +#endif + }; + + + class Source + { + public: + Source(const char *input_file_name) + : + input_file_name(input_file_name) + { + } + + void compile(const char *ptx_name, const char *compile_options = 0); + + private: + const char *input_file_name; + }; + + + class Module : public Wrapper<CUmodule> + { + public: + Module(const char *file_name) + { +#if defined TEGRA_QUIRKS // cuModuleLoad broken on Jetson TX1 + std::ifstream file(file_name); + std::string program((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>()); + checkCudaCall(cuModuleLoadData(&_obj, program.c_str())); +#else + checkCudaCall(cuModuleLoad(&_obj, file_name)); +#endif + } + + Module(const void *data) + { + checkCudaCall(cuModuleLoadData(&_obj, data)); + } + + Module(CUmodule &module) + : + Wrapper(module) + { + } + + ~Module() + { + if (hasOwnership) + checkCudaCall(cuModuleUnload(_obj)); + } + + TexRef getTexRef(const char *name) const + { + CUtexref texref; + checkCudaCall(cuModuleGetTexRef(&texref, _obj, name)); + return TexRef(texref); + } + + CUdeviceptr getGlobal(const char *name) const + { + CUdeviceptr deviceptr; + checkCudaCall(cuModuleGetGlobal(&deviceptr, nullptr, _obj, name)); + return deviceptr; + } + }; + + + class Function : public Wrapper<CUfunction> + { + public: + Function(Module &module, const char *name) + { + checkCudaCall(cuModuleGetFunction(&_obj, module, name)); + } + + Function(CUfunction &function) + : + Wrapper(function) + { + } + + int getAttribute(CUfunction_attribute attribute) + { + int value; + checkCudaCall(cuFuncGetAttribute(&value, attribute, _obj)); + return value; + } + + void setCacheConfig(CUfunc_cache config) + { + checkCudaCall(cuFuncSetCacheConfig(_obj, config)); + } + +#if 0 + void paramSetTexRef(TexRef &texref) + { + checkCudaCall(cuParamSetTexRef(_obj, CU_PARAM_TR_DEFAULT, texref)); + } +#endif + }; + + + class Event : public Wrapper<CUevent> + { + public: + 
Event(int flags = CU_EVENT_DEFAULT) + { + checkCudaCall(cuEventCreate(&_obj, flags)); + } + + Event(CUevent &event) + : + Wrapper(event) + { + } + + ~Event() + { + if (hasOwnership) + checkCudaCall(cuEventDestroy(_obj)); + } + + float elapsedTime(Event &second) const + { + float ms; + checkCudaCall(cuEventElapsedTime(&ms, second, _obj)); + return ms; + } + + void query() const + { + checkCudaCall(cuEventQuery(_obj)); // unsuccessful result throws cu::Error + } + + void record() + { + checkCudaCall(cuEventRecord(_obj, 0)); + } + + void record(Stream &); + + void synchronize() + { + checkCudaCall(cuEventSynchronize(_obj)); + } + }; + + + class Stream : public Wrapper<CUstream> + { + friend class Event; + + public: + Stream(int flags = CU_STREAM_DEFAULT) + { + checkCudaCall(cuStreamCreate(&_obj, flags)); + } + + Stream(CUstream stream) + : + Wrapper<CUstream>(stream) + { + } + + ~Stream() + { + if (hasOwnership) + checkCudaCall(cuStreamDestroy(_obj)); + } + + void memcpyHtoDAsync(CUdeviceptr devPtr, const void *hostPtr, size_t size) + { + checkCudaCall(cuMemcpyHtoDAsync(devPtr, hostPtr, size, _obj)); + } + + void memcpyDtoHAsync(void *hostPtr, CUdeviceptr devPtr, size_t size) + { + checkCudaCall(cuMemcpyDtoHAsync(hostPtr, devPtr, size, _obj)); + } + + void launchKernel(Function &function, unsigned gridX, unsigned gridY, unsigned gridZ, unsigned blockX, unsigned blockY, unsigned blockZ, unsigned sharedMemBytes, const std::vector<const void *> &parameters) + { + checkCudaCall(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, sharedMemBytes, _obj, const_cast<void **>(&parameters[0]), 0)); + } + +#if CUDART_VERSION >= 9000 + void launchCooperativeKernel(Function &function, unsigned gridX, unsigned gridY, unsigned gridZ, unsigned blockX, unsigned blockY, unsigned blockZ, unsigned sharedMemBytes, const std::vector<const void *> &parameters) + { + checkCudaCall(cuLaunchCooperativeKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, sharedMemBytes, _obj, 
const_cast<void **>(¶meters[0]))); + } +#endif + + void query() + { + checkCudaCall(cuStreamQuery(_obj)); // unsuccessful result throws cu::Error + } + + void synchronize() + { + checkCudaCall(cuStreamSynchronize(_obj)); + } + + void waitEvent(Event &event) + { + checkCudaCall(cuStreamWaitEvent(_obj, event, 0)); + } + + void addCallback(CUstreamCallback callback, void *userData, int flags = 0) + { + checkCudaCall(cuStreamAddCallback(_obj, callback, userData, flags)); + } + + void record(Event &event) + { + checkCudaCall(cuEventRecord(event, _obj)); + } + + void batchMemOp(unsigned count, CUstreamBatchMemOpParams *paramArray, unsigned flags) + { + checkCudaCall(cuStreamBatchMemOp(_obj, count, paramArray, flags)); + } + + void waitValue32(CUdeviceptr addr, cuuint32_t value, unsigned flags) const + { + checkCudaCall(cuStreamWaitValue32(_obj, addr, value, flags)); + } + + void writeValue32(CUdeviceptr addr, cuuint32_t value, unsigned flags) + { + checkCudaCall(cuStreamWriteValue32(_obj, addr, value, flags)); + } + }; + +#if 1 + class Graph : public Wrapper<CUgraph> + { + public: + class GraphNode : public Wrapper<CUgraphNode> + { + }; + + class ExecKernelNode : public GraphNode + { + }; + + class KernelNodeParams : public Wrapper<CUDA_KERNEL_NODE_PARAMS> + { + public: + KernelNodeParams(const Function &function, + unsigned gridDimX, unsigned gridDimY, unsigned gridDimZ, + unsigned blockDimX, unsigned blockDimY, unsigned blockDimZ, + unsigned sharedMemBytes, + const std::vector<const void *> &kernelParams) + { + _obj.func = function; + _obj.blockDimX = blockDimX; + _obj.blockDimY = blockDimY; + _obj.blockDimZ = blockDimZ; + _obj.gridDimX = gridDimX; + _obj.gridDimY = gridDimY; + _obj.gridDimZ = gridDimZ; + _obj.sharedMemBytes = sharedMemBytes; + _obj.kernelParams = const_cast<void **>(kernelParams.data()); + _obj.extra = nullptr; + } + }; + + class Exec : public Wrapper<CUgraphExec> + { + public: + void launch(Stream &stream) + { + checkCudaCall(cuGraphLaunch(_obj, 
stream)); + } + }; + + Graph(unsigned flags = 0) + { + checkCudaCall(cuGraphCreate(&_obj, flags)); + } + + Graph(CUgraph &graph) + : + Wrapper(graph) + { + } + + ~Graph() + { + if (hasOwnership) + checkCudaCall(cuGraphDestroy(_obj)); + } + + ExecKernelNode addKernelNode(/* std::vector<GraphNode> dependencies, */ const KernelNodeParams &kernelArgs) + { + ExecKernelNode node; + checkCudaCall(cuGraphAddKernelNode(& (CUgraphNode &) node, _obj, nullptr, 0, & (const CUDA_KERNEL_NODE_PARAMS &) kernelArgs)); + return node; + } + + Exec instantiate() + { + Exec exec; + checkCudaCall(cuGraphInstantiate(& (CUgraphExec &) exec, _obj, nullptr, nullptr, 0)); + return exec; + } + }; +#endif + + + inline void Event::record(Stream &stream) + { + checkCudaCall(cuEventRecord(_obj, stream._obj)); + } +} + +#endif diff --git a/util/multi_array.h b/util/multi_array.h new file mode 100644 index 0000000..1db0ea5 --- /dev/null +++ b/util/multi_array.h @@ -0,0 +1,63 @@ +#if !defined MULTI_ARRAY_H +#define MULTI_ARRAY_H + +// Experimental class that provides multi-dimensional arrays, similar to +// boost::multi_array[_ref], but much simpler. 
// Make the header self-contained: it uses std::size_t, std::vector and
// std::enable_if, but previously relied on its includer for the headers.
#include <cstddef>
#include <type_traits>
#include <vector>

namespace multi_array {
  // extent<dim> describes a dim-dimensional index space.  `size` is the
  // TOTAL number of elements of all dimensions accumulated so far (row-major
  // product); `next_extent` describes the inner dim-1 dimensions.  Extents
  // are built up one dimension at a time: extents[3][4] is an extent<2>
  // with size 12 whose next_extent has size 4.
  template <std::size_t dim> class extent {
    public:
      extent(std::size_t size, const extent<dim - 1> &rhs) : size(rhs.size * size), next_extent(extent<dim - 1>(size, rhs.next_extent)) {}
      extent<dim + 1> operator [] (std::size_t size) const { return extent<dim + 1>(size, *this); }

      const std::size_t size;              // total element count so far
      const extent<dim - 1> next_extent;   // extents of the inner dimensions
  };


  // Recursion terminator: a zero-dimensional extent of size 1.
  template <> class extent<0> {
    public:
      extent() {}
      template <typename T> extent(std::size_t, T) {}
      extent<1> operator [] (std::size_t size) const { return extent<1>(size, *this); }

      static const std::size_t size = 1;
      // The original declared "static const struct {} next_extent;" without
      // ever defining it; since it is passed BY VALUE to the constructor
      // above, it is odr-used and the missing definition is an ODR
      // violation (link error).  A named empty tag with an in-class
      // constexpr definition (implicitly inline since C++17) fixes that
      // without changing any observable behavior.
      struct terminator {};
      static constexpr terminator next_extent {};
  };


  // Starting point for building extents, e.g. extents[3][4].
  // (namespace-scope const => internal linkage, so safe in a header.)
  const static extent<0> extents;


  // Non-owning view of a dim-dimensional row-major array starting at `ref`.
  // operator[] peels off one dimension; for dim == 1 it yields T&.
  template<typename T, std::size_t dim> class array_ref {
    public:
      array_ref(T &ref, const extent<dim> &extents) : ref(ref), extents(extents) {}
      template <std::size_t d = dim> typename std::enable_if<d == 1, T &>::type operator [] (std::size_t index) const { return (&ref)[index]; }
      template <std::size_t d = dim> typename std::enable_if<d != 1, array_ref<T, dim - 1>>::type operator [] (std::size_t index) const { return array_ref<T, dim - 1>((&ref)[index * extents.next_extent.size], extents.next_extent); }
      T *begin() const { return &ref; }
      T *end() const { return &ref + extents.size; }
      std::size_t bytesize() const { return sizeof(T) * extents.size; }

      const extent<dim> extents;

    private:
      T &ref;
  };


  namespace detail {
    // Holds the storage; a separate base class so it is constructed before
    // the array_ref base of `array` below.
    template <typename T> class array_base {
      protected:
        array_base(std::size_t size) : vec(size) {}
        std::vector<T> vec;
    };
  }


  // Owning dim-dimensional array: storage (array_base) + view (array_ref).
  // NOTE(review): a zero-sized extent would make vec.data() null and the
  // dereference below UB — assumed not to occur; confirm with callers.
  template<typename T, std::size_t dim> class array : private detail::array_base<T>, public array_ref<T, dim> {
    public:
      array(const extent<dim> &extents) : detail::array_base<T>(extents.size), array_ref<T, dim>(*detail::array_base<T>::vec.data(), extents) {}
  };
}

#endif

// diff --git a/util/nvrtc.cc b/util/nvrtc.cc
// new file mode 100644 index 0000000..b6ac885 --- /dev/null +++ b/util/nvrtc.cc @@
-0,0 +1,11 @@ +#include "nvrtc.h" + + +namespace nvrtc { + +const char *Error::what() const noexcept +{ + return nvrtcGetErrorString(_result); +} + +} diff --git a/util/nvrtc.h b/util/nvrtc.h new file mode 100644 index 0000000..f429a73 --- /dev/null +++ b/util/nvrtc.h @@ -0,0 +1,109 @@ +#if !defined NVRTC_H +#define NVRTC_H + +#include <cuda.h> +#include <nvrtc.h> + +#include <algorithm> +#include <exception> +#include <fstream> +#include <iterator> +#include <string> +#include <vector> + + +namespace nvrtc { + class Error : public std::exception { + public: + Error(nvrtcResult result) + : + _result(result) + { + } + + virtual const char *what() const noexcept; + + operator nvrtcResult () const + { + return _result; + } + + private: + nvrtcResult _result; + }; + + + inline void checkNvrtcCall(nvrtcResult result) + { + if (result != NVRTC_SUCCESS) + throw Error(result); + } + + + class Program { + public: + Program(const std::string &src, const std::string &name, int numHeaders = 0, const char *headers[] = nullptr, const char *includeNames[] = nullptr) // TODO: use std::vector<std::string> + { + checkNvrtcCall(nvrtcCreateProgram(&program, src.c_str(), name.c_str(), numHeaders, headers, includeNames)); + } + + Program(const std::string &filename) + { + std::ifstream ifs(filename); + std::string source(std::istreambuf_iterator<char>{ifs}, {}); + checkNvrtcCall(nvrtcCreateProgram(&program, source.c_str(), filename.c_str(), 0, nullptr, nullptr)); + } + + ~Program() + { + checkNvrtcCall(nvrtcDestroyProgram(&program)); + } + + void compile(const std::vector<std::string> &options) + { + std::vector<const char *> c_options; + std::transform(options.begin(), options.end(), std::back_inserter(c_options), [] (const std::string &option) { return option.c_str();}); + checkNvrtcCall(nvrtcCompileProgram(program, c_options.size(), c_options.data())); + } + + std::string getPTX() + { + size_t size; + std::string ptx; + + checkNvrtcCall(nvrtcGetPTXSize(program, &size)); + 
ptx.resize(size); + checkNvrtcCall(nvrtcGetPTX(program, &ptx[0])); + return ptx; + } + +#if CUDA_VERSION >= 11020 + std::vector<char> getCUBIN() + { + size_t size; + std::vector<char> cubin; + + checkNvrtcCall(nvrtcGetCUBINSize(program, &size)); + cubin.resize(size); + checkNvrtcCall(nvrtcGetCUBIN(program, &cubin[0])); + return cubin; + } +#endif + + std::string getLog() + { + size_t size; + std::string log; + + checkNvrtcCall(nvrtcGetProgramLogSize(program, &size)); + log.resize(size); + checkNvrtcCall(nvrtcGetProgramLog(program, &log[0])); + return log; + } + + private: + nvrtcProgram program; + }; +} + +#endif -- GitLab