diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..0ce2042fa081492344d8503dce6e2742ef84ead2 --- /dev/null +++ b/Makefile @@ -0,0 +1,102 @@ +VERSION= 0.5 +#CUDA= /cm/shared/package/cuda100/toolkit/10.0.130 +CUDA= $(shell dirname `dirname \`which nvcc\``) +#CUDA= /usr/local/cuda +#POWER_SENSOR= $(HOME)/projects/libpowersensor-master/build +ARCH= $(shell arch) +CC= gcc +CXX= g++ #-Wno-deprecated-declarations +NVCC= nvcc +INCLUDES= -I. +#INCLUDES+= -I$(CUDA)/include +#INCLUDES+= -I$(POWER_SENSOR)/include +CXXFLAGS+= -std=c++11 -O3 -g -fpic -fopenmp $(INCLUDES) -DNDEBUG +NVCCFLAGS= $(INCLUDES) + +#CXXFLAGS+= -march=core-avx2 -mcmodel=medium + +LIBTCC_SOURCES= util/cu.cc\ + util/nvrtc.cc\ + libtcc/CorrelatorKernel.cc\ + libtcc/Correlator.cc\ + libtcc/Kernel.cc + + +CORRELATOR_TEST_SOURCES=test/CorrelatorTest/CorrelatorTest.cc\ + test/CorrelatorTest/Options.cc\ + test/Common/Record.cc\ + test/Common/UnitTest.cc + +OPENCL_TEST_SOURCES= test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc + +SIMPLE_EXAMPLE_SOURCES= test/SimpleExample/SimpleExample.cu + + +LIBTCC_OBJECTS= $(LIBTCC_SOURCES:%.cc=%.o) libtcc/TCCorrelator.o +SIMPLE_EXAMPLE_OBJECTS= $(SIMPLE_EXAMPLE_SOURCES:%.cu=%.o) +CORRELATOR_TEST_OBJECTS=$(CORRELATOR_TEST_SOURCES:%.cc=%.o) +OPENCL_TEST_OBJECTS= $(OPENCL_TEST_SOURCES:%.cc=%.o) + +OBJECTS= $(LIBTCC_OBJECTS)\ + $(SIMPLE_EXAMPLE_OBJECTS)\ + $(CORRELATOR_TEST_OBJECTS)\ + $(OPENCL_TEST_OBJECTS) + +SHARED_OBJECTS= libtcc/libtcc.so libtcc/libtcc.so.$(VERSION) + +DEPENDENCIES= $(OBJECTS:%.o=%.d) + +EXECUTABLES= test/SimpleExample/SimpleExample\ + test/CorrelatorTest/CorrelatorTest\ + test/OpenCLCorrelatorTest/OpenCLCorrelatorTest + +LIBRARIES= -L$(CUDA)/lib64 \ + -L$(CUDA)/lib64/stubs -lcuda -lnvrtc #\ + #-L$(POWER_SENSOR)/lib -lpowersensor #-lnvidia-ml + + +%.d: %.cc + -$(CXX) $(CXXFLAGS) -MM -MT $@ -MT ${@:%.d=%.o} -MT ${@:%.d=%.s} $< -o $@ + +%.d: %.cu + -$(CXX) -x c++ $(CXXFLAGS) -MM -MT $@ -MT ${@:%.d=%.o} -MT 
${@:%.d=%.s} $< -o $@ + +%.o: %.cc + $(CXX) $(CXXFLAGS) -o $@ -c $< + +%.o: %.cu + $(NVCC) $(NVCCFLAGS) -o $@ -c $< + +%.s: %.cc + $(CXX) $(CXXFLAGS) -o $@ -S $< + +%.so: %.so.$(VERSION) + rm -f $@ + ln -s $(@F).$(VERSION) $@ + +all:: $(EXECUTABLES) + +clean:: + $(RM) $(OBJECTS) $(SHARED_OBJECTS) $(DEPENDENCIES) $(EXECUTABLES) + +libtcc/TCCorrelator.o: libtcc/TCCorrelator.cu # CUDA code embedded in object file + ld -r -b binary -o $@ $< + +libtcc/TCCorrelator.d: + - + +libtcc/libtcc.so.$(VERSION): $(LIBTCC_OBJECTS) + $(CXX) -shared -o $@ -Wl,-soname=$@ $^ $(LIBRARIES) + +test/SimpleExample/SimpleExample: $(SIMPLE_EXAMPLE_OBJECTS) libtcc/libtcc.so + $(NVCC) $(NVCCFLAGS) -o $@ $(SIMPLE_EXAMPLE_OBJECTS) -Xlinker -rpath=. -Llibtcc -ltcc $(LIBRARIES) + +test/CorrelatorTest/CorrelatorTest: $(CORRELATOR_TEST_OBJECTS) libtcc/libtcc.so + $(CXX) $(CXXFLAGS) -o $@ $(CORRELATOR_TEST_OBJECTS) -Wl,-rpath=. -Llibtcc -ltcc $(LIBRARIES) + +test/OpenCLCorrelatorTest/OpenCLCorrelatorTest: $(OPENCL_TEST_OBJECTS) + $(CXX) $(CXXFLAGS) -o $@ $(OPENCL_TEST_OBJECTS) -L$(CUDA)/lib64 -lOpenCL + +ifeq (0, $(words $(findstring $(MAKECMDGOALS), clean))) +-include $(DEPENDENCIES) +endif diff --git a/libtcc/Correlator.cc b/libtcc/Correlator.cc new file mode 100644 index 0000000000000000000000000000000000000000..a4e5b7d68207908ea4af8f483029c0e81fb9b733 --- /dev/null +++ b/libtcc/Correlator.cc @@ -0,0 +1,117 @@ +#include "libtcc/Correlator.h" + +#include <iostream> + +#define GNU_SOURCE +#include <link.h> + + +extern const char _binary_libtcc_TCCorrelator_cu_start, _binary_libtcc_TCCorrelator_cu_end; + +namespace tcc { + +std::string Correlator::findNVRTCincludePath() const +{ + std::string path; + + if (dl_iterate_phdr([] (struct dl_phdr_info *info, size_t, void *arg) -> int + { + std::string &path = *static_cast<std::string *>(arg); + path = info->dlpi_name; + return path.find("libnvrtc.so") != std::string::npos; + }, &path)) + { + path.erase(path.find_last_of("/")); // remove library name + 
path.erase(path.find_last_of("/")); // remove /lib64 + path += "/include"; + } + + return path; +} + + +Correlator::Correlator(unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations, + unsigned nrReceiversPerBlock + ) +: + correlatorModule(compileModule(nrBits, nrReceivers, nrChannels, nrSamplesPerChannel, nrPolarizations, nrReceiversPerBlock)), + correlatorKernel(correlatorModule, nrBits, nrReceivers, nrChannels, nrSamplesPerChannel, nrPolarizations, nrReceiversPerBlock) +{ +} + + +cu::Module Correlator::compileModule(unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations, + unsigned nrReceiversPerBlock + ) +{ + cu::Device device(cu::Context::getCurrent().getDevice()); + int capability = 10 * device.getAttribute<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>() + device.getAttribute<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>(); + + std::vector<std::string> options = + { + "-I" + findNVRTCincludePath(), + "-std=c++11", + "-arch=compute_" + std::to_string(capability), + "-lineinfo", + "-DNR_BITS=" + std::to_string(nrBits), + "-DNR_RECEIVERS=" + std::to_string(nrReceivers), + "-DNR_CHANNELS=" + std::to_string(nrChannels), + "-DNR_SAMPLES_PER_CHANNEL=" + std::to_string(nrSamplesPerChannel), + "-DNR_POLARIZATIONS=" + std::to_string(nrPolarizations), + "-DNR_RECEIVERS_PER_BLOCK=" + std::to_string(nrReceiversPerBlock), + }; + + //std::for_each(options.begin(), options.end(), [] (const std::string &e) { std::cout << e << ' '; }); std::cout << std::endl; + +#if 0 + nvrtc::Program program("tcc/TCCorrelator.cu"); +#else + // embed the CUDA source code in libtcc.so, so that it need not be installed separately + // for runtime compilation + // copy into std::string for '\0' termination + std::string source(&_binary_libtcc_TCCorrelator_cu_start, &_binary_libtcc_TCCorrelator_cu_end); + nvrtc::Program program(source, "TCCorrelator.cu"); +#endif 
+ + try { + program.compile(options); + } catch (nvrtc::Error &error) { + std::cerr << program.getLog(); + throw; + } + + //std::ofstream cubin("out.ptx"); + //cubin << program.getPTX().data(); + return cu::Module((void *) program.getPTX().data()); +} + + +void Correlator::launchAsync(cu::Stream &stream, cu::DeviceMemory &visibilities, cu::DeviceMemory &samples) +{ + correlatorKernel.launchAsync(stream, visibilities, samples); +} + + +void Correlator::launchAsync(CUstream stream, CUdeviceptr visibilities, CUdeviceptr samples) +{ + cu::Stream _stream(stream); + cu::DeviceMemory _visibilities(visibilities); + cu::DeviceMemory _samples(samples); + correlatorKernel.launchAsync(_stream, _visibilities, _samples); +} + + +uint64_t Correlator::FLOPS() const +{ + return correlatorKernel.FLOPS(); +} + +} diff --git a/libtcc/Correlator.h b/libtcc/Correlator.h new file mode 100644 index 0000000000000000000000000000000000000000..723361f8109930ddf789c7e63a33d62a9c5d6d1e --- /dev/null +++ b/libtcc/Correlator.h @@ -0,0 +1,40 @@ +#if !defined CORRELATOR_H +#define CORRELATOR_H + +#include "libtcc/CorrelatorKernel.h" +#include "util/cu.h" +#include "util/nvrtc.h" + + +namespace tcc { + class Correlator { + public: + Correlator(unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations = 2, + unsigned nrReceiversPerBlock = 64 + ); // throw (cu::Error, nvrtc::Error) + + void launchAsync(cu::Stream &, cu::DeviceMemory &isibilities, cu::DeviceMemory &samples); // throw (cu::Error) + void launchAsync(CUstream, CUdeviceptr visibilities, CUdeviceptr samples); // throw (cu::Error) + + uint64_t FLOPS() const; + + private: + std::string findNVRTCincludePath() const; + cu::Module compileModule(unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations, + unsigned nrReceiversPerBlock + ); + + cu::Module correlatorModule; + CorrelatorKernel correlatorKernel; + }; +} + 
+#endif diff --git a/libtcc/CorrelatorKernel.cc b/libtcc/CorrelatorKernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..157281164b19f1d2294c2e8f7c741fdb470c8594 --- /dev/null +++ b/libtcc/CorrelatorKernel.cc @@ -0,0 +1,43 @@ +#include "libtcc/CorrelatorKernel.h" + + +namespace tcc { + +CorrelatorKernel::CorrelatorKernel(cu::Module &module, + unsigned nrBits, + unsigned nrReceivers, + unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations, + unsigned nrReceiversPerBlock + ) +: + Kernel(module, "correlate"), + nrBits(nrBits), + nrReceivers(nrReceivers), + nrChannels(nrChannels), + nrSamplesPerChannel(nrSamplesPerChannel), + nrPolarizations(nrPolarizations), + nrReceiversPerBlock(nrReceiversPerBlock) +{ + unsigned blocksPerDim = (nrReceivers + nrReceiversPerBlock - 1) / nrReceiversPerBlock; + nrThreadBlocksPerChannel = nrReceiversPerBlock == 64 ? blocksPerDim * blocksPerDim : blocksPerDim * (blocksPerDim + 1) / 2; +} + + +void CorrelatorKernel::launchAsync(cu::Stream &stream, cu::DeviceMemory &deviceVisibilities, cu::DeviceMemory &deviceSamples) +{ + std::vector<const void *> parameters = { deviceVisibilities.parameter(), deviceSamples.parameter() }; + stream.launchKernel(function, + nrThreadBlocksPerChannel, nrChannels, 1, + 32, 2, 2, + 0, parameters); +} + + +uint64_t CorrelatorKernel::FLOPS() const +{ + return 8ULL * nrReceivers * nrReceivers / 2 * nrPolarizations * nrPolarizations * nrChannels * nrSamplesPerChannel; +} + +} diff --git a/libtcc/CorrelatorKernel.h b/libtcc/CorrelatorKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..e28a2fdb197bcf08ff566331c6b2a1259dc7d60b --- /dev/null +++ b/libtcc/CorrelatorKernel.h @@ -0,0 +1,35 @@ +#if !defined CORRELATOR_KERNEL_H +#define CORRELATOR_KERNEL_H + +#include "libtcc/Kernel.h" + + +namespace tcc { + class CorrelatorKernel : public Kernel + { + public: + CorrelatorKernel(cu::Module &module, + unsigned nrBits, + unsigned nrReceivers, + 
unsigned nrChannels, + unsigned nrSamplesPerChannel, + unsigned nrPolarizations = 2, + unsigned nrReceiversPerBlock = 64 + ); + + void launchAsync(cu::Stream &, cu::DeviceMemory &deviceVisibilities, cu::DeviceMemory &deviceSamples); + + virtual uint64_t FLOPS() const; + + private: + const unsigned nrBits; + const unsigned nrReceivers; + const unsigned nrChannels; + const unsigned nrSamplesPerChannel; + const unsigned nrPolarizations; + const unsigned nrReceiversPerBlock; + unsigned nrThreadBlocksPerChannel; + }; +} + +#endif diff --git a/libtcc/Kernel.cc b/libtcc/Kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f3c1af54f4c902f2727d61415a0a49c9c6f8e1b --- /dev/null +++ b/libtcc/Kernel.cc @@ -0,0 +1,13 @@ +#include "libtcc/Kernel.h" + + +namespace tcc { + +Kernel::Kernel(cu::Module &module, const char *name) +: + module(module), + function(module, name) +{ +} + +} diff --git a/libtcc/Kernel.h b/libtcc/Kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f6c9ab410017c32c92fb3e2356a2546a94a76b49 --- /dev/null +++ b/libtcc/Kernel.h @@ -0,0 +1,23 @@ +#if !defined KERNEL_H +#define KERNEL_H + +#include "util/cu.h" + +#include <stdint.h> + + +namespace tcc { + class Kernel + { + public: + Kernel(cu::Module &, const char *name); + + virtual uint64_t FLOPS() const = 0; + + protected: + cu::Module &module; + cu::Function function; + }; +} + +#endif diff --git a/libtcc/TCCorrelator.cu b/libtcc/TCCorrelator.cu new file mode 100644 index 0000000000000000000000000000000000000000..0836edcbeb3e997616f1b1dc35414dea79a86276 --- /dev/null +++ b/libtcc/TCCorrelator.cu @@ -0,0 +1,596 @@ +#if 1000 * __CUDACC_VER_MAJOR__ + __CUDACC_VER_MINOR__ >= 11001 && __CUDA_ARCH__ >= 800 +#define ASYNC_COPIES +#endif + +#if defined ASYNC_COPIES +#include <cooperative_groups/memcpy_async.h> +#endif + +#include <mma.h> + +#define NR_BASELINES (NR_RECEIVERS * (NR_RECEIVERS + 1) / 2) +#define ALIGN(A,N) (((A)+(N)-1)/(N)*(N)) + +#define 
NR_TIMES_PER_BLOCK (128 / (NR_BITS)) +#define NR_RECEIVERS_PER_TCM_X ((NR_BITS) == 4 ? 2 : 4) +#define NR_RECEIVERS_PER_TCM_Y ((NR_BITS) == 4 ? 4 : 8) + +#define COMPLEX 2 + +#if __CUDA_ARCH__ < (NR_BITS == 4 ? 730 : NR_BITS == 8 ? 720 : NR_BITS == 16 ? 700 : 0) +#error this architecture has no suitable tensor cores +#endif + +#if __CUDA_ARCH__ != 700 && __CUDA_ARCH__ != 720 && __CUDA_ARCH__ != 750 && __CUDA_ARCH__ != 800 && __CUDA_ARCH__ != 860 +#define PORTABLE // unknown architecture -> write visibilities in portable way (via shared memory) +#endif + +#if NR_RECEIVERS_PER_BLOCK != 32 && NR_RECEIVERS_PER_BLOCK != 48 && NR_RECEIVERS_PER_BLOCK != 64 +#error unsupported NR_RECEIVERS_PER_BLOCK +#endif + +#if NR_SAMPLES_PER_CHANNEL % NR_TIMES_PER_BLOCK != 0 +#error NR_SAMPLES_PER_CHANNEL should be a multiple of NR_TIMES_PER_BLOCK +#endif + +#define MIN(A,B) ((A)<(B)?(A):(B)) + + +using namespace nvcuda::wmma; + +#if NR_BITS == 4 +typedef char Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef int2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#elif NR_BITS == 8 +typedef char2 Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef int2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#elif NR_BITS == 16 +typedef __half2 Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef float2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#endif + + +#if NR_BITS == 4 +typedef fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major> Afrag; +typedef fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major> Bfrag; +typedef fragment<accumulator, 8, 8, 32, int> Sum; +#elif NR_BITS == 8 +typedef fragment<matrix_a, 16, 16, 16, signed char, row_major> 
Afrag; +typedef fragment<matrix_b, 16, 16, 16, signed char, col_major> Bfrag; +typedef fragment<accumulator, 16, 16, 16, int> Sum; +#elif NR_BITS == 16 +typedef fragment<matrix_a, 16, 16, 16, __half, row_major> Afrag; +typedef fragment<matrix_b, 16, 16, 16, __half, col_major> Bfrag; +typedef fragment<accumulator, 16, 16, 16, float> Sum; +#endif + + +#if NR_BITS == 4 +typedef int2 ScratchSpace[4][NR_POLARIZATIONS][2][NR_POLARIZATIONS]; +#elif NR_BITS == 8 +typedef int2 ScratchSpace[8][NR_POLARIZATIONS][4][NR_POLARIZATIONS]; +#elif NR_BITS == 16 +typedef float2 ScratchSpace[8][NR_POLARIZATIONS][4][NR_POLARIZATIONS]; +#endif + + +__device__ inline int conj_perm(int v) +{ +#if NR_BITS == 4 + //return ((v & 0x0F0F0F0F) << 4) | (__vnegss4(v >> 4) & 0x0F0F0F0F); + return ((v & 0x0F0F0F0F) << 4) | ((0xF0F0F0F0 - ((v >> 4) & 0x0F0F0F0F)) & 0x0F0F0F0F); +#elif NR_BITS == 8 + //return __byte_perm(v, __vnegss4(v), 0x2705); + return __byte_perm(v, 0x00FF00FF - (v & 0xFF00FF00), 0x2705); +#elif NR_BITS == 16 + return __byte_perm(v ^ 0x80000000, v, 0x1032); +#endif +} + + +__device__ inline int2 conj_perm(int2 v) +{ + return make_int2(conj_perm(v.x), conj_perm(v.y)); +} + + +__device__ inline int4 conj_perm(int4 v) +{ + return make_int4(conj_perm(v.x), conj_perm(v.y), conj_perm(v.z), conj_perm(v.w)); +} + + +#if defined ASYNC_COPIES +#define READ_AHEAD MIN(2, NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) +#define NR_SHARED_BUFFERS MIN(4, NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) +#else +#define READ_AHEAD 1 +#define NR_SHARED_BUFFERS 2 +#endif + + +template <unsigned nrReceiversPerBlock = NR_RECEIVERS_PER_BLOCK> struct SharedData +{ +#if NR_BITS == 4 + typedef char Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][1]; + typedef char Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 16][1]; +#elif NR_BITS == 8 + typedef signed char 
Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][COMPLEX]; + typedef signed char Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 8][COMPLEX]; +#elif NR_BITS == 16 + typedef __half Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][COMPLEX]; + typedef __half Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 4][COMPLEX]; +#endif +}; + + +template <typename T> struct FetchData +{ + __device__ FetchData(unsigned loadRecv, unsigned loadPol, unsigned loadTime) + : + loadRecv(loadRecv), loadPol(loadPol), loadTime(loadTime), data({0}) + { + } + + __device__ void load(const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS) + //data = * (T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime]; + memcpy(&data, &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime], sizeof(T)); + } + + template <typename SharedData> __device__ void storeA(SharedData samples) const + { + //* ((T *) &samples[loadRecv][loadPol][loadTime][0]) = data; + memcpy(&samples[loadRecv][loadPol][loadTime][0], &data, sizeof(T)); + } + + template <typename SharedData> __device__ void storeB(SharedData samples) const + { + //* ((T *) &samples[loadRecv][loadPol][0][loadTime][0]) = data; + //* ((T *) &samples[loadRecv][loadPol][1][loadTime][0]) = conj_perm(data); + T tmp = conj_perm(data); + memcpy(&samples[loadRecv][loadPol][0][loadTime][0], &data, sizeof(T)); + memcpy(&samples[loadRecv][loadPol][1][loadTime][0], &tmp, sizeof(T)); + } + +#if defined ASYNC_COPIES + template <typename Asamples> __device__ void copyAsyncA(nvcuda::experimental::pipeline &pipe, Asamples dest, const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool 
skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS) + nvcuda::experimental::memcpy_async(* (T *) &dest[loadRecv][loadPol][loadTime][0], * (const T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime], pipe); + } + + template<typename Bsamples> __device__ void copyAsyncB(nvcuda::experimental::pipeline &pipe, Bsamples dest, const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS) + nvcuda::experimental::memcpy_async(* (T *) &dest[loadRecv][loadPol][0][loadTime][0], * (const T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime], pipe); + } + + template<typename Bsamples> __device__ void fixB(Bsamples bSamples) + { + //* ((T *) &bSamples[loadRecv][loadPol][1][loadTime][0]) = conj_perm(* ((T *) &bSamples[loadRecv][loadPol][0][loadTime][0])); + T tmp; + memcpy(&tmp, &bSamples[loadRecv][loadPol][0][loadTime][0], sizeof(T)); + tmp = conj_perm(tmp); + memcpy(&bSamples[loadRecv][loadPol][1][loadTime][0], &tmp, sizeof(T)); + } +#endif + + unsigned loadRecv, loadPol, loadTime; + T data; +}; + + +__device__ inline int2 make_complex(int real, int imag) +{ + return make_int2(real, imag); +} + + +__device__ inline float2 make_complex(float real, float imag) +{ + return make_float2(real, imag); +} + + +template <typename T> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned baseline, unsigned recvY, unsigned recvX, unsigned tcY, unsigned tcX, unsigned polY, unsigned polX, bool skipCheckY, bool skipCheckX, T sumR, T sumI) +{ + if ((skipCheckX || recvX + tcX <= recvY + tcY) && (skipCheckY || recvY + tcY < NR_RECEIVERS)) + visibilities[channel][baseline + tcY * recvY + tcY * (tcY + 1) / 2 + tcX][polY][polX] = make_complex(sumR, sumI); +} + + +__device__ inline void 
storeVisibilities(Visibilities visibilities, unsigned channel, unsigned firstReceiverY, unsigned firstReceiverX, unsigned recvYoffset, unsigned recvXoffset, unsigned y, unsigned x, bool skipCheckY, bool skipCheckX, const Sum &sum, ScratchSpace scratchSpace[], unsigned warp) +{ +#if defined PORTABLE + store_matrix_sync(&scratchSpace[warp][0][0][0][0].x, sum, NR_BITS == 4 ? 8 : 16, mem_row_major); + __syncwarp(); + +#if 0 + if (threadIdx.x == 0) + for (unsigned _y = 0; _y < 8; _y ++) + for (unsigned pol_y = 0; pol_y < NR_POLARIZATIONS; pol_y ++) + for (unsigned _x = 0; _x < 4; _x ++) + for (unsigned pol_x = 0; pol_x < NR_POLARIZATIONS; pol_x ++) + if (scratchSpace[warp][_y][pol_y][_x][pol_x],x != 0 || scratchSpace[warp][_y][pol_y][_x][pol_x].y != 0) + printf("firstY=%u firstX=%u warp=%u y=%u x=%u _y=%u pol_y=%u _x=%u pol_x=%u val=(%f,%f)\n", firstReceiverY, firstReceiverX, warp, y, x, _y, pol_y, _x, pol_x, scratchSpace[warp][_y][pol_y][_x][pol_x].x, scratchSpace[warp][_y][pol_y][_x][pol_x].y); +#endif + +#if NR_BITS == 4 + unsigned _y = threadIdx.x >> 3; + unsigned _x = (threadIdx.x >> 2) & 1; + unsigned polY = (threadIdx.x >> 1) & 1; + unsigned polX = threadIdx.x & 1; +#elif NR_BITS == 8 || NR_BITS == 16 + unsigned _y = threadIdx.x >> 2; + unsigned _x = threadIdx.x & 3; +#endif + + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + _y; + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + _x; + unsigned baseline = (recvY * (recvY + 1) / 2) + recvX; + + if ((skipCheckX || recvX <= recvY) && (skipCheckY || recvY < NR_RECEIVERS)) +#if NR_BITS == 4 + visibilities[channel][baseline][polY][polX] = scratchSpace[warp][_y][polY][_x][polX]; +#elif NR_BITS == 8 || NR_BITS == 16 + for (unsigned polY = 0; polY < NR_POLARIZATIONS; polY ++) + for (unsigned polX = 0; polX < NR_POLARIZATIONS; polX ++) + visibilities[channel][baseline][polY][polX] = scratchSpace[warp][_y][polY][_x][polX]; +#endif +#else +#if __CUDA_ARCH__ == 700 || 
(__CUDA_ARCH__ == 720 && NR_BITS == 16) + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 2) + (threadIdx.x & 4); + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 2) & 2); + unsigned polY = threadIdx.x & 1; + unsigned polX = (threadIdx.x >> 1) & 1; +#elif (__CUDA_ARCH__ == 720 && NR_BITS == 8) || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 860 + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 3); + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 1) & 1); + unsigned polY = (threadIdx.x >> 2) & 1; + unsigned polX = threadIdx.x & 1; +#endif + + unsigned baseline = (recvY * (recvY + 1) / 2) + recvX; + +#if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16) + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 1, polY, polX, skipCheckY, skipCheckX, sum.x[4], sum.x[5]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 1, 0, polY, polX, skipCheckY, skipCheckX, sum.x[2], sum.x[3]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 1, 1, polY, polX, skipCheckY, skipCheckX, sum.x[6], sum.x[7]); +#elif (__CUDA_ARCH__ == 720 && NR_BITS == 8) || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 860 + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); +#if NR_BITS == 8 || NR_BITS == 16 + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 2, polY, polX, skipCheckY, skipCheckX, sum.x[4], sum.x[5]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 4, 0, polY, polX, skipCheckY, skipCheckX, sum.x[2], sum.x[3]); + storeVisibility(visibilities, channel, baseline, recvY, 
recvX, 4, 2, polY, polX, skipCheckY, skipCheckX, sum.x[6], sum.x[7]); +#endif +#endif +#endif +} + + +#define NR_WARPS 4 + +#if NR_RECEIVERS_PER_BLOCK == 64 + +template <bool fullTriangle> __device__ void doCorrelateTriangle(Visibilities visibilities, const Samples samples, unsigned firstReceiver, unsigned warp, unsigned tid, SharedData<>::Bsamples &bSamples, ScratchSpace scratchSpace[NR_WARPS]) +{ + const unsigned nrFragmentsX = NR_BITS == 4 ? 12 : 6; + const unsigned nrFragmentsY = nrFragmentsX / 2; + Sum sum[nrFragmentsX * nrFragmentsY]; + + for (auto &s : sum) + fill_fragment(s, 0); + + unsigned channel = blockIdx.y; + + const uchar2 offsets[] = { + make_uchar2( 0, 0), + make_uchar2( 0, 16), + make_uchar2( 0, 40), + make_uchar2(24, 40), + }; + + unsigned recvXoffset = offsets[warp].x; + unsigned recvYoffset = offsets[warp].y; + + FetchData<int4> tmp0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + FetchData<int4> tmp1((tid >> 2) + NR_RECEIVERS_PER_BLOCK / 2, (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + +#if defined ASYNC_COPIES + using namespace nvcuda::experimental; + pipeline pipe; + + for (unsigned majorTime = 0; majorTime < READ_AHEAD; majorTime ++) { + unsigned fetchBuffer = majorTime; + + tmp0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiver, fullTriangle); + tmp1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiver, fullTriangle); + + pipe.commit(); + } +#else + tmp0.load(samples, channel, 0, firstReceiver, fullTriangle); + tmp1.load(samples, channel, 0, firstReceiver, fullTriangle); +#endif + + for (unsigned majorTime = 0; majorTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; majorTime ++) { + unsigned buffer = majorTime % NR_SHARED_BUFFERS; + +#if !defined ASYNC_COPIES + tmp0.storeB(bSamples[buffer]); + tmp1.storeB(bSamples[buffer]); +#endif + + unsigned majorReadTime = majorTime + READ_AHEAD; + + if (majorReadTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) { +#if defined 
ASYNC_COPIES + unsigned fetchBuffer = (buffer + READ_AHEAD) % NR_SHARED_BUFFERS; + + tmp0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiver, fullTriangle); + tmp1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiver, fullTriangle); +#else + tmp0.load(samples, channel, majorReadTime, firstReceiver, fullTriangle); + tmp1.load(samples, channel, majorReadTime, firstReceiver, fullTriangle); +#endif + } + +#if defined ASYNC_COPIES + pipe.commit(); + pipe.wait_prior<READ_AHEAD>(); + + tmp0.fixB(bSamples[buffer]); + tmp1.fixB(bSamples[buffer]); +#endif + + __syncthreads(); + +#pragma unroll + for (unsigned minorTime = 0; minorTime < NR_TIMES_PER_BLOCK; minorTime += ((NR_BITS) == 4 ? 16 : 8)) { + Afrag aFrag[nrFragmentsY]; + Bfrag bFrag[nrFragmentsX]; + + if (warp != 0) { + for (unsigned y = 0; y < nrFragmentsY; y ++) + load_matrix_sync(aFrag[y], &bSamples[buffer][recvYoffset + NR_RECEIVERS_PER_TCM_Y * y][0][0][minorTime][0], sizeof(bSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < nrFragmentsX; x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][recvXoffset + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0, i = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++, i ++) + mma_sync(sum[i], aFrag[y], bFrag[x], sum[i]); + } else { + for (unsigned z = 0, i = 0; z < 3; z ++) { + for (unsigned y = 0; y < (NR_BITS == 4 ? 4 : 2); y ++) + load_matrix_sync(aFrag[y], &bSamples[buffer][/*recvYoffset*/ 24 * z + NR_RECEIVERS_PER_TCM_Y * y][0][0][minorTime][0], sizeof(bSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < (NR_BITS == 4 ? 8 : 4); x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][/*recvXoffset*/ 24 * z + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0; y < (NR_BITS == 4 ? 
4 : 2); y ++) + for (unsigned x = 0; x < 2 + 2 * y; x ++, i ++) + mma_sync(sum[i], aFrag[y], bFrag[x], sum[i]); + } + } + } + } + +#if defined PORTABLE + __syncthreads(); +#endif + + if (warp != 0) + for (unsigned y = 0, i = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++, i ++) + storeVisibilities(visibilities, channel, firstReceiver, firstReceiver, recvYoffset, recvXoffset, y, x, fullTriangle, x < 2 * y + (NR_BITS == 4 ? 8 : 4), sum[i], scratchSpace, warp); + else + for (unsigned z = 0, i = 0; z < 3; z ++) + for (unsigned y = 0; y < (NR_BITS == 4 ? 4 : 2); y ++) + for (unsigned x = 0; x < 2 * y + 2; x ++, i ++) + storeVisibilities(visibilities, channel, firstReceiver, firstReceiver, 24 * z, 24 * z, y, x, fullTriangle, x < 2 * y, sum[i], scratchSpace, warp); +} + +#endif + + +template <unsigned nrFragmentsY, bool skipLoadYcheck, bool skipLoadXcheck, bool skipStoreYcheck, bool skipStoreXcheck> __device__ void doCorrelateRectangle(Visibilities visibilities, const Samples samples, unsigned firstReceiverY, unsigned firstReceiverX, SharedData<>::Asamples &aSamples, SharedData<NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK>::Bsamples &bSamples, ScratchSpace scratchSpace[NR_WARPS]) +{ + const unsigned nrFragmentsX = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_X / 2 / (NR_RECEIVERS_PER_BLOCK == 64 ? 
2 : 1); + + Sum sum[nrFragmentsY][nrFragmentsX]; + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + fill_fragment(sum[y][x], 0); + + unsigned tid = warpSize * (blockDim.y * threadIdx.z + threadIdx.y) + threadIdx.x; + unsigned channel = blockIdx.y; + + unsigned recvXoffset = nrFragmentsX * NR_RECEIVERS_PER_TCM_X * threadIdx.y; + unsigned recvYoffset = nrFragmentsY * NR_RECEIVERS_PER_TCM_Y * threadIdx.z; + + FetchData<int4> tmpY0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + FetchData<int4> tmpX0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); +#if NR_RECEIVERS_PER_BLOCK == 48 + FetchData<int2> tmpY1((tid >> 3) + 32, (tid >> 2) & 1, 32 / NR_BITS * (tid & 3)); + FetchData<int2> tmpX1((tid >> 3) + 32, (tid >> 2) & 1, 32 / NR_BITS * (tid & 3)); +#elif NR_RECEIVERS_PER_BLOCK == 64 + FetchData<int4> tmpY1((tid >> 2) + 32, (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); +#endif + +#if defined ASYNC_COPIES + using namespace nvcuda::experimental; + pipeline pipe; + + for (unsigned majorTime = 0; majorTime < READ_AHEAD; majorTime ++) { + unsigned fetchBuffer = majorTime; + + tmpY0.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiverX, skipLoadXcheck); +#endif + + pipe.commit(); + } +#else + tmpY0.load(samples, channel, 0, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.load(samples, channel, 0, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.load(samples, channel, 0, firstReceiverX, 
skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.load(samples, channel, 0, firstReceiverX, skipLoadXcheck); +#endif +#endif + + for (unsigned majorTime = 0; majorTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; majorTime ++) { + unsigned buffer = majorTime % NR_SHARED_BUFFERS; + +#if !defined ASYNC_COPIES + tmpY0.storeA(aSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.storeA(aSamples[buffer]); +#endif + tmpX0.storeB(bSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.storeB(bSamples[buffer]); +#endif +#endif + + unsigned majorReadTime = majorTime + READ_AHEAD; + + if (majorReadTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) { +#if defined ASYNC_COPIES + unsigned fetchBuffer = (buffer + READ_AHEAD) % NR_SHARED_BUFFERS; + + tmpY0.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#endif +#else + tmpY0.load(samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.load(samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.load(samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.load(samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#endif +#endif + } + +#if defined ASYNC_COPIES + pipe.commit(); + pipe.wait_prior<READ_AHEAD>(); + + tmpX0.fixB(bSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 + 
tmpX1.fixB(bSamples[buffer]); +#endif +#endif + + __syncthreads(); + +#pragma unroll + for (unsigned minorTime = 0; minorTime < NR_TIMES_PER_BLOCK; minorTime += ((NR_BITS) == 4 ? 16 : 8)) { + Afrag aFrag[nrFragmentsY]; + Bfrag bFrag[nrFragmentsX]; + + for (unsigned y = 0; y < nrFragmentsY; y ++) + load_matrix_sync(aFrag[y], &aSamples[buffer][recvYoffset + NR_RECEIVERS_PER_TCM_Y * y][0][minorTime][0], sizeof(aSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < nrFragmentsX; x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][recvXoffset + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + mma_sync(sum[y][x], aFrag[y], bFrag[x], sum[y][x]); + } + } + +#if 0 + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + for (unsigned i = 0; i < sum[0][0].num_storage_elements; i ++) + if (sum[y][x].x[i] != 0) +#if NR_BITS == 4 || NR_BITS == 8 + printf("blockIdx=(%d,%d,%d) tid=%u y=%u x=%u i=%u v=%d\n", blockIdx.x, blockIdx.y, blockIdx.z, tid, y, x, i, sum[y][x].x[i]); +#else + printf("blockIdx=(%d,%d,%d) tid=%u y=%u x=%u i=%u v=%f\n", blockIdx.x, blockIdx.y, blockIdx.z, tid, y, x, i, sum[y][x].x[i]); +#endif +#endif + +#if defined PORTABLE + __syncthreads(); +#endif + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + storeVisibilities(visibilities, channel, firstReceiverY, firstReceiverX, recvYoffset, recvXoffset, y, x, skipStoreYcheck, skipStoreXcheck, sum[y][x], scratchSpace, tid / warpSize); +} + + +extern "C" __global__ +__launch_bounds__(NR_WARPS * 32, NR_RECEIVERS_PER_BLOCK == 32 ? 
4 : 2) +void correlate(Visibilities visibilities, const Samples samples) +{ + const unsigned nrFragmentsY = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_Y / 2; + + unsigned block = blockIdx.x; + +#if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 + unsigned blockY = (unsigned) (sqrtf(8 * block + 1) - .99999f) / 2; + unsigned blockX = block - blockY * (blockY + 1) / 2; + unsigned firstReceiverX = blockX * NR_RECEIVERS_PER_BLOCK; +#elif NR_RECEIVERS_PER_BLOCK == 64 + unsigned blockY = (unsigned) sqrtf(block); + unsigned blockX = block - blockY * blockY; + unsigned firstReceiverX = blockX * (NR_RECEIVERS_PER_BLOCK / 2); +#endif + unsigned firstReceiverY = blockY * NR_RECEIVERS_PER_BLOCK; + + union shared { + struct { + SharedData<>::Asamples aSamples; + SharedData<NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK>::Bsamples bSamples; + } rectangle; + struct { + SharedData<>::Bsamples samples; + } triangle; + ScratchSpace scratchSpace[NR_WARPS]; + }; + + // the following hack is necessary to run the correlator in the OpenCL environment, + // as the maximum local memory size is 48K - 16 bytes. Due to padding in bSamples, + // the last 16 bytes are not used, so allocate 16 fewer bytes. 
+ __shared__ char rawbuffer[sizeof(union shared) - 16] __attribute__((aligned(16))); + union shared &u = (union shared &) rawbuffer; + + if (firstReceiverX == firstReceiverY) +#if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 + doCorrelateRectangle<nrFragmentsY, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, false>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +#elif NR_RECEIVERS_PER_BLOCK == 64 + if (NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK != 0 && (NR_RECEIVERS < NR_RECEIVERS_PER_BLOCK || firstReceiverX >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK * NR_RECEIVERS_PER_BLOCK)) + doCorrelateTriangle<false>(visibilities, samples, firstReceiverX, 2 * threadIdx.z + threadIdx.y, 64 * threadIdx.z + 32 * threadIdx.y + threadIdx.x, u.triangle.samples, u.scratchSpace); + else + doCorrelateTriangle<true>(visibilities, samples, firstReceiverX, 2 * threadIdx.z + threadIdx.y, 64 * threadIdx.z + 32 * threadIdx.y + threadIdx.x, u.triangle.samples, u.scratchSpace); +#endif +#if NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK != 0 + else if (NR_RECEIVERS < NR_RECEIVERS_PER_BLOCK || firstReceiverY >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK * NR_RECEIVERS_PER_BLOCK) + doCorrelateRectangle<(NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK + 2 * NR_RECEIVERS_PER_TCM_Y - 1) / NR_RECEIVERS_PER_TCM_Y / 2, false, true, NR_RECEIVERS % (2 * NR_RECEIVERS_PER_TCM_Y) == 0, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +#endif + else + doCorrelateRectangle<nrFragmentsY, true, true, true, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +} diff --git a/libtcc/TCCorrelator.cu.o b/libtcc/TCCorrelator.cu.o new file mode 100644 index 0000000000000000000000000000000000000000..943a2da3a75846584b95962314b91f4161f2bc5a 
Binary files /dev/null and b/libtcc/TCCorrelator.cu.o differ diff --git a/libtcc/TCCorrelator.cu.orig b/libtcc/TCCorrelator.cu.orig new file mode 100644 index 0000000000000000000000000000000000000000..ddf921cc71b295c3c8749c1bb2fe22624e857f12 --- /dev/null +++ b/libtcc/TCCorrelator.cu.orig @@ -0,0 +1,587 @@ +#if 1000 * __CUDACC_VER_MAJOR__ + __CUDACC_VER_MINOR__ >= 11001 && __CUDA_ARCH__ >= 800 +#define ASYNC_COPIES +#endif + +#if defined ASYNC_COPIES +#include <cooperative_groups/memcpy_async.h> +#endif + +#include <mma.h> + +#define NR_BASELINES (NR_RECEIVERS * (NR_RECEIVERS + 1) / 2) +#define ALIGN(A,N) (((A)+(N)-1)/(N)*(N)) + +#define NR_TIMES_PER_BLOCK (128 / (NR_BITS)) +#define NR_RECEIVERS_PER_TCM_X ((NR_BITS) == 4 ? 2 : 4) +#define NR_RECEIVERS_PER_TCM_Y ((NR_BITS) == 4 ? 4 : 8) + +#define COMPLEX 2 + +#if __CUDA_ARCH__ < (NR_BITS == 4 ? 730 : NR_BITS == 8 ? 720 : NR_BITS == 16 ? 700 : 0) +#error this architecture has no suitable tensor cores +#endif + +#if __CUDA_ARCH__ != 700 && __CUDA_ARCH__ != 720 && __CUDA_ARCH__ != 750 && __CUDA_ARCH__ != 800 && __CUDA_ARCH__ != 860 +#define PORTABLE // unknown architecture -> write visibilities in portable way (via shared memory) +#endif + +#if NR_RECEIVERS_PER_BLOCK != 32 && NR_RECEIVERS_PER_BLOCK != 48 && NR_RECEIVERS_PER_BLOCK != 64 +#error unsupported NR_RECEIVERS_PER_BLOCK +#endif + +#if NR_SAMPLES_PER_CHANNEL % NR_TIMES_PER_BLOCK != 0 +#error NR_SAMPLES_PER_CHANNEL should be a multiple of NR_TIMES_PER_BLOCK +#endif + +#define MIN(A,B) ((A)<(B)?(A):(B)) + + +using namespace nvcuda::wmma; + +#if NR_BITS == 4 +typedef char Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef int2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#elif NR_BITS == 8 +typedef char2 Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef int2 
Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#elif NR_BITS == 16 +typedef __half2 Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef float2 Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#endif + + +#if NR_BITS == 4 +typedef fragment<matrix_a, 8, 8, 32, experimental::precision::s4, row_major> Afrag; +typedef fragment<matrix_b, 8, 8, 32, experimental::precision::s4, col_major> Bfrag; +typedef fragment<accumulator, 8, 8, 32, int> Sum; +#elif NR_BITS == 8 +typedef fragment<matrix_a, 16, 16, 16, signed char, row_major> Afrag; +typedef fragment<matrix_b, 16, 16, 16, signed char, col_major> Bfrag; +typedef fragment<accumulator, 16, 16, 16, int> Sum; +#elif NR_BITS == 16 +typedef fragment<matrix_a, 16, 16, 16, __half, row_major> Afrag; +typedef fragment<matrix_b, 16, 16, 16, __half, col_major> Bfrag; +typedef fragment<accumulator, 16, 16, 16, float> Sum; +#endif + + +#if NR_BITS == 4 +typedef int2 ScratchSpace[4][NR_POLARIZATIONS][2][NR_POLARIZATIONS]; +#elif NR_BITS == 8 +typedef int2 ScratchSpace[8][NR_POLARIZATIONS][4][NR_POLARIZATIONS]; +#elif NR_BITS == 16 +typedef float2 ScratchSpace[8][NR_POLARIZATIONS][4][NR_POLARIZATIONS]; +#endif + + +__device__ inline int conj_perm(int v) +{ +#if NR_BITS == 4 + //return ((v & 0x0F0F0F0F) << 4) | (__vnegss4(v >> 4) & 0x0F0F0F0F); + return ((v & 0x0F0F0F0F) << 4) | ((0xF0F0F0F0 - ((v >> 4) & 0x0F0F0F0F)) & 0x0F0F0F0F); +#elif NR_BITS == 8 + //return __byte_perm(v, __vnegss4(v), 0x2705); + return __byte_perm(v, 0x00FF00FF - (v & 0xFF00FF00), 0x2705); +#elif NR_BITS == 16 + return __byte_perm(v ^ 0x80000000, v, 0x1032); +#endif +} + + +__device__ inline int2 conj_perm(int2 v) +{ + return make_int2(conj_perm(v.x), conj_perm(v.y)); +} + + +__device__ inline int4 conj_perm(int4 v) +{ + return make_int4(conj_perm(v.x), conj_perm(v.y), conj_perm(v.z), conj_perm(v.w)); +} + + +#if defined 
ASYNC_COPIES +#define READ_AHEAD MIN(2, NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) +#define NR_SHARED_BUFFERS MIN(4, NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) +#else +#define READ_AHEAD 1 +#define NR_SHARED_BUFFERS 2 +#endif + + +template <unsigned nrReceiversPerBlock = NR_RECEIVERS_PER_BLOCK> struct SharedData +{ +#if NR_BITS == 4 + typedef char Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][1]; + typedef char Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 16][1]; +#elif NR_BITS == 8 + typedef signed char Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][COMPLEX]; + typedef signed char Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 8][COMPLEX]; +#elif NR_BITS == 16 + typedef __half Asamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK][COMPLEX]; + typedef __half Bsamples[NR_SHARED_BUFFERS][nrReceiversPerBlock][NR_POLARIZATIONS][COMPLEX][NR_TIMES_PER_BLOCK + 4][COMPLEX]; +#endif +}; + + +template <typename T> struct FetchData +{ + __device__ FetchData(unsigned loadRecv, unsigned loadPol, unsigned loadTime) + : + loadRecv(loadRecv), loadPol(loadPol), loadTime(loadTime), data({0}) + { + } + + __device__ void load(const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS) + data = * (T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime]; + } + + template <typename SharedData> __device__ void storeA(SharedData samples) const + { + * ((T *) &samples[loadRecv][loadPol][loadTime][0]) = data; + } + + template <typename SharedData> __device__ void storeB(SharedData samples) const + { + * ((T *) &samples[loadRecv][loadPol][0][loadTime][0]) = data; + * ((T *) 
&samples[loadRecv][loadPol][1][loadTime][0]) = conj_perm(data); + } + +#if defined ASYNC_COPIES + template <typename Asamples> __device__ void copyAsyncA(nvcuda::experimental::pipeline &pipe, Asamples dest, const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS) + nvcuda::experimental::memcpy_async(* (T *) &dest[loadRecv][loadPol][loadTime][0], * (const T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime], pipe); + } + + template<typename Bsamples> __device__ void copyAsyncB(nvcuda::experimental::pipeline &pipe, Bsamples dest, const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0) + { + if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS) + nvcuda::experimental::memcpy_async(* (T *) &dest[loadRecv][loadPol][0][loadTime][0], * (const T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime], pipe); + } + + template<typename Bsamples> __device__ void fixB(Bsamples bSamples) + { + * ((T *) &bSamples[loadRecv][loadPol][1][loadTime][0]) = conj_perm(* ((T *) &bSamples[loadRecv][loadPol][0][loadTime][0])); + } +#endif + + unsigned loadRecv, loadPol, loadTime; + T data; +}; + + +__device__ inline int2 make_complex(int real, int imag) +{ + return make_int2(real, imag); +} + + +__device__ inline float2 make_complex(float real, float imag) +{ + return make_float2(real, imag); +} + + +template <typename T> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned baseline, unsigned recvY, unsigned recvX, unsigned tcY, unsigned tcX, unsigned polY, unsigned polX, bool skipCheckY, bool skipCheckX, T sumR, T sumI) +{ + if ((skipCheckX || recvX + tcX <= recvY + tcY) && (skipCheckY || recvY + tcY < NR_RECEIVERS)) + visibilities[channel][baseline + tcY * recvY + 
tcY * (tcY + 1) / 2 + tcX][polY][polX] = make_complex(sumR, sumI); +} + + +__device__ inline void storeVisibilities(Visibilities visibilities, unsigned channel, unsigned firstReceiverY, unsigned firstReceiverX, unsigned recvYoffset, unsigned recvXoffset, unsigned y, unsigned x, bool skipCheckY, bool skipCheckX, const Sum &sum, ScratchSpace scratchSpace[], unsigned warp) +{ +#if defined PORTABLE + store_matrix_sync(&scratchSpace[warp][0][0][0][0].x, sum, NR_BITS == 4 ? 8 : 16, mem_row_major); + __syncwarp(); + +#if 0 + if (threadIdx.x == 0) + for (unsigned _y = 0; _y < 8; _y ++) + for (unsigned pol_y = 0; pol_y < NR_POLARIZATIONS; pol_y ++) + for (unsigned _x = 0; _x < 4; _x ++) + for (unsigned pol_x = 0; pol_x < NR_POLARIZATIONS; pol_x ++) + if (scratchSpace[warp][_y][pol_y][_x][pol_x].x != 0 || scratchSpace[warp][_y][pol_y][_x][pol_x].y != 0) + printf("firstY=%u firstX=%u warp=%u y=%u x=%u _y=%u pol_y=%u _x=%u pol_x=%u val=(%f,%f)\n", firstReceiverY, firstReceiverX, warp, y, x, _y, pol_y, _x, pol_x, scratchSpace[warp][_y][pol_y][_x][pol_x].x, scratchSpace[warp][_y][pol_y][_x][pol_x].y); +#endif + +#if NR_BITS == 4 + unsigned _y = threadIdx.x >> 3; + unsigned _x = (threadIdx.x >> 2) & 1; + unsigned polY = (threadIdx.x >> 1) & 1; + unsigned polX = threadIdx.x & 1; +#elif NR_BITS == 8 || NR_BITS == 16 + unsigned _y = threadIdx.x >> 2; + unsigned _x = threadIdx.x & 3; +#endif + + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + _y; + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + _x; + unsigned baseline = (recvY * (recvY + 1) / 2) + recvX; + + if ((skipCheckX || recvX <= recvY) && (skipCheckY || recvY < NR_RECEIVERS)) +#if NR_BITS == 4 + visibilities[channel][baseline][polY][polX] = scratchSpace[warp][_y][polY][_x][polX]; +#elif NR_BITS == 8 || NR_BITS == 16 + for (unsigned polY = 0; polY < NR_POLARIZATIONS; polY ++) + for (unsigned polX = 0; polX < NR_POLARIZATIONS; polX ++) + 
visibilities[channel][baseline][polY][polX] = scratchSpace[warp][_y][polY][_x][polX]; +#endif +#else +#if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16) + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 2) + (threadIdx.x & 4); + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 2) & 2); + unsigned polY = threadIdx.x & 1; + unsigned polX = (threadIdx.x >> 1) & 1; +#elif (__CUDA_ARCH__ == 720 && NR_BITS == 8) || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 860 + unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 3); + unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 1) & 1); + unsigned polY = (threadIdx.x >> 2) & 1; + unsigned polX = threadIdx.x & 1; +#endif + + unsigned baseline = (recvY * (recvY + 1) / 2) + recvX; + +#if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16) + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 1, polY, polX, skipCheckY, skipCheckX, sum.x[4], sum.x[5]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 1, 0, polY, polX, skipCheckY, skipCheckX, sum.x[2], sum.x[3]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 1, 1, polY, polX, skipCheckY, skipCheckX, sum.x[6], sum.x[7]); +#elif (__CUDA_ARCH__ == 720 && NR_BITS == 8) || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 860 + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); +#if NR_BITS == 8 || NR_BITS == 16 + storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 2, polY, polX, skipCheckY, skipCheckX, sum.x[4], sum.x[5]); + storeVisibility(visibilities, channel, baseline, recvY, 
recvX, 4, 0, polY, polX, skipCheckY, skipCheckX, sum.x[2], sum.x[3]); + storeVisibility(visibilities, channel, baseline, recvY, recvX, 4, 2, polY, polX, skipCheckY, skipCheckX, sum.x[6], sum.x[7]); +#endif +#endif +#endif +} + + +#define NR_WARPS 4 + +#if NR_RECEIVERS_PER_BLOCK == 64 + +template <bool fullTriangle> __device__ void doCorrelateTriangle(Visibilities visibilities, const Samples samples, unsigned firstReceiver, unsigned warp, unsigned tid, SharedData<>::Bsamples &bSamples, ScratchSpace scratchSpace[NR_WARPS]) +{ + const unsigned nrFragmentsX = NR_BITS == 4 ? 12 : 6; + const unsigned nrFragmentsY = nrFragmentsX / 2; + Sum sum[nrFragmentsX * nrFragmentsY]; + + for (auto &s : sum) + fill_fragment(s, 0); + + unsigned channel = blockIdx.y; + + const uchar2 offsets[] = { + make_uchar2( 0, 0), + make_uchar2( 0, 16), + make_uchar2( 0, 40), + make_uchar2(24, 40), + }; + + unsigned recvXoffset = offsets[warp].x; + unsigned recvYoffset = offsets[warp].y; + + FetchData<int4> tmp0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + FetchData<int4> tmp1((tid >> 2) + NR_RECEIVERS_PER_BLOCK / 2, (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + +#if defined ASYNC_COPIES + using namespace nvcuda::experimental; + pipeline pipe; + + for (unsigned majorTime = 0; majorTime < READ_AHEAD; majorTime ++) { + unsigned fetchBuffer = majorTime; + + tmp0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiver, fullTriangle); + tmp1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiver, fullTriangle); + + pipe.commit(); + } +#else + tmp0.load(samples, channel, 0, firstReceiver, fullTriangle); + tmp1.load(samples, channel, 0, firstReceiver, fullTriangle); +#endif + + for (unsigned majorTime = 0; majorTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; majorTime ++) { + unsigned buffer = majorTime % NR_SHARED_BUFFERS; + +#if !defined ASYNC_COPIES + tmp0.storeB(bSamples[buffer]); + tmp1.storeB(bSamples[buffer]); +#endif + + 
unsigned majorReadTime = majorTime + READ_AHEAD; + + if (majorReadTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) { +#if defined ASYNC_COPIES + unsigned fetchBuffer = (buffer + READ_AHEAD) % NR_SHARED_BUFFERS; + + tmp0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiver, fullTriangle); + tmp1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiver, fullTriangle); +#else + tmp0.load(samples, channel, majorReadTime, firstReceiver, fullTriangle); + tmp1.load(samples, channel, majorReadTime, firstReceiver, fullTriangle); +#endif + } + +#if defined ASYNC_COPIES + pipe.commit(); + pipe.wait_prior<READ_AHEAD>(); + + tmp0.fixB(bSamples[buffer]); + tmp1.fixB(bSamples[buffer]); +#endif + + __syncthreads(); + +#pragma unroll + for (unsigned minorTime = 0; minorTime < NR_TIMES_PER_BLOCK; minorTime += ((NR_BITS) == 4 ? 16 : 8)) { + Afrag aFrag[nrFragmentsY]; + Bfrag bFrag[nrFragmentsX]; + + if (warp != 0) { + for (unsigned y = 0; y < nrFragmentsY; y ++) + load_matrix_sync(aFrag[y], &bSamples[buffer][recvYoffset + NR_RECEIVERS_PER_TCM_Y * y][0][0][minorTime][0], sizeof(bSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < nrFragmentsX; x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][recvXoffset + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0, i = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++, i ++) + mma_sync(sum[i], aFrag[y], bFrag[x], sum[i]); + } else { + for (unsigned z = 0, i = 0; z < 3; z ++) { + for (unsigned y = 0; y < (NR_BITS == 4 ? 4 : 2); y ++) + load_matrix_sync(aFrag[y], &bSamples[buffer][/*recvYoffset*/ 24 * z + NR_RECEIVERS_PER_TCM_Y * y][0][0][minorTime][0], sizeof(bSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < (NR_BITS == 4 ? 
8 : 4); x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][/*recvXoffset*/ 24 * z + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0; y < (NR_BITS == 4 ? 4 : 2); y ++) + for (unsigned x = 0; x < 2 + 2 * y; x ++, i ++) + mma_sync(sum[i], aFrag[y], bFrag[x], sum[i]); + } + } + } + } + +#if defined PORTABLE + __syncthreads(); +#endif + + if (warp != 0) + for (unsigned y = 0, i = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++, i ++) + storeVisibilities(visibilities, channel, firstReceiver, firstReceiver, recvYoffset, recvXoffset, y, x, fullTriangle, x < 2 * y + (NR_BITS == 4 ? 8 : 4), sum[i], scratchSpace, warp); + else + for (unsigned z = 0, i = 0; z < 3; z ++) + for (unsigned y = 0; y < (NR_BITS == 4 ? 4 : 2); y ++) + for (unsigned x = 0; x < 2 * y + 2; x ++, i ++) + storeVisibilities(visibilities, channel, firstReceiver, firstReceiver, 24 * z, 24 * z, y, x, fullTriangle, x < 2 * y, sum[i], scratchSpace, warp); +} + +#endif + + +template <unsigned nrFragmentsY, bool skipLoadYcheck, bool skipLoadXcheck, bool skipStoreYcheck, bool skipStoreXcheck> __device__ void doCorrelateRectangle(Visibilities visibilities, const Samples samples, unsigned firstReceiverY, unsigned firstReceiverX, SharedData<>::Asamples &aSamples, SharedData<NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK>::Bsamples &bSamples, ScratchSpace scratchSpace[NR_WARPS]) +{ + const unsigned nrFragmentsX = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_X / 2 / (NR_RECEIVERS_PER_BLOCK == 64 ? 
2 : 1); + + Sum sum[nrFragmentsY][nrFragmentsX]; + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + fill_fragment(sum[y][x], 0); + + unsigned tid = warpSize * (blockDim.y * threadIdx.z + threadIdx.y) + threadIdx.x; + unsigned channel = blockIdx.y; + + unsigned recvXoffset = nrFragmentsX * NR_RECEIVERS_PER_TCM_X * threadIdx.y; + unsigned recvYoffset = nrFragmentsY * NR_RECEIVERS_PER_TCM_Y * threadIdx.z; + + FetchData<int4> tmpY0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); + FetchData<int4> tmpX0((tid >> 2) , (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); +#if NR_RECEIVERS_PER_BLOCK == 48 + FetchData<int2> tmpY1((tid >> 3) + 32, (tid >> 2) & 1, 32 / NR_BITS * (tid & 3)); + FetchData<int2> tmpX1((tid >> 3) + 32, (tid >> 2) & 1, 32 / NR_BITS * (tid & 3)); +#elif NR_RECEIVERS_PER_BLOCK == 64 + FetchData<int4> tmpY1((tid >> 2) + 32, (tid >> 1) & 1, 64 / NR_BITS * (tid & 1)); +#endif + +#if defined ASYNC_COPIES + using namespace nvcuda::experimental; + pipeline pipe; + + for (unsigned majorTime = 0; majorTime < READ_AHEAD; majorTime ++) { + unsigned fetchBuffer = majorTime; + + tmpY0.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorTime, firstReceiverX, skipLoadXcheck); +#endif + + pipe.commit(); + } +#else + tmpY0.load(samples, channel, 0, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.load(samples, channel, 0, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.load(samples, channel, 0, firstReceiverX, 
skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.load(samples, channel, 0, firstReceiverX, skipLoadXcheck); +#endif +#endif + + for (unsigned majorTime = 0; majorTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; majorTime ++) { + unsigned buffer = majorTime % NR_SHARED_BUFFERS; + +#if !defined ASYNC_COPIES + tmpY0.storeA(aSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.storeA(aSamples[buffer]); +#endif + tmpX0.storeB(bSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.storeB(bSamples[buffer]); +#endif +#endif + + unsigned majorReadTime = majorTime + READ_AHEAD; + + if (majorReadTime < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK) { +#if defined ASYNC_COPIES + unsigned fetchBuffer = (buffer + READ_AHEAD) % NR_SHARED_BUFFERS; + + tmpY0.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.copyAsyncA(pipe, aSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.copyAsyncB(pipe, bSamples[fetchBuffer], samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#endif +#else + tmpY0.load(samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 || NR_RECEIVERS_PER_BLOCK == 64 + tmpY1.load(samples, channel, majorReadTime, firstReceiverY, skipLoadYcheck); +#endif + tmpX0.load(samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#if NR_RECEIVERS_PER_BLOCK == 48 + tmpX1.load(samples, channel, majorReadTime, firstReceiverX, skipLoadXcheck); +#endif +#endif + } + +#if defined ASYNC_COPIES + pipe.commit(); + pipe.wait_prior<READ_AHEAD>(); + + tmpX0.fixB(bSamples[buffer]); +#if NR_RECEIVERS_PER_BLOCK == 48 + 
tmpX1.fixB(bSamples[buffer]); +#endif +#endif + + __syncthreads(); + +#pragma unroll + for (unsigned minorTime = 0; minorTime < NR_TIMES_PER_BLOCK; minorTime += ((NR_BITS) == 4 ? 16 : 8)) { + Afrag aFrag[nrFragmentsY]; + Bfrag bFrag[nrFragmentsX]; + + for (unsigned y = 0; y < nrFragmentsY; y ++) + load_matrix_sync(aFrag[y], &aSamples[buffer][recvYoffset + NR_RECEIVERS_PER_TCM_Y * y][0][minorTime][0], sizeof(aSamples[0][0][0]) * 8 / NR_BITS); + + for (unsigned x = 0; x < nrFragmentsX; x ++) + load_matrix_sync(bFrag[x], &bSamples[buffer][recvXoffset + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS); + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + mma_sync(sum[y][x], aFrag[y], bFrag[x], sum[y][x]); + } + } + +#if 0 + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + for (unsigned i = 0; i < sum[0][0].num_storage_elements; i ++) + if (sum[y][x].x[i] != 0) +#if NR_BITS == 4 || NR_BITS == 8 + printf("blockIdx=(%d,%d,%d) tid=%u y=%u x=%u i=%u v=%d\n", blockIdx.x, blockIdx.y, blockIdx.z, tid, y, x, i, sum[y][x].x[i]); +#else + printf("blockIdx=(%d,%d,%d) tid=%u y=%u x=%u i=%u v=%f\n", blockIdx.x, blockIdx.y, blockIdx.z, tid, y, x, i, sum[y][x].x[i]); +#endif +#endif + +#if defined PORTABLE + __syncthreads(); +#endif + + for (unsigned y = 0; y < nrFragmentsY; y ++) + for (unsigned x = 0; x < nrFragmentsX; x ++) + storeVisibilities(visibilities, channel, firstReceiverY, firstReceiverX, recvYoffset, recvXoffset, y, x, skipStoreYcheck, skipStoreXcheck, sum[y][x], scratchSpace, tid / warpSize); +} + + +extern "C" __global__ +__launch_bounds__(NR_WARPS * 32, NR_RECEIVERS_PER_BLOCK == 32 ? 
4 : 2) +void correlate(Visibilities visibilities, const Samples samples) +{ + const unsigned nrFragmentsY = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_Y / 2; + + unsigned block = blockIdx.x; + +#if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 + unsigned blockY = (unsigned) (sqrtf(8 * block + 1) - .99999f) / 2; + unsigned blockX = block - blockY * (blockY + 1) / 2; + unsigned firstReceiverX = blockX * NR_RECEIVERS_PER_BLOCK; +#elif NR_RECEIVERS_PER_BLOCK == 64 + unsigned blockY = (unsigned) sqrtf(block); + unsigned blockX = block - blockY * blockY; + unsigned firstReceiverX = blockX * (NR_RECEIVERS_PER_BLOCK / 2); +#endif + unsigned firstReceiverY = blockY * NR_RECEIVERS_PER_BLOCK; + + union shared { + struct { + SharedData<>::Asamples aSamples; + SharedData<NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK>::Bsamples bSamples; + } rectangle; + struct { + SharedData<>::Bsamples samples; + } triangle; + ScratchSpace scratchSpace[NR_WARPS]; + }; + + // the following hack is necessary to run the correlator in the OpenCL environment, + // as the maximum local memory size is 48K - 16 bytes. Due to padding in bSamples, + // the last 16 bytes are not used, so allocate 16 fewer bytes. 
+ __shared__ char rawbuffer[sizeof(union shared) - 16] __attribute__((aligned(16))); + union shared &u = (union shared &) rawbuffer; + + if (firstReceiverX == firstReceiverY) +#if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 + doCorrelateRectangle<nrFragmentsY, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, false>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +#elif NR_RECEIVERS_PER_BLOCK == 64 + if (NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK != 0 && (NR_RECEIVERS < NR_RECEIVERS_PER_BLOCK || firstReceiverX >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK * NR_RECEIVERS_PER_BLOCK)) + doCorrelateTriangle<false>(visibilities, samples, firstReceiverX, 2 * threadIdx.z + threadIdx.y, 64 * threadIdx.z + 32 * threadIdx.y + threadIdx.x, u.triangle.samples, u.scratchSpace); + else + doCorrelateTriangle<true>(visibilities, samples, firstReceiverX, 2 * threadIdx.z + threadIdx.y, 64 * threadIdx.z + 32 * threadIdx.y + threadIdx.x, u.triangle.samples, u.scratchSpace); +#endif +#if NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK != 0 + else if (NR_RECEIVERS < NR_RECEIVERS_PER_BLOCK || firstReceiverY >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK * NR_RECEIVERS_PER_BLOCK) + doCorrelateRectangle<(NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK + 2 * NR_RECEIVERS_PER_TCM_Y - 1) / NR_RECEIVERS_PER_TCM_Y / 2, false, true, NR_RECEIVERS % (2 * NR_RECEIVERS_PER_TCM_Y) == 0, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +#endif + else + doCorrelateRectangle<nrFragmentsY, true, true, true, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +} diff --git a/test/Common/ComplexInt4.h b/test/Common/ComplexInt4.h new file mode 100644 index 0000000000000000000000000000000000000000..e448428f3fe964e666dea4bcc6221ecb9ae1aeba 
--- /dev/null +++ b/test/Common/ComplexInt4.h @@ -0,0 +1,21 @@ +#if !defined COMPLEX_INT4_T +#define COMPLEX_INT4_T + +#include <complex> +#include <limits> + +class complex_int4_t +{ + public: + complex_int4_t() {} + complex_int4_t(int real, int imag) { value = (imag << 4) | (real & 0xF); } + complex_int4_t operator = (const complex_int4_t &other) { value = other.value; return *this; } + int real() const { return value << (std::numeric_limits<int>::digits - 4) >> (std::numeric_limits<int>::digits - 4); } + int imag() const { return value << (std::numeric_limits<int>::digits - 8) >> (std::numeric_limits<int>::digits - 4); } + operator std::complex<int> () const { return std::complex<int>(real(), imag()); } + + private: + char value; +}; + +#endif diff --git a/test/Common/Config.h b/test/Common/Config.h new file mode 100644 index 0000000000000000000000000000000000000000..0ecedb4b6185e0741431a3067fa137ffa684bae6 --- /dev/null +++ b/test/Common/Config.h @@ -0,0 +1,22 @@ +#if !defined CONFIG_H +#define CONFIG_H + +#if defined __ARM_ARCH +#define UNIFIED_MEMORY // assume this is a Jetson Xavier +#endif + +#undef MEASURE_POWER + +#define POL_X 0 +#define POL_Y 1 +#define NR_POLARIZATIONS 2 + +#if !defined NR_TIMES +#define NR_TIMES 768 +#endif + +#define REAL 0 +#define IMAG 1 +#define COMPLEX 2 + +#endif diff --git a/test/Common/Record.cc b/test/Common/Record.cc new file mode 100644 index 0000000000000000000000000000000000000000..81714861c039df594456ac604a2b162f8f81af84 --- /dev/null +++ b/test/Common/Record.cc @@ -0,0 +1,35 @@ +#include "test/Common/Record.h" + + +#if defined MEASURE_POWER +Record::Record(powersensor::PowerSensor &powerSensor) +: + powerSensor(powerSensor) +{ +} +#endif + + +#if defined MEASURE_POWER +void Record::getPower(CUstream, CUresult, void *userData) +{ + Record *record = (Record *) userData; + record->state = record->powerSensor.read(); +} + +#endif + + +void Record::enqueue(cu::Stream &stream) +{ + stream.record(event); // if this is omitted, 
the callback takes ~100 ms ?! + +#if defined MEASURE_POWER +#if !defined TEGRA_QUIRKS + stream.addCallback(&Record::getPower, this); // does not work well on Tegra +#else + stream.synchronize(); + state = powerSensor.read(); +#endif +#endif +} diff --git a/test/Common/Record.h b/test/Common/Record.h new file mode 100644 index 0000000000000000000000000000000000000000..2704748115ff90eaeb62c158377fbb2d00df655d --- /dev/null +++ b/test/Common/Record.h @@ -0,0 +1,33 @@ +#if !defined RECORD_H +#define RECORD_H + +#include "test/Common/Config.h" + +#include "util/cu.h" + +#if defined MEASURE_POWER +#include <powersensor/NVMLPowerSensor.h> +#endif + + +struct Record +{ + public: +#if defined MEASURE_POWER + Record(powersensor::PowerSensor &); +#endif + + void enqueue(cu::Stream &); + + mutable cu::Event event; + +#if defined MEASURE_POWER + powersensor::PowerSensor &powerSensor; + powersensor::State state; + + private: + static void getPower(CUstream, CUresult, void *userData); +#endif +}; + +#endif diff --git a/test/Common/UnitTest.cc b/test/Common/UnitTest.cc new file mode 100644 index 0000000000000000000000000000000000000000..c05fb61bea22e3897052274bea7cdae65a9fc9b6 --- /dev/null +++ b/test/Common/UnitTest.cc @@ -0,0 +1,60 @@ +#include "test/Common/UnitTest.h" + +#include <iostream> + + +UnitTest::UnitTest(unsigned deviceNumber) +: + device(deviceNumber), + context(CU_CTX_SCHED_BLOCKING_SYNC, device) +#if defined MEASURE_POWER +, powerSensor(powersensor::nvml::NVMLPowerSensor::create(0)) +#endif +{ +#pragma omp critical (clog) + std::clog << "running test on " << device.getName() << std::endl; + +#if 0 && defined MEASURE_POWER + powerSensor->dump("/tmp/sensor_readings"); +#endif +} + + +UnitTest::~UnitTest() +{ +#if defined MEASURE_POWER + delete powerSensor; +#endif +} + + +void UnitTest::report(const char *name, const Record &startRecord, const Record &stopRecord, uint64_t FLOPS, uint64_t bytes) +{ +#if defined MEASURE_POWER + //powerSensor->mark(startRecord.state, 
name); + + double Watt = powersensor::PowerSensor::Watt(startRecord.state, stopRecord.state); +#endif + + double runtime = stopRecord.event.elapsedTime(startRecord.event) * 1e-3; + +#pragma omp critical (cout) + { + std::cout << name << ": " << runtime << " s"; + + if (FLOPS != 0) + std::cout << ", " << FLOPS / runtime * 1e-12 << " TOPS"; + + if (bytes != 0) + std::cout << ", " << bytes / runtime * 1e-9 << " GB/s"; + +#if defined MEASURE_POWER + std::cout << ", " << Watt << " W"; + + if (FLOPS != 0) + std::cout << ", " << FLOPS / runtime / Watt * 1e-9 << " GOPS/W"; +#endif + + std::cout << std::endl; + } +} diff --git a/test/Common/UnitTest.h b/test/Common/UnitTest.h new file mode 100644 index 0000000000000000000000000000000000000000..dce8b476d3a008b0c265d245cf640a5f10e47ff8 --- /dev/null +++ b/test/Common/UnitTest.h @@ -0,0 +1,30 @@ +#if !defined UNIT_TEST_H +#define UNIT_TEST_H + +#include "test/Common/Record.h" +#include "util/cu.h" + +#if defined MEASURE_POWER +#include <powersensor/NVMLPowerSensor.h> +#endif + + +class UnitTest +{ + public: + UnitTest(unsigned deviceNumber); + ~UnitTest(); + + protected: + void report(const char *name, const Record &startRecord, const Record &stopRecord, uint64_t FLOPS = 0, uint64_t bytes = 0); + + cu::Device device; + cu::Context context; + cu::Stream stream; + +#if defined MEASURE_POWER + powersensor::PowerSensor *powerSensor; +#endif +}; + +#endif diff --git a/test/CorrelatorTest/CorrelatorTest.cc b/test/CorrelatorTest/CorrelatorTest.cc new file mode 100644 index 0000000000000000000000000000000000000000..23bc7c156344e3f78766f4422d49e53cb09b81f9 --- /dev/null +++ b/test/CorrelatorTest/CorrelatorTest.cc @@ -0,0 +1,239 @@ +#include "test/Common/ComplexInt4.h" +#include "test/Common/Record.h" +#include "test/CorrelatorTest/CorrelatorTest.h" +#include "util/ExceptionPropagator.h" +#include "util/nvrtc.h" + +#include <cstdlib> +#include <cstring> +#include <iostream> + +#define GNU_SOURCE +#include <link.h> +#include <omp.h> + + 
+CorrelatorTest::CorrelatorTest(const Options &options) +: + UnitTest(options.deviceNumber), + options(options), + correlator(options.nrBits, options.nrReceivers, options.nrChannels, options.nrSamplesPerChannel, options.nrPolarizations, options.nrReceiversPerBlock) +{ +#if defined MEASURE_POWER + Record start(*powerSensor), stop(*powerSensor); +#else + Record start, stop; +#endif + + start.enqueue(stream); + + switch (options.nrBits) { + case 4 : doTest<complex_int4_t, std::complex<int32_t>>(); + break; + + case 8 : doTest<std::complex<int8_t>, std::complex<int32_t>>(); + break; + + case 16 : doTest<std::complex<__half>, std::complex<float>>(); + break; + } + + stop.enqueue(stream); + stream.synchronize(); + + report("total ", start, stop, options.innerRepeatCount * options.outerRepeatCount * correlator.FLOPS()); +} + + +template <typename SampleType, typename VisibilityType> void CorrelatorTest::doTest() +{ + omp_set_nested(1); + + ExceptionPropagator ep; + +#pragma omp parallel num_threads(2) + ep([&] () { + context.setCurrent(); + + multi_array::extent<5> samplesExtent(multi_array::extents[options.nrChannels][options.nrSamplesPerChannel / options.nrTimesPerBlock][options.nrReceivers][options.nrPolarizations][options.nrTimesPerBlock]); + multi_array::extent<4> visibilitiesExtent(multi_array::extents[options.nrChannels][options.nrBaselines()][options.nrPolarizations][options.nrPolarizations]); + + cu::HostMemory hostSamples(sizeof(SampleType) * samplesExtent.size, CU_MEMHOSTALLOC_WRITECOMBINED); + cu::HostMemory hostVisibilities(sizeof(VisibilityType) * visibilitiesExtent.size); + +#if defined UNIFIED_MEMORY + cu::DeviceMemory deviceSamples(hostSamples); + cu::DeviceMemory deviceVisibilities(hostVisibilities); +#else + cu::DeviceMemory deviceSamples(sizeof(SampleType) * samplesExtent.size); + cu::DeviceMemory deviceVisibilities(sizeof(VisibilityType) * visibilitiesExtent.size); + + cu::Stream hostToDeviceStream, deviceToHostStream; +#endif + + 
multi_array::array_ref<SampleType, 5> samplesRef(* (SampleType *) hostSamples, samplesExtent); + multi_array::array_ref<VisibilityType, 4> visibilitiesRef(* (VisibilityType *) hostVisibilities, visibilitiesExtent); + + setTestPattern<SampleType>(samplesRef); + +#pragma omp for schedule(dynamic), ordered + for (int i = 0; i < options.outerRepeatCount; i ++) + if (!ep) + ep([&] () { +#if !defined UNIFIED_MEMORY + cu::Event inputDataTransferred, executeFinished, executeFinished2; +#endif + +#if defined MEASURE_POWER + Record hostToDeviceRecordStart(*powerSensor), hostToDeviceRecordStop(*powerSensor); + Record computeRecordStart(*powerSensor), computeRecordStop(*powerSensor); + Record deviceToHostRecordStart(*powerSensor), deviceToHostRecordStop(*powerSensor); +#else + Record hostToDeviceRecordStart, hostToDeviceRecordStop; + Record computeRecordStart, computeRecordStop; + Record deviceToHostRecordStart, deviceToHostRecordStop; +#endif + +#pragma omp critical (GPU) // TODO: use multiple locks when using multiple GPUs + ep([&] () { +#if !defined UNIFIED_MEMORY + hostToDeviceRecordStart.enqueue(hostToDeviceStream); + hostToDeviceStream.memcpyHtoDAsync(deviceSamples, hostSamples, samplesRef.bytesize()); + hostToDeviceRecordStop.enqueue(hostToDeviceStream); + + stream.waitEvent(hostToDeviceRecordStop.event); +#endif + + computeRecordStart.enqueue(stream); + + for (unsigned j = 0; j < options.innerRepeatCount; j ++) + correlator.launchAsync(stream, deviceVisibilities, deviceSamples); + + computeRecordStop.enqueue(stream); + +#if !defined UNIFIED_MEMORY + deviceToHostStream.waitEvent(computeRecordStop.event); + deviceToHostRecordStart.enqueue(deviceToHostStream); + deviceToHostStream.memcpyDtoHAsync(hostVisibilities, deviceVisibilities, visibilitiesRef.bytesize()); + deviceToHostRecordStop.enqueue(deviceToHostStream); +#endif + }); + +#if !defined UNIFIED_MEMORY + deviceToHostStream.synchronize(); +#else + stream.synchronize(); +#endif + + if (i == options.outerRepeatCount - 
1 && options.verifyOutput) + verifyOutput<SampleType, VisibilityType>(samplesRef, visibilitiesRef); + +#if !defined UNIFIED_MEMORY + report("host-to-device ", hostToDeviceRecordStart, hostToDeviceRecordStop, 0, samplesRef.bytesize()); +#endif + + report("correlate-total ", computeRecordStart, computeRecordStop, options.innerRepeatCount * correlator.FLOPS()); + +#if !defined UNIFIED_MEMORY + report("device-to-host ", deviceToHostRecordStart, deviceToHostRecordStop, 0, visibilitiesRef.bytesize()); +#endif + }); + }); +} + + +template<typename SampleType> void CorrelatorTest::setTestPattern(const multi_array::array_ref<SampleType, 5> &samples) +{ +#if 0 + memset(samples.begin(), 0, samples.bytesize()); + + unsigned channel = options.nrChannels / 3; + unsigned time = options.nrSamplesPerChannel / 5; + unsigned recv0 = options.nrReceivers > 174 ? 174 : options.nrReceivers / 3; + unsigned recv1 = options.nrReceivers > 418 ? 418 : options.nrReceivers / 2; + + samples[channel][time / options.nrTimesPerBlock][recv0][POL_X][time % options.nrTimesPerBlock] = SampleType(2.0, 3.0); + samples[channel][time / options.nrTimesPerBlock][recv1][POL_X][time % options.nrTimesPerBlock] = SampleType(4.0, 5.0); +#else + SampleType randomValues[7777]; // use a limited set of random numbers to save time + + for (unsigned i = 0; i < 7777; i ++) + randomValues[i] = randomValue<SampleType>(); + + unsigned i = 0; + + for (SampleType &sample : samples) + sample = randomValues[i ++ % 7777U]; +#endif +} + + +template<typename SampleType, typename VisibilityType> void CorrelatorTest::verifyOutput(const multi_array::array_ref<SampleType, 5> &samples, const multi_array::array_ref<VisibilityType, 4> &visibilities) const +{ + std::atomic<int> count(0); + ExceptionPropagator ep; + +#if 1 + std::cout << "verifying ..." 
<< std::endl; + +#pragma omp parallel for schedule (dynamic) + for (unsigned channel = 0; channel < options.nrChannels; channel ++) + ep([&] () { + multi_array::array<VisibilityType, 3> sum(multi_array::extents[options.nrBaselines()][options.nrPolarizations][options.nrPolarizations]); + + memset(sum.begin(), 0, sum.bytesize()); + + for (unsigned major_time = 0; major_time < options.nrSamplesPerChannel / options.nrTimesPerBlock; major_time ++) { + multi_array::array_ref<SampleType, 3> ref = samples[channel][major_time]; + + for (unsigned recv1 = 0, baseline = 0; recv1 < options.nrReceivers; recv1 ++) + for (unsigned recv0 = 0; recv0 <= recv1; recv0 ++, baseline ++) + for (unsigned minor_time = 0; minor_time < options.nrTimesPerBlock; minor_time ++) + for (unsigned pol0 = 0; pol0 < options.nrPolarizations; pol0 ++) + for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++) { + SampleType sample0 = ref[recv0][pol0][minor_time]; + SampleType sample1 = ref[recv1][pol1][minor_time]; + + sum[baseline][pol1][pol0] += VisibilityType(sample1.real(), sample1.imag()) * conj(VisibilityType(sample0.real(), sample0.imag())); + } + } + + for (unsigned baseline = 0; baseline < options.nrBaselines(); baseline ++) + for (unsigned pol0 = 0; pol0 < options.nrPolarizations; pol0 ++) + for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++) + if (!approximates(visibilities[channel][baseline][pol1][pol0], sum[baseline][pol1][pol0]) && ++ count < 100) +#pragma omp critical (cout) + ep([&] () { + std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol1 << "][" << pol0 << "], expected " << sum[baseline][pol1][pol0] << ", got " << visibilities[channel][baseline][pol1][pol0] << std::endl; + }); + }); + + std::cout << "#errors = " << count << std::endl; +#else + for (unsigned channel = 0; channel < options.nrChannels; channel ++) + for (unsigned baseline = 0; baseline < options.nrBaselines(); baseline ++) + for (unsigned pol0 = 0; pol0 < 
options.nrPolarizations; pol0 ++) + for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++) + if (visibilities[channel][baseline][pol0][pol1] != (VisibilityType)(0, 0)) + if (count ++ < 10) + std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol0 << "][" << pol1 << "] = " << visibilities[channel][baseline][pol0][pol1] << std::endl; +#endif +} + + +int main(int argc, char *argv[]) +{ + try { + cu::init(); + Options options(argc, argv); + CorrelatorTest test(options); + } catch (cu::Error &error) { + std::cerr << "cu::Error: " << error.what() << std::endl; + } catch (nvrtc::Error &error) { + std::cerr << "nvrtc::Error: " << error.what() << std::endl; + } catch (Options::Error &error) { + std::cerr << "Options::Error: " << error.what() << std::endl; + } + + return 0; +} diff --git a/test/CorrelatorTest/CorrelatorTest.h b/test/CorrelatorTest/CorrelatorTest.h new file mode 100644 index 0000000000000000000000000000000000000000..7e2032cf2330a2f0312b544f7d9b66ce04e72df8 --- /dev/null +++ b/test/CorrelatorTest/CorrelatorTest.h @@ -0,0 +1,61 @@ +#if !defined CORRELATOR_TEST_H +#define CORRELATOR_TEST_H + +#include "test/Common/ComplexInt4.h" +#include "test/Common/UnitTest.h" +#include "test/CorrelatorTest/Options.h" +#include "libtcc/Correlator.h" +#include "util/multi_array.h" + +#include <cuda_fp16.h> + + +class CorrelatorTest : public UnitTest +{ + public: + CorrelatorTest(const Options &); + + private: + template<typename SampleType, typename VisibilityType> void doTest(); + template<typename SampleType> void setTestPattern(const multi_array::array_ref<SampleType, 5> &samples); + template<typename SampleType, typename VisibilityType> void verifyOutput(const multi_array::array_ref<SampleType, 5> &samples, const multi_array::array_ref<VisibilityType, 4> &visibilities) const; + + template<typename SampleType> static SampleType randomValue(); + template<typename VisibilityType> bool approximates(const VisibilityType &a, const VisibilityType &b) 
const; + + tcc::Correlator correlator; + Options options; +}; + + +template<> complex_int4_t CorrelatorTest::randomValue<complex_int4_t>() +{ + return complex_int4_t(16 * drand48() - 8, 16 * drand48() - 8); +} + + +template<> std::complex<int8_t> CorrelatorTest::randomValue<std::complex<int8_t>>() +{ + return std::complex<int8_t>(256 * drand48() - 128, 256 * drand48() - 128); +} + + +template<> std::complex<__half> CorrelatorTest::randomValue<std::complex<__half>>() +{ + return std::complex<__half>(drand48() - .5, drand48() - .5); +} + +template <typename VisibilityType> bool CorrelatorTest::approximates(const VisibilityType &a, const VisibilityType &b) const +{ + return a == b; +} + + +template <> bool CorrelatorTest::approximates(const std::complex<float> &a, const std::complex<float> &b) const +{ + float absolute = abs(a - b), relative = abs(a / b); + + return (relative > .999 && relative < 1.001) || absolute < .0001 * options.nrSamplesPerChannel; +} + +#endif diff --git a/test/CorrelatorTest/Options.cc b/test/CorrelatorTest/Options.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e27f1f55fe70632943aff92948bb0b5c10e741f --- /dev/null +++ b/test/CorrelatorTest/Options.cc @@ -0,0 +1,89 @@ +#include "test/CorrelatorTest/Options.h" + +#include <cstdlib> +#include <iostream> + +#include <unistd.h> + + +Options::Options(int argc, char *argv[]) +: + nrBits(8), + nrChannels(480), + nrReceivers(576), + nrReceiversPerBlock(64), + nrSamplesPerChannel(3072), + nrTimesPerBlock(128 / nrBits), + innerRepeatCount(1), outerRepeatCount(1), + deviceNumber(0), + verifyOutput(true) +{ + opterr = 0; + + for (int opt; (opt = getopt(argc, argv, "b:c:d:hn:N:r:R:t:V:")) >= 0;) + switch (opt) { + case 'b' : nrBits = atoi(optarg); + break; + + case 'c' : nrChannels = atoi(optarg); + break; + + case 'd' : deviceNumber = atoi(optarg); + break; + + case 'h' : std::cout << usage(argv[0]) << std::endl; + exit(0); + + case 'n' : nrReceivers = atoi(optarg); + break; + + 
case 'N' : nrReceiversPerBlock = atoi(optarg); + break; + + case 'r' : innerRepeatCount = atoi(optarg); + break; + + case 'R' : outerRepeatCount = atoi(optarg); + break; + + case 't' : nrSamplesPerChannel = atoi(optarg); + break; + + case 'V' : verifyOutput = atoi(optarg); + break; + + default : throw Error(usage(argv[0])); + } + + if (nrBits != 4 && nrBits != 8 && nrBits != 16) + throw Error("nrBits must be 4, 8, or 16"); + + if (nrChannels == 0) + throw Error("nrChannels must be > 0"); + + if (nrReceivers == 0) + throw Error("nrReceivers must be > 0"); + + if (nrReceiversPerBlock != 32 && nrReceiversPerBlock != 48 && nrReceiversPerBlock != 64) + throw Error("nrReceiversPerBlock must be 32, 48, or 64"); + + if (nrSamplesPerChannel == 0) + throw Error("nrSamplesPerChannel must be > 0"); + + nrTimesPerBlock = 128 / nrBits; + + if (nrSamplesPerChannel % nrTimesPerBlock != 0) + throw Error("nrSamplesPerChannel must be a multiple of " + std::to_string(nrTimesPerBlock)); +} + + +std::string Options::usage(const std::string &execName) +{ + return "usage: " + execName + " [-b nrBits] [-c nrChannels] [-n nrReceivers] [-N nrReceiversPerBlock] [-r innerRepeatCount] [-R outerRepeatCount] [-t nrSamplesPerChannel] [-V verifyOutput]"; +} + + +const char *Options::Error::what() const noexcept +{ + return msg.c_str(); +} diff --git a/test/CorrelatorTest/Options.h b/test/CorrelatorTest/Options.h new file mode 100644 index 0000000000000000000000000000000000000000..9e51307fcb5e2eefdabf08b11c04119bbab433e6 --- /dev/null +++ b/test/CorrelatorTest/Options.h @@ -0,0 +1,45 @@ +#if !defined OPTIONS_H +#define OPTIONS_H + +#include <exception> +#include <string> + + +class Options +{ + public: + class Error : public std::exception { + public: + Error(const std::string &msg) + : + msg(msg) + { + } + + virtual const char *what() const noexcept; + + private: + std::string msg; + }; + + Options(int argc, char *argv[]); + + unsigned nrBaselines() const { return nrReceivers * (nrReceivers + 1) / 
2; } + + unsigned nrBits; + unsigned nrChannels; + unsigned nrReceivers; + unsigned nrReceiversPerBlock; + unsigned nrSamplesPerChannel; + unsigned nrTimesPerBlock; + unsigned innerRepeatCount, outerRepeatCount; + unsigned deviceNumber; + bool verifyOutput; + + static const unsigned nrPolarizations = 2; + + private: + static std::string usage(const std::string &execName); +}; + +#endif diff --git a/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc b/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc new file mode 100644 index 0000000000000000000000000000000000000000..d28321fa17979d11f7a475c7d61c4003d4eaeac8 --- /dev/null +++ b/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc @@ -0,0 +1,450 @@ +#define NR_BITS 8 +#define NR_CHANNELS 480 +#define NR_POLARIZATIONS 2 +#define NR_SAMPLES_PER_CHANNEL 3072 +#define NR_RECEIVERS 576 +#define NR_BASELINES ((NR_RECEIVERS) * ((NR_RECEIVERS) + 1) / 2) +#define NR_RECEIVERS_PER_BLOCK 64 +#define NR_TIMES_PER_BLOCK (128 / (NR_BITS)) + + +#include "test/Common/ComplexInt4.h" + +#include <complex> +#include <exception> +//#include <fstream> +#include <iostream> +#include <sstream> +#include <string> +#include <vector> + +#include <omp.h> +#include <sys/types.h> + +#define __CL_ENABLE_EXCEPTIONS +#include <CL/cl.hpp> +#include <CL/cl_ext.h> + + +#if NR_BITS == 4 +typedef complex_int4_t Sample; +typedef std::complex<int32_t> Visibility; +#elif NR_BITS == 8 +typedef std::complex<int8_t> Sample; +typedef std::complex<int32_t> Visibility; +#elif NR_BITS == 16 +typedef std::complex<__half> Sample; +typedef std::complex<float> Visibility; +#endif + +typedef Sample Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef Visibility Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; + + +std::string errorMessage(cl_int error) +{ + switch (error) { + case CL_SUCCESS: return "Success!"; + case CL_DEVICE_NOT_FOUND: return "Device not found."; + 
case CL_DEVICE_NOT_AVAILABLE: return "Device not available"; + case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure"; + case CL_OUT_OF_RESOURCES: return "Out of resources"; + case CL_OUT_OF_HOST_MEMORY: return "Out of host memory"; + case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available"; + case CL_MEM_COPY_OVERLAP: return "Memory copy overlap"; + case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported"; + case CL_BUILD_PROGRAM_FAILURE: return "Program build failure"; + case CL_MAP_FAILURE: return "Map failure"; + case CL_INVALID_VALUE: return "Invalid value"; + case CL_INVALID_DEVICE_TYPE: return "Invalid device type"; + case CL_INVALID_PLATFORM: return "Invalid platform"; + case CL_INVALID_DEVICE: return "Invalid device"; + case CL_INVALID_CONTEXT: return "Invalid context"; + case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties"; + case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue"; + case CL_INVALID_HOST_PTR: return "Invalid host pointer"; + case CL_INVALID_MEM_OBJECT: return "Invalid memory object"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor"; + case CL_INVALID_IMAGE_SIZE: return "Invalid image size"; + case CL_INVALID_SAMPLER: return "Invalid sampler"; + case CL_INVALID_BINARY: return "Invalid binary"; + case CL_INVALID_BUILD_OPTIONS: return "Invalid build options"; + case CL_INVALID_PROGRAM: return "Invalid program"; + case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable"; + case CL_INVALID_KERNEL_NAME: return "Invalid kernel name"; + case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition"; + case CL_INVALID_KERNEL: return "Invalid kernel"; + case CL_INVALID_ARG_INDEX: return "Invalid argument index"; + case CL_INVALID_ARG_VALUE: return "Invalid argument value"; + 
case CL_INVALID_ARG_SIZE: return "Invalid argument size"; + case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments"; + case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension"; + case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size"; + case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size"; + case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset"; + case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list"; + case CL_INVALID_EVENT: return "Invalid event"; + case CL_INVALID_OPERATION: return "Invalid operation"; + case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object"; + case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size"; + case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level"; + case CL_INVALID_GLOBAL_WORK_SIZE: return "Invalid global work size"; +#if defined CL_INVALID_PROPERTY + case CL_INVALID_PROPERTY: return "Invalid property"; +#endif +#if defined CL_INVALID_IMAGE_DESCRIPTOR + case CL_INVALID_IMAGE_DESCRIPTOR: return "Invalid image descriptor"; +#endif +#if defined CL_INVALID_COMPILER_OPTIONS + case CL_INVALID_COMPILER_OPTIONS: return "Invalid compiler options"; +#endif +#if defined CL_INVALID_LINKER_OPTIONS + case CL_INVALID_LINKER_OPTIONS: return "Invalid linker options"; +#endif +#if defined CL_INVALID_DEVICE_PARTITION_COUNT + case CL_INVALID_DEVICE_PARTITION_COUNT: return "Invalid device partition count"; +#endif + default: std::stringstream str; + str << "Unknown (" << error << ')'; + return str.str(); + } +} + + +void createContext(cl::Context &context, std::vector<cl::Device> &devices) +{ + const char *platformName = getenv("PLATFORM"); + +#if defined __linux__ + if (platformName == 0) +#endif + //platformName = "Intel(R) FPGA SDK for OpenCL(TM)"; + //platformName = Intel(R) OpenCL"; + platformName = "NVIDIA CUDA"; + //platformName = "AMD Accelerated Parallel Processing"; + + cl_device_type type = CL_DEVICE_TYPE_DEFAULT; + + const char *deviceType = getenv("TYPE"); + + if (deviceType != 0) 
{ + if (strcmp(deviceType, "GPU") == 0) + type = CL_DEVICE_TYPE_GPU; + else if (strcmp(deviceType, "CPU") == 0) + type = CL_DEVICE_TYPE_CPU; + else if (strcmp(deviceType, "ACCELERATOR") == 0) + type = CL_DEVICE_TYPE_ACCELERATOR; + else + std::cerr << "Unrecognized device type: " << deviceType; + } + + const char *deviceName = getenv("DEVICE"); + + std::vector<cl::Platform> platforms; + cl::Platform::get(&platforms); + + for (cl::Platform &platform : platforms) { + std::clog << "Platform name: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl; + std::clog << "Platform version: " << platform.getInfo<CL_PLATFORM_VERSION>() << std::endl; + std::clog << "Platform extensions: " << platform.getInfo<CL_PLATFORM_EXTENSIONS>() << std::endl; + } + + for (cl::Platform &platform : platforms) { + if (strcmp(platform.getInfo<CL_PLATFORM_NAME>().c_str(), platformName) == 0) { + platform.getDevices(type, &devices); + + for (cl::Device &device : devices) { + std::clog << "Device: " << device.getInfo<CL_DEVICE_NAME>() << ", " + "capability: " << device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV>() << '.' 
<< device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV>() +//#if defined CL_DEVICE_TOPOLOGY_AMD +// ", PCIe bus 0x" << std::hex << std::setiosflags (std::ios::uppercase) << std::setfill('0') << std::setw(2) << (device.getInfo<CL_DEVICE_TOPOLOGY_AMD>().pcie.bus & 255) << std::dec +//#endif + << std::endl; + } + + context = cl::Context(devices); + return; + } + } + + std::cerr << "Platform not found: \"" << platformName << '"' << std::endl; + exit(1); +} + + +cl::Program createProgramFromSources(cl::Context &context, std::vector<cl::Device> &devices, const std::string &sources, const char *args) +{ + std::stringstream cmd; + cmd << "#include \"" << sources << '"' << std::endl; +#if defined CL_VERSION_1_2 + cl::Program program(context, cmd.str()); +#else + std::string str = cmd.str(); + cl::Program::Sources src(1, std::make_pair(str.c_str(), str.length())); + cl::Program program(context, src); +#endif + + try { + program.build(devices, args); + std::string msg; + program.getBuildInfo(devices[0], CL_PROGRAM_BUILD_LOG, &msg); + + std::clog << msg << std::endl; + } catch (cl::Error &error) { + if (strcmp(error.what(), "clBuildProgram") == 0) { + std::string msg; + program.getBuildInfo(devices[0], CL_PROGRAM_BUILD_LOG, &msg); + + std::cerr << msg << std::endl; + exit(1); + } else { + throw; + } + } + +#if 0 + std::vector<size_t> binarySizes = program.getInfo<CL_PROGRAM_BINARY_SIZES>(); + std::vector<char *> binaries = program.getInfo<CL_PROGRAM_BINARIES>(); + + for (unsigned i = 0; i < binaries.size(); i++) { + std::stringstream filename; + filename << sources << '-' << i << ".ptx"; + std::ofstream(filename.str().c_str(), std::ofstream::binary).write(binaries[i], binarySizes[i]); + } + + for (unsigned b = 0; b < binaries.size(); b++) + delete [] binaries[b]; +#endif + + return program; +} + + +cl::Program createProgramFromBinaries(cl::Context &context, std::vector<cl::Device> &devices) +{ +#if 1 + cl::Program::Binaries binaries(devices.size()); + std::vector<std::string> 
ptx_code(devices.size()); + + for (unsigned device = 0; device < devices.size(); device ++) { + unsigned compute_capability = 10 * devices[device].getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV>() + devices[device].getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV>(); + std::stringstream command; + + command << "nvcc" + << " -ptx" + << " -arch=compute_" << compute_capability + << " -code=sm_" << compute_capability + << " -DNR_BITS=" << NR_BITS + << " -DNR_RECEIVERS=" << NR_RECEIVERS + << " -DNR_CHANNELS=" << NR_CHANNELS + << " -DNR_SAMPLES_PER_CHANNEL=" << NR_SAMPLES_PER_CHANNEL + << " -DNR_POLARIZATIONS=" << NR_POLARIZATIONS + << " -DNR_RECEIVERS_PER_BLOCK=" << NR_RECEIVERS_PER_BLOCK + << " -o -" + << " libtcc/TCCorrelator.cu" + << "|sed -e s/.param\\ .[a-zA-Z0-9]*/\\&\\ .ptr\\ .global/"; + + std::clog << "executing: " << command.str() << std::endl; + FILE *pipe = popen(command.str().c_str(), "r"); + + while (!feof(pipe)) { + char buffer[1024]; + + if (fgets(buffer, 1024, pipe) != NULL) + ptx_code[device] += buffer; + } + + binaries[device] = (std::make_pair(ptx_code[device].c_str(), ptx_code[device].length())); + pclose(pipe); + } + +#else + std::ifstream ifs(name, std::ios::in | std::ios::binary); + std::string str((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>()); + + cl::Program::Binaries binaries(devices.size(), std::make_pair(str.c_str(), str.length())); +#endif + + cl::Program program(context, devices, binaries); + program.build(); + + return program; +} + + +void setTestPattern(cl::CommandQueue &queue, cl::Buffer &samplesBuffer) +{ + Samples &samples = * (Samples *) queue.enqueueMapBuffer(samplesBuffer, CL_TRUE, CL_MAP_WRITE, 0, sizeof(Samples)); + +#if 0 + memset(samples, 0, sizeof(Samples)); + + unsigned channel = NR_CHANNELS / 3; + unsigned time = NR_SAMPLES_PER_CHANNEL / 5; + unsigned recv0 = NR_RECEIVERS > 174 ? 174 : NR_RECEIVERS / 3; + unsigned recv1 = NR_RECEIVERS > 418 ? 
418 : NR_RECEIVERS / 2; + +#if NR_BITS == 4 + samples[channel][time / NR_TIMES_PER_BLOCK][recv0][POL_X][time % NR_TIMES_PER_BLOCK] = complex_int4_t(2, 3); + samples[channel][time / NR_TIMES_PER_BLOCK][recv1][POL_X][time % NR_TIMES_PER_BLOCK] = complex_int4_t(4, 5); +#elif NR_BITS == 8 + samples[channel][time / NR_TIMES_PER_BLOCK][recv0][POL_X][time % NR_TIMES_PER_BLOCK] = std::complex<int8_t>(2, 3); + samples[channel][time / NR_TIMES_PER_BLOCK][recv1][POL_X][time % NR_TIMES_PER_BLOCK] = std::complex<int8_t>(4, 5); +#elif NR_BITS == 16 + samples[channel][time / NR_TIMES_PER_BLOCK][recv0][POL_X][time % NR_TIMES_PER_BLOCK] = std::complex<__half>(2.0f, 3.0f); + samples[channel][time / NR_TIMES_PER_BLOCK][recv1][POL_X][time % NR_TIMES_PER_BLOCK] = std::complex<__half>(4.0f, 5.0f); +#endif +#else + Sample randomValues[7777]; + + for (unsigned i = 0; i < 7777; i ++) +#if NR_BITS == 4 + randomValues[i] = complex_int4_t(16 * drand48() - 8, 16 * drand48() - 8); +#elif NR_BITS == 8 + randomValues[i] = std::complex<int8_t>(256 * drand48() - 128, 256 * drand48() - 128); +#elif NR_BITS == 16 + randomValues[i] = std::complex<__half>(drand48() - .5, drand48() - .5); +#endif + +#pragma omp parallel for schedule (dynamic) + for (unsigned channel = 0; channel < NR_CHANNELS; channel ++) + for (unsigned time_major = 0, i = channel; time_major < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; time_major ++) + for (unsigned receiver = 0; receiver < NR_RECEIVERS; receiver ++) + for (unsigned polarization = 0; polarization < NR_POLARIZATIONS; polarization ++) + for (unsigned time_minor = 0; time_minor < NR_TIMES_PER_BLOCK; time_minor ++, i ++) + samples[channel][time_major][receiver][polarization][time_minor] = randomValues[i % 7777U]; +#endif + + queue.enqueueUnmapMemObject(samplesBuffer, samples); +} + + +void checkTestPattern(cl::CommandQueue &queue, cl::Buffer &visibilitiesBuffer, cl::Buffer &samplesBuffer) +{ + Visibilities &visibilities = * (Visibilities *) 
queue.enqueueMapBuffer(visibilitiesBuffer, CL_TRUE, CL_MAP_READ, 0, sizeof(Visibilities)); + + std::atomic<int> count(0); + +#if 0 + Samples &samples = * (Samples *) queue.enqueueMapBuffer(samplesBuffer, CL_TRUE, CL_MAP_READ, 0, sizeof(Samples)); + + std::cout << "checking ..." << std::endl; + +//#pragma omp parallel for schedule (dynamic) + for (unsigned channel = 0; channel < NR_CHANNELS; channel ++) { +//#pragma omp critical (cout) + //std::cout << "channel = " << channel << std::endl; + +#if NR_BITS == 4 || NR_BITS == 8 + typedef std::complex<int32_t> Sum[NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#elif NR_BITS == 16 + typedef std::complex<float> Sum[NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; +#endif + + Sum &sum = *new Sum[1]; + memset(sum, 0, sizeof sum); + + for (unsigned major_time = 0; major_time < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; major_time ++) + for (unsigned recv1 = 0, baseline = 0; recv1 < NR_RECEIVERS; recv1 ++) + for (unsigned recv0 = 0; recv0 <= recv1; recv0 ++, baseline ++) + for (unsigned minor_time = 0; minor_time < NR_TIMES_PER_BLOCK; minor_time ++) + for (unsigned pol0 = 0; pol0 < NR_POLARIZATIONS; pol0 ++) + for (unsigned pol1 = 0; pol1 < NR_POLARIZATIONS; pol1 ++) +#if NR_BITS == 4 || NR_BITS == 8 + sum[baseline][pol1][pol0] += (std::complex<int32_t>) samples[channel][major_time][recv1][pol1][minor_time] * conj((std::complex<int32_t>) samples[channel][major_time][recv0][pol0][minor_time]); +#elif NR_BITS == 16 + sum[baseline][pol1][pol0] += samples[channel][major_time][recv1][pol1][minor_time] * conj(samples[channel][major_time][recv0][pol0][minor_time]); +#endif + + for (unsigned baseline = 0; baseline < NR_BASELINES; baseline ++) + for (unsigned pol0 = 0; pol0 < NR_POLARIZATIONS; pol0 ++) + for (unsigned pol1 = 0; pol1 < NR_POLARIZATIONS; pol1 ++) +#if NR_BITS == 4 || NR_BITS == 8 + if (visibilities[channel][baseline][pol1][pol0] != sum[baseline][pol1][pol0] && ++ count < 10) +#pragma omp critical (cout) + 
std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol1 << "][" << pol0 << "], expected " << sum[baseline][pol1][pol0] << ", got " << visibilities[channel][baseline][pol1][pol0] << std::endl; +#elif NR_BITS == 16 + float absolute = abs(visibilities[channel][baseline][pol1][pol0] - sum[baseline][pol1][pol0]); + float relative = abs(visibilities[channel][baseline][pol1][pol0] / sum[baseline][pol1][pol0]); + + + + if ((relative < .999 || relative > 1.001) && absolute > .0001 * NR_SAMPLES_PER_CHANNEL && ++ count < 10) + std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol1 << "][" << pol0 << "], expected " << sum[baseline][pol1][pol0] << ", got " << visibilities[channel][baseline][pol1][pol0] << ", relative = " << relative << ", absolute = " << absolute << std::endl; +#endif + delete &sum; + } + + std::cout << "#errors = " << count << std::endl; + + queue.enqueueUnmapMemObject(samplesBuffer, samples); +#else + for (unsigned channel = 0; channel < NR_CHANNELS; channel ++) + for (unsigned baseline = 0; baseline < NR_BASELINES; baseline ++) + for (unsigned pol0 = 0; pol0 < NR_POLARIZATIONS; pol0 ++) + for (unsigned pol1 = 0; pol1 < NR_POLARIZATIONS; pol1 ++) +#if NR_BITS == 4 || NR_BITS == 8 + if (visibilities[channel][baseline][pol0][pol1] != std::complex<int32_t>(0, 0)) +#elif NR_BITS == 16 + if (visibilities[channel][baseline][pol0][pol1] != std::complex<float>(0.0f, 0.0f)) +#endif + if (count ++ < 10) +#pragma omp critical (cout) + std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol0 << "][" << pol1 << "] = " << visibilities[channel][baseline][pol0][pol1] << std::endl; +#endif + + queue.enqueueUnmapMemObject(visibilitiesBuffer, visibilities); +} + + +int main() +{ + try { + cl::Context context; + std::vector<cl::Device> devices; + createContext(context, devices); + + cl::CommandQueue queue(context, devices[0], CL_QUEUE_PROFILING_ENABLE); + + cl::Buffer samples(context, CL_MEM_READ_ONLY /*| 
CL_MEM_HOST_WRITE_ONLY*/, sizeof(Samples)); + cl::Buffer visibilities(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, sizeof(Visibilities)); + + setTestPattern(queue, samples); + + cl::Program program(createProgramFromBinaries(context, devices)); + + cl::Kernel correlatorKernel(program, "correlate"); + correlatorKernel.setArg(0, visibilities); + correlatorKernel.setArg(1, samples); + + cl::Event event; + +#if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 + unsigned nrBlocks = ((NR_RECEIVERS + NR_RECEIVERS_PER_BLOCK - 1) / NR_RECEIVERS_PER_BLOCK) * ((NR_RECEIVERS + NR_RECEIVERS_PER_BLOCK - 1) / NR_RECEIVERS_PER_BLOCK + 1) / 2; +#elif NR_RECEIVERS_PER_BLOCK == 64 + unsigned nrBlocks = ((NR_RECEIVERS + NR_RECEIVERS_PER_BLOCK - 1) / NR_RECEIVERS_PER_BLOCK) * ((NR_RECEIVERS + NR_RECEIVERS_PER_BLOCK - 1) / NR_RECEIVERS_PER_BLOCK); +#endif + + cl::NDRange globalWorkSize(32 * nrBlocks, 2 * NR_CHANNELS, 2 * 1); + cl::NDRange localWorkSize(32, 2, 2); + +for (int i = 0; i < 100; i ++) + queue.enqueueNDRangeKernel(correlatorKernel, cl::NullRange, globalWorkSize, localWorkSize, 0, &event); + queue.finish(); + + double runtime = (event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_START>()) * 1e-9; + unsigned long long nrOperations = 8LLU * (NR_RECEIVERS * NR_RECEIVERS + 1) / 2 * NR_POLARIZATIONS * NR_POLARIZATIONS * NR_CHANNELS * NR_SAMPLES_PER_CHANNEL; + double TFLOPS = nrOperations / runtime * 1e-12; + std::clog << "runtime = " << runtime << "s, TFLOPS = " << TFLOPS << std::endl; + + checkTestPattern(queue, visibilities, samples); + } catch (cl::Error &error) { + std::cerr << "caught cl::Error: " << error.what() << ": " << errorMessage(error.err()) << std::endl; + } catch (std::exception &error) { + std::cerr << "caught std::exception: " << error.what() << std::endl; + } + + return 0; +} diff --git a/test/SimpleExample/SimpleExample.cu b/test/SimpleExample/SimpleExample.cu new file mode 100644 index 
0000000000000000000000000000000000000000..4fa52ddd281d7483c89031dac05dbe1b7a669783 --- /dev/null +++ b/test/SimpleExample/SimpleExample.cu @@ -0,0 +1,77 @@ +#define NR_BITS 8 +#define NR_CHANNELS 480 +#define NR_POLARIZATIONS 2 +#define NR_SAMPLES_PER_CHANNEL 3072 +#define NR_RECEIVERS 576 +#define NR_BASELINES ((NR_RECEIVERS) * ((NR_RECEIVERS) + 1) / 2) +#define NR_RECEIVERS_PER_BLOCK 64 +#define NR_TIMES_PER_BLOCK (128 / (NR_BITS)) + + +#include "test/Common/ComplexInt4.h" +#include "libtcc/Correlator.h" + +#include <complex> +#include <iostream> + +#include <cuda.h> +#if NR_BITS == 16 +#include <cuda_fp16.h> +#endif + + +inline void checkCudaCall(cudaError_t error) +{ + if (error != cudaSuccess) { + std::cerr << "error " << error << std::endl; + exit(1); + } +} + + +#if NR_BITS == 4 +typedef complex_int4_t Sample; +typedef std::complex<int32_t> Visibility; +#elif NR_BITS == 8 +typedef std::complex<int8_t> Sample; +typedef std::complex<int32_t> Visibility; +#elif NR_BITS == 16 +typedef std::complex<__half> Sample; +typedef std::complex<float> Visibility; +#endif + +typedef Sample Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; +typedef Visibility Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; + + +int main() +{ + try { + checkCudaCall(cudaSetDevice(0)); // combine the CUDA runtime API and CUDA driver API + checkCudaCall(cudaFree(0)); + + tcc::Correlator correlator(NR_BITS, NR_RECEIVERS, NR_CHANNELS, NR_SAMPLES_PER_CHANNEL, NR_POLARIZATIONS, NR_RECEIVERS_PER_BLOCK); + + cudaStream_t stream; + Samples *samples; + Visibilities *visibilities; + + checkCudaCall(cudaStreamCreate(&stream)); + checkCudaCall(cudaMallocManaged(&samples, sizeof(Samples))); + checkCudaCall(cudaMallocManaged(&visibilities, sizeof(Visibilities))); + + (*samples)[NR_CHANNELS / 3][NR_SAMPLES_PER_CHANNEL / 5 / NR_TIMES_PER_BLOCK][174][0][NR_SAMPLES_PER_CHANNEL / 5 % NR_TIMES_PER_BLOCK] = Sample(2, 
3); + (*samples)[NR_CHANNELS / 3][NR_SAMPLES_PER_CHANNEL / 5 / NR_TIMES_PER_BLOCK][418][0][NR_SAMPLES_PER_CHANNEL / 5 % NR_TIMES_PER_BLOCK] = Sample(4, 5); + + correlator.launchAsync((CUstream) stream, (CUdeviceptr) visibilities, (CUdeviceptr) samples); + checkCudaCall(cudaDeviceSynchronize()); + + std::cout << ((*visibilities)[160][87745][0][0] == Visibility(23, -2) ? "success" : "failed") << std::endl; + + checkCudaCall(cudaFree(visibilities)); + checkCudaCall(cudaFree(samples)); + checkCudaCall(cudaStreamDestroy(stream)); + } catch (std::exception &error) { + std::cerr << error.what() << std::endl; + } +} diff --git a/util/ExceptionPropagator.h b/util/ExceptionPropagator.h new file mode 100644 index 0000000000000000000000000000000000000000..942a28e375d4025484df8cec3198caabb9d4cb40 --- /dev/null +++ b/util/ExceptionPropagator.h @@ -0,0 +1,59 @@ +#if !defined EXCEPTION_PROPAGATOR_H +#define EXCEPTION_PROPAGATOR_H + +// Experimental class to propagate exceptions around OpenMP pragmas (and other +// constructs) that cannot cope with exceptions. Exceptions thrown inside a lambda +// function that is given to the () operator are propagated and rethrown when the +// progagation object is destroyed. Example use case: +// +// ExceptionPropagator ep; +// +// #pragma omp parallel for +// for (int i = 0; i < 10000; i ++) +// if (!ep) // finish loop ASAP if exception is pending +// ep([&] () { +// throw 42; // continue with code that might throw exceptions +// }); +// +// // exception is rethrown at scope exit of ep + +#include <atomic> +#include <exception> + + +class ExceptionPropagator +{ + public: + ExceptionPropagator() + : + propagateException(ATOMIC_FLAG_INIT) + { + } + + ~ExceptionPropagator() noexcept(false) + { + if (exception != nullptr && !std::uncaught_exception()) + std::rethrow_exception(exception); + } + + template <typename T> void operator () (const T &func) + { + try { + func(); + } catch (...) 
{ + if (!atomic_flag_test_and_set(&propagateException)) + exception = std::current_exception(); + } + } + + operator bool () const // returns true iff exception pending + { + return exception != nullptr; + } + + private: + std::atomic_flag propagateException; + std::exception_ptr exception; +}; + +#endif diff --git a/util/cu.cc b/util/cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..82113bedfe3a17f0c66e300214ca9cb197cb4e35 --- /dev/null +++ b/util/cu.cc @@ -0,0 +1,37 @@ +#include "cu.h" + +#include <iostream> +#include <sstream> + + +namespace cu { + +const char *Error::what() const noexcept +{ + const char *str; + return cuGetErrorString(_result, &str) != CUDA_ERROR_INVALID_VALUE ? str : "unknown error"; +} + + +Context Device::primaryCtxRetain() +{ + CUcontext context; + checkCudaCall(cuDevicePrimaryCtxRetain(&context, _obj)); + return Context(context, *this); +} + + +void Source::compile(const char *output_file_name, const char *compiler_options) +{ + std::stringstream command_line; + command_line << "nvcc -cubin " << compiler_options << " -o " << output_file_name << ' ' << input_file_name; +//#pragma omp critical (clog) + //std::clog << command_line.str() << std::endl; + + int retval = system(command_line.str().c_str()); + + if (WEXITSTATUS(retval) != 0) + throw Error(CUDA_ERROR_INVALID_SOURCE); +} + +} diff --git a/util/cu.h b/util/cu.h new file mode 100644 index 0000000000000000000000000000000000000000..46dfdea4dce9cbb44ee4b26b6f45331cebc49175 --- /dev/null +++ b/util/cu.h @@ -0,0 +1,786 @@ +#if !defined CU_WRAPPER_H +#define CU_WRAPPER_H + +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <exception> +#include <fstream> +#include <string> +#include <vector> + + +namespace cu { + class Error : public std::exception { + public: + Error(CUresult result) + : + _result(result) + { + } + + virtual const char *what() const noexcept; + + operator CUresult () const + { + return _result; + } + + private: + CUresult _result; + }; + + 
+ inline void checkCudaCall(CUresult result) + { + if (result != CUDA_SUCCESS) + throw Error(result); + } + + + inline void init(unsigned flags = 0) + { + checkCudaCall(cuInit(flags)); + } + + + inline int driverGetVersion() + { + int version; + checkCudaCall(cuDriverGetVersion(&version)); + return version; + } + + inline void memcpyHtoD(CUdeviceptr dst, const void *src, size_t size) + { + checkCudaCall(cuMemcpyHtoD(dst, src, size)); + } + + class Context; + class Stream; + + template <typename T> class Wrapper + { + public: + // Wrapper<T>(Wrapper<T> &) = delete; // disallow copies + + // conversion to C-style T + + operator const T & () const + { + return _obj; + } + + operator T & () + { + return _obj; + } + +#if 0 // makes no sense if object is not copyable + bool operator == (const Wrapper<T> &other) + { + return _obj == other._obj; + } + + bool operator != (const Wrapper<T> &other) + { + return _obj != other._obj; + } +#endif + + protected: + Wrapper<T>() + : + hasOwnership(true) + { + } + + Wrapper<T>(T &obj) + : + _obj(obj), + hasOwnership(false) + { + } + + bool hasOwnership; + T _obj; + }; + + class Device : public Wrapper<CUdevice> + { + public: + // Device Management + + Device(int ordinal) + { + checkCudaCall(cuDeviceGet(&_obj, ordinal)); + } + + int getAttribute(CUdevice_attribute attribute) const + { + int value; + checkCudaCall(cuDeviceGetAttribute(&value, attribute, _obj)); + return value; + } + + template <CUdevice_attribute attribute> int getAttribute() const + { + return getAttribute(attribute); + } + + static int getCount() + { + int nrDevices; + checkCudaCall(cuDeviceGetCount(&nrDevices)); + return nrDevices; + } + + std::string getName() const + { + char name[64]; + checkCudaCall(cuDeviceGetName(name, sizeof name, _obj)); + return std::string(name); + } + + size_t totalMem() const + { + size_t size; + checkCudaCall(cuDeviceTotalMem(&size, _obj)); + return size; + } + + + // Primary Context Management + + std::pair<unsigned, bool> 
primaryCtxGetState() const + { + unsigned flags; + int active; + checkCudaCall(cuDevicePrimaryCtxGetState(_obj, &flags, &active)); + return std::pair<unsigned, bool>(flags, active); + } + + // void primaryCtxRelease() not available; it is released on destruction of the Context returned by Device::primaryContextRetain() + + void primaryCtxReset() + { + checkCudaCall(cuDevicePrimaryCtxReset(_obj)); + } + + Context primaryCtxRetain(); // retain this context until the primary context can be released + + void primaryCtxSetFlags(unsigned flags) + { + checkCudaCall(cuDevicePrimaryCtxSetFlags(_obj, flags)); + } + }; + + + class Context : public Wrapper<CUcontext> + { + public: + // Context Management + + Context(int flags, Device &device) + : + _primaryContext(false) + { + checkCudaCall(cuCtxCreate(&_obj, flags, device)); + } + + Context(CUcontext context) + : + Wrapper<CUcontext>(context), + _primaryContext(false) + { + } + + ~Context() + { + if (hasOwnership) + checkCudaCall(cuCtxDestroy(_obj)); + //else + //checkCudaCall(cuDevicePrimaryCtxRelease(getDevice())); // FIXME + } + + unsigned getApiVersion() const + { + unsigned version; + checkCudaCall(cuCtxGetApiVersion(_obj, &version)); + return version; + } + + static CUfunc_cache getCacheConfig() + { + CUfunc_cache config; + checkCudaCall(cuCtxGetCacheConfig(&config)); + return config; + } + + static void setCacheConfig(CUfunc_cache config) + { + checkCudaCall(cuCtxSetCacheConfig(config)); + } + + static Context getCurrent() + { + CUcontext context; + checkCudaCall(cuCtxGetCurrent(&context)); + return Context(context); + } + + void setCurrent() const + { + checkCudaCall(cuCtxSetCurrent(_obj)); + } + + void pushCurrent() + { + checkCudaCall(cuCtxPushCurrent(_obj)); + } + + static Context popCurrent() + { + CUcontext context; + checkCudaCall(cuCtxPopCurrent(&context)); + return Context(context); + } + + void setSharedMemConfig(CUsharedconfig config) + { + checkCudaCall(cuCtxSetSharedMemConfig(config)); + } + + static 
Device getDevice() + { + CUdevice device; + checkCudaCall(cuCtxGetDevice(&device)); + return Device(device); // FIXME: ~Device() + } + + static size_t getLimit(CUlimit limit) + { + size_t value; + checkCudaCall(cuCtxGetLimit(&value, limit)); + return value; + } + + template <CUlimit limit> static size_t getLimit() + { + return getLimit(limit); + } + + static void setLimit(CUlimit limit, size_t value) + { + checkCudaCall(cuCtxSetLimit(limit, value)); + } + + template <CUlimit limit> static void setLimit(size_t value) + { + setLimit(limit, value); + } + + static void synchronize() + { + checkCudaCall(cuCtxSynchronize()); + } + + private: + friend class Device; + Context(CUcontext context, Device &device) + : + Wrapper<CUcontext>(context), + _primaryContext(true) + { + } + + bool _primaryContext; + }; + + + class HostMemory : public Wrapper<void *> + { + public: + HostMemory(size_t size, int flags = 0) + { + checkCudaCall(cuMemHostAlloc(&_obj, size, flags)); + } + + ~HostMemory() + { + checkCudaCall(cuMemFreeHost(_obj)); + } + + template <typename T> operator T * () + { + return static_cast<T *>(_obj); + } + }; + + + class DeviceMemory : public Wrapper<CUdeviceptr> + { + public: + DeviceMemory(size_t size) + { + checkCudaCall(cuMemAlloc(&_obj, size)); + } + + DeviceMemory(CUdeviceptr ptr) + : + Wrapper(ptr) + { + } + + DeviceMemory(HostMemory &hostMemory) + { + hasOwnership = false; + checkCudaCall(cuMemHostGetDevicePointer(&_obj, hostMemory, 0)); + } + + ~DeviceMemory() + { + if (hasOwnership) + checkCudaCall(cuMemFree(_obj)); + } + + const void *parameter() const // used to construct parameter list for launchKernel(); + { + return &_obj; + } + + template <typename T> operator T * () const + { + return (T *) _obj; + } + + template <typename T> operator const T * () const + { + return (const T *) _obj; + } + }; + + + class Array : public Wrapper<CUarray> + { + public: + Array(unsigned width, CUarray_format format, unsigned numChannels) + { + Array(width, 0, format, 
numChannels); + } + + Array(unsigned width, unsigned height, CUarray_format format, unsigned numChannels) + { + CUDA_ARRAY_DESCRIPTOR descriptor; + descriptor.Width = width; + descriptor.Height = height; + descriptor.Format = format; + descriptor.NumChannels = numChannels; + checkCudaCall(cuArrayCreate(&_obj, &descriptor)); + } + + Array(unsigned width, unsigned height, unsigned depth, CUarray_format format, unsigned numChannels) + { + CUDA_ARRAY3D_DESCRIPTOR descriptor; + descriptor.Width = width; + descriptor.Height = height; + descriptor.Depth = depth; + descriptor.Format = format; + descriptor.NumChannels = numChannels; + descriptor.Flags = 0; + checkCudaCall(cuArray3DCreate(&_obj, &descriptor)); + } + + Array(CUarray &array) + : + Wrapper(array) + { + } + + ~Array() + { + if (hasOwnership) + checkCudaCall(cuArrayDestroy(_obj)); + } + }; + + +#if 0 + class TexObject : public Wrapper<CUtexObject> { + public: + TexObject(const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc) + { + checkCudaCall(cuTexObjectCreate(&_obj, pResDesc, pTexDesc, pResViewDesc)); + } + + TexObject(CUtexObject &obj) + : + Wrapper<CUtexObject>(obj) + { + } + + ~TexObject() + { + if (hasOwnership) + checkCudaCall(cuTexObjectDestroy(_obj)); + } + + + }; +#endif + + + class TexRef : public Wrapper<CUtexref> { + public: + TexRef(CUtexref texref) + : + Wrapper<CUtexref>(texref) + { + } + +#if 0 // deprecated + void setAddress(size_t &byte_offset, DeviceMemory &memory, size_t size) + { + checkCudaCall(cuTexRefSetAddress(&byte_offset, _obj, memory, size)); + } + + void setArray(Array &array, unsigned flags = CU_TRSA_OVERRIDE_FORMAT) + { + checkCudaCall(cuTexRefSetArray(_obj, array, flags)); + } + + void setAddressMode(int dim, CUaddress_mode am) + { + checkCudaCall(cuTexRefSetAddressMode(_obj, dim, am)); + } + + void setFilterMode(CUfilter_mode fm) + { + checkCudaCall(cuTexRefSetFilterMode(_obj, fm)); + } + + void setFlags(int flags) + { 
+ checkCudaCall(cuTexRefSetFlags(_obj, flags)); + } + + void setFormat(CUarray_format fmt, int numPackedComponents) + { + checkCudaCall(cuTexRefSetFormat(_obj, fmt, numPackedComponents)); + } +#endif + }; + + + class Source + { + public: + Source(const char *input_file_name) + : + input_file_name(input_file_name) + { + } + + void compile(const char *ptx_name, const char *compile_options = 0); + + private: + const char *input_file_name; + }; + + + class Module : public Wrapper<CUmodule> + { + public: + Module(const char *file_name) + { +#if defined TEGRA_QUIRKS // cuModuleLoad broken on Jetson TX1 + std::ifstream file(file_name); + std::string program((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>()); + checkCudaCall(cuModuleLoadData(&_obj, program.c_str())); +#else + checkCudaCall(cuModuleLoad(&_obj, file_name)); +#endif + } + + Module(const void *data) + { + checkCudaCall(cuModuleLoadData(&_obj, data)); + } + + Module(CUmodule &module) + : + Wrapper(module) + { + } + + ~Module() + { + if (hasOwnership) + checkCudaCall(cuModuleUnload(_obj)); + } + + TexRef getTexRef(const char *name) const + { + CUtexref texref; + checkCudaCall(cuModuleGetTexRef(&texref, _obj, name)); + return TexRef(texref); + } + + CUdeviceptr getGlobal(const char *name) const + { + CUdeviceptr deviceptr; + checkCudaCall(cuModuleGetGlobal(&deviceptr, nullptr, _obj, name)); + return deviceptr; + } + }; + + + class Function : public Wrapper<CUfunction> + { + public: + Function(Module &module, const char *name) + { + checkCudaCall(cuModuleGetFunction(&_obj, module, name)); + } + + Function(CUfunction &function) + : + Wrapper(function) + { + } + + int getAttribute(CUfunction_attribute attribute) + { + int value; + checkCudaCall(cuFuncGetAttribute(&value, attribute, _obj)); + return value; + } + + void setCacheConfig(CUfunc_cache config) + { + checkCudaCall(cuFuncSetCacheConfig(_obj, config)); + } + +#if 0 + void paramSetTexRef(TexRef &texref) + { + 
checkCudaCall(cuParamSetTexRef(_obj, CU_PARAM_TR_DEFAULT, texref)); + } +#endif + }; + + + class Event : public Wrapper<CUevent> + { + public: + Event(int flags = CU_EVENT_DEFAULT) + { + checkCudaCall(cuEventCreate(&_obj, flags)); + } + + Event(CUevent &event) + : + Wrapper(event) + { + } + + ~Event() + { + if (hasOwnership) + checkCudaCall(cuEventDestroy(_obj)); + } + + float elapsedTime(Event &second) const + { + float ms; + checkCudaCall(cuEventElapsedTime(&ms, second, _obj)); + return ms; + } + + void query() const + { + checkCudaCall(cuEventQuery(_obj)); // unsuccessful result throws cu::Error + } + + void record() + { + checkCudaCall(cuEventRecord(_obj, 0)); + } + + void record(Stream &); + + void synchronize() + { + checkCudaCall(cuEventSynchronize(_obj)); + } + }; + + + class Stream : public Wrapper<CUstream> + { + friend class Event; + + public: + Stream(int flags = CU_STREAM_DEFAULT) + { + checkCudaCall(cuStreamCreate(&_obj, flags)); + } + + Stream(CUstream stream) + : + Wrapper<CUstream>(stream) + { + } + + ~Stream() + { + if (hasOwnership) + checkCudaCall(cuStreamDestroy(_obj)); + } + + void memcpyHtoDAsync(CUdeviceptr devPtr, const void *hostPtr, size_t size) + { + checkCudaCall(cuMemcpyHtoDAsync(devPtr, hostPtr, size, _obj)); + } + + void memcpyDtoHAsync(void *hostPtr, CUdeviceptr devPtr, size_t size) + { + checkCudaCall(cuMemcpyDtoHAsync(hostPtr, devPtr, size, _obj)); + } + + void launchKernel(Function &function, unsigned gridX, unsigned gridY, unsigned gridZ, unsigned blockX, unsigned blockY, unsigned blockZ, unsigned sharedMemBytes, const std::vector<const void *> &parameters) + { + checkCudaCall(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, sharedMemBytes, _obj, const_cast<void **>(&parameters[0]), 0)); + } + +#if CUDART_VERSION >= 9000 + void launchCooperativeKernel(Function &function, unsigned gridX, unsigned gridY, unsigned gridZ, unsigned blockX, unsigned blockY, unsigned blockZ, unsigned sharedMemBytes, const std::vector<const 
void *> &parameters) + { + checkCudaCall(cuLaunchCooperativeKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, sharedMemBytes, _obj, const_cast<void **>(&parameters[0]))); + } +#endif + + void query() + { + checkCudaCall(cuStreamQuery(_obj)); // unsuccessful result throws cu::Error + } + + void synchronize() + { + checkCudaCall(cuStreamSynchronize(_obj)); + } + + void waitEvent(Event &event) + { + checkCudaCall(cuStreamWaitEvent(_obj, event, 0)); + } + + void addCallback(CUstreamCallback callback, void *userData, int flags = 0) + { + checkCudaCall(cuStreamAddCallback(_obj, callback, userData, flags)); + } + + void record(Event &event) + { + checkCudaCall(cuEventRecord(event, _obj)); + } + + void batchMemOp(unsigned count, CUstreamBatchMemOpParams *paramArray, unsigned flags) + { + checkCudaCall(cuStreamBatchMemOp(_obj, count, paramArray, flags)); + } + + void waitValue32(CUdeviceptr addr, cuuint32_t value, unsigned flags) const + { + checkCudaCall(cuStreamWaitValue32(_obj, addr, value, flags)); + } + + void writeValue32(CUdeviceptr addr, cuuint32_t value, unsigned flags) + { + checkCudaCall(cuStreamWriteValue32(_obj, addr, value, flags)); + } + }; + +#if 1 + class Graph : public Wrapper<CUgraph> + { + public: + class GraphNode : public Wrapper<CUgraphNode> + { + }; + + class ExecKernelNode : public GraphNode + { + }; + + class KernelNodeParams : public Wrapper<CUDA_KERNEL_NODE_PARAMS> + { + public: + KernelNodeParams(const Function &function, + unsigned gridDimX, unsigned gridDimY, unsigned gridDimZ, + unsigned blockDimX, unsigned blockDimY, unsigned blockDimZ, + unsigned sharedMemBytes, + const std::vector<const void *> &kernelParams) + { + _obj.func = function; + _obj.blockDimX = blockDimX; + _obj.blockDimY = blockDimY; + _obj.blockDimZ = blockDimZ; + _obj.gridDimX = gridDimX; + _obj.gridDimY = gridDimY; + _obj.gridDimZ = gridDimZ; + _obj.sharedMemBytes = sharedMemBytes; + _obj.kernelParams = const_cast<void **>(kernelParams.data()); + _obj.extra = nullptr; + 
} + }; + + class Exec : public Wrapper<CUgraphExec> + { + public: + void launch(Stream &stream) + { + checkCudaCall(cuGraphLaunch(_obj, stream)); + } + }; + + Graph(unsigned flags = 0) + { + checkCudaCall(cuGraphCreate(&_obj, flags)); + } + + Graph(CUgraph &graph) + : + Wrapper(graph) + { + } + + ~Graph() + { + if (hasOwnership) + checkCudaCall(cuGraphDestroy(_obj)); + } + + ExecKernelNode addKernelNode(/* std::vector<GraphNode> dependencies, */ const KernelNodeParams &kernelArgs) + { + ExecKernelNode node; + checkCudaCall(cuGraphAddKernelNode(& (CUgraphNode &) node, _obj, nullptr, 0, & (const CUDA_KERNEL_NODE_PARAMS &) kernelArgs)); + return node; + } + + Exec instantiate() + { + Exec exec; + checkCudaCall(cuGraphInstantiate(& (CUgraphExec &) exec, _obj, nullptr, nullptr, 0)); + return exec; + } + }; +#endif + + + inline void Event::record(Stream &stream) + { + checkCudaCall(cuEventRecord(_obj, stream._obj)); + } +} + +#endif diff --git a/util/multi_array.h b/util/multi_array.h new file mode 100644 index 0000000000000000000000000000000000000000..1db0ea5a62be9a2acc5d66d8b5128a14641db4e8 --- /dev/null +++ b/util/multi_array.h @@ -0,0 +1,63 @@ +#if !defined MULTI_ARRAY_H +#define MULTI_ARRAY_H + +// Experimental class that provides multi-dimensional arrays, similar to +// boost::multi_array[_ref], but much simpler. 
+ +namespace multi_array { + template <std::size_t dim> class extent { + public: + extent(std::size_t size, const extent<dim - 1> &rhs) : size(rhs.size * size), next_extent(extent<dim - 1>(size, rhs.next_extent)) {} + extent<dim + 1> operator [] (std::size_t size) const { return extent<dim + 1>(size, *this); } + + const std::size_t size; + const extent<dim - 1> next_extent; + }; + + + template <> class extent<0> { + public: + extent() {} + template <typename T> extent(std::size_t, T) {} + extent<1> operator [] (std::size_t size) const { return extent<1>(size, *this); } + + static const std::size_t size = 1; + static const struct {} next_extent; + }; + + + const static extent<0> extents; + + + template<typename T, std::size_t dim> class array_ref { + public: + array_ref(T &ref, const extent<dim> &extents) : ref(ref), extents(extents) {} + template <std::size_t d = dim> typename std::enable_if<d == 1, T &>::type operator [] (std::size_t index) const { return (&ref)[index]; } + template <std::size_t d = dim> typename std::enable_if<d != 1, array_ref<T, dim - 1>>::type operator [] (std::size_t index) const { return array_ref<T, dim - 1>((&ref)[index * extents.next_extent.size], extents.next_extent); } + T *begin() const { return &ref; } + T *end() const { return &ref + extents.size; } + std::size_t bytesize() const { return sizeof(T) * extents.size; } + + const extent<dim> extents; + + private: + T &ref; + }; + + + namespace detail { + template <typename T> class array_base { + protected: + array_base(std::size_t size) : vec(size) {} + std::vector<T> vec; + }; + } + + + template<typename T, std::size_t dim> class array : private detail::array_base<T>, public array_ref<T, dim> { + public: + array(const extent<dim> &extents) : detail::array_base<T>(extents.size), array_ref<T, dim>(*detail::array_base<T>::vec.data(), extents) {} + }; +} + +#endif diff --git a/util/nvrtc.cc b/util/nvrtc.cc new file mode 100644 index 
0000000000000000000000000000000000000000..b6ac88599a78ba2cad86136e2013674d52b30943 --- /dev/null +++ b/util/nvrtc.cc @@ -0,0 +1,11 @@ +#include "nvrtc.h" + + +namespace nvrtc { + +const char *Error::what() const noexcept +{ + return nvrtcGetErrorString(_result); +} + +} diff --git a/util/nvrtc.h b/util/nvrtc.h new file mode 100644 index 0000000000000000000000000000000000000000..f429a737bb5f8bdd05fb2c798bb970c50ad4b924 --- /dev/null +++ b/util/nvrtc.h @@ -0,0 +1,109 @@ +#if !defined NVRTC_H +#define NVRTC_H + +#include <cuda.h> +#include <nvrtc.h> + +#include <algorithm> +#include <exception> +#include <fstream> +#include <iterator> +#include <string> +#include <vector> + + +namespace nvrtc { + class Error : public std::exception { + public: + Error(nvrtcResult result) + : + _result(result) + { + } + + virtual const char *what() const noexcept; + + operator nvrtcResult () const + { + return _result; + } + + private: + nvrtcResult _result; + }; + + + inline void checkNvrtcCall(nvrtcResult result) + { + if (result != NVRTC_SUCCESS) + throw Error(result); + } + + + class Program { + public: + Program(const std::string &src, const std::string &name, int numHeaders = 0, const char *headers[] = nullptr, const char *includeNames[] = nullptr) // TODO: use std::vector<std::string> + { + checkNvrtcCall(nvrtcCreateProgram(&program, src.c_str(), name.c_str(), numHeaders, headers, includeNames)); + } + + Program(const std::string &filename) + { + std::ifstream ifs(filename); + std::string source(std::istreambuf_iterator<char>{ifs}, {}); + checkNvrtcCall(nvrtcCreateProgram(&program, source.c_str(), filename.c_str(), 0, nullptr, nullptr)); + } + + ~Program() + { + checkNvrtcCall(nvrtcDestroyProgram(&program)); + } + + void compile(const std::vector<std::string> &options) + { + std::vector<const char *> c_options; + std::transform(options.begin(), options.end(), std::back_inserter(c_options), [] (const std::string &option) { return option.c_str();}); + 
checkNvrtcCall(nvrtcCompileProgram(program, c_options.size(), c_options.data())); + } + + std::string getPTX() + { + size_t size; + std::string ptx; + + checkNvrtcCall(nvrtcGetPTXSize(program, &size)); + ptx.resize(size); + checkNvrtcCall(nvrtcGetPTX(program, &ptx[0])); + return ptx; + } + +#if CUDA_VERSION >= 11020 + std::vector<char> getCUBIN() + { + size_t size; + std::vector<char> cubin; + + checkNvrtcCall(nvrtcGetCUBINSize(program, &size)); + cubin.resize(size); + checkNvrtcCall(nvrtcGetCUBIN(program, &cubin[0])); + return cubin; + } +#endif + + std::string getLog() + { + size_t size; + std::string log; + + checkNvrtcCall(nvrtcGetProgramLogSize(program, &size)); + log.resize(size); + checkNvrtcCall(nvrtcGetProgramLog(program, &log[0])); + return log; + } + + private: + nvrtcProgram program; + }; +} + +#endif