Skip to content
Snippets Groups Projects
Commit 7a9dd8b2 authored by John Romein's avatar John Romein
Browse files

Initial commit.

parent a5ec00d9
No related branches found
No related tags found
No related merge requests found
Showing
with 2191 additions and 0 deletions
Makefile 0 → 100644
VERSION= 0.5
#CUDA= /cm/shared/package/cuda100/toolkit/10.0.130
CUDA= $(shell dirname `dirname \`which nvcc\``)
#CUDA= /usr/local/cuda
#POWER_SENSOR= $(HOME)/projects/libpowersensor-master/build
ARCH= $(shell arch)
CC= gcc
CXX= g++ #-Wno-deprecated-declarations
NVCC= nvcc
INCLUDES= -I.
#INCLUDES+= -I$(CUDA)/include
#INCLUDES+= -I$(POWER_SENSOR)/include
CXXFLAGS+= -std=c++11 -O3 -g -fpic -fopenmp $(INCLUDES) -DNDEBUG
NVCCFLAGS= $(INCLUDES)
#CXXFLAGS+= -march=core-avx2 -mcmodel=medium
LIBTCC_SOURCES= util/cu.cc\
util/nvrtc.cc\
libtcc/CorrelatorKernel.cc\
libtcc/Correlator.cc\
libtcc/Kernel.cc
CORRELATOR_TEST_SOURCES=test/CorrelatorTest/CorrelatorTest.cc\
test/CorrelatorTest/Options.cc\
test/Common/Record.cc\
test/Common/UnitTest.cc
OPENCL_TEST_SOURCES= test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc
SIMPLE_EXAMPLE_SOURCES= test/SimpleExample/SimpleExample.cu
LIBTCC_OBJECTS= $(LIBTCC_SOURCES:%.cc=%.o) libtcc/TCCorrelator.o
SIMPLE_EXAMPLE_OBJECTS= $(SIMPLE_EXAMPLE_SOURCES:%.cu=%.o)
CORRELATOR_TEST_OBJECTS=$(CORRELATOR_TEST_SOURCES:%.cc=%.o)
OPENCL_TEST_OBJECTS= $(OPENCL_TEST_SOURCES:%.cc=%.o)
OBJECTS= $(LIBTCC_OBJECTS)\
$(SIMPLE_EXAMPLE_OBJECTS)\
$(CORRELATOR_TEST_OBJECTS)\
$(OPENCL_TEST_OBJECTS)
SHARED_OBJECTS= libtcc/libtcc.so libtcc/libtcc.so.$(VERSION)
DEPENDENCIES= $(OBJECTS:%.o=%.d)
EXECUTABLES= test/SimpleExample/SimpleExample\
test/CorrelatorTest/CorrelatorTest\
test/OpenCLCorrelatorTest/OpenCLCorrelatorTest
LIBRARIES= -L$(CUDA)/lib64 \
-L$(CUDA)/lib64/stubs -lcuda -lnvrtc #\
#-L$(POWER_SENSOR)/lib -lpowersensor #-lnvidia-ml
%.d: %.cc
-$(CXX) $(CXXFLAGS) -MM -MT $@ -MT ${@:%.d=%.o} -MT ${@:%.d=%.s} $< -o $@
%.d: %.cu
-$(CXX) -x c++ $(CXXFLAGS) -MM -MT $@ -MT ${@:%.d=%.o} -MT ${@:%.d=%.s} $< -o $@
%.o: %.cc
$(CXX) $(CXXFLAGS) -o $@ -c $<
%.o: %.cu
$(NVCC) $(NVCCFLAGS) -o $@ -c $<
%.s: %.cc
$(CXX) $(CXXFLAGS) -o $@ -S $<
%.so: %.so.$(VERSION)
rm -f $@
ln -s $(@F).$(VERSION) $@
all:: $(EXECUTABLES)
clean::
$(RM) $(OBJECTS) $(SHARED_OBJECTS) $(DEPENDENCIES) $(EXECUTABLES)
libtcc/TCCorrelator.o: libtcc/TCCorrelator.cu # CUDA code embedded in object file
ld -r -b binary -o $@ $<
libtcc/TCCorrelator.d:
-
libtcc/libtcc.so.$(VERSION): $(LIBTCC_OBJECTS)
$(CXX) -shared -o $@ -Wl,-soname=$@ $^ $(LIBRARIES)
test/SimpleExample/SimpleExample: $(SIMPLE_EXAMPLE_OBJECTS) libtcc/libtcc.so
$(NVCC) $(NVCCFLAGS) -o $@ $(SIMPLE_EXAMPLE_OBJECTS) -Xlinker -rpath=. -Llibtcc -ltcc $(LIBRARIES)
test/CorrelatorTest/CorrelatorTest: $(CORRELATOR_TEST_OBJECTS) libtcc/libtcc.so
$(CXX) $(CXXFLAGS) -o $@ $(CORRELATOR_TEST_OBJECTS) -Wl,-rpath=. -Llibtcc -ltcc $(LIBRARIES)
test/OpenCLCorrelatorTest/OpenCLCorrelatorTest: $(OPENCL_TEST_OBJECTS)
$(CXX) $(CXXFLAGS) -o $@ $(OPENCL_TEST_OBJECTS) -L$(CUDA)/lib64 -lOpenCL
ifeq (0, $(words $(findstring $(MAKECMDGOALS), clean)))
-include $(DEPENDENCIES)
endif
#include "libtcc/Correlator.h"
#include <iostream>
#define GNU_SOURCE
#include <link.h>
extern const char _binary_libtcc_TCCorrelator_cu_start, _binary_libtcc_TCCorrelator_cu_end;
namespace tcc {
std::string Correlator::findNVRTCincludePath() const
{
std::string path;
if (dl_iterate_phdr([] (struct dl_phdr_info *info, size_t, void *arg) -> int
{
std::string &path = *static_cast<std::string *>(arg);
path = info->dlpi_name;
return path.find("libnvrtc.so") != std::string::npos;
}, &path))
{
path.erase(path.find_last_of("/")); // remove library name
path.erase(path.find_last_of("/")); // remove /lib64
path += "/include";
}
return path;
}
Correlator::Correlator(unsigned nrBits,
unsigned nrReceivers,
unsigned nrChannels,
unsigned nrSamplesPerChannel,
unsigned nrPolarizations,
unsigned nrReceiversPerBlock
)
:
correlatorModule(compileModule(nrBits, nrReceivers, nrChannels, nrSamplesPerChannel, nrPolarizations, nrReceiversPerBlock)),
correlatorKernel(correlatorModule, nrBits, nrReceivers, nrChannels, nrSamplesPerChannel, nrPolarizations, nrReceiversPerBlock)
{
}
cu::Module Correlator::compileModule(unsigned nrBits,
unsigned nrReceivers,
unsigned nrChannels,
unsigned nrSamplesPerChannel,
unsigned nrPolarizations,
unsigned nrReceiversPerBlock
)
{
cu::Device device(cu::Context::getCurrent().getDevice());
int capability = 10 * device.getAttribute<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>() + device.getAttribute<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>();
std::vector<std::string> options =
{
"-I" + findNVRTCincludePath(),
"-std=c++11",
"-arch=compute_" + std::to_string(capability),
"-lineinfo",
"-DNR_BITS=" + std::to_string(nrBits),
"-DNR_RECEIVERS=" + std::to_string(nrReceivers),
"-DNR_CHANNELS=" + std::to_string(nrChannels),
"-DNR_SAMPLES_PER_CHANNEL=" + std::to_string(nrSamplesPerChannel),
"-DNR_POLARIZATIONS=" + std::to_string(nrPolarizations),
"-DNR_RECEIVERS_PER_BLOCK=" + std::to_string(nrReceiversPerBlock),
};
//std::for_each(options.begin(), options.end(), [] (const std::string &e) { std::cout << e << ' '; }); std::cout << std::endl;
#if 0
nvrtc::Program program("tcc/TCCorrelator.cu");
#else
// embed the CUDA source code in libtcc.so, so that it need not be installed separately
// for runtime compilation
// copy into std::string for '\0' termination
std::string source(&_binary_libtcc_TCCorrelator_cu_start, &_binary_libtcc_TCCorrelator_cu_end);
nvrtc::Program program(source, "TCCorrelator.cu");
#endif
try {
program.compile(options);
} catch (nvrtc::Error &error) {
std::cerr << program.getLog();
throw;
}
//std::ofstream cubin("out.ptx");
//cubin << program.getPTX().data();
return cu::Module((void *) program.getPTX().data());
}
void Correlator::launchAsync(cu::Stream &stream, cu::DeviceMemory &visibilities, cu::DeviceMemory &samples)
{
correlatorKernel.launchAsync(stream, visibilities, samples);
}
void Correlator::launchAsync(CUstream stream, CUdeviceptr visibilities, CUdeviceptr samples)
{
cu::Stream _stream(stream);
cu::DeviceMemory _visibilities(visibilities);
cu::DeviceMemory _samples(samples);
correlatorKernel.launchAsync(_stream, _visibilities, _samples);
}
uint64_t Correlator::FLOPS() const
{
return correlatorKernel.FLOPS();
}
}
#if !defined CORRELATOR_H
#define CORRELATOR_H
#include "libtcc/CorrelatorKernel.h"
#include "util/cu.h"
#include "util/nvrtc.h"
namespace tcc {
class Correlator {
public:
Correlator(unsigned nrBits,
unsigned nrReceivers,
unsigned nrChannels,
unsigned nrSamplesPerChannel,
unsigned nrPolarizations = 2,
unsigned nrReceiversPerBlock = 64
); // throw (cu::Error, nvrtc::Error)
void launchAsync(cu::Stream &, cu::DeviceMemory &isibilities, cu::DeviceMemory &samples); // throw (cu::Error)
void launchAsync(CUstream, CUdeviceptr visibilities, CUdeviceptr samples); // throw (cu::Error)
uint64_t FLOPS() const;
private:
std::string findNVRTCincludePath() const;
cu::Module compileModule(unsigned nrBits,
unsigned nrReceivers,
unsigned nrChannels,
unsigned nrSamplesPerChannel,
unsigned nrPolarizations,
unsigned nrReceiversPerBlock
);
cu::Module correlatorModule;
CorrelatorKernel correlatorKernel;
};
}
#endif
#include "libtcc/CorrelatorKernel.h"
namespace tcc {
CorrelatorKernel::CorrelatorKernel(cu::Module &module,
unsigned nrBits,
unsigned nrReceivers,
unsigned nrChannels,
unsigned nrSamplesPerChannel,
unsigned nrPolarizations,
unsigned nrReceiversPerBlock
)
:
Kernel(module, "correlate"),
nrBits(nrBits),
nrReceivers(nrReceivers),
nrChannels(nrChannels),
nrSamplesPerChannel(nrSamplesPerChannel),
nrPolarizations(nrPolarizations),
nrReceiversPerBlock(nrReceiversPerBlock)
{
unsigned blocksPerDim = (nrReceivers + nrReceiversPerBlock - 1) / nrReceiversPerBlock;
nrThreadBlocksPerChannel = nrReceiversPerBlock == 64 ? blocksPerDim * blocksPerDim : blocksPerDim * (blocksPerDim + 1) / 2;
}
void CorrelatorKernel::launchAsync(cu::Stream &stream, cu::DeviceMemory &deviceVisibilities, cu::DeviceMemory &deviceSamples)
{
std::vector<const void *> parameters = { deviceVisibilities.parameter(), deviceSamples.parameter() };
stream.launchKernel(function,
nrThreadBlocksPerChannel, nrChannels, 1,
32, 2, 2,
0, parameters);
}
uint64_t CorrelatorKernel::FLOPS() const
{
return 8ULL * nrReceivers * nrReceivers / 2 * nrPolarizations * nrPolarizations * nrChannels * nrSamplesPerChannel;
}
}
#if !defined CORRELATOR_KERNEL_H
#define CORRELATOR_KERNEL_H
#include "libtcc/Kernel.h"
namespace tcc {
class CorrelatorKernel : public Kernel
{
public:
CorrelatorKernel(cu::Module &module,
unsigned nrBits,
unsigned nrReceivers,
unsigned nrChannels,
unsigned nrSamplesPerChannel,
unsigned nrPolarizations = 2,
unsigned nrReceiversPerBlock = 64
);
void launchAsync(cu::Stream &, cu::DeviceMemory &deviceVisibilities, cu::DeviceMemory &deviceSamples);
virtual uint64_t FLOPS() const;
private:
const unsigned nrBits;
const unsigned nrReceivers;
const unsigned nrChannels;
const unsigned nrSamplesPerChannel;
const unsigned nrPolarizations;
const unsigned nrReceiversPerBlock;
unsigned nrThreadBlocksPerChannel;
};
}
#endif
#include "libtcc/Kernel.h"
namespace tcc {
Kernel::Kernel(cu::Module &module, const char *name)
:
module(module),
function(module, name)
{
}
}
#if !defined KERNEL_H
#define KERNEL_H
#include "util/cu.h"
#include <stdint.h>
namespace tcc {
class Kernel
{
public:
Kernel(cu::Module &, const char *name);
virtual uint64_t FLOPS() const = 0;
protected:
cu::Module &module;
cu::Function function;
};
}
#endif
This diff is collapsed.
File added
This diff is collapsed.
#if !defined COMPLEX_INT4_T
#define COMPLEX_INT4_T
#include <complex>
#include <limits>
class complex_int4_t
{
public:
complex_int4_t() {}
complex_int4_t(int real, int imag) { value = (imag << 4) | (real & 0xF); }
complex_int4_t operator = (const complex_int4_t &other) { value = other.value; return *this; }
int real() const { return value << (std::numeric_limits<int>::digits - 4) >> (std::numeric_limits<int>::digits - 4); }
int imag() const { return value << (std::numeric_limits<int>::digits - 8) >> (std::numeric_limits<int>::digits - 4); }
operator std::complex<int> () const { return std::complex<int>(real(), imag()); }
private:
char value;
};
#endif
#if !defined CONFIG_H
#define CONFIG_H
#if defined __ARM_ARCH
#define UNIFIED_MEMORY // assume this is a Jetson Xavier
#endif
#undef MEASURE_POWER
#define POL_X 0
#define POL_Y 1
#define NR_POLARIZATIONS 2
#if !defined NR_TIMES
#define NR_TIMES 768
#endif
#define REAL 0
#define IMAG 1
#define COMPLEX 2
#endif
#include "test/Common/Record.h"
#if defined MEASURE_POWER
Record::Record(powersensor::PowerSensor &powerSensor)
:
powerSensor(powerSensor)
{
}
#endif
#if defined MEASURE_POWER
void Record::getPower(CUstream, CUresult, void *userData)
{
Record *record = (Record *) userData;
record->state = record->powerSensor.read();
}
#endif
void Record::enqueue(cu::Stream &stream)
{
stream.record(event); // if this is omitted, the callback takes ~100 ms ?!
#if defined MEASURE_POWER
#if !defined TEGRA_QUIRKS
stream.addCallback(&Record::getPower, this); // does not work well on Tegra
#else
stream.synchronize();
state = powerSensor.read();
#endif
#endif
}
#if !defined RECORD_H
#define RECORD_H
#include "test/Common/Config.h"
#include "util/cu.h"
#if defined MEASURE_POWER
#include <powersensor/NVMLPowerSensor.h>
#endif
struct Record
{
public:
#if defined MEASURE_POWER
Record(powersensor::PowerSensor &);
#endif
void enqueue(cu::Stream &);
mutable cu::Event event;
#if defined MEASURE_POWER
powersensor::PowerSensor &powerSensor;
powersensor::State state;
private:
static void getPower(CUstream, CUresult, void *userData);
#endif
};
#endif
#include "test/Common/UnitTest.h"
#include <iostream>
UnitTest::UnitTest(unsigned deviceNumber)
:
device(deviceNumber),
context(CU_CTX_SCHED_BLOCKING_SYNC, device)
#if defined MEASURE_POWER
, powerSensor(powersensor::nvml::NVMLPowerSensor::create(0))
#endif
{
#pragma omp critical (clog)
std::clog << "running test on " << device.getName() << std::endl;
#if 0 && defined MEASURE_POWER
powerSensor->dump("/tmp/sensor_readings");
#endif
}
UnitTest::~UnitTest()
{
#if defined MEASURE_POWER
delete powerSensor;
#endif
}
void UnitTest::report(const char *name, const Record &startRecord, const Record &stopRecord, uint64_t FLOPS, uint64_t bytes)
{
#if defined MEASURE_POWER
//powerSensor->mark(startRecord.state, name);
double Watt = powersensor::PowerSensor::Watt(startRecord.state, stopRecord.state);
#endif
double runtime = stopRecord.event.elapsedTime(startRecord.event) * 1e-3;
#pragma omp critical (cout)
{
std::cout << name << ": " << runtime << " s";
if (FLOPS != 0)
std::cout << ", " << FLOPS / runtime * 1e-12 << " TOPS";
if (bytes != 0)
std::cout << ", " << bytes / runtime * 1e-9 << " GB/s";
#if defined MEASURE_POWER
std::cout << ", " << Watt << " W";
if (FLOPS != 0)
std::cout << ", " << FLOPS / runtime / Watt * 1e-9 << " GOPS/W";
#endif
std::cout << std::endl;
}
}
#if !defined UNIT_TEST_H
#define UNIT_TEST_H
#include "test/Common/Record.h"
#include "util/cu.h"
#if defined MEASURE_POWER
#include <powersensor/NVMLPowerSensor.h>
#endif
class UnitTest
{
public:
UnitTest(unsigned deviceNumber);
~UnitTest();
protected:
void report(const char *name, const Record &startRecord, const Record &stopRecord, uint64_t FLOPS = 0, uint64_t bytes = 0);
cu::Device device;
cu::Context context;
cu::Stream stream;
#if defined MEASURE_POWER
powersensor::PowerSensor *powerSensor;
#endif
};
#endif
#include "test/Common/ComplexInt4.h"
#include "test/Common/Record.h"
#include "test/CorrelatorTest/CorrelatorTest.h"
#include "util/ExceptionPropagator.h"
#include "util/nvrtc.h"
#include <cstdlib>
#include <cstring>
#include <iostream>
#define GNU_SOURCE
#include <link.h>
#include <omp.h>
CorrelatorTest::CorrelatorTest(const Options &options)
:
UnitTest(options.deviceNumber),
options(options),
correlator(options.nrBits, options.nrReceivers, options.nrChannels, options.nrSamplesPerChannel, options.nrPolarizations, options.nrReceiversPerBlock)
{
#if defined MEASURE_POWER
Record start(*powerSensor), stop(*powerSensor);
#else
Record start, stop;
#endif
start.enqueue(stream);
switch (options.nrBits) {
case 4 : doTest<complex_int4_t, std::complex<int32_t>>();
break;
case 8 : doTest<std::complex<int8_t>, std::complex<int32_t>>();
break;
case 16 : doTest<std::complex<__half>, std::complex<float>>();
break;
}
stop.enqueue(stream);
stream.synchronize();
report("total ", start, stop, options.innerRepeatCount * options.outerRepeatCount * correlator.FLOPS());
}
template <typename SampleType, typename VisibilityType> void CorrelatorTest::doTest()
{
omp_set_nested(1);
ExceptionPropagator ep;
#pragma omp parallel num_threads(2)
ep([&] () {
context.setCurrent();
multi_array::extent<5> samplesExtent(multi_array::extents[options.nrChannels][options.nrSamplesPerChannel / options.nrTimesPerBlock][options.nrReceivers][options.nrPolarizations][options.nrTimesPerBlock]);
multi_array::extent<4> visibilitiesExtent(multi_array::extents[options.nrChannels][options.nrBaselines()][options.nrPolarizations][options.nrPolarizations]);
cu::HostMemory hostSamples(sizeof(SampleType) * samplesExtent.size, CU_MEMHOSTALLOC_WRITECOMBINED);
cu::HostMemory hostVisibilities(sizeof(VisibilityType) * visibilitiesExtent.size);
#if defined UNIFIED_MEMORY
cu::DeviceMemory deviceSamples(hostSamples);
cu::DeviceMemory deviceVisibilities(hostVisibilities);
#else
cu::DeviceMemory deviceSamples(sizeof(SampleType) * samplesExtent.size);
cu::DeviceMemory deviceVisibilities(sizeof(VisibilityType) * visibilitiesExtent.size);
cu::Stream hostToDeviceStream, deviceToHostStream;
#endif
multi_array::array_ref<SampleType, 5> samplesRef(* (SampleType *) hostSamples, samplesExtent);
multi_array::array_ref<VisibilityType, 4> visibilitiesRef(* (VisibilityType *) hostVisibilities, visibilitiesExtent);
setTestPattern<SampleType>(samplesRef);
#pragma omp for schedule(dynamic), ordered
for (int i = 0; i < options.outerRepeatCount; i ++)
if (!ep)
ep([&] () {
#if !defined UNIFIED_MEMORY
cu::Event inputDataTransferred, executeFinished, executeFinished2;
#endif
#if defined MEASURE_POWER
Record hostToDeviceRecordStart(*powerSensor), hostToDeviceRecordStop(*powerSensor);
Record computeRecordStart(*powerSensor), computeRecordStop(*powerSensor);
Record deviceToHostRecordStart(*powerSensor), deviceToHostRecordStop(*powerSensor);
#else
Record hostToDeviceRecordStart, hostToDeviceRecordStop;
Record computeRecordStart, computeRecordStop;
Record deviceToHostRecordStart, deviceToHostRecordStop;
#endif
#pragma omp critical (GPU) // TODO: use multiple locks when using multiple GPUs
ep([&] () {
#if !defined UNIFIED_MEMORY
hostToDeviceRecordStart.enqueue(hostToDeviceStream);
hostToDeviceStream.memcpyHtoDAsync(deviceSamples, hostSamples, samplesRef.bytesize());
hostToDeviceRecordStop.enqueue(hostToDeviceStream);
stream.waitEvent(hostToDeviceRecordStop.event);
#endif
computeRecordStart.enqueue(stream);
for (unsigned j = 0; j < options.innerRepeatCount; j ++)
correlator.launchAsync(stream, deviceVisibilities, deviceSamples);
computeRecordStop.enqueue(stream);
#if !defined UNIFIED_MEMORY
deviceToHostStream.waitEvent(computeRecordStop.event);
deviceToHostRecordStart.enqueue(deviceToHostStream);
deviceToHostStream.memcpyDtoHAsync(hostVisibilities, deviceVisibilities, visibilitiesRef.bytesize());
deviceToHostRecordStop.enqueue(deviceToHostStream);
#endif
});
#if !defined UNIFIED_MEMORY
deviceToHostStream.synchronize();
#else
stream.synchronize();
#endif
if (i == options.outerRepeatCount - 1 && options.verifyOutput)
verifyOutput<SampleType, VisibilityType>(samplesRef, visibilitiesRef);
#if !defined UNIFIED_MEMORY
report("host-to-device ", hostToDeviceRecordStart, hostToDeviceRecordStop, 0, samplesRef.bytesize());
#endif
report("correlate-total ", computeRecordStart, computeRecordStop, options.innerRepeatCount * correlator.FLOPS());
#if !defined UNIFIED_MEMORY
report("device-to-host ", deviceToHostRecordStart, deviceToHostRecordStop, 0, visibilitiesRef.bytesize());
#endif
});
});
}
template<typename SampleType> void CorrelatorTest::setTestPattern(const multi_array::array_ref<SampleType, 5> &samples)
{
#if 0
memset(samples.begin(), 0, samples.bytesize());
unsigned channel = options.nrChannels / 3;
unsigned time = options.nrSamplesPerChannel / 5;
unsigned recv0 = options.nrReceivers > 174 ? 174 : options.nrReceivers / 3;
unsigned recv1 = options.nrReceivers > 418 ? 418 : options.nrReceivers / 2;
samples[channel][time / options.nrTimesPerBlock][recv0][POL_X][time % options.nrTimesPerBlock] = SampleType(2.0, 3.0);
samples[channel][time / options.nrTimesPerBlock][recv1][POL_X][time % options.nrTimesPerBlock] = SampleType(4.0, 5.0);
#else
SampleType randomValues[7777]; // use a limited set of random numbers to save time
for (unsigned i = 0; i < 7777; i ++)
randomValues[i] = randomValue<SampleType>();
unsigned i = 0;
for (SampleType &sample : samples)
sample = randomValues[i ++ % 7777U];
#endif
}
template<typename SampleType, typename VisibilityType> void CorrelatorTest::verifyOutput(const multi_array::array_ref<SampleType, 5> &samples, const multi_array::array_ref<VisibilityType, 4> &visibilities) const
{
std::atomic<int> count(0);
ExceptionPropagator ep;
#if 1
std::cout << "verifying ..." << std::endl;
#pragma omp parallel for schedule (dynamic)
for (unsigned channel = 0; channel < options.nrChannels; channel ++)
ep([&] () {
multi_array::array<VisibilityType, 3> sum(multi_array::extents[options.nrBaselines()][options.nrPolarizations][options.nrPolarizations]);
memset(sum.begin(), 0, sum.bytesize());
for (unsigned major_time = 0; major_time < options.nrSamplesPerChannel / options.nrTimesPerBlock; major_time ++) {
multi_array::array_ref<SampleType, 3> ref = samples[channel][major_time];
for (unsigned recv1 = 0, baseline = 0; recv1 < options.nrReceivers; recv1 ++)
for (unsigned recv0 = 0; recv0 <= recv1; recv0 ++, baseline ++)
for (unsigned minor_time = 0; minor_time < options.nrTimesPerBlock; minor_time ++)
for (unsigned pol0 = 0; pol0 < options.nrPolarizations; pol0 ++)
for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++) {
SampleType sample0 = ref[recv0][pol0][minor_time];
SampleType sample1 = ref[recv1][pol1][minor_time];
sum[baseline][pol1][pol0] += VisibilityType(sample1.real(), sample1.imag()) * conj(VisibilityType(sample0.real(), sample0.imag()));
}
}
for (unsigned baseline = 0; baseline < options.nrBaselines(); baseline ++)
for (unsigned pol0 = 0; pol0 < options.nrPolarizations; pol0 ++)
for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++)
if (!approximates(visibilities[channel][baseline][pol1][pol0], sum[baseline][pol1][pol0]) && ++ count < 100)
#pragma omp critical (cout)
ep([&] () {
std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol1 << "][" << pol0 << "], expected " << sum[baseline][pol1][pol0] << ", got " << visibilities[channel][baseline][pol1][pol0] << std::endl;
});
});
std::cout << "#errors = " << count << std::endl;
#else
for (unsigned channel = 0; channel < options.nrChannels; channel ++)
for (unsigned baseline = 0; baseline < options.nrBaselines(); baseline ++)
for (unsigned pol0 = 0; pol0 < options.nrPolarizations; pol0 ++)
for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++)
if (visibilities[channel][baseline][pol0][pol1] != (VisibilityType)(0, 0))
if (count ++ < 10)
std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol0 << "][" << pol1 << "] = " << visibilities[channel][baseline][pol0][pol1] << std::endl;
#endif
}
int main(int argc, char *argv[])
{
try {
cu::init();
Options options(argc, argv);
CorrelatorTest test(options);
} catch (cu::Error &error) {
std::cerr << "cu::Error: " << error.what() << std::endl;
} catch (nvrtc::Error &error) {
std::cerr << "nvrtc::Error: " << error.what() << std::endl;
} catch (Options::Error &error) {
std::cerr << "Options::Error: " << error.what() << std::endl;
}
return 0;
}
#if !defined CORRELATOR_TEST_H
#define CORRELATOR_TEST_H
#include "test/Common/ComplexInt4.h"
#include "test/Common/UnitTest.h"
#include "test/CorrelatorTest/Options.h"
#include "libtcc/Correlator.h"
#include "util/multi_array.h"
#include <cuda_fp16.h>
class CorrelatorTest : public UnitTest
{
public:
CorrelatorTest(const Options &);
private:
template<typename SampleType, typename VisibilityType> void doTest();
template<typename SampleType> void setTestPattern(const multi_array::array_ref<SampleType, 5> &samples);
template<typename SampleType, typename VisibilityType> void verifyOutput(const multi_array::array_ref<SampleType, 5> &samples, const multi_array::array_ref<VisibilityType, 4> &visibilities) const;
template<typename SampleType> static SampleType randomValue();
template<typename VisibilityType> bool approximates(const VisibilityType &a, const VisibilityType &b) const;
tcc::Correlator correlator;
Options options;
};
template<> complex_int4_t CorrelatorTest::randomValue<complex_int4_t>()
{
return complex_int4_t(16 * drand48() - 8, 16 * drand48() - 8);
}
template<> std::complex<int8_t> CorrelatorTest::randomValue<std::complex<int8_t>>()
{
return std::complex<int8_t>(256 * drand48() - 128, 256 * drand48() - 128);
}
template<> std::complex<__half> CorrelatorTest::randomValue<std::complex<__half>>()
{
return std::complex<__half>(drand48() - .5, drand48() - .5);
}
template <typename VisibilityType> bool CorrelatorTest::approximates(const VisibilityType &a, const VisibilityType &b) const
{
return a == b;
}
template <> bool CorrelatorTest::approximates(const std::complex<float> &a, const std::complex<float> &b) const
{
float absolute = abs(a - b), relative = abs(a / b);
return (relative > .999 && relative < 1.001) || absolute < .0001 * options.nrSamplesPerChannel;
}
#endif
#include "test/CorrelatorTest/Options.h"
#include <cstdlib>
#include <iostream>
#include <unistd.h>
Options::Options(int argc, char *argv[])
:
nrBits(8),
nrChannels(480),
nrReceivers(576),
nrReceiversPerBlock(64),
nrSamplesPerChannel(3072),
nrTimesPerBlock(128 / nrBits),
innerRepeatCount(1), outerRepeatCount(1),
deviceNumber(0),
verifyOutput(true)
{
opterr = 0;
for (int opt; (opt = getopt(argc, argv, "b:c:d:hn:N:r:R:t:V:")) >= 0;)
switch (opt) {
case 'b' : nrBits = atoi(optarg);
break;
case 'c' : nrChannels = atoi(optarg);
break;
case 'd' : deviceNumber = atoi(optarg);
break;
case 'h' : std::cout << usage(argv[0]) << std::endl;
exit(0);
case 'n' : nrReceivers = atoi(optarg);
break;
case 'N' : nrReceiversPerBlock = atoi(optarg);
break;
case 'r' : innerRepeatCount = atoi(optarg);
break;
case 'R' : outerRepeatCount = atoi(optarg);
break;
case 't' : nrSamplesPerChannel = atoi(optarg);
break;
case 'V' : verifyOutput = atoi(optarg);
break;
default : throw Error(usage(argv[0]));
}
if (nrBits != 4 && nrBits != 8 && nrBits != 16)
throw Error("nrBits must be 4, 8, or 16");
if (nrChannels == 0)
throw Error("nrChannels must be > 0");
if (nrReceivers == 0)
throw Error("nrReceivers must be > 0");
if (nrReceiversPerBlock != 32 && nrReceiversPerBlock != 48 && nrReceiversPerBlock != 64)
throw Error("nrReceiversPerBlock must be 32, 48, or 64");
if (nrSamplesPerChannel == 0)
throw Error("nrSamplesPerChannel must be > 0");
nrTimesPerBlock = 128 / nrBits;
if (nrSamplesPerChannel % nrTimesPerBlock != 0)
throw Error("nrSamplesPerChannel must be a multiple of " + std::to_string(nrTimesPerBlock));
}
std::string Options::usage(const std::string &execName)
{
return "usage: " + execName + " [-b nrBits] [-c nrChannels] [-n nrReceivers] [-N nrReceiversPerBlock] [-r innerRepeatCount] [-R outerRepeatCount] [-t nrSamplesPerChannel] [-V verifyOutput]";
}
const char *Options::Error::what() const noexcept
{
return msg.c_str();
}
#if !defined OPTIONS_H
#define OPTIONS_H
#include <exception>
#include <string>
class Options
{
public:
class Error : public std::exception {
public:
Error(const std::string &msg)
:
msg(msg)
{
}
virtual const char *what() const noexcept;
private:
std::string msg;
};
Options(int argc, char *argv[]);
unsigned nrBaselines() const { return nrReceivers * (nrReceivers + 1) / 2; }
unsigned nrBits;
unsigned nrChannels;
unsigned nrReceivers;
unsigned nrReceiversPerBlock;
unsigned nrSamplesPerChannel;
unsigned nrTimesPerBlock;
unsigned innerRepeatCount, outerRepeatCount;
unsigned deviceNumber;
bool verifyOutput;
static const unsigned nrPolarizations = 2;
private:
static std::string usage(const std::string &execName);
};
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment