Commit 9c578e0f authored by Bram Veenboer's avatar Bram Veenboer

Cleanup of Proxy::do_transform interface

With set_grid and get_grid, it is ambigous to have an additional grid
argument. It is now up to the Proxy to decide how to operate on m_grid.
The legacy version with std::complex<float> pointer is left for the C
interface.
parent 5fde8334
......@@ -329,8 +329,8 @@ void run() {
clog << ">>> Run fft" << endl;
double runtime_fft = -omp_get_wtime();
if (!disable_fft) {
proxy.transform(idg::FourierDomainToImageDomain, *grid);
proxy.transform(idg::ImageDomainToFourierDomain, *grid);
proxy.transform(idg::FourierDomainToImageDomain);
proxy.transform(idg::ImageDomainToFourierDomain);
}
runtimes_fft.push_back(runtime_fft + omp_get_wtime());
clog << endl;
......
......@@ -449,7 +449,7 @@ void run_master() {
// Run FFT
runtimes_grid_fft[cycle] = -omp_get_wtime();
proxy.transform(idg::FourierDomainToImageDomain, *grid);
proxy.transform(idg::FourierDomainToImageDomain);
runtimes_grid_fft[cycle] += omp_get_wtime();
// Reduce grids
......@@ -477,7 +477,7 @@ void run_master() {
// Run FFT
runtimes_grid_fft[cycle] -= omp_get_wtime();
proxy.transform(idg::ImageDomainToFourierDomain, *grid);
proxy.transform(idg::ImageDomainToFourierDomain);
runtimes_grid_fft[cycle] += omp_get_wtime();
}
runtime_imaging += omp_get_wtime();
......@@ -626,7 +626,7 @@ void run_worker() {
grid = proxy.get_grid();
// Run FFT
proxy.transform(idg::FourierDomainToImageDomain, *grid);
proxy.transform(idg::FourierDomainToImageDomain);
// Reduce grids
reduce_grids(grid, rank, world_size);
......@@ -640,7 +640,7 @@ void run_worker() {
proxy.set_grid(grid);
// Run FFT
proxy.transform(idg::ImageDomainToFourierDomain, *grid);
proxy.transform(idg::ImageDomainToFourierDomain);
// Subtract model visibilities
// not implemented
......
......@@ -119,7 +119,7 @@ int test01() {
proxy.gridding(plan, w_offset, shift, cell_size, kernel_size, subgrid_size,
frequencies, visibilities_ref, uvw, baselines, aterms,
aterms_offsets, spheroidal);
proxy.transform(idg::FourierDomainToImageDomain, *grid);
proxy.transform(idg::FourierDomainToImageDomain);
float grid_error = get_accuracy(grid_size * grid_size * nr_correlations,
(std::complex<float> *)grid->data(),
......@@ -128,7 +128,8 @@ int test01() {
// Predict visibilities
clog << ">>> Predict visibilities" << endl;
proxy.transform(idg::ImageDomainToFourierDomain, *grid_ref);
proxy.set_grid(grid_ref);
proxy.transform(idg::ImageDomainToFourierDomain);
// Set reference grid
proxy.set_grid(grid_ref, subgrid_size, image_size, w_step, shift.data());
......
......@@ -21,7 +21,7 @@ namespace cpu {
// Constructor
CPU::CPU(std::vector<std::string> libraries) : kernels(libraries) {
#if defined(DEBUG)
std::cout << __func__ << std::endl;
std::cout << "CPU::" << __func__ << std::endl;
#endif
powerSensor = get_power_sensor(sensor_host);
......@@ -31,7 +31,7 @@ CPU::CPU(std::vector<std::string> libraries) : kernels(libraries) {
// Destructor
CPU::~CPU() {
#if defined(DEBUG)
std::cout << __func__ << std::endl;
std::cout << "CPU::" << __func__ << std::endl;
#endif
// Delete power sensor
......@@ -646,44 +646,58 @@ void CPU::do_calibrate_update_hessian_vector_product2(
station_nr, aterms, derivative_aterms, parameter_vector);
}
void CPU::do_transform(DomainAtoDomainB direction,
Array3D<std::complex<float>> &grid) {
void CPU::do_transform(DomainAtoDomainB direction) {
#if defined(DEBUG)
std::cout << __func__ << std::endl;
std::cout << "FFT (direction: " << direction << ")" << std::endl;
#endif
try {
int sign = (direction == FourierDomainToImageDomain) ? 1 : -1;
const auto& grid = get_grid();
// Constants
auto grid_size = grid.get_x_dim();
unsigned int nr_w_layers = grid->get_w_dim();
unsigned int nr_correlations = grid->get_z_dim();
unsigned int grid_size = grid->get_y_dim();
// Performance measurement
report.initialize(0, 0, grid_size);
kernels.set_report(report);
State states[2];
states[0] = powerSensor->read();
// FFT shift
if (direction == FourierDomainToImageDomain) {
kernels.shift(grid); // TODO: integrate into adder?
} else {
kernels.shift(grid); // TODO: remove
}
for (unsigned int w = 0; w < nr_w_layers; w++)
{
int sign = (direction == FourierDomainToImageDomain) ? 1 : -1;
// Run FFT
kernels.run_fft(grid_size, grid_size, 1, grid.data(), sign);
// Grid pointer
idg::Array3D<std::complex<float>> grid_ptr(grid->data(w), nr_correlations,
grid_size, grid_size);
// FFT shift
if (direction == FourierDomainToImageDomain)
kernels.shift(grid); // TODO: remove
else
kernels.shift(grid); // TODO: integrate into splitter?
// Constants
auto grid_size = grid->get_x_dim();
// End measurement
states[1] = powerSensor->read();
report.update_host(states[0], states[1]);
State states[2];
states[0] = powerSensor->read();
// FFT shift
if (direction == FourierDomainToImageDomain) {
kernels.shift(grid_ptr); // TODO: integrate into adder?
} else {
kernels.shift(grid_ptr); // TODO: remove
}
// Run FFT
kernels.run_fft(grid_size, grid_size, 1, grid->data(), sign);
// FFT shift
if (direction == FourierDomainToImageDomain)
kernels.shift(grid_ptr); // TODO: remove
else
kernels.shift(grid_ptr); // TODO: integrate into splitter?
// End measurement
states[1] = powerSensor->read();
report.update_host(states[0], states[1]);
}
// Report performance
report.print_total();
......
......@@ -122,8 +122,7 @@ class CPU : public Proxy {
const Array4D<Matrix2x2<std::complex<float>>>& derivative_aterms,
Array2D<float>& parameter_vector) override;
virtual void do_transform(DomainAtoDomainB direction,
Array3D<std::complex<float>>& grid) override;
virtual void do_transform(DomainAtoDomainB direction) override;
virtual void do_compute_avg_beam(
const unsigned int nr_antennas, const unsigned int nr_channels,
......
......@@ -18,75 +18,16 @@ Generic::Generic(ProxyInfo info) : CUDA(info) {
#if defined(DEBUG)
std::cout << "Generic::" << __func__ << std::endl;
#endif
// Initialize host PowerSensor
hostPowerSensor = get_power_sensor(sensor_host);
}
// Destructor
Generic::~Generic() { delete hostPowerSensor; }
/* High level routines */
void Generic::do_transform(DomainAtoDomainB direction,
Array3D<std::complex<float>>& grid) {
Generic::~Generic() {
#if defined(DEBUG)
std::cout << __func__ << std::endl;
std::cout << "Transform direction: " << direction << std::endl;
std::cout << "Generic::" << __func__ << std::endl;
#endif
}
// Constants
auto grid_size = grid.get_x_dim();
// Load device
InstanceCUDA& device = get_device(0);
// Initialize
cu::Stream& stream = device.get_execute_stream();
// Device memory
cu::DeviceMemory& d_grid = device.retrieve_device_grid();
// Performance measurements
report.initialize(0, 0, grid_size);
device.set_report(report);
State powerStates[4];
powerStates[0] = hostPowerSensor->read();
powerStates[2] = device.measure();
// Perform fft shift
device.shift(grid);
// Copy grid to device
device.copy_htod(stream, d_grid, grid.data(), grid.bytes());
// Execute fft
device.launch_grid_fft(d_grid, grid_size, direction);
// Copy grid to host
device.copy_dtoh(stream, grid.data(), d_grid, grid.bytes());
stream.synchronize();
// Perform fft shift
device.shift(grid);
// Perform fft scaling
std::complex<float> scale =
std::complex<float>(2.0 / (grid_size * grid_size), 0);
if (direction == FourierDomainToImageDomain) {
device.scale(grid, scale);
}
// End measurements
stream.synchronize();
powerStates[1] = hostPowerSensor->read();
powerStates[3] = device.measure();
// Report performance
report.update_host(powerStates[0], powerStates[1]);
report.print_total();
report.print_device(powerStates[2], powerStates[3]);
} // end transform
/* High level routines */
void Generic::run_gridding(
const Plan& plan, const float w_step, const Array1D<float>& shift,
const float cell_size, const unsigned int kernel_size,
......
......@@ -51,11 +51,6 @@ class Generic : public CUDA {
const Array1D<unsigned int>& aterms_offsets,
const Array2D<float>& spheroidal) override;
virtual void do_transform(DomainAtoDomainB direction,
Array3D<std::complex<float>>& grid) override;
powersensor::PowerSensor* hostPowerSensor;
void run_gridding(
const Plan& plan, const float w_step, const Array1D<float>& shift,
const float cell_size, const unsigned int kernel_size,
......
......@@ -34,74 +34,6 @@ Unified::~Unified() {
#endif
}
void Unified::do_transform(DomainAtoDomainB direction,
Array3D<std::complex<float>>& grid) {
#if defined(DEBUG)
std::cout << "Unified::" << __func__ << std::endl;
cout << "Transform direction: " << direction << endl;
#endif
// TODO: fix this method
// (1) does the m_grid_tiled need to be untiled before use?
// (2) even though m_grid_tiled is Unified Memory, cuFFT fails
// Constants
auto nr_correlations = grid.get_z_dim();
;
auto grid_size = grid.get_x_dim();
// Load device
InstanceCUDA& device = get_device(0);
// Get UnifiedMemory object for grid data
cu::UnifiedMemory u_grid(device.get_context(), m_grid_tiled->data(),
m_grid_tiled->bytes());
// Initialize
cu::Stream& stream = device.get_execute_stream();
// Performance measurements
report.initialize(0, 0, grid_size);
device.set_report(report);
State powerStates[4];
powerStates[0] = hostPowerSensor->read();
powerStates[2] = device.measure();
// Perform fft shift
double time_shift = -omp_get_wtime();
device.shift(grid);
time_shift += omp_get_wtime();
// Execute fft
device.launch_grid_fft_unified(grid_size, nr_correlations, grid, direction);
stream.synchronize();
// Perform fft shift
time_shift = -omp_get_wtime();
device.shift(grid);
time_shift += omp_get_wtime();
// Perform fft scaling
double time_scale = -omp_get_wtime();
complex<float> scale = complex<float>(2.0 / (grid_size * grid_size), 0);
if (direction == FourierDomainToImageDomain) {
device.scale(grid, scale);
}
time_scale += omp_get_wtime();
// End measurements
stream.synchronize();
powerStates[1] = hostPowerSensor->read();
powerStates[3] = device.measure();
// Report performance
report.update_fft_shift(time_shift);
report.update_fft_scale(time_scale);
report.update_host(powerStates[0], powerStates[1]);
report.print_total();
report.print_device(powerStates[2], powerStates[3]);
} // end transform
void Unified::do_gridding(
const Plan& plan,
const float w_step, // in lambda
......
......@@ -52,10 +52,7 @@ class Unified : public Generic {
const Array1D<unsigned int>& aterms_offsets,
const Array2D<float>& spheroidal) override;
virtual void do_transform(DomainAtoDomainB direction,
Array3D<std::complex<float>>& grid) override;
virtual void set_grid(std::shared_ptr<Grid> grid);
virtual void set_grid(std::shared_ptr<Grid> grid) override;
virtual void set_grid(std::shared_ptr<Grid> grid, int subgrid_size,
float image_size, float w_step,
......
......@@ -30,11 +30,15 @@ CUDA::CUDA(ProxyInfo info) : mInfo(info) {
print_devices();
print_compiler_flags();
cuProfilerStart();
// Initialize host PowerSensor
hostPowerSensor = powersensor::get_power_sensor(powersensor::sensor_host);
};
CUDA::~CUDA() {
cuProfilerStop();
free_devices();
delete hostPowerSensor;
}
void CUDA::init_devices() {
......@@ -447,6 +451,76 @@ void CUDA::initialize(
marker.end();
} // end initialize
void CUDA::do_transform(DomainAtoDomainB direction)
{
#if defined(DEBUG)
std::cout << "CUDA::" << __func__ << std::endl;
#endif
const auto& grid = get_grid();
// Constants
unsigned int grid_size = m_grid->get_x_dim();
unsigned int nr_w_layers = m_grid->get_w_dim();
assert(nr_w_layers == 1);
unsigned int nr_correlations = m_grid->get_z_dim();
assert(nr_correlations == 4);
// Grid pointer (4D to 3D, assume nr_w_layers == 1)
idg::Array3D<std::complex<float>> grid_ptr(grid->data(), nr_correlations,
grid_size, grid_size);
// Load device
InstanceCUDA& device = get_device(0);
// Initialize
cu::Stream& stream = device.get_execute_stream();
// Device memory
cu::DeviceMemory& d_grid = device.retrieve_device_grid();
// Performance measurements
report.initialize(0, 0, grid_size);
device.set_report(report);
powersensor::State powerStates[4];
powerStates[0] = hostPowerSensor->read();
powerStates[2] = device.measure();
// Perform fft shift
device.shift(grid_ptr);
// Copy grid to device
device.copy_htod(stream, d_grid, grid->data(), grid->bytes());
// Execute fft
device.launch_grid_fft(d_grid, grid_size, direction);
// Copy grid to host
device.copy_dtoh(stream, grid->data(), d_grid, grid->bytes());
stream.synchronize();
// Perform fft shift
device.shift(grid_ptr);
// Perform fft scaling
std::complex<float> scale =
std::complex<float>(2.0 / (grid_size * grid_size), 0);
if (direction == FourierDomainToImageDomain) {
device.scale(grid_ptr, scale);
}
// End measurements
stream.synchronize();
powerStates[1] = hostPowerSensor->read();
powerStates[3] = device.measure();
// Report performance
report.update_host(powerStates[0], powerStates[1]);
report.print_total();
report.print_device(powerStates[2], powerStates[3]);
}
void CUDA::do_compute_avg_beam(
const unsigned int nr_antennas, const unsigned int nr_channels,
const Array2D<UVW<float>>& uvw,
......
......@@ -61,6 +61,8 @@ class CUDA : public Proxy {
const Array4D<float>& weights,
idg::Array4D<std::complex<float>>& average_beam) override;
virtual void do_transform(DomainAtoDomainB direction) override;
void cleanup();
protected:
......@@ -68,6 +70,8 @@ class CUDA : public Proxy {
void free_devices();
static ProxyInfo default_info();
powersensor::PowerSensor* hostPowerSensor;
struct {
unsigned int nr_stations = 0;
unsigned int nr_timeslots = 0;
......
......@@ -59,6 +59,10 @@ InstanceCUDA::InstanceCUDA(ProxyInfo& info, int device_nr, int device_id)
// Destructor
InstanceCUDA::~InstanceCUDA() {
#if defined(DEBUG)
std::cout << __func__ << std::endl;
#endif
free_host_memory();
free_device_memory();
free_fft_plans();
......
......@@ -31,9 +31,6 @@ GenericOptimized::GenericOptimized() : CUDA(default_info()) {
// Initialize cpu proxy
cpuProxy = new idg::proxy::cpu::Optimized();
// Initialize host PowerSensor
hostPowerSensor = get_power_sensor(sensor_host);
omp_set_nested(true);
cuProfilerStart();
......@@ -41,21 +38,24 @@ GenericOptimized::GenericOptimized() : CUDA(default_info()) {
// Destructor
GenericOptimized::~GenericOptimized() {
#if defined(DEBUG)
std::cout << "GenericOptimized::" << __func__ << std::endl;
#endif
delete cpuProxy;
delete hostPowerSensor;
cuProfilerStop();
}
/*
* FFT
*/
void GenericOptimized::do_transform(DomainAtoDomainB direction, Grid& grid) {
void GenericOptimized::do_transform(DomainAtoDomainB direction) {
#if defined(DEBUG)
std::cout << "GenericOptimized::" << __func__ << std::endl;
std::cout << "Transform direction: " << direction << std::endl;
#endif
cpuProxy->transform(direction, grid);
cpuProxy->transform(direction);
} // end transform
/*
......
......@@ -65,7 +65,7 @@ class GenericOptimized : public cuda::CUDA {
const Array1D<unsigned int>& aterms_offsets,
const Array2D<float>& spheroidal) override;
virtual void do_transform(DomainAtoDomainB direction, Grid& grid) override;
virtual void do_transform(DomainAtoDomainB direction) override;
void run_gridding(
const Plan& plan, const float w_step, const Array1D<float>& shift,
......@@ -132,7 +132,6 @@ class GenericOptimized : public cuda::CUDA {
Plan::Options options) override;
protected:
powersensor::PowerSensor* hostPowerSensor;
idg::proxy::cpu::CPU* cpuProxy;
/*
......
......@@ -25,13 +25,13 @@ UnifiedOptimized::~UnifiedOptimized() {
delete cpuProxy;
}
void UnifiedOptimized::do_transform(DomainAtoDomainB direction, Grid& grid) {
void UnifiedOptimized::do_transform(DomainAtoDomainB direction) {
#if defined(DEBUG)
std::cout << "UnifiedOptimized::" << __func__ << std::endl;
std::cout << "Transform direction: " << direction << std::endl;
#endif
cpuProxy->transform(direction, grid);
cpuProxy->transform(direction);
} // end transform
} // namespace hybrid
......
......@@ -17,7 +17,7 @@ class UnifiedOptimized : public cuda::Unified {
~UnifiedOptimized();
virtual void do_transform(DomainAtoDomainB direction, Grid& grid);
virtual void do_transform(DomainAtoDomainB direction);
private:
idg::proxy::cpu::CPU* cpuProxy;
......
......@@ -413,11 +413,11 @@ void Proxy::set_avg_aterm_correction(
void Proxy::unset_avg_aterm_correction() { m_avg_aterm_correction.resize(0); }
void Proxy::transform(DomainAtoDomainB direction, Grid& grid) {
do_transform(direction, grid);
void Proxy::transform(DomainAtoDomainB direction) {
do_transform(direction);
}
void Proxy::transform(DomainAtoDomainB direction, std::complex<float>* grid,
void Proxy::transform(DomainAtoDomainB direction, std::complex<float>* grid_ptr,
unsigned int grid_nr_correlations,
unsigned int grid_height, unsigned int grid_width) {
throw_assert(grid_height == grid_width, ""); // TODO: remove restriction
......@@ -425,25 +425,14 @@ void Proxy::transform(DomainAtoDomainB direction, std::complex<float>* grid,
unsigned int grid_nr_w_layers = 1; // TODO: make this a parameter
Grid grid_(grid, grid_nr_w_layers, grid_nr_correlations, grid_height,
grid_width);
auto grid = std::shared_ptr<Grid>(
new Grid(grid_ptr, grid_nr_w_layers, grid_nr_correlations, grid_height, grid_width));
do_transform(direction, grid_);
}
set_grid(grid);
void Proxy::do_transform(DomainAtoDomainB direction, Grid& grid) {
unsigned int nr_w_layers = grid.get_w_dim();
unsigned int nr_correlations = grid.get_z_dim();
unsigned int height = grid.get_y_dim();
unsigned int width = grid.get_x_dim();
throw_assert(height == width, ""); // TODO: remove restriction
unsigned int grid_size = height;
for</