Commit 89886092 authored by Bram Veenboer

Add CUDA FFTShift kernel

The kernel is now used by default in the CUDA::do_transform method. This has
the advantage that the grid no longer has to be copied to the host to perform
the FFT shift.
parent 9c561bf0
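For reference, the shift performed by the new kernel is the usual FFT-shift quadrant swap. Below is a minimal host-side sketch of the same operation on a single n x n layer, assuming an even grid size (as the device kernel below also does); the helper name fft_shift_reference is illustrative and not part of this change.

#include <complex>
#include <cstddef>
#include <utility>

// Host-side reference of the quadrant swap performed by kernel_fft_shift:
// (y, x) <-> (y + n/2, x + n/2) and (y + n/2, x) <-> (y, x + n/2).
void fft_shift_reference(std::complex<float>* data, std::size_t n) {
  const std::size_t n2 = n / 2;
  for (std::size_t y = 0; y < n2; y++) {
    for (std::size_t x = 0; x < n2; x++) {
      std::swap(data[y * n + x], data[(y + n2) * n + (x + n2)]);
      std::swap(data[(y + n2) * n + x], data[y * n + (x + n2)]);
    }
  }
}

The device kernel additionally folds the FFT scaling into the same pass through its scale argument, which is why do_transform no longer needs a separate scaling step.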
@@ -51,6 +51,7 @@ install(
kernels/KernelSplitter.cu
kernels/KernelCalibrate.cu
kernels/KernelAverageBeam.cu
kernels/KernelFFTShift.cu
kernels/math.cu
kernels/Types.h
DESTINATION
......
@@ -456,8 +456,6 @@ void CUDA::do_transform(DomainAtoDomainB direction) {
std::cout << "CUDA::" << __func__ << std::endl;
#endif
const auto& grid = get_grid();
// Constants
unsigned int grid_size = m_grid->get_x_dim();
unsigned int nr_w_layers = m_grid->get_w_dim();
@@ -465,10 +463,6 @@ void CUDA::do_transform(DomainAtoDomainB direction) {
unsigned int nr_correlations = m_grid->get_z_dim();
assert(nr_correlations == 4);
// Grid pointer (4D to 3D, assume nr_w_layers == 1)
idg::Array3D<std::complex<float>> grid_ptr(grid->data(), nr_correlations,
grid_size, grid_size);
// Load device
InstanceCUDA& device = get_device(0);
@@ -486,27 +480,16 @@ void CUDA::do_transform(DomainAtoDomainB direction) {
powerStates[2] = device.measure();
// Perform fft shift
device.shift(grid_ptr);
// Copy grid to device
device.copy_htod(stream, d_grid, grid->data(), grid->bytes());
device.launch_fft_shift(d_grid, nr_correlations, grid_size);
// Execute fft
device.launch_grid_fft(d_grid, grid_size, direction);
// Copy grid to host
device.copy_dtoh(stream, grid->data(), d_grid, grid->bytes());
stream.synchronize();
// Perform fft shift
device.shift(grid_ptr);
// Perform fft scaling
std::complex<float> scale =
std::complex<float>(2.0 / (grid_size * grid_size), 0);
if (direction == FourierDomainToImageDomain) {
device.scale(grid_ptr, scale);
}
// Perform fft shift and scaling
std::complex<float> scale = (direction == FourierDomainToImageDomain)
? std::complex<float>(2.0 / (grid_size * grid_size), 0)
: std::complex<float>(1.0, 1.0);
device.launch_fft_shift(d_grid, nr_correlations, grid_size, scale);
// End measurements
stream.synchronize();
......
@@ -200,6 +200,11 @@ void InstanceCUDA::compile_kernels() {
cubin.push_back("AverageBeam.cubin");
flags.push_back(flags_common);
// FFT shift
src.push_back("KernelFFTShift.cu");
cubin.push_back("KernelFFTShift.cubin");
flags.push_back(flags_common);
// FFT
#if USE_CUSTOM_FFT
src.push_back("KernelFFT.cu");
@@ -282,9 +287,16 @@ void InstanceCUDA::load_kernels() {
found++;
}
// Load FFT shift function
if (cuModuleGetFunction(&function, *mModules[7], name_fft_shift.c_str()) ==
CUDA_SUCCESS) {
function_fft_shift.reset(new cu::Function(*context, function));
found++;
}
// Load FFT function
#if USE_CUSTOM_FFT
if (cuModuleGetFunction(&function, *mModules[6], name_fft.c_str()) ==
if (cuModuleGetFunction(&function, *mModules[8], name_fft.c_str()) ==
CUDA_SUCCESS) {
function_fft.reset(new cu::Function(function));
found++;
@@ -915,6 +927,23 @@ void InstanceCUDA::launch_grid_fft_unified(unsigned long size,
}
}
void InstanceCUDA::launch_fft_shift(cu::DeviceMemory& d_data,
                                    int batch, long size,
                                    std::complex<float> scale)
{
  const void* parameters[] = {&size, d_data, &scale};

  dim3 grid(batch, ceil(size / 2.0));
  dim3 block(128);

  UpdateData* data = get_update_data(get_event(), powerSensor, report,
                                     &Report::update_fft_shift);
  start_measurement(data);
  executestream->launchKernel(*function_fft_shift, grid, block, 0,
                              parameters);
  end_measurement(data);
}
void InstanceCUDA::launch_adder(int nr_subgrids, long grid_size,
int subgrid_size, cu::DeviceMemory& d_metadata,
cu::DeviceMemory& d_subgrid,
......
@@ -87,6 +87,9 @@ class InstanceCUDA : public KernelsInstance {
Array3D<std::complex<float>>& grid,
DomainAtoDomainB direction);
void launch_fft_shift(cu::DeviceMemory& d_data, int batch,
long size, std::complex<float> scale = {1.0, 1.0});
void launch_adder(int nr_subgrids, long grid_size, int subgrid_size,
cu::DeviceMemory& d_metadata, cu::DeviceMemory& d_subgrid,
cu::DeviceMemory& d_grid);
@@ -224,6 +227,7 @@ class InstanceCUDA : public KernelsInstance {
std::unique_ptr<cu::Function> function_splitter;
std::unique_ptr<cu::Function> function_scaler;
std::unique_ptr<cu::Function> function_average_beam;
std::unique_ptr<cu::Function> function_fft_shift;
std::vector<std::unique_ptr<cu::Function>> functions_calibrate;
// One instance per device
@@ -318,6 +322,7 @@ static const std::string name_calibrate_sums = "kernel_calibrate_sums";
static const std::string name_calibrate_gradient = "kernel_calibrate_gradient";
static const std::string name_calibrate_hessian = "kernel_calibrate_hessian";
static const std::string name_average_beam = "kernel_average_beam";
static const std::string name_fft_shift = "kernel_fft_shift";
} // end namespace cuda
} // end namespace kernel
......
// Copyright (C) 2020 ASTRON (Netherlands Institute for Radio Astronomy)
// SPDX-License-Identifier: GPL-3.0-or-later
#include <cuComplex.h>
#include "Types.h"
#include "math.cu"
extern "C" {
inline __device__ size_t index(size_t n, int i, int y, int x)
{
  // data: [batch][n][n]
  return i * n * n + y * n + x;
}
/*
    Kernel: per-layer FFT shift (quadrant swap) with an optional
    component-wise scale applied to the swapped values.
*/
__global__ void kernel_fft_shift(
    const long n,
    float2* data,
    float2 scale)
{
  unsigned long i = blockIdx.x;   // batch (layer) index
  unsigned long y = blockIdx.y;   // row in the top half of the layer
  unsigned long n2 = n / 2;

  for (unsigned long x = threadIdx.x; x < n2; x += blockDim.x)
  {
    if (y < n2)
    {
      // Swap the upper-left and lower-right quadrants,
      // scaling real and imaginary parts component-wise
      unsigned long idx1 = index(n, i, y, x);
      unsigned long idx3 = index(n, i, y + n2, x + n2);
      float2 tmp1 = data[idx1];
      float2 tmp3 = data[idx3];
      data[idx1] = make_float2(tmp3.x * scale.x, tmp3.y * scale.y);
      data[idx3] = make_float2(tmp1.x * scale.x, tmp1.y * scale.y);

      // Swap the lower-left and upper-right quadrants
      unsigned long idx2 = index(n, i, y + n2, x);
      unsigned long idx4 = index(n, i, y, x + n2);
      float2 tmp2 = data[idx2];
      float2 tmp4 = data[idx4];
      data[idx2] = make_float2(tmp4.x * scale.x, tmp4.y * scale.y);
      data[idx4] = make_float2(tmp2.x * scale.x, tmp2.y * scale.y);
    }
  }
} // end kernel_fft_shift
}  // end extern "C"