diff --git a/idg-lib/src/CUDA/common/CMakeLists.txt b/idg-lib/src/CUDA/common/CMakeLists.txt index 0a80117760dbf6d184141bef270c3359c2e10c9e..eb2ad6a24c20a75c9b0b16a6237b876f2812c98f 100644 --- a/idg-lib/src/CUDA/common/CMakeLists.txt +++ b/idg-lib/src/CUDA/common/CMakeLists.txt @@ -51,6 +51,7 @@ install( kernels/KernelSplitter.cu kernels/KernelCalibrate.cu kernels/KernelAverageBeam.cu + kernels/KernelFFTShift.cu kernels/math.cu kernels/Types.h DESTINATION diff --git a/idg-lib/src/CUDA/common/CUDA.cpp b/idg-lib/src/CUDA/common/CUDA.cpp index e6743fbe2dd9f8da74c2377bffda51858f048839..6034aaf422ab2b1c05adf5404184b49364ba5f71 100644 --- a/idg-lib/src/CUDA/common/CUDA.cpp +++ b/idg-lib/src/CUDA/common/CUDA.cpp @@ -456,8 +456,6 @@ void CUDA::do_transform(DomainAtoDomainB direction) { std::cout << "CUDA::" << __func__ << std::endl; #endif - const auto& grid = get_grid(); - // Constants unsigned int grid_size = m_grid->get_x_dim(); unsigned int nr_w_layers = m_grid->get_w_dim(); @@ -465,10 +463,6 @@ void CUDA::do_transform(DomainAtoDomainB direction) { unsigned int nr_correlations = m_grid->get_z_dim(); assert(nr_correlations == 4); - // Grid pointer (4D to 3D, assume nr_w_layers == 1) - idg::Array3D> grid_ptr(grid->data(), nr_correlations, - grid_size, grid_size); - // Load device InstanceCUDA& device = get_device(0); @@ -486,27 +480,17 @@ void CUDA::do_transform(DomainAtoDomainB direction) { powerStates[2] = device.measure(); // Perform fft shift - device.shift(grid_ptr); - - // Copy grid to device - device.copy_htod(stream, d_grid, grid->data(), grid->bytes()); + device.launch_fft_shift(d_grid, nr_correlations, grid_size); // Execute fft device.launch_grid_fft(d_grid, grid_size, direction); - // Copy grid to host - device.copy_dtoh(stream, grid->data(), d_grid, grid->bytes()); - stream.synchronize(); - - // Perform fft shift - device.shift(grid_ptr); - - // Perform fft scaling + // Perform fft shift and scaling std::complex scale = - std::complex(2.0 / (grid_size * grid_size), 0); - if (direction == FourierDomainToImageDomain) { - device.scale(grid_ptr, scale); - } + (direction == FourierDomainToImageDomain) + ? std::complex(2.0 / (grid_size * grid_size), 0) + : std::complex(1.0, 1.0); + device.launch_fft_shift(d_grid, nr_correlations, grid_size, scale); // End measurements stream.synchronize(); diff --git a/idg-lib/src/CUDA/common/InstanceCUDA.cpp b/idg-lib/src/CUDA/common/InstanceCUDA.cpp index ccb47ca4dd935277ab9eb4ac7b4219676aea05d3..8f8e9e07969c361f19f4b902f1b80cb05f59cf07 100644 --- a/idg-lib/src/CUDA/common/InstanceCUDA.cpp +++ b/idg-lib/src/CUDA/common/InstanceCUDA.cpp @@ -200,6 +200,11 @@ void InstanceCUDA::compile_kernels() { cubin.push_back("AverageBeam.cubin"); flags.push_back(flags_common); + // FFT shift + src.push_back("KernelFFTShift.cu"); + cubin.push_back("KernelFFTShift.cubin"); + flags.push_back(flags_common); + // FFT #if USE_CUSTOM_FFT src.push_back("KernelFFT.cu"); @@ -282,9 +287,16 @@ void InstanceCUDA::load_kernels() { found++; } + // Load FFT shift function + if (cuModuleGetFunction(&function, *mModules[7], name_fft_shift.c_str()) == + CUDA_SUCCESS) { + function_fft_shift.reset(new cu::Function(*context, function)); + found++; + } + // Load FFT function #if USE_CUSTOM_FFT - if (cuModuleGetFunction(&function, *mModules[6], name_fft.c_str()) == + if (cuModuleGetFunction(&function, *mModules[8], name_fft.c_str()) == CUDA_SUCCESS) { function_fft.reset(new cu::Function(function)); found++; @@ -915,6 +927,20 @@ void InstanceCUDA::launch_grid_fft_unified(unsigned long size, } } +void InstanceCUDA::launch_fft_shift(cu::DeviceMemory& d_data, int batch, + long size, std::complex scale) { + const void* parameters[] = {&size, d_data, &scale}; + + dim3 grid(batch, ceil(size / 2.0)); + dim3 block(128); + + UpdateData* data = get_update_data(get_event(), powerSensor, report, + &Report::update_fft_shift); + start_measurement(data); + executestream->launchKernel(*function_fft_shift, grid, block, 0, parameters); + end_measurement(data); +} + void InstanceCUDA::launch_adder(int nr_subgrids, long grid_size, int subgrid_size, cu::DeviceMemory& d_metadata, cu::DeviceMemory& d_subgrid, @@ -926,7 +952,6 @@ void InstanceCUDA::launch_adder(int nr_subgrids, long grid_size, UpdateData* data = get_update_data(get_event(), powerSensor, report, &Report::update_adder); start_measurement(data); - data->start->enqueue(*executestream); #if ENABLE_REPEAT_KERNELS for (int i = 0; i < NR_REPETITIONS_ADDER; i++) #endif diff --git a/idg-lib/src/CUDA/common/InstanceCUDA.h b/idg-lib/src/CUDA/common/InstanceCUDA.h index 0bcddec2b4270a116deea648d878e214d2e08516..1d6b559429a36f7fba299ed48fbef06775dd3346 100644 --- a/idg-lib/src/CUDA/common/InstanceCUDA.h +++ b/idg-lib/src/CUDA/common/InstanceCUDA.h @@ -87,6 +87,9 @@ class InstanceCUDA : public KernelsInstance { Array3D>& grid, DomainAtoDomainB direction); + void launch_fft_shift(cu::DeviceMemory& d_data, int batch, long size, + std::complex scale = {1.0, 1.0}); + void launch_adder(int nr_subgrids, long grid_size, int subgrid_size, cu::DeviceMemory& d_metadata, cu::DeviceMemory& d_subgrid, cu::DeviceMemory& d_grid); @@ -224,6 +227,7 @@ class InstanceCUDA : public KernelsInstance { std::unique_ptr function_splitter; std::unique_ptr function_scaler; std::unique_ptr function_average_beam; + std::unique_ptr function_fft_shift; std::vector> functions_calibrate; // One instance per device @@ -318,6 +322,7 @@ static const std::string name_calibrate_sums = "kernel_calibrate_sums"; static const std::string name_calibrate_gradient = "kernel_calibrate_gradient"; static const std::string name_calibrate_hessian = "kernel_calibrate_hessian"; static const std::string name_average_beam = "kernel_average_beam"; +static const std::string name_fft_shift = "kernel_fft_shift"; } // end namespace cuda } // end namespace kernel diff --git a/idg-lib/src/CUDA/common/kernels/KernelFFTShift.cu b/idg-lib/src/CUDA/common/kernels/KernelFFTShift.cu new file mode 100644 index 0000000000000000000000000000000000000000..babebac2fc296f0ea40f3c617748cc166ff3e079 --- /dev/null +++ b/idg-lib/src/CUDA/common/kernels/KernelFFTShift.cu @@ -0,0 +1,52 @@ +// Copyright (C) 2020 ASTRON (Netherlands Institute for Radio Astronomy) +// SPDX-License-Identifier: GPL-3.0-or-later + +#include + +#include "Types.h" +#include "math.cu" + +extern "C" { + +inline __device__ size_t index(size_t n, int i, + int y, int x) +{ + // data: [batch][n]]n] + return i * n * n + + y * n + x; +} + +/* + Kernel +*/ +__global__ void kernel_fft_shift( + const long n, + float2* data, + float2 scale) +{ + unsigned long i = blockIdx.x; + unsigned long y = blockIdx.y; + + unsigned long n2 = n / 2; + + for (unsigned long x = threadIdx.x; x < n2; x += blockDim.x) + { + if (y < n2) + { + unsigned long idx1 = index(n, i, y, x); + unsigned long idx3 = index(n, i, y + n2, x + n2); + float2 tmp1 = data[idx1]; + float2 tmp3 = data[idx3]; + data[idx1] = make_float2(tmp3.x * scale.x, tmp3.y * scale.y); + data[idx3] = make_float2(tmp1.x * scale.x, tmp1.y * scale.y); + + unsigned long idx2 = index(n, i, y + n2, x); + unsigned long idx4 = index(n, i, y, x + n2); + float2 tmp2 = data[idx2]; + float2 tmp4 = data[idx4]; + data[idx2] = make_float2(tmp4.x * scale.x, tmp4.y * scale.y); + data[idx4] = make_float2(tmp2.x * scale.x, tmp2.y * scale.y); + } + } +} // end kernel_fft_shift +}