// InstanceCUDA.h
// Copyright (C) 2020 ASTRON (Netherlands Institute for Radio Astronomy)
// SPDX-License-Identifier: GPL-3.0-or-later

#ifndef IDG_CUDA_INSTANCE_H_
#define IDG_CUDA_INSTANCE_H_
#include <memory>

#include "idg-common.h"

#include "CU.h"
#include "CUFFT.h"
#include "PowerRecord.h"
namespace idg {
namespace kernel {
namespace cuda {

// CUDA implementation of a KernelsInstance.
//
// An instance owns (through std::unique_ptr members declared below) a
// cu::Context, a cu::Device, three cu::Stream objects (execute,
// host-to-device and device-to-host), the cu::Function handles of the
// kernels, and the device/host buffers used by the launch_* methods.
//
// Only the small inline accessors are defined in this header; every other
// member function is declared here and defined elsewhere.
//
// NOTE: the order of the data members is significant, since it determines
// the order of construction and (reverse) destruction. Do not reorder them
// without checking the constructor, the destructor and reset().
class InstanceCUDA : public KernelsInstance {
 public:
  // Constructor
  // info is stored by reference in mInfo, so it must outlive this instance.
  InstanceCUDA(ProxyInfo& info, int device_nr = 0, int device_id = 0);

  // Destructor
  ~InstanceCUDA();

  // Accessors for the owned CUDA objects. Each one dereferences the
  // corresponding unique_ptr without a null check, so they may only be
  // called while that pointer holds an object.
  cu::Context& get_context() const { return *context; }
  cu::Device& get_device() const { return *device; }
  cu::Stream& get_execute_stream() const { return *executestream; }
  cu::Stream& get_htod_stream() const { return *htodstream; }
  cu::Stream& get_dtoh_stream() const { return *dtohstream; }

  std::string get_compiler_flags();

  // Power measurement: the first overload returns a powersensor::State,
  // the second takes a PowerRecord and the stream to associate it with.
  powersensor::State measure();
  void measure(PowerRecord& record, cu::Stream& stream);

  // Kernel launchers. All buffers are passed as references to
  // cu::DeviceMemory objects owned by the caller or by this instance.
  void launch_gridder(int time_offset, int nr_subgrids, int grid_size,
                      int subgrid_size, float image_size, float w_step,
                      int nr_channels, int nr_stations, cu::DeviceMemory& d_uvw,
                      cu::DeviceMemory& d_wavenumbers,
                      cu::DeviceMemory& d_visibilities,
                      cu::DeviceMemory& d_spheroidal, cu::DeviceMemory& d_aterm,
                      cu::DeviceMemory& d_aterm_indices,
                      cu::DeviceMemory& d_avg_aterm_correction,
                      cu::DeviceMemory& d_metadata,
                      cu::DeviceMemory& d_subgrid);

  void launch_degridder(
      int time_offset, int nr_subgrids, int grid_size, int subgrid_size,
      float image_size, float w_step, int nr_channels, int nr_stations,
      cu::DeviceMemory& d_uvw, cu::DeviceMemory& d_wavenumbers,
      cu::DeviceMemory& d_visibilities, cu::DeviceMemory& d_spheroidal,
      cu::DeviceMemory& d_aterm, cu::DeviceMemory& d_aterm_indices,
      cu::DeviceMemory& d_metadata, cu::DeviceMemory& d_subgrid);

  void launch_average_beam(int nr_baselines, int nr_antennas, int nr_timesteps,
                           int nr_channels, int nr_aterms, int subgrid_size,
                           cu::DeviceMemory& d_uvw,
                           cu::DeviceMemory& d_baselines,
                           cu::DeviceMemory& d_aterms,
                           cu::DeviceMemory& d_aterms_offsets,
                           cu::DeviceMemory& d_weights,
                           cu::DeviceMemory& d_average_beam);

  void launch_calibrate(
      int nr_subgrids, int grid_size, int subgrid_size, float image_size,
      float w_step, int total_nr_timesteps, int nr_channels, int nr_stations,
      int nr_terms, cu::DeviceMemory& d_uvw, cu::DeviceMemory& d_wavenumbers,
      cu::DeviceMemory& d_visibilities, cu::DeviceMemory& d_weights,
      cu::DeviceMemory& d_aterm, cu::DeviceMemory& d_aterm_derivatives,
      cu::DeviceMemory& d_aterm_indices, cu::DeviceMemory& d_metadata,
      cu::DeviceMemory& d_subgrid, cu::DeviceMemory& d_sums1,
      cu::DeviceMemory& d_sums2, cu::DeviceMemory& d_lmnp,
      cu::DeviceMemory& d_hessian, cu::DeviceMemory& d_gradient,
      cu::DeviceMemory& d_residual);

  void launch_grid_fft(cu::DeviceMemory& d_data, int size,
                       DomainAtoDomainB direction);

  void plan_subgrid_fft(unsigned size, unsigned batch);

  void launch_subgrid_fft(cu::DeviceMemory& d_data, unsigned nr_subgrids,
                          DomainAtoDomainB direction);

  void launch_grid_fft_unified(unsigned long size, unsigned batch,
                               Array3D<std::complex<float>>& grid,
                               DomainAtoDomainB direction);

  // NOTE(review): the default scale is the complex value (1.0, 1.0), not
  // (1.0, 0.0) - confirm against the kernel that this is intended.
  void launch_fft_shift(cu::DeviceMemory& d_data, int batch, long size,
                        std::complex<float> scale = {1.0, 1.0});

  void launch_adder(int nr_subgrids, long grid_size, int subgrid_size,
                    cu::DeviceMemory& d_metadata, cu::DeviceMemory& d_subgrid,
                    cu::DeviceMemory& d_grid);

  void launch_adder_unified(int nr_subgrids, long grid_size, int subgrid_size,
                            cu::DeviceMemory& d_metadata,
                            cu::DeviceMemory& d_subgrid, void* u_grid);

  void launch_splitter(int nr_subgrids, long grid_size, int subgrid_size,
                       cu::DeviceMemory& d_metadata,
                       cu::DeviceMemory& d_subgrid, cu::DeviceMemory& d_grid);

  void launch_splitter_unified(int nr_subgrids, long grid_size,
                               int subgrid_size, cu::DeviceMemory& d_metadata,
                               cu::DeviceMemory& d_subgrid, void* u_grid);

  void launch_scaler(int nr_subgrids, int subgrid_size,
                     cu::DeviceMemory& d_subgrid);

  void launch_scaler(int nr_subgrids, int subgrid_size, void* u_subgrid);

  // Memory management per device
  cu::DeviceMemory& allocate_device_grid(size_t bytes);
  cu::DeviceMemory& allocate_device_wavenumbers(size_t bytes);
  cu::DeviceMemory& allocate_device_aterms(size_t bytes);
  cu::DeviceMemory& allocate_device_aterms_indices(size_t bytes);
  cu::DeviceMemory& allocate_device_spheroidal(size_t bytes);
  cu::DeviceMemory& allocate_device_avg_aterm_correction(size_t bytes);

  // Memory management per stream
  cu::HostMemory& allocate_host_subgrids(size_t bytes);
  cu::HostMemory& allocate_host_visibilities(size_t bytes);
  cu::HostMemory& allocate_host_uvw(size_t bytes);
  cu::DeviceMemory& allocate_device_visibilities(unsigned int id, size_t bytes);
  cu::DeviceMemory& allocate_device_uvw(unsigned int id, size_t bytes);
  cu::DeviceMemory& allocate_device_subgrids(unsigned int id, size_t bytes);
  cu::DeviceMemory& allocate_device_metadata(unsigned int id, size_t bytes);

  // Memory management for misc device buffers
  // allocate_device_memory returns an id that is accepted by
  // retrieve_device_memory.
  unsigned int allocate_device_memory(size_t bytes);
  cu::DeviceMemory& retrieve_device_memory(unsigned int id);

  // Memory management for misc page-locked host buffers
  void register_host_memory(void* ptr, size_t bytes);

  // Retrieve pre-allocated buffers (per device)
  // These dereference the owning unique_ptr without a null check: the
  // buffer must currently be held (e.g. not released by the matching
  // free_* method below).
  cu::DeviceMemory& retrieve_device_grid() { return *d_grid; }
  cu::DeviceMemory& retrieve_device_aterms() { return *d_aterms; }
  cu::DeviceMemory& retrieve_device_aterms_indices() {
    return *d_aterms_indices;
  }
  cu::DeviceMemory& retrieve_device_aterms_derivatives() {
    return *d_aterms_derivatives;
  }
  cu::DeviceMemory& retrieve_device_wavenumbers() { return *d_wavenumbers; }
  cu::DeviceMemory& retrieve_device_spheroidal() { return *d_spheroidal; }
  cu::DeviceMemory& retrieve_device_avg_aterm_correction() {
    return *d_avg_aterm_correction;
  }

  // Retrieve pre-allocated buffers (per stream)
  // id indexes the per-stream vector directly; no bounds check is done.
  cu::DeviceMemory& retrieve_device_visibilities(unsigned int id) {
    return *d_visibilities_[id];
  }
  cu::DeviceMemory& retrieve_device_uvw(unsigned int id) { return *d_uvw_[id]; }
  cu::DeviceMemory& retrieve_device_subgrids(unsigned int id) {
    return *d_subgrids_[id];
  }
  cu::DeviceMemory& retrieve_device_metadata(unsigned int id) {
    return *d_metadata_[id];
  }

  // Free buffers
  // Per-device buffers are released by resetting their unique_ptr;
  // per-stream buffers and registered host memory by clearing the vector
  // that owns them. References obtained earlier through retrieve_* become
  // dangling afterwards.
  void free_device_wavenumbers() { d_wavenumbers.reset(); }
  void free_device_spheroidal() { d_spheroidal.reset(); }
  void free_device_aterms() { d_aterms.reset(); }
  void free_device_aterms_indices() { d_aterms_indices.reset(); }
  void free_device_avg_aterm_correction() { d_avg_aterm_correction.reset(); }
  void free_device_visibilities() { d_visibilities_.clear(); }
  void free_device_uvw() { d_uvw_.clear(); }
  void free_device_subgrids() { d_subgrids_.clear(); }
  void free_device_metadata() { d_metadata_.clear(); }
  void unmap_host_memory() { h_registered_.clear(); }

  // Misc
  void free_fft_plans();
  int get_tile_size_grid() const { return tile_size_grid; }
  void free_device_memory();
  void free_events();

  // Device interface
  void print_device_memory_info() const;
  size_t get_free_memory() const;
  size_t get_total_memory() const;
  template <CUdevice_attribute attribute>
  int get_attribute() const;

 private:
  void free_host_memory();
  void reset();

  // Since no CUDA calls are allowed from a callback, we have to
  // keep track of the cu::Events used in the UpdateData and
  // free them explicitly using the free_events() method.
  cu::Event& get_event();
  std::vector<std::unique_ptr<cu::Event>> events;

 protected:
  cu::Module* compile_kernel(std::string& flags, std::string& src,
                             std::string& bin);
  void compile_kernels();
  void load_kernels();
  void set_parameters();
  void set_parameters_default();
  void set_parameters_kepler();
  void set_parameters_maxwell();
  void set_parameters_gp100();
  void set_parameters_pascal();
  void set_parameters_volta();

 protected:
  // Variables shared by all InstanceCUDA instances
  ProxyInfo& mInfo;

 private:
  std::unique_ptr<cu::Context> context;
  std::unique_ptr<cu::Device> device;
  std::unique_ptr<cu::Stream> executestream;
  std::unique_ptr<cu::Stream> htodstream;
  std::unique_ptr<cu::Stream> dtohstream;
  std::unique_ptr<cu::Function> function_gridder;
  std::unique_ptr<cu::Function> function_degridder;
  std::unique_ptr<cu::Function> function_fft;
  std::unique_ptr<cu::Function> function_adder;
  std::unique_ptr<cu::Function> function_splitter;
  std::unique_ptr<cu::Function> function_scaler;
  std::unique_ptr<cu::Function> function_average_beam;
  std::unique_ptr<cu::Function> function_fft_shift;
  std::vector<std::unique_ptr<cu::Function>> functions_calibrate;

  // One instance per device
  std::unique_ptr<cu::DeviceMemory> d_aterms;
  std::unique_ptr<cu::DeviceMemory> d_aterms_indices;
  std::unique_ptr<cu::DeviceMemory> d_aterms_derivatives;
  std::unique_ptr<cu::DeviceMemory> d_avg_aterm_correction;
  std::unique_ptr<cu::DeviceMemory> d_wavenumbers;
  std::unique_ptr<cu::DeviceMemory> d_spheroidal;
  std::unique_ptr<cu::DeviceMemory> d_grid;
  std::unique_ptr<cu::HostMemory> h_visibilities;
  std::unique_ptr<cu::HostMemory> h_uvw;
  std::unique_ptr<cu::HostMemory> h_subgrids;

  // One instance per stream
  std::vector<std::unique_ptr<cu::DeviceMemory>> d_visibilities_;
  std::vector<std::unique_ptr<cu::DeviceMemory>> d_uvw_;
  std::vector<std::unique_ptr<cu::DeviceMemory>> d_metadata_;
  std::vector<std::unique_ptr<cu::DeviceMemory>> d_subgrids_;

  // Registered host memory
  std::vector<std::unique_ptr<cu::RegisteredMemory>> h_registered_;

  // Misc device memory
  std::vector<std::unique_ptr<cu::DeviceMemory>> d_misc_;

  // All CUDA modules private to this InstanceCUDA
  std::vector<std::unique_ptr<cu::Module>> mModules;

 protected:
  // Thread block dimensions, one per kernel
  dim3 block_gridder;
  dim3 block_degridder;
  dim3 block_calibrate;
  dim3 block_adder;
  dim3 block_splitter;
  dim3 block_scaler;

  // No in-class initializers: these are expected to be set elsewhere
  // (presumably by the set_parameters* methods - verify in the .cpp).
  int batch_gridder;
  int batch_degridder;
  int tile_size_grid;

  // Grid FFT
  int m_fft_grid_size = 0;
  std::unique_ptr<cufft::C2C_2D> m_fft_plan_grid;

  // Subgrid FFT
  const unsigned m_fft_subgrid_bulk_default = 1024;
  unsigned m_fft_subgrid_bulk = m_fft_subgrid_bulk_default;
  unsigned m_fft_subgrid_size = 0;
  std::unique_ptr<cufft::C2C_2D> m_fft_plan_subgrid;
  std::unique_ptr<cu::DeviceMemory> d_fft_subgrid;

 private:
  // Memory allocation/reuse methods
  // Three overloads: for a single owned buffer, for a slot (id) in a vector
  // of owned buffers, and for a vector of owned buffers keyed by a raw
  // pointer. Each returns a non-owning pointer to the buffer.
  template <typename T>
  T* reuse_memory(uint64_t size, std::unique_ptr<T>& memory);

  template <typename T>
  T* reuse_memory(std::vector<std::unique_ptr<T>>& memories, unsigned int id,
                  uint64_t size);

  template <typename T>
  T* reuse_memory(std::vector<std::unique_ptr<T>>& memories, uint64_t size,
                  void* ptr);

 public:
  void enqueue_report(cu::Stream& stream, int nr_timesteps, int nr_subgrids);

  // Copy helpers: host to host, device to host and host to device.
  // The latter two take the stream on which the copy is issued.
  void copy_htoh(void* dst, void* src, size_t bytes);

  void copy_dtoh(cu::Stream& stream, void* dst, cu::DeviceMemory& src,
                 size_t bytes);

  void copy_htod(cu::Stream& stream, cu::DeviceMemory& dst, void* src,
                 size_t bytes);

 private:
  void start_measurement(void* data);
  void end_measurement(void* data);
};
// Stream insertion operator for an InstanceCUDA. Declaration only: the
// definition is not part of this header as shown. Note that the instance is
// taken by non-const reference.
std::ostream& operator<<(std::ostream& os, InstanceCUDA& d);

// Kernel names
// These constants are `static`, i.e. they have internal linkage: every
// translation unit that includes this header gets its own copy of each
// string.
static const std::string name_gridder = "kernel_gridder";
static const std::string name_degridder = "kernel_degridder";
static const std::string name_adder = "kernel_adder";
static const std::string name_splitter = "kernel_splitter";
static const std::string name_fft = "kernel_fft";
static const std::string name_scaler = "kernel_scaler";
static const std::string name_calibrate_lmnp = "kernel_calibrate_lmnp";
static const std::string name_calibrate_sums = "kernel_calibrate_sums";
static const std::string name_calibrate_gradient = "kernel_calibrate_gradient";
static const std::string name_calibrate_hessian = "kernel_calibrate_hessian";
static const std::string name_average_beam = "kernel_average_beam";
static const std::string name_fft_shift = "kernel_fft_shift";

}  // end namespace cuda
}  // end namespace kernel
}  // end namespace idg

#endif