Skip to content
Snippets Groups Projects
Commit 170ea032 authored by Wiebe van Breukelen's avatar Wiebe van Breukelen
Browse files

Merge branch 'integrate-cudawrappers' into 'main'

Add cudawrappers and AMD HIP support

See merge request !17
parents 6583a98e e3a4567e
Branches main
No related tags found
1 merge request!17Add cudawrappers and AMD HIP support
Pipeline #97995 passed
Showing
with 272 additions and 592 deletions
#include "GPUPlan.hpp"
#include <cassert>
#include <exception>
#include <iostream>
#include "cudawrappers/cu.hpp"
namespace dedisp {
// Public interface
GPUPlan::GPUPlan(size_type nchans, float_type dt, float_type f0, float_type df,
int device_idx)
: Plan(nchans, dt, f0, df) {
// Initialize CUDA
cu::init();
// Initialize device
set_device(device_idx);
......@@ -15,13 +24,15 @@ GPUPlan::GPUPlan(size_type nchans, float_type dt, float_type f0, float_type df,
executestream.reset(new cu::Stream());
// Initialize delay table
d_delay_table.resize(nchans * sizeof(dedisp_float));
htodstream->memcpyHtoDAsync(d_delay_table, h_delay_table.data(),
d_delay_table.size());
d_delay_table =
std::make_unique<cu::DeviceMemory>(nchans * sizeof(dedisp_float));
htodstream->memcpyHtoDAsync(*d_delay_table, h_delay_table.data(),
d_delay_table->size());
// Initialize the killmask
d_killmask.resize(nchans * sizeof(dedisp_bool));
htodstream->memcpyHtoDAsync(d_killmask, h_killmask.data(), d_killmask.size());
d_killmask = std::make_unique<cu::DeviceMemory>(nchans * sizeof(dedisp_bool));
htodstream->memcpyHtoDAsync(*d_killmask, h_killmask.data(),
d_killmask->size());
}
// Destructor
......@@ -29,34 +40,38 @@ GPUPlan::~GPUPlan() {}
void GPUPlan::set_device(int device_idx) {
m_device.reset(new cu::Device(device_idx));
m_context.reset(new cu::Context(CU_CTX_SCHED_BLOCKING_SYNC, *m_device));
}
void GPUPlan::generate_dm_list(float_type dm_start, float_type dm_end,
float_type ti, float_type tol) {
Plan::generate_dm_list(dm_start, dm_end, ti, tol);
// Allocate device memory for the DM list
d_dm_list.resize(m_dm_count * sizeof(dedisp_float));
d_dm_list =
std::make_unique<cu::DeviceMemory>(m_dm_count * sizeof(dedisp_float));
assert(d_dm_list);
// Copy the DM list to the device
htodstream->memcpyHtoDAsync(d_dm_list, h_dm_list.data(), d_dm_list.size());
htodstream->memcpyHtoDAsync(*d_dm_list, h_dm_list.data(), d_dm_list->size());
}
void GPUPlan::set_dm_list(const float_type *dm_list, size_type count) {
Plan::set_dm_list(dm_list, count);
// Allocate device memory for the DM list
d_dm_list.resize(m_dm_count * sizeof(dedisp_float));
d_dm_list =
std::make_unique<cu::DeviceMemory>(m_dm_count * sizeof(dedisp_float));
assert(d_dm_list);
// Copy the DM list to the device
htodstream->memcpyHtoDAsync(d_dm_list, h_dm_list.data(), d_dm_list.size());
htodstream->memcpyHtoDAsync(*d_dm_list, h_dm_list.data(), d_dm_list->size());
}
void GPUPlan::set_killmask(const bool_type *killmask) {
Plan::set_killmask(killmask);
// Copy the killmask to the device
htodstream->memcpyHtoDAsync(d_killmask, h_killmask.data(), d_killmask.size());
htodstream->memcpyHtoDAsync(*d_killmask, h_killmask.data(),
d_killmask->size());
}
} // end namespace dedisp
#ifndef DEDISP_PLAN_H_GPU_INCLUDE_GUARD
#define DEDISP_PLAN_H_GPU_INCLUDE_GUARD
#ifndef DEDISP_GPU_PLAN_HPP_
#define DEDISP_GPU_PLAN_HPP_
#include "Plan.hpp"
#include "common/cuda/CU.h"
#include <cudawrappers/cu.hpp>
#include <cudawrappers/nvrtc.hpp>
namespace dedisp {
......@@ -32,11 +33,12 @@ protected:
// Device
void set_device(int device_idx);
std::unique_ptr<cu::Device> m_device;
std::unique_ptr<cu::Context> m_context;
// Device arrays
cu::DeviceMemory d_dm_list; // type = dedisp_float
cu::DeviceMemory d_delay_table; // type = dedisp_float
cu::DeviceMemory d_killmask; // type = dedisp_bool
std::unique_ptr<cu::DeviceMemory> d_dm_list; // type = dedisp_float
std::unique_ptr<cu::DeviceMemory> d_delay_table; // type = dedisp_float
std::unique_ptr<cu::DeviceMemory> d_killmask; // type = dedisp_bool
// Streams
std::unique_ptr<cu::Stream> htodstream;
......@@ -55,4 +57,4 @@ public:
} // end namespace dedisp
#endif // DEDISP_PLAN_GPU_H_INCLUDE_GUARD
#endif // DEDISP_GPU_PLAN_HPP_
#include "Plan.hpp"
#include "common/cuda/CU.h"
#include "common/dedisp_error.hpp"
#include "cudawrappers/cu.hpp"
#include <cmath>
namespace dedisp {
......@@ -80,7 +80,7 @@ void Plan::set_killmask(const bool_type *killmask) {
}
}
void Plan::sync() { cu::checkError(cudaDeviceSynchronize()); }
void Plan::sync() { cu::Context::synchronize(); }
// Private helper functions
void Plan::generate_delay_table(dedisp_float *h_delay_table, dedisp_size nchans,
......
#ifndef DEDISP_PLAN_H_INCLUDE_GUARD
#define DEDISP_PLAN_H_INCLUDE_GUARD
#ifndef DEDISP_PLAN_HPP_
#define DEDISP_PLAN_HPP_
#include <memory>
#include <vector>
#include "common/dedisp_types.h"
......@@ -96,4 +95,4 @@ protected:
} // end namespace dedisp
#endif // DEDISP_PLAN_H_INCLUDE_GUARD
#endif // DEDISP_PLAN_HPP_
add_library(common OBJECT dedisp_error.cpp helper.cpp cuda/CU.cpp)
add_library(common OBJECT dedisp_error.cpp helper.cpp)
target_link_libraries(common CUDA::cudart CUDA::nvToolsExt CUDA::cuda_driver
OpenMP::OpenMP_CXX)
target_link_libraries(common PUBLIC OpenMP::OpenMP_CXX cudawrappers::cu
cudawrappers::nvrtc cudawrappers::nvtx)
set_target_properties(common PROPERTIES VERSION ${DEDISP_VERSION}
SOVERSION ${DEDISP_VERSION_MAJOR})
if(${DEDISP_BACKEND_HIP})
get_target_property(sources common SOURCES)
set_source_files_properties(${sources} PROPERTIES LANGUAGE HIP)
endif()
install(TARGETS common LIBRARY DESTINATION lib)
# Install dedisp_types.h in common subdirectory for backwards compatibility with
......
/*
* CU, a CUDA driver api C++ wrapper.
* This code is copied from the IDG repository (https://git.astron.nl/RD/idg)
* and changed to meet the needs for this library.
*/
#include "CU.h"
#include <cassert>
#include <cstring>
#include <iostream>
#include <sstream>
#define assertCudaCall(val) __assertCudaCall(val, #val, __FILE__, __LINE__)
#define checkCudaCall(val) __checkCudaCall(val, #val, __FILE__, __LINE__)
#define assertCuCall(val) __assertCuCall(val, #val, __FILE__, __LINE__)
#define checkCuCall(val) __checkCuCall(val, #val, __FILE__, __LINE__)
namespace cu {
/*
Error checking
*/
inline void __assertCudaCall(cudaError_t result, char const *const func,
const char *const file, int const line) {
if (result != cudaSuccess) {
const char *msg;
msg = cudaGetErrorString(result);
std::cerr << "CUDA Error at " << file;
std::cerr << ":" << line;
std::cerr << " in function " << func;
std::cerr << ": " << msg;
std::cerr << std::endl;
throw Error<cudaError_t>(result);
}
}
inline void __checkCudaCall(cudaError_t result, char const *const func,
const char *const file, int const line) {
try {
__assertCudaCall(result, func, file, line);
} catch (Error<cudaError_t> &error) {
// pass
}
}
inline void __assertCuCall(CUresult result, char const *const func,
const char *const file, int const line) {
if (result != CUDA_SUCCESS) {
const char *msg;
cuGetErrorString(result, &msg);
std::cerr << "CU Error at " << file;
std::cerr << ":" << line;
std::cerr << " in function " << func;
std::cerr << ": ";
if (msg)
std::cerr << msg;
std::cerr << std::endl;
throw Error<CUresult>(result);
}
}
inline void __checkCuCall(CUresult result, char const *const func,
const char *const file, int const line) {
try {
__assertCuCall(result, func, file, line);
} catch (Error<CUresult> &error) {
// pass
}
}
void checkError() { assertCudaCall(cudaGetLastError()); }
void checkError(cudaError_t error) { assertCudaCall(error); }
void checkError(CUresult error) { assertCuCall(error); }
/*
Device
*/
Device::Device(int device) {
m_device = device;
checkCudaCall(cudaSetDevice(device));
}
unsigned int Device::get_capability() {
cudaDeviceProp device_props;
cudaGetDeviceProperties(&device_props, m_device);
return 10 * device_props.major + device_props.minor;
}
size_t Device::get_total_const_memory() const {
cudaDeviceProp device_props;
cudaGetDeviceProperties(&device_props, m_device);
return device_props.totalConstMem;
}
size_t Device::get_free_memory() const {
size_t free;
size_t total;
cudaMemGetInfo(&free, &total);
return free;
}
size_t Device::get_total_memory() const {
size_t free;
size_t total;
cudaMemGetInfo(&free, &total);
return total;
}
CUcontext Device::get_current_context() {
CUcontext context{};
assertCuCall(cuCtxGetCurrent(&context));
return context;
}
void Device::set_context(CUcontext context) { cuCtxSetCurrent(context); }
/*
HostMemory
*/
HostMemory::HostMemory(size_t size, int flags) {
m_capacity = size;
m_size = size;
m_flags = flags;
assertCudaCall(cudaHostAlloc(&m_ptr, size, m_flags));
}
HostMemory::~HostMemory() { release(); }
void HostMemory::resize(size_t size) {
assert(size > 0);
m_size = size;
if (size > m_capacity) {
release();
assertCudaCall(cudaHostAlloc(&m_ptr, size, m_flags));
m_capacity = size;
}
}
void HostMemory::release() { assertCudaCall(cudaFreeHost(m_ptr)); }
void HostMemory::zero() { memset(m_ptr, 0, m_size); }
/*
DeviceMemory
*/
DeviceMemory::DeviceMemory(size_t size) {
m_capacity = size;
m_size = size;
if (size) {
assertCudaCall(cudaMalloc(&m_ptr, size));
}
}
DeviceMemory::~DeviceMemory() { release(); }
void DeviceMemory::resize(size_t size) {
assert(size > 0);
m_size = size;
if (size > m_capacity) {
release();
assertCudaCall(cudaMalloc(&m_ptr, size));
m_capacity = size;
}
}
void DeviceMemory::release() {
if (m_capacity) {
assertCudaCall(cudaFree(m_ptr));
}
}
void DeviceMemory::zero(cudaStream_t stream) {
if (m_size) {
if (stream != NULL) {
assertCudaCall(cudaMemsetAsync(m_ptr, 0, m_size, stream));
} else {
assertCudaCall(cudaMemset(m_ptr, 0, m_size));
}
}
}
/*
Event
*/
Event::Event(int flags) { assertCudaCall(cudaEventCreate(&m_event, flags)); }
Event::~Event() { assertCudaCall(cudaEventDestroy(m_event)); }
void Event::synchronize() { assertCudaCall(cudaEventSynchronize(m_event)); }
float Event::elapsedTime(Event &second) {
float ms;
assertCudaCall(cudaEventElapsedTime(&ms, second, m_event));
return ms;
}
Event::operator cudaEvent_t() { return m_event; }
/*
Stream
*/
Stream::Stream(int flags) {
assertCudaCall(cudaStreamCreateWithFlags(&m_stream, flags));
}
Stream::~Stream() { assertCudaCall(cudaStreamDestroy(m_stream)); }
void Stream::memcpyHtoDAsync(void *devPtr, const void *hostPtr, size_t size) {
assertCudaCall(
cudaMemcpyAsync(devPtr, hostPtr, size, cudaMemcpyHostToDevice, m_stream));
}
void Stream::memcpyDtoHAsync(void *hostPtr, void *devPtr, size_t size) {
assertCudaCall(
cudaMemcpyAsync(hostPtr, devPtr, size, cudaMemcpyDeviceToHost, m_stream));
}
void Stream::memcpyDtoDAsync(void *dstPtr, void *srcPtr, size_t size) {
assertCudaCall(cudaMemcpyAsync(dstPtr, srcPtr, size, cudaMemcpyDeviceToDevice,
m_stream));
}
void Stream::memcpyHtoD2DAsync(void *dstPtr, size_t dstWidth,
const void *srcPtr, size_t srcWidth,
size_t widthBytes, size_t height) {
assertCudaCall(cudaMemcpy2DAsync(dstPtr, dstWidth, srcPtr, srcWidth,
widthBytes, height, cudaMemcpyHostToDevice,
m_stream));
}
void Stream::memcpyDtoH2DAsync(void *dstPtr, size_t dstWidth,
const void *srcPtr, size_t srcWidth,
size_t widthBytes, size_t height) {
assertCudaCall(cudaMemcpy2DAsync(dstPtr, dstWidth, srcPtr, srcWidth,
widthBytes, height, cudaMemcpyDeviceToHost,
m_stream));
}
void Stream::memcpyHtoH2DAsync(void *dstPtr, size_t dstWidth,
const void *srcPtr, size_t srcWidth,
size_t widthBytes, size_t height) {
assertCudaCall(cudaMemcpy2DAsync(dstPtr, dstWidth, srcPtr, srcWidth,
widthBytes, height, cudaMemcpyHostToHost,
m_stream));
}
void Stream::synchronize() { assertCudaCall(cudaStreamSynchronize(m_stream)); }
void Stream::waitEvent(Event &event) {
assertCudaCall(cudaStreamWaitEvent(m_stream, event, 0));
}
void Stream::record(Event &event) {
assertCudaCall(cudaEventRecord(event, m_stream));
}
void Stream::zero(void *ptr, size_t size) {
assertCudaCall(cudaMemsetAsync(ptr, 0, size, m_stream));
}
Stream::operator cudaStream_t() { return m_stream; }
/*
Marker
*/
Marker::Marker(const char *message, Color color) {
_attributes.version = NVTX_VERSION;
_attributes.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
_attributes.colorType = NVTX_COLOR_ARGB;
_attributes.color = convert(color);
_attributes.messageType = NVTX_MESSAGE_TYPE_ASCII;
_attributes.message.ascii = message;
}
void Marker::start() { _id = nvtxRangeStartEx(&_attributes); }
void Marker::end() { nvtxRangeEnd(_id); }
void Marker::start(cu::Event &event) {
event.synchronize();
start();
}
void Marker::end(cu::Event &event) {
event.synchronize();
end();
}
unsigned int Marker::convert(Color color) {
switch (color) {
case red:
return 0xffff0000;
case green:
return 0xff00ff00;
case blue:
return 0xff0000ff;
case yellow:
return 0xffffff00;
case black:
return 0xff000000;
default:
return 0xff00ff00;
}
}
/*
ScopedMarker
*/
ScopedMarker::ScopedMarker(const char *message, Color color)
: Marker(message, color) {
_id = nvtxRangeStartEx(&_attributes);
};
ScopedMarker::~ScopedMarker() { nvtxRangeEnd(_id); }
} // end namespace cu
/*
* CU, a CUDA driver api C++ wrapper.
* This code is copied from the IDG repository (https://git.astron.nl/RD/idg)
* and changed to meet the needs for this library.
*/
#ifndef CU_WRAPPER_H
#define CU_WRAPPER_H
#include <stdexcept>
#include <cuda.h>
#include <cuda_runtime.h>
#include <nvToolsExt.h>
namespace cu {
template <typename T> class Error : public std::exception {
public:
Error(T result) : _result(result) {}
operator T() const { return _result; }
private:
T _result;
};
void checkError();
void checkError(cudaError_t error);
class Device {
public:
Device(int device);
unsigned int get_capability();
size_t get_total_const_memory() const;
size_t get_free_memory() const;
size_t get_total_memory() const;
CUcontext get_current_context();
void set_context(CUcontext cntx);
private:
int m_device;
};
class Memory {
public:
void *data() { return m_ptr; }
size_t size() { return m_size; }
virtual void resize(size_t size) = 0;
template <typename T> operator T *() { return static_cast<T *>(m_ptr); }
protected:
size_t m_capacity = 0; // capacity, in bytes
size_t m_size = 0; // size, in bytes
void *m_ptr = nullptr;
};
class HostMemory : public virtual Memory {
public:
HostMemory(size_t size = 0, int flags = cudaHostAllocPortable);
virtual ~HostMemory();
void resize(size_t size) override;
void zero();
private:
void release();
int m_flags;
};
class DeviceMemory : public virtual Memory {
public:
DeviceMemory(size_t size = 0);
~DeviceMemory();
void resize(size_t size);
void zero(cudaStream_t stream = NULL);
template <typename T> operator T() { return static_cast<T>(m_ptr); }
private:
void release();
};
class Event {
public:
Event(int flags = cudaEventDefault);
~Event();
void synchronize();
float elapsedTime(Event &second);
operator cudaEvent_t();
private:
cudaEvent_t m_event;
};
class Stream {
public:
Stream(int flags = cudaStreamDefault);
~Stream();
void memcpyHtoDAsync(void *devPtr, const void *hostPtr, size_t size);
void memcpyDtoHAsync(void *hostPtr, void *devPtr, size_t size);
void memcpyDtoDAsync(void *dstPtr, void *srcPtr, size_t size);
void memcpyHtoD2DAsync(void *dstPtr, size_t dstWidth, const void *srcPtr,
size_t srcWidth, size_t widthBytes, size_t height);
void memcpyDtoH2DAsync(void *dstPtr, size_t dstWidth, const void *srcPtr,
size_t srcWidth, size_t widthBytes, size_t height);
void memcpyHtoH2DAsync(void *dstPtr, size_t dstWidth, const void *srcPtr,
size_t srcWidth, size_t widthBytes, size_t height);
void synchronize();
void waitEvent(Event &event);
void record(Event &event);
void zero(void *ptr, size_t size);
operator cudaStream_t();
private:
cudaStream_t m_stream;
};
class Marker {
public:
enum Color { red, green, blue, yellow, black };
Marker(const char *message, Marker::Color color = Color::red);
void start();
void end();
void start(cu::Event &event);
void end(cu::Event &event);
private:
unsigned int convert(Color color);
protected:
nvtxEventAttributes_t _attributes;
nvtxRangeId_t _id;
};
class ScopedMarker : public Marker {
public:
ScopedMarker(const char *message, Marker::Color color = Color::red);
~ScopedMarker();
void start() = delete;
void end() = delete;
void start(cu::Event &event) = delete;
void end(cu::Event &event) = delete;
};
} // end namespace cu
#endif // end CU_WRAPPER_H
\ No newline at end of file
......@@ -37,8 +37,8 @@
* \bug Asynchronous execution is not currently operational.
*/
#ifndef DEDISP_H_INCLUDE_GUARD
#define DEDISP_H_INCLUDE_GUARD
#ifndef DEDISP_COMMON_DEDISP_H_
#define DEDISP_COMMON_DEDISP_H_
// Use C linkage to allow cross-language use of the library
#ifdef __cplusplus
......@@ -489,4 +489,4 @@ const dedisp_size *dedisp_get_dt_factors(const dedisp_plan plan);
} // closing brace for extern "C"
#endif
#endif // DEDISP_H_INCLUDE_GUARD
#endif // DEDISP_COMMON_DEDISP_H_
#ifndef DEDISP_ERROR_H_INCLUDE_GUARD
#define DEDISP_ERROR_H_INCLUDE_GUARD
#ifndef DEDISP_COMMON_DEDISP_ERROR_H_
#define DEDISP_COMMON_DEDISP_ERROR_H_
#include <sstream>
#include <stdexcept>
......@@ -79,4 +79,4 @@ inline void _throw_error(dedisp_error error, char const *const func,
inline void check_error(dedisp_error error) { throw_error(error); }
#endif // DEDISP_ERROR_H_INCLUDE_GUARD
\ No newline at end of file
#endif // DEDISP_COMMON_DEDISP_ERROR_H_
\ No newline at end of file
......@@ -12,6 +12,8 @@ static const std::string output_memcpy_time_str = "Output memcpy time : ";
static const std::string runtime_time_str = "Runtime : ";
static const std::string gpuexec_time_str = "GPU execution time : ";
static const std::string total_time_str = "Total time : ";
static const std::string pmt_joules_str = "Joules : ";
static const std::string pmt_watts_str = "Watts : ";
static const std::string preprocessing_perf_str =
"Preprocessing performance : ";
......
#ifndef DEDISP_TYPES_H_INCLUDE_GUARD
#define DEDISP_TYPES_H_INCLUDE_GUARD
#ifndef DEDISP_COMMON_TYPES_H_
#define DEDISP_COMMON_TYPES_H_
// Types
// -----
......@@ -48,4 +48,4 @@ typedef enum {
* are complete.
*/
#endif // DEDISP_TYPES_H_INCLUDE_GUARD
\ No newline at end of file
#endif // DEDISP_COMMON_TYPES_H_
\ No newline at end of file
#include "helper.h"
#include <omp.h>
#include <sys/resource.h> // get used memory
#include <unistd.h> // get total memory
......
#ifndef HELPER_H_INCLUDE_GUARD
#define HELPER_H_INCLUDE_GUARD
#ifndef DEDISP_COMMON_HELPER_H_
#define DEDISP_COMMON_HELPER_H_
#include <cstddef>
......@@ -17,4 +17,4 @@ size_t get_free_memory();
} // end namespace dedisp
#endif // HELPER_H_INCLUDE_GUARD
\ No newline at end of file
#endif // DEDISP_COMMON_HELPER_H_
\ No newline at end of file
......@@ -2,17 +2,20 @@ add_library(
dedisp SHARED
cinterface.cpp
DedispPlan.cpp
dedisperse/dedisperse.cu
unpack/unpack.cu
transpose/transpose.cu
dedisperse/DedispKernel.cu
unpack/UnpackKernel.cu
transpose/TransposeKernel.cu
$<TARGET_OBJECTS:common>
$<TARGET_OBJECTS:plan>
$<TARGET_OBJECTS:external>)
target_include_directories(dedisp PRIVATE ${CMAKE_SOURCE_DIR}/src)
target_include_directories(dedisp PRIVATE "${CMAKE_SOURCE_DIR}/src")
target_link_libraries(dedisp CUDA::cudart CUDA::nvToolsExt CUDA::cuda_driver
OpenMP::OpenMP_CXX)
target_embed_source(dedisp transpose/transpose_kernel.cu)
target_embed_source(dedisp dedisperse/dedisperse_kernel.cu)
target_link_libraries(dedisp PUBLIC OpenMP::OpenMP_CXX cudawrappers::cu
cudawrappers::nvtx cudawrappers::nvrtc)
target_compile_options(dedisp
PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math>)
......@@ -23,6 +26,11 @@ set_target_properties(
SOVERSION ${DEDISP_VERSION_MAJOR}
PUBLIC_HEADER DedispPlan.hpp)
if(${DEDISP_BACKEND_HIP})
get_target_property(sources ${PROJECT_NAME} SOURCES)
set_source_files_properties(${sources} PROPERTIES LANGUAGE HIP)
endif()
install(
TARGETS dedisp
LIBRARY DESTINATION lib
......
......@@ -6,9 +6,8 @@
#include "common/dedisp_strings.h"
#include "common/dedisp_types.h"
#include "dedisperse/dedisperse.h"
#include "transpose/transpose.hpp"
#include "unpack/unpack.h"
#include "dedisperse/dedisp_constants.cuh"
#include "unpack/UnpackKernel.hpp"
#if defined(DEDISP_BENCHMARK)
#include "external/Stopwatch.h"
......@@ -45,8 +44,6 @@ void DedispPlan::set_gulp_size(size_type gulp_size) { m_gulp_size = gulp_size; }
void DedispPlan::execute(size_type nsamps, const byte_type *in,
size_type in_nbits, byte_type *out,
size_type out_nbits, unsigned flags) {
enum { BITS_PER_BYTE = 8 };
// Note: The default out_stride is nsamps - m_max_delay
dedisp_size out_bytes_per_sample =
out_nbits / (sizeof(dedisp_byte) * BITS_PER_BYTE);
......@@ -76,13 +73,6 @@ void DedispPlan::execute_guru(size_type nsamps, const byte_type *in,
byte_type *out, size_type out_nbits,
size_type out_stride, size_type first_dm_idx,
size_type dm_count, unsigned flags) {
cu::checkError();
enum {
BITS_PER_BYTE = 8,
BYTES_PER_WORD = sizeof(dedisp_word) / sizeof(dedisp_byte)
};
dedisp_size out_bytes_per_sample =
out_nbits / (sizeof(dedisp_byte) * BITS_PER_BYTE);
......@@ -126,10 +116,6 @@ void DedispPlan::execute_guru(size_type nsamps, const byte_type *in,
init_timer->Start();
#endif
// Copy the lookup tables to constant memory on the device
copy_delay_table(d_delay_table, m_nchans * sizeof(dedisp_float), 0, 0);
copy_killmask(d_killmask, m_nchans * sizeof(dedisp_bool), 0, 0);
// Compute the problem decomposition
dedisp_size nsamps_computed = nsamps - m_max_delay;
......@@ -219,39 +205,41 @@ void DedispPlan::execute_guru(size_type nsamps, const byte_type *in,
nsamps_gulp); // height
htodstream->synchronize();
#ifdef DEDISP_BENCHMARK
cudaDeviceSynchronize();
input_timer->Pause();
preprocessing_timer->Start();
#endif
// Transpose the words in the input
transpose((dedisp_word *)d_in, nchan_words, nsamps_gulp,
dedisp_kernel_transpose.run(d_in, nchan_words, nsamps_gulp,
in_buf_stride_words, nsamps_padded_gulp,
(dedisp_word *)d_transposed);
d_transposed, *executestream);
#ifdef DEDISP_BENCHMARK
cudaDeviceSynchronize();
executestream->synchronize();
#endif
// Unpack the transposed data
unpack(d_transposed, nsamps_padded_gulp, nchan_words, d_unpacked, in_nbits,
unpacked_in_nbits);
dedisp_kernel_unpack.run(d_transposed, nsamps_padded_gulp, nchan_words,
d_unpacked, in_nbits, unpacked_in_nbits,
*executestream);
#ifdef DEDISP_BENCHMARK
cudaDeviceSynchronize();
executestream->synchronize();
preprocessing_timer->Pause();
dedispersion_timer->Start();
#endif
// Perform direct dedispersion without scrunching
if (!dedisperse( // d_transposed,
if (!dedisp_kernel_dedisperse.run( // d_transposed,
d_unpacked, nsamps_padded_gulp, nsamps_computed_gulp,
unpacked_in_nbits, // in_nbits,
m_nchans, 1, d_dm_list, dm_count, 1, d_out, out_stride_gulp_samples,
out_nbits, 1, 0, 0, 0, 0)) {
m_nchans, 1, *d_dm_list, dm_count, 1, d_out,
out_stride_gulp_samples, out_nbits, 1, 0, 0, 0, 0,
h_delay_table.data(), h_delay_table.size() * sizeof(dedisp_float),
h_killmask.data(), h_killmask.size() * sizeof(dedisp_bool),
*htodstream, *executestream)) {
throw_error(DEDISP_INTERNAL_GPU_ERROR);
}
#ifdef DEDISP_BENCHMARK
cudaDeviceSynchronize();
executestream->synchronize();
dedispersion_timer->Pause();
#endif
// Copy output back to host memory
......@@ -263,13 +251,12 @@ void DedispPlan::execute_guru(size_type nsamps, const byte_type *in,
#endif
dtohstream->memcpyDtoH2DAsync(out + gulp_samp_byte_idx, // dst
out_stride, // dst stride
(byte_type *)d_out, // src
d_out, // src
out_stride_gulp_bytes, // src stride
nsamp_bytes_computed_gulp, // width bytes
dm_count); // height
dtohstream->synchronize();
#ifdef DEDISP_BENCHMARK
cudaDeviceSynchronize();
output_timer->Pause();
#endif
......
#ifndef H_DEDISP_PLAN_INCLUDE_GUARD
#define H_DEDISP_PLAN_INCLUDE_GUARD
#ifndef DEDISP_DEDISP_DEDISP_PLAN_HPP_
#define DEDISP_DEDISP_DEDISP_PLAN_HPP_
#include "GPUPlan.hpp"
#include "dedisperse/DedispKernel.hpp"
#include "transpose/TransposeKernel.hpp"
#include "unpack/UnpackKernel.hpp"
namespace dedisp {
......@@ -48,8 +51,11 @@ public:
private:
dedisp_size m_gulp_size;
TransposeKernel dedisp_kernel_transpose;
DedisperseKernel dedisp_kernel_dedisperse;
UnpackKernel dedisp_kernel_unpack;
};
} // end namespace dedisp
#endif // H_DEDISP_PLAN_INCLUDE_GUARD
\ No newline at end of file
#endif // DEDISP_DEDISP_DEDISP_PLAN_HPP_
\ No newline at end of file
......@@ -45,10 +45,6 @@ dedisp_error dedisp_create_plan(dedisp_plan *plan, dedisp_size nchans,
*plan = nullptr;
if (cudaGetLastError() != cudaSuccess) {
throw_error(DEDISP_PRIOR_GPU_ERROR);
}
*plan = new dedisp_plan_struct();
if (!plan) {
throw_error(DEDISP_MEM_ALLOC_FAILED);
......@@ -70,10 +66,6 @@ dedisp_error dedisp_set_gulp_size(dedisp_plan plan, dedisp_size gulp_size) {
throw_error(DEDISP_INVALID_PLAN);
}
if (cudaGetLastError() != cudaSuccess) {
throw_error(DEDISP_PRIOR_GPU_ERROR);
}
try {
static_cast<dedisp::DedispPlan *>(plan->plan_.get())
->set_gulp_size(gulp_size);
......@@ -89,10 +81,6 @@ dedisp_size dedisp_get_gulp_size(dedisp_plan plan) {
throw_error(DEDISP_INVALID_PLAN);
}
if (cudaGetLastError() != cudaSuccess) {
throw_error(DEDISP_PRIOR_GPU_ERROR);
}
try {
return static_cast<dedisp::DedispPlan *>(plan->plan_.get())
->get_gulp_size();
......@@ -107,10 +95,6 @@ dedisp_error dedisp_set_dm_list(dedisp_plan plan, const dedisp_float *dm_list,
throw_error(DEDISP_INVALID_PLAN);
}
if (cudaGetLastError() != cudaSuccess) {
throw_error(DEDISP_PRIOR_GPU_ERROR);
}
try {
plan->plan_->set_dm_list(dm_list, count);
} catch (...) {
......@@ -127,10 +111,6 @@ dedisp_error dedisp_generate_dm_list(dedisp_plan plan, dedisp_float dm_start,
throw_error(DEDISP_INVALID_PLAN);
}
if (cudaGetLastError() != cudaSuccess) {
throw_error(DEDISP_PRIOR_GPU_ERROR);
}
try {
plan->plan_->generate_dm_list(dm_start, dm_end, ti, tol);
} catch (...) {
......@@ -190,10 +170,6 @@ dedisp_error dedisp_set_killmask(dedisp_plan plan,
throw_error(DEDISP_INVALID_PLAN);
}
if (cudaGetLastError() != cudaSuccess) {
throw_error(DEDISP_PRIOR_GPU_ERROR);
}
try {
plan->plan_->set_killmask(killmask);
} catch (...) {
......@@ -295,10 +271,6 @@ dedisp_error dedisp_execute_guru(const dedisp_plan plan, dedisp_size nsamps,
throw_error(DEDISP_INVALID_PLAN);
}
if (cudaGetLastError() != cudaSuccess) {
throw_error(DEDISP_PRIOR_GPU_ERROR);
}
try {
static_cast<dedisp::DedispPlan *>(plan->plan_.get())
->execute_guru(nsamps, in, in_nbits, in_stride, out, out_nbits,
......@@ -319,10 +291,6 @@ dedisp_error dedisp_execute_adv(const dedisp_plan plan, dedisp_size nsamps,
throw_error(DEDISP_INVALID_PLAN);
}
if (cudaGetLastError() != cudaSuccess) {
throw_error(DEDISP_PRIOR_GPU_ERROR);
}
try {
static_cast<dedisp::DedispPlan *>(plan->plan_.get())
->execute_adv(nsamps, in, in_nbits, in_stride, out, out_nbits,
......@@ -343,10 +311,6 @@ dedisp_error dedisp_execute(const dedisp_plan plan, dedisp_size nsamps,
throw_error(DEDISP_INVALID_PLAN);
}
if (cudaGetLastError() != cudaSuccess) {
throw_error(DEDISP_PRIOR_GPU_ERROR);
}
try {
plan->plan_->execute(nsamps, in, in_nbits, out, out_nbits, flags);
} catch (...) {
......@@ -358,7 +322,7 @@ dedisp_error dedisp_execute(const dedisp_plan plan, dedisp_size nsamps,
dedisp_error dedisp_sync(void) {
try {
cu::checkError(cudaDeviceSynchronize());
cu::Context::synchronize();
} catch (...) {
throw_error(DEDISP_PRIOR_GPU_ERROR);
}
......
#include "DedispKernel.hpp"
#include "GPUKernel.hpp"
#include "dedisperse_kernel.cu"
#include <cudawrappers/cu.hpp>
#include <iostream>
#include <memory>
/*
* Helper functions
*/
/*
* dedisperse routine
*/
bool DedisperseKernel::run(
const dedisp_word *d_in, dedisp_size in_stride, dedisp_size nsamps,
dedisp_size in_nbits, dedisp_size nchans, dedisp_size chan_stride,
const dedisp_float *d_dm_list, dedisp_size dm_count, dedisp_size dm_stride,
dedisp_byte *d_out, dedisp_size out_stride, dedisp_size out_nbits,
dedisp_size batch_size, dedisp_size batch_in_stride,
dedisp_size batch_dm_stride, dedisp_size batch_chan_stride,
dedisp_size batch_out_stride, const void *delay_table,
const size_t delay_table_size, const void *killmask,
const size_t killmask_size, cu::Stream &htodstream,
cu::Stream &executestream) {
// Define thread decomposition
// Note: Block dimensions x and y represent time samples and DMs respectively
dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
// Note: Grid dimension x represents time samples. Dimension y represents
// DMs and batch jobs flattened together.
// Divide and round up
dedisp_size nsamp_blocks =
(nsamps - 1) / ((dedisp_size)DEDISP_SAMPS_PER_THREAD * block.x) + 1;
dedisp_size ndm_blocks = (dm_count - 1) / (dedisp_size)block.y + 1;
// Constrain the grid size to the maximum allowed
// TODO: Consider cropping the batch size dimension instead and looping over
// it inside the kernel
ndm_blocks = min((unsigned int)ndm_blocks,
(unsigned int)(MAX_CUDA_GRID_SIZE_Y / batch_size));
// Note: We combine the DM and batch dimensions into one
dim3 grid(nsamp_blocks, ndm_blocks * batch_size);
// Divide and round up
dedisp_size nsamps_reduced = (nsamps - 1) / DEDISP_SAMPS_PER_THREAD + 1;
const std::vector<const void *> parameters = {&d_in,
&nsamps,
&nsamps_reduced,
&nsamp_blocks,
&in_stride,
&dm_count,
&dm_stride,
&ndm_blocks,
&nchans,
&chan_stride,
&d_out,
&out_nbits,
&out_stride,
&d_dm_list,
&batch_in_stride,
&batch_dm_stride,
&batch_chan_stride,
&batch_out_stride};
std::ostringstream ss;
ss << "-DPARAM_IN_NBITS=" << in_nbits;
CompiledKernelInfo kernel;
bool did_recompile;
std::tie(kernel, did_recompile) = compile({ss.str()});
assertCompiled(kernel);
// Copy delay table and killmask
if (did_recompile) {
if (delay_table) {
htodstream.memcpyHtoDAsync(kernel.module->getGlobal("c_delay_table"),
delay_table, delay_table_size);
htodstream.synchronize();
}
if (killmask) {
htodstream.memcpyHtoDAsync(kernel.module->getGlobal("c_killmask"),
killmask, killmask_size);
htodstream.synchronize();
}
}
executestream.launchKernel(*(kernel.function), grid.x, grid.y, grid.z,
block.x, block.y, block.z, 0, parameters);
return true;
}
#ifndef DEDISP_DEDISP_DEDISPERSE_DEDISP_KERNEL_HPP_
#define DEDISP_DEDISP_DEDISPERSE_DEDISP_KERNEL_HPP_
#include <string>
#include <cudawrappers/cu.hpp>
#include "common/dedisp_types.h"
#include <GPUKernel.hpp>
extern const char _binary_src_dedisp_dedisperse_dedisperse_kernel_cu_start,
_binary_src_dedisp_dedisperse_dedisperse_kernel_cu_end;
class DedisperseKernel : public GPUKernel {
public:
DedisperseKernel()
: GPUKernel(
"dedisperse_kernel.cu", "dedisperse_kernel",
std::string(
reinterpret_cast<const char *>(
&_binary_src_dedisp_dedisperse_dedisperse_kernel_cu_start),
reinterpret_cast<const char *>(
&_binary_src_dedisp_dedisperse_dedisperse_kernel_cu_end))) {
}
bool run(const dedisp_word *d_in, dedisp_size in_stride, dedisp_size nsamps,
dedisp_size in_nbits, dedisp_size nchans, dedisp_size chan_stride,
const dedisp_float *d_dm_list, dedisp_size dm_count,
dedisp_size dm_stride, dedisp_byte *d_out, dedisp_size out_stride,
dedisp_size out_nbits, dedisp_size batch_size,
dedisp_size batch_in_stride, dedisp_size batch_dm_stride,
dedisp_size batch_chan_stride, dedisp_size batch_out_stride,
const void *delay_table, const size_t delay_table_size,
const void *killmask, const size_t killmask_size,
cu::Stream &h2dstream, cu::Stream &stream);
private:
const void *delay_table_src_host_ = nullptr;
size_t delay_table_count_ = 0;
size_t delay_table_offset_ = 0;
const void *killmask_src_host_ = nullptr;
size_t killmask_count_ = 0;
size_t killmask_offset_ = 0;
};
#endif // DEDISP_DEDISP_DEDISPERSE_DEDISP_KERNEL_HPP_
\ No newline at end of file
#ifndef DEDISP_DEDISP_DEDISPERSE_CONSTANTS_CUH_
#define DEDISP_DEDISP_DEDISPERSE_CONSTANTS_CUH_
#define DEDISP_DEFAULT_GULP_SIZE 65536u // 131072
// TODO: Make sure this doesn't limit GPU constant memory
// available to users.
#define DEDISP_MAX_NCHANS 8192u
// Kernel tuning parameters
#define DEDISP_BLOCK_SIZE 256u
#define DEDISP_BLOCK_SAMPS 8u
#define DEDISP_SAMPS_PER_THREAD 2u // 4 is better for Fermi?
#define BITS_PER_BYTE 8
#define BYTES_PER_WORD (sizeof(dedisp_word) / sizeof(unsigned char))
#define MAX_CUDA_GRID_SIZE_X 65535u
#define MAX_CUDA_GRID_SIZE_Y 65535u
constexpr unsigned int BLOCK_DIM_X = DEDISP_BLOCK_SAMPS;
constexpr unsigned int BLOCK_DIM_Y = (DEDISP_BLOCK_SIZE / DEDISP_BLOCK_SAMPS);
#endif // DEDISP_DEDISP_DEDISPERSE_CONSTANTS_CUH_
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment