Skip to content
Snippets Groups Projects
Commit 0bd56ff2 authored by Bram Veenboer's avatar Bram Veenboer
Browse files

Update computation and reporting of memory requirements

The goal is to have at least double-buffering in both channels and DMs.
To achieve this, the code now uses 'more optimal' initial values and
reduces ndm_batch_max to prevent running out of GPU memory.
parent a7121ab9
No related branches found
Tags
No related merge requests found
...@@ -111,15 +111,16 @@ void FDDGPUPlan::execute_gpu( ...@@ -111,15 +111,16 @@ void FDDGPUPlan::execute_gpu(
// Maximum number of DMs computed in one gulp // Maximum number of DMs computed in one gulp
// Parameters might be tuned for efficiency depending on system architecture // Parameters might be tuned for efficiency depending on system architecture
unsigned int ndm_batch_max = std::min(ndm / 4, (unsigned int) 64); unsigned int ndm_batch_max = round_up((ndm / 8), 8);
unsigned int ndm_fft_batch = 32; ndm_batch_max = std::max(ndm_batch_max, (unsigned int) 64); // ndm_batch_max >= NDM_BATCH_GRID
unsigned int ndm_fft_batch = 16;
ndm_fft_batch = std::min(ndm_batch_max, ndm_fft_batch); ndm_fft_batch = std::min(ndm_batch_max, ndm_fft_batch);
// The number of buffers for DM results is configured below based on the amount of available GPU memory. // The number of buffers for DM results is configured below based on the amount of available GPU memory.
unsigned int ndm_buffers = 1; unsigned int ndm_buffers = 2;
// Maximum number of channels processed in one gulp // Maximum number of channels processed in one gulp
// Parameters might be tuned for efficiency depending on system architecture // Parameters might be tuned for efficiency depending on system architecture
unsigned int nchan_batch_max = std::min(nchan / 4, (unsigned int) 64); unsigned int nchan_batch_max = std::min(nchan / 8, (unsigned int) 64);
unsigned int nchan_fft_batch = 64; unsigned int nchan_fft_batch = 64;
unsigned int nchan_buffers = 2; unsigned int nchan_buffers = 2;
...@@ -214,16 +215,15 @@ void FDDGPUPlan::execute_gpu( ...@@ -214,16 +215,15 @@ void FDDGPUPlan::execute_gpu(
mPrepSpinf.end(); mPrepSpinf.end();
// Determine the amount of memory to use // Determine the amount of memory to use
size_t d_memory_total = m_device->get_total_memory(); size_t d_memory_total = m_device->get_total_memory(); // in Bytes
size_t d_memory_free = m_device->get_free_memory(); size_t d_memory_free = m_device->get_free_memory(); // in Bytes
size_t sizeof_data_t_nu = 1ULL * nsamp * nchan_words_gulp * sizeof(dedisp_word); size_t sizeof_data_t_nu = 1ULL * nsamp * nchan_words_gulp * sizeof(dedisp_word);
size_t sizeof_data_x_nu = 1ULL * nchan_batch_max * nsamp_padded * sizeof(float); size_t sizeof_data_x_nu = 1ULL * nchan_batch_max * nsamp_padded * sizeof(float);
size_t sizeof_data_x_dm = 1ULL * ndm_batch_max * nsamp_padded * sizeof(float); size_t sizeof_data_x_dm = 1ULL * ndm_batch_max * nsamp_padded * sizeof(float);
// For device side, initial value
size_t d_memory_required = sizeof_data_t_nu * nchan_buffers + size_t d_memory_required = sizeof_data_t_nu * nchan_buffers +
sizeof_data_x_nu * 1 + sizeof_data_x_nu * 1 +
sizeof_data_x_dm * ndm_buffers; sizeof_data_x_dm * ndm_buffers;
size_t d_memory_reserved = 0.05 * d_memory_total; size_t d_memory_reserved = 0.05 * d_memory_total; // 5% margin
// Subtract the memory usage of any pre-existing device buffers // Subtract the memory usage of any pre-existing device buffers
size_t d_memory_in_use = 0; size_t d_memory_in_use = 0;
...@@ -237,15 +237,45 @@ void FDDGPUPlan::execute_gpu( ...@@ -237,15 +237,45 @@ void FDDGPUPlan::execute_gpu(
} }
d_memory_free += d_memory_in_use; d_memory_free += d_memory_in_use;
// Iteratively search for a maximum amount of ndm_buffers, with safety margin // For host side
// Make sure that it fits on device memory size_t h_memory_total = get_total_memory() / std::pow(1024, 1); // in GBytes
size_t h_memory_free = get_free_memory() / std::pow(1024, 1); // in GBytes
size_t h_memory_required = sizeof_data_t_nu * nchan_buffers +
sizeof_data_x_dm * ndm_buffers; // in Bytes
size_t h_memory_reserved = 0.05 * h_memory_free * 0.05; // 5% margin
if ((((double) h_memory_required / std::pow(1024, 3)) + h_memory_reserved) > h_memory_free)
{
/* Note: does not take uninitialized application memory in to account!
* E.g. a malloc for the paged output buffer on the application side does not register the buffer as system memory in use
 * Over-using host memory for the application + plan is the responsibility of the application,
* here we can only check for the memory used by the plan itself.*/
std::cout << "Host memory total = " << h_memory_total << " Gb" << std::endl;
std::cout << "Host memory free = " << h_memory_free << " Gb" << std::endl;
std::cout << "Host memory required = " << h_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
throw std::runtime_error("FDDGPUPlan runtime error: required host memory is too large");
}
// Iteratively search for a setting of ndm_batch_max and ndm_buffers
// to match the available device memory.
while (ndm_batch_max > 0 &&
(d_memory_required + d_memory_reserved) > d_memory_free)
{
ndm_batch_max /= 2;
d_memory_required -= sizeof_data_x_dm;
sizeof_data_x_dm /= 2;
d_memory_required += sizeof_data_x_dm;
}
while ((ndm_buffers * ndm_batch_max) < ndm && while ((ndm_buffers * ndm_batch_max) < ndm &&
(d_memory_required + d_memory_reserved + sizeof_data_x_dm) < d_memory_free) (d_memory_required + d_memory_reserved) < d_memory_free)
{ {
ndm_buffers++; ndm_buffers++;
d_memory_required = sizeof_data_t_nu * nchan_buffers + d_memory_required = sizeof_data_t_nu * nchan_buffers +
sizeof_data_x_nu * 1 + sizeof_data_x_nu * 1 +
sizeof_data_x_dm * (ndm_buffers); sizeof_data_x_dm * ndm_buffers;
h_memory_required = sizeof_data_t_nu * nchan_buffers +
sizeof_data_x_dm * ndm_buffers;
}; };
// Debug // Debug
...@@ -255,9 +285,10 @@ void FDDGPUPlan::execute_gpu( ...@@ -255,9 +285,10 @@ void FDDGPUPlan::execute_gpu(
std::cout << "nchan_buffers = " << nchan_buffers << " x " << nchan_batch_max << " channels" << std::endl; std::cout << "nchan_buffers = " << nchan_buffers << " x " << nchan_batch_max << " channels" << std::endl;
std::cout << "Device memory total = " << d_memory_total / std::pow(1024, 3) << " Gb" << std::endl; std::cout << "Device memory total = " << d_memory_total / std::pow(1024, 3) << " Gb" << std::endl;
std::cout << "Device memory free = " << d_memory_free / std::pow(1024, 3) << " Gb" << std::endl; std::cout << "Device memory free = " << d_memory_free / std::pow(1024, 3) << " Gb" << std::endl;
std::cout << "Device Memory required = " << d_memory_required / std::pow(1024, 3) << " Gb" << std::endl; std::cout << "Device memory required = " << d_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
std::cout << "Host memory total = " << get_total_memory() / std::pow(1024, 1) << " Gb" << std::endl; std::cout << "Host memory total = " << h_memory_total << " Gb" << std::endl;
std::cout << "Host memory free = " << get_free_memory() / std::pow(1024, 1) << " Gb" << std::endl; std::cout << "Host memory free = " << h_memory_free << " Gb" << std::endl;
std::cout << "Host Memory required = " << h_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
#endif #endif
// Allocate memory // Allocate memory
...@@ -302,8 +333,8 @@ void FDDGPUPlan::execute_gpu( ...@@ -302,8 +333,8 @@ void FDDGPUPlan::execute_gpu(
mAllocMem.end(); mAllocMem.end();
#ifdef DEDISP_DEBUG #ifdef DEDISP_DEBUG
size_t d_memory_free_after_malloc = m_device->get_free_memory(); //bytes size_t d_memory_free_after_malloc = m_device->get_free_memory(); // in Bytes
size_t h_memory_free_after_malloc = get_free_memory(); //MB size_t h_memory_free_after_malloc = get_free_memory(); // in Mbytes
std::cout << "Device memory free after memory allocations = " << d_memory_free_after_malloc / std::pow(1024, 3) << " Gb" << std::endl; std::cout << "Device memory free after memory allocations = " << d_memory_free_after_malloc / std::pow(1024, 3) << " Gb" << std::endl;
std::cout << "Host memory free after memory allocations = " << h_memory_free_after_malloc / std::pow(1024, 1) << " Gb" << std::endl; std::cout << "Host memory free after memory allocations = " << h_memory_free_after_malloc / std::pow(1024, 1) << " Gb" << std::endl;
#endif #endif
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment