diff --git a/src/fdd/FDDGPUPlan.cpp b/src/fdd/FDDGPUPlan.cpp
index 86cb095ad8a671aafc0d3a2b82ce9b9525c29d0d..33f3bbb52037fe4bf5bb87808fe72dc9fbc09f0b 100644
--- a/src/fdd/FDDGPUPlan.cpp
+++ b/src/fdd/FDDGPUPlan.cpp
@@ -111,15 +111,16 @@ void FDDGPUPlan::execute_gpu(
 
     // Maximum number of DMs computed in one gulp
     // Parameters might be tuned for efficiency depending on system architecture
-    unsigned int ndm_batch_max = std::min(ndm / 4, (unsigned int) 64);
-    unsigned int ndm_fft_batch = 32;
+    unsigned int ndm_batch_max = round_up((ndm / 8), 8);
+    ndm_batch_max = std::max(ndm_batch_max, (unsigned int) 64); // ndm_batch_max >= NDM_BATCH_GRID
+    unsigned int ndm_fft_batch = 16;
     ndm_fft_batch = std::min(ndm_batch_max, ndm_fft_batch);
     // The number of buffers for DM results is configured below based on the amount of available GPU memory.
-    unsigned int ndm_buffers = 1;
+    unsigned int ndm_buffers = 2;
 
     // Maximum number of channels processed in one gulp
     // Parameters might be tuned for efficiency depending on system architecture
-    unsigned int nchan_batch_max = std::min(nchan / 4, (unsigned int) 64);
+    unsigned int nchan_batch_max = std::min(nchan / 8, (unsigned int) 64);
     unsigned int nchan_fft_batch = 64;
     unsigned int nchan_buffers = 2;
 
@@ -214,16 +215,15 @@ void FDDGPUPlan::execute_gpu(
     mPrepSpinf.end();
 
     // Determine the amount of memory to use
-    size_t d_memory_total = m_device->get_total_memory();
-    size_t d_memory_free = m_device->get_free_memory();
+    size_t d_memory_total = m_device->get_total_memory(); // in Bytes
+    size_t d_memory_free = m_device->get_free_memory(); // in Bytes
     size_t sizeof_data_t_nu = 1ULL * nsamp * nchan_words_gulp * sizeof(dedisp_word);
     size_t sizeof_data_x_nu = 1ULL * nchan_batch_max * nsamp_padded * sizeof(float);
     size_t sizeof_data_x_dm = 1ULL * ndm_batch_max * nsamp_padded * sizeof(float);
-    // For device side, initial value
     size_t d_memory_required = sizeof_data_t_nu * nchan_buffers +
-                                sizeof_data_x_nu * 1 +
-                                sizeof_data_x_dm * ndm_buffers;
-    size_t d_memory_reserved = 0.05 * d_memory_total;
+                               sizeof_data_x_nu * 1 +
+                               sizeof_data_x_dm * ndm_buffers;
+    size_t d_memory_reserved = 0.05 * d_memory_total; // 5% margin
 
     // Subtract the memory usage of any pre-existing device buffers
     size_t d_memory_in_use = 0;
@@ -237,15 +237,45 @@ void FDDGPUPlan::execute_gpu(
     }
     d_memory_free += d_memory_in_use;
 
-    // Iteratively search for a maximum amount of ndm_buffers, with safety margin
-    // Make sure that it fits on device memory
+    // For host side
+    size_t h_memory_total = get_total_memory() / std::pow(1024, 1); // in GBytes
+    size_t h_memory_free = get_free_memory() / std::pow(1024, 1); // in GBytes
+    size_t h_memory_required = sizeof_data_t_nu * nchan_buffers +
+                               sizeof_data_x_dm * ndm_buffers; // in Bytes
+    size_t h_memory_reserved = 0.05 * h_memory_free; // 5% margin
+
+    if ((((double) h_memory_required / std::pow(1024, 3)) + h_memory_reserved) > h_memory_free)
+    {
+        /* Note: does not take uninitialized application memory into account!
+         * E.g. a malloc for the paged output buffer on the application side does not register the buffer as system memory in use.
+         * Over-using host memory for the application + plan is the responsibility of the application;
+         * here we can only check for the memory used by the plan itself. */
+        std::cout << "Host memory total = " << h_memory_total << " Gb" << std::endl;
+        std::cout << "Host memory free = " << h_memory_free << " Gb" << std::endl;
+        std::cout << "Host memory required = " << h_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
+        throw std::runtime_error("FDDGPUPlan runtime error: required host memory is too large");
+    }
+
+    // Iteratively search for a setting of ndm_batch_max and ndm_buffers
+    // to match the available device memory.
+    while (ndm_batch_max > 0 &&
+           (d_memory_required + d_memory_reserved) > d_memory_free)
+    {
+        ndm_batch_max /= 2;
+        d_memory_required -= sizeof_data_x_dm;
+        sizeof_data_x_dm /= 2;
+        d_memory_required += sizeof_data_x_dm;
+    }
+
     while ((ndm_buffers * ndm_batch_max) < ndm &&
-           (d_memory_required + d_memory_reserved + sizeof_data_x_dm) < d_memory_free)
+           (d_memory_required + d_memory_reserved) < d_memory_free)
     {
         ndm_buffers++;
         d_memory_required = sizeof_data_t_nu * nchan_buffers +
-                             sizeof_data_x_nu * 1 +
-                             sizeof_data_x_dm * (ndm_buffers);
+                            sizeof_data_x_nu * 1 +
+                            sizeof_data_x_dm * ndm_buffers;
+        h_memory_required = sizeof_data_t_nu * nchan_buffers +
+                            sizeof_data_x_dm * ndm_buffers;
     };
 
     // Debug
@@ -255,9 +285,10 @@ void FDDGPUPlan::execute_gpu(
     std::cout << "nchan_buffers = " << nchan_buffers << " x " << nchan_batch_max << " channels" << std::endl;
     std::cout << "Device memory total = " << d_memory_total / std::pow(1024, 3) << " Gb" << std::endl;
     std::cout << "Device memory free = " << d_memory_free / std::pow(1024, 3) << " Gb" << std::endl;
-    std::cout << "Device Memory required = " << d_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
-    std::cout << "Host memory total = " << get_total_memory() / std::pow(1024, 1) << " Gb" << std::endl;
-    std::cout << "Host memory free = " << get_free_memory() / std::pow(1024, 1) << " Gb" << std::endl;
+    std::cout << "Device memory required = " << d_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
+    std::cout << "Host memory total = " << h_memory_total << " Gb" << std::endl;
+    std::cout << "Host memory free = " << h_memory_free << " Gb" << std::endl;
+    std::cout << "Host memory required = " << h_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
 #endif
 
     // Allocate memory
@@ -302,10 +333,10 @@ void FDDGPUPlan::execute_gpu(
     mAllocMem.end();
 
 #ifdef DEDISP_DEBUG
-    size_t d_memory_free_after_malloc = m_device->get_free_memory(); //bytes
-    size_t h_memory_free_after_malloc = get_free_memory(); //MB
-    std::cout << "Device memory free after memory allocations = " << d_memory_free_after_malloc / std::pow(1024, 3) << " Gb" << std::endl;
-    std::cout << "Host memory free after memory allocations = " << h_memory_free_after_malloc / std::pow(1024, 1) << " Gb" << std::endl;
+    size_t d_memory_free_after_malloc = m_device->get_free_memory(); // in Bytes
+    size_t h_memory_free_after_malloc = get_free_memory(); // in Mbytes
+    std::cout << "Device memory free after memory allocations = " << d_memory_free_after_malloc / std::pow(1024, 3) << " Gb" << std::endl;
+    std::cout << "Host memory free after memory allocations = " << h_memory_free_after_malloc / std::pow(1024, 1) << " Gb" << std::endl;
 #endif
 
     // Initialize FDDKernel
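// ----------------------------------------------------------------------------
// Illustrative sketch, not part of the patch above: the device-memory fitting
// strategy from the hunk at -237,15 +237,45, reduced to a standalone C++
// program. All byte counts and parameter values below are hypothetical example
// inputs; in FDDGPUPlan::execute_gpu() they come from the device query and the
// dedispersion parameters. Unlike the patch, the requirement is recomputed from
// scratch each iteration rather than updated incrementally.
#include <cstddef>
#include <iostream>

int main()
{
    // Hypothetical example inputs (assumed values, not taken from the plan)
    std::size_t d_memory_free     = 4ULL << 30;         // free device memory, bytes
    std::size_t d_memory_reserved = d_memory_free / 20; // 5% safety margin
    std::size_t sizeof_data_t_nu  = 512ULL << 20;       // one gulp of input samples
    std::size_t sizeof_data_x_nu  = 256ULL << 20;       // channel-batch spin-frequency buffer
    std::size_t sizeof_data_x_dm  = 1ULL << 30;         // one batch of DM output data
    unsigned int ndm           = 1024;
    unsigned int ndm_batch_max = 128;
    unsigned int ndm_buffers   = 2;
    unsigned int nchan_buffers = 2;

    // Total device memory required for the current configuration
    auto required = [&]() {
        return sizeof_data_t_nu * nchan_buffers +
               sizeof_data_x_nu * 1 +
               sizeof_data_x_dm * ndm_buffers;
    };

    // Step 1: halve the DM batch until the minimal configuration fits
    while (ndm_batch_max > 0 &&
           (required() + d_memory_reserved) > d_memory_free)
    {
        ndm_batch_max /= 2;
        sizeof_data_x_dm /= 2; // the DM output buffer shrinks with the batch size
    }

    // Step 2: add DM buffers while memory allows and more DMs remain uncovered
    while ((ndm_buffers * ndm_batch_max) < ndm &&
           (required() + d_memory_reserved) < d_memory_free)
    {
        ndm_buffers++;
    }

    std::cout << "ndm_batch_max = " << ndm_batch_max
              << ", ndm_buffers = " << ndm_buffers
              << ", bytes required = " << required() << std::endl;
    return 0;
}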