Skip to content
Snippets Groups Projects
Commit 0bd56ff2 authored by Bram Veenboer's avatar Bram Veenboer
Browse files

Update computation and reporting of memory requirements

The goal is to have at least double-buffering in both channels and DMs.
To achieve this, the code now uses 'more optimal' initial values and
reduces ndm_batch_max to prevent running out of GPU memory.
parent a7121ab9
No related branches found
Tags
No related merge requests found
...@@ -111,15 +111,16 @@ void FDDGPUPlan::execute_gpu( ...@@ -111,15 +111,16 @@ void FDDGPUPlan::execute_gpu(
// Maximum number of DMs computed in one gulp // Maximum number of DMs computed in one gulp
// Parameters might be tuned for efficiency depending on system architecture // Parameters might be tuned for efficiency depending on system architecture
unsigned int ndm_batch_max = std::min(ndm / 4, (unsigned int) 64); unsigned int ndm_batch_max = round_up((ndm / 8), 8);
unsigned int ndm_fft_batch = 32; ndm_batch_max = std::max(ndm_batch_max, (unsigned int) 64); // ndm_batch_max >= NDM_BATCH_GRID
unsigned int ndm_fft_batch = 16;
ndm_fft_batch = std::min(ndm_batch_max, ndm_fft_batch); ndm_fft_batch = std::min(ndm_batch_max, ndm_fft_batch);
// The number of buffers for DM results is configured below based on the amount of available GPU memory. // The number of buffers for DM results is configured below based on the amount of available GPU memory.
unsigned int ndm_buffers = 1; unsigned int ndm_buffers = 2;
// Maximum number of channels processed in one gulp // Maximum number of channels processed in one gulp
// Parameters might be tuned for efficiency depending on system architecture // Parameters might be tuned for efficiency depending on system architecture
unsigned int nchan_batch_max = std::min(nchan / 4, (unsigned int) 64); unsigned int nchan_batch_max = std::min(nchan / 8, (unsigned int) 64);
unsigned int nchan_fft_batch = 64; unsigned int nchan_fft_batch = 64;
unsigned int nchan_buffers = 2; unsigned int nchan_buffers = 2;
...@@ -214,16 +215,15 @@ void FDDGPUPlan::execute_gpu( ...@@ -214,16 +215,15 @@ void FDDGPUPlan::execute_gpu(
mPrepSpinf.end(); mPrepSpinf.end();
// Determine the amount of memory to use // Determine the amount of memory to use
size_t d_memory_total = m_device->get_total_memory(); size_t d_memory_total = m_device->get_total_memory(); // in Bytes
size_t d_memory_free = m_device->get_free_memory(); size_t d_memory_free = m_device->get_free_memory(); // in Bytes
size_t sizeof_data_t_nu = 1ULL * nsamp * nchan_words_gulp * sizeof(dedisp_word); size_t sizeof_data_t_nu = 1ULL * nsamp * nchan_words_gulp * sizeof(dedisp_word);
size_t sizeof_data_x_nu = 1ULL * nchan_batch_max * nsamp_padded * sizeof(float); size_t sizeof_data_x_nu = 1ULL * nchan_batch_max * nsamp_padded * sizeof(float);
size_t sizeof_data_x_dm = 1ULL * ndm_batch_max * nsamp_padded * sizeof(float); size_t sizeof_data_x_dm = 1ULL * ndm_batch_max * nsamp_padded * sizeof(float);
// For device side, initial value
size_t d_memory_required = sizeof_data_t_nu * nchan_buffers + size_t d_memory_required = sizeof_data_t_nu * nchan_buffers +
sizeof_data_x_nu * 1 + sizeof_data_x_nu * 1 +
sizeof_data_x_dm * ndm_buffers; sizeof_data_x_dm * ndm_buffers;
size_t d_memory_reserved = 0.05 * d_memory_total; size_t d_memory_reserved = 0.05 * d_memory_total; // 5% margin
// Subtract the memory usage of any pre-existing device buffers // Subtract the memory usage of any pre-existing device buffers
size_t d_memory_in_use = 0; size_t d_memory_in_use = 0;
...@@ -237,15 +237,45 @@ void FDDGPUPlan::execute_gpu( ...@@ -237,15 +237,45 @@ void FDDGPUPlan::execute_gpu(
} }
d_memory_free += d_memory_in_use; d_memory_free += d_memory_in_use;
// Iteratively search for a maximum amount of ndm_buffers, with safety margin // For host side
// Make sure that it fits on device memory size_t h_memory_total = get_total_memory() / std::pow(1024, 1); // in GBytes
size_t h_memory_free = get_free_memory() / std::pow(1024, 1); // in GBytes
size_t h_memory_required = sizeof_data_t_nu * nchan_buffers +
sizeof_data_x_dm * ndm_buffers; // in Bytes
size_t h_memory_reserved = 0.05 * h_memory_free * 0.05; // 5% margin
if ((((double) h_memory_required / std::pow(1024, 3)) + h_memory_reserved) > h_memory_free)
{
/* Note: does not take uninitialized application memory in to account!
* E.g. a malloc for the paged output buffer on the application side does not register the buffer as system memory in use
 * Over-using host memory for the application + plan is the responsibility of the application,
* here we can only check for the memory used by the plan itself.*/
std::cout << "Host memory total = " << h_memory_total << " Gb" << std::endl;
std::cout << "Host memory free = " << h_memory_free << " Gb" << std::endl;
std::cout << "Host memory required = " << h_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
throw std::runtime_error("FDDGPUPlan runtime error: required host memory is too large");
}
// Iteratively search for a setting of ndm_batch_max and ndm_buffers
// to match the available device memory.
while (ndm_batch_max > 0 &&
(d_memory_required + d_memory_reserved) > d_memory_free)
{
ndm_batch_max /= 2;
d_memory_required -= sizeof_data_x_dm;
sizeof_data_x_dm /= 2;
d_memory_required += sizeof_data_x_dm;
}
while ((ndm_buffers * ndm_batch_max) < ndm && while ((ndm_buffers * ndm_batch_max) < ndm &&
(d_memory_required + d_memory_reserved + sizeof_data_x_dm) < d_memory_free) (d_memory_required + d_memory_reserved) < d_memory_free)
{ {
ndm_buffers++; ndm_buffers++;
d_memory_required = sizeof_data_t_nu * nchan_buffers + d_memory_required = sizeof_data_t_nu * nchan_buffers +
sizeof_data_x_nu * 1 + sizeof_data_x_nu * 1 +
sizeof_data_x_dm * (ndm_buffers); sizeof_data_x_dm * ndm_buffers;
h_memory_required = sizeof_data_t_nu * nchan_buffers +
sizeof_data_x_dm * ndm_buffers;
}; };
// Debug // Debug
...@@ -255,9 +285,10 @@ void FDDGPUPlan::execute_gpu( ...@@ -255,9 +285,10 @@ void FDDGPUPlan::execute_gpu(
std::cout << "nchan_buffers = " << nchan_buffers << " x " << nchan_batch_max << " channels" << std::endl; std::cout << "nchan_buffers = " << nchan_buffers << " x " << nchan_batch_max << " channels" << std::endl;
std::cout << "Device memory total = " << d_memory_total / std::pow(1024, 3) << " Gb" << std::endl; std::cout << "Device memory total = " << d_memory_total / std::pow(1024, 3) << " Gb" << std::endl;
std::cout << "Device memory free = " << d_memory_free / std::pow(1024, 3) << " Gb" << std::endl; std::cout << "Device memory free = " << d_memory_free / std::pow(1024, 3) << " Gb" << std::endl;
std::cout << "Device Memory required = " << d_memory_required / std::pow(1024, 3) << " Gb" << std::endl; std::cout << "Device memory required = " << d_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
std::cout << "Host memory total = " << get_total_memory() / std::pow(1024, 1) << " Gb" << std::endl; std::cout << "Host memory total = " << h_memory_total << " Gb" << std::endl;
std::cout << "Host memory free = " << get_free_memory() / std::pow(1024, 1) << " Gb" << std::endl; std::cout << "Host memory free = " << h_memory_free << " Gb" << std::endl;
std::cout << "Host Memory required = " << h_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
#endif #endif
// Allocate memory // Allocate memory
...@@ -302,8 +333,8 @@ void FDDGPUPlan::execute_gpu( ...@@ -302,8 +333,8 @@ void FDDGPUPlan::execute_gpu(
mAllocMem.end(); mAllocMem.end();
#ifdef DEDISP_DEBUG #ifdef DEDISP_DEBUG
size_t d_memory_free_after_malloc = m_device->get_free_memory(); //bytes size_t d_memory_free_after_malloc = m_device->get_free_memory(); // in Bytes
size_t h_memory_free_after_malloc = get_free_memory(); //MB size_t h_memory_free_after_malloc = get_free_memory(); // in Mbytes
std::cout << "Device memory free after memory allocations = " << d_memory_free_after_malloc / std::pow(1024, 3) << " Gb" << std::endl; std::cout << "Device memory free after memory allocations = " << d_memory_free_after_malloc / std::pow(1024, 3) << " Gb" << std::endl;
std::cout << "Host memory free after memory allocations = " << h_memory_free_after_malloc / std::pow(1024, 1) << " Gb" << std::endl; std::cout << "Host memory free after memory allocations = " << h_memory_free_after_malloc / std::pow(1024, 1) << " Gb" << std::endl;
#endif #endif
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment