diff --git a/src/fdd/FDDGPUPlan.cpp b/src/fdd/FDDGPUPlan.cpp
index 86cb095ad8a671aafc0d3a2b82ce9b9525c29d0d..33f3bbb52037fe4bf5bb87808fe72dc9fbc09f0b 100644
--- a/src/fdd/FDDGPUPlan.cpp
+++ b/src/fdd/FDDGPUPlan.cpp
@@ -111,15 +111,16 @@ void FDDGPUPlan::execute_gpu(
 
     // Maximum number of DMs computed in one gulp
     // Parameters might be tuned for efficiency depending on system architecture
-    unsigned int ndm_batch_max = std::min(ndm / 4, (unsigned int) 64);
-    unsigned int ndm_fft_batch = 32;
+    unsigned int ndm_batch_max = round_up((ndm / 8), 8);
+                 ndm_batch_max = std::max(ndm_batch_max, (unsigned int) 64); // ndm_batch_max >= NDM_BATCH_GRID
+    unsigned int ndm_fft_batch = 16;
                  ndm_fft_batch = std::min(ndm_batch_max, ndm_fft_batch);
     // The number of buffers for DM results is configured below based on the amount of available GPU memory.
-    unsigned int ndm_buffers   = 1;
+    unsigned int ndm_buffers   = 2;
 
     // Maximum number of channels processed in one gulp
     // Parameters might be tuned for efficiency depending on system architecture
-    unsigned int nchan_batch_max = std::min(nchan / 4, (unsigned int) 64);
+    unsigned int nchan_batch_max = std::min(nchan / 8, (unsigned int) 64);
     unsigned int nchan_fft_batch = 64;
     unsigned int nchan_buffers   = 2;
 
@@ -214,16 +215,15 @@ void FDDGPUPlan::execute_gpu(
     mPrepSpinf.end();
 
     // Determine the amount of memory to use
-    size_t d_memory_total = m_device->get_total_memory();
-    size_t d_memory_free = m_device->get_free_memory();
+    size_t d_memory_total = m_device->get_total_memory(); // in Bytes
+    size_t d_memory_free = m_device->get_free_memory(); // in Bytes
     size_t sizeof_data_t_nu = 1ULL * nsamp * nchan_words_gulp * sizeof(dedisp_word);
     size_t sizeof_data_x_nu = 1ULL * nchan_batch_max * nsamp_padded * sizeof(float);
     size_t sizeof_data_x_dm = 1ULL * ndm_batch_max * nsamp_padded * sizeof(float);
-    // For device side, initial value
     size_t d_memory_required  = sizeof_data_t_nu * nchan_buffers +
-                              sizeof_data_x_nu * 1 +
-                              sizeof_data_x_dm * ndm_buffers;
-    size_t d_memory_reserved  = 0.05 * d_memory_total;
+                                sizeof_data_x_nu * 1 +
+                                sizeof_data_x_dm * ndm_buffers;
+    size_t d_memory_reserved  = 0.05 * d_memory_total; // 5% margin
 
     // Subtract the memory usage of any pre-existing device buffers
     size_t d_memory_in_use    = 0;
@@ -237,15 +237,45 @@ void FDDGPUPlan::execute_gpu(
     }
     d_memory_free += d_memory_in_use;
 
-    // Iteratively search for a maximum amount of ndm_buffers, with safety margin
-    // Make sure that it fits on device memory
+    // For host side
+    size_t h_memory_total    = get_total_memory() / std::pow(1024, 1); // in GBytes
+    size_t h_memory_free     = get_free_memory() / std::pow(1024, 1); // in GBytes
+    size_t h_memory_required = sizeof_data_t_nu * nchan_buffers +
+                               sizeof_data_x_dm * ndm_buffers; // in Bytes
+    size_t h_memory_reserved = 0.05 * h_memory_free * 0.05; // NOTE(review): 0.05 is applied twice — effective margin is 0.25%, not the stated 5%; likely intended `0.05 * h_memory_free`
+
+    if ((((double) h_memory_required / std::pow(1024, 3)) + h_memory_reserved) > h_memory_free)
+    {
+        /* Note: does not take uninitialized application memory into account!
+        *  E.g. a malloc for the paged output buffer on the application side does not register the buffer as system memory in use.
+        *  Over-using host memory for the application + plan is the responsibility of the application;
+        *  here we can only check for the memory used by the plan itself. */
+        std::cout << "Host memory total    = " << h_memory_total << " Gb" << std::endl;
+        std::cout << "Host memory free     = " << h_memory_free << " Gb" << std::endl;
+        std::cout << "Host memory required = " << h_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
+        throw std::runtime_error("FDDGPUPlan runtime error: required host memory is too large");
+    }
+
+    // Iteratively search for a setting of ndm_batch_max and ndm_buffers
+    // to match the available device memory.
+    while (ndm_batch_max > 0 &&
+          (d_memory_required + d_memory_reserved) > d_memory_free)
+    {
+        ndm_batch_max /= 2;
+        d_memory_required -= sizeof_data_x_dm;
+        sizeof_data_x_dm /= 2;
+        d_memory_required += sizeof_data_x_dm;
+    }
+
     while ((ndm_buffers * ndm_batch_max) < ndm &&
-           (d_memory_required + d_memory_reserved + sizeof_data_x_dm) < d_memory_free)
+           (d_memory_required + d_memory_reserved) < d_memory_free)
     {
         ndm_buffers++;
         d_memory_required = sizeof_data_t_nu * nchan_buffers +
-                          sizeof_data_x_nu * 1 +
-                          sizeof_data_x_dm * (ndm_buffers);
+                            sizeof_data_x_nu * 1 +
+                            sizeof_data_x_dm * ndm_buffers;
+        h_memory_required = sizeof_data_t_nu * nchan_buffers +
+                            sizeof_data_x_dm * ndm_buffers;
     };
 
     // Debug
@@ -255,9 +285,10 @@ void FDDGPUPlan::execute_gpu(
     std::cout << "nchan_buffers   = " << nchan_buffers << " x " << nchan_batch_max << " channels" << std::endl;
     std::cout << "Device memory total    = " << d_memory_total / std::pow(1024, 3) << " Gb" << std::endl;
     std::cout << "Device memory free     = " << d_memory_free  / std::pow(1024, 3) << " Gb" << std::endl;
-    std::cout << "Device Memory required = " << d_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
-    std::cout << "Host memory total    = " << get_total_memory() / std::pow(1024, 1) << " Gb" << std::endl;
-    std::cout << "Host memory free     = " << get_free_memory()  / std::pow(1024, 1) << " Gb" << std::endl;
+    std::cout << "Device memory required = " << d_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
+    std::cout << "Host memory total    = " << h_memory_total << " Gb" << std::endl;
+    std::cout << "Host memory free     = " << h_memory_free  << " Gb" << std::endl;
+    std::cout << "Host memory required = " << h_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
 #endif
 
     // Allocate memory
@@ -302,10 +333,10 @@ void FDDGPUPlan::execute_gpu(
     mAllocMem.end();
 
 #ifdef DEDISP_DEBUG
-    size_t d_memory_free_after_malloc = m_device->get_free_memory(); //bytes
-    size_t h_memory_free_after_malloc = get_free_memory(); //MB
-    std::cout << "Device memory free after memory allocations    = " << d_memory_free_after_malloc  / std::pow(1024, 3) << " Gb" << std::endl;
-    std::cout << "Host memory free after memory allocations    = " << h_memory_free_after_malloc  / std::pow(1024, 1) << " Gb" << std::endl;
+    size_t d_memory_free_after_malloc = m_device->get_free_memory(); // in Bytes
+    size_t h_memory_free_after_malloc = get_free_memory(); // in Mbytes
+    std::cout << "Device memory free after memory allocations = " << d_memory_free_after_malloc  / std::pow(1024, 3) << " Gb" << std::endl;
+    std::cout << "Host memory free after memory allocations   = " << h_memory_free_after_malloc  / std::pow(1024, 1) << " Gb" << std::endl;
 #endif
 
     // Initialize FDDKernel