diff --git a/bin/benchmark/bench.hpp b/bin/benchmark/bench.hpp
index 3b882f2809459c056604d868dbbf6adf0ab6e48f..db5af6942c5ad4a092a55a1dac91bb88866997f9 100644
--- a/bin/benchmark/bench.hpp
+++ b/bin/benchmark/bench.hpp
@@ -346,17 +346,17 @@ int run(BenchParameters & benchParameter)
   }
   tinit->Pause();
 
+  if(benchParameter.verbose) printf("Create plan and init GPU\n");
+  // Create a dedispersion plan
+  tplan->Start();
+  PlanType plan(nchans, dt, f0, df, device_idx);
+  tplan->Pause();
+
   for(int iPlan = 0; iPlan < benchParameter.niterations; iPlan++)
   {
-    tplan->Start();
-
     printf("\n");
     printf("------------------------- ITERATION %d out of %d  -------------------------\n", iPlan+1, benchParameter.niterations);
 
-    if(benchParameter.verbose) printf("Create plan and init GPU\n");
-    // Create a dedispersion plan
-    PlanType plan(nchans, dt, f0, df, device_idx);
-
     if(benchParameter.verbose) printf("Gen DM list\n");
     // Generate a list of dispersion measures for the plan
     if (dm_count==0)
@@ -391,7 +391,6 @@ int run(BenchParameters & benchParameter)
       printf("\n");
     }
 
-    tplan->Pause();
     texecute->Start();
 
     printf("\n");
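For context, hoisting the `PlanType` construction out of the iteration loop means `tplan` now times a single plan construction (including GPU initialization) while `texecute` still accumulates per-iteration cost. A minimal sketch of this construct-once, execute-many structure, with stand-in types (only the `Start()`/`Pause()` timer names are taken from the benchmark code; everything else is illustrative):

```cpp
#include <cstdio>

struct Timer { void Start() {} void Pause() {} };        // stand-in timer
struct Plan  { Plan(int /*nchans*/) { /* one-time GPU init */ } };

int main()
{
    Timer tplan, texecute;
    const int niterations = 3;

    tplan.Start();
    Plan plan(1024);            // constructed once, reused below
    tplan.Pause();

    for (int i = 0; i < niterations; i++)
    {
        texecute.Start();
        // per-iteration work: DM list generation + plan execution
        texecute.Pause();
    }
    std::printf("done after %d iterations\n", niterations);
    return 0;
}
```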
diff --git a/src/fdd/FDDGPUPlan.cpp b/src/fdd/FDDGPUPlan.cpp
index 8780b0c1e1416e58dea7a10d0c92d4b49dd5f82e..33f3bbb52037fe4bf5bb87808fe72dc9fbc09f0b 100644
--- a/src/fdd/FDDGPUPlan.cpp
+++ b/src/fdd/FDDGPUPlan.cpp
@@ -111,15 +111,16 @@ void FDDGPUPlan::execute_gpu(
 
     // Maximum number of DMs computed in one gulp
     // Parameters might be tuned for efficiency depending on system architecture
-    unsigned int ndm_batch_max = std::min(ndm / 4, (unsigned int) 64);
-    unsigned int ndm_fft_batch = 32;
+    unsigned int ndm_batch_max = round_up((ndm / 8), 8);
+                 ndm_batch_max = std::max(ndm_batch_max, (unsigned int) 64); // ndm_batch_max >= NDM_BATCH_GRID
+    unsigned int ndm_fft_batch = 16;
                  ndm_fft_batch = std::min(ndm_batch_max, ndm_fft_batch);
     // The number of buffers for DM results is configured below based on the amount of available GPU memory.
-    unsigned int ndm_buffers   = 1;
+    unsigned int ndm_buffers   = 2;
 
     // Maximum number of channels processed in one gulp
     // Parameters might be tuned for efficiency depending on system architecture
-    unsigned int nchan_batch_max = std::min(nchan / 4, (unsigned int) 64);
+    unsigned int nchan_batch_max = std::min(nchan / 8, (unsigned int) 64);
     unsigned int nchan_fft_batch = 64;
     unsigned int nchan_buffers   = 2;
 
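For anyone tuning these batch parameters: `ndm_batch_max` now rounds `ndm / 8` up to a multiple of 8 and clamps the result to at least 64, and `ndm_fft_batch` is capped by it. Below is a standalone sketch of that arithmetic; the `round_up` helper is a stand-in with the usual round-to-multiple semantics, not the library's own definition:

```cpp
#include <algorithm>
#include <cstdio>
#include <initializer_list>

// Stand-in for the library's rounding helper (assumption)
static unsigned int round_up(unsigned int n, unsigned int multiple)
{
    return ((n + multiple - 1) / multiple) * multiple;
}

int main()
{
    for (unsigned int ndm : {64u, 100u, 512u, 2048u})
    {
        unsigned int ndm_batch_max = round_up(ndm / 8, 8);
        ndm_batch_max = std::max(ndm_batch_max, 64u); // enforce the grid minimum
        unsigned int ndm_fft_batch = std::min(ndm_batch_max, 16u);
        std::printf("ndm=%4u -> ndm_batch_max=%3u, ndm_fft_batch=%2u\n",
                    ndm, ndm_batch_max, ndm_fft_batch);
    }
    return 0;
}
```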
@@ -214,16 +215,15 @@ void FDDGPUPlan::execute_gpu(
     mPrepSpinf.end();
 
     // Determine the amount of memory to use
-    size_t d_memory_total = m_device->get_total_memory();
-    size_t d_memory_free = m_device->get_free_memory();
+    size_t d_memory_total = m_device->get_total_memory(); // in Bytes
+    size_t d_memory_free = m_device->get_free_memory(); // in Bytes
     size_t sizeof_data_t_nu = 1ULL * nsamp * nchan_words_gulp * sizeof(dedisp_word);
     size_t sizeof_data_x_nu = 1ULL * nchan_batch_max * nsamp_padded * sizeof(float);
     size_t sizeof_data_x_dm = 1ULL * ndm_batch_max * nsamp_padded * sizeof(float);
-    // For device side, initial value
     size_t d_memory_required  = sizeof_data_t_nu * nchan_buffers +
-                              sizeof_data_x_nu * 1 +
-                              sizeof_data_x_dm * ndm_buffers;
-    size_t d_memory_reserved  = 0.05 * d_memory_total;
+                                sizeof_data_x_nu * 1 +
+                                sizeof_data_x_dm * ndm_buffers;
+    size_t d_memory_reserved  = 0.05 * d_memory_total; // 5% margin
 
     // Subtract the memory usage of any pre-existing device buffers
     size_t d_memory_in_use    = 0;
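To make the sizing formulas concrete, here is a standalone calculation with hypothetical dimensions (all numbers invented for illustration; `dedisp_word` is assumed to be 32-bit):

```cpp
#include <cstddef>
#include <cstdio>

int main()
{
    // Hypothetical problem dimensions, for illustration only
    const size_t nsamp            = 1ULL << 20;          // input samples
    const size_t nchan_words_gulp = 256;                 // packed channel words
    const size_t nchan_batch_max  = 64;
    const size_t nsamp_padded     = (1ULL << 20) + 4096; // padded sample count
    const size_t ndm_batch_max    = 64;
    const size_t nchan_buffers    = 2, ndm_buffers = 2;
    const size_t sizeof_word      = 4; // sizeof(dedisp_word), assumed 32-bit

    const size_t t_nu = nsamp * nchan_words_gulp * sizeof_word;         // input gulp
    const size_t x_nu = nchan_batch_max * nsamp_padded * sizeof(float); // channel data
    const size_t x_dm = ndm_batch_max * nsamp_padded * sizeof(float);   // DM data

    const size_t required = t_nu * nchan_buffers + x_nu + x_dm * ndm_buffers;
    std::printf("device memory required = %.2f GB\n",
                required / (1024.0 * 1024.0 * 1024.0));
    return 0;
}
```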
@@ -237,15 +237,45 @@ void FDDGPUPlan::execute_gpu(
     }
     d_memory_free += d_memory_in_use;
 
-    // Iteratively search for a maximum amount of ndm_buffers, with safety margin
-    // Make sure that it fits on device memory
+    // Determine the amount of host memory to use
+    size_t h_memory_total    = get_total_memory() / std::pow(1024, 1); // in GBytes
+    size_t h_memory_free     = get_free_memory() / std::pow(1024, 1); // in GBytes
+    size_t h_memory_required = sizeof_data_t_nu * nchan_buffers +
+                               sizeof_data_x_dm * ndm_buffers; // in Bytes
+    size_t h_memory_reserved = 0.05 * h_memory_free; // 5% margin, in GBytes
+
+    if ((((double) h_memory_required / std::pow(1024, 3)) + h_memory_reserved) > h_memory_free) // all terms in GBytes
+    {
+        /* Note: this check does not take uninitialized application memory into account!
+        *  E.g. a malloc for the paged output buffer on the application side does not register the buffer as system memory in use.
+        *  Guarding against over-use of host memory by the application + plan is the responsibility of the application;
+        *  here we can only check the memory used by the plan itself. */
+        std::cout << "Host memory total    = " << h_memory_total << " Gb" << std::endl;
+        std::cout << "Host memory free     = " << h_memory_free << " Gb" << std::endl;
+        std::cout << "Host memory required = " << h_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
+        throw std::runtime_error("FDDGPUPlan runtime error: required host memory is too large");
+    }
+
+    // Iteratively search for a setting of ndm_batch_max and ndm_buffers
+    // that fits in the available device memory: first halve ndm_batch_max.
+    while (ndm_batch_max > 0 &&
+          (d_memory_required + d_memory_reserved) > d_memory_free)
+    {
+        ndm_batch_max /= 2;
+        d_memory_required -= sizeof_data_x_dm * ndm_buffers;
+        sizeof_data_x_dm /= 2;
+        d_memory_required += sizeof_data_x_dm * ndm_buffers;
+    }
+
     while ((ndm_buffers * ndm_batch_max) < ndm &&
-           (d_memory_required + d_memory_reserved + sizeof_data_x_dm) < d_memory_free)
+           (d_memory_required + d_memory_reserved) < d_memory_free)
     {
         ndm_buffers++;
         d_memory_required = sizeof_data_t_nu * nchan_buffers +
-                          sizeof_data_x_nu * 1 +
-                          sizeof_data_x_dm * (ndm_buffers);
+                            sizeof_data_x_nu * 1 +
+                            sizeof_data_x_dm * ndm_buffers;
+        h_memory_required = sizeof_data_t_nu * nchan_buffers +
+                            sizeof_data_x_dm * ndm_buffers;
     };
 
     // Debug
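The two loops above perform a two-phase fit: first halve `ndm_batch_max` until the initial configuration fits (all `ndm_buffers` DM buffers shrink together, hence the `* ndm_buffers` factor in the accounting), then grow `ndm_buffers` until every DM is covered or memory runs out. A standalone sketch with hypothetical sizes:

```cpp
#include <cstddef>
#include <cstdio>

int main()
{
    // Hypothetical sizes, for illustration only
    const size_t d_free = 8ULL << 30, d_reserved = (8ULL << 30) / 20; // 5% margin
    const size_t t_nu = 2ULL << 30, x_nu = 256ULL << 20;
    size_t x_dm = 512ULL << 20;                // one DM buffer
    unsigned int ndm = 512, ndm_batch_max = 128;
    unsigned int ndm_buffers = 2, nchan_buffers = 2;

    size_t required = t_nu * nchan_buffers + x_nu + x_dm * ndm_buffers;

    // Phase 1: shrink the DM batch until the initial configuration fits
    while (ndm_batch_max > 0 && required + d_reserved > d_free)
    {
        ndm_batch_max /= 2;
        required -= x_dm * ndm_buffers;        // all DM buffers shrink together
        x_dm /= 2;
        required += x_dm * ndm_buffers;
    }

    // Phase 2: add DM buffers while DMs remain uncovered and memory allows
    while (ndm_buffers * ndm_batch_max < ndm && required + d_reserved < d_free)
    {
        ndm_buffers++;
        required += x_dm;
    }

    std::printf("ndm_batch_max=%u ndm_buffers=%u required=%zu bytes\n",
                ndm_batch_max, ndm_buffers, required);
    return 0;
}
```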
@@ -255,9 +285,10 @@ void FDDGPUPlan::execute_gpu(
     std::cout << "nchan_buffers   = " << nchan_buffers << " x " << nchan_batch_max << " channels" << std::endl;
     std::cout << "Device memory total    = " << d_memory_total / std::pow(1024, 3) << " Gb" << std::endl;
     std::cout << "Device memory free     = " << d_memory_free  / std::pow(1024, 3) << " Gb" << std::endl;
-    std::cout << "Device Memory required = " << d_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
-    std::cout << "Host memory total    = " << get_total_memory() / std::pow(1024, 1) << " Gb" << std::endl;
-    std::cout << "Host memory free     = " << get_free_memory()  / std::pow(1024, 1) << " Gb" << std::endl;
+    std::cout << "Device memory required = " << d_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
+    std::cout << "Host memory total    = " << h_memory_total << " Gb" << std::endl;
+    std::cout << "Host memory free     = " << h_memory_free  << " Gb" << std::endl;
+    std::cout << "Host Memory required = " << h_memory_required / std::pow(1024, 3) << " Gb" << std::endl;
 #endif
 
     // Allocate memory
@@ -302,10 +333,10 @@ void FDDGPUPlan::execute_gpu(
     mAllocMem.end();
 
 #ifdef DEDISP_DEBUG
-    size_t d_memory_free_after_malloc = m_device->get_free_memory(); //bytes
-    size_t h_memory_free_after_malloc = get_free_memory(); //MB
-    std::cout << "Device memory free after memory allocations    = " << d_memory_free_after_malloc  / std::pow(1024, 3) << " Gb" << std::endl;
-    std::cout << "Host memory free after memory allocations    = " << h_memory_free_after_malloc  / std::pow(1024, 1) << " Gb" << std::endl;
+    size_t d_memory_free_after_malloc = m_device->get_free_memory(); // in Bytes
+    size_t h_memory_free_after_malloc = get_free_memory(); // in MBytes
+    std::cout << "Device memory free after memory allocations = " << d_memory_free_after_malloc  / std::pow(1024, 3) << " Gb" << std::endl;
+    std::cout << "Host memory free after memory allocations   = " << h_memory_free_after_malloc  / std::pow(1024, 1) << " Gb" << std::endl;
 #endif
 
     // Initialize FDDKernel
@@ -402,11 +433,9 @@ void FDDGPUPlan::execute_gpu(
                 std::cout << "Copy output " << dm_job.idm_start << " to " << dm_job.idm_end << " with " << dm_job.ndm_current << " ndms" << std::endl;
             }
 #endif
-            // copy part from pinned h_data_t_dm to part of paged return buffer out
-            // GPU Host mem pointers
+            // Copy partial output from pinned memory to the paged output buffer
             dedisp_size src_stride = 1ULL * nsamp_padded * out_bytes_per_sample;
             auto* h_src = dm_job.h_data_t_dm->data();
-            // CPU mem pointers
             dedisp_size dst_stride = 1ULL * nsamp_computed * out_bytes_per_sample;
             dedisp_size dst_offset = 1ULL * dm_job.idm_start * dst_stride;
             auto* h_dst = (void *) (((size_t) out) + dst_offset);
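The stride arithmetic above repacks rows of `nsamp_padded` samples from the pinned staging buffer into tightly packed rows of `nsamp_computed` samples in the caller's buffer. A standalone sketch of that row-by-row repacking (all names and sizes hypothetical):

```cpp
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <vector>

int main()
{
    const size_t ndm = 4, nsamp_padded = 16, nsamp_computed = 12;
    const size_t bytes_per_sample = sizeof(float);

    std::vector<float> pinned(ndm * nsamp_padded, 1.0f); // stand-in for h_data_t_dm
    std::vector<float> out(ndm * nsamp_computed);        // stand-in for the output

    const size_t src_stride = nsamp_padded   * bytes_per_sample;
    const size_t dst_stride = nsamp_computed * bytes_per_sample;

    // Copy each DM row, dropping the padding at the end of the source row
    for (size_t idm = 0; idm < ndm; idm++)
    {
        const char* src = (const char*) pinned.data() + idm * src_stride;
        char*       dst = (char*) out.data() + idm * dst_stride;
        std::memcpy(dst, src, dst_stride);
    }
    std::printf("repacked %zu rows\n", ndm);
    return 0;
}
```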
diff --git a/src/fdd/dedisperse/FDDKernel.cu b/src/fdd/dedisperse/FDDKernel.cu
index 4c5a642c25df498c856c8e6c29f80d152c0d398c..5f4e5792145374cbeab34efc784557e540613331 100644
--- a/src/fdd/dedisperse/FDDKernel.cu
+++ b/src/fdd/dedisperse/FDDKernel.cu
@@ -62,7 +62,7 @@ void FDDKernel::launch(
     *  depending on the system configurations.
     */
     #define CALL_KERNEL(NCHAN)        \
-    dedisperse_kernel<NCHAN, false>    \
+    dedisperse_kernel<NCHAN, false>   \
     <<<grid, block, 0, stream>>>(     \
         nfreq,                        \
         dt,                           \
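For context on `CALL_KERNEL`: the channel count is a template parameter, so the kernel's per-channel loop has a compile-time trip count and can be unrolled; a runtime switch selects the matching instantiation. A hypothetical host-side sketch of the same dispatch pattern (names invented; the real dispatch lives in FDDKernel.cu):

```cpp
#include <cstdio>

// NCHAN as a template parameter lets the compiler unroll the channel loop
template<unsigned int NCHAN>
void process_batch(const float* in, float* out)
{
    for (unsigned int c = 0; c < NCHAN; c++) // trip count known at compile time
        out[c] = in[c];
}

// Hypothetical runtime-to-compile-time dispatch
void dispatch(unsigned int nchan, const float* in, float* out)
{
    switch (nchan)
    {
        case 16: process_batch<16>(in, out); break;
        case 32: process_batch<32>(in, out); break;
        case 64: process_batch<64>(in, out); break;
        default: std::printf("unsupported nchan %u\n", nchan); break;
    }
}

int main()
{
    float in[64] = {}, out[64];
    dispatch(64, in, out);
    return 0;
}
```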