diff --git a/bin/benchmark/bench.hpp b/bin/benchmark/bench.hpp index 3b882f2809459c056604d868dbbf6adf0ab6e48f..db5af6942c5ad4a092a55a1dac91bb88866997f9 100644 --- a/bin/benchmark/bench.hpp +++ b/bin/benchmark/bench.hpp @@ -346,17 +346,17 @@ int run(BenchParameters & benchParameter) } tinit->Pause(); + if(benchParameter.verbose) printf("Create plan and init GPU\n"); + // Create a dedispersion plan + tplan->Start(); + PlanType plan(nchans, dt, f0, df, device_idx); + tplan->Pause(); + for(int iPlan = 0; iPlan < benchParameter.niterations; iPlan++) { - tplan->Start(); - printf("\n"); printf("------------------------- ITERATION %d out of %d -------------------------\n", iPlan+1, benchParameter.niterations); - if(benchParameter.verbose) printf("Create plan and init GPU\n"); - // Create a dedispersion plan - PlanType plan(nchans, dt, f0, df, device_idx); - if(benchParameter.verbose) printf("Gen DM list\n"); // Generate a list of dispersion measures for the plan if (dm_count==0) @@ -391,7 +391,6 @@ int run(BenchParameters & benchParameter) printf("\n"); } - tplan->Pause(); texecute->Start(); printf("\n"); diff --git a/src/fdd/FDDGPUPlan.cpp b/src/fdd/FDDGPUPlan.cpp index 8780b0c1e1416e58dea7a10d0c92d4b49dd5f82e..33f3bbb52037fe4bf5bb87808fe72dc9fbc09f0b 100644 --- a/src/fdd/FDDGPUPlan.cpp +++ b/src/fdd/FDDGPUPlan.cpp @@ -111,15 +111,16 @@ void FDDGPUPlan::execute_gpu( // Maximum number of DMs computed in one gulp // Parameters might be tuned for efficiency depending on system architecture - unsigned int ndm_batch_max = std::min(ndm / 4, (unsigned int) 64); - unsigned int ndm_fft_batch = 32; + unsigned int ndm_batch_max = round_up((ndm / 8), 8); + ndm_batch_max = std::max(ndm_batch_max, (unsigned int) 64); // ndm_batch_max >= NDM_BATCH_GRID + unsigned int ndm_fft_batch = 16; ndm_fft_batch = std::min(ndm_batch_max, ndm_fft_batch); // The number of buffers for DM results is configured below based on the amount of available GPU memory. 
- unsigned int ndm_buffers = 1; + unsigned int ndm_buffers = 2; // Maximum number of channels processed in one gulp // Parameters might be tuned for efficiency depending on system architecture - unsigned int nchan_batch_max = std::min(nchan / 4, (unsigned int) 64); + unsigned int nchan_batch_max = std::min(nchan / 8, (unsigned int) 64); unsigned int nchan_fft_batch = 64; unsigned int nchan_buffers = 2; @@ -214,16 +215,15 @@ void FDDGPUPlan::execute_gpu( mPrepSpinf.end(); // Determine the amount of memory to use - size_t d_memory_total = m_device->get_total_memory(); - size_t d_memory_free = m_device->get_free_memory(); + size_t d_memory_total = m_device->get_total_memory(); // in Bytes + size_t d_memory_free = m_device->get_free_memory(); // in Bytes size_t sizeof_data_t_nu = 1ULL * nsamp * nchan_words_gulp * sizeof(dedisp_word); size_t sizeof_data_x_nu = 1ULL * nchan_batch_max * nsamp_padded * sizeof(float); size_t sizeof_data_x_dm = 1ULL * ndm_batch_max * nsamp_padded * sizeof(float); - // For device side, initial value size_t d_memory_required = sizeof_data_t_nu * nchan_buffers + - sizeof_data_x_nu * 1 + - sizeof_data_x_dm * ndm_buffers; - size_t d_memory_reserved = 0.05 * d_memory_total; + sizeof_data_x_nu * 1 + + sizeof_data_x_dm * ndm_buffers; + size_t d_memory_reserved = 0.05 * d_memory_total; // 5% margin // Subtract the memory usage of any pre-existing device buffers size_t d_memory_in_use = 0; @@ -237,15 +237,45 @@ void FDDGPUPlan::execute_gpu( } d_memory_free += d_memory_in_use; - // Iteratively search for a maximum amount of ndm_buffers, with safety margin - // Make sure that it fits on device memory + // For host side + size_t h_memory_total = get_total_memory() / std::pow(1024, 1); // in GBytes + size_t h_memory_free = get_free_memory() / std::pow(1024, 1); // in GBytes + size_t h_memory_required = sizeof_data_t_nu * nchan_buffers + + sizeof_data_x_dm * ndm_buffers; // in Bytes + size_t h_memory_reserved = 0.05 * h_memory_free * 0.05; // 5% margin + + 
if ((((double) h_memory_required / std::pow(1024, 3)) + h_memory_reserved) > h_memory_free) + { + /* Note: does not take uninitialized application memory into account! + * E.g. a malloc for the paged output buffer on the application side does not register the buffer as system memory in use + * Over-using host memory for the application + plan is the responsibility of the application, + * here we can only check for the memory used by the plan itself.*/ + std::cout << "Host memory total = " << h_memory_total << " Gb" << std::endl; + std::cout << "Host memory free = " << h_memory_free << " Gb" << std::endl; + std::cout << "Host memory required = " << h_memory_required / std::pow(1024, 3) << " Gb" << std::endl; + throw std::runtime_error("FDDGPUPlan runtime error: required host memory is too large"); + } + + // Iteratively search for a setting of ndm_batch_max and ndm_buffers + // to match the available device memory. + while (ndm_batch_max > 0 && + (d_memory_required + d_memory_reserved) > d_memory_free) + { + ndm_batch_max /= 2; + d_memory_required -= sizeof_data_x_dm; + sizeof_data_x_dm /= 2; + d_memory_required += sizeof_data_x_dm; + } + while ((ndm_buffers * ndm_batch_max) < ndm && - (d_memory_required + d_memory_reserved + sizeof_data_x_dm) < d_memory_free) + (d_memory_required + d_memory_reserved) < d_memory_free) { ndm_buffers++; d_memory_required = sizeof_data_t_nu * nchan_buffers + - sizeof_data_x_nu * 1 + - sizeof_data_x_dm * (ndm_buffers); + sizeof_data_x_nu * 1 + + sizeof_data_x_dm * ndm_buffers; + h_memory_required = sizeof_data_t_nu * nchan_buffers + + sizeof_data_x_dm * ndm_buffers; }; // Debug @@ -255,9 +285,10 @@ void FDDGPUPlan::execute_gpu( std::cout << "nchan_buffers = " << nchan_buffers << " x " << nchan_batch_max << " channels" << std::endl; std::cout << "Device memory total = " << d_memory_total / std::pow(1024, 3) << " Gb" << std::endl; std::cout << "Device memory free = " << d_memory_free / std::pow(1024, 3) << " Gb" << std::endl; - std::cout 
<< "Device Memory required = " << d_memory_required / std::pow(1024, 3) << " Gb" << std::endl; - std::cout << "Host memory total = " << get_total_memory() / std::pow(1024, 1) << " Gb" << std::endl; - std::cout << "Host memory free = " << get_free_memory() / std::pow(1024, 1) << " Gb" << std::endl; + std::cout << "Device memory required = " << d_memory_required / std::pow(1024, 3) << " Gb" << std::endl; + std::cout << "Host memory total = " << h_memory_total << " Gb" << std::endl; + std::cout << "Host memory free = " << h_memory_free << " Gb" << std::endl; + std::cout << "Host Memory required = " << h_memory_required / std::pow(1024, 3) << " Gb" << std::endl; #endif // Allocate memory @@ -302,10 +333,10 @@ void FDDGPUPlan::execute_gpu( mAllocMem.end(); #ifdef DEDISP_DEBUG - size_t d_memory_free_after_malloc = m_device->get_free_memory(); //bytes - size_t h_memory_free_after_malloc = get_free_memory(); //MB - std::cout << "Device memory free after memory allocations = " << d_memory_free_after_malloc / std::pow(1024, 3) << " Gb" << std::endl; - std::cout << "Host memory free after memory allocations = " << h_memory_free_after_malloc / std::pow(1024, 1) << " Gb" << std::endl; + size_t d_memory_free_after_malloc = m_device->get_free_memory(); // in Bytes + size_t h_memory_free_after_malloc = get_free_memory(); // in Mbytes + std::cout << "Device memory free after memory allocations = " << d_memory_free_after_malloc / std::pow(1024, 3) << " Gb" << std::endl; + std::cout << "Host memory free after memory allocations = " << h_memory_free_after_malloc / std::pow(1024, 1) << " Gb" << std::endl; #endif // Initialize FDDKernel @@ -402,11 +433,9 @@ void FDDGPUPlan::execute_gpu( std::cout << "Copy output " << dm_job.idm_start << " to " << dm_job.idm_end << " with " << dm_job.ndm_current << " ndms" << std::endl; } #endif - // copy part from pinned h_data_t_dm to part of paged return buffer out - // GPU Host mem pointers + // Copy partial output from pinned memory to output buffer 
dedisp_size src_stride = 1ULL * nsamp_padded * out_bytes_per_sample; auto* h_src = dm_job.h_data_t_dm->data(); - // CPU mem pointers dedisp_size dst_stride = 1ULL * nsamp_computed * out_bytes_per_sample; dedisp_size dst_offset = 1ULL * dm_job.idm_start * dst_stride; auto* h_dst = (void *) (((size_t) out) + dst_offset); diff --git a/src/fdd/dedisperse/FDDKernel.cu b/src/fdd/dedisperse/FDDKernel.cu index 4c5a642c25df498c856c8e6c29f80d152c0d398c..5f4e5792145374cbeab34efc784557e540613331 100644 --- a/src/fdd/dedisperse/FDDKernel.cu +++ b/src/fdd/dedisperse/FDDKernel.cu @@ -62,7 +62,7 @@ void FDDKernel::launch( * depending on the system configurations. */ #define CALL_KERNEL(NCHAN) \ - dedisperse_kernel<NCHAN, false> \ + dedisperse_kernel<NCHAN, false> \ <<<grid, block, 0, stream>>>( \ nfreq, \ dt, \