diff --git a/CMakeLists.txt b/CMakeLists.txt index 51a824a40ce57428d1f4c2dac9d85b149221a914..67b01daf83ae132b0b22b3bfb309658ec37537cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.17) +cmake_minimum_required(VERSION 3.17 FATAL_ERROR) project(dedisp) @@ -9,17 +9,60 @@ set(DEDISP_VERSION_PATCH 1) set(DEDISP_VERSION ${DEDISP_VERSION_MAJOR}.${DEDISP_VERSION_MINOR}.${DEDISP_VERSION_PATCH}) +set(CMAKE_CXX_STANDARD 14) + # dependencies find_package(PkgConfig REQUIRED) -# CUDA -if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 80) +if(NOT DEFINED DEDISP_BACKEND) + set(DEDISP_BACKEND "CUDA") +endif() +set(DEDISP_BACKEND + ${DEDISP_BACKEND} + CACHE STRING "GPU backend API to use") +set_property(CACHE DEDISP_BACKEND PROPERTY STRINGS "CUDA" "HIP") + +if(${DEDISP_BACKEND} STREQUAL "CUDA") + set(DEDISP_BACKEND_CUDA True) +elseif(${DEDISP_BACKEND} STREQUAL "HIP") + set(DEDISP_BACKEND_HIP True) +else() + message(FATAL_ERROR "Invalid value for DEDISP_BACKEND: ${DEDISP_BACKEND}") endif() -find_package(CUDAToolkit REQUIRED) +find_package(Threads REQUIRED) +find_package(OpenMP REQUIRED) -enable_language(CUDA) +set(CUDAWRAPPERS_BACKEND ${DEDISP_BACKEND}) +if(${DEDISP_BACKEND_HIP}) + # Workaround missing omp.h for compilation of sources with language property + # HIP by adding the OpenMP include directory of the host compiler to the + # OpenMP target. + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + get_filename_component(OpenMP_LIB_DIR ${OpenMP_gomp_LIBRARY} DIRECTORY) + message(STATUS "OpenMP library directory: ${OpenMP_LIB_DIR}") + find_path( + OpenMP_INCLUDE_DIR omp.h + PATHS ${OpenMP_LIB_DIR} "${OpenMP_LIB_DIR}/.." + PATH_SUFFIXES + "include" + "lib/gcc/x86_64-pc-linux-gnu/${CMAKE_CXX_COMPILER_VERSION}/include") + target_include_directories(OpenMP::OpenMP_CXX SYSTEM + INTERFACE ${OpenMP_INCLUDE_DIR}) + endif() + + enable_language(HIP) + set(LINK_gpu_runtime hip::host) +elseif(${DEDISP_BACKEND_CUDA}) + enable_language(CUDA) + set(CUDA_MIN_VERSION 10.0) + find_package(CUDAToolkit REQUIRED) + set(LINK_gpu_runtime CUDA::cudart) + + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 80) + endif() +endif() # set Release as default build type if(NOT CMAKE_BUILD_TYPE) @@ -33,23 +76,32 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) option(DEDISP_BUILD_BENCHMARKS "Build benchmarks" ON) if(${DEDISP_BUILD_BENCHMARKS}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEDISP_BENCHMARK") + add_compile_options("-DDEDISP_BENCHMARK") endif() option(DEDISP_ENABLE_DEBUG "Enable extra verbose output" OFF) if(${DEDISP_ENABLE_DEBUG}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEDISP_DEBUG") + add_compile_options("-DDEDISP_DEBUG") endif() -option(DEDISP_BUILD_TESTS "Build tests" OFF) +option(DEDISP_BUILD_TESTING "Build tests" OFF) +option(DEDISP_BENCHMARK_WITH_PMT + "Enable Power Measurement Toolkit support in the benchmark suite" OFF) -if(${DEDISP_BUILD_TESTS}) +if(${DEDISP_BUILD_TESTING}) enable_testing() endif() -# OpenMP configuration -find_package(OpenMP REQUIRED) +# CUDA Wrappers +include(GNUInstallDirs) +include(FetchContent) + +FetchContent_Declare( + cudawrappers + GIT_REPOSITORY https://github.com/nlesc-recruit/cudawrappers + GIT_TAG main) +FetchContent_MakeAvailable(cudawrappers) # library add_subdirectory(src) diff --git a/README.md b/README.md index ecbbf6d911dd94113a0bac0df92abb75df4d0f16..167574266b6321a41848aa7d0f8c0e0a624f1a7e 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Installation Instructions: 3.
Optionally further configure cmake through interactive build settings * `$ccmake .` * e.g. set `DEDISP_BUILD_BENCHMARKS` to `ON` to enable build of performance benchmarks [default: ON] - * e.g. set `DEDISP_BUILD_TESTS` to `ON` to enable build of tests [default: ON] + * e.g. set `DEDISP_BUILD_TESTING` to `ON` to enable build of tests [default: ON] * e.g. set `DEDISP_DEBUG` to `ON` to enable build with more verbose output (for debugging purposes) [default: OFF] 4. make and install * `$ make install` diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt index 864be15fa2357df1c7c21fe7dacfd9b6ced43318..bcb67c32afe32aa2cfb90e8a83a6f3946a7f8e7c 100644 --- a/bin/CMakeLists.txt +++ b/bin/CMakeLists.txt @@ -2,7 +2,7 @@ set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") # add binaries -if(${DEDISP_BUILD_TESTS}) +if(${DEDISP_BUILD_TESTING}) add_subdirectory(test) endif() if(${DEDISP_BUILD_BENCHMARKS}) diff --git a/bin/benchmark/CMakeLists.txt b/bin/benchmark/CMakeLists.txt index 5e4ca31951d80354b967e559c0e52c8331bc1757..c5cc05652d4e86496f91e8951c41069610a60a49 100644 --- a/bin/benchmark/CMakeLists.txt +++ b/bin/benchmark/CMakeLists.txt @@ -1,3 +1,5 @@ -add_subdirectory(dedisp) +if(${DEDISP_BACKEND_CUDA}) + add_subdirectory(dedisp) +endif() add_subdirectory(tdd) add_subdirectory(fdd) diff --git a/bin/benchmark/common/bench.hpp b/bin/benchmark/common/bench.hpp index c021237acebf7357e50bf3827a37350abae34728..451c24ababa0d748c3303c8c3689c7cb655f4a70 100644 --- a/bin/benchmark/common/bench.hpp +++ b/bin/benchmark/common/bench.hpp @@ -425,6 +425,14 @@ template <typename PlanType> int run(BenchParameters &benchParameter) { // Print timings if (benchParameter.verbose) { tbench->Pause(); + + dedisp_float out_mean, out_sigma; + calc_stats_float(output, nsamps_computed * dm_count, &out_mean, + &out_sigma); + + printf("Output RMS : %f\n", out_mean); + printf("Output StdDev : %f\n", out_sigma); + printf("\n"); printf("------------- BENCHMARK TIMES (accumulated for %d iteration(s)) " "-------------\n", diff --git a/bin/benchmark/dedisp/CMakeLists.txt b/bin/benchmark/dedisp/CMakeLists.txt index 0fbb6999ff57c284172cc4738ae9e6c7196d7987..c0435751a618409bcb0b291c76f51697d8663406 100644 --- a/bin/benchmark/dedisp/CMakeLists.txt +++ b/bin/benchmark/dedisp/CMakeLists.txt @@ -1,6 +1,6 @@ add_executable(benchdedisp benchdedisp.cpp) -target_include_directories(benchdedisp PRIVATE ${CMAKE_SOURCE_DIR}/src) +target_include_directories(benchdedisp PRIVATE "${CMAKE_SOURCE_DIR}/src") target_link_libraries(benchdedisp dedisp) diff --git a/bin/benchmark/fdd/CMakeLists.txt b/bin/benchmark/fdd/CMakeLists.txt index 60418d97f82581670efcd622ca23c9449167a3f0..b170820c1d1a9aa2a5e195e6b9aba1feac30b01b 100644 --- a/bin/benchmark/fdd/CMakeLists.txt +++ b/bin/benchmark/fdd/CMakeLists.txt @@ -1,6 +1,6 @@ add_executable(benchfdd benchfdd.cpp) -target_include_directories(benchfdd PRIVATE ${CMAKE_SOURCE_DIR}/src) +target_include_directories(benchfdd PRIVATE "${CMAKE_SOURCE_DIR}/src") target_link_libraries(benchfdd fdd) diff --git a/bin/benchmark/tdd/CMakeLists.txt b/bin/benchmark/tdd/CMakeLists.txt index f6db2e7588cf749271021e8f10d3419d2a51ab27..76b05b84dd8c50549fd1c53c3ab784a2aa68ed00 100644 --- a/bin/benchmark/tdd/CMakeLists.txt +++ b/bin/benchmark/tdd/CMakeLists.txt @@ -1,6 +1,6 @@ add_executable(benchtdd benchtdd.cpp) -target_include_directories(benchtdd PRIVATE ${CMAKE_SOURCE_DIR}/src) +target_include_directories(benchtdd PRIVATE "${CMAKE_SOURCE_DIR}/src") target_link_libraries(benchtdd tdd) diff --git a/bin/test/CMakeLists.txt 
b/bin/test/CMakeLists.txt index 5dd541b18dad16cb9e4e6dc46be6d195c0dbca1e..c4611915d9c2ecfcd77c903242447e5428e6c538 100644 --- a/bin/test/CMakeLists.txt +++ b/bin/test/CMakeLists.txt @@ -1,5 +1,29 @@ -if(${DEDISP_BUILD_TESTS}) - add_subdirectory(dedisp) +if(${DEDISP_BUILD_TESTING}) + if(${DEDISP_BACKEND_CUDA}) + add_subdirectory(dedisp) + endif() add_subdirectory(tdd) add_subdirectory(fdd) + + include(FetchContent) + + FetchContent_Declare( + Catch2 + GIT_REPOSITORY https://github.com/catchorg/Catch2.git + GIT_TAG v3.6.0) + + FetchContent_MakeAvailable(Catch2) + list(APPEND CMAKE_MODULE_PATH "${catch2_SOURCE_DIR}/extras") + include(Catch) + + add_executable(test_unit UnitTests.cpp) + target_include_directories(test_unit PRIVATE "${CMAKE_SOURCE_DIR}/src") + + if(${DEDISP_BACKEND_CUDA}) + target_link_libraries(test_unit dedisp fdd tdd Catch2::Catch2WithMain) + else() + target_link_libraries(test_unit fdd tdd Catch2::Catch2WithMain) + endif() + + catch_discover_tests(test_unit) endif() diff --git a/bin/test/UnitTests.cpp b/bin/test/UnitTests.cpp new file mode 100644 index 0000000000000000000000000000000000000000..42379b720d730583f83958b075d8dcfe6902774c --- /dev/null +++ b/bin/test/UnitTests.cpp @@ -0,0 +1,84 @@ +#include "fdd/FDDGPUPlan.hpp" +#define CATCH_CONFIG_MAIN +#include <catch2/catch_approx.hpp> +#include <catch2/catch_test_macros.hpp> +#include <catch2/matchers/catch_matchers.hpp> +#include <catch2/matchers/catch_matchers_vector.hpp> + +#include "common/test.hpp" + +#include "fdd/FDDGPUPlan.hpp" +#include "tdd/TDDPlan.hpp" + +// Dedisp is not supported on HIP. +#ifndef __HIP__ +#include "dedisp/DedispPlan.hpp" + +TEST_CASE("Dedisp compare expected DM output") { + const std::vector<float> expected_dms = { + 0.392292, 0.395637, 0.401067, 0.397621, 0.390937, 0.446373, + 0.401329, 0.405182, 0.391987, 0.391658, 0.391621}; + + unsigned int seed = 0; + TestInput test_input{&seed}; + TestOutput test_output; + + run<dedisp::DedispPlan>(test_input, + test_output); // uses run method from test.hpp + + REQUIRE( + test_output.out_mean == + Catch::Approx(0.376468509).margin(std::numeric_limits<float>::epsilon())); + REQUIRE( + test_output.out_sigma == + Catch::Approx(0.002306607).margin(std::numeric_limits<float>::epsilon())); + REQUIRE_THAT(test_output.calculated_dms, + Catch::Matchers::Approx(expected_dms) + .margin(std::numeric_limits<float>::epsilon())); +} +#endif + +TEST_CASE("tdd compare expected DM output") { + const std::vector<float> expected_dms = { + 0.392292, 0.395637, 0.401067, 0.397621, 0.390937, 0.446373, + 0.401329, 0.405182, 0.391987, 0.391658, 0.391621}; + + unsigned int seed = 0; + TestInput test_input{&seed}; + TestOutput test_output; + + run<dedisp::TDDPlan>(test_input, + test_output); // uses run method from test.hpp + + REQUIRE( + test_output.out_mean == + Catch::Approx(0.376468509).margin(std::numeric_limits<float>::epsilon())); + REQUIRE( + test_output.out_sigma == + Catch::Approx(0.002306607).margin(std::numeric_limits<float>::epsilon())); + REQUIRE_THAT(test_output.calculated_dms, + Catch::Matchers::Approx(expected_dms) + .margin(std::numeric_limits<float>::epsilon())); +} + +TEST_CASE("fdd compare expected DM output") { + const std::vector<float> expected_dms = { + 4.850260, 6.077153, 8.697535, 7.709451, 4.709877, 7.514279, + 20.041370, 6.095791, 7.270431, 9.196441, 4.596337, 4.517603}; + + unsigned int seed = 0; + TestInput test_input{&seed}; + TestOutput test_output; + + run<dedisp::FDDGPUPlan>(test_input, + test_output); // uses run method from test.hpp + +
REQUIRE(test_output.out_mean == + Catch::Approx(-0.000709622) + .margin(std::numeric_limits<float>::epsilon())); + REQUIRE(test_output.out_sigma == Catch::Approx(0.747440).margin( + std::numeric_limits<float>::epsilon())); + REQUIRE_THAT(test_output.calculated_dms, + Catch::Matchers::Approx(expected_dms) + .margin(std::numeric_limits<float>::epsilon())); +} diff --git a/bin/test/common/test.hpp b/bin/test/common/test.hpp index 237606b4868b86b71fefc0976da178310eb921e7..9656d3dbb59b14d2eaedafcf7fecee7ac0224e68 100644 --- a/bin/test/common/test.hpp +++ b/bin/test/common/test.hpp @@ -11,6 +11,7 @@ #include <time.h> #include <functional> +#include <optional> #include <Plan.hpp> @@ -18,6 +19,16 @@ #define WRITE_INPUT_DATA 0 #define WRITE_OUTPUT_DATA 0 +struct TestInput { + unsigned int *seed = nullptr; +}; + +struct TestOutput { + float out_mean; + float out_sigma; + std::vector<float> calculated_dms; +}; + // Assume input is a 0 mean float and quantize to an unsigned 8-bit quantity dedisp_byte bytequant(dedisp_float f) { dedisp_float v = f + 127.5f; @@ -94,7 +105,8 @@ void calc_stats_float(dedisp_float *a, dedisp_size n, dedisp_float *mean, return; } -template <typename PlanType> int run() { +template <typename PlanType> +int run(const TestInput &test_input, TestOutput &test_output) { int device_idx = 0; dedisp_float sampletime_base = @@ -153,7 +165,7 @@ template <typename PlanType> int run() { /* Random number generator setup */ static std::random_device rd; - static std::mt19937 generator(rd()); + static std::mt19937 generator((test_input.seed) ? *(test_input.seed) : rd()); static std::normal_distribution<float> distribution(0.0, 1.0); /* First build 2-D array of floats with our signal in it */ @@ -263,22 +275,26 @@ template <typename PlanType> int run() { (double)(clock() - startclock) / CLOCKS_PER_SEC); // Look for significant peaks - dedisp_float out_mean, out_sigma; - calc_stats_float(output, nsamps_computed * dm_count, &out_mean, &out_sigma); + calc_stats_float(output, nsamps_computed * dm_count, &test_output.out_mean, + &test_output.out_sigma); - printf("Output RMS : %f\n", out_mean); - printf("Output StdDev : %f\n", out_sigma); + printf("Output RMS : %f\n", + test_output.out_mean); + printf("Output StdDev : %f\n", + test_output.out_sigma); i = 0; for (nd = 0; nd < dm_count; nd++) { for (ns = 0; ns < nsamps_computed; ns++) { dedisp_size idx = nd * nsamps_computed + ns; dedisp_float val = output[idx]; - if (val - out_mean > 6.0 * out_sigma) { + if (val - test_output.out_mean > 6.0 * test_output.out_sigma) { printf( "DM trial %u (%.3f pc/cm^3), Samp %u (%.6f s): %f (%.2f sigma)\n", - nd, dmlist[nd], ns, ns * dt, val, (val - out_mean) / out_sigma); + nd, dmlist[nd], ns, ns * dt, val, + (val - test_output.out_mean) / test_output.out_sigma); i++; + test_output.calculated_dms.push_back(val); if (i > 100) break; } @@ -305,4 +321,4 @@ template <typename PlanType> int run() { free(input); printf("Dedispersion successful.\n"); return 0; -} \ No newline at end of file +} diff --git a/bin/test/dedisp/CMakeLists.txt b/bin/test/dedisp/CMakeLists.txt index 0e5344a31fba95dbbc3bf2d933ace3bd0e4c6acc..d5491c0e08c8c58b1666220cc5e593b2bb6d99e7 100644 --- a/bin/test/dedisp/CMakeLists.txt +++ b/bin/test/dedisp/CMakeLists.txt @@ -1,12 +1,12 @@ # test for dedisp C interface add_executable(ctestdedisp ctestdedisp.c) -target_include_directories(ctestdedisp PRIVATE ${CMAKE_SOURCE_DIR}/src) +target_include_directories(ctestdedisp PRIVATE "${CMAKE_SOURCE_DIR}/src") target_link_libraries(ctestdedisp dedisp m) 
add_test(ctestdedisp ctestdedisp) # test for dedisp C++ interface add_executable(testdedisp testdedisp.cpp) -target_include_directories(testdedisp PRIVATE ${CMAKE_SOURCE_DIR}/src) +target_include_directories(testdedisp PRIVATE "${CMAKE_SOURCE_DIR}/src") target_link_libraries(testdedisp dedisp) add_test(testdedisp testdedisp) diff --git a/bin/test/dedisp/testdedisp.cpp b/bin/test/dedisp/testdedisp.cpp index 6db39772bcc27d5f36326cc464361fc3972b2630..c9848de54885fad8c3bd791d2b1c2ed79f434085 100644 --- a/bin/test/dedisp/testdedisp.cpp +++ b/bin/test/dedisp/testdedisp.cpp @@ -2,8 +2,12 @@ #include "../common/test.hpp" -template <typename PlanType> int run(); +template <typename PlanType> int run(const TestInput &, TestOutput &); int main(int argc, char *argv[]) { - return run<dedisp::DedispPlan>(); // uses run method from test.hpp + TestInput test_input{}; + TestOutput test_output{}; + + return run<dedisp::DedispPlan>(test_input, + test_output); // uses run method from test.hpp } \ No newline at end of file diff --git a/bin/test/fdd/CMakeLists.txt b/bin/test/fdd/CMakeLists.txt index d8d7ba46dc1e5c27a70b1779da8a35dbce72d715..8a94488ff5c150488f8f14d5904ab42922961c8b 100644 --- a/bin/test/fdd/CMakeLists.txt +++ b/bin/test/fdd/CMakeLists.txt @@ -1,12 +1,12 @@ # test for fdd C interface add_executable(ctestfdd ctestfdd.c) -target_include_directories(ctestfdd PRIVATE ${CMAKE_SOURCE_DIR}/src) +target_include_directories(ctestfdd PRIVATE "${CMAKE_SOURCE_DIR}/src") target_link_libraries(ctestfdd fdd m) add_test(ctestfdd ctestfdd) # test for fdd C++ interface add_executable(testfdd testfdd.cpp) -target_include_directories(testfdd PRIVATE ${CMAKE_SOURCE_DIR}/src) +target_include_directories(testfdd PRIVATE "${CMAKE_SOURCE_DIR}/src") target_link_libraries(testfdd fdd) add_test(testfdd testfdd) diff --git a/bin/test/fdd/testfdd.cpp b/bin/test/fdd/testfdd.cpp index 351da4ebb857f500e62966c966963fa8090fee72..31d64c641ae75e717deb3a5d73cbab2f4e590931 100644 --- a/bin/test/fdd/testfdd.cpp +++ b/bin/test/fdd/testfdd.cpp @@ -5,17 +5,20 @@ #include "../common/test.hpp" -template <typename PlanType> int run(); +template <typename PlanType> int run(const TestInput &, TestOutput &); int main(int argc, char *argv[]) { + TestInput test_input{}; + TestOutput test_output{}; + // Set environment variable USE_CPU to switch to CPU implementation of FDD char *use_cpu_str = getenv("USE_CPU"); bool use_cpu = !use_cpu_str ? 
false : atoi(use_cpu_str); if (use_cpu) { std::cout << "Test FDD on CPU" << std::endl; - return run<dedisp::FDDCPUPlan>(); + return run<dedisp::FDDCPUPlan>(test_input, test_output); } else { std::cout << "Test FDD on GPU" << std::endl; - return run<dedisp::FDDGPUPlan>(); + return run<dedisp::FDDGPUPlan>(test_input, test_output); } } \ No newline at end of file diff --git a/bin/test/tdd/CMakeLists.txt b/bin/test/tdd/CMakeLists.txt index 7cbaebd3a3acf24b54e6a3be00275bd337ca6eec..ae1cc4ba1afccd438807af72be15617d67fe6907 100644 --- a/bin/test/tdd/CMakeLists.txt +++ b/bin/test/tdd/CMakeLists.txt @@ -1,12 +1,12 @@ # test for tdd C interface add_executable(ctesttdd ctesttdd.c) -target_include_directories(ctesttdd PRIVATE ${CMAKE_SOURCE_DIR}/src) +target_include_directories(ctesttdd PRIVATE "${CMAKE_SOURCE_DIR}/src") target_link_libraries(ctesttdd tdd m) add_test(ctesttdd ctesttdd) # test for tdd C++ interface add_executable(testtdd testtdd.cpp) -target_include_directories(testtdd PRIVATE ${CMAKE_SOURCE_DIR}/src) +target_include_directories(testtdd PRIVATE "${CMAKE_SOURCE_DIR}/src") target_link_libraries(testtdd tdd) add_test(testtdd testtdd) diff --git a/bin/test/tdd/testtdd.cpp b/bin/test/tdd/testtdd.cpp index 8a84b7b6ee9ab2fa2a4fc188498dcaeb95b5d316..0c68f94948c6e5e7084e10bd5c4066d89065876d 100644 --- a/bin/test/tdd/testtdd.cpp +++ b/bin/test/tdd/testtdd.cpp @@ -2,8 +2,12 @@ #include "../common/test.hpp" -template <typename PlanType> int run(); +template <typename PlanType> int run(const TestInput &, TestOutput &); int main(int argc, char *argv[]) { - return run<dedisp::TDDPlan>(); // uses run method from test.hpp + TestInput test_input{}; + TestOutput test_output{}; + + return run<dedisp::TDDPlan>(test_input, + test_output); // uses run method from test.hpp } \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7acf14db1092a4bdfc0c8e8220b004f89b258bc6..2d04818c3fc5a3aee051ec864d0a0a31ed9a5dba 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,7 +1,6 @@ # helper libraries -# include directory for common header files -include_directories(common) +include_directories(common ${cudawrappers_SOURCE_DIR}/include) # add subdirectories for individual libraries add_subdirectory(common) @@ -14,11 +13,28 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") # plan library -add_library(plan OBJECT Plan.cpp GPUPlan.cpp) -target_link_libraries(plan CUDA::cudart) - -# dedisp library -add_subdirectory(dedisp) +add_library(plan OBJECT Plan.cpp GPUPlan.cpp GPUKernel.cpp) +target_link_libraries(plan PRIVATE ${LINK_gpu_runtime} cudawrappers::cu + cudawrappers::nvrtc cudawrappers::nvtx) + +if(DEDISP_BENCHMARK_WITH_PMT) + if(${DEDISP_BACKEND_HIP}) + set(PMT_BUILD_ROCM ON) + else() + set(PMT_BUILD_NVML ON) + set(PMT_BUILD_NVIDIA ON) + endif() + FetchContent_Declare(pmt GIT_REPOSITORY https://git.astron.nl/RD/pmt) + FetchContent_MakeAvailable(pmt) + add_compile_definitions("HAVE_PMT") + include_directories(common ${pmt_SOURCE_DIR}/include) +endif() + +target_include_directories(common PRIVATE "${PROJECT_SOURCE_DIR}/include") + +if(${DEDISP_BACKEND_CUDA}) + add_subdirectory(dedisp) +endif() # TDD library add_subdirectory(tdd) diff --git a/src/GPUKernel.cpp b/src/GPUKernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4cb6d46febda761d07e54e24a50622b0dcf39238 --- /dev/null +++ b/src/GPUKernel.cpp @@ -0,0 +1,58 @@ +#include "GPUKernel.hpp" +#include <cudawrappers/cu.hpp> + +#include
<iostream> +#include <stdexcept> + +void GPUKernel::assertCompiled(const CompiledKernelInfo &kernel_info) const { + if (!kernel_info.function) { + std::ostringstream ss; + ss << func_name_ << " in file " << filename_ + << " has not yet been compiled"; + throw std::runtime_error(ss.str()); + } +} + +std::pair<const CompiledKernelInfo &, bool> +GPUKernel::compile(const std::vector<std::string> &compile_options) { + // Concat the compile options into a single string to be used as a map key. + std::string options; + + if (compile_options.size()) { + // Allocate some space beforehand to avoid repeated expanding allocations. + options.reserve(compile_options.size() * 32); + + for (const auto &compile_option : compile_options) { + options += compile_option; + } + } + + // Check if the function has already been compiled with the same options. + if (compiled_kernels_.find(options) != compiled_kernels_.end()) { + return {compiled_kernels_[options], false}; + } + + // Create a new map entry in place. + compiled_kernels_[options] = CompiledKernelInfo{}; + CompiledKernelInfo &info = compiled_kernels_[options]; + + try { + program_->compile(compile_options); + } catch (nvrtc::Error &error) { + std::cerr << program_->getLog(); + throw; + } + + // Create a new module from the compiled PTX if needed. + info.module = std::make_unique<cu::Module>( + static_cast<const void *>(program_->getPTX().data())); + + if (!info.module) { + throw std::runtime_error("Unable to create kernel module"); + } + + info.function = std::make_shared<cu::Function>( + *info.module, std::string(func_name_).c_str()); + + return {info, true}; +} \ No newline at end of file diff --git a/src/GPUKernel.hpp b/src/GPUKernel.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a5b46afe29008b2ec996aef5c4fab8c2eb1c541f --- /dev/null +++ b/src/GPUKernel.hpp @@ -0,0 +1,42 @@ +#ifndef DEDISP_GPU_RUNTIME_KERNEL_HPP_ +#define DEDISP_GPU_RUNTIME_KERNEL_HPP_ + +#include <memory> +#include <optional> +#include <sstream> +#include <stdexcept> +#include <unordered_map> +#include <vector> + +#include <cudawrappers/cu.hpp> +#include <cudawrappers/nvrtc.hpp> + +struct CompiledKernelInfo { + std::shared_ptr<cu::Module> module; + std::shared_ptr<cu::Function> function; +}; + +class GPUKernel { +public: + GPUKernel(const std::string &filename, const std::string &func_name, + const std::string &file_src) + : filename_(filename), func_name_(func_name), + program_(std::make_unique<nvrtc::Program>(file_src, filename)) {} + void setFunctionParams(const std::string &kernel_function_name, + const std::vector<const void *> &params); + void assertCompiled(const CompiledKernelInfo &kernel_info) const; + + inline std::string getFilename() const { return filename_; } + +protected: + std::pair<const CompiledKernelInfo &, bool> + compile(const std::vector<std::string> &compile_options = {}); + +private: + const std::string filename_; + const std::string func_name_; + const std::unique_ptr<nvrtc::Program> program_; + + std::unordered_map<std::string, CompiledKernelInfo> compiled_kernels_; +}; +#endif // DEDISP_GPU_RUNTIME_KERNEL_HPP_ \ No newline at end of file diff --git a/src/GPUPlan.cpp b/src/GPUPlan.cpp index 1a9f1b6b4c12a579c052c01a0b79722a77a0586a..81cafd0e41b1d5861beea0bd8d133e1ee5e173ae 100644 --- a/src/GPUPlan.cpp +++ b/src/GPUPlan.cpp @@ -1,11 +1,20 @@ #include "GPUPlan.hpp" +#include <cassert> +#include <exception> +#include <iostream> + +#include "cudawrappers/cu.hpp" + namespace dedisp { // Public interface GPUPlan::GPUPlan(size_type nchans, float_type
dt, float_type f0, float_type df, int device_idx) : Plan(nchans, dt, f0, df) { + // Initialize CUDA + cu::init(); + // Initialize device set_device(device_idx); @@ -15,13 +24,15 @@ GPUPlan::GPUPlan(size_type nchans, float_type dt, float_type f0, float_type df, executestream.reset(new cu::Stream()); // Initialize delay table - d_delay_table.resize(nchans * sizeof(dedisp_float)); - htodstream->memcpyHtoDAsync(d_delay_table, h_delay_table.data(), - d_delay_table.size()); + d_delay_table = + std::make_unique<cu::DeviceMemory>(nchans * sizeof(dedisp_float)); + htodstream->memcpyHtoDAsync(*d_delay_table, h_delay_table.data(), + d_delay_table->size()); // Initialize the killmask - d_killmask.resize(nchans * sizeof(dedisp_bool)); - htodstream->memcpyHtoDAsync(d_killmask, h_killmask.data(), d_killmask.size()); + d_killmask = std::make_unique<cu::DeviceMemory>(nchans * sizeof(dedisp_bool)); + htodstream->memcpyHtoDAsync(*d_killmask, h_killmask.data(), + d_killmask->size()); } // Destructor @@ -29,34 +40,38 @@ GPUPlan::~GPUPlan() {} void GPUPlan::set_device(int device_idx) { m_device.reset(new cu::Device(device_idx)); + m_context.reset(new cu::Context(CU_CTX_SCHED_BLOCKING_SYNC, *m_device)); } void GPUPlan::generate_dm_list(float_type dm_start, float_type dm_end, float_type ti, float_type tol) { Plan::generate_dm_list(dm_start, dm_end, ti, tol); - // Allocate device memory for the DM list - d_dm_list.resize(m_dm_count * sizeof(dedisp_float)); + d_dm_list = + std::make_unique<cu::DeviceMemory>(m_dm_count * sizeof(dedisp_float)); + assert(d_dm_list); // Copy the DM list to the device - htodstream->memcpyHtoDAsync(d_dm_list, h_dm_list.data(), d_dm_list.size()); + htodstream->memcpyHtoDAsync(*d_dm_list, h_dm_list.data(), d_dm_list->size()); } void GPUPlan::set_dm_list(const float_type *dm_list, size_type count) { Plan::set_dm_list(dm_list, count); - // Allocate device memory for the DM list - d_dm_list.resize(m_dm_count * sizeof(dedisp_float)); + d_dm_list = + std::make_unique<cu::DeviceMemory>(m_dm_count * sizeof(dedisp_float)); + assert(d_dm_list); // Copy the DM list to the device - htodstream->memcpyHtoDAsync(d_dm_list, h_dm_list.data(), d_dm_list.size()); + htodstream->memcpyHtoDAsync(*d_dm_list, h_dm_list.data(), d_dm_list->size()); } void GPUPlan::set_killmask(const bool_type *killmask) { Plan::set_killmask(killmask); // Copy the killmask to the device - htodstream->memcpyHtoDAsync(d_killmask, h_killmask.data(), d_killmask.size()); + htodstream->memcpyHtoDAsync(*d_killmask, h_killmask.data(), + d_killmask->size()); } } // end namespace dedisp diff --git a/src/GPUPlan.hpp b/src/GPUPlan.hpp index 96387fdf7f008cea7c22f5bbd2ad4fd9e1d94d0a..8911364e923db3663783a3716dd504b35a693fe7 100644 --- a/src/GPUPlan.hpp +++ b/src/GPUPlan.hpp @@ -1,9 +1,10 @@ -#ifndef DEDISP_PLAN_H_GPU_INCLUDE_GUARD -#define DEDISP_PLAN_H_GPU_INCLUDE_GUARD +#ifndef DEDISP_GPU_PLAN_HPP_ +#define DEDISP_GPU_PLAN_HPP_ #include "Plan.hpp" -#include "common/cuda/CU.h" +#include <cudawrappers/cu.hpp> +#include <cudawrappers/nvrtc.hpp> namespace dedisp { @@ -32,11 +33,12 @@ protected: // Device void set_device(int device_idx); std::unique_ptr<cu::Device> m_device; + std::unique_ptr<cu::Context> m_context; // Device arrays - cu::DeviceMemory d_dm_list; // type = dedisp_float - cu::DeviceMemory d_delay_table; // type = dedisp_float - cu::DeviceMemory d_killmask; // type = dedisp_bool + std::unique_ptr<cu::DeviceMemory> d_dm_list; // type = dedisp_float + std::unique_ptr<cu::DeviceMemory> d_delay_table; // type = dedisp_float + 
std::unique_ptr<cu::DeviceMemory> d_killmask; // type = dedisp_bool // Streams std::unique_ptr<cu::Stream> htodstream; @@ -55,4 +57,4 @@ public: } // end namespace dedisp -#endif // DEDISP_PLAN_GPU_H_INCLUDE_GUARD +#endif // DEDISP_GPU_PLAN_HPP_ diff --git a/src/Plan.cpp b/src/Plan.cpp index ddd26b9539ca2abdb45aa2f357a2a758be08129e..65a18c549691f4f00ade20eb5fc536824803534b 100644 --- a/src/Plan.cpp +++ b/src/Plan.cpp @@ -1,6 +1,6 @@ #include "Plan.hpp" -#include "common/cuda/CU.h" #include "common/dedisp_error.hpp" +#include "cudawrappers/cu.hpp" #include <cmath> namespace dedisp { @@ -80,7 +80,7 @@ void Plan::set_killmask(const bool_type *killmask) { } } -void Plan::sync() { cu::checkError(cudaDeviceSynchronize()); } +void Plan::sync() { cu::Context::synchronize(); } // Private helper functions void Plan::generate_delay_table(dedisp_float *h_delay_table, dedisp_size nchans, diff --git a/src/Plan.hpp b/src/Plan.hpp index 41ac61c4b19ec9ff47c7e89f856ff15f70f94a99..ec70ad5821d1081315f18e1da77e8fca4c32c8e9 100644 --- a/src/Plan.hpp +++ b/src/Plan.hpp @@ -1,7 +1,6 @@ -#ifndef DEDISP_PLAN_H_INCLUDE_GUARD -#define DEDISP_PLAN_H_INCLUDE_GUARD +#ifndef DEDISP_PLAN_HPP_ +#define DEDISP_PLAN_HPP_ -#include <memory> #include <vector> #include "common/dedisp_types.h" @@ -96,4 +95,4 @@ protected: } // end namespace dedisp -#endif // DEDISP_PLAN_H_INCLUDE_GUARD +#endif // DEDISP_PLAN_HPP_ diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index f0678b476349c03c277d95d0d322b366af04525c..f950bcfb143d8896469f7c3e7cef1bc2cbfe7fd6 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -1,11 +1,16 @@ -add_library(common OBJECT dedisp_error.cpp helper.cpp cuda/CU.cpp) +add_library(common OBJECT dedisp_error.cpp helper.cpp) -target_link_libraries(common CUDA::cudart CUDA::nvToolsExt CUDA::cuda_driver - OpenMP::OpenMP_CXX) +target_link_libraries(common PUBLIC OpenMP::OpenMP_CXX cudawrappers::cu + cudawrappers::nvrtc cudawrappers::nvtx) set_target_properties(common PROPERTIES VERSION ${DEDISP_VERSION} SOVERSION ${DEDISP_VERSION_MAJOR}) +if(${DEDISP_BACKEND_HIP}) + get_target_property(sources common SOURCES) + set_source_files_properties(${sources} PROPERTIES LANGUAGE HIP) +endif() + install(TARGETS common LIBRARY DESTINATION lib) # Install dedisp_types.h in common subdirectory for backwards compatibility with diff --git a/src/common/cuda/CU.cpp b/src/common/cuda/CU.cpp deleted file mode 100644 index d4d44cd61c6129c23013b11e4633257c0270259c..0000000000000000000000000000000000000000 --- a/src/common/cuda/CU.cpp +++ /dev/null @@ -1,317 +0,0 @@ -/* - * CU, a CUDA driver api C++ wrapper. - * This code is copied from the IDG repository (https://git.astron.nl/RD/idg) - * and changed to meet the needs for this library. 
- */ -#include "CU.h" - -#include <cassert> -#include <cstring> -#include <iostream> -#include <sstream> - -#define assertCudaCall(val) __assertCudaCall(val, #val, __FILE__, __LINE__) -#define checkCudaCall(val) __checkCudaCall(val, #val, __FILE__, __LINE__) -#define assertCuCall(val) __assertCuCall(val, #val, __FILE__, __LINE__) -#define checkCuCall(val) __checkCuCall(val, #val, __FILE__, __LINE__) - -namespace cu { - -/* - Error checking -*/ -inline void __assertCudaCall(cudaError_t result, char const *const func, - const char *const file, int const line) { - if (result != cudaSuccess) { - const char *msg; - msg = cudaGetErrorString(result); - std::cerr << "CUDA Error at " << file; - std::cerr << ":" << line; - std::cerr << " in function " << func; - std::cerr << ": " << msg; - std::cerr << std::endl; - throw Error<cudaError_t>(result); - } -} - -inline void __checkCudaCall(cudaError_t result, char const *const func, - const char *const file, int const line) { - try { - __assertCudaCall(result, func, file, line); - } catch (Error<cudaError_t> &error) { - // pass - } -} - -inline void __assertCuCall(CUresult result, char const *const func, - const char *const file, int const line) { - if (result != CUDA_SUCCESS) { - const char *msg; - cuGetErrorString(result, &msg); - std::cerr << "CU Error at " << file; - std::cerr << ":" << line; - std::cerr << " in function " << func; - std::cerr << ": "; - if (msg) - std::cerr << msg; - std::cerr << std::endl; - throw Error<CUresult>(result); - } -} - -inline void __checkCuCall(CUresult result, char const *const func, - const char *const file, int const line) { - try { - __assertCuCall(result, func, file, line); - } catch (Error<CUresult> &error) { - // pass - } -} - -void checkError() { assertCudaCall(cudaGetLastError()); } - -void checkError(cudaError_t error) { assertCudaCall(error); } -void checkError(CUresult error) { assertCuCall(error); } - -/* - Device -*/ -Device::Device(int device) { - m_device = device; - checkCudaCall(cudaSetDevice(device)); -} - -unsigned int Device::get_capability() { - cudaDeviceProp device_props; - cudaGetDeviceProperties(&device_props, m_device); - return 10 * device_props.major + device_props.minor; -} - -size_t Device::get_total_const_memory() const { - cudaDeviceProp device_props; - cudaGetDeviceProperties(&device_props, m_device); - return device_props.totalConstMem; -} - -size_t Device::get_free_memory() const { - size_t free; - size_t total; - cudaMemGetInfo(&free, &total); - return free; -} - -size_t Device::get_total_memory() const { - size_t free; - size_t total; - cudaMemGetInfo(&free, &total); - return total; -} - -CUcontext Device::get_current_context() { - CUcontext context{}; - assertCuCall(cuCtxGetCurrent(&context)); - return context; -} - -void Device::set_context(CUcontext context) { cuCtxSetCurrent(context); } - -/* - HostMemory -*/ -HostMemory::HostMemory(size_t size, int flags) { - m_capacity = size; - m_size = size; - m_flags = flags; - assertCudaCall(cudaHostAlloc(&m_ptr, size, m_flags)); -} - -HostMemory::~HostMemory() { release(); } - -void HostMemory::resize(size_t size) { - assert(size > 0); - m_size = size; - if (size > m_capacity) { - release(); - assertCudaCall(cudaHostAlloc(&m_ptr, size, m_flags)); - m_capacity = size; - } -} - -void HostMemory::release() { assertCudaCall(cudaFreeHost(m_ptr)); } - -void HostMemory::zero() { memset(m_ptr, 0, m_size); } - -/* - DeviceMemory -*/ -DeviceMemory::DeviceMemory(size_t size) { - m_capacity = size; - m_size = size; - if (size) { - 
assertCudaCall(cudaMalloc(&m_ptr, size)); - } -} - -DeviceMemory::~DeviceMemory() { release(); } - -void DeviceMemory::resize(size_t size) { - assert(size > 0); - m_size = size; - if (size > m_capacity) { - release(); - assertCudaCall(cudaMalloc(&m_ptr, size)); - m_capacity = size; - } -} - -void DeviceMemory::release() { - if (m_capacity) { - assertCudaCall(cudaFree(m_ptr)); - } -} - -void DeviceMemory::zero(cudaStream_t stream) { - if (m_size) { - if (stream != NULL) { - assertCudaCall(cudaMemsetAsync(m_ptr, 0, m_size, stream)); - } else { - assertCudaCall(cudaMemset(m_ptr, 0, m_size)); - } - } -} - -/* - Event -*/ -Event::Event(int flags) { assertCudaCall(cudaEventCreate(&m_event, flags)); } - -Event::~Event() { assertCudaCall(cudaEventDestroy(m_event)); } - -void Event::synchronize() { assertCudaCall(cudaEventSynchronize(m_event)); } - -float Event::elapsedTime(Event &second) { - float ms; - assertCudaCall(cudaEventElapsedTime(&ms, second, m_event)); - return ms; -} - -Event::operator cudaEvent_t() { return m_event; } - -/* - Stream -*/ -Stream::Stream(int flags) { - assertCudaCall(cudaStreamCreateWithFlags(&m_stream, flags)); -} - -Stream::~Stream() { assertCudaCall(cudaStreamDestroy(m_stream)); } - -void Stream::memcpyHtoDAsync(void *devPtr, const void *hostPtr, size_t size) { - assertCudaCall( - cudaMemcpyAsync(devPtr, hostPtr, size, cudaMemcpyHostToDevice, m_stream)); -} - -void Stream::memcpyDtoHAsync(void *hostPtr, void *devPtr, size_t size) { - assertCudaCall( - cudaMemcpyAsync(hostPtr, devPtr, size, cudaMemcpyDeviceToHost, m_stream)); -} - -void Stream::memcpyDtoDAsync(void *dstPtr, void *srcPtr, size_t size) { - assertCudaCall(cudaMemcpyAsync(dstPtr, srcPtr, size, cudaMemcpyDeviceToDevice, - m_stream)); -} - -void Stream::memcpyHtoD2DAsync(void *dstPtr, size_t dstWidth, - const void *srcPtr, size_t srcWidth, - size_t widthBytes, size_t height) { - assertCudaCall(cudaMemcpy2DAsync(dstPtr, dstWidth, srcPtr, srcWidth, - widthBytes, height, cudaMemcpyHostToDevice, - m_stream)); -} - -void Stream::memcpyDtoH2DAsync(void *dstPtr, size_t dstWidth, - const void *srcPtr, size_t srcWidth, - size_t widthBytes, size_t height) { - assertCudaCall(cudaMemcpy2DAsync(dstPtr, dstWidth, srcPtr, srcWidth, - widthBytes, height, cudaMemcpyDeviceToHost, - m_stream)); -} - -void Stream::memcpyHtoH2DAsync(void *dstPtr, size_t dstWidth, - const void *srcPtr, size_t srcWidth, - size_t widthBytes, size_t height) { - assertCudaCall(cudaMemcpy2DAsync(dstPtr, dstWidth, srcPtr, srcWidth, - widthBytes, height, cudaMemcpyHostToHost, - m_stream)); -} - -void Stream::synchronize() { assertCudaCall(cudaStreamSynchronize(m_stream)); } - -void Stream::waitEvent(Event &event) { - assertCudaCall(cudaStreamWaitEvent(m_stream, event, 0)); -} - -void Stream::record(Event &event) { - assertCudaCall(cudaEventRecord(event, m_stream)); -} - -void Stream::zero(void *ptr, size_t size) { - assertCudaCall(cudaMemsetAsync(ptr, 0, size, m_stream)); -} - -Stream::operator cudaStream_t() { return m_stream; } - -/* - Marker -*/ -Marker::Marker(const char *message, Color color) { - _attributes.version = NVTX_VERSION; - _attributes.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; - _attributes.colorType = NVTX_COLOR_ARGB; - _attributes.color = convert(color); - _attributes.messageType = NVTX_MESSAGE_TYPE_ASCII; - _attributes.message.ascii = message; -} - -void Marker::start() { _id = nvtxRangeStartEx(&_attributes); } - -void Marker::end() { nvtxRangeEnd(_id); } - -void Marker::start(cu::Event &event) { - event.synchronize(); - start(); -} - 
-void Marker::end(cu::Event &event) { - event.synchronize(); - end(); -} - -unsigned int Marker::convert(Color color) { - switch (color) { - case red: - return 0xffff0000; - case green: - return 0xff00ff00; - case blue: - return 0xff0000ff; - case yellow: - return 0xffffff00; - case black: - return 0xff000000; - default: - return 0xff00ff00; - } -} - -/* - ScopedMarker -*/ -ScopedMarker::ScopedMarker(const char *message, Color color) - : Marker(message, color) { - _id = nvtxRangeStartEx(&_attributes); -}; - -ScopedMarker::~ScopedMarker() { nvtxRangeEnd(_id); } - -} // end namespace cu diff --git a/src/common/cuda/CU.h b/src/common/cuda/CU.h deleted file mode 100644 index 8ad2ae3265dabe10c394105d7ceff993cf9c3014..0000000000000000000000000000000000000000 --- a/src/common/cuda/CU.h +++ /dev/null @@ -1,157 +0,0 @@ -/* - * CU, a CUDA driver api C++ wrapper. - * This code is copied from the IDG repository (https://git.astron.nl/RD/idg) - * and changed to meet the needs for this library. - */ -#ifndef CU_WRAPPER_H -#define CU_WRAPPER_H - -#include <stdexcept> - -#include <cuda.h> -#include <cuda_runtime.h> -#include <nvToolsExt.h> - -namespace cu { - -template <typename T> class Error : public std::exception { -public: - Error(T result) : _result(result) {} - - operator T() const { return _result; } - -private: - T _result; -}; - -void checkError(); -void checkError(cudaError_t error); - -class Device { -public: - Device(int device); - - unsigned int get_capability(); - size_t get_total_const_memory() const; - size_t get_free_memory() const; - size_t get_total_memory() const; - CUcontext get_current_context(); - void set_context(CUcontext cntx); - -private: - int m_device; -}; - -class Memory { -public: - void *data() { return m_ptr; } - size_t size() { return m_size; } - virtual void resize(size_t size) = 0; - template <typename T> operator T *() { return static_cast<T *>(m_ptr); } - -protected: - size_t m_capacity = 0; // capacity, in bytes - size_t m_size = 0; // size, in bytes - void *m_ptr = nullptr; -}; - -class HostMemory : public virtual Memory { -public: - HostMemory(size_t size = 0, int flags = cudaHostAllocPortable); - virtual ~HostMemory(); - - void resize(size_t size) override; - void zero(); - -private: - void release(); - int m_flags; -}; - -class DeviceMemory : public virtual Memory { -public: - DeviceMemory(size_t size = 0); - ~DeviceMemory(); - - void resize(size_t size); - void zero(cudaStream_t stream = NULL); - - template <typename T> operator T() { return static_cast<T>(m_ptr); } - -private: - void release(); -}; - -class Event { -public: - Event(int flags = cudaEventDefault); - ~Event(); - - void synchronize(); - float elapsedTime(Event &second); - - operator cudaEvent_t(); - -private: - cudaEvent_t m_event; -}; - -class Stream { -public: - Stream(int flags = cudaStreamDefault); - ~Stream(); - - void memcpyHtoDAsync(void *devPtr, const void *hostPtr, size_t size); - void memcpyDtoHAsync(void *hostPtr, void *devPtr, size_t size); - void memcpyDtoDAsync(void *dstPtr, void *srcPtr, size_t size); - void memcpyHtoD2DAsync(void *dstPtr, size_t dstWidth, const void *srcPtr, - size_t srcWidth, size_t widthBytes, size_t height); - void memcpyDtoH2DAsync(void *dstPtr, size_t dstWidth, const void *srcPtr, - size_t srcWidth, size_t widthBytes, size_t height); - void memcpyHtoH2DAsync(void *dstPtr, size_t dstWidth, const void *srcPtr, - size_t srcWidth, size_t widthBytes, size_t height); - void synchronize(); - void waitEvent(Event &event); - void record(Event &event); - void zero(void 
*ptr, size_t size); - - operator cudaStream_t(); - -private: - cudaStream_t m_stream; -}; - -class Marker { -public: - enum Color { red, green, blue, yellow, black }; - - Marker(const char *message, Marker::Color color = Color::red); - - void start(); - void end(); - void start(cu::Event &event); - void end(cu::Event &event); - -private: - unsigned int convert(Color color); - -protected: - nvtxEventAttributes_t _attributes; - nvtxRangeId_t _id; -}; - -class ScopedMarker : public Marker { -public: - ScopedMarker(const char *message, Marker::Color color = Color::red); - - ~ScopedMarker(); - - void start() = delete; - void end() = delete; - void start(cu::Event &event) = delete; - void end(cu::Event &event) = delete; -}; - -} // end namespace cu - -#endif // end CU_WRAPPER_H \ No newline at end of file diff --git a/src/common/dedisp.h b/src/common/dedisp.h index 99823ba2e373112ada7a6b993aee2e53284a5f56..9b9eee637fa14b55bc9a37e644ca1a1e8e4c0748 100644 --- a/src/common/dedisp.h +++ b/src/common/dedisp.h @@ -37,8 +37,8 @@ * \bug Asynchronous execution is not currently operational. */ -#ifndef DEDISP_H_INCLUDE_GUARD -#define DEDISP_H_INCLUDE_GUARD +#ifndef DEDISP_COMMON_DEDISP_H_ +#define DEDISP_COMMON_DEDISP_H_ // Use C linkage to allow cross-language use of the library #ifdef __cplusplus @@ -489,4 +489,4 @@ const dedisp_size *dedisp_get_dt_factors(const dedisp_plan plan); } // closing brace for extern "C" #endif -#endif // DEDISP_H_INCLUDE_GUARD +#endif // DEDISP_COMMON_DEDISP_H_ diff --git a/src/common/dedisp_error.hpp b/src/common/dedisp_error.hpp index fdff277f2070504931a673e7417688b5d77b25be..6d6400d75aa11e4649a332f68cb11ad257cdf65e 100644 --- a/src/common/dedisp_error.hpp +++ b/src/common/dedisp_error.hpp @@ -1,5 +1,5 @@ -#ifndef DEDISP_ERROR_H_INCLUDE_GUARD -#define DEDISP_ERROR_H_INCLUDE_GUARD +#ifndef DEDISP_COMMON_DEDISP_ERROR_H_ +#define DEDISP_COMMON_DEDISP_ERROR_H_ #include <sstream> #include <stdexcept> @@ -79,4 +79,4 @@ inline void _throw_error(dedisp_error error, char const *const func, inline void check_error(dedisp_error error) { throw_error(error); } -#endif // DEDISP_ERROR_H_INCLUDE_GUARD \ No newline at end of file +#endif // DEDISP_COMMON_DEDISP_ERROR_H_ \ No newline at end of file diff --git a/src/common/dedisp_strings.h b/src/common/dedisp_strings.h index 0ae85f94122e4fcff575381a087a3572f30c6e33..4e6e2410aeb1269488fe5389e3112ccf14e5b98a 100644 --- a/src/common/dedisp_strings.h +++ b/src/common/dedisp_strings.h @@ -12,6 +12,8 @@ static const std::string output_memcpy_time_str = "Output memcpy time : "; static const std::string runtime_time_str = "Runtime : "; static const std::string gpuexec_time_str = "GPU execution time : "; static const std::string total_time_str = "Total time : "; +static const std::string pmt_joules_str = "Joules : "; +static const std::string pmt_watts_str = "Watts : "; static const std::string preprocessing_perf_str = "Preprocessing performance : "; diff --git a/src/common/dedisp_types.h b/src/common/dedisp_types.h index b92ab7051999db3e96ee9f82b0783ce44cdad5b2..cab0f0304f104ffaa61adfd86eafb1dc47c0fc18 100644 --- a/src/common/dedisp_types.h +++ b/src/common/dedisp_types.h @@ -1,5 +1,5 @@ -#ifndef DEDISP_TYPES_H_INCLUDE_GUARD -#define DEDISP_TYPES_H_INCLUDE_GUARD +#ifndef DEDISP_COMMON_TYPES_H_ +#define DEDISP_COMMON_TYPES_H_ // Types // ----- @@ -48,4 +48,4 @@ typedef enum { * are complete. 
*/ -#endif // DEDISP_TYPES_H_INCLUDE_GUARD \ No newline at end of file +#endif // DEDISP_COMMON_TYPES_H_ \ No newline at end of file diff --git a/src/common/helper.cpp b/src/common/helper.cpp index fd2a9e7a7985af36c4bb0b49bc3aa0fca25c998e..7f1660ffbd483d99e7ff2c38c3ac061ffc4466af 100644 --- a/src/common/helper.cpp +++ b/src/common/helper.cpp @@ -1,5 +1,4 @@ #include "helper.h" -#include <omp.h> #include <sys/resource.h> // get used memory #include <unistd.h> // get total memory diff --git a/src/common/helper.h b/src/common/helper.h index 8f617fc2a43bc4bbe67236d4032261007084abdf..01dfa020a46da27c9a4b9de8f7157ccfbce2a5b5 100644 --- a/src/common/helper.h +++ b/src/common/helper.h @@ -1,5 +1,5 @@ -#ifndef HELPER_H_INCLUDE_GUARD -#define HELPER_H_INCLUDE_GUARD +#ifndef DEDISP_COMMON_HELPER_H_ +#define DEDISP_COMMON_HELPER_H_ #include <cstddef> @@ -17,4 +17,4 @@ size_t get_free_memory(); } // end namespace dedisp -#endif // HELPER_H_INCLUDE_GUARD \ No newline at end of file +#endif // DEDISP_COMMON_HELPER_H_ \ No newline at end of file diff --git a/src/dedisp/CMakeLists.txt b/src/dedisp/CMakeLists.txt index aafc1627b8f20aae0aae2d805ca553c9623e86f8..b4725d3cf28a5d6d19ca9246ea827cd15e8ef36d 100644 --- a/src/dedisp/CMakeLists.txt +++ b/src/dedisp/CMakeLists.txt @@ -2,17 +2,20 @@ add_library( dedisp SHARED cinterface.cpp DedispPlan.cpp - dedisperse/dedisperse.cu - unpack/unpack.cu - transpose/transpose.cu + dedisperse/DedispKernel.cu + unpack/UnpackKernel.cu + transpose/TransposeKernel.cu $<TARGET_OBJECTS:common> $<TARGET_OBJECTS:plan> $<TARGET_OBJECTS:external>) -target_include_directories(dedisp PRIVATE ${CMAKE_SOURCE_DIR}/src) +target_include_directories(dedisp PRIVATE "${CMAKE_SOURCE_DIR}/src") -target_link_libraries(dedisp CUDA::cudart CUDA::nvToolsExt CUDA::cuda_driver - OpenMP::OpenMP_CXX) +target_embed_source(dedisp transpose/transpose_kernel.cu) +target_embed_source(dedisp dedisperse/dedisperse_kernel.cu) + +target_link_libraries(dedisp PUBLIC OpenMP::OpenMP_CXX cudawrappers::cu + cudawrappers::nvtx cudawrappers::nvrtc) target_compile_options(dedisp PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math>) @@ -23,6 +26,11 @@ set_target_properties( SOVERSION ${DEDISP_VERSION_MAJOR} PUBLIC_HEADER DedispPlan.hpp) +if(${DEDISP_BACKEND_HIP}) + get_target_property(sources ${PROJECT_NAME} SOURCES) + set_source_files_properties(${sources} PROPERTIES LANGUAGE HIP) +endif() + install( TARGETS dedisp LIBRARY DESTINATION lib diff --git a/src/dedisp/DedispPlan.cpp b/src/dedisp/DedispPlan.cpp index 128fa71f4830ef23e5afdf9cd8081f0249496705..627c5f0380519f93d927d1eb4a2363e9e125c781 100644 --- a/src/dedisp/DedispPlan.cpp +++ b/src/dedisp/DedispPlan.cpp @@ -6,9 +6,8 @@ #include "common/dedisp_strings.h" #include "common/dedisp_types.h" -#include "dedisperse/dedisperse.h" -#include "transpose/transpose.hpp" -#include "unpack/unpack.h" +#include "dedisperse/dedisp_constants.cuh" +#include "unpack/UnpackKernel.hpp" #if defined(DEDISP_BENCHMARK) #include "external/Stopwatch.h" @@ -45,8 +44,6 @@ void DedispPlan::set_gulp_size(size_type gulp_size) { m_gulp_size = gulp_size; } void DedispPlan::execute(size_type nsamps, const byte_type *in, size_type in_nbits, byte_type *out, size_type out_nbits, unsigned flags) { - enum { BITS_PER_BYTE = 8 }; - // Note: The default out_stride is nsamps - m_max_delay dedisp_size out_bytes_per_sample = out_nbits / (sizeof(dedisp_byte) * BITS_PER_BYTE); @@ -76,13 +73,6 @@ void DedispPlan::execute_guru(size_type nsamps, const byte_type *in, byte_type *out, size_type out_nbits, size_type 
out_stride, size_type first_dm_idx, size_type dm_count, unsigned flags) { - cu::checkError(); - - enum { - BITS_PER_BYTE = 8, - BYTES_PER_WORD = sizeof(dedisp_word) / sizeof(dedisp_byte) - }; - dedisp_size out_bytes_per_sample = out_nbits / (sizeof(dedisp_byte) * BITS_PER_BYTE); @@ -126,10 +116,6 @@ void DedispPlan::execute_guru(size_type nsamps, const byte_type *in, init_timer->Start(); #endif - // Copy the lookup tables to constant memory on the device - copy_delay_table(d_delay_table, m_nchans * sizeof(dedisp_float), 0, 0); - copy_killmask(d_killmask, m_nchans * sizeof(dedisp_bool), 0, 0); - // Compute the problem decomposition dedisp_size nsamps_computed = nsamps - m_max_delay; @@ -219,39 +205,41 @@ void DedispPlan::execute_guru(size_type nsamps, const byte_type *in, nsamps_gulp); // height htodstream->synchronize(); #ifdef DEDISP_BENCHMARK - cudaDeviceSynchronize(); input_timer->Pause(); preprocessing_timer->Start(); #endif // Transpose the words in the input - transpose((dedisp_word *)d_in, nchan_words, nsamps_gulp, - in_buf_stride_words, nsamps_padded_gulp, - (dedisp_word *)d_transposed); + dedisp_kernel_transpose.run(d_in, nchan_words, nsamps_gulp, + in_buf_stride_words, nsamps_padded_gulp, + d_transposed, *executestream); #ifdef DEDISP_BENCHMARK - cudaDeviceSynchronize(); + executestream->synchronize(); #endif - // Unpack the transposed data - unpack(d_transposed, nsamps_padded_gulp, nchan_words, d_unpacked, in_nbits, - unpacked_in_nbits); + dedisp_kernel_unpack.run(d_transposed, nsamps_padded_gulp, nchan_words, + d_unpacked, in_nbits, unpacked_in_nbits, + *executestream); #ifdef DEDISP_BENCHMARK - cudaDeviceSynchronize(); + executestream->synchronize(); preprocessing_timer->Pause(); dedispersion_timer->Start(); #endif // Perform direct dedispersion without scrunching - if (!dedisperse( // d_transposed, + if (!dedisp_kernel_dedisperse.run( // d_transposed, d_unpacked, nsamps_padded_gulp, nsamps_computed_gulp, unpacked_in_nbits, // in_nbits, - m_nchans, 1, d_dm_list, dm_count, 1, d_out, out_stride_gulp_samples, - out_nbits, 1, 0, 0, 0, 0)) { + m_nchans, 1, *d_dm_list, dm_count, 1, d_out, + out_stride_gulp_samples, out_nbits, 1, 0, 0, 0, 0, + h_delay_table.data(), h_delay_table.size() * sizeof(dedisp_float), + h_killmask.data(), h_killmask.size() * sizeof(dedisp_bool), + *htodstream, *executestream)) { throw_error(DEDISP_INTERNAL_GPU_ERROR); } #ifdef DEDISP_BENCHMARK - cudaDeviceSynchronize(); + executestream->synchronize(); dedispersion_timer->Pause(); #endif // Copy output back to host memory @@ -263,13 +251,12 @@ void DedispPlan::execute_guru(size_type nsamps, const byte_type *in, #endif dtohstream->memcpyDtoH2DAsync(out + gulp_samp_byte_idx, // dst out_stride, // dst stride - (byte_type *)d_out, // src + d_out, // src out_stride_gulp_bytes, // src stride nsamp_bytes_computed_gulp, // width bytes dm_count); // height dtohstream->synchronize(); #ifdef DEDISP_BENCHMARK - cudaDeviceSynchronize(); output_timer->Pause(); #endif diff --git a/src/dedisp/DedispPlan.hpp b/src/dedisp/DedispPlan.hpp index 0cedfbdc277453db4d4d251ee19a6127f18a401b..9853195a312376b9a0b45044346e022f58c8f37f 100644 --- a/src/dedisp/DedispPlan.hpp +++ b/src/dedisp/DedispPlan.hpp @@ -1,7 +1,10 @@ -#ifndef H_DEDISP_PLAN_INCLUDE_GUARD -#define H_DEDISP_PLAN_INCLUDE_GUARD +#ifndef DEDISP_DEDISP_DEDISP_PLAN_HPP_ +#define DEDISP_DEDISP_DEDISP_PLAN_HPP_ #include "GPUPlan.hpp" +#include "dedisperse/DedispKernel.hpp" +#include "transpose/TransposeKernel.hpp" +#include "unpack/UnpackKernel.hpp" namespace dedisp { @@ -48,8 
+51,11 @@ public: private: dedisp_size m_gulp_size; + TransposeKernel dedisp_kernel_transpose; + DedisperseKernel dedisp_kernel_dedisperse; + UnpackKernel dedisp_kernel_unpack; }; } // end namespace dedisp -#endif // H_DEDISP_PLAN_INCLUDE_GUARD \ No newline at end of file +#endif // DEDISP_DEDISP_DEDISP_PLAN_HPP_ \ No newline at end of file diff --git a/src/dedisp/cinterface.cpp b/src/dedisp/cinterface.cpp index a67e2473abd8cf4d51bdd0b0efad139bc815fb6c..a9ebe6adeaf186e373b84c0435554e1f8e0138a9 100644 --- a/src/dedisp/cinterface.cpp +++ b/src/dedisp/cinterface.cpp @@ -45,10 +45,6 @@ dedisp_error dedisp_create_plan(dedisp_plan *plan, dedisp_size nchans, *plan = nullptr; - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - *plan = new dedisp_plan_struct(); if (!plan) { throw_error(DEDISP_MEM_ALLOC_FAILED); @@ -70,10 +66,6 @@ dedisp_error dedisp_set_gulp_size(dedisp_plan plan, dedisp_size gulp_size) { throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { static_cast<dedisp::DedispPlan *>(plan->plan_.get()) ->set_gulp_size(gulp_size); @@ -89,10 +81,6 @@ dedisp_size dedisp_get_gulp_size(dedisp_plan plan) { throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { return static_cast<dedisp::DedispPlan *>(plan->plan_.get()) ->get_gulp_size(); @@ -107,10 +95,6 @@ dedisp_error dedisp_set_dm_list(dedisp_plan plan, const dedisp_float *dm_list, throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { plan->plan_->set_dm_list(dm_list, count); } catch (...) { @@ -127,10 +111,6 @@ dedisp_error dedisp_generate_dm_list(dedisp_plan plan, dedisp_float dm_start, throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { plan->plan_->generate_dm_list(dm_start, dm_end, ti, tol); } catch (...) { @@ -190,10 +170,6 @@ dedisp_error dedisp_set_killmask(dedisp_plan plan, throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { plan->plan_->set_killmask(killmask); } catch (...) { @@ -295,10 +271,6 @@ dedisp_error dedisp_execute_guru(const dedisp_plan plan, dedisp_size nsamps, throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { static_cast<dedisp::DedispPlan *>(plan->plan_.get()) ->execute_guru(nsamps, in, in_nbits, in_stride, out, out_nbits, @@ -319,10 +291,6 @@ dedisp_error dedisp_execute_adv(const dedisp_plan plan, dedisp_size nsamps, throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { static_cast<dedisp::DedispPlan *>(plan->plan_.get()) ->execute_adv(nsamps, in, in_nbits, in_stride, out, out_nbits, @@ -343,10 +311,6 @@ dedisp_error dedisp_execute(const dedisp_plan plan, dedisp_size nsamps, throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { plan->plan_->execute(nsamps, in, in_nbits, out, out_nbits, flags); } catch (...) { @@ -358,7 +322,7 @@ dedisp_error dedisp_execute(const dedisp_plan plan, dedisp_size nsamps, dedisp_error dedisp_sync(void) { try { - cu::checkError(cudaDeviceSynchronize()); + cu::Context::synchronize(); } catch (...) 
{ throw_error(DEDISP_PRIOR_GPU_ERROR); } diff --git a/src/dedisp/dedisperse/DedispKernel.cu b/src/dedisp/dedisperse/DedispKernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..19a2bfafbb1730846c972965233e67cf7d75f1dd --- /dev/null +++ b/src/dedisp/dedisperse/DedispKernel.cu @@ -0,0 +1,97 @@ +#include "DedispKernel.hpp" +#include "GPUKernel.hpp" +#include "dedisperse_kernel.cu" + +#include <cudawrappers/cu.hpp> +#include <iostream> +#include <memory> + +/* + * Helper functions + */ + +/* + * dedisperse routine + */ +bool DedisperseKernel::run( + const dedisp_word *d_in, dedisp_size in_stride, dedisp_size nsamps, + dedisp_size in_nbits, dedisp_size nchans, dedisp_size chan_stride, + const dedisp_float *d_dm_list, dedisp_size dm_count, dedisp_size dm_stride, + dedisp_byte *d_out, dedisp_size out_stride, dedisp_size out_nbits, + dedisp_size batch_size, dedisp_size batch_in_stride, + dedisp_size batch_dm_stride, dedisp_size batch_chan_stride, + dedisp_size batch_out_stride, const void *delay_table, + const size_t delay_table_size, const void *killmask, + const size_t killmask_size, cu::Stream &htodstream, + cu::Stream &executestream) { + // Define thread decomposition + // Note: Block dimensions x and y represent time samples and DMs respectively + dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); + // Note: Grid dimension x represents time samples. Dimension y represents + // DMs and batch jobs flattened together. + + // Divide and round up + dedisp_size nsamp_blocks = + (nsamps - 1) / ((dedisp_size)DEDISP_SAMPS_PER_THREAD * block.x) + 1; + dedisp_size ndm_blocks = (dm_count - 1) / (dedisp_size)block.y + 1; + + // Constrain the grid size to the maximum allowed + // TODO: Consider cropping the batch size dimension instead and looping over + // it inside the kernel + ndm_blocks = min((unsigned int)ndm_blocks, + (unsigned int)(MAX_CUDA_GRID_SIZE_Y / batch_size)); + + // Note: We combine the DM and batch dimensions into one + dim3 grid(nsamp_blocks, ndm_blocks * batch_size); + + // Divide and round up + dedisp_size nsamps_reduced = (nsamps - 1) / DEDISP_SAMPS_PER_THREAD + 1; + + const std::vector<const void *> parameters = {&d_in, + &nsamps, + &nsamps_reduced, + &nsamp_blocks, + &in_stride, + &dm_count, + &dm_stride, + &ndm_blocks, + &nchans, + &chan_stride, + &d_out, + &out_nbits, + &out_stride, + &d_dm_list, + &batch_in_stride, + &batch_dm_stride, + &batch_chan_stride, + &batch_out_stride}; + + std::ostringstream ss; + ss << "-DPARAM_IN_NBITS=" << in_nbits; + + CompiledKernelInfo kernel; + bool did_recompile; + + std::tie(kernel, did_recompile) = compile({ss.str()}); + assertCompiled(kernel); + + // Copy delay table and killmask + if (did_recompile) { + if (delay_table) { + htodstream.memcpyHtoDAsync(kernel.module->getGlobal("c_delay_table"), + delay_table, delay_table_size); + htodstream.synchronize(); + } + + if (killmask) { + htodstream.memcpyHtoDAsync(kernel.module->getGlobal("c_killmask"), + killmask, killmask_size); + htodstream.synchronize(); + } + } + + executestream.launchKernel(*(kernel.function), grid.x, grid.y, grid.z, + block.x, block.y, block.z, 0, parameters); + + return true; +} diff --git a/src/dedisp/dedisperse/DedispKernel.hpp b/src/dedisp/dedisperse/DedispKernel.hpp new file mode 100644 index 0000000000000000000000000000000000000000..adb248b2a6e00bfb0caee9496a0298d808c9190e --- /dev/null +++ b/src/dedisp/dedisperse/DedispKernel.hpp @@ -0,0 +1,47 @@ +#ifndef DEDISP_DEDISP_DEDISPERSE_DEDISP_KERNEL_HPP_ +#define DEDISP_DEDISP_DEDISPERSE_DEDISP_KERNEL_HPP_ + 
+#include <string> + +#include <cudawrappers/cu.hpp> + +#include "common/dedisp_types.h" +#include <GPUKernel.hpp> + +extern const char _binary_src_dedisp_dedisperse_dedisperse_kernel_cu_start, + _binary_src_dedisp_dedisperse_dedisperse_kernel_cu_end; + +class DedisperseKernel : public GPUKernel { +public: + DedisperseKernel() + : GPUKernel( + "dedisperse_kernel.cu", "dedisperse_kernel", + std::string( + reinterpret_cast<const char *>( + &_binary_src_dedisp_dedisperse_dedisperse_kernel_cu_start), + reinterpret_cast<const char *>( + &_binary_src_dedisp_dedisperse_dedisperse_kernel_cu_end))) { + } + + bool run(const dedisp_word *d_in, dedisp_size in_stride, dedisp_size nsamps, + dedisp_size in_nbits, dedisp_size nchans, dedisp_size chan_stride, + const dedisp_float *d_dm_list, dedisp_size dm_count, + dedisp_size dm_stride, dedisp_byte *d_out, dedisp_size out_stride, + dedisp_size out_nbits, dedisp_size batch_size, + dedisp_size batch_in_stride, dedisp_size batch_dm_stride, + dedisp_size batch_chan_stride, dedisp_size batch_out_stride, + const void *delay_table, const size_t delay_table_size, + const void *killmask, const size_t killmask_size, + cu::Stream &h2dstream, cu::Stream &stream); + +private: + const void *delay_table_src_host_ = nullptr; + size_t delay_table_count_ = 0; + size_t delay_table_offset_ = 0; + + const void *killmask_src_host_ = nullptr; + size_t killmask_count_ = 0; + size_t killmask_offset_ = 0; +}; + +#endif // DEDISP_DEDISP_DEDISPERSE_DEDISP_KERNEL_HPP_ \ No newline at end of file diff --git a/src/dedisp/dedisperse/dedisp_constants.cuh b/src/dedisp/dedisperse/dedisp_constants.cuh new file mode 100644 index 0000000000000000000000000000000000000000..8f928843820537e6645bcf64f35045e0a81808e0 --- /dev/null +++ b/src/dedisp/dedisperse/dedisp_constants.cuh @@ -0,0 +1,23 @@ +#ifndef DEDISP_DEDISP_DEDISPERSE_CONSTANTS_CUH_ +#define DEDISP_DEDISP_DEDISPERSE_CONSTANTS_CUH_ + +#define DEDISP_DEFAULT_GULP_SIZE 65536u // 131072 + +// TODO: Make sure this doesn't limit GPU constant memory +// available to users. +#define DEDISP_MAX_NCHANS 8192u + +// Kernel tuning parameters +#define DEDISP_BLOCK_SIZE 256u +#define DEDISP_BLOCK_SAMPS 8u +#define DEDISP_SAMPS_PER_THREAD 2u // 4 is better for Fermi? + +#define BITS_PER_BYTE 8 +#define BYTES_PER_WORD (sizeof(dedisp_word) / sizeof(unsigned char)) +#define MAX_CUDA_GRID_SIZE_X 65535u +#define MAX_CUDA_GRID_SIZE_Y 65535u + +constexpr unsigned int BLOCK_DIM_X = DEDISP_BLOCK_SAMPS; +constexpr unsigned int BLOCK_DIM_Y = (DEDISP_BLOCK_SIZE / DEDISP_BLOCK_SAMPS); + +#endif // DEDISP_DEDISP_DEDISPERSE_CONSTANTS_CUH_ \ No newline at end of file diff --git a/src/dedisp/dedisperse/dedisperse.cu b/src/dedisp/dedisperse/dedisperse.cu deleted file mode 100644 index 09da8df6186a4aa18425649472be12debf362f8a..0000000000000000000000000000000000000000 --- a/src/dedisp/dedisperse/dedisperse.cu +++ /dev/null @@ -1,123 +0,0 @@ -#include "dedisperse.h" -#include "dedisperse_kernel.cuh" - -// Kernel tuning parameters -#define DEDISP_BLOCK_SIZE 256 -#define DEDISP_BLOCK_SAMPS 8 -#define DEDISP_SAMPS_PER_THREAD 2 // 4 is better for Fermi? 
- -/* - * Helper functions - */ -void copy_delay_table(const void *src, size_t count, size_t offset, - cudaStream_t stream) { - cudaMemcpyToSymbolAsync(c_delay_table, src, count, offset, - cudaMemcpyDeviceToDevice, stream); - cudaDeviceSynchronize(); - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) { - throw_error(DEDISP_MEM_COPY_FAILED); - } -} - -void copy_killmask(const void *src, size_t count, size_t offset, - cudaStream_t stream) { - cudaMemcpyToSymbolAsync(c_killmask, src, count, offset, - cudaMemcpyDeviceToDevice, stream); - cudaDeviceSynchronize(); - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) { - throw_error(DEDISP_MEM_COPY_FAILED); - } -} - -/* - * dedisperse routine - */ -bool dedisperse(const dedisp_word *d_in, dedisp_size in_stride, - dedisp_size nsamps, dedisp_size in_nbits, dedisp_size nchans, - dedisp_size chan_stride, const dedisp_float *d_dm_list, - dedisp_size dm_count, dedisp_size dm_stride, dedisp_byte *d_out, - dedisp_size out_stride, dedisp_size out_nbits, - dedisp_size batch_size, dedisp_size batch_in_stride, - dedisp_size batch_dm_stride, dedisp_size batch_chan_stride, - dedisp_size batch_out_stride) { - enum { - BITS_PER_BYTE = 8, - BYTES_PER_WORD = sizeof(dedisp_word) / sizeof(dedisp_byte), - BLOCK_DIM_X = DEDISP_BLOCK_SAMPS, - BLOCK_DIM_Y = DEDISP_BLOCK_SIZE / DEDISP_BLOCK_SAMPS, - MAX_CUDA_GRID_SIZE_X = 65535, - MAX_CUDA_GRID_SIZE_Y = 65535, - }; - - // Define thread decomposition - // Note: Block dimensions x and y represent time samples and DMs respectively - dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); - // Note: Grid dimension x represents time samples. Dimension y represents - // DMs and batch jobs flattened together. - - // Divide and round up - dedisp_size nsamp_blocks = - (nsamps - 1) / ((dedisp_size)DEDISP_SAMPS_PER_THREAD * block.x) + 1; - dedisp_size ndm_blocks = (dm_count - 1) / (dedisp_size)block.y + 1; - - // Constrain the grid size to the maximum allowed - // TODO: Consider cropping the batch size dimension instead and looping over - // it - // inside the kernel - ndm_blocks = min((unsigned int)ndm_blocks, - (unsigned int)(MAX_CUDA_GRID_SIZE_Y / batch_size)); - - // Note: We combine the DM and batch dimensions into one - dim3 grid(nsamp_blocks, ndm_blocks * batch_size); - - // Divide and round up - dedisp_size nsamps_reduced = (nsamps - 1) / DEDISP_SAMPS_PER_THREAD + 1; - - cudaStream_t stream = 0; - - // Execute the kernel -#define DEDISP_CALL_KERNEL(NBITS) \ - dedisperse_kernel<NBITS, DEDISP_SAMPS_PER_THREAD, BLOCK_DIM_X, BLOCK_DIM_Y> \ - <<<grid, block, 0, stream>>>( \ - d_in, nsamps, nsamps_reduced, nsamp_blocks, in_stride, dm_count, \ - dm_stride, ndm_blocks, nchans, chan_stride, d_out, out_nbits, \ - out_stride, d_dm_list, batch_in_stride, batch_dm_stride, \ - batch_chan_stride, batch_out_stride) - // Note: Here we dispatch dynamically on nbits for supported values - switch (in_nbits) { - case 1: - DEDISP_CALL_KERNEL(1); - break; - case 2: - DEDISP_CALL_KERNEL(2); - break; - case 4: - DEDISP_CALL_KERNEL(4); - break; - case 8: - DEDISP_CALL_KERNEL(8); - break; - case 16: - DEDISP_CALL_KERNEL(16); - break; - case 32: - DEDISP_CALL_KERNEL(32); - break; - default: /* should never be reached */ - break; - } -#undef DEDISP_CALL_KERNEL - - // Check for kernel errors -#ifdef DEDISP_DEBUG - cudaDeviceSynchronize(); - cudaError_t cuda_error = cudaGetLastError(); - if (cuda_error != cudaSuccess) { - return false; - } -#endif // DEDISP_DEBUG - - return true; -} diff --git a/src/dedisp/dedisperse/dedisperse.h 
b/src/dedisp/dedisperse/dedisperse.h deleted file mode 100644 index 30e2ddd574260cdd8ce5237bec933a1d67edbf7f..0000000000000000000000000000000000000000 --- a/src/dedisp/dedisperse/dedisperse.h +++ /dev/null @@ -1,26 +0,0 @@ -#include "common/dedisp_error.hpp" -#include "common/dedisp_types.h" - -#define DEDISP_DEFAULT_GULP_SIZE 65536 // 131072 - -// TODO: Make sure this doesn't limit GPU constant memory -// available to users. -#define DEDISP_MAX_NCHANS 8192 - -// Kernel tuning parameters -#define DEDISP_SAMPS_PER_THREAD 2 // 4 is better for Fermi? - -void copy_delay_table(const void *src, size_t count, size_t offset, - cudaStream_t stream); - -void copy_killmask(const void *src, size_t count, size_t offset, - cudaStream_t stream); - -bool dedisperse(const dedisp_word *d_in, dedisp_size in_stride, - dedisp_size nsamps, dedisp_size in_nbits, dedisp_size nchans, - dedisp_size chan_stride, const dedisp_float *d_dm_list, - dedisp_size dm_count, dedisp_size dm_stride, dedisp_byte *d_out, - dedisp_size out_stride, dedisp_size out_nbits, - dedisp_size batch_size, dedisp_size batch_in_stride, - dedisp_size batch_dm_stride, dedisp_size batch_chan_stride, - dedisp_size batch_out_stride); \ No newline at end of file diff --git a/src/dedisp/dedisperse/dedisperse_kernel.cuh b/src/dedisp/dedisperse/dedisperse_kernel.cu similarity index 85% rename from src/dedisp/dedisperse/dedisperse_kernel.cuh rename to src/dedisp/dedisperse/dedisperse_kernel.cu index 1dfbd13af0ffbee10e7163522f2db28ab0b5df2e..cf7256bd117827e39e597b862f78babf66d42b80 100644 --- a/src/dedisp/dedisperse/dedisperse_kernel.cuh +++ b/src/dedisp/dedisperse/dedisperse_kernel.cu @@ -1,3 +1,6 @@ +#include "common/dedisp_types.h" +#include "dedisp_constants.cuh" + // Constant reference for input data __constant__ dedisp_float c_delay_table[DEDISP_MAX_NCHANS]; __constant__ dedisp_bool c_killmask[DEDISP_MAX_NCHANS]; @@ -25,7 +28,6 @@ inline __host__ __device__ T extract_subword(T value, int idx) { template <int IN_NBITS, typename T, typename SumType> inline __host__ __device__ T scale_output(SumType sum, dedisp_size nchans) { - enum { BITS_PER_BYTE = 8 }; // This emulates dedisperse_all, but is specific to 8-bit output float in_range = max_value<IN_NBITS>::value; // Note: We use floats when out_nbits == 32, and scale to a range of [0:1] @@ -59,7 +61,7 @@ inline __host__ __device__ void set_out_val(dedisp_byte *d_out, dedisp_size idx, // E.g., Words bracketed: (t0c0,t0c1,t0c2,t0c3), (t1c0,t1c1,t1c2,t1c3),... 
// Note: out_stride should be in units of samples template <int IN_NBITS, int SAMPS_PER_THREAD, int BLOCK_DIM_X, int BLOCK_DIM_Y> -__global__ void dedisperse_kernel( +__device__ inline void dedisperse_kernel_impl( const dedisp_word *d_in, dedisp_size nsamps, dedisp_size nsamps_reduced, dedisp_size nsamp_blocks, dedisp_size stride, dedisp_size dm_count, dedisp_size dm_stride, dedisp_size ndm_blocks, dedisp_size nchans, @@ -68,10 +70,7 @@ __global__ void dedisperse_kernel( dedisp_size batch_in_stride, dedisp_size batch_dm_stride, dedisp_size batch_chan_stride, dedisp_size batch_out_stride) { // Compute compile-time constants - enum { - BITS_PER_BYTE = 8, - CHANS_PER_WORD = sizeof(dedisp_word) * BITS_PER_BYTE / IN_NBITS - }; + enum { CHANS_PER_WORD = sizeof(dedisp_word) * BITS_PER_BYTE / IN_NBITS }; // Compute the thread decomposition dedisp_size samp_block = blockIdx.x; @@ -170,4 +169,25 @@ __global__ void dedisperse_kernel( } // End of sample loop } // End of DM loop +} + +extern "C" { +#ifndef PARAM_IN_NBITS +#define PARAM_IN_NBITS 16 +#endif + +__global__ void dedisperse_kernel( + const dedisp_word *d_in, dedisp_size nsamps, dedisp_size nsamps_reduced, + dedisp_size nsamp_blocks, dedisp_size stride, dedisp_size dm_count, + dedisp_size dm_stride, dedisp_size ndm_blocks, dedisp_size nchans, + dedisp_size chan_stride, dedisp_byte *d_out, dedisp_size out_nbits, + dedisp_size out_stride, const dedisp_float *d_dm_list, + dedisp_size batch_in_stride, dedisp_size batch_dm_stride, + dedisp_size batch_chan_stride, dedisp_size batch_out_stride) { + dedisperse_kernel_impl<PARAM_IN_NBITS, DEDISP_SAMPS_PER_THREAD, BLOCK_DIM_X, + BLOCK_DIM_Y>( + d_in, nsamps, nsamps_reduced, nsamp_blocks, stride, dm_count, dm_stride, + ndm_blocks, nchans, chan_stride, d_out, out_nbits, out_stride, d_dm_list, + batch_in_stride, batch_dm_stride, batch_chan_stride, batch_out_stride); +} } \ No newline at end of file diff --git a/src/dedisp/transpose/TransposeKernel.cu b/src/dedisp/transpose/TransposeKernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..964a8e87c2cee73a8f6079cf4d6ed1e6db00850d --- /dev/null +++ b/src/dedisp/transpose/TransposeKernel.cu @@ -0,0 +1,146 @@ +/* + * This file contains a CUDA implementation of the array transpose operation. + * + * Parts of this file are based on the transpose implementation in the + * NVIDIA CUDA SDK. + */ + +#include <cmath> +#include <memory> +#include <stdexcept> + +#include <cudawrappers/cu.hpp> + +#include "GPUKernel.hpp" +#include "TransposeKernel.hpp" +#include "transpose_kernel.cu" + +namespace cuda_specs { +enum { MAX_GRID_DIMENSION = 65535 }; +} + +template <typename U> inline U min(const U &a, const U &b) { + return a < b ? 
a : b; +} + +template <typename U> inline U round_up_pow2(const U &a) { + U r = a - 1; + for (unsigned long i = 1; i <= sizeof(U) * 8 / 2; i <<= 1) + r |= r >> i; + return r + 1; +} + +template <typename U> inline U round_down_pow2(const U &a) { + return round_up_pow2(a + 1) / 2; +} + +inline unsigned int log2(unsigned int a) { + unsigned int r; + unsigned int shift; + r = (a > 0xFFFF) << 4; + a >>= r; + shift = (a > 0xFF) << 3; + a >>= shift; + r |= shift; + shift = (a > 0xF) << 2; + a >>= shift; + r |= shift; + shift = (a > 0x3) << 1; + a >>= shift; + r |= shift; + r |= (a >> 1); + return r; +} + +inline unsigned long log2(unsigned long a) { + unsigned long r; + unsigned long shift; + r = (a > 0xFFFFFFFF) << 5; + a >>= r; + shift = (a > 0xFFFF) << 4; + a >>= shift; + r |= shift; + shift = (a > 0xFF) << 3; + a >>= shift; + r |= shift; + shift = (a > 0xF) << 2; + a >>= shift; + r |= shift; + shift = (a > 0x3) << 1; + a >>= shift; + r |= shift; + r |= (a >> 1); + return r; +} + +void TransposeKernel::run(cu::DeviceMemory &in, size_t width, size_t height, + size_t in_stride, size_t out_stride, + cu::DeviceMemory &out, cu::Stream &stream) { + // Specify thread decomposition (uses up-rounded divisions) + dim3 tot_block_count((width - 1) / TILE_DIM + 1, (height - 1) / TILE_DIM + 1); + + size_t max_grid_dim = round_down_pow2((size_t)cuda_specs::MAX_GRID_DIMENSION); + + // Partition the grid into chunks that the GPU can accept at once + for (size_t block_y_offset = 0; block_y_offset < tot_block_count.y; + block_y_offset += max_grid_dim) { + dim3 block_count; + + // Handle the possibly incomplete final grid + block_count.y = min(max_grid_dim, tot_block_count.y - block_y_offset); + + for (size_t block_x_offset = 0; block_x_offset < tot_block_count.x; + block_x_offset += max_grid_dim) { + // Handle the possibly incomplete final grid + block_count.x = min(max_grid_dim, tot_block_count.x - block_x_offset); + + // Compute the chunked parameters + size_t x_offset = block_x_offset * TILE_DIM; + size_t y_offset = block_y_offset * TILE_DIM; + size_t in_offset = x_offset + y_offset * in_stride; + size_t out_offset = y_offset + x_offset * out_stride; + size_t w = min(max_grid_dim * TILE_DIM, width - x_offset); + size_t h = min(max_grid_dim * TILE_DIM, height - y_offset); + + dim3 block(TILE_DIM, BLOCK_ROWS); + + // TODO: + // Unfortunately there are cases where rounding to a power of 2 + // becomes detrimental to performance. Could work out a heuristic. + bool round_grid_to_pow2 = true; + + unsigned int *in_ptr = reinterpret_cast<unsigned int *>( + static_cast<dedisp_word *>(in) + (in_offset)); + unsigned int *out_ptr = reinterpret_cast<unsigned int *>( + static_cast<dedisp_word *>(out) + (out_offset)); + + // Dispatch on grid-rounding + const dim3 grid = round_grid_to_pow2 ? 
dim3(round_up_pow2(block_count.x), + round_up_pow2(block_count.y)) + : dim3(block_count.x, block_count.y); + + const auto grid_y_log2 = log2(grid.y); + const std::vector<const void *> parameters = {&in_ptr, + &w, + &h, + &in_stride, + &out_stride, + &out_ptr, + &block_count.x, + &block_count.y, + &grid_y_log2}; + std::ostringstream ss; + ss << "-DPARAM_GRID_IS_POW2=" << round_grid_to_pow2; + + CompiledKernelInfo kernel; + bool did_recompile; + + std::tie(kernel, did_recompile) = compile({ss.str()}); + assertCompiled(kernel); + + // Run the CUDA kernel + stream.launchKernel(*(kernel.function), grid.x, grid.y, grid.z, block.x, + block.y, block.z, 0, parameters); + } + } +} \ No newline at end of file diff --git a/src/dedisp/transpose/TransposeKernel.hpp b/src/dedisp/transpose/TransposeKernel.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3733508cf606f19a041b20684ea9f6274148d06b --- /dev/null +++ b/src/dedisp/transpose/TransposeKernel.hpp @@ -0,0 +1,27 @@ +#ifndef DEDISP_DEDISP_DEDISPERSE_TRANSPOSE_KERNEL_H_ +#define DEDISP_DEDISP_DEDISPERSE_TRANSPOSE_KERNEL_H_ + +#include <cudawrappers/cu.hpp> + +#include "common/dedisp_types.h" +#include <GPUKernel.hpp> + +extern const char _binary_src_dedisp_transpose_transpose_kernel_cu_start, + _binary_src_dedisp_transpose_transpose_kernel_cu_end; + +class TransposeKernel : public GPUKernel { +public: + TransposeKernel() + : GPUKernel( + "transpose_kernel.cu", "transpose_kernel_grid", + std::string( + reinterpret_cast<const char *>( + &_binary_src_dedisp_transpose_transpose_kernel_cu_start), + reinterpret_cast<const char *>( + &_binary_src_dedisp_transpose_transpose_kernel_cu_end))) {} + + void run(cu::DeviceMemory &in, size_t width, size_t height, size_t in_stride, + size_t out_stride, cu::DeviceMemory &out, cu::Stream &stream); +}; + +#endif // DEDISP_DEDISP_DEDISPERSE_TRANSPOSE_KERNEL_H_ \ No newline at end of file diff --git a/src/dedisp/transpose/transpose.cu b/src/dedisp/transpose/transpose.cu deleted file mode 100644 index ee37762610a1d09aa96e250476e5bd4433832417..0000000000000000000000000000000000000000 --- a/src/dedisp/transpose/transpose.cu +++ /dev/null @@ -1,21 +0,0 @@ -/* - * This header files contains the implementation of the Transpose<T> class - */ -#include "transpose.cuh" - -/* - * The generic Transpose<T>::transpose method - */ -template <typename T> -void transpose(const T *in, size_t width, size_t height, size_t in_stride, - size_t out_stride, T *out, cudaStream_t stream = 0) { - Transpose<T> transpose; - transpose.transpose(in, width, height, in_stride, out_stride, out, stream); -} - -/* - * Explicit template instantations - */ -template void transpose<unsigned int>(const unsigned int *, size_t, size_t, - size_t, size_t, unsigned int *, - cudaStream_t); \ No newline at end of file diff --git a/src/dedisp/transpose/transpose.cuh b/src/dedisp/transpose/transpose.cuh deleted file mode 100644 index 15a2a797b3051431dc703918d629ee718660b9ef..0000000000000000000000000000000000000000 --- a/src/dedisp/transpose/transpose.cuh +++ /dev/null @@ -1,163 +0,0 @@ -/* - * This file contains a CUDA implementation of the array transpose operation. - * - * Parts of this file are based on the transpose implementation in the - * NVIDIA CUDA SDK. 
- */ - -#pragma once - -#include <stdexcept> - -#include "transpose_kernel.cuh" - -namespace cuda_specs { -enum { MAX_GRID_DIMENSION = 65535 }; -} - -template <typename T> struct Transpose { - Transpose() {} - - void transpose(const T *in, size_t width, size_t height, size_t in_stride, - size_t out_stride, T *out, cudaStream_t stream = 0); - void transpose(const T *in, size_t width, size_t height, T *out, - cudaStream_t stream = 0) { - transpose(in, width, height, width, height, out, stream); - } - -private: - // TODO: These should probably be imported from somewhere else - template <typename U> inline U min(const U &a, const U &b) { - return a < b ? a : b; - } - template <typename U> inline U round_up_pow2(const U &a) { - U r = a - 1; - for (unsigned long i = 1; i <= sizeof(U) * 8 / 2; i <<= 1) - r |= r >> i; - return r + 1; - } - template <typename U> inline U round_down_pow2(const U &a) { - return round_up_pow2(a + 1) / 2; - } - inline unsigned int log2(unsigned int a) { - unsigned int r; - unsigned int shift; - r = (a > 0xFFFF) << 4; - a >>= r; - shift = (a > 0xFF) << 3; - a >>= shift; - r |= shift; - shift = (a > 0xF) << 2; - a >>= shift; - r |= shift; - shift = (a > 0x3) << 1; - a >>= shift; - r |= shift; - r |= (a >> 1); - return r; - } - inline unsigned long log2(unsigned long a) { - unsigned long r; - unsigned long shift; - r = (a > 0xFFFFFFFF) << 5; - a >>= r; - shift = (a > 0xFFFF) << 4; - a >>= shift; - r |= shift; - shift = (a > 0xFF) << 3; - a >>= shift; - r |= shift; - shift = (a > 0xF) << 2; - a >>= shift; - r |= shift; - shift = (a > 0x3) << 1; - a >>= shift; - r |= shift; - r |= (a >> 1); - return r; - } -}; - -template <typename T> -void Transpose<T>::transpose(const T *in, size_t width, size_t height, - size_t in_stride, size_t out_stride, T *out, - cudaStream_t stream) { - // Parameter checks - if (0 == width || 0 == height) { - return; - } - if (0 == in) { - throw std::runtime_error("Transpose: in is NULL"); - } - if (0 == out) { - throw std::runtime_error("Transpose: out is NULL"); - } - if (width > in_stride) { - throw std::runtime_error("Transpose: width exceeds in_stride"); - } - if (height > out_stride) { - throw std::runtime_error("Transpose: height exceeds out_stride"); - } - - // Specify thread decomposition (uses up-rounded divisions) - dim3 tot_block_count((width - 1) / TILE_DIM + 1, (height - 1) / TILE_DIM + 1); - - size_t max_grid_dim = round_down_pow2((size_t)cuda_specs::MAX_GRID_DIMENSION); - - // Partition the grid into chunks that the GPU can accept at once - for (size_t block_y_offset = 0; block_y_offset < tot_block_count.y; - block_y_offset += max_grid_dim) { - dim3 block_count; - - // Handle the possibly incomplete final grid - block_count.y = min(max_grid_dim, tot_block_count.y - block_y_offset); - - for (size_t block_x_offset = 0; block_x_offset < tot_block_count.x; - block_x_offset += max_grid_dim) { - // Handle the possibly incomplete final grid - block_count.x = min(max_grid_dim, tot_block_count.x - block_x_offset); - - // Compute the chunked parameters - size_t x_offset = block_x_offset * TILE_DIM; - size_t y_offset = block_y_offset * TILE_DIM; - size_t in_offset = x_offset + y_offset * in_stride; - size_t out_offset = y_offset + x_offset * out_stride; - size_t w = min(max_grid_dim * TILE_DIM, width - x_offset); - size_t h = min(max_grid_dim * TILE_DIM, height - y_offset); - - dim3 block(TILE_DIM, BLOCK_ROWS); - - // TODO: - // Unfortunately there are cases where rounding to a power of 2 - // becomes detrimental to performance. 
Could work out a heuristic. - bool round_grid_to_pow2 = true; - - // Dispatch on grid-rounding - if (round_grid_to_pow2) { - dim3 grid(round_up_pow2(block_count.x), round_up_pow2(block_count.y)); - - // Run the CUDA kernel - transpose_kernel<true, T><<<grid, block, 0, stream>>>( - in + in_offset, w, h, in_stride, out_stride, out + out_offset, - block_count.x, block_count.y, log2(grid.y)); - } else { - dim3 grid(block_count.x, block_count.y); - - // Run the CUDA kernel - transpose_kernel<false, T><<<grid, block, 0, stream>>>( - in + in_offset, w, h, in_stride, out_stride, out + out_offset, - block_count.x, block_count.y, log2(grid.y)); - } - -#ifndef NDEBUG - cudaStreamSynchronize(stream); - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) { - throw std::runtime_error( - std::string("Transpose: CUDA error in kernel: ") + - cudaGetErrorString(error)); - } -#endif - } - } -} \ No newline at end of file diff --git a/src/dedisp/transpose/transpose.hpp b/src/dedisp/transpose/transpose.hpp deleted file mode 100644 index 862ba667adf3c4687dbf6ef2500636525fde18ff..0000000000000000000000000000000000000000 --- a/src/dedisp/transpose/transpose.hpp +++ /dev/null @@ -1,9 +0,0 @@ -/* - * This file exposes the generic Transpose<T>::transpose method - * - * Explicit template instantiations for this method are defined in transpose.cu - */ - -template <typename T> -void transpose(const T *in, size_t width, size_t height, size_t in_stride, - size_t out_stride, T *out, cudaStream_t stream = 0); \ No newline at end of file diff --git a/src/dedisp/transpose/transpose_kernel.cuh b/src/dedisp/transpose/transpose_kernel.cu similarity index 66% rename from src/dedisp/transpose/transpose_kernel.cuh rename to src/dedisp/transpose/transpose_kernel.cu index 01f401ddaf762f868c70bd1bb76559f1f4c98b6c..615f346f93d1245f38de2670e48b03794c68f539 100644 --- a/src/dedisp/transpose/transpose_kernel.cuh +++ b/src/dedisp/transpose/transpose_kernel.cu @@ -2,20 +2,17 @@ * This file contains the implementation of transpose_kernel */ -#ifndef TRANSPOSE_KERNEL_H_INCLUDE_GUARD -#define TRANSPOSE_KERNEL_H_INCLUDE_GUARD - #define TILE_DIM 32 #define BLOCK_ROWS 8 typedef unsigned int gpu_size_t; template <bool GRID_IS_POW2, typename T> -__global__ void -transpose_kernel(const T *in, gpu_size_t width, gpu_size_t height, - gpu_size_t in_stride, gpu_size_t out_stride, T *out, - gpu_size_t block_count_x, gpu_size_t block_count_y, - gpu_size_t log2_gridDim_y) { +__device__ inline void +transpose_kernel_impl(const T *in, gpu_size_t width, gpu_size_t height, + gpu_size_t in_stride, gpu_size_t out_stride, T *out, + gpu_size_t block_count_x, gpu_size_t block_count_y, + gpu_size_t log2_gridDim_y) { __shared__ T tile[TILE_DIM][TILE_DIM]; gpu_size_t blockIdx_x, blockIdx_y; @@ -73,4 +70,19 @@ transpose_kernel(const T *in, gpu_size_t width, gpu_size_t height, } } -#endif // TRANSPOSE_KERNEL_H_INCLUDE_GUARD \ No newline at end of file +extern "C" { +#ifndef PARAM_GRID_IS_POW2 +#define PARAM_GRID_IS_POW2 false +#endif + +__global__ void transpose_kernel_grid(const unsigned int *in, size_t width, + size_t height, size_t in_stride, + size_t out_stride, unsigned int *out, + size_t block_count_x, + size_t block_count_y, + unsigned int log2_gridDim_y) { + transpose_kernel_impl<PARAM_GRID_IS_POW2>(in, width, height, in_stride, + out_stride, out, block_count_x, + block_count_y, log2_gridDim_y); +} +} \ No newline at end of file diff --git a/src/dedisp/unpack/unpack.cu b/src/dedisp/unpack/UnpackKernel.cu similarity index 87% rename from 
src/dedisp/unpack/unpack.cu rename to src/dedisp/unpack/UnpackKernel.cu index 8353df7cfce298e954353706b7bd918abcbdec89..4f55b35679640c119dc578ae2a5ec48bc81ad502 100644 --- a/src/dedisp/unpack/unpack.cu +++ b/src/dedisp/unpack/UnpackKernel.cu @@ -1,3 +1,5 @@ +#include "UnpackKernel.hpp" + #include <thrust/device_ptr.h> #include <thrust/functional.h> #include <thrust/iterator/counting_iterator.h> @@ -52,9 +54,11 @@ struct unpack_functor : public thrust::unary_function<unsigned int, WordType> { return result; } }; -void unpack(const dedisp_word *d_transposed, dedisp_size nsamps, - dedisp_size nchan_words, dedisp_word *d_unpacked, - dedisp_size in_nbits, dedisp_size out_nbits) { + +void UnpackKernel::run(const dedisp_word *d_transposed, dedisp_size nsamps, + dedisp_size nchan_words, dedisp_word *d_unpacked, + dedisp_size in_nbits, dedisp_size out_nbits, + cu::Stream &) { thrust::device_ptr<dedisp_word> d_unpacked_begin(d_unpacked); dedisp_size expansion = out_nbits / in_nbits; diff --git a/src/dedisp/unpack/UnpackKernel.hpp b/src/dedisp/unpack/UnpackKernel.hpp new file mode 100644 index 0000000000000000000000000000000000000000..12ab25aabd93b5d45fe82c97a7f0e4defd491dda --- /dev/null +++ b/src/dedisp/unpack/UnpackKernel.hpp @@ -0,0 +1,17 @@ +#ifndef DEDISP_DEDISP_DEDISPERSE_UNPACK_KERNEL_HPP_ +#define DEDISP_DEDISP_DEDISPERSE_UNPACK_KERNEL_HPP_ + +#include <cudawrappers/cu.hpp> +#include <string> + +#include "common/dedisp_types.h" +#include <GPUKernel.hpp> + +class UnpackKernel { +public: + void run(const dedisp_word *d_transposed, dedisp_size nsamps, + dedisp_size nchan_words, dedisp_word *d_unpacked, + dedisp_size in_nbits, dedisp_size out_nbits, cu::Stream &); +}; + +#endif // DEDISP_DEDISP_DEDISPERSE_UNPACK_KERNEL_HPP_ \ No newline at end of file diff --git a/src/dedisp/unpack/unpack.h b/src/dedisp/unpack/unpack.h deleted file mode 100644 index 88abc1399b58c8e4515777efeee7807bb588270b..0000000000000000000000000000000000000000 --- a/src/dedisp/unpack/unpack.h +++ /dev/null @@ -1,5 +0,0 @@ -#include "common/dedisp_types.h" - -void unpack(const dedisp_word *d_transposed, dedisp_size nsamps, - dedisp_size nchan_words, dedisp_word *d_unpacked, - dedisp_size in_nbits, dedisp_size out_nbits); diff --git a/src/fdd/CMakeLists.txt b/src/fdd/CMakeLists.txt index cf3f11b546ee49d68fc17c3c2243a7c89a8eaa61..fef95210fa4f5c7de947d689a47133ba9dbe8962 100644 --- a/src/fdd/CMakeLists.txt +++ b/src/fdd/CMakeLists.txt @@ -4,7 +4,7 @@ add_library( FDDGPUPlan.cpp FDDCPUPlan.cpp dedisperse/FDDKernel.cu - unpack/unpack.cu + unpack/UnpackKernel.cu chunk.cpp $<TARGET_OBJECTS:common> $<TARGET_OBJECTS:plan> @@ -14,19 +14,27 @@ pkg_search_module(FFTW REQUIRED fftw3f IMPORTED_TARGET) target_include_directories( fdd - PRIVATE ${CMAKE_SOURCE_DIR}/src - PRIVATE ${CMAKE_SOURCE_DIR}/src/fdd) + PRIVATE "${CMAKE_SOURCE_DIR}/src" + PRIVATE "${CMAKE_SOURCE_DIR}/src/fdd") + +include(GNUInstallDirs) +list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_FULL_LIBDIR}) + +target_compile_options(fdd PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math>) + +target_embed_source(fdd unpack/transpose_unpack_kernel.cu) +target_embed_source(fdd dedisperse/fdd_dedisperse_kernel.cu) +target_embed_source(fdd dedisperse/fdd_scale_kernel.cu) target_link_libraries( fdd - CUDA::cudart - CUDA::cufft - CUDA::nvToolsExt - CUDA::cuda_driver - OpenMP::OpenMP_CXX - PkgConfig::FFTW) - -target_compile_options(fdd PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math>) + PUBLIC cudawrappers::cufft + cudawrappers::cu + cudawrappers::nvtx + cudawrappers::nvrtc + 
OpenMP::OpenMP_CXX + PkgConfig::FFTW + Threads::Threads) set_target_properties( fdd @@ -35,6 +43,15 @@ set_target_properties( VERSION ${DEDISP_VERSION} SOVERSION ${DEDISP_VERSION_MAJOR}) +if(${DEDISP_BACKEND_HIP}) + set_source_files_properties(dedisperse/FDDKernel.cu unpack/UnpackKernel.cu + scale/ScaleKernel.cu PROPERTIES LANGUAGE HIP) +endif() + +if(DEDISP_BENCHMARK_WITH_PMT) + target_link_libraries(fdd PUBLIC pmt) +endif() + install( TARGETS fdd LIBRARY DESTINATION lib diff --git a/src/fdd/FDDCPUPlan.cpp b/src/fdd/FDDCPUPlan.cpp index dc6a949120b8cacae7164bf55544fc7c034ddfa0..206f4034f7a9fce099a5f6fd960fdf87c0681c6a 100644 --- a/src/fdd/FDDCPUPlan.cpp +++ b/src/fdd/FDDCPUPlan.cpp @@ -9,7 +9,6 @@ #include <thread> #include <fftw3.h> -#include <omp.h> #include "common/dedisp_strings.h" #ifdef DEDISP_BENCHMARK diff --git a/src/fdd/FDDCPUPlan.hpp b/src/fdd/FDDCPUPlan.hpp index 7d5fe05979f23379814b00b208ec4a3c32436e65..e1332fc6ffbe8d7c8a5343d50aa32d143f2107f2 100644 --- a/src/fdd/FDDCPUPlan.hpp +++ b/src/fdd/FDDCPUPlan.hpp @@ -1,10 +1,10 @@ -#ifndef H_FDD_CPU_PLAN_INCLUDE_GUARD -#define H_FDD_CPU_PLAN_INCLUDE_GUARD - -#include <vector> +#ifndef DEDISP_FDD_CPU_PLAN_HPP_ +#define DEDISP_FDD_CPU_PLAN_HPP_ #include "Plan.hpp" +#include <stddef.h> + namespace dedisp { class FDDCPUPlan : public Plan { @@ -55,4 +55,4 @@ private: } // end namespace dedisp -#endif // H_FDD_CPU_PLAN_INCLUDE_GUARD \ No newline at end of file +#endif // DEDISP_FDD_CPU_PLAN_HPP_ \ No newline at end of file diff --git a/src/fdd/FDDGPUPlan.cpp b/src/fdd/FDDGPUPlan.cpp index c38e261f8e935c76bb93e09ab40d3276e06df658..c39b4c136345af1bdd378a4205b64dc87df20c27 100644 --- a/src/fdd/FDDGPUPlan.cpp +++ b/src/fdd/FDDGPUPlan.cpp @@ -1,30 +1,37 @@ #include "FDDGPUPlan.hpp" +#include <array> #include <cmath> #include <complex> #include <cstring> #include <iomanip> #include <iostream> +#include <memory> #include <mutex> #include <thread> #include <assert.h> -#include <cufft.h> -#include <omp.h> +#include "GPUPlan.hpp" #include "common/dedisp_strings.h" -#include "dedisperse/FDDKernel.hpp" -#include "unpack/unpack.h" +#include "cudawrappers/cu.hpp" +#include "dedisp_types.h" + #ifdef DEDISP_BENCHMARK #include "external/Stopwatch.h" #endif +#ifdef HAVE_PMT +#include <pmt.h> +#endif + +#include <cudawrappers/cufft.hpp> +#include <cudawrappers/nvtx.hpp> + #include "chunk.h" #include "common/helper.h" #include "helper.h" -#include <cuda_runtime.h> - namespace dedisp { // Constructor @@ -126,12 +133,12 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, // Events, markers, timers cu::Event eStartGPU, eEndGPU; - cu::Marker mAllocMem("Allocate host and device memory", cu::Marker::black); - cu::Marker mCopyMem("Copy CUDA mem to CPU mem", cu::Marker::black); - cu::Marker mPrepFFT("cufft Plan Many", cu::Marker::yellow); - cu::Marker mPrepSpinf("spin Frequency generation", cu::Marker::blue); - cu::Marker mDelayTable("Delay table copy", cu::Marker::black); - cu::Marker mExeGPU("Dedisp fdd execution on GPU", cu::Marker::green); + nvtx::Marker mAllocMem("Allocate host and device memory", + nvtx::Marker::black); + nvtx::Marker mCopyMem("Copy CUDA mem to CPU mem", nvtx::Marker::black); + nvtx::Marker mPrepFFT("cufft Plan Many", nvtx::Marker::yellow); + nvtx::Marker mPrepSpinf("spin Frequency generation", nvtx::Marker::blue); + nvtx::Marker mExeGPU("Dedisp fdd execution on GPU", nvtx::Marker::green); #ifdef DEDISP_BENCHMARK std::unique_ptr<Stopwatch> init_timer(Stopwatch::create()); std::unique_ptr<Stopwatch> 
preprocessing_timer(Stopwatch::create()); @@ -141,6 +148,15 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, std::unique_ptr<Stopwatch> output_timer(Stopwatch::create()); std::unique_ptr<Stopwatch> gpuexec_timer(Stopwatch::create()); std::unique_ptr<Stopwatch> total_timer(Stopwatch::create()); + +#if defined(HAVE_PMT) +#if defined(__HIP__) + auto pmt_sensor = pmt::Create("rocm", m_device->getOrdinal()); +#else + auto pmt_sensor = pmt::Create("nvidia", m_device->getOrdinal()); +#endif +#endif + total_timer->Start(); init_timer->Start(); #endif @@ -150,42 +166,30 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, std::cout << fft_plan_str << std::endl; #endif - CUcontext context = m_device->get_current_context(); - mPrepFFT.start(); - cufftHandle plan_r2c, plan_c2r; int n[] = {(int)nsamp_fft}; - int rnembed[] = {(int)nsamp_padded}; // width in real elements - int cnembed[] = {(int)nsamp_padded / 2}; // width in complex elements + + std::unique_ptr<cufft::FFT1DR2C<CUDA_R_32F>> plan_r2c; + std::unique_ptr<cufft::FFT1DC2R<CUDA_C_32F>> plan_c2r; + + const long long rnembed = nsamp_padded; // width in real elements + const long long cnembed = nsamp_padded / 2; // width in complex elements + std::thread thread_r2c = std::thread([&]() { - m_device->set_context(context); - - cufftResult result = - cufftPlanMany(&plan_r2c, // plan - 1, n, // rank, n - rnembed, 1, rnembed[0], // inembed, istride, idist - cnembed, 1, cnembed[0], // onembed, ostride, odist - CUFFT_R2C, // type - nchan_fft_batch); // batch - if (result != CUFFT_SUCCESS) { - throw std::runtime_error("Error creating real to complex FFT plan."); - } - cufftSetStream(plan_r2c, *executestream); + m_context->setCurrent(); + + plan_r2c = std::make_unique<cufft::FFT1DR2C<CUDA_R_32F>>( + n[0], nchan_fft_batch, rnembed, cnembed); + + plan_r2c->setStream(*executestream); }); std::thread thread_c2r = std::thread([&]() { - m_device->set_context(context); - - cufftResult result = - cufftPlanMany(&plan_c2r, // plan - 1, n, // rank, n - cnembed, 1, cnembed[0], // inembed, istride, idist - rnembed, 1, rnembed[0], // onembed, ostride, odist - CUFFT_C2R, // type - ndm_fft_batch); // batch - if (result != CUFFT_SUCCESS) { - throw std::runtime_error("Error creating complex to real FFT plan."); - } - cufftSetStream(plan_c2r, *executestream); + m_context->setCurrent(); + + plan_c2r = std::make_unique<cufft::FFT1DC2R<CUDA_C_32F>>( + n[0], ndm_fft_batch, cnembed, rnembed); + + plan_c2r->setStream(*executestream); }); // Wait for cuFFT plans to be created @@ -195,6 +199,7 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, if (thread_c2r.joinable()) { thread_c2r.join(); } + mPrepFFT.end(); // Generate spin frequency table @@ -205,8 +210,8 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, mPrepSpinf.end(); // Determine the amount of memory to use - size_t d_memory_total = m_device->get_total_memory(); // in Bytes - size_t d_memory_free = m_device->get_free_memory(); // in Bytes + size_t d_memory_total = m_context->getTotalMemory(); // in Bytes + size_t d_memory_free = m_context->getFreeMemory(); // in Bytes size_t sizeof_data_t_nu = 1ULL * nsamp * nchan_words_gulp * sizeof(dedisp_word); size_t sizeof_data_x_nu = @@ -219,11 +224,11 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, // Subtract the memory usage of any pre-existing device buffers size_t d_memory_in_use = 0; - for (cu::DeviceMemory &d_memory : d_data_t_nu_) { - d_memory_in_use += d_memory.size(); + 
for (auto &d_memory : d_data_t_nu_) { + d_memory_in_use += d_memory->size(); } - for (cu::DeviceMemory &d_memory : d_data_x_dm_) { - d_memory_in_use += d_memory.size(); + for (auto &d_memory : d_data_x_dm_) { + d_memory_in_use += d_memory->size(); } d_memory_free += d_memory_in_use; @@ -295,6 +300,7 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, std::cout << memory_alloc_str << std::endl; #endif mAllocMem.start(); + /* The buffers are used as follows: 1) copy into page-locked buffer: in -> memcpyHtoH -> h_data_t_nu @@ -317,22 +323,27 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, */ h_data_t_nu_.resize(nchan_buffers); h_data_t_dm_.resize(ndm_buffers); + d_data_t_nu_.resize(nchan_buffers); d_data_x_dm_.resize(ndm_buffers); + cu::DeviceMemory d_data_x_nu(sizeof_data_x_nu); + for (unsigned int i = 0; i < nchan_buffers; i++) { - h_data_t_nu_[i].resize(sizeof_data_t_nu); - d_data_t_nu_[i].resize(sizeof_data_t_nu); + h_data_t_nu_[i] = std::make_shared<cu::HostMemory>(sizeof_data_t_nu); + d_data_t_nu_[i] = std::make_shared<cu::DeviceMemory>(sizeof_data_t_nu); } + for (unsigned int i = 0; i < ndm_buffers; i++) { - h_data_t_dm_[i].resize(sizeof_data_x_dm); - d_data_x_dm_[i].resize(sizeof_data_x_dm); + h_data_t_dm_[i] = std::make_shared<cu::HostMemory>(sizeof_data_x_dm); + d_data_x_dm_[i] = std::make_shared<cu::DeviceMemory>(sizeof_data_x_dm); } + mAllocMem.end(); #ifdef DEDISP_DEBUG - size_t d_memory_free_after_malloc = m_device->get_free_memory(); // in Bytes - size_t h_memory_free_after_malloc = get_free_memory(); // in Mbytes + size_t d_memory_free_after_malloc = m_context->getFreeMemory(); // in Bytes + size_t h_memory_free_after_malloc = get_free_memory(); // in Mbytes std::cout << "Device memory free after memory allocations = " << d_memory_free_after_malloc / std::pow(1024, 3) << " Gb" << std::endl; @@ -340,13 +351,6 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, << h_memory_free_after_malloc / std::pow(1024, 1) << " Gb" << std::endl; #endif - - // Initialize FDDKernel - FDDKernel kernel; - mDelayTable.start(); - kernel.copy_delay_table(d_delay_table, m_nchans * sizeof(dedisp_float), 0, - *htodstream); - mDelayTable.end(); #ifdef DEDISP_BENCHMARK init_timer->Pause(); #endif @@ -355,8 +359,8 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, unsigned int ichan_start; unsigned int ichan_end; unsigned int nchan_current; - void *h_in_ptr; - void *d_in_ptr; + std::shared_ptr<cu::HostMemory> h_in_ptr; + std::shared_ptr<cu::DeviceMemory> d_in_ptr; cu::Event inputStart, inputEnd; cu::Event preprocessingStart, preprocessingEnd; cu::Event outputStart, outputEnd; @@ -370,8 +374,8 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, job.ichan_start = job_id == 0 ? 
0 : channel_jobs[job_id - 1].ichan_end; job.nchan_current = std::min(nchan_batch_max, nchan - job.ichan_start); job.ichan_end = job.ichan_start + job.nchan_current; - job.h_in_ptr = h_data_t_nu_[job_id % nchan_buffers]; - job.d_in_ptr = d_data_t_nu_[job_id % nchan_buffers]; + job.h_in_ptr = h_data_t_nu_.at(job_id % nchan_buffers); + job.d_in_ptr = d_data_t_nu_.at(job_id % nchan_buffers); if (job.nchan_current == 0) { channel_jobs.pop_back(); } @@ -383,8 +387,8 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, unsigned int ndm_current; std::mutex cpu_lock; std::mutex gpu_lock; - cu::HostMemory *h_data_t_dm; - cu::DeviceMemory *d_data_x_dm; + std::shared_ptr<cu::HostMemory> h_data_t_dm; + std::shared_ptr<cu::DeviceMemory> d_data_x_dm; cu::Event inputStart, inputEnd; cu::Event dedispersionStart, dedispersionEnd; cu::Event postprocessingStart, postprocessingEnd; @@ -399,8 +403,8 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, job.idm_start = job_id == 0 ? 0 : dm_jobs[job_id - 1].idm_end; job.ndm_current = std::min(ndm_batch_max, ndm - job.idm_start); job.idm_end = job.idm_start + job.ndm_current; - job.h_data_t_dm = &h_data_t_dm_[job_id % ndm_buffers]; - job.d_data_x_dm = &d_data_x_dm_[job_id % ndm_buffers]; + job.h_data_t_dm = h_data_t_dm_.at(job_id % ndm_buffers); + job.d_data_x_dm = d_data_x_dm_.at(job_id % ndm_buffers); if (job.ndm_current == 0) { dm_jobs.pop_back(); } @@ -429,7 +433,7 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, #endif // Copy partial output from pinned memory to output buffer dedisp_size src_stride = 1ULL * nsamp_padded * out_bytes_per_sample; - auto *h_src = dm_job.h_data_t_dm->data(); + const void *h_src = static_cast<const void *>(*dm_job.h_data_t_dm); dedisp_size dst_stride = 1ULL * nsamp_computed * out_bytes_per_sample; dedisp_size dst_offset = 1ULL * dm_job.idm_start * dst_stride; auto *h_dst = (void *)(((size_t)out) + dst_offset); @@ -451,6 +455,12 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, #ifdef DEDISP_DEBUG std::cout << fdd_dedispersion_str << std::endl; #endif + +#ifdef HAVE_PMT + pmt::State pmt_start, pmt_end; + pmt_start = pmt_sensor->Read(); +#endif + htodstream->record(eStartGPU); mExeGPU.start(); @@ -476,54 +486,53 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, if (channel_job_id == 0) { dedisp_size gulp_chan_byte_idx = (channel_job.ichan_start / chans_per_word) * sizeof(dedisp_word); - memcpy2D(channel_job.h_in_ptr, // dst + memcpy2D(*(channel_job.h_in_ptr), // dst dst_stride, // dst width in + gulp_chan_byte_idx, // src src_stride, // src width dst_stride, // width bytes nsamp); // height htodstream->record(channel_job.inputStart); - htodstream->memcpyHtoDAsync(channel_job.d_in_ptr, // dst - channel_job.h_in_ptr, // src - nsamp * dst_stride); // size + htodstream->memcpyHtoDAsync((*channel_job.d_in_ptr), // dst + (*channel_job.h_in_ptr), // src + nsamp * dst_stride); // size htodstream->record(channel_job.inputEnd); } - executestream->waitEvent(channel_job.inputEnd); + executestream->wait(channel_job.inputEnd); // Transpose and upack the data executestream->record(channel_job.preprocessingStart); - transpose_unpack((dedisp_word *)channel_job.d_in_ptr, // d_in - nchan_words_gulp, // input width - nsamp, // input height - nchan_words_gulp, // in_stride - nsamp_padded, // out_stride - d_data_x_nu, // d_out - in_nbits, 32, // in_nbits, out_nbits - 1.0 / nchan, // scale - *executestream); // stream + 
fdd_kernel_unpack.run(*(channel_job.d_in_ptr), // d_in + nchan_words_gulp, // input width + nsamp, // input height + nchan_words_gulp, // in_stride + nsamp_padded, // out_stride + d_data_x_nu, // d_out + in_nbits, 32, // in_nbits, out_nbits + 1.0 / nchan, // scale + *executestream); // stream // Apply zero padding - auto dst_ptr = ((float *)d_data_x_nu.data()) + nsamp; unsigned int nsamp_padding = nsamp_padded - nsamp; - cu::checkError(cudaMemset2DAsync(dst_ptr, // devPtr - nsamp_padded * sizeof(float), // pitch - 0, // value - nsamp_padding * sizeof(float), // width - nchan_batch_max, // height - *executestream)); + + float *dst_ptr = (static_cast<float *>(d_data_x_nu)) + nsamp; + cu::DeviceMemory dev_ptr( + reinterpret_cast<CUdeviceptr>(static_cast<void *>(dst_ptr))); + + executestream->memset2DAsync(dev_ptr, // device ptr + static_cast<unsigned char>(0), // value + nsamp_padded * sizeof(float), // pitch + nsamp_padding * sizeof(float), // width + nchan_batch_max); // height // FFT data (real to complex) along time axis for (unsigned int i = 0; i < nchan_batch_max / nchan_fft_batch; i++) { - cufftReal *idata = (cufftReal *)d_data_x_nu.data() + - i * nsamp_padded * nchan_fft_batch; - cufftComplex *odata = (cufftComplex *)idata; - - cufftResult ret; - if ((ret = cufftExecR2C(plan_r2c, idata, odata)) != CUFFT_SUCCESS) { - std::stringstream ss; - ss << "Error while executing R2C FFT plan, error code: " << ret; - throw std::runtime_error(ss.str()); - } + cu::DeviceMemory idata(reinterpret_cast<CUdeviceptr>( + static_cast<cufftReal *>(d_data_x_nu) + + (i * nsamp_padded * nchan_fft_batch))); + cu::DeviceMemory odata(idata); + + plan_r2c->execute(idata, odata, CUFFT_FORWARD); } executestream->record(channel_job.preprocessingEnd); @@ -550,7 +559,8 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, dm_job_previous.outputEnd.synchronize(); } - dm_job.d_data_x_dm->zero(*executestream); + executestream->zero(*(dm_job.d_data_x_dm), + dm_job.d_data_x_dm->size()); } // Wait for temporary output from previous job to be copied @@ -561,21 +571,25 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, // Dedispersion in frequency domain executestream->record(dm_job.dedispersionStart); - auto d_out = (dedisp_float2 *)dm_job.d_data_x_dm->data(); - kernel.launch(dm_job.ndm_current, // ndm - nfreq, // nfreq - channel_job.nchan_current, // nchan - dt, // dt - d_spin_frequencies, // d_spin_frequencies - d_dm_list, // d_dm_list - d_data_x_nu, // d_in - d_out, // d_out - nsamp_padded / 2, // in stride - nsamp_padded / 2, // out stride - dm_job.idm_start, // idm_start - dm_job.idm_end, // idm_end - channel_job.ichan_start, // ichan_start - *executestream); // stream + + fdd_kernel_dedisperse.run( + dm_job.ndm_current, // ndm + nfreq, // nfreq + channel_job.nchan_current, // nchan + dt, // dt + *d_spin_frequencies, // d_spin_frequencies + *d_dm_list, // d_dm_list + d_data_x_nu, // d_in + *dm_job.d_data_x_dm, // d_out + nsamp_padded / 2, // in stride + nsamp_padded / 2, // out stride + dm_job.idm_start, // idm_start + dm_job.idm_end, // idm_end + channel_job.ichan_start, // ichan_start + h_delay_table.data(), // d_delay_table + h_delay_table.size() * sizeof(dedisp_float), // d_delay_table size + *htodstream, + *executestream); // stream executestream->record(dm_job.dedispersionEnd); } // end for dm_job_id_inner @@ -586,16 +600,16 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, dedisp_size gulp_chan_byte_idx = (channel_job_next.ichan_start / chans_per_word) * 
sizeof(dedisp_word); - memcpy2D(channel_job_next.h_in_ptr, // dst - dst_stride, // dst width - in + gulp_chan_byte_idx, // src - src_stride, // src width - dst_stride, // width bytes - nsamp); // height + memcpy2D(*(channel_job_next.h_in_ptr), // dst + dst_stride, // dst width + in + gulp_chan_byte_idx, // src + src_stride, // src width + dst_stride, // width bytes + nsamp); // height htodstream->record(channel_job_next.inputStart); - htodstream->memcpyHtoDAsync(channel_job_next.d_in_ptr, // dst - channel_job_next.h_in_ptr, // src - nsamp * dst_stride); // size + htodstream->memcpyHtoDAsync(*channel_job_next.d_in_ptr, // dst + *channel_job_next.h_in_ptr, // src + nsamp * dst_stride); // size htodstream->record(channel_job_next.inputEnd); } @@ -644,43 +658,39 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, #endif // Get pointer to DM output data on host and on device dedisp_size dm_stride = 1ULL * nsamp_padded * out_bytes_per_sample; - auto *h_out = dm_job.h_data_t_dm->data(); - auto *d_out = (float *)dm_job.d_data_x_dm->data(); + void *const h_out = (*dm_job.h_data_t_dm); + auto d_out = dm_job.d_data_x_dm; // Fourier transform results back to time domain executestream->record(dm_job.postprocessingStart); for (unsigned int i = 0; i < ndm_batch_max / ndm_fft_batch; i++) { - cufftReal *odata = - (cufftReal *)d_out + i * nsamp_padded * ndm_fft_batch; - cufftComplex *idata = (cufftComplex *)odata; - - cufftResult ret; - if ((ret = cufftExecC2R(plan_c2r, idata, odata)) != CUFFT_SUCCESS) { - std::stringstream ss; - ss << "Error while executing C2R FFT plan, error code: " << ret; - throw std::runtime_error(ss.str()); - } + cu::DeviceMemory odata( + reinterpret_cast<CUdeviceptr>(static_cast<cufftReal *>(*d_out) + + (i * nsamp_padded * ndm_fft_batch))); + cu::DeviceMemory idata = odata; + + plan_c2r->execute(idata, odata, CUFFT_INVERSE); } // FFT scaling - kernel.scale(dm_job.ndm_current, // height - nsamp_padded, // width - nsamp_padded, // stride - 1.0f / nsamp_fft, // scale - d_out, // d_data - *executestream); // stream + fdd_kernel_scale.scale(dm_job.ndm_current, // height + nsamp_padded, // width + nsamp_padded, // stride + 1.0f / nsamp_fft, // scale + *d_out, // d_data + *executestream); // stream executestream->record(dm_job.postprocessingEnd); // Copy output // Output is picked up by (already running) host side thread // and is there copied from CPU pinned to paged memory dm_job.gpu_lock.lock(); - dtohstream->waitEvent(dm_job.postprocessingEnd); + dtohstream->wait(dm_job.postprocessingEnd); dtohstream->record(dm_job.outputStart); dedisp_size size = 1ULL * dm_job.ndm_current * dm_stride; - dtohstream->memcpyDtoHAsync(h_out, // dst - d_out, // src - size); // size + dtohstream->memcpyDtoHAsync(h_out, // dst + *d_out, // src + size); // size dtohstream->record(dm_job.outputEnd); dm_job.cpu_lock.unlock(); } // end for dm_job_id_inner @@ -692,12 +702,17 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, output_thread.join(); } dtohstream->record(eEndGPU); - mExeGPU.end(eEndGPU); + dtohstream->synchronize(); + mExeGPU.end(); #ifdef DEDISP_BENCHMARK total_timer->Pause(); gpuexec_timer->Add(eEndGPU.elapsedTime(eStartGPU)); +#ifdef HAVE_PMT + pmt_end = pmt_sensor->Read(); +#endif + // Accumulate postprocessing time for all dm jobs for (auto &job : dm_jobs) { postprocessing_timer->Add( @@ -732,12 +747,14 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in, << std::endl; std::cout << total_time_str << total_timer->ToString() << " sec." 
<< std::endl; +#ifdef HAVE_PMT + std::cout << pmt_joules_str << pmt::PMT::joules(pmt_start, pmt_end) << " J" + << std::endl; + std::cout << pmt_watts_str << pmt::PMT::watts(pmt_start, pmt_end) << " W" + << std::endl; +#endif std::cout << std::endl; #endif - - // Free FFT plans - cufftDestroy(plan_c2r); - cufftDestroy(plan_r2c); } /* Refer to execute_gpu() above for additional comments on common constructs @@ -851,12 +868,13 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in, // Events, markers, timers cu::Event eStartGPU, eEndGPU; - cu::Marker mAllocMem("Allocate host and device memory", cu::Marker::black); - cu::Marker mCopyMem("Copy CUDA mem to CPU mem", cu::Marker::black); - cu::Marker mPrepFFT("cufft Plan Many", cu::Marker::yellow); - cu::Marker mPrepSpinf("spin Frequency generation", cu::Marker::blue); - cu::Marker mDelayTable("Delay table copy", cu::Marker::black); - cu::Marker mExeGPU("Dedisp fdd execution on GPU", cu::Marker::green); + nvtx::Marker mAllocMem("Allocate host and device memory", + nvtx::Marker::black); + nvtx::Marker mCopyMem("Copy CUDA mem to CPU mem", nvtx::Marker::black); + nvtx::Marker mPrepFFT("cufft Plan Many", nvtx::Marker::yellow); + nvtx::Marker mPrepSpinf("spin Frequency generation", nvtx::Marker::blue); + nvtx::Marker mDelayTable("Delay table copy", nvtx::Marker::black); + nvtx::Marker mExeGPU("Dedisp fdd execution on GPU", nvtx::Marker::green); #ifdef DEDISP_BENCHMARK std::unique_ptr<Stopwatch> init_timer(Stopwatch::create()); std::unique_ptr<Stopwatch> input_timer(Stopwatch::create()); @@ -881,17 +899,21 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in, cu::HostMemory h_data_t_dm(ndm * nsamp_padded * sizeof(float)); cu::DeviceMemory d_data_t_nu(nchan_batch_max * nsamp_padded * sizeof(float)); cu::DeviceMemory d_data_f_nu(nchan_batch_max * nsamp_padded * sizeof(float)); - std::vector<cu::HostMemory> h_data_t_nu_(nchan_buffers); - std::vector<cu::DeviceMemory> d_data_t_nu_(nchan_buffers); - std::vector<cu::DeviceMemory> d_data_f_dm_(ndm_buffers); - std::vector<cu::DeviceMemory> d_data_t_dm_(ndm_buffers); + std::vector<std::unique_ptr<cu::HostMemory>> h_data_t_nu_(nchan_buffers); + std::vector<std::unique_ptr<cu::DeviceMemory>> d_data_t_nu_(nchan_buffers); + std::vector<std::unique_ptr<cu::DeviceMemory>> d_data_f_dm_(ndm_buffers); + std::vector<std::unique_ptr<cu::DeviceMemory>> d_data_t_dm_(ndm_buffers); for (unsigned int i = 0; i < nchan_buffers; i++) { - h_data_t_nu_[i].resize(nsamp * nchan_words_gulp * sizeof(dedisp_word)); - d_data_t_nu_[i].resize(nsamp * nchan_words_gulp * sizeof(dedisp_word)); + h_data_t_nu_[i] = std::make_unique<cu::HostMemory>( + nsamp * nchan_words_gulp * sizeof(dedisp_word)); + d_data_t_nu_[i] = std::make_unique<cu::DeviceMemory>( + nsamp * nchan_words_gulp * sizeof(dedisp_word)); } for (unsigned int i = 0; i < ndm_buffers; i++) { - d_data_f_dm_[i].resize(ndm_batch_max * nsamp_padded * sizeof(float)); - d_data_t_dm_[i].resize(ndm_batch_max * nsamp_padded * sizeof(float)); + d_data_f_dm_[i] = std::make_unique<cu::DeviceMemory>( + ndm_batch_max * nsamp_padded * sizeof(float)); + d_data_t_dm_[i] = std::make_unique<cu::DeviceMemory>( + ndm_batch_max * nsamp_padded * sizeof(float)); } mAllocMem.end(); @@ -900,38 +922,16 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in, std::cout << fft_plan_str << std::endl; #endif mPrepFFT.start(); - cufftHandle plan_r2c, plan_c2r; - int n[] = {(int)nfft}; - std::thread thread_r2c = std::thread([&]() { - int 
inembed[] = {(int)nsamp_good};
-    int onembed[] = {(int)nfreq_chunk_padded};
-    cufftResult result =
-        cufftPlanMany(&plan_r2c,               // plan
-                      1, n,                    // rank, n
-                      inembed, 1, inembed[0],  // inembed, istride, idist
-                      onembed, 1, onembed[0],  // onembed, ostride, odist
-                      CUFFT_R2C,               // type
-                      nchunk);                 // batch
-    if (result != CUFFT_SUCCESS) {
-      throw std::runtime_error("Error creating real to complex FFT plan.");
-    }
-    cufftSetStream(plan_r2c, *executestream);
-  });
-  std::thread thread_c2r = std::thread([&]() {
-    int inembed[] = {(int)nfreq_chunk_padded};
-    int onembed[] = {(int)nfreq_chunk_padded * 2};
-    cufftResult result =
-        cufftPlanMany(&plan_c2r,               // plan
-                      1, n,                    // rank, n
-                      inembed, 1, inembed[0],  // inembed, istride, idist
-                      onembed, 1, onembed[0],  // onembed, ostride, odist
-                      CUFFT_C2R,               // type
-                      nchunk);                 // batch
-    if (result != CUFFT_SUCCESS) {
-      throw std::runtime_error("Error creating complex to real FFT plan.");
-    }
-    cufftSetStream(plan_c2r, *executestream);
-  });
+
+  cufft::FFT1DR2C<CUDA_R_32F> plan_r2c(nfft, nchunk, nsamp_good,
+                                       nfreq_chunk_padded);
+
+  cufft::FFT1DC2R<CUDA_C_32F> plan_c2r(nfft, nchunk, nfreq_chunk_padded,
+                                       nfreq_chunk_padded * 2);
+  std::thread thread_r2c =
+      std::thread([&]() { plan_r2c.setStream(*executestream); });
+  std::thread thread_c2r =
+      std::thread([&]() { plan_c2r.setStream(*executestream); });
 
   // Compute chunks
   std::vector<Chunk> chunks(nchunk);
@@ -957,18 +957,16 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
         chunks, h_spin_frequencies, nfreq_chunk, nfreq_chunk_padded, nfft, dt);
 
     // Copy segmented spin frequencies to the GPU
-    d_spin_frequencies.resize(h_spin_frequencies.size() * sizeof(float));
-    htodstream->memcpyHtoDAsync(d_spin_frequencies, h_spin_frequencies.data(),
-                                d_spin_frequencies.size());
+    if (d_spin_frequencies)
+      d_spin_frequencies.reset();
+
+    d_spin_frequencies = std::make_unique<cu::DeviceMemory>(
+        h_spin_frequencies.size() * sizeof(float));
+    assert(d_spin_frequencies);
+    htodstream->memcpyHtoDAsync(*d_spin_frequencies, h_spin_frequencies.data(),
+                                d_spin_frequencies->size());
   }
   mPrepSpinf.end();
-
-  // Initialize FDDKernel
-  FDDKernel kernel;
-  mDelayTable.start();
-  kernel.copy_delay_table(d_delay_table, m_nchans * sizeof(dedisp_float), 0,
-                          *htodstream);
-  mDelayTable.end();
 #ifdef DEDISP_BENCHMARK
   init_timer->Pause();
 #endif
@@ -992,8 +990,8 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
     job.ichan_start = job_id == 0 ? 0 : channel_jobs[job_id - 1].ichan_end;
     job.nchan_current = std::min(nchan_batch_max, nchan - job.ichan_start);
     job.ichan_end = job.ichan_start + job.nchan_current;
-    job.h_in_ptr = h_data_t_nu_[job_id % nchan_buffers];
-    job.d_in_ptr = d_data_t_nu_[job_id % nchan_buffers];
+    job.h_in_ptr = *(h_data_t_nu_.at(job_id % nchan_buffers));
+    job.d_in_ptr = *(d_data_t_nu_.at(job_id % nchan_buffers));
     if (job.nchan_current == 0) {
       channel_jobs.pop_back();
     }
@@ -1020,8 +1018,8 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
     job.idm_start = job_id == 0 ?
0 : dm_jobs[job_id - 1].idm_end; job.ndm_current = std::min(ndm_batch_max, ndm - job.idm_start); job.idm_end = job.idm_start + job.ndm_current; - job.d_data_f_dm_ptr = d_data_f_dm_[job_id % ndm_buffers]; - job.d_data_t_dm_ptr = d_data_t_dm_[job_id % ndm_buffers]; + job.d_data_f_dm_ptr = *(d_data_f_dm_.at(job_id % ndm_buffers)); + job.d_data_t_dm_ptr = *(d_data_t_dm_.at(job_id % ndm_buffers)); if (job.ndm_current == 0) { dm_jobs.pop_back(); } @@ -1029,6 +1027,7 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in, #ifdef DEDISP_DEBUG std::cout << fdd_dedispersion_str << std::endl; #endif + htodstream->record(eStartGPU); mExeGPU.start(); @@ -1061,42 +1060,52 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in, dst_stride, // width bytes nsamp); // height htodstream->record(channel_job.inputStart); - htodstream->memcpyHtoDAsync(channel_job.d_in_ptr, // dst + + cu::DeviceMemory copy_dest_dev( + reinterpret_cast<CUdeviceptr>(channel_job.d_in_ptr)); + + htodstream->memcpyHtoDAsync(copy_dest_dev, // dst channel_job.h_in_ptr, // src nsamp * dst_stride); // size htodstream->record(channel_job.inputEnd); } - executestream->waitEvent(channel_job.inputEnd); + executestream->wait(channel_job.inputEnd); // Transpose and upack the data executestream->record(channel_job.preprocessingStart); - transpose_unpack((dedisp_word *)channel_job.d_in_ptr, // d_in - nchan_words_gulp, // input width - nsamp, // input height - nchan_words_gulp, // in_stride - nsamp_padded, // out_stride - d_data_t_nu, // d_out - in_nbits, 32, // in_nbits, out_nbits - 1.0 / nchan, // scale - *executestream); // stream + + cu::DeviceMemory unpack_dest( + reinterpret_cast<CUdeviceptr>(channel_job.d_in_ptr)); + + fdd_kernel_unpack.run(unpack_dest, // d_in + nchan_words_gulp, // input width + nsamp, // input height + nchan_words_gulp, // in_stride + nsamp_padded, // out_stride + d_data_t_nu, // d_out + in_nbits, 32, // in_nbits, out_nbits + 1.0 / nchan, // scale + *executestream); // stream // Apply zero padding - auto dst_ptr = ((float *)d_data_t_nu.data()) + nsamp; + cu::DeviceMemory dst_ptr(reinterpret_cast<CUdeviceptr>( + static_cast<float *const>(d_data_t_nu) + nsamp)); unsigned int nsamp_padding = nsamp_padded - nsamp; - cu::checkError(cudaMemset2DAsync(dst_ptr, // devPtr - nsamp_padded * sizeof(float), // pitch - 0, // value - nsamp_padding * sizeof(float), // width - nchan_batch_max, // height - *executestream)); + + executestream->memset2DAsync( + dst_ptr, static_cast<unsigned char>(0), nsamp_padded * sizeof(float), + nsamp_padding * sizeof(float), nchan_batch_max); // FFT data (real to complex) along time axis for (unsigned int ichan = 0; ichan < channel_job.nchan_current; ichan++) { - auto *idata = - (cufftReal *)d_data_t_nu.data() + (1ULL * ichan * nsamp_padded); - auto *odata = (cufftComplex *)d_data_f_nu.data() + - (1ULL * ichan * nsamp_padded / 2); - cufftExecR2C(plan_r2c, idata, odata); + cu::DeviceMemory idata(reinterpret_cast<CUdeviceptr>( + static_cast<cufftReal *>(d_data_t_nu) + + (1ULL * ichan * nsamp_padded))); + cu::DeviceMemory odata(reinterpret_cast<CUdeviceptr>( + static_cast<cufftComplex *>(d_data_f_nu) + + (1ULL * ichan * nsamp_padded / 2))); + + plan_r2c.execute(idata, odata, CUFFT_FORWARD); } executestream->record(channel_job.preprocessingEnd); @@ -1105,10 +1114,10 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in, // Wait for all previous output copies to finish dtohstream->synchronize(); - for (cu::DeviceMemory &d_data_out : 
d_data_f_dm_) { + for (std::unique_ptr<cu::DeviceMemory> &d_data_out : d_data_f_dm_) { // Use executestream to make sure dedispersion // starts only after initializing the output buffer - d_data_out.zero(*executestream); + executestream->zero(*d_data_out, d_data_out->size()); } } @@ -1135,20 +1144,24 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in, // Dedispersion in frequency domain executestream->record(dm_job.dedispersionStart); - kernel.launch(dm_job.ndm_current, // ndm - nfreq, // nfreq - channel_job.nchan_current, // nchan - dt, // dt - d_spin_frequencies, // d_spin_frequencies - d_dm_list, // d_dm_list - d_data_f_nu, // d_in - dm_job.d_data_f_dm_ptr, // d_out - nsamp_padded / 2, // in stride - nsamp_padded / 2, // out stride - dm_job.idm_start, // idm_start - dm_job.idm_end, // idm_end - channel_job.ichan_start, // ichan_start - *executestream); // stream + fdd_kernel_dedisperse.run( + dm_job.ndm_current, // ndm + nfreq, // nfreq + channel_job.nchan_current, // nchan + dt, // dt + *d_spin_frequencies, // d_spin_frequencies + *d_dm_list, // d_dm_list + d_data_f_nu, // d_in + cu::DeviceMemory( + reinterpret_cast<CUdeviceptr>(dm_job.d_data_f_dm_ptr)), // d_out + nsamp_padded / 2, // in stride + nsamp_padded / 2, // out stride + dm_job.idm_start, // idm_start + dm_job.idm_end, // idm_end + channel_job.ichan_start, // ichan_start + h_delay_table.data(), h_delay_table.size() * sizeof(dedisp_float), + *htodstream, + *executestream); // stream executestream->record(dm_job.dedispersionEnd); } // end for dm_job_id_inner @@ -1166,9 +1179,11 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in, dst_stride, // width bytes nsamp); // height htodstream->record(channel_job_next.inputStart); - htodstream->memcpyHtoDAsync(channel_job_next.d_in_ptr, // dst - channel_job_next.h_in_ptr, // src - nsamp * dst_stride); // size + htodstream->memcpyHtoDAsync( + cu::DeviceMemory(reinterpret_cast<CUdeviceptr>( + channel_job_next.d_in_ptr)), // dst + channel_job_next.h_in_ptr, // src + nsamp * dst_stride); // size htodstream->record(channel_job_next.inputEnd); } @@ -1210,41 +1225,42 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in, // Get pointer to DM output data on host and on device dedisp_size dm_stride = nsamp_padded * out_bytes_per_sample; dedisp_size dm_offset = dm_job.idm_start * dm_stride; + auto *h_data_t_dm_ptr = - (void *)(((size_t)h_data_t_dm.data()) + dm_offset); + (void *)(static_cast<char *>(h_data_t_dm) + dm_offset); auto *d_data_f_dm_ptr = (float *)dm_job.d_data_f_dm_ptr; auto *d_data_t_dm_ptr = (float *)dm_job.d_data_t_dm_ptr; // Fourier transform results back to time domain executestream->record(dm_job.postprocessingStart); for (unsigned int idm = 0; idm < dm_job.ndm_current; idm++) { - auto *idata = - (cufftComplex *)d_data_f_dm_ptr + (1ULL * idm * nsamp_padded / 2); - auto *odata = - (cufftReal *)d_data_t_dm_ptr + (1ULL * idm * nsamp_padded); - - cufftResult ret; - if ((ret = cufftExecC2R(plan_c2r, idata, odata)) != CUFFT_SUCCESS) { - std::stringstream ss; - ss << "Error while executing C2R FFT plan, error code: " << ret; - throw std::runtime_error(ss.str()); - } + cu::DeviceMemory idata(reinterpret_cast<CUdeviceptr>( + d_data_f_dm_ptr + (1ULL * idm * nsamp_padded / 2))); + cu::DeviceMemory odata(reinterpret_cast<CUdeviceptr>( + static_cast<cufftReal *>(d_data_t_dm_ptr) + + (1ULL * idm * nsamp_padded))); + + plan_c2r.execute(idata, odata, CUFFT_INVERSE); } // FFT scaling - 
kernel.scale(dm_job.ndm_current, // height - nsamp_padded, // width - nsamp_padded, // stride - 1.0f / nfft, // scale - d_data_t_dm_ptr, // d_data - *executestream); // stream + fdd_kernel_scale.scale(dm_job.ndm_current, // height + nsamp_padded, // width + nsamp_padded, // stride + 1.0f / nfft, // scale + cu::DeviceMemory(reinterpret_cast<CUdeviceptr>( + d_data_t_dm_ptr)), // d_data + *executestream); // stream executestream->record(dm_job.postprocessingEnd); // Copy output - dtohstream->waitEvent(dm_job.postprocessingEnd); + dtohstream->wait(dm_job.postprocessingEnd); dtohstream->record(dm_job.outputStart); + + cu::DeviceMemory dev_ptr( + reinterpret_cast<CUdeviceptr>(static_cast<void *>(d_data_t_dm_ptr))); dtohstream->memcpyDtoHAsync(h_data_t_dm_ptr, // dst - d_data_t_dm_ptr, // src + dev_ptr, // src dm_job.ndm_current * dm_stride); // size dtohstream->record(dm_job.outputEnd); } // end for dm_job_id_inner @@ -1252,7 +1268,7 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in, // Wait for final memory transfer dtohstream->record(eEndGPU); - mExeGPU.end(eEndGPU); + mExeGPU.end(); dtohstream->synchronize(); // Copy output @@ -1263,8 +1279,9 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in, #ifdef DEDISP_BENCHMARK output_timer->Start(); #endif - copy_chunk_output((float *)h_data_t_dm.data(), (float *)out, ndm, nsamp, - nsamp_computed, nsamp_padded, nsamp_good, chunks); + copy_chunk_output(static_cast<float *>(h_data_t_dm), + reinterpret_cast<float *>(out), ndm, nsamp, nsamp_computed, + nsamp_padded, nsamp_good, chunks); #ifdef DEDISP_BENCHMARK output_timer->Pause(); #endif @@ -1305,10 +1322,6 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in, << std::endl; std::cout << std::endl; #endif - - // Free FFT plans - cufftDestroy(plan_c2r); - cufftDestroy(plan_r2c); } // Private helper function @@ -1322,10 +1335,15 @@ void FDDGPUPlan::generate_spin_frequency_table(dedisp_size nfreq, h_spin_frequencies[ifreq] = ifreq * (1.0 / (nsamp * dt)); } - d_spin_frequencies.resize(nfreq * sizeof(dedisp_float)); + if (d_spin_frequencies) + d_spin_frequencies.reset(); + + d_spin_frequencies = + std::make_unique<cu::DeviceMemory>(nfreq * sizeof(dedisp_float)); + assert(d_spin_frequencies); - htodstream->memcpyHtoDAsync(d_spin_frequencies, h_spin_frequencies.data(), - d_spin_frequencies.size()); + htodstream->memcpyHtoDAsync(*d_spin_frequencies, h_spin_frequencies.data(), + d_spin_frequencies->size()); } } // end namespace dedisp \ No newline at end of file diff --git a/src/fdd/FDDGPUPlan.hpp b/src/fdd/FDDGPUPlan.hpp index d19a295d3da4d60abb1c478e527518d6648f205e..d2745776820809c3bd7e174dbee9572bd1723b92 100644 --- a/src/fdd/FDDGPUPlan.hpp +++ b/src/fdd/FDDGPUPlan.hpp @@ -1,9 +1,12 @@ -#ifndef H_FDD_GPU_PLAN_INCLUDE_GUARD -#define H_FDD_GPU_PLAN_INCLUDE_GUARD +#ifndef DEDISP_FDD_GPU_PLAN_HPP_ +#define DEDISP_FDD_GPU_PLAN_HPP_ #include "FDDCPUPlan.hpp" #include "GPUPlan.hpp" +#include "dedisperse/FDDKernel.hpp" +#include "unpack/UnpackKernel.hpp" + namespace dedisp { class FDDGPUPlan : public GPUPlan { @@ -37,15 +40,19 @@ private: // Host arrays std::vector<dedisp_float> h_spin_frequencies; // size = nfreq - std::vector<cu::HostMemory> h_data_t_nu_; - std::vector<cu::HostMemory> h_data_t_dm_; + std::vector<std::shared_ptr<cu::HostMemory>> h_data_t_nu_; + std::vector<std::shared_ptr<cu::HostMemory>> h_data_t_dm_; // Device arrays - cu::DeviceMemory d_spin_frequencies; // type = dedisp_float - std::vector<cu::DeviceMemory> 
d_data_t_nu_; - std::vector<cu::DeviceMemory> d_data_x_dm_; + std::unique_ptr<cu::DeviceMemory> d_spin_frequencies; // type = dedisp_float + std::vector<std::shared_ptr<cu::DeviceMemory>> d_data_t_nu_; + std::vector<std::shared_ptr<cu::DeviceMemory>> d_data_x_dm_; + + FDDKernelDedisperse fdd_kernel_dedisperse; + FDDKernelScale fdd_kernel_scale; + FDDKernelUnpack fdd_kernel_unpack; }; } // end namespace dedisp -#endif // H_FDD_GPU_PLAN_INCLUDE_GUARD \ No newline at end of file +#endif // DEDISP_FDD_GPU_PLAN_HPP_ \ No newline at end of file diff --git a/src/fdd/chunk.h b/src/fdd/chunk.h index 7c7e398f91aac4722e37e3decf40d794d40e4e5a..932a02581bae91b54d21af68c65475cd810343ad 100644 --- a/src/fdd/chunk.h +++ b/src/fdd/chunk.h @@ -1,5 +1,5 @@ -#ifndef H_CHUNK_INCLUDE_GUARD -#define H_CHUNK_INCLUDE_GUARD +#ifndef DEDISP_FDD_CHUNK_H_ +#define DEDISP_FDD_CHUNK_H_ #include <cstdio> #include <vector> @@ -38,4 +38,4 @@ void generate_spin_frequency_table_chunks(std::vector<Chunk> &chunks, } // end namespace dedisp -#endif // H_CHUNK_INCLUDE_GUARD \ No newline at end of file +#endif // DEDISP_FDD_CHUNK_H_ \ No newline at end of file diff --git a/src/fdd/cinterface.cpp b/src/fdd/cinterface.cpp index 75d3bc2b64003f4941b4577df14b6f9619b5c812..cd30ec131ba679a007a3ad51f8974d76dc95077d 100644 --- a/src/fdd/cinterface.cpp +++ b/src/fdd/cinterface.cpp @@ -3,9 +3,10 @@ #include "fdd/FDDGPUPlan.hpp" #include <cmath> -#include <iostream> #include <memory> +#include <cudawrappers/cu.hpp> + #ifdef __cplusplus extern "C" { #endif @@ -45,10 +46,6 @@ dedisp_error dedisp_create_plan(dedisp_plan *plan, dedisp_size nchans, *plan = nullptr; - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - *plan = new dedisp_plan_struct(); if (!plan) { throw_error(DEDISP_MEM_ALLOC_FAILED); @@ -70,10 +67,6 @@ dedisp_error dedisp_set_gulp_size(dedisp_plan plan, dedisp_size gulp_size) { throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - return DEDISP_NOT_IMPLEMENTED_ERROR; } @@ -82,10 +75,6 @@ dedisp_size dedisp_get_gulp_size(dedisp_plan plan) { throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - throw_error(DEDISP_NOT_IMPLEMENTED_ERROR); return 0; @@ -97,10 +86,6 @@ dedisp_error dedisp_set_dm_list(dedisp_plan plan, const dedisp_float *dm_list, throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { plan->plan_->set_dm_list(dm_list, count); } catch (...) { @@ -117,10 +102,6 @@ dedisp_error dedisp_generate_dm_list(dedisp_plan plan, dedisp_float dm_start, throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { plan->plan_->generate_dm_list(dm_start, dm_end, ti, tol); } catch (...) { @@ -180,10 +161,6 @@ dedisp_error dedisp_set_killmask(dedisp_plan plan, throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { plan->plan_->set_killmask(killmask); } catch (...) 
{ @@ -289,10 +266,6 @@ dedisp_error dedisp_execute_guru(const dedisp_plan plan, dedisp_size nsamps, throw_error(DEDISP_INVALID_FLAG_COMBINATION); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - return DEDISP_NOT_IMPLEMENTED_ERROR; } @@ -309,10 +282,6 @@ dedisp_error dedisp_execute_adv(const dedisp_plan plan, dedisp_size nsamps, throw_error(DEDISP_INVALID_FLAG_COMBINATION); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - return DEDISP_NOT_IMPLEMENTED_ERROR; } @@ -325,10 +294,6 @@ dedisp_error dedisp_execute(const dedisp_plan plan, dedisp_size nsamps, throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { plan->plan_->execute(nsamps, in, in_nbits, out, out_nbits, flags); } catch (...) { @@ -340,7 +305,7 @@ dedisp_error dedisp_execute(const dedisp_plan plan, dedisp_size nsamps, dedisp_error dedisp_sync(void) { try { - cu::checkError(cudaDeviceSynchronize()); + cu::Context::synchronize(); } catch (...) { throw_error(DEDISP_PRIOR_GPU_ERROR); } diff --git a/src/fdd/dedisperse/FDDKernel.cu b/src/fdd/dedisperse/FDDKernel.cu index 1ff643d770fcee64dece4591e2613986405ddc52..e5a973c1d404bd1beb4bda96a76438d6b068cf26 100644 --- a/src/fdd/dedisperse/FDDKernel.cu +++ b/src/fdd/dedisperse/FDDKernel.cu @@ -1,81 +1,87 @@ #include "FDDKernel.hpp" -#include "common/cuda/CU.h" -#include "fdd_kernel.cuh" -#include <cuda.h> -/* - * Helper functions - */ -void FDDKernel::copy_delay_table(const void *src, size_t count, size_t offset, - cudaStream_t stream) { - cu::checkError(cudaMemcpyToSymbolAsync(c_delay_table, src, count, offset, - cudaMemcpyDeviceToDevice, stream)); -} +#include <iostream> +#include <memory> +#include <sstream> +#include <stdexcept> +#include <vector> -unsigned long div_round_up(unsigned long a, unsigned long b) { - return (a - 1) / b + 1; -} +#include "GPUKernel.hpp" +#include "fdd_dedisperse_kernel.cu" +#include <cudawrappers/cu.hpp> /* * dedisperse routine */ -void FDDKernel::launch(dedisp_size ndm, dedisp_size nfreq, dedisp_size nchan, - float dt, const dedisp_float *d_spin_frequencies, - const dedisp_float *d_dm_list, const dedisp_float2 *d_in, - const dedisp_float2 *d_out, dedisp_size in_stride, - dedisp_size out_stride, unsigned int idm_start, - unsigned int idm_end, unsigned int ichan_start, - cudaStream_t stream) { +void FDDKernelDedisperse::run( + unsigned long ndm, unsigned long nfreq, unsigned long nchan, float dt, + const cu::DeviceMemory &d_spin_frequencies, + const cu::DeviceMemory &d_dm_list, const cu::DeviceMemory &d_in, + const cu::DeviceMemory &d_out, unsigned long in_stride, + unsigned long out_stride, unsigned int idm_start, unsigned int idm_end, + unsigned int ichan_start, const void *delay_table, + const size_t delay_table_count, cu::Stream &htodstream, + cu::Stream &executestream) { // Define thread decomposition unsigned grid_x = std::max((int)((ndm + NDM_BATCH_GRID) / NDM_BATCH_GRID), 1); unsigned grid_y = NFREQ_BATCH_GRID; dim3 grid(grid_x, grid_y); dim3 block(NFREQ_BATCH_BLOCK); -/* Execute the kernel - * The second kernel argument can be set to true - * in order to enable an experimental optimization feature, - * where extrapolation is used in the computation of the phasors. - * Boudary conditions should be further explored to determine - * functional correctness at all times. - * Leaving this feature in because it might be beneficial - * depending on the system configurations. 
- */ -#define CALL_KERNEL(NCHAN) \ - dedisperse_kernel<NCHAN, false><<<grid, block, 0, stream>>>( \ - nfreq, dt, (float *)d_spin_frequencies, (float *)d_dm_list, in_stride, \ - out_stride, (const float2 *)d_in, (float2 *)d_out, idm_start, idm_end, \ - ichan_start); - - switch (nchan) { - case 16: - CALL_KERNEL(16); - break; - case 32: - CALL_KERNEL(32); - break; - case 64: - CALL_KERNEL(64); - break; - case 128: - CALL_KERNEL(128); - break; - case 256: - CALL_KERNEL(256); - break; + /* Execute the kernel + * The second kernel argument can be set to true + * in order to enable an experimental optimization feature, + * where extrapolation is used in the computation of the phasors. + * Boundary conditions should be further explored to determine + * functional correctness at all times. + * Leaving this feature in because it might be beneficial + * depending on the system configurations. + */ + + const std::vector<const void *> parameters = { + &nfreq, &dt, &d_spin_frequencies, &d_dm_list, &in_stride, &out_stride, + &d_in, &d_out, &idm_start, &idm_end, &ichan_start}; + + // Execute the kernel + std::ostringstream oss; + oss << "-DDEF_NCHAN=" << nchan; + + CompiledKernelInfo kernel; + bool did_recompile = false; + + std::tie(kernel, did_recompile) = + compile({"-DDEF_EXTRAPOLATE=false", oss.str()}); + assertCompiled(kernel); + + if (did_recompile) { + htodstream.memcpyHtoDAsync(kernel.module->getGlobal("c_delay_table"), + delay_table, delay_table_count); } + + executestream.launchKernel(*(kernel.function), grid.x, grid.y, grid.z, + block.x, block.y, block.z, 0, parameters); } /* * dedisperse routine */ -void FDDKernel::scale(dedisp_size height, dedisp_size width, dedisp_size stride, - dedisp_float scale, dedisp_float *d_data, - cudaStream_t stream) { +void FDDKernelScale::scale(dedisp_size height, dedisp_size width, + dedisp_size stride, dedisp_float scale, + const cu::DeviceMemory &d_data, cu::Stream &stream) { + + CompiledKernelInfo kernel; + bool did_recompile; + + std::tie(kernel, did_recompile) = compile(); + assertCompiled(kernel); + // Define thread decomposition dim3 grid(height); dim3 block(128); // Execute the kernel - scale_output_kernel<<<grid, block, 0, stream>>>(width, stride, scale, d_data); + std::vector<const void *> parameters = {&width, &stride, &scale, &d_data}; + + stream.launchKernel(*(kernel.function), grid.x, grid.y, grid.z, block.x, + block.y, block.z, 0, parameters); } diff --git a/src/fdd/dedisperse/FDDKernel.hpp b/src/fdd/dedisperse/FDDKernel.hpp index a96988f69b35f3bae343b31978f92afee2eb2372..ad7e431eaea64e804679754d582aa954046cc89d 100644 --- a/src/fdd/dedisperse/FDDKernel.hpp +++ b/src/fdd/dedisperse/FDDKernel.hpp @@ -1,25 +1,54 @@ -#ifndef FDD_KERNEL_H_INCLUDE_GUARD -#define FDD_KERNEL_H_INCLUDE_GUARD +#ifndef FDD_KERNEL_H_ +#define FDD_KERNEL_H_ -#include <cuda.h> +#include <cudawrappers/cu.hpp> +#include <cudawrappers/nvtx.hpp> #include "common/dedisp_types.h" +#include <GPUKernel.hpp> -class FDDKernel { +extern const char _binary_src_fdd_dedisperse_fdd_dedisperse_kernel_cu_start, + _binary_src_fdd_dedisperse_fdd_dedisperse_kernel_cu_end; + +extern const char _binary_src_fdd_dedisperse_fdd_scale_kernel_cu_start, + _binary_src_fdd_dedisperse_fdd_scale_kernel_cu_end; + +class FDDKernelDedisperse : public GPUKernel { public: - void copy_delay_table(const void *src, size_t count, size_t offset, - cudaStream_t stream); + FDDKernelDedisperse() + : GPUKernel( + "fdd_dedisperse_kernel.cu", "dedisperse_kernel_runtime", + std::string( + reinterpret_cast<const char *>( 
&_binary_src_fdd_dedisperse_fdd_dedisperse_kernel_cu_start), + reinterpret_cast<const char *>( + &_binary_src_fdd_dedisperse_fdd_dedisperse_kernel_cu_end))) { + } - void launch(dedisp_size ndm, dedisp_size nfreq, dedisp_size nchan, float dt, - const dedisp_float *d_spin_frequencies, - const dedisp_float *d_dm_list, const dedisp_float2 *d_in, - const dedisp_float2 *d_out, dedisp_size in_stride, - dedisp_size out_stride, unsigned int idm_start, - unsigned int idm_end, unsigned int ichan_start, - cudaStream_t stream); + void run(dedisp_size ndm, dedisp_size nfreq, dedisp_size nchan, float dt, + const cu::DeviceMemory &d_spin_frequencies, + const cu::DeviceMemory &d_dm_list, const cu::DeviceMemory &d_in, + const cu::DeviceMemory &d_out, dedisp_size in_stride, + dedisp_size out_stride, unsigned int idm_start, unsigned int idm_end, + unsigned int ichan_start, const void *delay_table, + const size_t delay_table_count, cu::Stream &htodstream, + cu::Stream &executestream); +}; + +class FDDKernelScale : public GPUKernel { +public: + FDDKernelScale() + : GPUKernel( + "fdd_kernel_scale.cu", "scale_output_kernel", + std::string( + reinterpret_cast<const char *>( + &_binary_src_fdd_dedisperse_fdd_scale_kernel_cu_start), + reinterpret_cast<const char *>( + &_binary_src_fdd_dedisperse_fdd_scale_kernel_cu_end))) {} void scale(dedisp_size height, dedisp_size width, dedisp_size stride, - dedisp_float scale, dedisp_float *d_data, cudaStream_t stream); + dedisp_float scale, const cu::DeviceMemory &d_data, + cu::Stream &stream); }; -#endif // FDD_KERNEL_H_INCLUDE_GUARD \ No newline at end of file +#endif // FDD_KERNEL_H_ \ No newline at end of file diff --git a/src/fdd/dedisperse/fdd_kernel.cuh b/src/fdd/dedisperse/fdd_dedisperse_kernel.cu similarity index 84% rename from src/fdd/dedisperse/fdd_kernel.cuh rename to src/fdd/dedisperse/fdd_dedisperse_kernel.cu index 9aaac69a656000667a506ad4fe9535d8339479d9..d3cc62771cc3b44a75042552f219b538291905e8 100644 --- a/src/fdd/dedisperse/fdd_kernel.cuh +++ b/src/fdd/dedisperse/fdd_dedisperse_kernel.cu @@ -1,8 +1,12 @@ +#ifndef FDD_DEDISPERSE_KERNEL_H_ +#define FDD_DEDISPERSE_KERNEL_H_ + +#define M_PI 3.14159265358979323846 + // Constant reference for input data // This value is set according to the constant memory size // for all NVIDIA GPUs to date, which is 64 KB and sizeof(dedisp_float) = 4 #define DEDISP_MAX_NCHANS 16384 -__constant__ dedisp_float c_delay_table[DEDISP_MAX_NCHANS]; // The number of DMs computed by a single thread block #define NDM_BATCH_GRID 8 @@ -19,6 +23,9 @@ __constant__ dedisp_float c_delay_table[DEDISP_MAX_NCHANS]; // Option to enable/disable caching input samples in shared memory #define USE_SHARED_MEMORY 0 +// Option to enable/disable interpolation of the phasors +#define EXTRAPOLATE_PHASORS false + /* * Helper functions */ @@ -33,16 +40,19 @@ inline __device__ float2 cmul(float2 a, float2 b) { return {a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x}; } +__constant__ float c_delay_table[DEDISP_MAX_NCHANS]; + /* * dedisperse kernel * FDD computes dedispersion as phase rotations in the Fourier domain */ template <unsigned int NCHAN, bool extrapolate> -__global__ void -dedisperse_kernel(size_t nfreq, float dt, const float *d_spin_frequencies, - const float *d_dm_list, size_t in_stride, size_t out_stride, - const float2 *d_in, float2 *d_out, unsigned int idm_start, - unsigned int idm_end, unsigned int ichan_start) { +__device__ inline void +dedisperse_kernel_impl(size_t nfreq, float dt, const float *d_spin_frequencies, + const float *d_dm_list, size_t 
in_stride, + size_t out_stride, const float2 *d_in, float2 *d_out, + unsigned int idm_start, unsigned int idm_end, + unsigned int ichan_start) { // The DM that the current block processes unsigned int idm_current = blockIdx.x; // The DM offset is the number of DMs processed by all thread blocks @@ -201,12 +211,24 @@ dedisperse_kernel(size_t nfreq, float dt, const float *d_spin_frequencies, } // end for ifreq_current loop } // end dedisperse_kernel -/* - * scale kernel - */ -__global__ void scale_output_kernel(size_t n, size_t stride, float scale, - float *d_data) { - for (unsigned int i = threadIdx.x; i < n; i += blockDim.x) { - d_data[blockIdx.x * stride + i] *= scale; - } -} \ No newline at end of file +extern "C" { +#ifndef DEF_NCHAN +#define DEF_NCHAN 16 +#endif + +#ifndef DEF_EXTRAPOLATE +#define DEF_EXTRAPOLATE false +#endif + +__global__ void dedisperse_kernel_runtime( + size_t nfreq, float dt, const float *d_spin_frequencies, + const float *d_dm_list, size_t in_stride, size_t out_stride, + const float2 *d_in, float2 *d_out, unsigned int idm_start, + unsigned int idm_end, unsigned int ichan_start) { + dedisperse_kernel_impl<DEF_NCHAN, DEF_EXTRAPOLATE>( + nfreq, dt, d_spin_frequencies, d_dm_list, in_stride, out_stride, d_in, + d_out, idm_start, idm_end, ichan_start); +} +} + +#endif // FDD_DEDISPERSE_KERNEL_H_ \ No newline at end of file diff --git a/src/fdd/dedisperse/fdd_scale_kernel.cu b/src/fdd/dedisperse/fdd_scale_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..364bee6a542231e071e65e9c4982503a9456a5b9 --- /dev/null +++ b/src/fdd/dedisperse/fdd_scale_kernel.cu @@ -0,0 +1,11 @@ +#ifndef FDD_SCALE_KERNEL_H_ +#define FDD_SCALE_KERNEL_H_ + +extern "C" __global__ void scale_output_kernel(size_t n, size_t stride, + float scale, float *d_data) { + for (unsigned int i = threadIdx.x; i < n; i += blockDim.x) { + d_data[blockIdx.x * stride + i] *= scale; + } +} + +#endif // FDD_SCALE_KERNEL_H_ \ No newline at end of file diff --git a/src/fdd/helper.h b/src/fdd/helper.h index 07d55162f5bea8b09adf8569cac3ec9d1f7629e7..b00674e119b57fd89acb35463223b0eb16df84d4 100644 --- a/src/fdd/helper.h +++ b/src/fdd/helper.h @@ -1,5 +1,5 @@ -#ifndef FDD_HELPER_H_INCLUDE_GUARD -#define FDD_HELPER_H_INCLUDE_GUARD +#ifndef DEDISP_FDD_HELPER_H +#define DEDISP_FDD_HELPER_H #include "common/helper.h" @@ -40,4 +40,4 @@ void copy_data(size_t height, size_t width, size_t in_stride, size_t out_stride, } // end namespace dedisp -#endif // FDD_HELPER_H_INCLUDE_GUARD \ No newline at end of file +#endif // DEDISP_FDD_HELPER_H \ No newline at end of file diff --git a/src/fdd/unpack/UnpackKernel.cu b/src/fdd/unpack/UnpackKernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..942ca8cee6b3905dd2d6b9cf6374294a6cedfdad --- /dev/null +++ b/src/fdd/unpack/UnpackKernel.cu @@ -0,0 +1,91 @@ +#include "UnpackKernel.hpp" + +#include <algorithm> +#include <iostream> +#include <memory> +#include <stdexcept> +#include <string> +#include <vector> + +#include <cudawrappers/cu.hpp> +#include <cudawrappers/nvrtc.hpp> + +#include "common/dedisp_types.h" + +#include "kernel_constants.cuh" + +template <typename U> inline U round_up_pow2(const U &a) { + U r = a - 1; + for (unsigned long i = 1; i <= sizeof(U) * 8 / 2; i <<= 1) + r |= r >> i; + return r + 1; +} + +template <typename U> inline U round_down_pow2(const U &a) { + return round_up_pow2(a + 1) / 2; +} + +void FDDKernelUnpack::run(const cu::DeviceMemory &d_in, size_t width, + size_t height, size_t in_stride, size_t out_stride, + 
cu::DeviceMemory &d_out, unsigned long in_nbits, + unsigned long out_nbits, float scale, + cu::Stream &stream) { + CompiledKernelInfo kernel; + bool did_recompile; + + std::tie(kernel, did_recompile) = compile(); + assertCompiled(kernel); + + // Specify thread decomposition (uses up-rounded divisions) + const dim3 tot_block_count((width - 1) / TILE_DIM + 1, + (height - 1) / TILE_DIM + 1); + const size_t max_grid_dim = round_down_pow2(32768); + + // Partition the grid into chunks that the GPU can accept at once + for (size_t block_y_offset = 0; block_y_offset < tot_block_count.y; + block_y_offset += max_grid_dim) { + dim3 block_count; + + // Handle the possibly incomplete final grid + block_count.y = std::min(max_grid_dim, tot_block_count.y - block_y_offset); + + for (size_t block_x_offset = 0; block_x_offset < tot_block_count.x; + block_x_offset += max_grid_dim) { + // Handle the possibly incomplete final grid + block_count.x = + std::min(max_grid_dim, tot_block_count.x - block_x_offset); + + // Compute the chunked parameters + size_t x_offset = block_x_offset * TILE_DIM; + size_t y_offset = block_y_offset * TILE_DIM; + size_t in_offset = x_offset + y_offset * in_stride; + size_t out_offset = y_offset + x_offset * out_stride; + size_t w = std::min(max_grid_dim * TILE_DIM, width - x_offset); + size_t h = std::min(max_grid_dim * TILE_DIM, height - y_offset); + + dim3 block(TILE_DIM, BLOCK_ROWS); + + // Specify grid decomposition + dim3 grid(round_up_pow2(block_count.x), round_up_pow2(block_count.y)); + + // Run the CUDA kernel + const void *d_in_offset = + static_cast<const dedisp_word *>(d_in) + in_offset; + const void *d_out_offset = static_cast<dedisp_word *>(d_out) + out_offset; + + std::vector<const void *> parameters = {&d_in_offset, + &w, + &h, + &in_stride, + &out_stride, + &d_out_offset, + &(block_count.x), + &(block_count.y), + &in_nbits, + &scale}; + + stream.launchKernel(*(kernel.function), grid.x, grid.y, grid.z, block.x, + block.y, block.z, 0, parameters); + } // end for block_x_offset + } // end for block_y_offset +} diff --git a/src/fdd/unpack/UnpackKernel.hpp b/src/fdd/unpack/UnpackKernel.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ec64a49b73cab85e629e7423553fe1810ea0d48b --- /dev/null +++ b/src/fdd/unpack/UnpackKernel.hpp @@ -0,0 +1,27 @@ +#ifndef DEDISP_FDD_UNPACK_UNPACK_KERNEL_HPP_ +#define DEDISP_FDD_UNPACK_UNPACK_KERNEL_HPP_ + +#include "cudawrappers/cu.hpp" +#include <GPUKernel.hpp> + +extern const char _binary_src_fdd_unpack_transpose_unpack_kernel_cu_start, + _binary_src_fdd_unpack_transpose_unpack_kernel_cu_end; + +class FDDKernelUnpack : public GPUKernel { +public: + FDDKernelUnpack() + : GPUKernel( + "transpose_unpack_kernel.cu", "transpose_unpack_kernel_uint", + std::string( + reinterpret_cast<const char *>( + &_binary_src_fdd_unpack_transpose_unpack_kernel_cu_start), + reinterpret_cast<const char *>( + &_binary_src_fdd_unpack_transpose_unpack_kernel_cu_end))) {} + + void run(const cu::DeviceMemory &d_in, size_t width, size_t height, + size_t in_stride, size_t out_stride, cu::DeviceMemory &d_out, + unsigned long in_nbits, unsigned long out_nbits, float scale, + cu::Stream &stream); +}; + +#endif // DEDISP_FDD_UNPACK_UNPACK_KERNEL_HPP_ \ No newline at end of file diff --git a/src/fdd/unpack/kernel_constants.cuh b/src/fdd/unpack/kernel_constants.cuh new file mode 100644 index 0000000000000000000000000000000000000000..8ad63a47b9338dca773dea3d20ea5f6de9ff9939 --- /dev/null +++ b/src/fdd/unpack/kernel_constants.cuh @@ -0,0 +1,8 @@ +#ifndef 
DEDISP_FDD_UNPACK_KERNEL_CONSTANTS_CUH_ +#define DEDISP_FDD_UNPACK_KERNEL_CONSTANTS_CUH_ + +#define TILE_DIM 32 +#define BLOCK_ROWS 8 +#define EXPANSION 4 + +#endif // DEDISP_FDD_UNPACK_KERNEL_CONSTANTS_CUH_ \ No newline at end of file diff --git a/src/fdd/unpack/unpack_kernel.cuh b/src/fdd/unpack/transpose_unpack_kernel.cu similarity index 79% rename from src/fdd/unpack/unpack_kernel.cuh rename to src/fdd/unpack/transpose_unpack_kernel.cu index e6ae13208bc0733f45fe3d44818c1c136f3671e9..c1542d7853f9872bfd63f598a984c95584179753 100644 --- a/src/fdd/unpack/unpack_kernel.cuh +++ b/src/fdd/unpack/transpose_unpack_kernel.cu @@ -1,9 +1,9 @@ -#define TILE_DIM 32 -#define BLOCK_ROWS 8 -#define EXPANSION 4 +#ifndef FDD_TRANSPOSE_UNPACK_KERNEL_H_ +#define FDD_TRANSPOSE_UNPACK_KERNEL_H_ +#include "kernel_constants.cuh" template <typename WordType> -__global__ void +__device__ inline void transpose_unpack_kernel(const WordType *in, size_t width, size_t height, size_t in_stride, size_t out_stride, WordType *out, size_t block_count_x, size_t block_count_y, @@ -73,4 +73,15 @@ transpose_unpack_kernel(const WordType *in, size_t width, size_t height, } } } -} \ No newline at end of file +} + +extern "C" __global__ void transpose_unpack_kernel_uint( + const unsigned int *in, size_t width, size_t height, size_t in_stride, + size_t out_stride, unsigned int *out, size_t block_count_x, + size_t block_count_y, size_t in_nbits, float scale) { + transpose_unpack_kernel<unsigned int>(in, width, height, in_stride, + out_stride, out, block_count_x, + block_count_y, in_nbits, scale); +} + +#endif // FDD_TRANSPOSE_UNPACK_KERNEL_H_ \ No newline at end of file diff --git a/src/fdd/unpack/unpack.cu b/src/fdd/unpack/unpack.cu deleted file mode 100644 index 3e6f0bb9b8b913c1fc1274b1ea66d658186eb3e8..0000000000000000000000000000000000000000 --- a/src/fdd/unpack/unpack.cu +++ /dev/null @@ -1,60 +0,0 @@ -#include <algorithm> - -#include "dedisp_types.h" - -#include "unpack_kernel.cuh" - -template <typename U> inline U round_up_pow2(const U &a) { - U r = a - 1; - for (unsigned long i = 1; i <= sizeof(U) * 8 / 2; i <<= 1) - r |= r >> i; - return r + 1; -} - -template <typename U> inline U round_down_pow2(const U &a) { - return round_up_pow2(a + 1) / 2; -} - -void transpose_unpack(const dedisp_word *d_in, size_t width, size_t height, - size_t in_stride, size_t out_stride, dedisp_word *d_out, - dedisp_size in_nbits, dedisp_size out_nbits, float scale, - cudaStream_t stream) { - // Specify thread decomposition (uses up-rounded divisions) - dim3 tot_block_count((width - 1) / TILE_DIM + 1, (height - 1) / TILE_DIM + 1); - - size_t max_grid_dim = round_down_pow2(32768); - - // Partition the grid into chunks that the GPU can accept at once - for (size_t block_y_offset = 0; block_y_offset < tot_block_count.y; - block_y_offset += max_grid_dim) { - dim3 block_count; - - // Handle the possibly incomplete final grid - block_count.y = std::min(max_grid_dim, tot_block_count.y - block_y_offset); - - for (size_t block_x_offset = 0; block_x_offset < tot_block_count.x; - block_x_offset += max_grid_dim) { - // Handle the possibly incomplete final grid - block_count.x = - std::min(max_grid_dim, tot_block_count.x - block_x_offset); - - // Compute the chunked parameters - size_t x_offset = block_x_offset * TILE_DIM; - size_t y_offset = block_y_offset * TILE_DIM; - size_t in_offset = x_offset + y_offset * in_stride; - size_t out_offset = y_offset + x_offset * out_stride; - size_t w = std::min(max_grid_dim * TILE_DIM, width - x_offset); - size_t h = 
std::min(max_grid_dim * TILE_DIM, height - y_offset); - - dim3 block(TILE_DIM, BLOCK_ROWS); - - // Specify grid decomposition - dim3 grid(round_up_pow2(block_count.x), round_up_pow2(block_count.y)); - - // Run the CUDA kernel - transpose_unpack_kernel<dedisp_word><<<grid, block, 0, stream>>>( - d_in + in_offset, w, h, in_stride, out_stride, d_out + out_offset, - block_count.x, block_count.y, in_nbits, scale); - } // end for block_x_offset - } // end for block_y_offset -} \ No newline at end of file diff --git a/src/fdd/unpack/unpack.h b/src/fdd/unpack/unpack.h deleted file mode 100644 index a5b327d27fa78a4c26ff0146aa2d5ba593c713b2..0000000000000000000000000000000000000000 --- a/src/fdd/unpack/unpack.h +++ /dev/null @@ -1,8 +0,0 @@ -#include "dedisp_types.h" - -#include "cuda_runtime.h" - -void transpose_unpack(const dedisp_word *d_in, size_t width, size_t height, - size_t in_stride, size_t out_stride, dedisp_word *d_out, - dedisp_size in_nbits, dedisp_size out_nbits, float scale, - cudaStream_t stream); diff --git a/src/tdd/CMakeLists.txt b/src/tdd/CMakeLists.txt index bd92e339abe2f415b5a449ae362f8a30798c0aed..5de8f2fcf19acaaec4e96f740eee448c3ee2fbc8 100644 --- a/src/tdd/CMakeLists.txt +++ b/src/tdd/CMakeLists.txt @@ -3,17 +3,27 @@ add_library( cinterface.cpp TDDPlan.cpp dedisperse/TDDKernel.cu - transpose/transpose.cu + transpose/TransposeKernel.cu $<TARGET_OBJECTS:common> $<TARGET_OBJECTS:plan> $<TARGET_OBJECTS:external>) -target_include_directories(tdd PRIVATE ${CMAKE_SOURCE_DIR}/src) +target_include_directories( + tdd + PRIVATE "${CMAKE_SOURCE_DIR}/src" + PRIVATE "${CMAKE_SOURCE_DIR}/src/tdd") + +include(GNUInstallDirs) +list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_FULL_LIBDIR}) + +target_compile_options(tdd PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math>) -target_link_libraries(tdd CUDA::cudart CUDA::nvToolsExt CUDA::cuda_driver - OpenMP::OpenMP_CXX) +target_embed_source(tdd transpose/tdd_transpose_kernel.cu) +target_embed_source(tdd dedisperse/tdd_dedisperse_kernel.cu) -target_compile_options(tdd PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math>) +target_link_libraries( + tdd PUBLIC OpenMP::OpenMP_CXX cudawrappers::cu cudawrappers::nvtx + cudawrappers::nvrtc Threads::Threads) set_target_properties( tdd @@ -21,6 +31,21 @@ set_target_properties( SOVERSION ${DEDISP_VERSION_MAJOR} PUBLIC_HEADER TDDPlan.hpp) +if(${DEDISP_BACKEND_HIP}) + get_target_property(sources tdd SOURCES) + set_source_files_properties(${sources} PROPERTIES LANGUAGE HIP) +endif() + +if(${DEDISP_BACKEND_HIP}) + set_source_files_properties( + dedisperse/FDDKernel.cu transpose/TransposeKernel.cu PROPERTIES LANGUAGE + HIP) +endif() + +if(DEDISP_BENCHMARK_WITH_PMT) + target_link_libraries(tdd PUBLIC pmt) +endif() + install( TARGETS tdd LIBRARY DESTINATION lib diff --git a/src/tdd/TDDPlan.cpp b/src/tdd/TDDPlan.cpp index e14eb7cb52f7a1366819a89c5718d78dac57cef8..fc5c8003e80f0e74ce65abf6ad368eb58eca95d3 100644 --- a/src/tdd/TDDPlan.cpp +++ b/src/tdd/TDDPlan.cpp @@ -9,26 +9,32 @@ * - Fusing unpack and transpose kernels in to one kernel, * thus requiring only a single pass over the data for the same operations. 
*/ +#include <cassert> #include <iostream> #include <mutex> #include <thread> #include "TDDPlan.hpp" -#include "common/cuda/CU.h" #include "common/dedisp_strings.h" #include "common/helper.h" +#include <cudawrappers/nvtx.hpp> +#include "dedisp_types.h" #include "dedisperse/TDDKernel.hpp" -#include "transpose/transpose.h" +#include "tdd/transpose/TransposeKernel.hpp" #include "dedisp_error.hpp" -#if defined(DEDISP_BENCHMARK) +#ifdef DEDISP_BENCHMARK #include "external/Stopwatch.h" #include <fstream> #endif +#ifdef HAVE_PMT +#include <pmt.h> +#endif + namespace dedisp { // Constructor @@ -54,9 +60,11 @@ unsigned long div_round_up(unsigned long a, unsigned long b) { dedisp_size TDDPlan::compute_gulp_size() { return 65536; } dedisp_size TDDPlan::compute_max_nchans() { - size_t const_mem_bytes = m_device->get_total_const_memory(); - size_t bytes_per_chan = sizeof(dedisp_float) + sizeof(dedisp_bool); - size_t max_nr_channels = const_mem_bytes / bytes_per_chan; + const size_t const_mem_bytes = static_cast<size_t>( + m_device->getAttribute(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY)); + const size_t bytes_per_chan = sizeof(dedisp_float) + sizeof(dedisp_bool); + const size_t max_nr_channels = const_mem_bytes / bytes_per_chan; + return max_nr_channels; }; @@ -135,22 +143,22 @@ void TDDPlan::execute_guru(size_type nsamps, const byte_type *in, std::unique_ptr<Stopwatch> total_timer(Stopwatch::create()); std::unique_ptr<Stopwatch> input_timer(Stopwatch::create()); std::unique_ptr<Stopwatch> output_timer(Stopwatch::create()); + +#if defined(HAVE_PMT) +#if defined(__HIP__) + auto pmt_sensor = pmt::Create("rocm", m_device->getOrdinal()); +#else + auto pmt_sensor = pmt::Create("nvidia", m_device->getOrdinal()); +#endif +#endif + total_timer->Start(); init_timer->Start(); #endif // Annotate the initialization - cu::Marker initMarker("initialization", cu::Marker::red); + nvtx::Marker initMarker("initialization", nvtx::Marker::red); initMarker.start(); - // Copy the lookup tables to constant memory on the device - cu::Marker constantMarker("copy_constant_memory", cu::Marker::yellow); - constantMarker.start(); - m_kernel.copy_delay_table(d_delay_table, m_nchans * sizeof(dedisp_float), 0, - *htodstream); - m_kernel.copy_killmask(d_killmask, m_nchans * sizeof(dedisp_bool), 0, - *htodstream); - constantMarker.end(); - // Compute the problem decomposition dedisp_size nsamps_computed = nsamps - m_max_delay; @@ -165,7 +173,7 @@ void TDDPlan::execute_guru(size_type nsamps, const byte_type *in, // Note: If desired, this could be rounded up, e.g., to a power of 2 dedisp_size in_buf_stride_words = nchan_words; dedisp_size in_count_gulp_max = nsamps_gulp_max * in_buf_stride_words; - dedisp_size samps_per_thread = m_kernel.get_nsamps_per_thread(); + dedisp_size samps_per_thread = tdd_kernel_dedisp.get_nsamps_per_thread(); dedisp_size nsamps_padded_gulp_max = div_round_up(nsamps_computed_gulp_max, samps_per_thread) * @@ -190,20 +198,27 @@ void TDDPlan::execute_guru(size_type nsamps, const byte_type *in, cu::DeviceMemory d_transposed(in_count_padded_gulp_max * sizeof(dedisp_word)); cu::DeviceMemory d_out(out_count_gulp_max * sizeof(dedisp_word)); cu::HostMemory h_out(out_count_gulp_max * sizeof(dedisp_word)); - std::vector<cu::HostMemory> h_in_(2); - std::vector<cu::DeviceMemory> d_in_(2); - for (unsigned int i = 0; i < 2; i++) { - h_in_[i].resize(in_count_gulp_max * sizeof(dedisp_word)); - d_in_[i].resize(in_count_gulp_max * sizeof(dedisp_word)); - } + std::array<std::shared_ptr<cu::HostMemory>, 2> h_in_; + 
std::array<std::shared_ptr<cu::DeviceMemory>, 2> d_in_; + + static_assert(h_in_.size() >= 2 && d_in_.size() >= 2, + "size of h_in_ and d_in_ must both be >= 2"); + + for (auto &elem : h_in_) + elem = std::make_unique<cu::HostMemory>(in_count_gulp_max * + sizeof(dedisp_word)); + + for (auto &elem : d_in_) + elem = std::make_unique<cu::DeviceMemory>(in_count_gulp_max * + sizeof(dedisp_word)); struct JobData { dedisp_size gulp_samp_idx; dedisp_size nsamps_computed_gulp; dedisp_size nsamps_gulp; dedisp_size nsamps_padded_gulp; - void *h_in_ptr; - void *d_in_ptr; + std::shared_ptr<cu::HostMemory> h_in_ptr; + std::shared_ptr<cu::DeviceMemory> d_in_ptr; std::mutex output_lock; cu::Event inputStart, inputEnd; cu::Event preprocessingStart, preprocessingEnd; @@ -256,9 +271,14 @@ void TDDPlan::execute_guru(size_type nsamps, const byte_type *in, initMarker.end(); // Annotate the gulp loop - cu::ScopedMarker gulpMarker("gulp_loop", cu::Marker::black); + nvtx::Marker gulpMarker("gulp_loop", nvtx::Marker::black); #ifdef DEDISP_BENCHMARK +#ifdef HAVE_PMT + pmt::State pmt_start, pmt_end; + pmt_start = pmt_sensor->Read(); +#endif + // Measure the total time of the gulp loop cu::Event gulpStart, gulpEnd; htodstream->record(gulpStart); @@ -279,46 +299,48 @@ void TDDPlan::execute_guru(size_type nsamps, const byte_type *in, auto &job = jobs[job_id]; // Copy the input data for the current job - memcpy2D(job.h_in_ptr, // dst + memcpy2D(*job.h_in_ptr, // dst in_buf_stride_words * BYTES_PER_WORD, // dst stride in + job.gulp_samp_idx * in_stride, // src in_stride, // src stride nchan_words * BYTES_PER_WORD, // width bytes job.nsamps_gulp); // height htodstream->record(job.inputStart); - htodstream->memcpyHtoDAsync(job.d_in_ptr, // dst - job.h_in_ptr, // src + htodstream->memcpyHtoDAsync(*job.d_in_ptr, // dst + *job.h_in_ptr, // src nchan_words * job.nsamps_gulp * BYTES_PER_WORD); // size htodstream->record(job.inputEnd); // Transpose and unpack the words in the input - executestream->waitEvent(job.inputEnd); + executestream->wait(job.inputEnd); executestream->record(job.preprocessingStart); - tdd::transpose((dedisp_word *)job.d_in_ptr, nchan_words, job.nsamps_gulp, - in_buf_stride_words, job.nsamps_padded_gulp, - (dedisp_word *)d_transposed, *executestream); + tdd_kernel_transpose.run(*(job.d_in_ptr), nchan_words, job.nsamps_gulp, + in_buf_stride_words, job.nsamps_padded_gulp, + d_transposed, *executestream); executestream->record(job.preprocessingEnd); - // Perform direct dedispersion without scrunching executestream->record(job.dedispersionStart); - m_kernel.launch(d_transposed, // d_in - job.nsamps_padded_gulp, // in_stride - job.nsamps_computed_gulp, // nsamps - in_nbits, // in_nbits, - m_nchans, // nchans - 1, // chan_stride - d_dm_list, // d_dm_list - dm_count, // dm_count - 1, // dm_stride - d_out, // d_out - out_stride_gulp_samples, // out_stride - out_nbits, // out_nbits - *executestream); + tdd_kernel_dedisp.run( + d_transposed, // d_in + job.nsamps_padded_gulp, // in_stride + job.nsamps_computed_gulp, // nsamps + in_nbits, // in_nbits, + m_nchans, // nchans + 1, // chan_stride + *d_dm_list, + dm_count, // dm_count + 1, // dm_stride + d_out, // d_out + out_stride_gulp_samples, // out_stride + out_nbits, // out_nbits + h_delay_table.data(), h_delay_table.size() * sizeof(dedisp_float), + h_killmask.data(), h_killmask.size() * sizeof(dedisp_bool), *htodstream, + *executestream); executestream->record(job.dedispersionEnd); // Copy output back to host memory - dtohstream->waitEvent(job.dedispersionEnd); + 
dtohstream->wait(job.dedispersionEnd); dtohstream->record(job.outputStart); dtohstream->memcpyDtoHAsync(h_out, // dst d_out, // src @@ -337,6 +359,10 @@ void TDDPlan::execute_guru(size_type nsamps, const byte_type *in, gulpEnd.synchronize(); total_timer->Pause(); +#ifdef HAVE_PMT + pmt_end = pmt_sensor->Read(); +#endif + // Accumulate dedispersion and memcopy time for all jobs for (auto &job : jobs) { input_timer->Add(job.inputEnd.elapsedTime(job.inputStart)); @@ -369,6 +395,12 @@ void TDDPlan::execute_guru(size_type nsamps, const byte_type *in, << std::endl; std::cout << total_time_str << total_timer->ToString() << " sec." << std::endl; +#ifdef HAVE_PMT + std::cout << pmt_joules_str << pmt::PMT::joules(pmt_start, pmt_end) << " J" + << std::endl; + std::cout << pmt_watts_str << pmt::PMT::watts(pmt_start, pmt_end) << " W" + << std::endl; +#endif std::cout << std::endl; // Compute number of operations performed @@ -404,6 +436,8 @@ void TDDPlan::execute_guru(size_type nsamps, const byte_type *in, << std::endl; perf_file.close(); #endif + + gulpMarker.end(); } } // end namespace dedisp \ No newline at end of file diff --git a/src/tdd/TDDPlan.hpp b/src/tdd/TDDPlan.hpp index c9ec2ebee7c3677ad653971480f1a25af39c3cb8..5408223f672def0ac3ec88349c01b581e376504d 100644 --- a/src/tdd/TDDPlan.hpp +++ b/src/tdd/TDDPlan.hpp @@ -5,6 +5,8 @@ #include "GPUPlan.hpp" #include "dedisperse/TDDKernel.hpp" +#include "transpose/TransposeKernel.hpp" + namespace dedisp { class TDDPlan : public GPUPlan { @@ -36,13 +38,11 @@ public: private: // Size parameters dedisp_size m_gulp_size; - - // DedispKernel - DedispKernel m_kernel; - dedisp_size compute_gulp_size(); - dedisp_size compute_max_nchans(); + + TransposeKernelTDD tdd_kernel_transpose; + DedispKernel tdd_kernel_dedisp; }; } // end namespace dedisp \ No newline at end of file diff --git a/src/tdd/cinterface.cpp b/src/tdd/cinterface.cpp index effd49c577a12e6dd0138357411ba1f4fcce1ae9..d4fa2b123b2bff99dc3ecfbc20d7b519fc9aa909 100644 --- a/src/tdd/cinterface.cpp +++ b/src/tdd/cinterface.cpp @@ -3,7 +3,6 @@ #include "tdd/TDDPlan.hpp" #include <cmath> -#include <iostream> #include <memory> #ifdef __cplusplus @@ -43,12 +42,6 @@ dedisp_error dedisp_create_plan(dedisp_plan *plan, dedisp_size nchans, std::cout << "dedisp_create_plan()" << std::endl; #endif - *plan = nullptr; - - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - *plan = new dedisp_plan_struct(); if (!plan) { throw_error(DEDISP_MEM_ALLOC_FAILED); @@ -69,10 +62,6 @@ dedisp_error dedisp_set_gulp_size(dedisp_plan plan, dedisp_size gulp_size) { throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { static_cast<dedisp::TDDPlan *>(plan->plan_.get())->set_gulp_size(gulp_size); } catch (...) { @@ -87,10 +76,6 @@ dedisp_size dedisp_get_gulp_size(dedisp_plan plan) { throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { return static_cast<dedisp::TDDPlan *>(plan->plan_.get())->get_gulp_size(); } catch (...) { @@ -104,10 +89,6 @@ dedisp_error dedisp_set_dm_list(dedisp_plan plan, const dedisp_float *dm_list, throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { plan->plan_->set_dm_list(dm_list, count); } catch (...) 
{ @@ -124,10 +105,6 @@ dedisp_error dedisp_generate_dm_list(dedisp_plan plan, dedisp_float dm_start, throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { plan->plan_->generate_dm_list(dm_start, dm_end, ti, tol); } catch (...) { @@ -187,10 +164,6 @@ dedisp_error dedisp_set_killmask(dedisp_plan plan, throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { plan->plan_->set_killmask(killmask); } catch (...) { @@ -296,10 +269,6 @@ dedisp_error dedisp_execute_guru(const dedisp_plan plan, dedisp_size nsamps, throw_error(DEDISP_INVALID_FLAG_COMBINATION); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { static_cast<dedisp::TDDPlan *>(plan->plan_.get()) ->execute_guru(nsamps, in, in_nbits, in_stride, out, out_nbits, @@ -324,10 +293,6 @@ dedisp_error dedisp_execute_adv(const dedisp_plan plan, dedisp_size nsamps, throw_error(DEDISP_INVALID_FLAG_COMBINATION); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { static_cast<dedisp::TDDPlan *>(plan->plan_.get()) ->execute_adv(nsamps, in, in_nbits, in_stride, out, out_nbits, @@ -348,10 +313,6 @@ dedisp_error dedisp_execute(const dedisp_plan plan, dedisp_size nsamps, throw_error(DEDISP_INVALID_PLAN); } - if (cudaGetLastError() != cudaSuccess) { - throw_error(DEDISP_PRIOR_GPU_ERROR); - } - try { plan->plan_->execute(nsamps, in, in_nbits, out, out_nbits, flags); } catch (...) { @@ -363,7 +324,7 @@ dedisp_error dedisp_execute(const dedisp_plan plan, dedisp_size nsamps, dedisp_error dedisp_sync(void) { try { - cu::checkError(cudaDeviceSynchronize()); + cu::Context::synchronize(); } catch (...) { throw_error(DEDISP_PRIOR_GPU_ERROR); } diff --git a/src/tdd/dedisperse/TDDKernel.cu b/src/tdd/dedisperse/TDDKernel.cu index 3e483f27949d9eedebbd71d20a15a7a93253ba07..0d2fd9582f82bdf75d2a016115cedc0b1c997899 100644 --- a/src/tdd/dedisperse/TDDKernel.cu +++ b/src/tdd/dedisperse/TDDKernel.cu @@ -2,30 +2,16 @@ * Time Domain Dedispersion (TDD) * is an optimized version of the original dedisp implementation. 
*/ +#include "GPUKernel.hpp" #include "TDDKernel.hpp" -#include "common/cuda/CU.h" -#include "common/dedisp_error.hpp" -#include "dedisperse_kernel.cuh" +#include "dedisp_types.h" +#include "tdd_dedisperse_kernel.cu" + +#include <cudawrappers/cu.hpp> -// Kernel tuning parameters #define DEDISP_BLOCK_SIZE 256 #define DEDISP_BLOCK_SAMPS 8 -/* - * Helper functions - */ -void DedispKernel::copy_delay_table(const void *src, size_t count, - size_t offset, cudaStream_t stream) { - cu::checkError(cudaMemcpyToSymbolAsync(c_delay_table, src, count, offset, - cudaMemcpyDeviceToDevice, stream)); -} - -void DedispKernel::copy_killmask(const void *src, size_t count, size_t offset, - cudaStream_t stream) { - cu::checkError(cudaMemcpyToSymbolAsync(c_killmask, src, count, offset, - cudaMemcpyDeviceToDevice, stream)); -} - unsigned int DedispKernel::get_nsamps_per_thread() { return DEDISP_SAMPS_PER_THREAD; } @@ -33,25 +19,23 @@ unsigned int DedispKernel::get_nsamps_per_thread() { /* * dedisperse routine */ -void DedispKernel::launch(const dedisp_word *d_in, dedisp_size in_stride, - dedisp_size nsamps, dedisp_size in_nbits, - dedisp_size nchans, dedisp_size chan_stride, - const dedisp_float *d_dm_list, dedisp_size dm_count, - dedisp_size dm_stride, dedisp_byte *d_out, - dedisp_size out_stride, dedisp_size out_nbits, - cudaStream_t stream) { +void DedispKernel::run(const cu::DeviceMemory &d_in, dedisp_size in_stride, + dedisp_size nsamps, dedisp_size in_nbits, + dedisp_size nchans, dedisp_size chan_stride, + const cu::DeviceMemory &d_dm_list, dedisp_size dm_count, + dedisp_size dm_stride, const cu::DeviceMemory &d_out, + dedisp_size out_stride, dedisp_size out_nbits, + const void *delay_table, dedisp_size delay_table_size, + const void *killmask, dedisp_size killmask_size, + cu::Stream &htodstream, cu::Stream &executestream) { + enum { BITS_PER_BYTE = 8, - BYTES_PER_WORD = sizeof(dedisp_word) / sizeof(dedisp_byte), - BLOCK_DIM_X = DEDISP_BLOCK_SAMPS, - BLOCK_DIM_Y = DEDISP_BLOCK_SIZE / DEDISP_BLOCK_SAMPS, - MAX_CUDA_GRID_SIZE_X = 65535, - MAX_CUDA_GRID_SIZE_Y = 65535 }; // Define thread decomposition // Note: Block dimensions x and y represent time samples and DMs respectively - dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); + dim3 block(KERNEL_BLOCK_DIM_X, KERNEL_BLOCK_DIM_Y); // Note: Grid dimension x represents time samples. 
Dimension y represents DMs // Divide and round up @@ -68,35 +52,36 @@ void DedispKernel::launch(const dedisp_word *d_in, dedisp_size in_stride, // Divide and round up dedisp_size nsamps_reduced = (nsamps - 1) / DEDISP_SAMPS_PER_THREAD + 1; + const std::vector<const void *> parameters = { + &d_in, &nsamps, &nsamps_reduced, &nsamp_blocks, &in_stride, + &dm_count, &dm_stride, &ndm_blocks, &nchans, &chan_stride, + &d_out, &out_nbits, &out_stride, &d_dm_list}; + // Execute the kernel -#define DEDISP_CALL_KERNEL(NBITS) \ - dedisperse_kernel<NBITS, DEDISP_SAMPS_PER_THREAD, BLOCK_DIM_X, BLOCK_DIM_Y> \ - <<<grid, block, 0, stream>>>(d_in, nsamps, nsamps_reduced, nsamp_blocks, \ - in_stride, dm_count, dm_stride, ndm_blocks, \ - nchans, chan_stride, d_out, out_nbits, \ - out_stride, d_dm_list) - // Note: Here we dispatch dynamically on nbits for supported values - switch (in_nbits) { - case 1: - DEDISP_CALL_KERNEL(1); - break; - case 2: - DEDISP_CALL_KERNEL(2); - break; - case 4: - DEDISP_CALL_KERNEL(4); - break; - case 8: - DEDISP_CALL_KERNEL(8); - break; - case 16: - DEDISP_CALL_KERNEL(16); - break; - case 32: - DEDISP_CALL_KERNEL(32); - break; - default: /* should never be reached */ - break; + std::ostringstream ss; + ss << "-DPARAM_IN_NBITS=" << in_nbits; + + CompiledKernelInfo kernel; + bool did_recompile; + std::tie(kernel, did_recompile) = compile({ss.str()}); + assertCompiled(kernel); + + if (did_recompile) { + constant_memcpy_marker.start(); + if (delay_table) { + htodstream.memcpyHtoDAsync(kernel.module->getGlobal("c_delay_table"), + delay_table, delay_table_size); + htodstream.synchronize(); + } + + if (killmask) { + htodstream.memcpyHtoDAsync(kernel.module->getGlobal("c_killmask"), + killmask, killmask_size); + htodstream.synchronize(); + } + constant_memcpy_marker.end(); } -#undef DEDISP_CALL_KERNEL + + executestream.launchKernel(*kernel.function, grid.x, grid.y, grid.z, block.x, + block.y, block.z, 0, parameters); } \ No newline at end of file diff --git a/src/tdd/dedisperse/TDDKernel.hpp b/src/tdd/dedisperse/TDDKernel.hpp index 5f64e5d7f7ef4988ecf4df4a6b2fa01cf766792c..dd47d31f50ae2f51d6b332f636e8bd5f54309a58 100644 --- a/src/tdd/dedisperse/TDDKernel.hpp +++ b/src/tdd/dedisperse/TDDKernel.hpp @@ -1,29 +1,44 @@ -#ifndef DEDISP_KERNEL_H_INCLUDE_GUARD -#define DEDISP_KERNEL_H_INCLUDE_GUARD +#ifndef DEDISP_TDD_DEDISPERSE_TDD_KERNEL_HPP_ +#define DEDISP_TDD_DEDISPERSE_TDD_KERNEL_HPP_ -#include <cuda.h> +#include <cudawrappers/cu.hpp> +#include <cudawrappers/nvtx.hpp> + +#include <GPUKernel.hpp> #include "common/dedisp_types.h" -class DedispKernel { -public: - void copy_delay_table(const void *src, size_t count, size_t offset, - cudaStream_t stream); +extern unsigned char _binary_src_tdd_dedisperse_tdd_dedisperse_kernel_cu_start, + _binary_src_tdd_dedisperse_tdd_dedisperse_kernel_cu_end; - void copy_killmask(const void *src, size_t count, size_t offset, - cudaStream_t stream); +class DedispKernel : public GPUKernel { +public: + DedispKernel() + : GPUKernel( + "tdd_dedisperse_kernel.cu", "dedisperse_kernel", + std::string( + reinterpret_cast<const char *>( + &_binary_src_tdd_dedisperse_tdd_dedisperse_kernel_cu_start), + reinterpret_cast<const char *>( + &_binary_src_tdd_dedisperse_tdd_dedisperse_kernel_cu_end))) { + } unsigned int get_nsamps_per_thread(); - void launch(const dedisp_word *d_in, dedisp_size in_stride, - dedisp_size nsamps, dedisp_size in_nbits, dedisp_size nchans, - dedisp_size chan_stride, const dedisp_float *d_dm_list, - dedisp_size dm_count, dedisp_size dm_stride, 
dedisp_byte *d_out, - dedisp_size out_stride, dedisp_size out_nbits, - cudaStream_t stream = 0); + void run(const cu::DeviceMemory &d_in, dedisp_size in_stride, + dedisp_size nsamps, dedisp_size in_nbits, dedisp_size nchans, + dedisp_size chan_stride, const cu::DeviceMemory &d_dm_list, + dedisp_size dm_count, dedisp_size dm_stride, + const cu::DeviceMemory &d_out, dedisp_size out_stride, + dedisp_size out_nbits, const void *delay_table, + dedisp_size delay_table_size, const void *killmask, + dedisp_size killmask_size, cu::Stream &htodstream, + cu::Stream &executestream); private: dedisp_word *m_d_in = nullptr; + nvtx::Marker constant_memcpy_marker{"copy_constant_memory", + nvtx::Marker::yellow}; }; -#endif // DEDISP_KERNEL_H_INCLUDE_GUARD \ No newline at end of file +#endif // DEDISP_TDD_DEDISPERSE_TDD_KERNEL_HPP_ \ No newline at end of file diff --git a/src/tdd/dedisperse/dedisperse_kernel.cuh b/src/tdd/dedisperse/tdd_dedisperse_kernel.cu similarity index 59% rename from src/tdd/dedisperse/dedisperse_kernel.cuh rename to src/tdd/dedisperse/tdd_dedisperse_kernel.cu index a6d2cb44b02b4a85e5ab02aff5b0d3b2c74b53ed..cc8075ef91515cd46ccb1da0ae8f8179cf3ad952 100644 --- a/src/tdd/dedisperse/dedisperse_kernel.cuh +++ b/src/tdd/dedisperse/tdd_dedisperse_kernel.cu @@ -1,3 +1,6 @@ +#pragma once + +#include "common/dedisp_types.h" /* * Time Domain Dedispersion (TDD) * is an optimized version of the original dedisp implementation. @@ -14,6 +17,14 @@ __constant__ dedisp_bool c_killmask[DEDISP_MAX_NCHANS]; // Kernel tuning parameters #define DEDISP_SAMPS_PER_THREAD 2 +#define DEDISP_BLOCK_SIZE 256 +#define DEDISP_BLOCK_SAMPS 8 +// #define BITS_PER_BYTE 8 +#define BYTES_PER_WORD (sizeof(dedisp_word) / sizeof(dedisp_byte)) +#define KERNEL_BLOCK_DIM_X DEDISP_BLOCK_SAMPS +#define KERNEL_BLOCK_DIM_Y (DEDISP_BLOCK_SIZE / DEDISP_BLOCK_SAMPS) +#define MAX_CUDA_GRID_SIZE_X 65535 +#define MAX_CUDA_GRID_SIZE_Y 65535 /* * Helper functions @@ -87,14 +98,12 @@ set_out_val(dedisp_byte *d_out, dedisp_size out_nbits, dedisp_size idx, // E.g., Words bracketed: (t0c0,t0c1,t0c2,t0c3), (t1c0,t1c1,t1c2,t1c3),... 
// Note: out_stride should be in units of samples template <int IN_NBITS, int SAMPS_PER_THREAD, int BLOCK_DIM_X, int BLOCK_DIM_Y> -__global__ void dedisperse_kernel(const dedisp_word *d_in, dedisp_size nsamps, - dedisp_size nsamps_reduced, - dedisp_size nsamp_blocks, dedisp_size stride, - dedisp_size dm_count, dedisp_size dm_stride, - dedisp_size ndm_blocks, dedisp_size nchans, - dedisp_size chan_stride, dedisp_byte *d_out, - dedisp_size out_nbits, dedisp_size out_stride, - const dedisp_float *d_dm_list) { +__device__ inline void dedisperse_kernel_impl( + const dedisp_word *d_in, dedisp_size nsamps, dedisp_size nsamps_reduced, + dedisp_size nsamp_blocks, dedisp_size stride, dedisp_size dm_count, + dedisp_size dm_stride, dedisp_size ndm_blocks, dedisp_size nchans, + dedisp_size chan_stride, dedisp_byte *d_out, dedisp_size out_nbits, + dedisp_size out_stride, const dedisp_float *d_dm_list) { // Compute compile-time constants enum { BITS_PER_BYTE = 8, @@ -167,4 +176,105 @@ __global__ void dedisperse_kernel(const dedisp_word *d_in, dedisp_size nsamps, } // End of sample loop } // End of DM loop -} \ No newline at end of file +} + +template <int IN_NBITS, int SAMPS_PER_THREAD, int BLOCK_DIM_X, int BLOCK_DIM_Y> +__global__ inline void dedisperse_kernel_impl_2( + const dedisp_word *d_in, dedisp_size nsamps, dedisp_size nsamps_reduced, + dedisp_size nsamp_blocks, dedisp_size stride, dedisp_size dm_count, + dedisp_size dm_stride, dedisp_size ndm_blocks, dedisp_size nchans, + dedisp_size chan_stride, dedisp_byte *d_out, dedisp_size out_nbits, + dedisp_size out_stride, const dedisp_float *d_dm_list) { + // Compute compile-time constants + enum { + BITS_PER_BYTE = 8, + CHANS_PER_WORD = sizeof(dedisp_word) * BITS_PER_BYTE / IN_NBITS + }; + + // Compute the thread decomposition + dedisp_size samp_block = blockIdx.x; + dedisp_size dm_block = blockIdx.y % ndm_blocks; + + dedisp_size samp_idx = samp_block * BLOCK_DIM_X + threadIdx.x; + dedisp_size dm_idx = dm_block * BLOCK_DIM_Y + threadIdx.y; + dedisp_size nsamp_threads = nsamp_blocks * BLOCK_DIM_X; + + dedisp_size ndm_threads = ndm_blocks * BLOCK_DIM_Y; + + // Iterate over grids of DMs + for (; dm_idx < dm_count; dm_idx += ndm_threads) { + // Look up the dispersion measure + dedisp_float dm = d_dm_list[dm_idx * dm_stride]; + + // Loop over samples + for (; samp_idx < nsamps_reduced; samp_idx += nsamp_threads) { + typedef typename SumType<IN_NBITS>::type sum_type; + sum_type sum[SAMPS_PER_THREAD]; + +#pragma unroll + for (dedisp_size s = 0; s < SAMPS_PER_THREAD; ++s) { + sum[s] = 0; + } + + // Loop over channel words + for (dedisp_size chan_word = 0; chan_word < nchans; + chan_word += CHANS_PER_WORD) { + // Pre-compute the memory offset + dedisp_size offset = + samp_idx * SAMPS_PER_THREAD + chan_word / CHANS_PER_WORD * stride; + + // Loop over channel subwords + for (dedisp_size chan_sub = 0; chan_sub < CHANS_PER_WORD; ++chan_sub) { + dedisp_size chan_idx = (chan_word + chan_sub) * chan_stride; + + // Look up the fractional delay + dedisp_float frac_delay = c_delay_table[chan_idx]; + // Compute the integer delay + dedisp_size delay = __float2uint_rn(dm * frac_delay); + +// Loop over samples per thread +// Note: Unrolled to ensure the sum[] array is stored in regs +#pragma unroll + for (dedisp_size s = 0; s < SAMPS_PER_THREAD; ++s) { + // Grab the word containing the sample + dedisp_word sample = d_in[offset + s + delay]; + + // Extract the desired subword and accumulate + sum[s] += c_killmask[chan_idx] * + extract_subword<IN_NBITS>(sample, chan_sub); + } 
+
+extern "C" {
+#ifndef PARAM_IN_NBITS
+#define PARAM_IN_NBITS 16
+#endif
+
+__global__ void dedisperse_kernel(const dedisp_word *d_in, dedisp_size nsamps,
+                                  dedisp_size nsamps_reduced,
+                                  dedisp_size nsamp_blocks, dedisp_size stride,
+                                  dedisp_size dm_count, dedisp_size dm_stride,
+                                  dedisp_size ndm_blocks, dedisp_size nchans,
+                                  dedisp_size chan_stride, dedisp_byte *d_out,
+                                  dedisp_size out_nbits, dedisp_size out_stride,
+                                  const dedisp_float *d_dm_list) {
+  dedisperse_kernel_impl<PARAM_IN_NBITS, DEDISP_SAMPS_PER_THREAD,
+                         KERNEL_BLOCK_DIM_X, KERNEL_BLOCK_DIM_Y>(
+      d_in, nsamps, nsamps_reduced, nsamp_blocks, stride, dm_count, dm_stride,
+      ndm_blocks, nchans, chan_stride, d_out, out_nbits, out_stride, d_dm_list);
+}
+}
diff --git a/src/tdd/transpose/transpose.cu b/src/tdd/transpose/TransposeKernel.cu
similarity index 61%
rename from src/tdd/transpose/transpose.cu
rename to src/tdd/transpose/TransposeKernel.cu
index 3a1aebcc274f27048a5eedf03c5135038f505438..ca1528353aee426265d708998b9d4346158671af 100644
--- a/src/tdd/transpose/transpose.cu
+++ b/src/tdd/transpose/TransposeKernel.cu
@@ -4,11 +4,11 @@
  * This kernel is a fused version of the original transpose and unpack kernels.
  * Thus requiring only a single pass over the data for the same operations.
  */
 
-#include "dedisp_types.h"
-#include "transpose_kernel.cuh"
-#include <algorithm>
-namespace tdd {
+#include "TransposeKernel.hpp"
+#include "tdd_transpose_kernel.cu"
+#include <algorithm>
+#include <memory>
 
 template <typename U> inline U round_up_pow2(const U &a) {
   U r = a - 1;
@@ -21,9 +21,15 @@ template <typename U> inline U round_down_pow2(const U &a) {
   return round_up_pow2(a + 1) / 2;
 }
 
-void transpose(const dedisp_word *d_in, size_t width, size_t height,
-               size_t in_stride, size_t out_stride, dedisp_word *d_out,
-               cudaStream_t stream) {
+void TransposeKernelTDD::run(cu::DeviceMemory &d_in, size_t width,
+                             size_t height, size_t in_stride, size_t out_stride,
+                             cu::DeviceMemory &d_out, cu::Stream &stream) {
+
+  CompiledKernelInfo kernel;
+  bool did_recompile;
+  std::tie(kernel, did_recompile) = compile();
+  assertCompiled(kernel);
+
   // Specify thread decomposition (uses up-rounded divisions)
   dim3 tot_block_count((width - 1) / TILE_DIM + 1, (height - 1) / TILE_DIM + 1);
 
@@ -56,12 +62,20 @@ void transpose(const dedisp_word *d_in, size_t width, size_t height,
       // Specify grid decomposition
       dim3 grid(round_up_pow2(block_count.x), round_up_pow2(block_count.y));
 
-      // Run the CUDA kernel
-      transpose_kernel<dedisp_word><<<grid, block, 0, stream>>>(
-          d_in + in_offset, w, h, in_stride, out_stride, d_out + out_offset,
-          block_count.x, block_count.y);
+      const void *d_in_offset = static_cast<dedisp_word *>(d_in) + in_offset;
+      void *d_out_offset = static_cast<dedisp_word *>(d_out) + out_offset;
+
+      const std::vector<const void *> parameters = {&d_in_offset,
+                                                    &w,
+                                                    &h,
+                                                    &in_stride,
+                                                    &out_stride,
+                                                    &d_out_offset,
+                                                    &block_count.x,
+                                                    &block_count.y};
+
+      stream.launchKernel(*(kernel.function), grid.x, grid.y, grid.z, block.x,
+                          block.y, block.z, 0, parameters);
     } // end for block_x_offset
   } // end for block_y_offset
 }
-
-} // end namespace tdd
\ No newline at end of file
diff --git 
a/src/tdd/transpose/TransposeKernel.hpp b/src/tdd/transpose/TransposeKernel.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a7dc2c79e501b2e1ed541222fcfccf1703724e81 --- /dev/null +++ b/src/tdd/transpose/TransposeKernel.hpp @@ -0,0 +1,28 @@ +#ifndef DEDISP_TDD_TRANSPOSE_TRANSPOSE_KERNEL_HPP_ +#define DEDISP_TDD_TRANSPOSE_TRANSPOSE_KERNEL_HPP_ + +#include <cudawrappers/cu.hpp> + +#include "common/dedisp_types.h" +#include <GPUKernel.hpp> + +extern const char _binary_src_tdd_transpose_tdd_transpose_kernel_cu_start, + _binary_src_tdd_transpose_tdd_transpose_kernel_cu_end; + +class TransposeKernelTDD : public GPUKernel { +public: + TransposeKernelTDD() + : GPUKernel( + "tdd_transpose_kernel.cu", "transpose_kernel_dedisp", + std::string( + reinterpret_cast<const char *>( + &_binary_src_tdd_transpose_tdd_transpose_kernel_cu_start), + reinterpret_cast<const char *>( + &_binary_src_tdd_transpose_tdd_transpose_kernel_cu_end))) {} + + void run(cu::DeviceMemory &d_in, size_t width, size_t height, + size_t in_stride, size_t out_stride, cu::DeviceMemory &d_out, + cu::Stream &stream); +}; + +#endif // DEDISP_TDD_TRANSPOSE_TRANSPOSE_KERNEL_HPP_ \ No newline at end of file diff --git a/src/tdd/transpose/transpose_kernel.cuh b/src/tdd/transpose/tdd_transpose_kernel.cu similarity index 59% rename from src/tdd/transpose/transpose_kernel.cuh rename to src/tdd/transpose/tdd_transpose_kernel.cu index d05504601af6bd86e1523a94adcf656c2aa1f728..74bc1451cfd5934c82718b712fde39f557f03ade 100644 --- a/src/tdd/transpose/transpose_kernel.cuh +++ b/src/tdd/transpose/tdd_transpose_kernel.cu @@ -5,13 +5,11 @@ #define TILE_DIM 32 #define BLOCK_ROWS 8 -namespace tdd { - template <typename WordType> -__global__ void transpose_kernel(const WordType *in, size_t width, - size_t height, size_t in_stride, - size_t out_stride, WordType *out, - size_t block_count_x, size_t block_count_y) { +__device__ void +transpose_kernel_dedisp_impl(const WordType *in, size_t width, size_t height, + size_t in_stride, size_t out_stride, WordType *out, + size_t block_count_x, size_t block_count_y) { __shared__ WordType tile[TILE_DIM][TILE_DIM]; // Cull excess blocks (there may be many if we round up to a power of 2) @@ -48,4 +46,18 @@ __global__ void transpose_kernel(const WordType *in, size_t width, } } +extern "C" { +#ifndef PARAM_DATA_TYPE +#define PARAM_DATA_TYPE unsigned int +#endif + +__global__ void transpose_kernel_dedisp(const unsigned int *in, size_t width, + size_t height, size_t in_stride, + size_t out_stride, unsigned int *out, + size_t block_count_x, + size_t block_count_y) { + transpose_kernel_dedisp_impl<PARAM_DATA_TYPE>(in, width, height, in_stride, + out_stride, out, block_count_x, + block_count_y); +} } // end namespace tdd \ No newline at end of file diff --git a/src/tdd/transpose/transpose.h b/src/tdd/transpose/transpose.h deleted file mode 100644 index cc3c83cbde75c779305332d626f89baec6165958..0000000000000000000000000000000000000000 --- a/src/tdd/transpose/transpose.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Time Domain Dedispersion (TDD) - * is an optimized version of the original dedisp implementation. - */ -#include "cuda_runtime.h" -#include "dedisp_types.h" - -namespace tdd { - -void transpose(const dedisp_word *d_in, size_t width, size_t height, - size_t in_stride, size_t out_stride, dedisp_word *d_out, - cudaStream_t stream); - -}; // end namespace tdd \ No newline at end of file
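
The pattern introduced above is the same for both kernels: the original __global__ template becomes a __device__ *_impl function, and a thin extern "C" __global__ wrapper instantiates it with preprocessor-defined parameters so that the source can be compiled at runtime and the entry point retrieved by name. The repository does this through its GPUKernel / CompiledKernelInfo classes, whose implementation is not part of this excerpt; the sketch below shows the same idea with plain NVRTC and the CUDA driver API only. It is illustrative rather than the project's implementation: the embedded-symbol names, the helper name compile_dedisperse_kernel and its parameters are assumptions, and error handling, -I include paths (dedisp_types.h, ...) and the c_delay_table / c_killmask setup that the real kernel needs are left out.

// Illustrative only: the symbol names below follow the objcopy naming scheme
// used for the embedded transpose kernel in TransposeKernel.hpp; the actual
// names for the dedisperse kernel are not shown in this patch.
#include <cuda.h>
#include <nvrtc.h>

#include <stdexcept>
#include <string>
#include <vector>

extern const char _binary_src_dedisp_dedisperse_kernel_cu_start,
    _binary_src_dedisp_dedisperse_kernel_cu_end;

// Compile the embedded source with NVRTC and return the extern "C" entry
// point. Assumes cuInit() has been called and a context is current (the
// cudawrappers-based code would get this from cu::Context).
CUfunction compile_dedisperse_kernel(int in_nbits, int samps_per_thread,
                                     int block_dim_x, int block_dim_y) {
  const std::string src(&_binary_src_dedisp_dedisperse_kernel_cu_start,
                        &_binary_src_dedisp_dedisperse_kernel_cu_end);

  nvrtcProgram program;
  if (nvrtcCreateProgram(&program, src.c_str(), "dedisperse_kernel.cu", 0,
                         nullptr, nullptr) != NVRTC_SUCCESS) {
    throw std::runtime_error("nvrtcCreateProgram failed");
  }

  // These defines take the place of the template parameters that an offline
  // build would get from headers or from the #ifndef fallbacks.
  const std::string defs[] = {
      "-DPARAM_IN_NBITS=" + std::to_string(in_nbits),
      "-DDEDISP_SAMPS_PER_THREAD=" + std::to_string(samps_per_thread),
      "-DKERNEL_BLOCK_DIM_X=" + std::to_string(block_dim_x),
      "-DKERNEL_BLOCK_DIM_Y=" + std::to_string(block_dim_y)};
  std::vector<const char *> options = {defs[0].c_str(), defs[1].c_str(),
                                       defs[2].c_str(), defs[3].c_str(),
                                       "--std=c++14"};

  if (nvrtcCompileProgram(program, static_cast<int>(options.size()),
                          options.data()) != NVRTC_SUCCESS) {
    throw std::runtime_error("nvrtcCompileProgram failed");
  }

  // Retrieve the generated PTX and release the NVRTC program.
  size_t ptx_size = 0;
  nvrtcGetPTXSize(program, &ptx_size);
  std::string ptx(ptx_size, '\0');
  nvrtcGetPTX(program, &ptx[0]);
  nvrtcDestroyProgram(&program);

  // Because the wrapper is extern "C", its name is not mangled and can be
  // looked up directly in the loaded module.
  CUmodule module;
  cuModuleLoadDataEx(&module, ptx.c_str(), 0, nullptr, nullptr);
  CUfunction function;
  cuModuleGetFunction(&function, module, "dedisperse_kernel");
  return function;
}

The same recipe applies to transpose_kernel_dedisp: the extern "C" wrappers give both kernels stable, unmangled names for cuModuleGetFunction, while the #ifndef PARAM_* defaults keep the sources compilable on their own when no -D options are supplied.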