diff --git a/CMakeLists.txt b/CMakeLists.txt
index 51a824a40ce57428d1f4c2dac9d85b149221a914..67b01daf83ae132b0b22b3bfb309658ec37537cf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.17)
+cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
 
 project(dedisp)
 
@@ -9,17 +9,60 @@ set(DEDISP_VERSION_PATCH 1)
 set(DEDISP_VERSION
     ${DEDISP_VERSION_MAJOR}.${DEDISP_VERSION_MINOR}.${DEDISP_VERSION_PATCH})
 
+set(CMAKE_CXX_STANDARD 14)
+
 # dependencies
 find_package(PkgConfig REQUIRED)
 
-# CUDA
-if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-  set(CMAKE_CUDA_ARCHITECTURES 80)
+if(NOT DEFINED DEDISP_BACKEND)
+  set(DEDISP_BACKEND "CUDA")
+endif()
+set(DEDISP_BACKEND
+    ${DEDISP_BACKEND}
+    CACHE STRING "GPU backend API to use")
+set_property(CACHE DEDISP_BACKEND PROPERTY STRINGS "CUDA" "HIP")
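+# The backend is selected at configure time, e.g. `cmake -DDEDISP_BACKEND=HIP ..`;
+# the default is the CUDA backend.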
+
+# Define both flags so later if(${...}) checks always see a value.
+if(${DEDISP_BACKEND} STREQUAL "CUDA")
+  set(DEDISP_BACKEND_CUDA True)
+  set(DEDISP_BACKEND_HIP False)
+elseif(${DEDISP_BACKEND} STREQUAL "HIP")
+  set(DEDISP_BACKEND_CUDA False)
+  set(DEDISP_BACKEND_HIP True)
+else()
+  message(FATAL_ERROR "Invalid value for DEDISP_BACKEND: ${DEDISP_BACKEND}")
 endif()
 
-find_package(CUDAToolkit REQUIRED)
+find_package(Threads REQUIRED)
+find_package(OpenMP REQUIRED)
 
-enable_language(CUDA)
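+# Forward the backend choice to the cudawrappers dependency (fetched below).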
+set(CUDAWRAPPERS_BACKEND ${DEDISP_BACKEND})
+if(${DEDISP_BACKEND_HIP})
+  # Workaround missing omp.h for compilation of sources with language property
+  # HIP by adding the OpenMP include directory of the host compiler to the
+  # OpenMP target.
+  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    get_filename_component(OpenMP_LIB_DIR ${OpenMP_gomp_LIBRARY} DIRECTORY)
+    message(STATUS "OpenMP library directory: ${OpenMP_LIB_DIR}")
+    find_path(
+      OpenMP_INCLUDE_DIR omp.h
+      PATHS ${OpenMP_LIB_DIR} "${OpenMP_LIB_DIR}/.."
+      PATH_SUFFIXES
+        "include"
+        "lib/gcc/x86_64-pc-linux-gnu/${CMAKE_CXX_COMPILER_VERSION}/include")
+    target_include_directories(OpenMP::OpenMP_CXX SYSTEM
+                               INTERFACE ${OpenMP_INCLUDE_DIR})
+  endif()
+
+  enable_language(HIP)
+  set(LINK_gpu_runtime hip::host)
+elseif(${DEDISP_BACKEND_CUDA})
+  enable_language(CUDA)
+  set(CUDA_MIN_VERSION 10.0)
+  find_package(CUDAToolkit REQUIRED)
+  set(LINK_gpu_runtime CUDA::cudart)
+
+  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    set(CMAKE_CUDA_ARCHITECTURES 80)
+  endif()
+endif()
 
 # set Release as default build type
 if(NOT CMAKE_BUILD_TYPE)
@@ -33,23 +76,32 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 option(DEDISP_BUILD_BENCHMARKS "Build benchmarks" ON)
 
 if(${DEDISP_BUILD_BENCHMARKS})
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEDISP_BENCHMARK")
+  add_compile_options("-DDEDISP_BENCHMARK")
 endif()
 
 option(DEDISP_ENABLE_DEBUG "Enable extra verbose output" OFF)
 
 if(${DEDISP_ENABLE_DEBUG})
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEDISP_DEBUG")
+  add_compile_options("-DDEDISP_BENCHMARK")
 endif()
 
-option(DEDISP_BUILD_TESTS "Build tests" OFF)
+option(DEDISP_BUILD_TESTING "Build tests" OFF)
+option(DEDISP_BENCHMARK_WITH_PMT
+       "Enable Power Measurement Toolkit support in the benchmark suite" OFF)
 
-if(${DEDISP_BUILD_TESTS})
+if(${DEDISP_BUILD_TESTING})
   enable_testing()
 endif()
 
-# OpenMP configuration
-find_package(OpenMP REQUIRED)
+# CUDA Wrappers
+include(GNUInstallDirs)
+include(FetchContent)
+
+FetchContent_Declare(
+  cudawrappers
+  GIT_REPOSITORY https://github.com/nlesc-recruit/cudawrappers
+  GIT_TAG main)
+FetchContent_MakeAvailable(cudawrappers)
 
 # library
 add_subdirectory(src)
diff --git a/README.md b/README.md
index ecbbf6d911dd94113a0bac0df92abb75df4d0f16..167574266b6321a41848aa7d0f8c0e0a624f1a7e 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ Installation Instructions:
   3.  Optionally further configure cmake through interactive build settings
       * `$ccmake .`
       * e.g. set `DEDISP_BUILD_BENCHMARKS` to `ON` to enable build of performance benchmarks [default: ON]
-      * e.g. set `DEDISP_BUILD_TESTS` to `ON` to enable build of tests [default: ON]
+      * e.g. set `DEDISP_BUILD_TESTING` to `ON` to enable build of tests [default: OFF]
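+      * e.g. set `DEDISP_BACKEND` to `HIP` to build with the HIP backend instead of CUDA [default: CUDA]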
       * e.g. set `DEDISP_DEBUG` to `ON` to enable build with more verbose output (for debugging purposes) [default: OFF]
   4.  make and install
       * `$ make install`
diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt
index 864be15fa2357df1c7c21fe7dacfd9b6ced43318..bcb67c32afe32aa2cfb90e8a83a6f3946a7f8e7c 100644
--- a/bin/CMakeLists.txt
+++ b/bin/CMakeLists.txt
@@ -2,7 +2,7 @@
 set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
 
 # add binaries
-if(${DEDISP_BUILD_TESTS})
+if(${DEDISP_BUILD_TESTING})
   add_subdirectory(test)
 endif()
 if(${DEDISP_BUILD_BENCHMARKS})
diff --git a/bin/benchmark/CMakeLists.txt b/bin/benchmark/CMakeLists.txt
index 5e4ca31951d80354b967e559c0e52c8331bc1757..c5cc05652d4e86496f91e8951c41069610a60a49 100644
--- a/bin/benchmark/CMakeLists.txt
+++ b/bin/benchmark/CMakeLists.txt
@@ -1,3 +1,5 @@
-add_subdirectory(dedisp)
+if(${DEDISP_BACKEND_CUDA})
+  add_subdirectory(dedisp)
+endif()
 add_subdirectory(tdd)
 add_subdirectory(fdd)
diff --git a/bin/benchmark/common/bench.hpp b/bin/benchmark/common/bench.hpp
index c021237acebf7357e50bf3827a37350abae34728..451c24ababa0d748c3303c8c3689c7cb655f4a70 100644
--- a/bin/benchmark/common/bench.hpp
+++ b/bin/benchmark/common/bench.hpp
@@ -425,6 +425,14 @@ template <typename PlanType> int run(BenchParameters &benchParameter) {
     // Print timings
     if (benchParameter.verbose) {
       tbench->Pause();
+
+      dedisp_float out_mean, out_sigma;
+      calc_stats_float(output, nsamps_computed * dm_count, &out_mean,
+                       &out_sigma);
+
+      printf("Output RMS                               : %f\n", out_mean);
+      printf("Output StdDev                            : %f\n", out_sigma);
+
       printf("\n");
       printf("------------- BENCHMARK TIMES (accumulated for %d iteration(s)) "
              "-------------\n",
diff --git a/bin/benchmark/dedisp/CMakeLists.txt b/bin/benchmark/dedisp/CMakeLists.txt
index 0fbb6999ff57c284172cc4738ae9e6c7196d7987..c0435751a618409bcb0b291c76f51697d8663406 100644
--- a/bin/benchmark/dedisp/CMakeLists.txt
+++ b/bin/benchmark/dedisp/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_executable(benchdedisp benchdedisp.cpp)
 
-target_include_directories(benchdedisp PRIVATE ${CMAKE_SOURCE_DIR}/src)
+target_include_directories(benchdedisp PRIVATE "${CMAKE_SOURCE_DIR}/src")
 
 target_link_libraries(benchdedisp dedisp)
 
diff --git a/bin/benchmark/fdd/CMakeLists.txt b/bin/benchmark/fdd/CMakeLists.txt
index 60418d97f82581670efcd622ca23c9449167a3f0..b170820c1d1a9aa2a5e195e6b9aba1feac30b01b 100644
--- a/bin/benchmark/fdd/CMakeLists.txt
+++ b/bin/benchmark/fdd/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_executable(benchfdd benchfdd.cpp)
 
-target_include_directories(benchfdd PRIVATE ${CMAKE_SOURCE_DIR}/src)
+target_include_directories(benchfdd PRIVATE "${CMAKE_SOURCE_DIR}/src")
 
 target_link_libraries(benchfdd fdd)
 
diff --git a/bin/benchmark/tdd/CMakeLists.txt b/bin/benchmark/tdd/CMakeLists.txt
index f6db2e7588cf749271021e8f10d3419d2a51ab27..76b05b84dd8c50549fd1c53c3ab784a2aa68ed00 100644
--- a/bin/benchmark/tdd/CMakeLists.txt
+++ b/bin/benchmark/tdd/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_executable(benchtdd benchtdd.cpp)
 
-target_include_directories(benchtdd PRIVATE ${CMAKE_SOURCE_DIR}/src)
+target_include_directories(benchtdd PRIVATE "${CMAKE_SOURCE_DIR}/src")
 
 target_link_libraries(benchtdd tdd)
 
diff --git a/bin/test/CMakeLists.txt b/bin/test/CMakeLists.txt
index 5dd541b18dad16cb9e4e6dc46be6d195c0dbca1e..c4611915d9c2ecfcd77c903242447e5428e6c538 100644
--- a/bin/test/CMakeLists.txt
+++ b/bin/test/CMakeLists.txt
@@ -1,5 +1,29 @@
-if(${DEDISP_BUILD_TESTS})
-  add_subdirectory(dedisp)
+if(${DEDISP_BUILD_TESTING})
+  if(${DEDISP_BACKEND_CUDA})
+    add_subdirectory(dedisp)
+  endif()
   add_subdirectory(tdd)
   add_subdirectory(fdd)
+
+  include(FetchContent)
+
+  FetchContent_Declare(
+    Catch2
+    GIT_REPOSITORY https://github.com/catchorg/Catch2.git
+    GIT_TAG v3.6.0)
+
+  FetchContent_MakeAvailable(Catch2)
+  list(APPEND CMAKE_MODULE_PATH "${catch2_SOURCE_DIR}/extras")
+  include(Catch)
+
+  add_executable(test_unit UnitTests.cpp)
+  target_include_directories(test_unit PRIVATE "${CMAKE_SOURCE_DIR}/src")
+
+  if(${DEDISP_BACKEND_CUDA})
+    target_link_libraries(test_unit dedisp fdd tdd Catch2::Catch2WithMain)
+  else()
+    target_link_libraries(test_unit fdd tdd Catch2::Catch2WithMain)
+  endif()
+
+  catch_discover_tests(test_unit)
 endif()
diff --git a/bin/test/UnitTests.cpp b/bin/test/UnitTests.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..42379b720d730583f83958b075d8dcfe6902774c
--- /dev/null
+++ b/bin/test/UnitTests.cpp
@@ -0,0 +1,84 @@
+#include "fdd/FDDGPUPlan.hpp"
+#define CATCH_CONFIG_MAIN
+#include <catch2/catch_approx.hpp>
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers.hpp>
+#include <catch2/matchers/catch_matchers_vector.hpp>
+
+#include "common/test.hpp"
+
+#include "fdd/FDDGPUPlan.hpp"
+#include "tdd/TDDPlan.hpp"
+
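+// Each test seeds the random generator with 0 so the synthetic input signal,
+// and therefore the detected DM trial values, are reproducible and can be
+// compared against the reference numbers below.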
+// Dedisp is not supported on HIP.
+#ifndef __HIP__
+#include "dedisp/DedispPlan.hpp"
+
+TEST_CASE("Dedisp compare expected DM output") {
+  const std::vector<float> expected_dms = {
+      0.392292, 0.395637, 0.401067, 0.397621, 0.390937, 0.446373,
+      0.401329, 0.405182, 0.391987, 0.391658, 0.391621};
+
+  unsigned int seed = 0;
+  TestInput test_input{&seed};
+  TestOutput test_output;
+
+  run<dedisp::DedispPlan>(test_input,
+                          test_output); // uses run method from test.hpp
+
+  REQUIRE(
+      test_output.out_mean ==
+      Catch::Approx(0.376468509).margin(std::numeric_limits<float>::epsilon()));
+  REQUIRE(
+      test_output.out_sigma ==
+      Catch::Approx(0.002306607).margin(std::numeric_limits<float>::epsilon()));
+  REQUIRE_THAT(test_output.calculated_dms,
+               Catch::Matchers::Approx(expected_dms)
+                   .margin(std::numeric_limits<float>::epsilon()));
+}
+#endif
+
+TEST_CASE("tdd compare expected DM output") {
+  const std::vector<float> expected_dms = {
+      0.392292, 0.395637, 0.401067, 0.397621, 0.390937, 0.446373,
+      0.401329, 0.405182, 0.391987, 0.391658, 0.391621};
+
+  unsigned int seed = 0;
+  TestInput test_input{&seed};
+  TestOutput test_output;
+
+  run<dedisp::TDDPlan>(test_input,
+                       test_output); // uses run method from test.hpp
+
+  REQUIRE(
+      test_output.out_mean ==
+      Catch::Approx(0.376468509).margin(std::numeric_limits<float>::epsilon()));
+  REQUIRE(
+      test_output.out_sigma ==
+      Catch::Approx(0.002306607).margin(std::numeric_limits<float>::epsilon()));
+  REQUIRE_THAT(test_output.calculated_dms,
+               Catch::Matchers::Approx(expected_dms)
+                   .margin(std::numeric_limits<float>::epsilon()));
+}
+
+TEST_CASE("fdd compare expected DM output") {
+  const std::vector<float> expected_dms = {
+      4.850260,  6.077153, 8.697535, 7.709451, 4.709877, 7.514279,
+      20.041370, 6.095791, 7.270431, 9.196441, 4.596337, 4.517603};
+
+  unsigned int seed = 0;
+  TestInput test_input{&seed};
+  TestOutput test_output;
+
+  run<dedisp::FDDGPUPlan>(test_input,
+                          test_output); // uses run method from test.hpp
+
+  REQUIRE(test_output.out_mean ==
+          Catch::Approx(-0.000709622)
+              .margin(std::numeric_limits<float>::epsilon()));
+  REQUIRE(test_output.out_sigma == Catch::Approx(0.747440).margin(
+                                       std::numeric_limits<float>::epsilon()));
+  REQUIRE_THAT(test_output.calculated_dms,
+               Catch::Matchers::Approx(expected_dms)
+                   .margin(std::numeric_limits<float>::epsilon()));
+}
diff --git a/bin/test/common/test.hpp b/bin/test/common/test.hpp
index 237606b4868b86b71fefc0976da178310eb921e7..9656d3dbb59b14d2eaedafcf7fecee7ac0224e68 100644
--- a/bin/test/common/test.hpp
+++ b/bin/test/common/test.hpp
@@ -11,6 +11,7 @@
 #include <time.h>
 
 #include <functional>
+#include <optional>
 
 #include <Plan.hpp>
 
@@ -18,6 +19,16 @@
 #define WRITE_INPUT_DATA 0
 #define WRITE_OUTPUT_DATA 0
 
+struct TestInput {
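+  // Optional fixed RNG seed; when left null the test draws a seed from
+  // std::random_device and the output is not reproducible.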
+  unsigned int *seed = nullptr;
+};
+
+struct TestOutput {
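+  // Statistics of the dedispersed output and the values of all samples that
+  // exceed the 6-sigma detection threshold.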
+  float out_mean;
+  float out_sigma;
+  std::vector<float> calculated_dms;
+};
+
 // Assume input is a 0 mean float and quantize to an unsigned 8-bit quantity
 dedisp_byte bytequant(dedisp_float f) {
   dedisp_float v = f + 127.5f;
@@ -94,7 +105,8 @@ void calc_stats_float(dedisp_float *a, dedisp_size n, dedisp_float *mean,
   return;
 }
 
-template <typename PlanType> int run() {
+template <typename PlanType>
+int run(const TestInput &test_input, TestOutput &test_output) {
   int device_idx = 0;
 
   dedisp_float sampletime_base =
@@ -153,7 +165,7 @@ template <typename PlanType> int run() {
 
   /* Random number generator setup */
   static std::random_device rd;
-  static std::mt19937 generator(rd());
+  static std::mt19937 generator((test_input.seed) ? *(test_input.seed) : rd());
   static std::normal_distribution<float> distribution(0.0, 1.0);
 
   /* First build 2-D array of floats with our signal in it */
@@ -263,22 +275,26 @@ template <typename PlanType> int run() {
          (double)(clock() - startclock) / CLOCKS_PER_SEC);
 
   // Look for significant peaks
-  dedisp_float out_mean, out_sigma;
-  calc_stats_float(output, nsamps_computed * dm_count, &out_mean, &out_sigma);
+  calc_stats_float(output, nsamps_computed * dm_count, &test_output.out_mean,
+                   &test_output.out_sigma);
 
-  printf("Output RMS                               : %f\n", out_mean);
-  printf("Output StdDev                            : %f\n", out_sigma);
+  printf("Output RMS                               : %f\n",
+         test_output.out_mean);
+  printf("Output StdDev                            : %f\n",
+         test_output.out_sigma);
 
   i = 0;
   for (nd = 0; nd < dm_count; nd++) {
     for (ns = 0; ns < nsamps_computed; ns++) {
       dedisp_size idx = nd * nsamps_computed + ns;
       dedisp_float val = output[idx];
-      if (val - out_mean > 6.0 * out_sigma) {
+      if (val - test_output.out_mean > 6.0 * test_output.out_sigma) {
         printf(
             "DM trial %u (%.3f pc/cm^3), Samp %u (%.6f s): %f (%.2f sigma)\n",
-            nd, dmlist[nd], ns, ns * dt, val, (val - out_mean) / out_sigma);
+            nd, dmlist[nd], ns, ns * dt, val,
+            (val - test_output.out_mean) / test_output.out_sigma);
         i++;
+        test_output.calculated_dms.push_back(val);
         if (i > 100)
           break;
       }
@@ -305,4 +321,4 @@ template <typename PlanType> int run() {
   free(input);
   printf("Dedispersion successful.\n");
   return 0;
-}
\ No newline at end of file
+}
diff --git a/bin/test/dedisp/CMakeLists.txt b/bin/test/dedisp/CMakeLists.txt
index 0e5344a31fba95dbbc3bf2d933ace3bd0e4c6acc..d5491c0e08c8c58b1666220cc5e593b2bb6d99e7 100644
--- a/bin/test/dedisp/CMakeLists.txt
+++ b/bin/test/dedisp/CMakeLists.txt
@@ -1,12 +1,12 @@
 # test for dedisp C interface
 add_executable(ctestdedisp ctestdedisp.c)
-target_include_directories(ctestdedisp PRIVATE ${CMAKE_SOURCE_DIR}/src)
+target_include_directories(ctestdedisp PRIVATE "${CMAKE_SOURCE_DIR}/src")
 target_link_libraries(ctestdedisp dedisp m)
 add_test(ctestdedisp ctestdedisp)
 
 # test for dedisp C++ interface
 add_executable(testdedisp testdedisp.cpp)
-target_include_directories(testdedisp PRIVATE ${CMAKE_SOURCE_DIR}/src)
+target_include_directories(testdedisp PRIVATE "${CMAKE_SOURCE_DIR}/src")
 target_link_libraries(testdedisp dedisp)
 add_test(testdedisp testdedisp)
 
diff --git a/bin/test/dedisp/testdedisp.cpp b/bin/test/dedisp/testdedisp.cpp
index 6db39772bcc27d5f36326cc464361fc3972b2630..c9848de54885fad8c3bd791d2b1c2ed79f434085 100644
--- a/bin/test/dedisp/testdedisp.cpp
+++ b/bin/test/dedisp/testdedisp.cpp
@@ -2,8 +2,12 @@
 
 #include "../common/test.hpp"
 
-template <typename PlanType> int run();
+template <typename PlanType> int run(const TestInput &, TestOutput &);
 
 int main(int argc, char *argv[]) {
-  return run<dedisp::DedispPlan>(); // uses run method from test.hpp
+  TestInput test_input{};
+  TestOutput test_output{};
+
+  return run<dedisp::DedispPlan>(test_input,
+                                 test_output); // uses run method from test.hpp
 }
\ No newline at end of file
diff --git a/bin/test/fdd/CMakeLists.txt b/bin/test/fdd/CMakeLists.txt
index d8d7ba46dc1e5c27a70b1779da8a35dbce72d715..8a94488ff5c150488f8f14d5904ab42922961c8b 100644
--- a/bin/test/fdd/CMakeLists.txt
+++ b/bin/test/fdd/CMakeLists.txt
@@ -1,12 +1,12 @@
 # test for fdd C interface
 add_executable(ctestfdd ctestfdd.c)
-target_include_directories(ctestfdd PRIVATE ${CMAKE_SOURCE_DIR}/src)
+target_include_directories(ctestfdd PRIVATE "${CMAKE_SOURCE_DIR}/src")
 target_link_libraries(ctestfdd fdd m)
 add_test(ctestfdd ctestfdd)
 
 # test for fdd C++ interface
 add_executable(testfdd testfdd.cpp)
-target_include_directories(testfdd PRIVATE ${CMAKE_SOURCE_DIR}/src)
+target_include_directories(testfdd PRIVATE "${CMAKE_SOURCE_DIR}/src")
 target_link_libraries(testfdd fdd)
 add_test(testfdd testfdd)
 
diff --git a/bin/test/fdd/testfdd.cpp b/bin/test/fdd/testfdd.cpp
index 351da4ebb857f500e62966c966963fa8090fee72..31d64c641ae75e717deb3a5d73cbab2f4e590931 100644
--- a/bin/test/fdd/testfdd.cpp
+++ b/bin/test/fdd/testfdd.cpp
@@ -5,17 +5,20 @@
 
 #include "../common/test.hpp"
 
-template <typename PlanType> int run();
+template <typename PlanType> int run(const TestInput &, TestOutput &);
 
 int main(int argc, char *argv[]) {
+  TestInput test_input{};
+  TestOutput test_output{};
+
   // Set environment variable USE_CPU to switch to CPU implementation of FDD
   char *use_cpu_str = getenv("USE_CPU");
   bool use_cpu = !use_cpu_str ? false : atoi(use_cpu_str);
   if (use_cpu) {
     std::cout << "Test FDD on CPU" << std::endl;
-    return run<dedisp::FDDCPUPlan>();
+    return run<dedisp::FDDCPUPlan>(test_input, test_output);
   } else {
     std::cout << "Test FDD on GPU" << std::endl;
-    return run<dedisp::FDDGPUPlan>();
+    return run<dedisp::FDDGPUPlan>(test_input, test_output);
   }
 }
\ No newline at end of file
diff --git a/bin/test/tdd/CMakeLists.txt b/bin/test/tdd/CMakeLists.txt
index 7cbaebd3a3acf24b54e6a3be00275bd337ca6eec..ae1cc4ba1afccd438807af72be15617d67fe6907 100644
--- a/bin/test/tdd/CMakeLists.txt
+++ b/bin/test/tdd/CMakeLists.txt
@@ -1,12 +1,12 @@
 # test for tdd C interface
 add_executable(ctesttdd ctesttdd.c)
-target_include_directories(ctesttdd PRIVATE ${CMAKE_SOURCE_DIR}/src)
+target_include_directories(ctesttdd PRIVATE "${CMAKE_SOURCE_DIR}/src")
 target_link_libraries(ctesttdd tdd m)
 add_test(ctesttdd ctesttdd)
 
 # test for tdd C++ interface
 add_executable(testtdd testtdd.cpp)
-target_include_directories(testtdd PRIVATE ${CMAKE_SOURCE_DIR}/src)
+target_include_directories(testtdd PRIVATE "${CMAKE_SOURCE_DIR}/src")
 target_link_libraries(testtdd tdd)
 add_test(testtdd testtdd)
 
diff --git a/bin/test/tdd/testtdd.cpp b/bin/test/tdd/testtdd.cpp
index 8a84b7b6ee9ab2fa2a4fc188498dcaeb95b5d316..0c68f94948c6e5e7084e10bd5c4066d89065876d 100644
--- a/bin/test/tdd/testtdd.cpp
+++ b/bin/test/tdd/testtdd.cpp
@@ -2,8 +2,12 @@
 
 #include "../common/test.hpp"
 
-template <typename PlanType> int run();
+template <typename PlanType> int run(const TestInput &, TestOutput &);
 
 int main(int argc, char *argv[]) {
-  return run<dedisp::TDDPlan>(); // uses run method from test.hpp
+  TestInput test_input{};
+  TestOutput test_output{};
+
+  return run<dedisp::TDDPlan>(test_input,
+                              test_output); // uses run method from test.hpp
 }
\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7acf14db1092a4bdfc0c8e8220b004f89b258bc6..2d04818c3fc5a3aee051ec864d0a0a31ed9a5dba 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,7 +1,6 @@
 # helper libraries
 
-# include directory for common header files
-include_directories(common)
+# include directories for common header files and the cudawrappers headers
+include_directories(common ${cudawrappers_SOURCE_DIR}/include)
 
 # add subdirectories for individual libraries
 add_subdirectory(common)
@@ -14,11 +13,28 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
 
 # plan library
-add_library(plan OBJECT Plan.cpp GPUPlan.cpp)
-target_link_libraries(plan CUDA::cudart)
-
-# dedisp library
-add_subdirectory(dedisp)
+add_library(plan OBJECT Plan.cpp GPUPlan.cpp GPUKernel.cpp)
+target_link_libraries(plan PRIVATE ${LINK_gpu_runtime} cudawrappers::cu
+                                   cudawrappers::nvrtc cudawrappers::nvtx)
+
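+# Optionally fetch the Power Measurement Toolkit (PMT) so that the benchmarks
+# can also report energy usage (the Joules/Watts entries in dedisp_strings.h).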
+if(DEDISP_BENCHMARK_WITH_PMT)
+  if(${DEDISP_BACKEND_HIP})
+    set(PMT_BUILD_ROCM ON)
+  else()
+    set(PMT_BUILD_NVML ON)
+    set(PMT_BUILD_NVIDIA ON)
+  endif()
+  FetchContent_Declare(pmt GIT_REPOSITORY https://git.astron.nl/RD/pmt)
+  FetchContent_MakeAvailable(pmt)
+  add_compile_definitions("HAVE_PMT")
+  target_include_directories(common PRIVATE ${pmt_SOURCE_DIR}/include)
+endif()
+
+target_include_directories(common PRIVATE "${PROJECT_SOURCE_DIR}/include")
+
+if(${DEDISP_BACKEND_CUDA})
+  add_subdirectory(dedisp)
+endif()
 
 # TDD library
 add_subdirectory(tdd)
diff --git a/src/GPUKernel.cpp b/src/GPUKernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4cb6d46febda761d07e54e24a50622b0dcf39238
--- /dev/null
+++ b/src/GPUKernel.cpp
@@ -0,0 +1,58 @@
+#include "GPUKernel.hpp"
+#include <cudawrappers/cu.hpp>
+
+#include <iostream>
+#include <stdexcept>
+
+void GPUKernel::assertCompiled(const CompiledKernelInfo &kernel_info) const {
+  if (!kernel_info.function) {
+    std::ostringstream ss;
+    ss << func_name_ << " in file " << filename_
+       << " has not yet been compiled";
+    throw std::runtime_error(ss.str());
+  }
+}
+
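+// Compiles the kernel with the given options, or reuses a previously compiled
+// instance. The boolean in the returned pair is true when a fresh compilation
+// was performed.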
+std::pair<const CompiledKernelInfo &, bool>
+GPUKernel::compile(const std::vector<std::string> &compile_options) {
+  // Concat the compile options into a single string to be used as a map key.
+  std::string options;
+
+  if (compile_options.size()) {
+    // Allocate some space beforehand to avoid repeated expanding allocations.
+    options.reserve(compile_options.size() * 32);
+
+    for (const auto &compile_option : compile_options) {
+      options += compile_option;
+    }
+  }
+
+  // Check if the function has already been compiled with the same options.
+  if (compiled_kernels_.find(options) != compiled_kernels_.end()) {
+    return {compiled_kernels_[options], false};
+  }
+
+  // Create a new map entry in place.
+  compiled_kernels_[options] = CompiledKernelInfo{};
+  CompiledKernelInfo &info = compiled_kernels_[options];
+
+  try {
+    program_->compile(compile_options);
+  } catch (nvrtc::Error &error) {
+    std::cerr << program_->getLog();
+    throw;
+  }
+
+  // Create a new module from the compiled PTX if needed.
+  info.module = std::make_unique<cu::Module>(
+      static_cast<const void *>(program_->getPTX().data()));
+
+  if (!info.module) {
+    throw std::runtime_error("Unable to create kernel module");
+  }
+
+  info.function = std::make_shared<cu::Function>(
+      *info.module, std::string(func_name_).c_str());
+
+  return {info, true};
+}
\ No newline at end of file
diff --git a/src/GPUKernel.hpp b/src/GPUKernel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a5b46afe29008b2ec996aef5c4fab8c2eb1c541f
--- /dev/null
+++ b/src/GPUKernel.hpp
@@ -0,0 +1,42 @@
+#ifndef DEDISP_GPU_RUNTIME_KERNEL_HPP_
+#define DEDISP_GPU_RUNTIME_KERNEL_HPP_
+
+#include <memory>
+#include <optional>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <cudawrappers/cu.hpp>
+#include <cudawrappers/nvrtc.hpp>
+
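+// Handles produced by one runtime (NVRTC) compilation of a kernel source.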
+struct CompiledKernelInfo {
+  std::shared_ptr<cu::Module> module;
+  std::shared_ptr<cu::Function> function;
+};
+
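+// Base class for kernels that are compiled at runtime with NVRTC; compilation
+// results are cached per set of compile options (see GPUKernel::compile()).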
+class GPUKernel {
+public:
+  GPUKernel(const std::string &filename, const std::string &func_name,
+            const std::string &file_src)
+      : filename_(filename), func_name_(func_name),
+        program_(std::make_unique<nvrtc::Program>(file_src, filename)) {}
+  void setFunctionParams(const std::string &kernel_function_name,
+                         const std::vector<const void *> &params);
+  void assertCompiled(const CompiledKernelInfo &kernel_info) const;
+
+  inline std::string getFilename() const { return filename_; }
+
+protected:
+  std::pair<const CompiledKernelInfo &, bool>
+  compile(const std::vector<std::string> &compile_options = {});
+
+private:
+  const std::string filename_;
+  const std::string func_name_;
+  const std::unique_ptr<nvrtc::Program> program_;
+
+  std::unordered_map<std::string, CompiledKernelInfo> compiled_kernels_;
+};
+#endif // DEDISP_GPU_RUNTIME_KERNEL_HPP_
\ No newline at end of file
diff --git a/src/GPUPlan.cpp b/src/GPUPlan.cpp
index 1a9f1b6b4c12a579c052c01a0b79722a77a0586a..81cafd0e41b1d5861beea0bd8d133e1ee5e173ae 100644
--- a/src/GPUPlan.cpp
+++ b/src/GPUPlan.cpp
@@ -1,11 +1,20 @@
 #include "GPUPlan.hpp"
 
+#include <cassert>
+#include <exception>
+#include <iostream>
+
+#include "cudawrappers/cu.hpp"
+
 namespace dedisp {
 
 // Public interface
 GPUPlan::GPUPlan(size_type nchans, float_type dt, float_type f0, float_type df,
                  int device_idx)
     : Plan(nchans, dt, f0, df) {
+  // Initialize CUDA
+  cu::init();
+
   // Initialize device
   set_device(device_idx);
 
@@ -15,13 +24,15 @@ GPUPlan::GPUPlan(size_type nchans, float_type dt, float_type f0, float_type df,
   executestream.reset(new cu::Stream());
 
   // Initialize delay table
-  d_delay_table.resize(nchans * sizeof(dedisp_float));
-  htodstream->memcpyHtoDAsync(d_delay_table, h_delay_table.data(),
-                              d_delay_table.size());
+  d_delay_table =
+      std::make_unique<cu::DeviceMemory>(nchans * sizeof(dedisp_float));
+  htodstream->memcpyHtoDAsync(*d_delay_table, h_delay_table.data(),
+                              d_delay_table->size());
 
   // Initialize the killmask
-  d_killmask.resize(nchans * sizeof(dedisp_bool));
-  htodstream->memcpyHtoDAsync(d_killmask, h_killmask.data(), d_killmask.size());
+  d_killmask = std::make_unique<cu::DeviceMemory>(nchans * sizeof(dedisp_bool));
+  htodstream->memcpyHtoDAsync(*d_killmask, h_killmask.data(),
+                              d_killmask->size());
 }
 
 // Destructor
@@ -29,34 +40,38 @@ GPUPlan::~GPUPlan() {}
 
 void GPUPlan::set_device(int device_idx) {
   m_device.reset(new cu::Device(device_idx));
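+  // cudawrappers wraps the driver API, so an explicit context is created for
+  // the selected device (with blocking synchronization).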
+  m_context.reset(new cu::Context(CU_CTX_SCHED_BLOCKING_SYNC, *m_device));
 }
 
 void GPUPlan::generate_dm_list(float_type dm_start, float_type dm_end,
                                float_type ti, float_type tol) {
   Plan::generate_dm_list(dm_start, dm_end, ti, tol);
 
-  // Allocate device memory for the DM list
-  d_dm_list.resize(m_dm_count * sizeof(dedisp_float));
+  d_dm_list =
+      std::make_unique<cu::DeviceMemory>(m_dm_count * sizeof(dedisp_float));
+  assert(d_dm_list);
 
   // Copy the DM list to the device
-  htodstream->memcpyHtoDAsync(d_dm_list, h_dm_list.data(), d_dm_list.size());
+  htodstream->memcpyHtoDAsync(*d_dm_list, h_dm_list.data(), d_dm_list->size());
 }
 
 void GPUPlan::set_dm_list(const float_type *dm_list, size_type count) {
   Plan::set_dm_list(dm_list, count);
 
-  // Allocate device memory for the DM list
-  d_dm_list.resize(m_dm_count * sizeof(dedisp_float));
+  d_dm_list =
+      std::make_unique<cu::DeviceMemory>(m_dm_count * sizeof(dedisp_float));
+  assert(d_dm_list);
 
   // Copy the DM list to the device
-  htodstream->memcpyHtoDAsync(d_dm_list, h_dm_list.data(), d_dm_list.size());
+  htodstream->memcpyHtoDAsync(*d_dm_list, h_dm_list.data(), d_dm_list->size());
 }
 
 void GPUPlan::set_killmask(const bool_type *killmask) {
   Plan::set_killmask(killmask);
 
   // Copy the killmask to the device
-  htodstream->memcpyHtoDAsync(d_killmask, h_killmask.data(), d_killmask.size());
+  htodstream->memcpyHtoDAsync(*d_killmask, h_killmask.data(),
+                              d_killmask->size());
 }
 
 } // end namespace dedisp
diff --git a/src/GPUPlan.hpp b/src/GPUPlan.hpp
index 96387fdf7f008cea7c22f5bbd2ad4fd9e1d94d0a..8911364e923db3663783a3716dd504b35a693fe7 100644
--- a/src/GPUPlan.hpp
+++ b/src/GPUPlan.hpp
@@ -1,9 +1,10 @@
-#ifndef DEDISP_PLAN_H_GPU_INCLUDE_GUARD
-#define DEDISP_PLAN_H_GPU_INCLUDE_GUARD
+#ifndef DEDISP_GPU_PLAN_HPP_
+#define DEDISP_GPU_PLAN_HPP_
 
 #include "Plan.hpp"
 
-#include "common/cuda/CU.h"
+#include <cudawrappers/cu.hpp>
+#include <cudawrappers/nvrtc.hpp>
 
 namespace dedisp {
 
@@ -32,11 +33,12 @@ protected:
   // Device
   void set_device(int device_idx);
   std::unique_ptr<cu::Device> m_device;
+  std::unique_ptr<cu::Context> m_context;
 
   // Device arrays
-  cu::DeviceMemory d_dm_list;     // type = dedisp_float
-  cu::DeviceMemory d_delay_table; // type = dedisp_float
-  cu::DeviceMemory d_killmask;    // type = dedisp_bool
+  std::unique_ptr<cu::DeviceMemory> d_dm_list;     // type = dedisp_float
+  std::unique_ptr<cu::DeviceMemory> d_delay_table; // type = dedisp_float
+  std::unique_ptr<cu::DeviceMemory> d_killmask;    // type = dedisp_bool
 
   // Streams
   std::unique_ptr<cu::Stream> htodstream;
@@ -55,4 +57,4 @@ public:
 
 } // end namespace dedisp
 
-#endif // DEDISP_PLAN_GPU_H_INCLUDE_GUARD
+#endif // DEDISP_GPU_PLAN_HPP_
diff --git a/src/Plan.cpp b/src/Plan.cpp
index ddd26b9539ca2abdb45aa2f357a2a758be08129e..65a18c549691f4f00ade20eb5fc536824803534b 100644
--- a/src/Plan.cpp
+++ b/src/Plan.cpp
@@ -1,6 +1,6 @@
 #include "Plan.hpp"
-#include "common/cuda/CU.h"
 #include "common/dedisp_error.hpp"
+#include "cudawrappers/cu.hpp"
 #include <cmath>
 
 namespace dedisp {
@@ -80,7 +80,7 @@ void Plan::set_killmask(const bool_type *killmask) {
   }
 }
 
-void Plan::sync() { cu::checkError(cudaDeviceSynchronize()); }
+void Plan::sync() { cu::Context::synchronize(); }
 
 // Private helper functions
 void Plan::generate_delay_table(dedisp_float *h_delay_table, dedisp_size nchans,
diff --git a/src/Plan.hpp b/src/Plan.hpp
index 41ac61c4b19ec9ff47c7e89f856ff15f70f94a99..ec70ad5821d1081315f18e1da77e8fca4c32c8e9 100644
--- a/src/Plan.hpp
+++ b/src/Plan.hpp
@@ -1,7 +1,6 @@
-#ifndef DEDISP_PLAN_H_INCLUDE_GUARD
-#define DEDISP_PLAN_H_INCLUDE_GUARD
+#ifndef DEDISP_PLAN_HPP_
+#define DEDISP_PLAN_HPP_
 
-#include <memory>
 #include <vector>
 
 #include "common/dedisp_types.h"
@@ -96,4 +95,4 @@ protected:
 
 } // end namespace dedisp
 
-#endif // DEDISP_PLAN_H_INCLUDE_GUARD
+#endif // DEDISP_PLAN_HPP_
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index f0678b476349c03c277d95d0d322b366af04525c..f950bcfb143d8896469f7c3e7cef1bc2cbfe7fd6 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -1,11 +1,16 @@
-add_library(common OBJECT dedisp_error.cpp helper.cpp cuda/CU.cpp)
+add_library(common OBJECT dedisp_error.cpp helper.cpp)
 
-target_link_libraries(common CUDA::cudart CUDA::nvToolsExt CUDA::cuda_driver
-                      OpenMP::OpenMP_CXX)
+target_link_libraries(common PUBLIC OpenMP::OpenMP_CXX cudawrappers::cu
+                                    cudawrappers::nvrtc cudawrappers::nvtx)
 
 set_target_properties(common PROPERTIES VERSION ${DEDISP_VERSION}
                                         SOVERSION ${DEDISP_VERSION_MAJOR})
 
+if(${DEDISP_BACKEND_HIP})
+  get_target_property(sources common SOURCES)
+  set_source_files_properties(${sources} PROPERTIES LANGUAGE HIP)
+endif()
+
 install(TARGETS common LIBRARY DESTINATION lib)
 
 # Install dedisp_types.h in common subdirectory for backwards compatibility with
diff --git a/src/common/cuda/CU.cpp b/src/common/cuda/CU.cpp
deleted file mode 100644
index d4d44cd61c6129c23013b11e4633257c0270259c..0000000000000000000000000000000000000000
--- a/src/common/cuda/CU.cpp
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
- * CU, a CUDA driver api C++ wrapper.
- * This code is copied from the IDG repository (https://git.astron.nl/RD/idg)
- * and changed to meet the needs for this library.
- */
-#include "CU.h"
-
-#include <cassert>
-#include <cstring>
-#include <iostream>
-#include <sstream>
-
-#define assertCudaCall(val) __assertCudaCall(val, #val, __FILE__, __LINE__)
-#define checkCudaCall(val) __checkCudaCall(val, #val, __FILE__, __LINE__)
-#define assertCuCall(val) __assertCuCall(val, #val, __FILE__, __LINE__)
-#define checkCuCall(val) __checkCuCall(val, #val, __FILE__, __LINE__)
-
-namespace cu {
-
-/*
-    Error checking
-*/
-inline void __assertCudaCall(cudaError_t result, char const *const func,
-                             const char *const file, int const line) {
-  if (result != cudaSuccess) {
-    const char *msg;
-    msg = cudaGetErrorString(result);
-    std::cerr << "CUDA Error at " << file;
-    std::cerr << ":" << line;
-    std::cerr << " in function " << func;
-    std::cerr << ": " << msg;
-    std::cerr << std::endl;
-    throw Error<cudaError_t>(result);
-  }
-}
-
-inline void __checkCudaCall(cudaError_t result, char const *const func,
-                            const char *const file, int const line) {
-  try {
-    __assertCudaCall(result, func, file, line);
-  } catch (Error<cudaError_t> &error) {
-    // pass
-  }
-}
-
-inline void __assertCuCall(CUresult result, char const *const func,
-                           const char *const file, int const line) {
-  if (result != CUDA_SUCCESS) {
-    const char *msg;
-    cuGetErrorString(result, &msg);
-    std::cerr << "CU Error at " << file;
-    std::cerr << ":" << line;
-    std::cerr << " in function " << func;
-    std::cerr << ": ";
-    if (msg)
-      std::cerr << msg;
-    std::cerr << std::endl;
-    throw Error<CUresult>(result);
-  }
-}
-
-inline void __checkCuCall(CUresult result, char const *const func,
-                          const char *const file, int const line) {
-  try {
-    __assertCuCall(result, func, file, line);
-  } catch (Error<CUresult> &error) {
-    // pass
-  }
-}
-
-void checkError() { assertCudaCall(cudaGetLastError()); }
-
-void checkError(cudaError_t error) { assertCudaCall(error); }
-void checkError(CUresult error) { assertCuCall(error); }
-
-/*
-    Device
-*/
-Device::Device(int device) {
-  m_device = device;
-  checkCudaCall(cudaSetDevice(device));
-}
-
-unsigned int Device::get_capability() {
-  cudaDeviceProp device_props;
-  cudaGetDeviceProperties(&device_props, m_device);
-  return 10 * device_props.major + device_props.minor;
-}
-
-size_t Device::get_total_const_memory() const {
-  cudaDeviceProp device_props;
-  cudaGetDeviceProperties(&device_props, m_device);
-  return device_props.totalConstMem;
-}
-
-size_t Device::get_free_memory() const {
-  size_t free;
-  size_t total;
-  cudaMemGetInfo(&free, &total);
-  return free;
-}
-
-size_t Device::get_total_memory() const {
-  size_t free;
-  size_t total;
-  cudaMemGetInfo(&free, &total);
-  return total;
-}
-
-CUcontext Device::get_current_context() {
-  CUcontext context{};
-  assertCuCall(cuCtxGetCurrent(&context));
-  return context;
-}
-
-void Device::set_context(CUcontext context) { cuCtxSetCurrent(context); }
-
-/*
-    HostMemory
-*/
-HostMemory::HostMemory(size_t size, int flags) {
-  m_capacity = size;
-  m_size = size;
-  m_flags = flags;
-  assertCudaCall(cudaHostAlloc(&m_ptr, size, m_flags));
-}
-
-HostMemory::~HostMemory() { release(); }
-
-void HostMemory::resize(size_t size) {
-  assert(size > 0);
-  m_size = size;
-  if (size > m_capacity) {
-    release();
-    assertCudaCall(cudaHostAlloc(&m_ptr, size, m_flags));
-    m_capacity = size;
-  }
-}
-
-void HostMemory::release() { assertCudaCall(cudaFreeHost(m_ptr)); }
-
-void HostMemory::zero() { memset(m_ptr, 0, m_size); }
-
-/*
-    DeviceMemory
-*/
-DeviceMemory::DeviceMemory(size_t size) {
-  m_capacity = size;
-  m_size = size;
-  if (size) {
-    assertCudaCall(cudaMalloc(&m_ptr, size));
-  }
-}
-
-DeviceMemory::~DeviceMemory() { release(); }
-
-void DeviceMemory::resize(size_t size) {
-  assert(size > 0);
-  m_size = size;
-  if (size > m_capacity) {
-    release();
-    assertCudaCall(cudaMalloc(&m_ptr, size));
-    m_capacity = size;
-  }
-}
-
-void DeviceMemory::release() {
-  if (m_capacity) {
-    assertCudaCall(cudaFree(m_ptr));
-  }
-}
-
-void DeviceMemory::zero(cudaStream_t stream) {
-  if (m_size) {
-    if (stream != NULL) {
-      assertCudaCall(cudaMemsetAsync(m_ptr, 0, m_size, stream));
-    } else {
-      assertCudaCall(cudaMemset(m_ptr, 0, m_size));
-    }
-  }
-}
-
-/*
-    Event
-*/
-Event::Event(int flags) { assertCudaCall(cudaEventCreate(&m_event, flags)); }
-
-Event::~Event() { assertCudaCall(cudaEventDestroy(m_event)); }
-
-void Event::synchronize() { assertCudaCall(cudaEventSynchronize(m_event)); }
-
-float Event::elapsedTime(Event &second) {
-  float ms;
-  assertCudaCall(cudaEventElapsedTime(&ms, second, m_event));
-  return ms;
-}
-
-Event::operator cudaEvent_t() { return m_event; }
-
-/*
-    Stream
-*/
-Stream::Stream(int flags) {
-  assertCudaCall(cudaStreamCreateWithFlags(&m_stream, flags));
-}
-
-Stream::~Stream() { assertCudaCall(cudaStreamDestroy(m_stream)); }
-
-void Stream::memcpyHtoDAsync(void *devPtr, const void *hostPtr, size_t size) {
-  assertCudaCall(
-      cudaMemcpyAsync(devPtr, hostPtr, size, cudaMemcpyHostToDevice, m_stream));
-}
-
-void Stream::memcpyDtoHAsync(void *hostPtr, void *devPtr, size_t size) {
-  assertCudaCall(
-      cudaMemcpyAsync(hostPtr, devPtr, size, cudaMemcpyDeviceToHost, m_stream));
-}
-
-void Stream::memcpyDtoDAsync(void *dstPtr, void *srcPtr, size_t size) {
-  assertCudaCall(cudaMemcpyAsync(dstPtr, srcPtr, size, cudaMemcpyDeviceToDevice,
-                                 m_stream));
-}
-
-void Stream::memcpyHtoD2DAsync(void *dstPtr, size_t dstWidth,
-                               const void *srcPtr, size_t srcWidth,
-                               size_t widthBytes, size_t height) {
-  assertCudaCall(cudaMemcpy2DAsync(dstPtr, dstWidth, srcPtr, srcWidth,
-                                   widthBytes, height, cudaMemcpyHostToDevice,
-                                   m_stream));
-}
-
-void Stream::memcpyDtoH2DAsync(void *dstPtr, size_t dstWidth,
-                               const void *srcPtr, size_t srcWidth,
-                               size_t widthBytes, size_t height) {
-  assertCudaCall(cudaMemcpy2DAsync(dstPtr, dstWidth, srcPtr, srcWidth,
-                                   widthBytes, height, cudaMemcpyDeviceToHost,
-                                   m_stream));
-}
-
-void Stream::memcpyHtoH2DAsync(void *dstPtr, size_t dstWidth,
-                               const void *srcPtr, size_t srcWidth,
-                               size_t widthBytes, size_t height) {
-  assertCudaCall(cudaMemcpy2DAsync(dstPtr, dstWidth, srcPtr, srcWidth,
-                                   widthBytes, height, cudaMemcpyHostToHost,
-                                   m_stream));
-}
-
-void Stream::synchronize() { assertCudaCall(cudaStreamSynchronize(m_stream)); }
-
-void Stream::waitEvent(Event &event) {
-  assertCudaCall(cudaStreamWaitEvent(m_stream, event, 0));
-}
-
-void Stream::record(Event &event) {
-  assertCudaCall(cudaEventRecord(event, m_stream));
-}
-
-void Stream::zero(void *ptr, size_t size) {
-  assertCudaCall(cudaMemsetAsync(ptr, 0, size, m_stream));
-}
-
-Stream::operator cudaStream_t() { return m_stream; }
-
-/*
-    Marker
-*/
-Marker::Marker(const char *message, Color color) {
-  _attributes.version = NVTX_VERSION;
-  _attributes.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
-  _attributes.colorType = NVTX_COLOR_ARGB;
-  _attributes.color = convert(color);
-  _attributes.messageType = NVTX_MESSAGE_TYPE_ASCII;
-  _attributes.message.ascii = message;
-}
-
-void Marker::start() { _id = nvtxRangeStartEx(&_attributes); }
-
-void Marker::end() { nvtxRangeEnd(_id); }
-
-void Marker::start(cu::Event &event) {
-  event.synchronize();
-  start();
-}
-
-void Marker::end(cu::Event &event) {
-  event.synchronize();
-  end();
-}
-
-unsigned int Marker::convert(Color color) {
-  switch (color) {
-  case red:
-    return 0xffff0000;
-  case green:
-    return 0xff00ff00;
-  case blue:
-    return 0xff0000ff;
-  case yellow:
-    return 0xffffff00;
-  case black:
-    return 0xff000000;
-  default:
-    return 0xff00ff00;
-  }
-}
-
-/*
-    ScopedMarker
-*/
-ScopedMarker::ScopedMarker(const char *message, Color color)
-    : Marker(message, color) {
-  _id = nvtxRangeStartEx(&_attributes);
-};
-
-ScopedMarker::~ScopedMarker() { nvtxRangeEnd(_id); }
-
-} // end namespace cu
diff --git a/src/common/cuda/CU.h b/src/common/cuda/CU.h
deleted file mode 100644
index 8ad2ae3265dabe10c394105d7ceff993cf9c3014..0000000000000000000000000000000000000000
--- a/src/common/cuda/CU.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * CU, a CUDA driver api C++ wrapper.
- * This code is copied from the IDG repository (https://git.astron.nl/RD/idg)
- * and changed to meet the needs for this library.
- */
-#ifndef CU_WRAPPER_H
-#define CU_WRAPPER_H
-
-#include <stdexcept>
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <nvToolsExt.h>
-
-namespace cu {
-
-template <typename T> class Error : public std::exception {
-public:
-  Error(T result) : _result(result) {}
-
-  operator T() const { return _result; }
-
-private:
-  T _result;
-};
-
-void checkError();
-void checkError(cudaError_t error);
-
-class Device {
-public:
-  Device(int device);
-
-  unsigned int get_capability();
-  size_t get_total_const_memory() const;
-  size_t get_free_memory() const;
-  size_t get_total_memory() const;
-  CUcontext get_current_context();
-  void set_context(CUcontext cntx);
-
-private:
-  int m_device;
-};
-
-class Memory {
-public:
-  void *data() { return m_ptr; }
-  size_t size() { return m_size; }
-  virtual void resize(size_t size) = 0;
-  template <typename T> operator T *() { return static_cast<T *>(m_ptr); }
-
-protected:
-  size_t m_capacity = 0; // capacity, in bytes
-  size_t m_size = 0;     // size, in bytes
-  void *m_ptr = nullptr;
-};
-
-class HostMemory : public virtual Memory {
-public:
-  HostMemory(size_t size = 0, int flags = cudaHostAllocPortable);
-  virtual ~HostMemory();
-
-  void resize(size_t size) override;
-  void zero();
-
-private:
-  void release();
-  int m_flags;
-};
-
-class DeviceMemory : public virtual Memory {
-public:
-  DeviceMemory(size_t size = 0);
-  ~DeviceMemory();
-
-  void resize(size_t size);
-  void zero(cudaStream_t stream = NULL);
-
-  template <typename T> operator T() { return static_cast<T>(m_ptr); }
-
-private:
-  void release();
-};
-
-class Event {
-public:
-  Event(int flags = cudaEventDefault);
-  ~Event();
-
-  void synchronize();
-  float elapsedTime(Event &second);
-
-  operator cudaEvent_t();
-
-private:
-  cudaEvent_t m_event;
-};
-
-class Stream {
-public:
-  Stream(int flags = cudaStreamDefault);
-  ~Stream();
-
-  void memcpyHtoDAsync(void *devPtr, const void *hostPtr, size_t size);
-  void memcpyDtoHAsync(void *hostPtr, void *devPtr, size_t size);
-  void memcpyDtoDAsync(void *dstPtr, void *srcPtr, size_t size);
-  void memcpyHtoD2DAsync(void *dstPtr, size_t dstWidth, const void *srcPtr,
-                         size_t srcWidth, size_t widthBytes, size_t height);
-  void memcpyDtoH2DAsync(void *dstPtr, size_t dstWidth, const void *srcPtr,
-                         size_t srcWidth, size_t widthBytes, size_t height);
-  void memcpyHtoH2DAsync(void *dstPtr, size_t dstWidth, const void *srcPtr,
-                         size_t srcWidth, size_t widthBytes, size_t height);
-  void synchronize();
-  void waitEvent(Event &event);
-  void record(Event &event);
-  void zero(void *ptr, size_t size);
-
-  operator cudaStream_t();
-
-private:
-  cudaStream_t m_stream;
-};
-
-class Marker {
-public:
-  enum Color { red, green, blue, yellow, black };
-
-  Marker(const char *message, Marker::Color color = Color::red);
-
-  void start();
-  void end();
-  void start(cu::Event &event);
-  void end(cu::Event &event);
-
-private:
-  unsigned int convert(Color color);
-
-protected:
-  nvtxEventAttributes_t _attributes;
-  nvtxRangeId_t _id;
-};
-
-class ScopedMarker : public Marker {
-public:
-  ScopedMarker(const char *message, Marker::Color color = Color::red);
-
-  ~ScopedMarker();
-
-  void start() = delete;
-  void end() = delete;
-  void start(cu::Event &event) = delete;
-  void end(cu::Event &event) = delete;
-};
-
-} // end namespace cu
-
-#endif // end CU_WRAPPER_H
\ No newline at end of file
diff --git a/src/common/dedisp.h b/src/common/dedisp.h
index 99823ba2e373112ada7a6b993aee2e53284a5f56..9b9eee637fa14b55bc9a37e644ca1a1e8e4c0748 100644
--- a/src/common/dedisp.h
+++ b/src/common/dedisp.h
@@ -37,8 +37,8 @@
  *  \bug Asynchronous execution is not currently operational.
  */
 
-#ifndef DEDISP_H_INCLUDE_GUARD
-#define DEDISP_H_INCLUDE_GUARD
+#ifndef DEDISP_COMMON_DEDISP_H_
+#define DEDISP_COMMON_DEDISP_H_
 
 // Use C linkage to allow cross-language use of the library
 #ifdef __cplusplus
@@ -489,4 +489,4 @@ const dedisp_size *dedisp_get_dt_factors(const dedisp_plan plan);
 } // closing brace for extern "C"
 #endif
 
-#endif // DEDISP_H_INCLUDE_GUARD
+#endif // DEDISP_COMMON_DEDISP_H_
diff --git a/src/common/dedisp_error.hpp b/src/common/dedisp_error.hpp
index fdff277f2070504931a673e7417688b5d77b25be..6d6400d75aa11e4649a332f68cb11ad257cdf65e 100644
--- a/src/common/dedisp_error.hpp
+++ b/src/common/dedisp_error.hpp
@@ -1,5 +1,5 @@
-#ifndef DEDISP_ERROR_H_INCLUDE_GUARD
-#define DEDISP_ERROR_H_INCLUDE_GUARD
+#ifndef DEDISP_COMMON_DEDISP_ERROR_H_
+#define DEDISP_COMMON_DEDISP_ERROR_H_
 
 #include <sstream>
 #include <stdexcept>
@@ -79,4 +79,4 @@ inline void _throw_error(dedisp_error error, char const *const func,
 
 inline void check_error(dedisp_error error) { throw_error(error); }
 
-#endif // DEDISP_ERROR_H_INCLUDE_GUARD
\ No newline at end of file
+#endif // DEDISP_COMMON_DEDISP_ERROR_H_
\ No newline at end of file
diff --git a/src/common/dedisp_strings.h b/src/common/dedisp_strings.h
index 0ae85f94122e4fcff575381a087a3572f30c6e33..4e6e2410aeb1269488fe5389e3112ccf14e5b98a 100644
--- a/src/common/dedisp_strings.h
+++ b/src/common/dedisp_strings.h
@@ -12,6 +12,8 @@ static const std::string output_memcpy_time_str = "Output memcpy time  : ";
 static const std::string runtime_time_str = "Runtime             : ";
 static const std::string gpuexec_time_str = "GPU execution time  : ";
 static const std::string total_time_str = "Total time          : ";
+static const std::string pmt_joules_str = "Joules              : ";
+static const std::string pmt_watts_str = "Watts               : ";
 
 static const std::string preprocessing_perf_str =
     "Preprocessing performance : ";
diff --git a/src/common/dedisp_types.h b/src/common/dedisp_types.h
index b92ab7051999db3e96ee9f82b0783ce44cdad5b2..cab0f0304f104ffaa61adfd86eafb1dc47c0fc18 100644
--- a/src/common/dedisp_types.h
+++ b/src/common/dedisp_types.h
@@ -1,5 +1,5 @@
-#ifndef DEDISP_TYPES_H_INCLUDE_GUARD
-#define DEDISP_TYPES_H_INCLUDE_GUARD
+#ifndef DEDISP_COMMON_TYPES_H_
+#define DEDISP_COMMON_TYPES_H_
 
 // Types
 // -----
@@ -48,4 +48,4 @@ typedef enum {
  * are complete.
  */
 
-#endif // DEDISP_TYPES_H_INCLUDE_GUARD
\ No newline at end of file
+#endif // DEDISP_COMMON_TYPES_H_
\ No newline at end of file
diff --git a/src/common/helper.cpp b/src/common/helper.cpp
index fd2a9e7a7985af36c4bb0b49bc3aa0fca25c998e..7f1660ffbd483d99e7ff2c38c3ac061ffc4466af 100644
--- a/src/common/helper.cpp
+++ b/src/common/helper.cpp
@@ -1,5 +1,4 @@
 #include "helper.h"
-#include <omp.h>
 #include <sys/resource.h> // get used memory
 #include <unistd.h>       // get total memory
 
diff --git a/src/common/helper.h b/src/common/helper.h
index 8f617fc2a43bc4bbe67236d4032261007084abdf..01dfa020a46da27c9a4b9de8f7157ccfbce2a5b5 100644
--- a/src/common/helper.h
+++ b/src/common/helper.h
@@ -1,5 +1,5 @@
-#ifndef HELPER_H_INCLUDE_GUARD
-#define HELPER_H_INCLUDE_GUARD
+#ifndef DEDISP_COMMON_HELPER_H_
+#define DEDISP_COMMON_HELPER_H_
 
 #include <cstddef>
 
@@ -17,4 +17,4 @@ size_t get_free_memory();
 
 } // end namespace dedisp
 
-#endif // HELPER_H_INCLUDE_GUARD
\ No newline at end of file
+#endif // DEDISP_COMMON_HELPER_H_
\ No newline at end of file
diff --git a/src/dedisp/CMakeLists.txt b/src/dedisp/CMakeLists.txt
index aafc1627b8f20aae0aae2d805ca553c9623e86f8..b4725d3cf28a5d6d19ca9246ea827cd15e8ef36d 100644
--- a/src/dedisp/CMakeLists.txt
+++ b/src/dedisp/CMakeLists.txt
@@ -2,17 +2,20 @@ add_library(
   dedisp SHARED
   cinterface.cpp
   DedispPlan.cpp
-  dedisperse/dedisperse.cu
-  unpack/unpack.cu
-  transpose/transpose.cu
+  dedisperse/DedispKernel.cu
+  unpack/UnpackKernel.cu
+  transpose/TransposeKernel.cu
   $<TARGET_OBJECTS:common>
   $<TARGET_OBJECTS:plan>
   $<TARGET_OBJECTS:external>)
 
-target_include_directories(dedisp PRIVATE ${CMAKE_SOURCE_DIR}/src)
+target_include_directories(dedisp PRIVATE "${CMAKE_SOURCE_DIR}/src")
 
-target_link_libraries(dedisp CUDA::cudart CUDA::nvToolsExt CUDA::cuda_driver
-                      OpenMP::OpenMP_CXX)
+target_embed_source(dedisp transpose/transpose_kernel.cu)
+target_embed_source(dedisp dedisperse/dedisperse_kernel.cu)
+
+target_link_libraries(dedisp PUBLIC OpenMP::OpenMP_CXX cudawrappers::cu
+                                    cudawrappers::nvtx cudawrappers::nvrtc)
 
 target_compile_options(dedisp
                        PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math>)
@@ -23,6 +26,11 @@ set_target_properties(
              SOVERSION ${DEDISP_VERSION_MAJOR}
              PUBLIC_HEADER DedispPlan.hpp)
 
+if(${DEDISP_BACKEND_HIP})
+  get_target_property(sources ${PROJECT_NAME} SOURCES)
+  set_source_files_properties(${sources} PROPERTIES LANGUAGE HIP)
+endif()
+
 install(
   TARGETS dedisp
   LIBRARY DESTINATION lib
diff --git a/src/dedisp/DedispPlan.cpp b/src/dedisp/DedispPlan.cpp
index 128fa71f4830ef23e5afdf9cd8081f0249496705..627c5f0380519f93d927d1eb4a2363e9e125c781 100644
--- a/src/dedisp/DedispPlan.cpp
+++ b/src/dedisp/DedispPlan.cpp
@@ -6,9 +6,8 @@
 #include "common/dedisp_strings.h"
 #include "common/dedisp_types.h"
 
-#include "dedisperse/dedisperse.h"
-#include "transpose/transpose.hpp"
-#include "unpack/unpack.h"
+#include "dedisperse/dedisp_constants.cuh"
+#include "unpack/UnpackKernel.hpp"
 
 #if defined(DEDISP_BENCHMARK)
 #include "external/Stopwatch.h"
@@ -45,8 +44,6 @@ void DedispPlan::set_gulp_size(size_type gulp_size) { m_gulp_size = gulp_size; }
 void DedispPlan::execute(size_type nsamps, const byte_type *in,
                          size_type in_nbits, byte_type *out,
                          size_type out_nbits, unsigned flags) {
-  enum { BITS_PER_BYTE = 8 };
-
   // Note: The default out_stride is nsamps - m_max_delay
   dedisp_size out_bytes_per_sample =
       out_nbits / (sizeof(dedisp_byte) * BITS_PER_BYTE);
@@ -76,13 +73,6 @@ void DedispPlan::execute_guru(size_type nsamps, const byte_type *in,
                               byte_type *out, size_type out_nbits,
                               size_type out_stride, size_type first_dm_idx,
                               size_type dm_count, unsigned flags) {
-  cu::checkError();
-
-  enum {
-    BITS_PER_BYTE = 8,
-    BYTES_PER_WORD = sizeof(dedisp_word) / sizeof(dedisp_byte)
-  };
-
   dedisp_size out_bytes_per_sample =
       out_nbits / (sizeof(dedisp_byte) * BITS_PER_BYTE);
 
@@ -126,10 +116,6 @@ void DedispPlan::execute_guru(size_type nsamps, const byte_type *in,
   init_timer->Start();
 #endif
 
-  // Copy the lookup tables to constant memory on the device
-  copy_delay_table(d_delay_table, m_nchans * sizeof(dedisp_float), 0, 0);
-  copy_killmask(d_killmask, m_nchans * sizeof(dedisp_bool), 0, 0);
-
   // Compute the problem decomposition
   dedisp_size nsamps_computed = nsamps - m_max_delay;
 
@@ -219,39 +205,41 @@ void DedispPlan::execute_guru(size_type nsamps, const byte_type *in,
                                   nsamps_gulp);                   // height
     htodstream->synchronize();
 #ifdef DEDISP_BENCHMARK
-    cudaDeviceSynchronize();
     input_timer->Pause();
     preprocessing_timer->Start();
 #endif
     // Transpose the words in the input
-    transpose((dedisp_word *)d_in, nchan_words, nsamps_gulp,
-              in_buf_stride_words, nsamps_padded_gulp,
-              (dedisp_word *)d_transposed);
+    dedisp_kernel_transpose.run(d_in, nchan_words, nsamps_gulp,
+                                in_buf_stride_words, nsamps_padded_gulp,
+                                d_transposed, *executestream);
 #ifdef DEDISP_BENCHMARK
-    cudaDeviceSynchronize();
+    executestream->synchronize();
 #endif
-
     // Unpack the transposed data
-    unpack(d_transposed, nsamps_padded_gulp, nchan_words, d_unpacked, in_nbits,
-           unpacked_in_nbits);
+    dedisp_kernel_unpack.run(d_transposed, nsamps_padded_gulp, nchan_words,
+                             d_unpacked, in_nbits, unpacked_in_nbits,
+                             *executestream);
 
 #ifdef DEDISP_BENCHMARK
-    cudaDeviceSynchronize();
+    executestream->synchronize();
     preprocessing_timer->Pause();
     dedispersion_timer->Start();
 #endif
 
     // Perform direct dedispersion without scrunching
-    if (!dedisperse( // d_transposed,
+    if (!dedisp_kernel_dedisperse.run( // d_transposed,
             d_unpacked, nsamps_padded_gulp, nsamps_computed_gulp,
             unpacked_in_nbits, // in_nbits,
-            m_nchans, 1, d_dm_list, dm_count, 1, d_out, out_stride_gulp_samples,
-            out_nbits, 1, 0, 0, 0, 0)) {
+            m_nchans, 1, *d_dm_list, dm_count, 1, d_out,
+            out_stride_gulp_samples, out_nbits, 1, 0, 0, 0, 0,
+            h_delay_table.data(), h_delay_table.size() * sizeof(dedisp_float),
+            h_killmask.data(), h_killmask.size() * sizeof(dedisp_bool),
+            *htodstream, *executestream)) {
       throw_error(DEDISP_INTERNAL_GPU_ERROR);
     }
 
 #ifdef DEDISP_BENCHMARK
-    cudaDeviceSynchronize();
+    executestream->synchronize();
     dedispersion_timer->Pause();
 #endif
     // Copy output back to host memory
@@ -263,13 +251,12 @@ void DedispPlan::execute_guru(size_type nsamps, const byte_type *in,
 #endif
     dtohstream->memcpyDtoH2DAsync(out + gulp_samp_byte_idx,  // dst
                                   out_stride,                // dst stride
-                                  (byte_type *)d_out,        // src
+                                  d_out,                     // src
                                   out_stride_gulp_bytes,     // src stride
                                   nsamp_bytes_computed_gulp, // width bytes
                                   dm_count);                 // height
     dtohstream->synchronize();
 #ifdef DEDISP_BENCHMARK
-    cudaDeviceSynchronize();
     output_timer->Pause();
 #endif
 
diff --git a/src/dedisp/DedispPlan.hpp b/src/dedisp/DedispPlan.hpp
index 0cedfbdc277453db4d4d251ee19a6127f18a401b..9853195a312376b9a0b45044346e022f58c8f37f 100644
--- a/src/dedisp/DedispPlan.hpp
+++ b/src/dedisp/DedispPlan.hpp
@@ -1,7 +1,10 @@
-#ifndef H_DEDISP_PLAN_INCLUDE_GUARD
-#define H_DEDISP_PLAN_INCLUDE_GUARD
+#ifndef DEDISP_DEDISP_DEDISP_PLAN_HPP_
+#define DEDISP_DEDISP_DEDISP_PLAN_HPP_
 
 #include "GPUPlan.hpp"
+#include "dedisperse/DedispKernel.hpp"
+#include "transpose/TransposeKernel.hpp"
+#include "unpack/UnpackKernel.hpp"
 
 namespace dedisp {
 
@@ -48,8 +51,11 @@ public:
 
 private:
   dedisp_size m_gulp_size;
+  TransposeKernel dedisp_kernel_transpose;
+  DedisperseKernel dedisp_kernel_dedisperse;
+  UnpackKernel dedisp_kernel_unpack;
 };
 
 } // end namespace dedisp
 
-#endif // H_DEDISP_PLAN_INCLUDE_GUARD
\ No newline at end of file
+#endif // DEDISP_DEDISP_DEDISP_PLAN_HPP_
\ No newline at end of file
diff --git a/src/dedisp/cinterface.cpp b/src/dedisp/cinterface.cpp
index a67e2473abd8cf4d51bdd0b0efad139bc815fb6c..a9ebe6adeaf186e373b84c0435554e1f8e0138a9 100644
--- a/src/dedisp/cinterface.cpp
+++ b/src/dedisp/cinterface.cpp
@@ -45,10 +45,6 @@ dedisp_error dedisp_create_plan(dedisp_plan *plan, dedisp_size nchans,
 
   *plan = nullptr;
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   *plan = new dedisp_plan_struct();
   if (!plan) {
     throw_error(DEDISP_MEM_ALLOC_FAILED);
@@ -70,10 +66,6 @@ dedisp_error dedisp_set_gulp_size(dedisp_plan plan, dedisp_size gulp_size) {
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     static_cast<dedisp::DedispPlan *>(plan->plan_.get())
         ->set_gulp_size(gulp_size);
@@ -89,10 +81,6 @@ dedisp_size dedisp_get_gulp_size(dedisp_plan plan) {
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     return static_cast<dedisp::DedispPlan *>(plan->plan_.get())
         ->get_gulp_size();
@@ -107,10 +95,6 @@ dedisp_error dedisp_set_dm_list(dedisp_plan plan, const dedisp_float *dm_list,
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     plan->plan_->set_dm_list(dm_list, count);
   } catch (...) {
@@ -127,10 +111,6 @@ dedisp_error dedisp_generate_dm_list(dedisp_plan plan, dedisp_float dm_start,
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     plan->plan_->generate_dm_list(dm_start, dm_end, ti, tol);
   } catch (...) {
@@ -190,10 +170,6 @@ dedisp_error dedisp_set_killmask(dedisp_plan plan,
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     plan->plan_->set_killmask(killmask);
   } catch (...) {
@@ -295,10 +271,6 @@ dedisp_error dedisp_execute_guru(const dedisp_plan plan, dedisp_size nsamps,
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     static_cast<dedisp::DedispPlan *>(plan->plan_.get())
         ->execute_guru(nsamps, in, in_nbits, in_stride, out, out_nbits,
@@ -319,10 +291,6 @@ dedisp_error dedisp_execute_adv(const dedisp_plan plan, dedisp_size nsamps,
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     static_cast<dedisp::DedispPlan *>(plan->plan_.get())
         ->execute_adv(nsamps, in, in_nbits, in_stride, out, out_nbits,
@@ -343,10 +311,6 @@ dedisp_error dedisp_execute(const dedisp_plan plan, dedisp_size nsamps,
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     plan->plan_->execute(nsamps, in, in_nbits, out, out_nbits, flags);
   } catch (...) {
@@ -358,7 +322,7 @@ dedisp_error dedisp_execute(const dedisp_plan plan, dedisp_size nsamps,
 
 dedisp_error dedisp_sync(void) {
   try {
-    cu::checkError(cudaDeviceSynchronize());
+    cu::Context::synchronize();
   } catch (...) {
     throw_error(DEDISP_PRIOR_GPU_ERROR);
   }
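
With the per-call cudaGetLastError() probes removed, error handling in the C interface rests entirely on the try/catch blocks around each plan method, and dedisp_sync() now goes through cu::Context rather than the runtime API. A minimal caller-side sketch of how these entry points are meant to be used; the plan parameters are illustrative values, not taken from this patch, and the header name assumes the library's usual public dedisp.h:

    #include <dedisp.h> // assumption: the public C header installed by this library

    int run_once(const dedisp_byte *in, dedisp_byte *out, dedisp_size nsamps) {
      dedisp_plan plan = nullptr;
      // 1024 channels, 64 us sampling, 1500 MHz top frequency, -0.3 MHz/channel
      if (dedisp_create_plan(&plan, 1024, 64e-6f, 1500.0f, -0.3f) != DEDISP_NO_ERROR)
        return 1;
      if (dedisp_generate_dm_list(plan, 0.0f, 100.0f, 40.0f, 1.25f) != DEDISP_NO_ERROR)
        return 1;
      if (dedisp_execute(plan, nsamps, in, 8, out, 32, DEDISP_USE_DEFAULT) != DEDISP_NO_ERROR)
        return 1;
      dedisp_sync(); // cu::Context::synchronize() under the hood
      dedisp_destroy_plan(plan);
      return 0;
    }
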
diff --git a/src/dedisp/dedisperse/DedispKernel.cu b/src/dedisp/dedisperse/DedispKernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..19a2bfafbb1730846c972965233e67cf7d75f1dd
--- /dev/null
+++ b/src/dedisp/dedisperse/DedispKernel.cu
@@ -0,0 +1,97 @@
+#include "DedispKernel.hpp"
+#include "GPUKernel.hpp"
+#include "dedisperse_kernel.cu"
+
+#include <cudawrappers/cu.hpp>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <vector>
+
+/*
+ * dedisperse routine
+ */
+bool DedisperseKernel::run(
+    const dedisp_word *d_in, dedisp_size in_stride, dedisp_size nsamps,
+    dedisp_size in_nbits, dedisp_size nchans, dedisp_size chan_stride,
+    const dedisp_float *d_dm_list, dedisp_size dm_count, dedisp_size dm_stride,
+    dedisp_byte *d_out, dedisp_size out_stride, dedisp_size out_nbits,
+    dedisp_size batch_size, dedisp_size batch_in_stride,
+    dedisp_size batch_dm_stride, dedisp_size batch_chan_stride,
+    dedisp_size batch_out_stride, const void *delay_table,
+    const size_t delay_table_size, const void *killmask,
+    const size_t killmask_size, cu::Stream &htodstream,
+    cu::Stream &executestream) {
+  // Define thread decomposition
+  // Note: Block dimensions x and y represent time samples and DMs respectively
+  dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
+  // Note: Grid dimension x represents time samples. Dimension y represents
+  //         DMs and batch jobs flattened together.
+
+  // Divide and round up
+  dedisp_size nsamp_blocks =
+      (nsamps - 1) / ((dedisp_size)DEDISP_SAMPS_PER_THREAD * block.x) + 1;
+  dedisp_size ndm_blocks = (dm_count - 1) / (dedisp_size)block.y + 1;
+
+  // Constrain the grid size to the maximum allowed
+  // TODO: Consider cropping the batch size dimension instead and looping over
+  // it inside the kernel
+  ndm_blocks = min((unsigned int)ndm_blocks,
+                   (unsigned int)(MAX_CUDA_GRID_SIZE_Y / batch_size));
+
+  // Note: We combine the DM and batch dimensions into one
+  dim3 grid(nsamp_blocks, ndm_blocks * batch_size);
+
+  // Divide and round up
+  dedisp_size nsamps_reduced = (nsamps - 1) / DEDISP_SAMPS_PER_THREAD + 1;
+
+  const std::vector<const void *> parameters = {&d_in,
+                                                &nsamps,
+                                                &nsamps_reduced,
+                                                &nsamp_blocks,
+                                                &in_stride,
+                                                &dm_count,
+                                                &dm_stride,
+                                                &ndm_blocks,
+                                                &nchans,
+                                                &chan_stride,
+                                                &d_out,
+                                                &out_nbits,
+                                                &out_stride,
+                                                &d_dm_list,
+                                                &batch_in_stride,
+                                                &batch_dm_stride,
+                                                &batch_chan_stride,
+                                                &batch_out_stride};
+
+  std::ostringstream ss;
+  ss << "-DPARAM_IN_NBITS=" << in_nbits;
+
+  CompiledKernelInfo kernel;
+  bool did_recompile;
+
+  std::tie(kernel, did_recompile) = compile({ss.str()});
+  assertCompiled(kernel);
+
+  // Copy delay table and killmask
+  if (did_recompile) {
+    if (delay_table) {
+      htodstream.memcpyHtoDAsync(kernel.module->getGlobal("c_delay_table"),
+                                 delay_table, delay_table_size);
+      htodstream.synchronize();
+    }
+
+    if (killmask) {
+      htodstream.memcpyHtoDAsync(kernel.module->getGlobal("c_killmask"),
+                                 killmask, killmask_size);
+      htodstream.synchronize();
+    }
+  }
+
+  executestream.launchKernel(*(kernel.function), grid.x, grid.y, grid.z,
+                             block.x, block.y, block.z, 0, parameters);
+
+  return true;
+}
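
For reference, a worked instance of the launch geometry computed in run() above, using the tuning constants from dedisp_constants.cuh; the sample and DM counts are made-up numbers, only meant to make the ceil-divisions concrete:

    // DEDISP_SAMPS_PER_THREAD = 2, BLOCK_DIM_X = 8, BLOCK_DIM_Y = 256 / 8 = 32
    // nsamps = 65536:  nsamp_blocks   = (65536 - 1) / (2 * 8) + 1 = 4096
    // dm_count = 100:  ndm_blocks     = (100 - 1) / 32 + 1        = 4
    // batch_size = 1:  ndm_blocks stays 4 (the cap 65535 / 1 is far larger)
    // => block = dim3(8, 32) (256 threads), grid = dim3(4096, 4 * 1)
    // nsamps_reduced = (65536 - 1) / 2 + 1 = 32768 groups of 2 samples per thread
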
diff --git a/src/dedisp/dedisperse/DedispKernel.hpp b/src/dedisp/dedisperse/DedispKernel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..adb248b2a6e00bfb0caee9496a0298d808c9190e
--- /dev/null
+++ b/src/dedisp/dedisperse/DedispKernel.hpp
@@ -0,0 +1,47 @@
+#ifndef DEDISP_DEDISP_DEDISPERSE_DEDISP_KERNEL_HPP_
+#define DEDISP_DEDISP_DEDISPERSE_DEDISP_KERNEL_HPP_
+
+#include <string>
+
+#include <cudawrappers/cu.hpp>
+
+#include "common/dedisp_types.h"
+#include <GPUKernel.hpp>
+
+extern const char _binary_src_dedisp_dedisperse_dedisperse_kernel_cu_start,
+    _binary_src_dedisp_dedisperse_dedisperse_kernel_cu_end;
+
+class DedisperseKernel : public GPUKernel {
+public:
+  DedisperseKernel()
+      : GPUKernel(
+            "dedisperse_kernel.cu", "dedisperse_kernel",
+            std::string(
+                reinterpret_cast<const char *>(
+                    &_binary_src_dedisp_dedisperse_dedisperse_kernel_cu_start),
+                reinterpret_cast<const char *>(
+                    &_binary_src_dedisp_dedisperse_dedisperse_kernel_cu_end))) {
+  }
+
+  bool run(const dedisp_word *d_in, dedisp_size in_stride, dedisp_size nsamps,
+           dedisp_size in_nbits, dedisp_size nchans, dedisp_size chan_stride,
+           const dedisp_float *d_dm_list, dedisp_size dm_count,
+           dedisp_size dm_stride, dedisp_byte *d_out, dedisp_size out_stride,
+           dedisp_size out_nbits, dedisp_size batch_size,
+           dedisp_size batch_in_stride, dedisp_size batch_dm_stride,
+           dedisp_size batch_chan_stride, dedisp_size batch_out_stride,
+           const void *delay_table, const size_t delay_table_size,
+           const void *killmask, const size_t killmask_size,
+           cu::Stream &h2dstream, cu::Stream &stream);
+
+private:
+  const void *delay_table_src_host_ = nullptr;
+  size_t delay_table_count_ = 0;
+  size_t delay_table_offset_ = 0;
+
+  const void *killmask_src_host_ = nullptr;
+  size_t killmask_count_ = 0;
+  size_t killmask_offset_ = 0;
+};
+
+#endif // DEDISP_DEDISP_DEDISPERSE_DEDISP_KERNEL_HPP_
\ No newline at end of file
diff --git a/src/dedisp/dedisperse/dedisp_constants.cuh b/src/dedisp/dedisperse/dedisp_constants.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..8f928843820537e6645bcf64f35045e0a81808e0
--- /dev/null
+++ b/src/dedisp/dedisperse/dedisp_constants.cuh
@@ -0,0 +1,23 @@
+#ifndef DEDISP_DEDISP_DEDISPERSE_CONSTANTS_CUH_
+#define DEDISP_DEDISP_DEDISPERSE_CONSTANTS_CUH_
+
+#define DEDISP_DEFAULT_GULP_SIZE 65536u // 131072
+
+// TODO: Make sure this doesn't limit GPU constant memory
+//         available to users.
+#define DEDISP_MAX_NCHANS 8192u
+
+// Kernel tuning parameters
+#define DEDISP_BLOCK_SIZE 256u
+#define DEDISP_BLOCK_SAMPS 8u
+#define DEDISP_SAMPS_PER_THREAD 2u // 4 is better for Fermi?
+
+#define BITS_PER_BYTE 8
+#define BYTES_PER_WORD (sizeof(dedisp_word) / sizeof(unsigned char))
+#define MAX_CUDA_GRID_SIZE_X 65535u
+#define MAX_CUDA_GRID_SIZE_Y 65535u
+
+constexpr unsigned int BLOCK_DIM_X = DEDISP_BLOCK_SAMPS;
+constexpr unsigned int BLOCK_DIM_Y = (DEDISP_BLOCK_SIZE / DEDISP_BLOCK_SAMPS);
+
+#endif // DEDISP_DEDISP_DEDISPERSE_CONSTANTS_CUH_
\ No newline at end of file
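
The TODO above about constant memory can be made concrete: dedisperse_kernel.cu keeps two __constant__ arrays of DEDISP_MAX_NCHANS entries (c_delay_table and c_killmask). Assuming the usual 4-byte dedisp_float and 4-byte dedisp_bool from the dedisp type definitions, the footprint works out as:

    // c_delay_table: 8192 entries * 4 B = 32 KiB
    // c_killmask:    8192 entries * 4 B = 32 KiB
    // total:                              64 KiB
    // i.e. essentially the full 64 KiB constant-memory bank of current NVIDIA
    // GPUs, which is why DEDISP_MAX_NCHANS cannot simply be raised.
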
diff --git a/src/dedisp/dedisperse/dedisperse.cu b/src/dedisp/dedisperse/dedisperse.cu
deleted file mode 100644
index 09da8df6186a4aa18425649472be12debf362f8a..0000000000000000000000000000000000000000
--- a/src/dedisp/dedisperse/dedisperse.cu
+++ /dev/null
@@ -1,123 +0,0 @@
-#include "dedisperse.h"
-#include "dedisperse_kernel.cuh"
-
-// Kernel tuning parameters
-#define DEDISP_BLOCK_SIZE 256
-#define DEDISP_BLOCK_SAMPS 8
-#define DEDISP_SAMPS_PER_THREAD 2 // 4 is better for Fermi?
-
-/*
- * Helper functions
- */
-void copy_delay_table(const void *src, size_t count, size_t offset,
-                      cudaStream_t stream) {
-  cudaMemcpyToSymbolAsync(c_delay_table, src, count, offset,
-                          cudaMemcpyDeviceToDevice, stream);
-  cudaDeviceSynchronize();
-  cudaError_t error = cudaGetLastError();
-  if (error != cudaSuccess) {
-    throw_error(DEDISP_MEM_COPY_FAILED);
-  }
-}
-
-void copy_killmask(const void *src, size_t count, size_t offset,
-                   cudaStream_t stream) {
-  cudaMemcpyToSymbolAsync(c_killmask, src, count, offset,
-                          cudaMemcpyDeviceToDevice, stream);
-  cudaDeviceSynchronize();
-  cudaError_t error = cudaGetLastError();
-  if (error != cudaSuccess) {
-    throw_error(DEDISP_MEM_COPY_FAILED);
-  }
-}
-
-/*
- * dedisperse routine
- */
-bool dedisperse(const dedisp_word *d_in, dedisp_size in_stride,
-                dedisp_size nsamps, dedisp_size in_nbits, dedisp_size nchans,
-                dedisp_size chan_stride, const dedisp_float *d_dm_list,
-                dedisp_size dm_count, dedisp_size dm_stride, dedisp_byte *d_out,
-                dedisp_size out_stride, dedisp_size out_nbits,
-                dedisp_size batch_size, dedisp_size batch_in_stride,
-                dedisp_size batch_dm_stride, dedisp_size batch_chan_stride,
-                dedisp_size batch_out_stride) {
-  enum {
-    BITS_PER_BYTE = 8,
-    BYTES_PER_WORD = sizeof(dedisp_word) / sizeof(dedisp_byte),
-    BLOCK_DIM_X = DEDISP_BLOCK_SAMPS,
-    BLOCK_DIM_Y = DEDISP_BLOCK_SIZE / DEDISP_BLOCK_SAMPS,
-    MAX_CUDA_GRID_SIZE_X = 65535,
-    MAX_CUDA_GRID_SIZE_Y = 65535,
-  };
-
-  // Define thread decomposition
-  // Note: Block dimensions x and y represent time samples and DMs respectively
-  dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
-  // Note: Grid dimension x represents time samples. Dimension y represents
-  //         DMs and batch jobs flattened together.
-
-  // Divide and round up
-  dedisp_size nsamp_blocks =
-      (nsamps - 1) / ((dedisp_size)DEDISP_SAMPS_PER_THREAD * block.x) + 1;
-  dedisp_size ndm_blocks = (dm_count - 1) / (dedisp_size)block.y + 1;
-
-  // Constrain the grid size to the maximum allowed
-  // TODO: Consider cropping the batch size dimension instead and looping over
-  // it
-  //         inside the kernel
-  ndm_blocks = min((unsigned int)ndm_blocks,
-                   (unsigned int)(MAX_CUDA_GRID_SIZE_Y / batch_size));
-
-  // Note: We combine the DM and batch dimensions into one
-  dim3 grid(nsamp_blocks, ndm_blocks * batch_size);
-
-  // Divide and round up
-  dedisp_size nsamps_reduced = (nsamps - 1) / DEDISP_SAMPS_PER_THREAD + 1;
-
-  cudaStream_t stream = 0;
-
-  // Execute the kernel
-#define DEDISP_CALL_KERNEL(NBITS)                                              \
-  dedisperse_kernel<NBITS, DEDISP_SAMPS_PER_THREAD, BLOCK_DIM_X, BLOCK_DIM_Y>  \
-      <<<grid, block, 0, stream>>>(                                            \
-          d_in, nsamps, nsamps_reduced, nsamp_blocks, in_stride, dm_count,     \
-          dm_stride, ndm_blocks, nchans, chan_stride, d_out, out_nbits,        \
-          out_stride, d_dm_list, batch_in_stride, batch_dm_stride,             \
-          batch_chan_stride, batch_out_stride)
-  // Note: Here we dispatch dynamically on nbits for supported values
-  switch (in_nbits) {
-  case 1:
-    DEDISP_CALL_KERNEL(1);
-    break;
-  case 2:
-    DEDISP_CALL_KERNEL(2);
-    break;
-  case 4:
-    DEDISP_CALL_KERNEL(4);
-    break;
-  case 8:
-    DEDISP_CALL_KERNEL(8);
-    break;
-  case 16:
-    DEDISP_CALL_KERNEL(16);
-    break;
-  case 32:
-    DEDISP_CALL_KERNEL(32);
-    break;
-  default: /* should never be reached */
-    break;
-  }
-#undef DEDISP_CALL_KERNEL
-
-  // Check for kernel errors
-#ifdef DEDISP_DEBUG
-  cudaDeviceSynchronize();
-  cudaError_t cuda_error = cudaGetLastError();
-  if (cuda_error != cudaSuccess) {
-    return false;
-  }
-#endif // DEDISP_DEBUG
-
-  return true;
-}
diff --git a/src/dedisp/dedisperse/dedisperse.h b/src/dedisp/dedisperse/dedisperse.h
deleted file mode 100644
index 30e2ddd574260cdd8ce5237bec933a1d67edbf7f..0000000000000000000000000000000000000000
--- a/src/dedisp/dedisperse/dedisperse.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#include "common/dedisp_error.hpp"
-#include "common/dedisp_types.h"
-
-#define DEDISP_DEFAULT_GULP_SIZE 65536 // 131072
-
-// TODO: Make sure this doesn't limit GPU constant memory
-//         available to users.
-#define DEDISP_MAX_NCHANS 8192
-
-// Kernel tuning parameters
-#define DEDISP_SAMPS_PER_THREAD 2 // 4 is better for Fermi?
-
-void copy_delay_table(const void *src, size_t count, size_t offset,
-                      cudaStream_t stream);
-
-void copy_killmask(const void *src, size_t count, size_t offset,
-                   cudaStream_t stream);
-
-bool dedisperse(const dedisp_word *d_in, dedisp_size in_stride,
-                dedisp_size nsamps, dedisp_size in_nbits, dedisp_size nchans,
-                dedisp_size chan_stride, const dedisp_float *d_dm_list,
-                dedisp_size dm_count, dedisp_size dm_stride, dedisp_byte *d_out,
-                dedisp_size out_stride, dedisp_size out_nbits,
-                dedisp_size batch_size, dedisp_size batch_in_stride,
-                dedisp_size batch_dm_stride, dedisp_size batch_chan_stride,
-                dedisp_size batch_out_stride);
\ No newline at end of file
diff --git a/src/dedisp/dedisperse/dedisperse_kernel.cuh b/src/dedisp/dedisperse/dedisperse_kernel.cu
similarity index 85%
rename from src/dedisp/dedisperse/dedisperse_kernel.cuh
rename to src/dedisp/dedisperse/dedisperse_kernel.cu
index 1dfbd13af0ffbee10e7163522f2db28ab0b5df2e..cf7256bd117827e39e597b862f78babf66d42b80 100644
--- a/src/dedisp/dedisperse/dedisperse_kernel.cuh
+++ b/src/dedisp/dedisperse/dedisperse_kernel.cu
@@ -1,3 +1,6 @@
+#include "common/dedisp_types.h"
+#include "dedisp_constants.cuh"
+
 // Constant reference for input data
 __constant__ dedisp_float c_delay_table[DEDISP_MAX_NCHANS];
 __constant__ dedisp_bool c_killmask[DEDISP_MAX_NCHANS];
@@ -25,7 +28,6 @@ inline __host__ __device__ T extract_subword(T value, int idx) {
 
 template <int IN_NBITS, typename T, typename SumType>
 inline __host__ __device__ T scale_output(SumType sum, dedisp_size nchans) {
-  enum { BITS_PER_BYTE = 8 };
   // This emulates dedisperse_all, but is specific to 8-bit output
   float in_range = max_value<IN_NBITS>::value;
   // Note: We use floats when out_nbits == 32, and scale to a range of [0:1]
@@ -59,7 +61,7 @@ inline __host__ __device__ void set_out_val(dedisp_byte *d_out, dedisp_size idx,
 //       E.g., Words bracketed: (t0c0,t0c1,t0c2,t0c3), (t1c0,t1c1,t1c2,t1c3),...
 // Note: out_stride should be in units of samples
 template <int IN_NBITS, int SAMPS_PER_THREAD, int BLOCK_DIM_X, int BLOCK_DIM_Y>
-__global__ void dedisperse_kernel(
+__device__ inline void dedisperse_kernel_impl(
     const dedisp_word *d_in, dedisp_size nsamps, dedisp_size nsamps_reduced,
     dedisp_size nsamp_blocks, dedisp_size stride, dedisp_size dm_count,
     dedisp_size dm_stride, dedisp_size ndm_blocks, dedisp_size nchans,
@@ -68,10 +70,7 @@ __global__ void dedisperse_kernel(
     dedisp_size batch_in_stride, dedisp_size batch_dm_stride,
     dedisp_size batch_chan_stride, dedisp_size batch_out_stride) {
   // Compute compile-time constants
-  enum {
-    BITS_PER_BYTE = 8,
-    CHANS_PER_WORD = sizeof(dedisp_word) * BITS_PER_BYTE / IN_NBITS
-  };
+  enum { CHANS_PER_WORD = sizeof(dedisp_word) * BITS_PER_BYTE / IN_NBITS };
 
   // Compute the thread decomposition
   dedisp_size samp_block = blockIdx.x;
@@ -170,4 +169,25 @@ __global__ void dedisperse_kernel(
     } // End of sample loop
 
   } // End of DM loop
+}
+
+extern "C" {
+#ifndef PARAM_IN_NBITS
+#define PARAM_IN_NBITS 16
+#endif
+
+__global__ void dedisperse_kernel(
+    const dedisp_word *d_in, dedisp_size nsamps, dedisp_size nsamps_reduced,
+    dedisp_size nsamp_blocks, dedisp_size stride, dedisp_size dm_count,
+    dedisp_size dm_stride, dedisp_size ndm_blocks, dedisp_size nchans,
+    dedisp_size chan_stride, dedisp_byte *d_out, dedisp_size out_nbits,
+    dedisp_size out_stride, const dedisp_float *d_dm_list,
+    dedisp_size batch_in_stride, dedisp_size batch_dm_stride,
+    dedisp_size batch_chan_stride, dedisp_size batch_out_stride) {
+  dedisperse_kernel_impl<PARAM_IN_NBITS, DEDISP_SAMPS_PER_THREAD, BLOCK_DIM_X,
+                         BLOCK_DIM_Y>(
+      d_in, nsamps, nsamps_reduced, nsamp_blocks, stride, dm_count, dm_stride,
+      ndm_blocks, nchans, chan_stride, d_out, out_nbits, out_stride, d_dm_list,
+      batch_in_stride, batch_dm_stride, batch_chan_stride, batch_out_stride);
+}
 }
\ No newline at end of file
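
Compared with the deleted dedisperse.cu, which instantiated dedisperse_kernel<NBITS, ...> for every supported bit depth and picked one at run time with a switch, this file now exposes a single extern "C" entry point whose bit depth is baked in through PARAM_IN_NBITS at run-time compilation. A sketch of the correspondence, using the compile()/CompiledKernelInfo interface that DedispKernel.cu calls (how compile() drives NVRTC/hiprtc is defined in GPUKernel.hpp and is not shown in this excerpt):

    // Old path (compile-time dispatch):
    //   switch (in_nbits) { case 8: DEDISP_CALL_KERNEL(8); break; /* ... */ }
    // New path (run-time specialization):
    std::ostringstream ss;
    ss << "-DPARAM_IN_NBITS=" << in_nbits;              // e.g. "-DPARAM_IN_NBITS=8"
    std::tie(kernel, did_recompile) = compile({ss.str()});
    // kernel.function now refers to dedisperse_kernel compiled as
    //   dedisperse_kernel_impl<8, DEDISP_SAMPS_PER_THREAD, BLOCK_DIM_X, BLOCK_DIM_Y>
    // and the extern "C" wrapper keeps that name unmangled for module lookup.
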
diff --git a/src/dedisp/transpose/TransposeKernel.cu b/src/dedisp/transpose/TransposeKernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..964a8e87c2cee73a8f6079cf4d6ed1e6db00850d
--- /dev/null
+++ b/src/dedisp/transpose/TransposeKernel.cu
@@ -0,0 +1,146 @@
+/*
+ * This file contains a CUDA implementation of the array transpose operation.
+ *
+ * Parts of this file are based on the transpose implementation in the
+ * NVIDIA CUDA SDK.
+ */
+
+#include <cmath>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include <cudawrappers/cu.hpp>
+
+#include "GPUKernel.hpp"
+#include "TransposeKernel.hpp"
+#include "transpose_kernel.cu"
+
+namespace cuda_specs {
+enum { MAX_GRID_DIMENSION = 65535 };
+}
+
+template <typename U> inline U min(const U &a, const U &b) {
+  return a < b ? a : b;
+}
+
+template <typename U> inline U round_up_pow2(const U &a) {
+  U r = a - 1;
+  for (unsigned long i = 1; i <= sizeof(U) * 8 / 2; i <<= 1)
+    r |= r >> i;
+  return r + 1;
+}
+
+template <typename U> inline U round_down_pow2(const U &a) {
+  return round_up_pow2(a + 1) / 2;
+}
+
+inline unsigned int log2(unsigned int a) {
+  unsigned int r;
+  unsigned int shift;
+  r = (a > 0xFFFF) << 4;
+  a >>= r;
+  shift = (a > 0xFF) << 3;
+  a >>= shift;
+  r |= shift;
+  shift = (a > 0xF) << 2;
+  a >>= shift;
+  r |= shift;
+  shift = (a > 0x3) << 1;
+  a >>= shift;
+  r |= shift;
+  r |= (a >> 1);
+  return r;
+}
+
+inline unsigned long log2(unsigned long a) {
+  unsigned long r;
+  unsigned long shift;
+  r = (a > 0xFFFFFFFF) << 5;
+  a >>= r;
+  shift = (a > 0xFFFF) << 4;
+  a >>= shift;
+  r |= shift;
+  shift = (a > 0xFF) << 3;
+  a >>= shift;
+  r |= shift;
+  shift = (a > 0xF) << 2;
+  a >>= shift;
+  r |= shift;
+  shift = (a > 0x3) << 1;
+  a >>= shift;
+  r |= shift;
+  r |= (a >> 1);
+  return r;
+}
+
+void TransposeKernel::run(cu::DeviceMemory &in, size_t width, size_t height,
+                          size_t in_stride, size_t out_stride,
+                          cu::DeviceMemory &out, cu::Stream &stream) {
+  // Specify thread decomposition (uses up-rounded divisions)
+  dim3 tot_block_count((width - 1) / TILE_DIM + 1, (height - 1) / TILE_DIM + 1);
+
+  size_t max_grid_dim = round_down_pow2((size_t)cuda_specs::MAX_GRID_DIMENSION);
+
+  // Partition the grid into chunks that the GPU can accept at once
+  for (size_t block_y_offset = 0; block_y_offset < tot_block_count.y;
+       block_y_offset += max_grid_dim) {
+    dim3 block_count;
+
+    // Handle the possibly incomplete final grid
+    block_count.y = min(max_grid_dim, tot_block_count.y - block_y_offset);
+
+    for (size_t block_x_offset = 0; block_x_offset < tot_block_count.x;
+         block_x_offset += max_grid_dim) {
+      // Handle the possibly incomplete final grid
+      block_count.x = min(max_grid_dim, tot_block_count.x - block_x_offset);
+
+      // Compute the chunked parameters
+      size_t x_offset = block_x_offset * TILE_DIM;
+      size_t y_offset = block_y_offset * TILE_DIM;
+      size_t in_offset = x_offset + y_offset * in_stride;
+      size_t out_offset = y_offset + x_offset * out_stride;
+      size_t w = min(max_grid_dim * TILE_DIM, width - x_offset);
+      size_t h = min(max_grid_dim * TILE_DIM, height - y_offset);
+
+      dim3 block(TILE_DIM, BLOCK_ROWS);
+
+      // TODO:
+      // Unfortunately there are cases where rounding to a power of 2
+      // becomes detrimental to performance. Could work out a heuristic.
+      bool round_grid_to_pow2 = true;
+
+      unsigned int *in_ptr = reinterpret_cast<unsigned int *>(
+          static_cast<dedisp_word *>(in) + (in_offset));
+      unsigned int *out_ptr = reinterpret_cast<unsigned int *>(
+          static_cast<dedisp_word *>(out) + (out_offset));
+
+      // Dispatch on grid-rounding
+      const dim3 grid = round_grid_to_pow2 ? dim3(round_up_pow2(block_count.x),
+                                                  round_up_pow2(block_count.y))
+                                           : dim3(block_count.x, block_count.y);
+
+      const auto grid_y_log2 = log2(grid.y);
+      const std::vector<const void *> parameters = {&in_ptr,
+                                                    &w,
+                                                    &h,
+                                                    &in_stride,
+                                                    &out_stride,
+                                                    &out_ptr,
+                                                    &block_count.x,
+                                                    &block_count.y,
+                                                    &grid_y_log2};
+      std::ostringstream ss;
+      ss << "-DPARAM_GRID_IS_POW2=" << round_grid_to_pow2;
+
+      CompiledKernelInfo kernel;
+      bool did_recompile;
+
+      std::tie(kernel, did_recompile) = compile({ss.str()});
+      assertCompiled(kernel);
+
+      // Run the CUDA kernel
+      stream.launchKernel(*(kernel.function), grid.x, grid.y, grid.z, block.x,
+                          block.y, block.z, 0, parameters);
+    }
+  }
+}
\ No newline at end of file
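
The helper functions at the top of this file decide how large a transpose can be launched in one go, so a quick sanity check is useful. A hypothetical test appended to this translation unit (not part of the patch) would look like:

    #include <cassert>
    // Uses the file-local round_up_pow2 / round_down_pow2 / log2 defined above.
    static void check_transpose_helpers() {
      assert(round_up_pow2(33u) == 64u);
      assert(round_down_pow2((size_t)65535) == (size_t)32768); // = max_grid_dim
      assert(log2(32768u) == 15u); // the bit-twiddling log2, not the <cmath> one
      // One launch chunk therefore covers at most 32768 * TILE_DIM = 1048576
      // rows or columns; larger transposes fall back to the offset loops above.
    }
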
diff --git a/src/dedisp/transpose/TransposeKernel.hpp b/src/dedisp/transpose/TransposeKernel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3733508cf606f19a041b20684ea9f6274148d06b
--- /dev/null
+++ b/src/dedisp/transpose/TransposeKernel.hpp
@@ -0,0 +1,27 @@
+#ifndef DEDISP_DEDISP_DEDISPERSE_TRANSPOSE_KERNEL_H_
+#define DEDISP_DEDISP_DEDISPERSE_TRANSPOSE_KERNEL_H_
+
+#include <cudawrappers/cu.hpp>
+
+#include "common/dedisp_types.h"
+#include <GPUKernel.hpp>
+
+extern const char _binary_src_dedisp_transpose_transpose_kernel_cu_start,
+    _binary_src_dedisp_transpose_transpose_kernel_cu_end;
+
+class TransposeKernel : public GPUKernel {
+public:
+  TransposeKernel()
+      : GPUKernel(
+            "transpose_kernel.cu", "transpose_kernel_grid",
+            std::string(
+                reinterpret_cast<const char *>(
+                    &_binary_src_dedisp_transpose_transpose_kernel_cu_start),
+                reinterpret_cast<const char *>(
+                    &_binary_src_dedisp_transpose_transpose_kernel_cu_end))) {}
+
+  void run(cu::DeviceMemory &in, size_t width, size_t height, size_t in_stride,
+           size_t out_stride, cu::DeviceMemory &out, cu::Stream &stream);
+};
+
+#endif // DEDISP_DEDISP_DEDISPERSE_TRANSPOSE_KERNEL_H_
\ No newline at end of file
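
Since TransposeKernel now takes cudawrappers objects instead of raw pointers and a cudaStream_t, a hedged usage sketch follows, assuming a cu::Context is already current (as GPUPlan arranges) and with made-up dimensions:

    // 4096 words wide, 8192 samples high; strides equal to the respective widths.
    const size_t width = 4096, height = 8192;
    cu::Stream stream;
    cu::DeviceMemory d_in(width * height * sizeof(dedisp_word));
    cu::DeviceMemory d_out(width * height * sizeof(dedisp_word));
    TransposeKernel transpose;
    transpose.run(d_in, width, height, /*in_stride=*/width,
                  /*out_stride=*/height, d_out, stream);
    stream.synchronize();
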
diff --git a/src/dedisp/transpose/transpose.cu b/src/dedisp/transpose/transpose.cu
deleted file mode 100644
index ee37762610a1d09aa96e250476e5bd4433832417..0000000000000000000000000000000000000000
--- a/src/dedisp/transpose/transpose.cu
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * This header files contains the implementation of the Transpose<T> class
- */
-#include "transpose.cuh"
-
-/*
- * The generic Transpose<T>::transpose method
- */
-template <typename T>
-void transpose(const T *in, size_t width, size_t height, size_t in_stride,
-               size_t out_stride, T *out, cudaStream_t stream = 0) {
-  Transpose<T> transpose;
-  transpose.transpose(in, width, height, in_stride, out_stride, out, stream);
-}
-
-/*
- * Explicit template instantations
- */
-template void transpose<unsigned int>(const unsigned int *, size_t, size_t,
-                                      size_t, size_t, unsigned int *,
-                                      cudaStream_t);
\ No newline at end of file
diff --git a/src/dedisp/transpose/transpose.cuh b/src/dedisp/transpose/transpose.cuh
deleted file mode 100644
index 15a2a797b3051431dc703918d629ee718660b9ef..0000000000000000000000000000000000000000
--- a/src/dedisp/transpose/transpose.cuh
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * This file contains a CUDA implementation of the array transpose operation.
- *
- * Parts of this file are based on the transpose implementation in the
- * NVIDIA CUDA SDK.
- */
-
-#pragma once
-
-#include <stdexcept>
-
-#include "transpose_kernel.cuh"
-
-namespace cuda_specs {
-enum { MAX_GRID_DIMENSION = 65535 };
-}
-
-template <typename T> struct Transpose {
-  Transpose() {}
-
-  void transpose(const T *in, size_t width, size_t height, size_t in_stride,
-                 size_t out_stride, T *out, cudaStream_t stream = 0);
-  void transpose(const T *in, size_t width, size_t height, T *out,
-                 cudaStream_t stream = 0) {
-    transpose(in, width, height, width, height, out, stream);
-  }
-
-private:
-  // TODO: These should probably be imported from somewhere else
-  template <typename U> inline U min(const U &a, const U &b) {
-    return a < b ? a : b;
-  }
-  template <typename U> inline U round_up_pow2(const U &a) {
-    U r = a - 1;
-    for (unsigned long i = 1; i <= sizeof(U) * 8 / 2; i <<= 1)
-      r |= r >> i;
-    return r + 1;
-  }
-  template <typename U> inline U round_down_pow2(const U &a) {
-    return round_up_pow2(a + 1) / 2;
-  }
-  inline unsigned int log2(unsigned int a) {
-    unsigned int r;
-    unsigned int shift;
-    r = (a > 0xFFFF) << 4;
-    a >>= r;
-    shift = (a > 0xFF) << 3;
-    a >>= shift;
-    r |= shift;
-    shift = (a > 0xF) << 2;
-    a >>= shift;
-    r |= shift;
-    shift = (a > 0x3) << 1;
-    a >>= shift;
-    r |= shift;
-    r |= (a >> 1);
-    return r;
-  }
-  inline unsigned long log2(unsigned long a) {
-    unsigned long r;
-    unsigned long shift;
-    r = (a > 0xFFFFFFFF) << 5;
-    a >>= r;
-    shift = (a > 0xFFFF) << 4;
-    a >>= shift;
-    r |= shift;
-    shift = (a > 0xFF) << 3;
-    a >>= shift;
-    r |= shift;
-    shift = (a > 0xF) << 2;
-    a >>= shift;
-    r |= shift;
-    shift = (a > 0x3) << 1;
-    a >>= shift;
-    r |= shift;
-    r |= (a >> 1);
-    return r;
-  }
-};
-
-template <typename T>
-void Transpose<T>::transpose(const T *in, size_t width, size_t height,
-                             size_t in_stride, size_t out_stride, T *out,
-                             cudaStream_t stream) {
-  // Parameter checks
-  if (0 == width || 0 == height) {
-    return;
-  }
-  if (0 == in) {
-    throw std::runtime_error("Transpose: in is NULL");
-  }
-  if (0 == out) {
-    throw std::runtime_error("Transpose: out is NULL");
-  }
-  if (width > in_stride) {
-    throw std::runtime_error("Transpose: width exceeds in_stride");
-  }
-  if (height > out_stride) {
-    throw std::runtime_error("Transpose: height exceeds out_stride");
-  }
-
-  // Specify thread decomposition (uses up-rounded divisions)
-  dim3 tot_block_count((width - 1) / TILE_DIM + 1, (height - 1) / TILE_DIM + 1);
-
-  size_t max_grid_dim = round_down_pow2((size_t)cuda_specs::MAX_GRID_DIMENSION);
-
-  // Partition the grid into chunks that the GPU can accept at once
-  for (size_t block_y_offset = 0; block_y_offset < tot_block_count.y;
-       block_y_offset += max_grid_dim) {
-    dim3 block_count;
-
-    // Handle the possibly incomplete final grid
-    block_count.y = min(max_grid_dim, tot_block_count.y - block_y_offset);
-
-    for (size_t block_x_offset = 0; block_x_offset < tot_block_count.x;
-         block_x_offset += max_grid_dim) {
-      // Handle the possibly incomplete final grid
-      block_count.x = min(max_grid_dim, tot_block_count.x - block_x_offset);
-
-      // Compute the chunked parameters
-      size_t x_offset = block_x_offset * TILE_DIM;
-      size_t y_offset = block_y_offset * TILE_DIM;
-      size_t in_offset = x_offset + y_offset * in_stride;
-      size_t out_offset = y_offset + x_offset * out_stride;
-      size_t w = min(max_grid_dim * TILE_DIM, width - x_offset);
-      size_t h = min(max_grid_dim * TILE_DIM, height - y_offset);
-
-      dim3 block(TILE_DIM, BLOCK_ROWS);
-
-      // TODO:
-      // Unfortunately there are cases where rounding to a power of 2
-      // becomes detrimental to performance. Could work out a heuristic.
-      bool round_grid_to_pow2 = true;
-
-      // Dispatch on grid-rounding
-      if (round_grid_to_pow2) {
-        dim3 grid(round_up_pow2(block_count.x), round_up_pow2(block_count.y));
-
-        // Run the CUDA kernel
-        transpose_kernel<true, T><<<grid, block, 0, stream>>>(
-            in + in_offset, w, h, in_stride, out_stride, out + out_offset,
-            block_count.x, block_count.y, log2(grid.y));
-      } else {
-        dim3 grid(block_count.x, block_count.y);
-
-        // Run the CUDA kernel
-        transpose_kernel<false, T><<<grid, block, 0, stream>>>(
-            in + in_offset, w, h, in_stride, out_stride, out + out_offset,
-            block_count.x, block_count.y, log2(grid.y));
-      }
-
-#ifndef NDEBUG
-      cudaStreamSynchronize(stream);
-      cudaError_t error = cudaGetLastError();
-      if (error != cudaSuccess) {
-        throw std::runtime_error(
-            std::string("Transpose: CUDA error in kernel: ") +
-            cudaGetErrorString(error));
-      }
-#endif
-    }
-  }
-}
\ No newline at end of file
diff --git a/src/dedisp/transpose/transpose.hpp b/src/dedisp/transpose/transpose.hpp
deleted file mode 100644
index 862ba667adf3c4687dbf6ef2500636525fde18ff..0000000000000000000000000000000000000000
--- a/src/dedisp/transpose/transpose.hpp
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * This file exposes the generic Transpose<T>::transpose method
- *
- * Explicit template instantiations for this method are defined in transpose.cu
- */
-
-template <typename T>
-void transpose(const T *in, size_t width, size_t height, size_t in_stride,
-               size_t out_stride, T *out, cudaStream_t stream = 0);
\ No newline at end of file
diff --git a/src/dedisp/transpose/transpose_kernel.cuh b/src/dedisp/transpose/transpose_kernel.cu
similarity index 66%
rename from src/dedisp/transpose/transpose_kernel.cuh
rename to src/dedisp/transpose/transpose_kernel.cu
index 01f401ddaf762f868c70bd1bb76559f1f4c98b6c..615f346f93d1245f38de2670e48b03794c68f539 100644
--- a/src/dedisp/transpose/transpose_kernel.cuh
+++ b/src/dedisp/transpose/transpose_kernel.cu
@@ -2,20 +2,17 @@
  * This file contains the implementation of transpose_kernel
  */
 
-#ifndef TRANSPOSE_KERNEL_H_INCLUDE_GUARD
-#define TRANSPOSE_KERNEL_H_INCLUDE_GUARD
-
 #define TILE_DIM 32
 #define BLOCK_ROWS 8
 
 typedef unsigned int gpu_size_t;
 
 template <bool GRID_IS_POW2, typename T>
-__global__ void
-transpose_kernel(const T *in, gpu_size_t width, gpu_size_t height,
-                 gpu_size_t in_stride, gpu_size_t out_stride, T *out,
-                 gpu_size_t block_count_x, gpu_size_t block_count_y,
-                 gpu_size_t log2_gridDim_y) {
+__device__ inline void
+transpose_kernel_impl(const T *in, gpu_size_t width, gpu_size_t height,
+                      gpu_size_t in_stride, gpu_size_t out_stride, T *out,
+                      gpu_size_t block_count_x, gpu_size_t block_count_y,
+                      gpu_size_t log2_gridDim_y) {
   __shared__ T tile[TILE_DIM][TILE_DIM];
 
   gpu_size_t blockIdx_x, blockIdx_y;
@@ -73,4 +70,19 @@ transpose_kernel(const T *in, gpu_size_t width, gpu_size_t height,
   }
 }
 
-#endif // TRANSPOSE_KERNEL_H_INCLUDE_GUARD
\ No newline at end of file
+extern "C" {
+#ifndef PARAM_GRID_IS_POW2
+#define PARAM_GRID_IS_POW2 false
+#endif
+
+__global__ void transpose_kernel_grid(const unsigned int *in, size_t width,
+                                      size_t height, size_t in_stride,
+                                      size_t out_stride, unsigned int *out,
+                                      gpu_size_t block_count_x,
+                                      gpu_size_t block_count_y,
+                                      gpu_size_t log2_gridDim_y) {
+  transpose_kernel_impl<PARAM_GRID_IS_POW2>(in, width, height, in_stride,
+                                            out_stride, out, block_count_x,
+                                            block_count_y, log2_gridDim_y);
+}
+}
\ No newline at end of file
diff --git a/src/dedisp/unpack/unpack.cu b/src/dedisp/unpack/UnpackKernel.cu
similarity index 87%
rename from src/dedisp/unpack/unpack.cu
rename to src/dedisp/unpack/UnpackKernel.cu
index 8353df7cfce298e954353706b7bd918abcbdec89..4f55b35679640c119dc578ae2a5ec48bc81ad502 100644
--- a/src/dedisp/unpack/unpack.cu
+++ b/src/dedisp/unpack/UnpackKernel.cu
@@ -1,3 +1,5 @@
+#include "UnpackKernel.hpp"
+
 #include <thrust/device_ptr.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -52,9 +54,11 @@ struct unpack_functor : public thrust::unary_function<unsigned int, WordType> {
     return result;
   }
 };
-void unpack(const dedisp_word *d_transposed, dedisp_size nsamps,
-            dedisp_size nchan_words, dedisp_word *d_unpacked,
-            dedisp_size in_nbits, dedisp_size out_nbits) {
+
+void UnpackKernel::run(const dedisp_word *d_transposed, dedisp_size nsamps,
+                       dedisp_size nchan_words, dedisp_word *d_unpacked,
+                       dedisp_size in_nbits, dedisp_size out_nbits,
+                       cu::Stream &) {
   thrust::device_ptr<dedisp_word> d_unpacked_begin(d_unpacked);
 
   dedisp_size expansion = out_nbits / in_nbits;
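
The widening factor computed here is worth spelling out. A back-of-the-envelope illustration for the common case of 8-bit input channels being promoted to 32-bit words before dedispersion:

    // expansion = out_nbits / in_nbits = 32 / 8 = 4
    // Each input dedisp_word holding four packed 8-bit samples fans out into
    // four 32-bit output words, and the thrust::transform over a counting
    // iterator produces exactly one unpacked word per output index.
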
diff --git a/src/dedisp/unpack/UnpackKernel.hpp b/src/dedisp/unpack/UnpackKernel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..12ab25aabd93b5d45fe82c97a7f0e4defd491dda
--- /dev/null
+++ b/src/dedisp/unpack/UnpackKernel.hpp
@@ -0,0 +1,17 @@
+#ifndef DEDISP_DEDISP_DEDISPERSE_UNPACK_KERNEL_HPP_
+#define DEDISP_DEDISP_DEDISPERSE_UNPACK_KERNEL_HPP_
+
+#include <cudawrappers/cu.hpp>
+#include <string>
+
+#include "common/dedisp_types.h"
+#include <GPUKernel.hpp>
+
+class UnpackKernel {
+public:
+  void run(const dedisp_word *d_transposed, dedisp_size nsamps,
+           dedisp_size nchan_words, dedisp_word *d_unpacked,
+           dedisp_size in_nbits, dedisp_size out_nbits, cu::Stream &);
+};
+
+#endif // DEDISP_DEDISP_DEDISPERSE_UNPACK_KERNEL_HPP_
\ No newline at end of file
diff --git a/src/dedisp/unpack/unpack.h b/src/dedisp/unpack/unpack.h
deleted file mode 100644
index 88abc1399b58c8e4515777efeee7807bb588270b..0000000000000000000000000000000000000000
--- a/src/dedisp/unpack/unpack.h
+++ /dev/null
@@ -1,5 +0,0 @@
-#include "common/dedisp_types.h"
-
-void unpack(const dedisp_word *d_transposed, dedisp_size nsamps,
-            dedisp_size nchan_words, dedisp_word *d_unpacked,
-            dedisp_size in_nbits, dedisp_size out_nbits);
diff --git a/src/fdd/CMakeLists.txt b/src/fdd/CMakeLists.txt
index cf3f11b546ee49d68fc17c3c2243a7c89a8eaa61..fef95210fa4f5c7de947d689a47133ba9dbe8962 100644
--- a/src/fdd/CMakeLists.txt
+++ b/src/fdd/CMakeLists.txt
@@ -4,7 +4,7 @@ add_library(
   FDDGPUPlan.cpp
   FDDCPUPlan.cpp
   dedisperse/FDDKernel.cu
-  unpack/unpack.cu
+  unpack/UnpackKernel.cu
   chunk.cpp
   $<TARGET_OBJECTS:common>
   $<TARGET_OBJECTS:plan>
@@ -14,19 +14,27 @@ pkg_search_module(FFTW REQUIRED fftw3f IMPORTED_TARGET)
 
 target_include_directories(
   fdd
-  PRIVATE ${CMAKE_SOURCE_DIR}/src
-  PRIVATE ${CMAKE_SOURCE_DIR}/src/fdd)
+  PRIVATE "${CMAKE_SOURCE_DIR}/src"
+  PRIVATE "${CMAKE_SOURCE_DIR}/src/fdd")
+
+include(GNUInstallDirs)
+list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_FULL_LIBDIR})
+
+target_compile_options(fdd PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math>)
+
+target_embed_source(fdd unpack/transpose_unpack_kernel.cu)
+target_embed_source(fdd dedisperse/fdd_dedisperse_kernel.cu)
+target_embed_source(fdd dedisperse/fdd_scale_kernel.cu)
 
 target_link_libraries(
   fdd
-  CUDA::cudart
-  CUDA::cufft
-  CUDA::nvToolsExt
-  CUDA::cuda_driver
-  OpenMP::OpenMP_CXX
-  PkgConfig::FFTW)
-
-target_compile_options(fdd PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math>)
+  PUBLIC cudawrappers::cufft
+         cudawrappers::cu
+         cudawrappers::nvtx
+         cudawrappers::nvrtc
+         OpenMP::OpenMP_CXX
+         PkgConfig::FFTW
+         Threads::Threads)
 
 set_target_properties(
   fdd
@@ -35,6 +43,15 @@ set_target_properties(
              VERSION ${DEDISP_VERSION}
              SOVERSION ${DEDISP_VERSION_MAJOR})
 
+if(DEDISP_BACKEND_HIP)
+  set_source_files_properties(dedisperse/FDDKernel.cu unpack/UnpackKernel.cu
+                              scale/ScaleKernel.cu PROPERTIES LANGUAGE HIP)
+endif()
+
+if(DEDISP_BENCHMARK_WITH_PMT)
+  target_link_libraries(fdd PUBLIC pmt)
+endif()
+
 install(
   TARGETS fdd
   LIBRARY DESTINATION lib
diff --git a/src/fdd/FDDCPUPlan.cpp b/src/fdd/FDDCPUPlan.cpp
index dc6a949120b8cacae7164bf55544fc7c034ddfa0..206f4034f7a9fce099a5f6fd960fdf87c0681c6a 100644
--- a/src/fdd/FDDCPUPlan.cpp
+++ b/src/fdd/FDDCPUPlan.cpp
@@ -9,7 +9,6 @@
 #include <thread>
 
 #include <fftw3.h>
-#include <omp.h>
 
 #include "common/dedisp_strings.h"
 #ifdef DEDISP_BENCHMARK
diff --git a/src/fdd/FDDCPUPlan.hpp b/src/fdd/FDDCPUPlan.hpp
index 7d5fe05979f23379814b00b208ec4a3c32436e65..e1332fc6ffbe8d7c8a5343d50aa32d143f2107f2 100644
--- a/src/fdd/FDDCPUPlan.hpp
+++ b/src/fdd/FDDCPUPlan.hpp
@@ -1,10 +1,10 @@
-#ifndef H_FDD_CPU_PLAN_INCLUDE_GUARD
-#define H_FDD_CPU_PLAN_INCLUDE_GUARD
-
-#include <vector>
+#ifndef DEDISP_FDD_CPU_PLAN_HPP_
+#define DEDISP_FDD_CPU_PLAN_HPP_
 
 #include "Plan.hpp"
 
+#include <stddef.h>
+
 namespace dedisp {
 
 class FDDCPUPlan : public Plan {
@@ -55,4 +55,4 @@ private:
 
 } // end namespace dedisp
 
-#endif // H_FDD_CPU_PLAN_INCLUDE_GUARD
\ No newline at end of file
+#endif // DEDISP_FDD_CPU_PLAN_HPP_
\ No newline at end of file
diff --git a/src/fdd/FDDGPUPlan.cpp b/src/fdd/FDDGPUPlan.cpp
index c38e261f8e935c76bb93e09ab40d3276e06df658..c39b4c136345af1bdd378a4205b64dc87df20c27 100644
--- a/src/fdd/FDDGPUPlan.cpp
+++ b/src/fdd/FDDGPUPlan.cpp
@@ -1,30 +1,37 @@
 #include "FDDGPUPlan.hpp"
 
+#include <array>
 #include <cmath>
 #include <complex>
 #include <cstring>
 #include <iomanip>
 #include <iostream>
+#include <memory>
 #include <mutex>
 #include <thread>
 
 #include <assert.h>
-#include <cufft.h>
-#include <omp.h>
 
+#include "GPUPlan.hpp"
 #include "common/dedisp_strings.h"
-#include "dedisperse/FDDKernel.hpp"
-#include "unpack/unpack.h"
+#include "cudawrappers/cu.hpp"
+#include "dedisp_types.h"
+
 #ifdef DEDISP_BENCHMARK
 #include "external/Stopwatch.h"
 #endif
 
+#ifdef HAVE_PMT
+#include <pmt.h>
+#endif
+
+#include <cudawrappers/cufft.hpp>
+#include <cudawrappers/nvtx.hpp>
+
 #include "chunk.h"
 #include "common/helper.h"
 #include "helper.h"
 
-#include <cuda_runtime.h>
-
 namespace dedisp {
 
 // Constructor
@@ -126,12 +133,12 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
 
   // Events, markers, timers
   cu::Event eStartGPU, eEndGPU;
-  cu::Marker mAllocMem("Allocate host and device memory", cu::Marker::black);
-  cu::Marker mCopyMem("Copy CUDA mem to CPU mem", cu::Marker::black);
-  cu::Marker mPrepFFT("cufft Plan Many", cu::Marker::yellow);
-  cu::Marker mPrepSpinf("spin Frequency generation", cu::Marker::blue);
-  cu::Marker mDelayTable("Delay table copy", cu::Marker::black);
-  cu::Marker mExeGPU("Dedisp fdd execution on GPU", cu::Marker::green);
+  nvtx::Marker mAllocMem("Allocate host and device memory",
+                         nvtx::Marker::black);
+  nvtx::Marker mCopyMem("Copy CUDA mem to CPU mem", nvtx::Marker::black);
+  nvtx::Marker mPrepFFT("cufft Plan Many", nvtx::Marker::yellow);
+  nvtx::Marker mPrepSpinf("spin Frequency generation", nvtx::Marker::blue);
+  nvtx::Marker mExeGPU("Dedisp fdd execution on GPU", nvtx::Marker::green);
 #ifdef DEDISP_BENCHMARK
   std::unique_ptr<Stopwatch> init_timer(Stopwatch::create());
   std::unique_ptr<Stopwatch> preprocessing_timer(Stopwatch::create());
@@ -141,6 +148,15 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
   std::unique_ptr<Stopwatch> output_timer(Stopwatch::create());
   std::unique_ptr<Stopwatch> gpuexec_timer(Stopwatch::create());
   std::unique_ptr<Stopwatch> total_timer(Stopwatch::create());
+
+#if defined(HAVE_PMT)
+#if defined(__HIP__)
+  auto pmt_sensor = pmt::Create("rocm", m_device->getOrdinal());
+#else
+  auto pmt_sensor = pmt::Create("nvidia", m_device->getOrdinal());
+#endif
+#endif
+
   total_timer->Start();
   init_timer->Start();
 #endif
@@ -150,42 +166,30 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
   std::cout << fft_plan_str << std::endl;
 #endif
 
-  CUcontext context = m_device->get_current_context();
-
   mPrepFFT.start();
-  cufftHandle plan_r2c, plan_c2r;
   int n[] = {(int)nsamp_fft};
-  int rnembed[] = {(int)nsamp_padded};     // width in real elements
-  int cnembed[] = {(int)nsamp_padded / 2}; // width in complex elements
+
+  std::unique_ptr<cufft::FFT1DR2C<CUDA_R_32F>> plan_r2c;
+  std::unique_ptr<cufft::FFT1DC2R<CUDA_C_32F>> plan_c2r;
+
+  const long long rnembed = nsamp_padded;     // width in real elements
+  const long long cnembed = nsamp_padded / 2; // width in complex elements
+
   std::thread thread_r2c = std::thread([&]() {
-    m_device->set_context(context);
-
-    cufftResult result =
-        cufftPlanMany(&plan_r2c,              // plan
-                      1, n,                   // rank, n
-                      rnembed, 1, rnembed[0], // inembed, istride, idist
-                      cnembed, 1, cnembed[0], // onembed, ostride, odist
-                      CUFFT_R2C,              // type
-                      nchan_fft_batch);       // batch
-    if (result != CUFFT_SUCCESS) {
-      throw std::runtime_error("Error creating real to complex FFT plan.");
-    }
-    cufftSetStream(plan_r2c, *executestream);
+    m_context->setCurrent();
+
+    plan_r2c = std::make_unique<cufft::FFT1DR2C<CUDA_R_32F>>(
+        n[0], nchan_fft_batch, rnembed, cnembed);
+
+    plan_r2c->setStream(*executestream);
   });
   std::thread thread_c2r = std::thread([&]() {
-    m_device->set_context(context);
-
-    cufftResult result =
-        cufftPlanMany(&plan_c2r,              // plan
-                      1, n,                   // rank, n
-                      cnembed, 1, cnembed[0], // inembed, istride, idist
-                      rnembed, 1, rnembed[0], // onembed, ostride, odist
-                      CUFFT_C2R,              // type
-                      ndm_fft_batch);         // batch
-    if (result != CUFFT_SUCCESS) {
-      throw std::runtime_error("Error creating complex to real FFT plan.");
-    }
-    cufftSetStream(plan_c2r, *executestream);
+    m_context->setCurrent();
+
+    plan_c2r = std::make_unique<cufft::FFT1DC2R<CUDA_C_32F>>(
+        n[0], ndm_fft_batch, cnembed, rnembed);
+
+    plan_c2r->setStream(*executestream);
   });
 
   // Wait for cuFFT plans to be created
@@ -195,6 +199,7 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
   if (thread_c2r.joinable()) {
     thread_c2r.join();
   }
+
   mPrepFFT.end();
 
   // Generate spin frequency table
@@ -205,8 +210,8 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
   mPrepSpinf.end();
 
   // Determine the amount of memory to use
-  size_t d_memory_total = m_device->get_total_memory(); // in Bytes
-  size_t d_memory_free = m_device->get_free_memory();   // in Bytes
+  size_t d_memory_total = m_context->getTotalMemory(); // in Bytes
+  size_t d_memory_free = m_context->getFreeMemory();   // in Bytes
   size_t sizeof_data_t_nu =
       1ULL * nsamp * nchan_words_gulp * sizeof(dedisp_word);
   size_t sizeof_data_x_nu =
@@ -219,11 +224,11 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
 
   // Subtract the memory usage of any pre-existing device buffers
   size_t d_memory_in_use = 0;
-  for (cu::DeviceMemory &d_memory : d_data_t_nu_) {
-    d_memory_in_use += d_memory.size();
+  for (auto &d_memory : d_data_t_nu_) {
+    d_memory_in_use += d_memory->size();
   }
-  for (cu::DeviceMemory &d_memory : d_data_x_dm_) {
-    d_memory_in_use += d_memory.size();
+  for (auto &d_memory : d_data_x_dm_) {
+    d_memory_in_use += d_memory->size();
   }
   d_memory_free += d_memory_in_use;
 
@@ -295,6 +300,7 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
   std::cout << memory_alloc_str << std::endl;
 #endif
   mAllocMem.start();
+
   /*
       The buffers are used as follows:
       1) copy into page-locked buffer: in -> memcpyHtoH -> h_data_t_nu
@@ -317,22 +323,27 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
   */
   h_data_t_nu_.resize(nchan_buffers);
   h_data_t_dm_.resize(ndm_buffers);
+
   d_data_t_nu_.resize(nchan_buffers);
   d_data_x_dm_.resize(ndm_buffers);
+
   cu::DeviceMemory d_data_x_nu(sizeof_data_x_nu);
+
   for (unsigned int i = 0; i < nchan_buffers; i++) {
-    h_data_t_nu_[i].resize(sizeof_data_t_nu);
-    d_data_t_nu_[i].resize(sizeof_data_t_nu);
+    h_data_t_nu_[i] = std::make_shared<cu::HostMemory>(sizeof_data_t_nu);
+    d_data_t_nu_[i] = std::make_shared<cu::DeviceMemory>(sizeof_data_t_nu);
   }
+
   for (unsigned int i = 0; i < ndm_buffers; i++) {
-    h_data_t_dm_[i].resize(sizeof_data_x_dm);
-    d_data_x_dm_[i].resize(sizeof_data_x_dm);
+    h_data_t_dm_[i] = std::make_shared<cu::HostMemory>(sizeof_data_x_dm);
+    d_data_x_dm_[i] = std::make_shared<cu::DeviceMemory>(sizeof_data_x_dm);
   }
+
   mAllocMem.end();
 
 #ifdef DEDISP_DEBUG
-  size_t d_memory_free_after_malloc = m_device->get_free_memory(); // in Bytes
-  size_t h_memory_free_after_malloc = get_free_memory();           // in Mbytes
+  size_t d_memory_free_after_malloc = m_context->getFreeMemory(); // in Bytes
+  size_t h_memory_free_after_malloc = get_free_memory();          // in Mbytes
   std::cout << "Device memory free after memory allocations = "
             << d_memory_free_after_malloc / std::pow(1024, 3) << " Gb"
             << std::endl;
@@ -340,13 +351,6 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
             << h_memory_free_after_malloc / std::pow(1024, 1) << " Gb"
             << std::endl;
 #endif
-
-  // Initialize FDDKernel
-  FDDKernel kernel;
-  mDelayTable.start();
-  kernel.copy_delay_table(d_delay_table, m_nchans * sizeof(dedisp_float), 0,
-                          *htodstream);
-  mDelayTable.end();
 #ifdef DEDISP_BENCHMARK
   init_timer->Pause();
 #endif
@@ -355,8 +359,8 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
     unsigned int ichan_start;
     unsigned int ichan_end;
     unsigned int nchan_current;
-    void *h_in_ptr;
-    void *d_in_ptr;
+    std::shared_ptr<cu::HostMemory> h_in_ptr;
+    std::shared_ptr<cu::DeviceMemory> d_in_ptr;
     cu::Event inputStart, inputEnd;
     cu::Event preprocessingStart, preprocessingEnd;
     cu::Event outputStart, outputEnd;
@@ -370,8 +374,8 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
     job.ichan_start = job_id == 0 ? 0 : channel_jobs[job_id - 1].ichan_end;
     job.nchan_current = std::min(nchan_batch_max, nchan - job.ichan_start);
     job.ichan_end = job.ichan_start + job.nchan_current;
-    job.h_in_ptr = h_data_t_nu_[job_id % nchan_buffers];
-    job.d_in_ptr = d_data_t_nu_[job_id % nchan_buffers];
+    job.h_in_ptr = h_data_t_nu_.at(job_id % nchan_buffers);
+    job.d_in_ptr = d_data_t_nu_.at(job_id % nchan_buffers);
     if (job.nchan_current == 0) {
       channel_jobs.pop_back();
     }
@@ -383,8 +387,8 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
     unsigned int ndm_current;
     std::mutex cpu_lock;
     std::mutex gpu_lock;
-    cu::HostMemory *h_data_t_dm;
-    cu::DeviceMemory *d_data_x_dm;
+    std::shared_ptr<cu::HostMemory> h_data_t_dm;
+    std::shared_ptr<cu::DeviceMemory> d_data_x_dm;
     cu::Event inputStart, inputEnd;
     cu::Event dedispersionStart, dedispersionEnd;
     cu::Event postprocessingStart, postprocessingEnd;
@@ -399,8 +403,8 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
     job.idm_start = job_id == 0 ? 0 : dm_jobs[job_id - 1].idm_end;
     job.ndm_current = std::min(ndm_batch_max, ndm - job.idm_start);
     job.idm_end = job.idm_start + job.ndm_current;
-    job.h_data_t_dm = &h_data_t_dm_[job_id % ndm_buffers];
-    job.d_data_x_dm = &d_data_x_dm_[job_id % ndm_buffers];
+    job.h_data_t_dm = h_data_t_dm_.at(job_id % ndm_buffers);
+    job.d_data_x_dm = d_data_x_dm_.at(job_id % ndm_buffers);
     if (job.ndm_current == 0) {
       dm_jobs.pop_back();
     }
@@ -429,7 +433,7 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
 #endif
       // Copy partial output from pinned memory to output buffer
       dedisp_size src_stride = 1ULL * nsamp_padded * out_bytes_per_sample;
-      auto *h_src = dm_job.h_data_t_dm->data();
+      const void *h_src = static_cast<const void *>(*dm_job.h_data_t_dm);
       dedisp_size dst_stride = 1ULL * nsamp_computed * out_bytes_per_sample;
       dedisp_size dst_offset = 1ULL * dm_job.idm_start * dst_stride;
       auto *h_dst = (void *)(((size_t)out) + dst_offset);
@@ -451,6 +455,12 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
 #ifdef DEDISP_DEBUG
   std::cout << fdd_dedispersion_str << std::endl;
 #endif
+
+#ifdef HAVE_PMT
+  pmt::State pmt_start, pmt_end;
+  pmt_start = pmt_sensor->Read();
+#endif
+
   htodstream->record(eStartGPU);
   mExeGPU.start();
 
@@ -476,54 +486,53 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
       if (channel_job_id == 0) {
         dedisp_size gulp_chan_byte_idx =
             (channel_job.ichan_start / chans_per_word) * sizeof(dedisp_word);
-        memcpy2D(channel_job.h_in_ptr,    // dst
+        memcpy2D(*(channel_job.h_in_ptr), // dst
                  dst_stride,              // dst width
                  in + gulp_chan_byte_idx, // src
                  src_stride,              // src width
                  dst_stride,              // width bytes
                  nsamp);                  // height
         htodstream->record(channel_job.inputStart);
-        htodstream->memcpyHtoDAsync(channel_job.d_in_ptr, // dst
-                                    channel_job.h_in_ptr, // src
-                                    nsamp * dst_stride);  // size
+        htodstream->memcpyHtoDAsync((*channel_job.d_in_ptr), // dst
+                                    (*channel_job.h_in_ptr), // src
+                                    nsamp * dst_stride);     // size
         htodstream->record(channel_job.inputEnd);
       }
-      executestream->waitEvent(channel_job.inputEnd);
+      executestream->wait(channel_job.inputEnd);
 
       // Transpose and upack the data
       executestream->record(channel_job.preprocessingStart);
-      transpose_unpack((dedisp_word *)channel_job.d_in_ptr, // d_in
-                       nchan_words_gulp,                    // input width
-                       nsamp,                               // input height
-                       nchan_words_gulp,                    // in_stride
-                       nsamp_padded,                        // out_stride
-                       d_data_x_nu,                         // d_out
-                       in_nbits, 32,    // in_nbits, out_nbits
-                       1.0 / nchan,     // scale
-                       *executestream); // stream
+      fdd_kernel_unpack.run(*(channel_job.d_in_ptr), // d_in
+                            nchan_words_gulp,        // input width
+                            nsamp,                   // input height
+                            nchan_words_gulp,        // in_stride
+                            nsamp_padded,            // out_stride
+                            d_data_x_nu,             // d_out
+                            in_nbits, 32,            // in_nbits, out_nbits
+                            1.0 / nchan,             // scale
+                            *executestream);         // stream
 
       // Apply zero padding
-      auto dst_ptr = ((float *)d_data_x_nu.data()) + nsamp;
       unsigned int nsamp_padding = nsamp_padded - nsamp;
-      cu::checkError(cudaMemset2DAsync(dst_ptr,                       // devPtr
-                                       nsamp_padded * sizeof(float),  // pitch
-                                       0,                             // value
-                                       nsamp_padding * sizeof(float), // width
-                                       nchan_batch_max,               // height
-                                       *executestream));
+
+      float *dst_ptr = (static_cast<float *>(d_data_x_nu)) + nsamp;
+      cu::DeviceMemory dev_ptr(
+          reinterpret_cast<CUdeviceptr>(static_cast<void *>(dst_ptr)));
+
+      executestream->memset2DAsync(dev_ptr,                       // device ptr
+                                   static_cast<unsigned char>(0), // value
+                                   nsamp_padded * sizeof(float),  // pitch
+                                   nsamp_padding * sizeof(float), // width
+                                   nchan_batch_max);              // height
 
       // FFT data (real to complex) along time axis
       for (unsigned int i = 0; i < nchan_batch_max / nchan_fft_batch; i++) {
-        cufftReal *idata = (cufftReal *)d_data_x_nu.data() +
-                           i * nsamp_padded * nchan_fft_batch;
-        cufftComplex *odata = (cufftComplex *)idata;
-
-        cufftResult ret;
-        if ((ret = cufftExecR2C(plan_r2c, idata, odata)) != CUFFT_SUCCESS) {
-          std::stringstream ss;
-          ss << "Error while executing R2C FFT plan, error code: " << ret;
-          throw std::runtime_error(ss.str());
-        }
+        cu::DeviceMemory idata(reinterpret_cast<CUdeviceptr>(
+            static_cast<cufftReal *>(d_data_x_nu) +
+            (i * nsamp_padded * nchan_fft_batch)));
+        cu::DeviceMemory odata(idata);
+
+        plan_r2c->execute(idata, odata, CUFFT_FORWARD);
       }
       executestream->record(channel_job.preprocessingEnd);
 
@@ -550,7 +559,8 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
             dm_job_previous.outputEnd.synchronize();
           }
 
-          dm_job.d_data_x_dm->zero(*executestream);
+          executestream->zero(*(dm_job.d_data_x_dm),
+                              dm_job.d_data_x_dm->size());
         }
 
         // Wait for temporary output from previous job to be copied
@@ -561,21 +571,25 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
 
         // Dedispersion in frequency domain
         executestream->record(dm_job.dedispersionStart);
-        auto d_out = (dedisp_float2 *)dm_job.d_data_x_dm->data();
-        kernel.launch(dm_job.ndm_current,        // ndm
-                      nfreq,                     // nfreq
-                      channel_job.nchan_current, // nchan
-                      dt,                        // dt
-                      d_spin_frequencies,        // d_spin_frequencies
-                      d_dm_list,                 // d_dm_list
-                      d_data_x_nu,               // d_in
-                      d_out,                     // d_out
-                      nsamp_padded / 2,          // in stride
-                      nsamp_padded / 2,          // out stride
-                      dm_job.idm_start,          // idm_start
-                      dm_job.idm_end,            // idm_end
-                      channel_job.ichan_start,   // ichan_start
-                      *executestream);           // stream
+
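+        // The host-side delay table is passed with every call; the kernel
+        // wrapper only uploads it to constant memory after a recompilation.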
+        fdd_kernel_dedisperse.run(
+            dm_job.ndm_current,                          // ndm
+            nfreq,                                       // nfreq
+            channel_job.nchan_current,                   // nchan
+            dt,                                          // dt
+            *d_spin_frequencies,                         // d_spin_frequencies
+            *d_dm_list,                                  // d_dm_list
+            d_data_x_nu,                                 // d_in
+            *dm_job.d_data_x_dm,                         // d_out
+            nsamp_padded / 2,                            // in stride
+            nsamp_padded / 2,                            // out stride
+            dm_job.idm_start,                            // idm_start
+            dm_job.idm_end,                              // idm_end
+            channel_job.ichan_start,                     // ichan_start
+            h_delay_table.data(),                        // delay_table (host)
+            h_delay_table.size() * sizeof(dedisp_float), // delay_table size
+            *htodstream,
+            *executestream); // stream
         executestream->record(dm_job.dedispersionEnd);
       } // end for dm_job_id_inner
 
@@ -586,16 +600,16 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
         dedisp_size gulp_chan_byte_idx =
             (channel_job_next.ichan_start / chans_per_word) *
             sizeof(dedisp_word);
-        memcpy2D(channel_job_next.h_in_ptr, // dst
-                 dst_stride,                // dst width
-                 in + gulp_chan_byte_idx,   // src
-                 src_stride,                // src width
-                 dst_stride,                // width bytes
-                 nsamp);                    // height
+        memcpy2D(*(channel_job_next.h_in_ptr), // dst
+                 dst_stride,                   // dst width
+                 in + gulp_chan_byte_idx,      // src
+                 src_stride,                   // src width
+                 dst_stride,                   // width bytes
+                 nsamp);                       // height
         htodstream->record(channel_job_next.inputStart);
-        htodstream->memcpyHtoDAsync(channel_job_next.d_in_ptr, // dst
-                                    channel_job_next.h_in_ptr, // src
-                                    nsamp * dst_stride);       // size
+        htodstream->memcpyHtoDAsync(*channel_job_next.d_in_ptr, // dst
+                                    *channel_job_next.h_in_ptr, // src
+                                    nsamp * dst_stride);        // size
         htodstream->record(channel_job_next.inputEnd);
       }
 
@@ -644,43 +658,39 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
 #endif
       // Get pointer to DM output data on host and on device
       dedisp_size dm_stride = 1ULL * nsamp_padded * out_bytes_per_sample;
-      auto *h_out = dm_job.h_data_t_dm->data();
-      auto *d_out = (float *)dm_job.d_data_x_dm->data();
+      void *const h_out = (*dm_job.h_data_t_dm);
+      auto d_out = dm_job.d_data_x_dm;
 
       // Fourier transform results back to time domain
       executestream->record(dm_job.postprocessingStart);
       for (unsigned int i = 0; i < ndm_batch_max / ndm_fft_batch; i++) {
-        cufftReal *odata =
-            (cufftReal *)d_out + i * nsamp_padded * ndm_fft_batch;
-        cufftComplex *idata = (cufftComplex *)odata;
-
-        cufftResult ret;
-        if ((ret = cufftExecC2R(plan_c2r, idata, odata)) != CUFFT_SUCCESS) {
-          std::stringstream ss;
-          ss << "Error while executing C2R FFT plan, error code: " << ret;
-          throw std::runtime_error(ss.str());
-        }
+        cu::DeviceMemory odata(
+            reinterpret_cast<CUdeviceptr>(static_cast<cufftReal *>(*d_out) +
+                                          (i * nsamp_padded * ndm_fft_batch)));
+        cu::DeviceMemory idata = odata;
+
+        plan_c2r->execute(idata, odata, CUFFT_INVERSE);
       }
 
       // FFT scaling
-      kernel.scale(dm_job.ndm_current, // height
-                   nsamp_padded,       // width
-                   nsamp_padded,       // stride
-                   1.0f / nsamp_fft,   // scale
-                   d_out,              // d_data
-                   *executestream);    // stream
+      fdd_kernel_scale.scale(dm_job.ndm_current, // height
+                             nsamp_padded,       // width
+                             nsamp_padded,       // stride
+                             1.0f / nsamp_fft,   // scale
+                             *d_out,             // d_data
+                             *executestream);    // stream
       executestream->record(dm_job.postprocessingEnd);
 
       // Copy output
       // Output is picked up by (already running) host side thread
       // and is there copied from CPU pinned to paged memory
       dm_job.gpu_lock.lock();
-      dtohstream->waitEvent(dm_job.postprocessingEnd);
+      dtohstream->wait(dm_job.postprocessingEnd);
       dtohstream->record(dm_job.outputStart);
       dedisp_size size = 1ULL * dm_job.ndm_current * dm_stride;
-      dtohstream->memcpyDtoHAsync(h_out, // dst
-                                  d_out, // src
-                                  size); // size
+      dtohstream->memcpyDtoHAsync(h_out,  // dst
+                                  *d_out, // src
+                                  size);  // size
       dtohstream->record(dm_job.outputEnd);
       dm_job.cpu_lock.unlock();
     } // end for dm_job_id_inner
@@ -692,12 +702,17 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
     output_thread.join();
   }
   dtohstream->record(eEndGPU);
-  mExeGPU.end(eEndGPU);
+  dtohstream->synchronize();
+  mExeGPU.end();
 #ifdef DEDISP_BENCHMARK
   total_timer->Pause();
 
   gpuexec_timer->Add(eEndGPU.elapsedTime(eStartGPU));
 
+#ifdef HAVE_PMT
+  pmt_end = pmt_sensor->Read();
+#endif
+
   // Accumulate postprocessing time for all dm jobs
   for (auto &job : dm_jobs) {
     postprocessing_timer->Add(
@@ -732,12 +747,14 @@ void FDDGPUPlan::execute_gpu(size_type nsamps, const byte_type *in,
             << std::endl;
   std::cout << total_time_str << total_timer->ToString() << " sec."
             << std::endl;
+#ifdef HAVE_PMT
+  std::cout << pmt_joules_str << pmt::PMT::joules(pmt_start, pmt_end) << " J"
+            << std::endl;
+  std::cout << pmt_watts_str << pmt::PMT::watts(pmt_start, pmt_end) << " W"
+            << std::endl;
+#endif
   std::cout << std::endl;
 #endif
-
-  // Free FFT plans
-  cufftDestroy(plan_c2r);
-  cufftDestroy(plan_r2c);
 }
 
 /*    Refer to execute_gpu() above for additional comments on common constructs
@@ -851,12 +868,13 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
 
   // Events, markers, timers
   cu::Event eStartGPU, eEndGPU;
-  cu::Marker mAllocMem("Allocate host and device memory", cu::Marker::black);
-  cu::Marker mCopyMem("Copy CUDA mem to CPU mem", cu::Marker::black);
-  cu::Marker mPrepFFT("cufft Plan Many", cu::Marker::yellow);
-  cu::Marker mPrepSpinf("spin Frequency generation", cu::Marker::blue);
-  cu::Marker mDelayTable("Delay table copy", cu::Marker::black);
-  cu::Marker mExeGPU("Dedisp fdd execution on GPU", cu::Marker::green);
+  nvtx::Marker mAllocMem("Allocate host and device memory",
+                         nvtx::Marker::black);
+  nvtx::Marker mCopyMem("Copy CUDA mem to CPU mem", nvtx::Marker::black);
+  nvtx::Marker mPrepFFT("cufft Plan Many", nvtx::Marker::yellow);
+  nvtx::Marker mPrepSpinf("spin Frequency generation", nvtx::Marker::blue);
+  nvtx::Marker mDelayTable("Delay table copy", nvtx::Marker::black);
+  nvtx::Marker mExeGPU("Dedisp fdd execution on GPU", nvtx::Marker::green);
 #ifdef DEDISP_BENCHMARK
   std::unique_ptr<Stopwatch> init_timer(Stopwatch::create());
   std::unique_ptr<Stopwatch> input_timer(Stopwatch::create());
@@ -881,17 +899,21 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
   cu::HostMemory h_data_t_dm(ndm * nsamp_padded * sizeof(float));
   cu::DeviceMemory d_data_t_nu(nchan_batch_max * nsamp_padded * sizeof(float));
   cu::DeviceMemory d_data_f_nu(nchan_batch_max * nsamp_padded * sizeof(float));
-  std::vector<cu::HostMemory> h_data_t_nu_(nchan_buffers);
-  std::vector<cu::DeviceMemory> d_data_t_nu_(nchan_buffers);
-  std::vector<cu::DeviceMemory> d_data_f_dm_(ndm_buffers);
-  std::vector<cu::DeviceMemory> d_data_t_dm_(ndm_buffers);
+  std::vector<std::unique_ptr<cu::HostMemory>> h_data_t_nu_(nchan_buffers);
+  std::vector<std::unique_ptr<cu::DeviceMemory>> d_data_t_nu_(nchan_buffers);
+  std::vector<std::unique_ptr<cu::DeviceMemory>> d_data_f_dm_(ndm_buffers);
+  std::vector<std::unique_ptr<cu::DeviceMemory>> d_data_t_dm_(ndm_buffers);
   for (unsigned int i = 0; i < nchan_buffers; i++) {
-    h_data_t_nu_[i].resize(nsamp * nchan_words_gulp * sizeof(dedisp_word));
-    d_data_t_nu_[i].resize(nsamp * nchan_words_gulp * sizeof(dedisp_word));
+    h_data_t_nu_[i] = std::make_unique<cu::HostMemory>(
+        nsamp * nchan_words_gulp * sizeof(dedisp_word));
+    d_data_t_nu_[i] = std::make_unique<cu::DeviceMemory>(
+        nsamp * nchan_words_gulp * sizeof(dedisp_word));
   }
   for (unsigned int i = 0; i < ndm_buffers; i++) {
-    d_data_f_dm_[i].resize(ndm_batch_max * nsamp_padded * sizeof(float));
-    d_data_t_dm_[i].resize(ndm_batch_max * nsamp_padded * sizeof(float));
+    d_data_f_dm_[i] = std::make_unique<cu::DeviceMemory>(
+        ndm_batch_max * nsamp_padded * sizeof(float));
+    d_data_t_dm_[i] = std::make_unique<cu::DeviceMemory>(
+        ndm_batch_max * nsamp_padded * sizeof(float));
   }
   mAllocMem.end();
 
@@ -900,38 +922,16 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
   std::cout << fft_plan_str << std::endl;
 #endif
   mPrepFFT.start();
-  cufftHandle plan_r2c, plan_c2r;
-  int n[] = {(int)nfft};
-  std::thread thread_r2c = std::thread([&]() {
-    int inembed[] = {(int)nsamp_good};
-    int onembed[] = {(int)nfreq_chunk_padded};
-    cufftResult result =
-        cufftPlanMany(&plan_r2c,              // plan
-                      1, n,                   // rank, n
-                      inembed, 1, inembed[0], // inembed, istride, idist
-                      onembed, 1, onembed[0], // onembed, ostride, odist
-                      CUFFT_R2C,              // type
-                      nchunk);                // batch
-    if (result != CUFFT_SUCCESS) {
-      throw std::runtime_error("Error creating real to complex FFT plan.");
-    }
-    cufftSetStream(plan_r2c, *executestream);
-  });
-  std::thread thread_c2r = std::thread([&]() {
-    int inembed[] = {(int)nfreq_chunk_padded};
-    int onembed[] = {(int)nfreq_chunk_padded * 2};
-    cufftResult result =
-        cufftPlanMany(&plan_c2r,              // plan
-                      1, n,                   // rank, n
-                      inembed, 1, inembed[0], // inembed, istride, idist
-                      onembed, 1, onembed[0], // onembed, ostride, odist
-                      CUFFT_C2R,              // type
-                      nchunk);                // batch
-    if (result != CUFFT_SUCCESS) {
-      throw std::runtime_error("Error creating complex to real FFT plan.");
-    }
-    cufftSetStream(plan_c2r, *executestream);
-  });
+
+  cufft::FFT1DR2C<CUDA_R_32F> plan_r2c(nfft, nchunk, nsamp_good,
+                                       nfreq_chunk_padded);
+
+  cufft::FFT1DC2R<CUDA_C_32F> plan_c2r(nfft, nchunk, nfreq_chunk_padded,
+                                       nfreq_chunk_padded * 2);
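+  // Bind each FFT plan to the execute stream so the transforms are enqueued
+  // on the same stream as the kernels.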
+  std::thread thread_r2c =
+      std::thread([&]() { plan_r2c.setStream(*executestream); });
+  std::thread thread_c2r =
+      std::thread([&]() { plan_c2r.setStream(*executestream); });
 
   // Compute chunks
   std::vector<Chunk> chunks(nchunk);
@@ -957,18 +957,16 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
         chunks, h_spin_frequencies, nfreq_chunk, nfreq_chunk_padded, nfft, dt);
 
     // Copy segmented spin frequencies to the GPU
-    d_spin_frequencies.resize(h_spin_frequencies.size() * sizeof(float));
-    htodstream->memcpyHtoDAsync(d_spin_frequencies, h_spin_frequencies.data(),
-                                d_spin_frequencies.size());
+    if (d_spin_frequencies)
+      d_spin_frequencies.reset();
+
+    d_spin_frequencies = std::make_unique<cu::DeviceMemory>(
+        h_spin_frequencies.size() * sizeof(float));
+    assert(d_spin_frequencies);
+    htodstream->memcpyHtoDAsync(*d_spin_frequencies, h_spin_frequencies.data(),
+                                d_spin_frequencies->size());
   }
   mPrepSpinf.end();
-
-  // Initialize FDDKernel
-  FDDKernel kernel;
-  mDelayTable.start();
-  kernel.copy_delay_table(d_delay_table, m_nchans * sizeof(dedisp_float), 0,
-                          *htodstream);
-  mDelayTable.end();
 #ifdef DEDISP_BENCHMARK
   init_timer->Pause();
 #endif
@@ -992,8 +990,8 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
     job.ichan_start = job_id == 0 ? 0 : channel_jobs[job_id - 1].ichan_end;
     job.nchan_current = std::min(nchan_batch_max, nchan - job.ichan_start);
     job.ichan_end = job.ichan_start + job.nchan_current;
-    job.h_in_ptr = h_data_t_nu_[job_id % nchan_buffers];
-    job.d_in_ptr = d_data_t_nu_[job_id % nchan_buffers];
+    job.h_in_ptr = *(h_data_t_nu_.at(job_id % nchan_buffers));
+    job.d_in_ptr = *(d_data_t_nu_.at(job_id % nchan_buffers));
     if (job.nchan_current == 0) {
       channel_jobs.pop_back();
     }
@@ -1020,8 +1018,8 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
     job.idm_start = job_id == 0 ? 0 : dm_jobs[job_id - 1].idm_end;
     job.ndm_current = std::min(ndm_batch_max, ndm - job.idm_start);
     job.idm_end = job.idm_start + job.ndm_current;
-    job.d_data_f_dm_ptr = d_data_f_dm_[job_id % ndm_buffers];
-    job.d_data_t_dm_ptr = d_data_t_dm_[job_id % ndm_buffers];
+    job.d_data_f_dm_ptr = *(d_data_f_dm_.at(job_id % ndm_buffers));
+    job.d_data_t_dm_ptr = *(d_data_t_dm_.at(job_id % ndm_buffers));
     if (job.ndm_current == 0) {
       dm_jobs.pop_back();
     }
@@ -1029,6 +1027,7 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
 #ifdef DEDISP_DEBUG
   std::cout << fdd_dedispersion_str << std::endl;
 #endif
+
   htodstream->record(eStartGPU);
   mExeGPU.start();
 
@@ -1061,42 +1060,52 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
                  dst_stride,              // width bytes
                  nsamp);                  // height
         htodstream->record(channel_job.inputStart);
-        htodstream->memcpyHtoDAsync(channel_job.d_in_ptr, // dst
+
+        cu::DeviceMemory copy_dest_dev(
+            reinterpret_cast<CUdeviceptr>(channel_job.d_in_ptr));
+
+        htodstream->memcpyHtoDAsync(copy_dest_dev,        // dst
                                     channel_job.h_in_ptr, // src
                                     nsamp * dst_stride);  // size
         htodstream->record(channel_job.inputEnd);
       }
-      executestream->waitEvent(channel_job.inputEnd);
+      executestream->wait(channel_job.inputEnd);
 
       // Transpose and upack the data
       executestream->record(channel_job.preprocessingStart);
-      transpose_unpack((dedisp_word *)channel_job.d_in_ptr, // d_in
-                       nchan_words_gulp,                    // input width
-                       nsamp,                               // input height
-                       nchan_words_gulp,                    // in_stride
-                       nsamp_padded,                        // out_stride
-                       d_data_t_nu,                         // d_out
-                       in_nbits, 32,    // in_nbits, out_nbits
-                       1.0 / nchan,     // scale
-                       *executestream); // stream
+
+      cu::DeviceMemory unpack_src(
+          reinterpret_cast<CUdeviceptr>(channel_job.d_in_ptr));
+
+      fdd_kernel_unpack.run(unpack_src,       // d_in
+                            nchan_words_gulp, // input width
+                            nsamp,            // input height
+                            nchan_words_gulp, // in_stride
+                            nsamp_padded,     // out_stride
+                            d_data_t_nu,      // d_out
+                            in_nbits, 32,     // in_nbits, out_nbits
+                            1.0 / nchan,      // scale
+                            *executestream);  // stream
 
       // Apply zero padding
-      auto dst_ptr = ((float *)d_data_t_nu.data()) + nsamp;
+      cu::DeviceMemory dst_ptr(reinterpret_cast<CUdeviceptr>(
+          static_cast<float *>(d_data_t_nu) + nsamp));
       unsigned int nsamp_padding = nsamp_padded - nsamp;
-      cu::checkError(cudaMemset2DAsync(dst_ptr,                       // devPtr
-                                       nsamp_padded * sizeof(float),  // pitch
-                                       0,                             // value
-                                       nsamp_padding * sizeof(float), // width
-                                       nchan_batch_max,               // height
-                                       *executestream));
+
+      executestream->memset2DAsync(
+          dst_ptr, static_cast<unsigned char>(0), nsamp_padded * sizeof(float),
+          nsamp_padding * sizeof(float), nchan_batch_max);
 
       // FFT data (real to complex) along time axis
       for (unsigned int ichan = 0; ichan < channel_job.nchan_current; ichan++) {
-        auto *idata =
-            (cufftReal *)d_data_t_nu.data() + (1ULL * ichan * nsamp_padded);
-        auto *odata = (cufftComplex *)d_data_f_nu.data() +
-                      (1ULL * ichan * nsamp_padded / 2);
-        cufftExecR2C(plan_r2c, idata, odata);
+        cu::DeviceMemory idata(reinterpret_cast<CUdeviceptr>(
+            static_cast<cufftReal *>(d_data_t_nu) +
+            (1ULL * ichan * nsamp_padded)));
+        cu::DeviceMemory odata(reinterpret_cast<CUdeviceptr>(
+            static_cast<cufftComplex *>(d_data_f_nu) +
+            (1ULL * ichan * nsamp_padded / 2)));
+
+        plan_r2c.execute(idata, odata, CUFFT_FORWARD);
       }
       executestream->record(channel_job.preprocessingEnd);
 
@@ -1105,10 +1114,10 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
         // Wait for all previous output copies to finish
         dtohstream->synchronize();
 
-        for (cu::DeviceMemory &d_data_out : d_data_f_dm_) {
+        for (std::unique_ptr<cu::DeviceMemory> &d_data_out : d_data_f_dm_) {
           // Use executestream to make sure dedispersion
           // starts only after initializing the output buffer
-          d_data_out.zero(*executestream);
+          executestream->zero(*d_data_out, d_data_out->size());
         }
       }
 
@@ -1135,20 +1144,24 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
 
         // Dedispersion in frequency domain
         executestream->record(dm_job.dedispersionStart);
-        kernel.launch(dm_job.ndm_current,        // ndm
-                      nfreq,                     // nfreq
-                      channel_job.nchan_current, // nchan
-                      dt,                        // dt
-                      d_spin_frequencies,        // d_spin_frequencies
-                      d_dm_list,                 // d_dm_list
-                      d_data_f_nu,               // d_in
-                      dm_job.d_data_f_dm_ptr,    // d_out
-                      nsamp_padded / 2,          // in stride
-                      nsamp_padded / 2,          // out stride
-                      dm_job.idm_start,          // idm_start
-                      dm_job.idm_end,            // idm_end
-                      channel_job.ichan_start,   // ichan_start
-                      *executestream);           // stream
+        fdd_kernel_dedisperse.run(
+            dm_job.ndm_current,        // ndm
+            nfreq,                     // nfreq
+            channel_job.nchan_current, // nchan
+            dt,                        // dt
+            *d_spin_frequencies,       // d_spin_frequencies
+            *d_dm_list,                // d_dm_list
+            d_data_f_nu,               // d_in
+            cu::DeviceMemory(
+                reinterpret_cast<CUdeviceptr>(dm_job.d_data_f_dm_ptr)), // d_out
+            nsamp_padded / 2,        // in stride
+            nsamp_padded / 2,        // out stride
+            dm_job.idm_start,        // idm_start
+            dm_job.idm_end,          // idm_end
+            channel_job.ichan_start, // ichan_start
+            h_delay_table.data(), h_delay_table.size() * sizeof(dedisp_float),
+            *htodstream,
+            *executestream); // stream
         executestream->record(dm_job.dedispersionEnd);
       } // end for dm_job_id_inner
 
@@ -1166,9 +1179,11 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
                  dst_stride,                // width bytes
                  nsamp);                    // height
         htodstream->record(channel_job_next.inputStart);
-        htodstream->memcpyHtoDAsync(channel_job_next.d_in_ptr, // dst
-                                    channel_job_next.h_in_ptr, // src
-                                    nsamp * dst_stride);       // size
+        htodstream->memcpyHtoDAsync(
+            cu::DeviceMemory(reinterpret_cast<CUdeviceptr>(
+                channel_job_next.d_in_ptr)), // dst
+            channel_job_next.h_in_ptr,       // src
+            nsamp * dst_stride);             // size
         htodstream->record(channel_job_next.inputEnd);
       }
 
@@ -1210,41 +1225,42 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
       // Get pointer to DM output data on host and on device
       dedisp_size dm_stride = nsamp_padded * out_bytes_per_sample;
       dedisp_size dm_offset = dm_job.idm_start * dm_stride;
+
       auto *h_data_t_dm_ptr =
-          (void *)(((size_t)h_data_t_dm.data()) + dm_offset);
+          (void *)(static_cast<char *>(h_data_t_dm) + dm_offset);
       auto *d_data_f_dm_ptr = (float *)dm_job.d_data_f_dm_ptr;
       auto *d_data_t_dm_ptr = (float *)dm_job.d_data_t_dm_ptr;
 
       // Fourier transform results back to time domain
       executestream->record(dm_job.postprocessingStart);
       for (unsigned int idm = 0; idm < dm_job.ndm_current; idm++) {
-        auto *idata =
-            (cufftComplex *)d_data_f_dm_ptr + (1ULL * idm * nsamp_padded / 2);
-        auto *odata =
-            (cufftReal *)d_data_t_dm_ptr + (1ULL * idm * nsamp_padded);
-
-        cufftResult ret;
-        if ((ret = cufftExecC2R(plan_c2r, idata, odata)) != CUFFT_SUCCESS) {
-          std::stringstream ss;
-          ss << "Error while executing C2R FFT plan, error code: " << ret;
-          throw std::runtime_error(ss.str());
-        }
+        cu::DeviceMemory idata(reinterpret_cast<CUdeviceptr>(
+            reinterpret_cast<cufftComplex *>(d_data_f_dm_ptr) +
+            (1ULL * idm * nsamp_padded / 2)));
+        cu::DeviceMemory odata(reinterpret_cast<CUdeviceptr>(
+            static_cast<cufftReal *>(d_data_t_dm_ptr) +
+            (1ULL * idm * nsamp_padded)));
+
+        plan_c2r.execute(idata, odata, CUFFT_INVERSE);
       }
 
       // FFT scaling
-      kernel.scale(dm_job.ndm_current, // height
-                   nsamp_padded,       // width
-                   nsamp_padded,       // stride
-                   1.0f / nfft,        // scale
-                   d_data_t_dm_ptr,    // d_data
-                   *executestream);    // stream
+      fdd_kernel_scale.scale(dm_job.ndm_current, // height
+                             nsamp_padded,       // width
+                             nsamp_padded,       // stride
+                             1.0f / nfft,        // scale
+                             cu::DeviceMemory(reinterpret_cast<CUdeviceptr>(
+                                 d_data_t_dm_ptr)), // d_data
+                             *executestream);       // stream
       executestream->record(dm_job.postprocessingEnd);
 
       // Copy output
-      dtohstream->waitEvent(dm_job.postprocessingEnd);
+      dtohstream->wait(dm_job.postprocessingEnd);
       dtohstream->record(dm_job.outputStart);
+
+      cu::DeviceMemory dev_ptr(
+          reinterpret_cast<CUdeviceptr>(static_cast<void *>(d_data_t_dm_ptr)));
       dtohstream->memcpyDtoHAsync(h_data_t_dm_ptr,                 // dst
-                                  d_data_t_dm_ptr,                 // src
+                                  dev_ptr,                         // src
                                   dm_job.ndm_current * dm_stride); // size
       dtohstream->record(dm_job.outputEnd);
     } // end for dm_job_id_inner
@@ -1252,7 +1268,7 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
 
   // Wait for final memory transfer
   dtohstream->record(eEndGPU);
-  mExeGPU.end(eEndGPU);
+  mExeGPU.end();
   dtohstream->synchronize();
 
   // Copy output
@@ -1263,8 +1279,9 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
 #ifdef DEDISP_BENCHMARK
   output_timer->Start();
 #endif
-  copy_chunk_output((float *)h_data_t_dm.data(), (float *)out, ndm, nsamp,
-                    nsamp_computed, nsamp_padded, nsamp_good, chunks);
+  copy_chunk_output(static_cast<float *>(h_data_t_dm),
+                    reinterpret_cast<float *>(out), ndm, nsamp, nsamp_computed,
+                    nsamp_padded, nsamp_good, chunks);
 #ifdef DEDISP_BENCHMARK
   output_timer->Pause();
 #endif
@@ -1305,10 +1322,6 @@ void FDDGPUPlan::execute_gpu_segmented(size_type nsamps, const byte_type *in,
             << std::endl;
   std::cout << std::endl;
 #endif
-
-  // Free FFT plans
-  cufftDestroy(plan_c2r);
-  cufftDestroy(plan_r2c);
 }
 
 // Private helper function
@@ -1322,10 +1335,15 @@ void FDDGPUPlan::generate_spin_frequency_table(dedisp_size nfreq,
     h_spin_frequencies[ifreq] = ifreq * (1.0 / (nsamp * dt));
   }
 
-  d_spin_frequencies.resize(nfreq * sizeof(dedisp_float));
+  if (d_spin_frequencies)
+    d_spin_frequencies.reset();
+
+  d_spin_frequencies =
+      std::make_unique<cu::DeviceMemory>(nfreq * sizeof(dedisp_float));
+  assert(d_spin_frequencies);
 
-  htodstream->memcpyHtoDAsync(d_spin_frequencies, h_spin_frequencies.data(),
-                              d_spin_frequencies.size());
+  htodstream->memcpyHtoDAsync(*d_spin_frequencies, h_spin_frequencies.data(),
+                              d_spin_frequencies->size());
 }
 
 } // end namespace dedisp
\ No newline at end of file
diff --git a/src/fdd/FDDGPUPlan.hpp b/src/fdd/FDDGPUPlan.hpp
index d19a295d3da4d60abb1c478e527518d6648f205e..d2745776820809c3bd7e174dbee9572bd1723b92 100644
--- a/src/fdd/FDDGPUPlan.hpp
+++ b/src/fdd/FDDGPUPlan.hpp
@@ -1,9 +1,12 @@
-#ifndef H_FDD_GPU_PLAN_INCLUDE_GUARD
-#define H_FDD_GPU_PLAN_INCLUDE_GUARD
+#ifndef DEDISP_FDD_GPU_PLAN_HPP_
+#define DEDISP_FDD_GPU_PLAN_HPP_
 
 #include "FDDCPUPlan.hpp"
 #include "GPUPlan.hpp"
 
+#include "dedisperse/FDDKernel.hpp"
+#include "unpack/UnpackKernel.hpp"
+
 namespace dedisp {
 
 class FDDGPUPlan : public GPUPlan {
@@ -37,15 +40,19 @@ private:
 
   // Host arrays
   std::vector<dedisp_float> h_spin_frequencies; // size = nfreq
-  std::vector<cu::HostMemory> h_data_t_nu_;
-  std::vector<cu::HostMemory> h_data_t_dm_;
+  std::vector<std::shared_ptr<cu::HostMemory>> h_data_t_nu_;
+  std::vector<std::shared_ptr<cu::HostMemory>> h_data_t_dm_;
 
   // Device arrays
-  cu::DeviceMemory d_spin_frequencies; // type = dedisp_float
-  std::vector<cu::DeviceMemory> d_data_t_nu_;
-  std::vector<cu::DeviceMemory> d_data_x_dm_;
+  std::unique_ptr<cu::DeviceMemory> d_spin_frequencies; // type = dedisp_float
+  std::vector<std::shared_ptr<cu::DeviceMemory>> d_data_t_nu_;
+  std::vector<std::shared_ptr<cu::DeviceMemory>> d_data_x_dm_;
+
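+  // Runtime-compiled GPU kernels (NVRTC via cudawrappers)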
+  FDDKernelDedisperse fdd_kernel_dedisperse;
+  FDDKernelScale fdd_kernel_scale;
+  FDDKernelUnpack fdd_kernel_unpack;
 };
 
 } // end namespace dedisp
 
-#endif // H_FDD_GPU_PLAN_INCLUDE_GUARD
\ No newline at end of file
+#endif // DEDISP_FDD_GPU_PLAN_HPP_
\ No newline at end of file
diff --git a/src/fdd/chunk.h b/src/fdd/chunk.h
index 7c7e398f91aac4722e37e3decf40d794d40e4e5a..932a02581bae91b54d21af68c65475cd810343ad 100644
--- a/src/fdd/chunk.h
+++ b/src/fdd/chunk.h
@@ -1,5 +1,5 @@
-#ifndef H_CHUNK_INCLUDE_GUARD
-#define H_CHUNK_INCLUDE_GUARD
+#ifndef DEDISP_FDD_CHUNK_H_
+#define DEDISP_FDD_CHUNK_H_
 
 #include <cstdio>
 #include <vector>
@@ -38,4 +38,4 @@ void generate_spin_frequency_table_chunks(std::vector<Chunk> &chunks,
 
 } // end namespace dedisp
 
-#endif // H_CHUNK_INCLUDE_GUARD
\ No newline at end of file
+#endif // DEDISP_FDD_CHUNK_H_
\ No newline at end of file
diff --git a/src/fdd/cinterface.cpp b/src/fdd/cinterface.cpp
index 75d3bc2b64003f4941b4577df14b6f9619b5c812..cd30ec131ba679a007a3ad51f8974d76dc95077d 100644
--- a/src/fdd/cinterface.cpp
+++ b/src/fdd/cinterface.cpp
@@ -3,9 +3,10 @@
 #include "fdd/FDDGPUPlan.hpp"
 
 #include <cmath>
-#include <iostream>
 #include <memory>
 
+#include <cudawrappers/cu.hpp>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -45,10 +46,6 @@ dedisp_error dedisp_create_plan(dedisp_plan *plan, dedisp_size nchans,
 
   *plan = nullptr;
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   *plan = new dedisp_plan_struct();
   if (!plan) {
     throw_error(DEDISP_MEM_ALLOC_FAILED);
@@ -70,10 +67,6 @@ dedisp_error dedisp_set_gulp_size(dedisp_plan plan, dedisp_size gulp_size) {
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   return DEDISP_NOT_IMPLEMENTED_ERROR;
 }
 
@@ -82,10 +75,6 @@ dedisp_size dedisp_get_gulp_size(dedisp_plan plan) {
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   throw_error(DEDISP_NOT_IMPLEMENTED_ERROR);
 
   return 0;
@@ -97,10 +86,6 @@ dedisp_error dedisp_set_dm_list(dedisp_plan plan, const dedisp_float *dm_list,
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     plan->plan_->set_dm_list(dm_list, count);
   } catch (...) {
@@ -117,10 +102,6 @@ dedisp_error dedisp_generate_dm_list(dedisp_plan plan, dedisp_float dm_start,
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     plan->plan_->generate_dm_list(dm_start, dm_end, ti, tol);
   } catch (...) {
@@ -180,10 +161,6 @@ dedisp_error dedisp_set_killmask(dedisp_plan plan,
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     plan->plan_->set_killmask(killmask);
   } catch (...) {
@@ -289,10 +266,6 @@ dedisp_error dedisp_execute_guru(const dedisp_plan plan, dedisp_size nsamps,
     throw_error(DEDISP_INVALID_FLAG_COMBINATION);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   return DEDISP_NOT_IMPLEMENTED_ERROR;
 }
 
@@ -309,10 +282,6 @@ dedisp_error dedisp_execute_adv(const dedisp_plan plan, dedisp_size nsamps,
     throw_error(DEDISP_INVALID_FLAG_COMBINATION);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   return DEDISP_NOT_IMPLEMENTED_ERROR;
 }
 
@@ -325,10 +294,6 @@ dedisp_error dedisp_execute(const dedisp_plan plan, dedisp_size nsamps,
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     plan->plan_->execute(nsamps, in, in_nbits, out, out_nbits, flags);
   } catch (...) {
@@ -340,7 +305,7 @@ dedisp_error dedisp_execute(const dedisp_plan plan, dedisp_size nsamps,
 
 dedisp_error dedisp_sync(void) {
   try {
-    cu::checkError(cudaDeviceSynchronize());
+    cu::Context::synchronize();
   } catch (...) {
     throw_error(DEDISP_PRIOR_GPU_ERROR);
   }
diff --git a/src/fdd/dedisperse/FDDKernel.cu b/src/fdd/dedisperse/FDDKernel.cu
index 1ff643d770fcee64dece4591e2613986405ddc52..e5a973c1d404bd1beb4bda96a76438d6b068cf26 100644
--- a/src/fdd/dedisperse/FDDKernel.cu
+++ b/src/fdd/dedisperse/FDDKernel.cu
@@ -1,81 +1,87 @@
 #include "FDDKernel.hpp"
-#include "common/cuda/CU.h"
-#include "fdd_kernel.cuh"
-#include <cuda.h>
 
-/*
- * Helper functions
- */
-void FDDKernel::copy_delay_table(const void *src, size_t count, size_t offset,
-                                 cudaStream_t stream) {
-  cu::checkError(cudaMemcpyToSymbolAsync(c_delay_table, src, count, offset,
-                                         cudaMemcpyDeviceToDevice, stream));
-}
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
 
-unsigned long div_round_up(unsigned long a, unsigned long b) {
-  return (a - 1) / b + 1;
-}
+#include "GPUKernel.hpp"
+#include "fdd_dedisperse_kernel.cu"
+#include <cudawrappers/cu.hpp>
 
 /*
  * dedisperse routine
  */
-void FDDKernel::launch(dedisp_size ndm, dedisp_size nfreq, dedisp_size nchan,
-                       float dt, const dedisp_float *d_spin_frequencies,
-                       const dedisp_float *d_dm_list, const dedisp_float2 *d_in,
-                       const dedisp_float2 *d_out, dedisp_size in_stride,
-                       dedisp_size out_stride, unsigned int idm_start,
-                       unsigned int idm_end, unsigned int ichan_start,
-                       cudaStream_t stream) {
+void FDDKernelDedisperse::run(
+    unsigned long ndm, unsigned long nfreq, unsigned long nchan, float dt,
+    const cu::DeviceMemory &d_spin_frequencies,
+    const cu::DeviceMemory &d_dm_list, const cu::DeviceMemory &d_in,
+    const cu::DeviceMemory &d_out, unsigned long in_stride,
+    unsigned long out_stride, unsigned int idm_start, unsigned int idm_end,
+    unsigned int ichan_start, const void *delay_table,
+    const size_t delay_table_count, cu::Stream &htodstream,
+    cu::Stream &executestream) {
   // Define thread decomposition
   unsigned grid_x = std::max((int)((ndm + NDM_BATCH_GRID) / NDM_BATCH_GRID), 1);
   unsigned grid_y = NFREQ_BATCH_GRID;
   dim3 grid(grid_x, grid_y);
   dim3 block(NFREQ_BATCH_BLOCK);
 
-/* Execute the kernel
- *  The second kernel argument can be set to true
- *  in order to enable an experimental optimization feature,
- *  where extrapolation is used in the computation of the phasors.
- *  Boudary conditions should be further explored to determine
- *  functional correctness at all times.
- *  Leaving this feature in because it might be beneficial
- *  depending on the system configurations.
- */
-#define CALL_KERNEL(NCHAN)                                                     \
-  dedisperse_kernel<NCHAN, false><<<grid, block, 0, stream>>>(                 \
-      nfreq, dt, (float *)d_spin_frequencies, (float *)d_dm_list, in_stride,   \
-      out_stride, (const float2 *)d_in, (float2 *)d_out, idm_start, idm_end,   \
-      ichan_start);
-
-  switch (nchan) {
-  case 16:
-    CALL_KERNEL(16);
-    break;
-  case 32:
-    CALL_KERNEL(32);
-    break;
-  case 64:
-    CALL_KERNEL(64);
-    break;
-  case 128:
-    CALL_KERNEL(128);
-    break;
-  case 256:
-    CALL_KERNEL(256);
-    break;
+  /* Execute the kernel
+   *  The DEF_EXTRAPOLATE compile option can be set to true
+   *  in order to enable an experimental optimization feature,
+   *  where extrapolation is used in the computation of the phasors.
+   *  Boundary conditions should be further explored to determine
+   *  functional correctness at all times.
+   *  Leaving this feature in because it might be beneficial
+   *  depending on the system configuration.
+   */
+
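+  // Kernel arguments, passed by address in the order expected by
+  // dedisperse_kernel_runtime() in fdd_dedisperse_kernel.cu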
+  const std::vector<const void *> parameters = {
+      &nfreq, &dt,    &d_spin_frequencies, &d_dm_list, &in_stride,  &out_stride,
+      &d_in,  &d_out, &idm_start,          &idm_end,   &ichan_start};
+
+  // Compile the kernel for the requested number of channels
+  std::ostringstream oss;
+  oss << "-DDEF_NCHAN=" << nchan;
+
+  CompiledKernelInfo kernel;
+  bool did_recompile = false;
+
+  std::tie(kernel, did_recompile) =
+      compile({"-DDEF_EXTRAPOLATE=false", oss.str()});
+  assertCompiled(kernel);
+
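+  // The delay table lives in constant memory of the compiled module, so it
+  // only needs to be (re)uploaded after a recompilation.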
+  if (did_recompile) {
+    htodstream.memcpyHtoDAsync(kernel.module->getGlobal("c_delay_table"),
+                               delay_table, delay_table_count);
   }
+
+  executestream.launchKernel(*(kernel.function), grid.x, grid.y, grid.z,
+                             block.x, block.y, block.z, 0, parameters);
 }
 
 /*
  * dedisperse routine
  */
-void FDDKernel::scale(dedisp_size height, dedisp_size width, dedisp_size stride,
-                      dedisp_float scale, dedisp_float *d_data,
-                      cudaStream_t stream) {
+void FDDKernelScale::scale(dedisp_size height, dedisp_size width,
+                           dedisp_size stride, dedisp_float scale,
+                           const cu::DeviceMemory &d_data, cu::Stream &stream) {
+
+  CompiledKernelInfo kernel;
+  bool did_recompile;
+
+  std::tie(kernel, did_recompile) = compile();
+  assertCompiled(kernel);
+
   // Define thread decomposition
   dim3 grid(height);
   dim3 block(128);
 
   // Execute the kernel
-  scale_output_kernel<<<grid, block, 0, stream>>>(width, stride, scale, d_data);
+  std::vector<const void *> parameters = {&width, &stride, &scale, &d_data};
+
+  stream.launchKernel(*(kernel.function), grid.x, grid.y, grid.z, block.x,
+                      block.y, block.z, 0, parameters);
 }
diff --git a/src/fdd/dedisperse/FDDKernel.hpp b/src/fdd/dedisperse/FDDKernel.hpp
index a96988f69b35f3bae343b31978f92afee2eb2372..ad7e431eaea64e804679754d582aa954046cc89d 100644
--- a/src/fdd/dedisperse/FDDKernel.hpp
+++ b/src/fdd/dedisperse/FDDKernel.hpp
@@ -1,25 +1,54 @@
-#ifndef FDD_KERNEL_H_INCLUDE_GUARD
-#define FDD_KERNEL_H_INCLUDE_GUARD
+#ifndef FDD_KERNEL_H_
+#define FDD_KERNEL_H_
 
-#include <cuda.h>
+#include <cudawrappers/cu.hpp>
+#include <cudawrappers/nvtx.hpp>
 
 #include "common/dedisp_types.h"
+#include <GPUKernel.hpp>
 
-class FDDKernel {
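+// Symbols generated by target_embed_source(); they delimit the embedded CUDA
+// kernel sources that are JIT-compiled at runtime.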
+extern const char _binary_src_fdd_dedisperse_fdd_dedisperse_kernel_cu_start,
+    _binary_src_fdd_dedisperse_fdd_dedisperse_kernel_cu_end;
+
+extern const char _binary_src_fdd_dedisperse_fdd_scale_kernel_cu_start,
+    _binary_src_fdd_dedisperse_fdd_scale_kernel_cu_end;
+
+class FDDKernelDedisperse : public GPUKernel {
 public:
-  void copy_delay_table(const void *src, size_t count, size_t offset,
-                        cudaStream_t stream);
+  FDDKernelDedisperse()
+      : GPUKernel(
+            "fdd_dedisperse_kernel.cu", "dedisperse_kernel_runtime",
+            std::string(
+                reinterpret_cast<const char *>(
+                    &_binary_src_fdd_dedisperse_fdd_dedisperse_kernel_cu_start),
+                reinterpret_cast<const char *>(
+                    &_binary_src_fdd_dedisperse_fdd_dedisperse_kernel_cu_end))) {
+  }
 
-  void launch(dedisp_size ndm, dedisp_size nfreq, dedisp_size nchan, float dt,
-              const dedisp_float *d_spin_frequencies,
-              const dedisp_float *d_dm_list, const dedisp_float2 *d_in,
-              const dedisp_float2 *d_out, dedisp_size in_stride,
-              dedisp_size out_stride, unsigned int idm_start,
-              unsigned int idm_end, unsigned int ichan_start,
-              cudaStream_t stream);
+  void run(dedisp_size ndm, dedisp_size nfreq, dedisp_size nchan, float dt,
+           const cu::DeviceMemory &d_spin_frequencies,
+           const cu::DeviceMemory &d_dm_list, const cu::DeviceMemory &d_in,
+           const cu::DeviceMemory &d_out, dedisp_size in_stride,
+           dedisp_size out_stride, unsigned int idm_start, unsigned int idm_end,
+           unsigned int ichan_start, const void *delay_table,
+           const size_t delay_table_count, cu::Stream &htodstream,
+           cu::Stream &executestream);
+};
+
+class FDDKernelScale : public GPUKernel {
+public:
+  FDDKernelScale()
+      : GPUKernel(
+            "fdd_kernel_scale.cu", "scale_output_kernel",
+            std::string(
+                reinterpret_cast<const char *>(
+                    &_binary_src_fdd_dedisperse_fdd_scale_kernel_cu_start),
+                reinterpret_cast<const char *>(
+                    &_binary_src_fdd_dedisperse_fdd_scale_kernel_cu_end))) {}
 
   void scale(dedisp_size height, dedisp_size width, dedisp_size stride,
-             dedisp_float scale, dedisp_float *d_data, cudaStream_t stream);
+             dedisp_float scale, const cu::DeviceMemory &d_data,
+             cu::Stream &stream);
 };
 
-#endif // FDD_KERNEL_H_INCLUDE_GUARD
\ No newline at end of file
+#endif // FDD_KERNEL_H_
\ No newline at end of file
diff --git a/src/fdd/dedisperse/fdd_kernel.cuh b/src/fdd/dedisperse/fdd_dedisperse_kernel.cu
similarity index 84%
rename from src/fdd/dedisperse/fdd_kernel.cuh
rename to src/fdd/dedisperse/fdd_dedisperse_kernel.cu
index 9aaac69a656000667a506ad4fe9535d8339479d9..d3cc62771cc3b44a75042552f219b538291905e8 100644
--- a/src/fdd/dedisperse/fdd_kernel.cuh
+++ b/src/fdd/dedisperse/fdd_dedisperse_kernel.cu
@@ -1,8 +1,12 @@
+#ifndef FDD_DEDISPERSE_KERNEL_H_
+#define FDD_DEDISPERSE_KERNEL_H_
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
 // Constant reference for input data
 // This value is set according to the constant memory size
 // for all NVIDIA GPUs to date, which is 64 KB and sizeof(dedisp_float) = 4
 #define DEDISP_MAX_NCHANS 16384
-__constant__ dedisp_float c_delay_table[DEDISP_MAX_NCHANS];
 
 // The number of DMs computed by a single thread block
 #define NDM_BATCH_GRID 8
@@ -19,6 +23,9 @@ __constant__ dedisp_float c_delay_table[DEDISP_MAX_NCHANS];
 // Option to enable/disable caching input samples in shared memory
 #define USE_SHARED_MEMORY 0
 
+// Option to enable/disable extrapolation of the phasors
+#define EXTRAPOLATE_PHASORS false
+
 /*
  * Helper functions
  */
@@ -33,16 +40,19 @@ inline __device__ float2 cmul(float2 a, float2 b) {
   return {a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
 }
 
+__constant__ float c_delay_table[DEDISP_MAX_NCHANS];
+
 /*
  * dedisperse kernel
  * FDD computes dedispersion as phase rotations in the Fourier domain
  */
 template <unsigned int NCHAN, bool extrapolate>
-__global__ void
-dedisperse_kernel(size_t nfreq, float dt, const float *d_spin_frequencies,
-                  const float *d_dm_list, size_t in_stride, size_t out_stride,
-                  const float2 *d_in, float2 *d_out, unsigned int idm_start,
-                  unsigned int idm_end, unsigned int ichan_start) {
+__device__ inline void
+dedisperse_kernel_impl(size_t nfreq, float dt, const float *d_spin_frequencies,
+                       const float *d_dm_list, size_t in_stride,
+                       size_t out_stride, const float2 *d_in, float2 *d_out,
+                       unsigned int idm_start, unsigned int idm_end,
+                       unsigned int ichan_start) {
   // The DM that the current block processes
   unsigned int idm_current = blockIdx.x;
   // The DM offset is the number of DMs processed by all thread blocks
@@ -201,12 +211,24 @@ dedisperse_kernel(size_t nfreq, float dt, const float *d_spin_frequencies,
   } // end for ifreq_current loop
 } // end dedisperse_kernel
 
-/*
- * scale kernel
- */
-__global__ void scale_output_kernel(size_t n, size_t stride, float scale,
-                                    float *d_data) {
-  for (unsigned int i = threadIdx.x; i < n; i += blockDim.x) {
-    d_data[blockIdx.x * stride + i] *= scale;
-  }
-}
\ No newline at end of file
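+// DEF_NCHAN and DEF_EXTRAPOLATE are normally injected as -D options by
+// FDDKernelDedisperse::run(); the defaults below only apply when this file is
+// compiled standalone.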
+extern "C" {
+#ifndef DEF_NCHAN
+#define DEF_NCHAN 16
+#endif
+
+#ifndef DEF_EXTRAPOLATE
+#define DEF_EXTRAPOLATE false
+#endif
+
+__global__ void dedisperse_kernel_runtime(
+    size_t nfreq, float dt, const float *d_spin_frequencies,
+    const float *d_dm_list, size_t in_stride, size_t out_stride,
+    const float2 *d_in, float2 *d_out, unsigned int idm_start,
+    unsigned int idm_end, unsigned int ichan_start) {
+  dedisperse_kernel_impl<DEF_NCHAN, DEF_EXTRAPOLATE>(
+      nfreq, dt, d_spin_frequencies, d_dm_list, in_stride, out_stride, d_in,
+      d_out, idm_start, idm_end, ichan_start);
+}
+}
+
+#endif // FDD_DEDISPERSE_KERNEL_H_
\ No newline at end of file
diff --git a/src/fdd/dedisperse/fdd_scale_kernel.cu b/src/fdd/dedisperse/fdd_scale_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..364bee6a542231e071e65e9c4982503a9456a5b9
--- /dev/null
+++ b/src/fdd/dedisperse/fdd_scale_kernel.cu
@@ -0,0 +1,11 @@
+#ifndef FDD_SCALE_KERNEL_H_
+#define FDD_SCALE_KERNEL_H_
+
+extern "C" __global__ void scale_output_kernel(size_t n, size_t stride,
+                                               float scale, float *d_data) {
+  for (unsigned int i = threadIdx.x; i < n; i += blockDim.x) {
+    d_data[blockIdx.x * stride + i] *= scale;
+  }
+}
+
+#endif // FDD_SCALE_KERNEL_H_
\ No newline at end of file
diff --git a/src/fdd/helper.h b/src/fdd/helper.h
index 07d55162f5bea8b09adf8569cac3ec9d1f7629e7..b00674e119b57fd89acb35463223b0eb16df84d4 100644
--- a/src/fdd/helper.h
+++ b/src/fdd/helper.h
@@ -1,5 +1,5 @@
-#ifndef FDD_HELPER_H_INCLUDE_GUARD
-#define FDD_HELPER_H_INCLUDE_GUARD
+#ifndef DEDISP_FDD_HELPER_H
+#define DEDISP_FDD_HELPER_H
 
 #include "common/helper.h"
 
@@ -40,4 +40,4 @@ void copy_data(size_t height, size_t width, size_t in_stride, size_t out_stride,
 
 } // end namespace dedisp
 
-#endif // FDD_HELPER_H_INCLUDE_GUARD
\ No newline at end of file
+#endif // DEDISP_FDD_HELPER_H
\ No newline at end of file
diff --git a/src/fdd/unpack/UnpackKernel.cu b/src/fdd/unpack/UnpackKernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..942ca8cee6b3905dd2d6b9cf6374294a6cedfdad
--- /dev/null
+++ b/src/fdd/unpack/UnpackKernel.cu
@@ -0,0 +1,91 @@
+#include "UnpackKernel.hpp"
+
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <cudawrappers/cu.hpp>
+#include <cudawrappers/nvrtc.hpp>
+
+#include "common/dedisp_types.h"
+
+#include "kernel_constants.cuh"
+
+template <typename U> inline U round_up_pow2(const U &a) {
+  U r = a - 1;
+  for (unsigned long i = 1; i <= sizeof(U) * 8 / 2; i <<= 1)
+    r |= r >> i;
+  return r + 1;
+}
+
+template <typename U> inline U round_down_pow2(const U &a) {
+  return round_up_pow2(a + 1) / 2;
+}
+
+void FDDKernelUnpack::run(const cu::DeviceMemory &d_in, size_t width,
+                          size_t height, size_t in_stride, size_t out_stride,
+                          cu::DeviceMemory &d_out, unsigned long in_nbits,
+                          unsigned long out_nbits, float scale,
+                          cu::Stream &stream) {
+  CompiledKernelInfo kernel;
+  bool did_recompile;
+
+  std::tie(kernel, did_recompile) = compile();
+  assertCompiled(kernel);
+
+  // Specify thread decomposition (uses up-rounded divisions)
+  const dim3 tot_block_count((width - 1) / TILE_DIM + 1,
+                             (height - 1) / TILE_DIM + 1);
+  const size_t max_grid_dim = round_down_pow2(32768);
+
+  // Partition the grid into chunks that the GPU can accept at once
+  for (size_t block_y_offset = 0; block_y_offset < tot_block_count.y;
+       block_y_offset += max_grid_dim) {
+    dim3 block_count;
+
+    // Handle the possibly incomplete final grid
+    block_count.y = std::min(max_grid_dim, tot_block_count.y - block_y_offset);
+
+    for (size_t block_x_offset = 0; block_x_offset < tot_block_count.x;
+         block_x_offset += max_grid_dim) {
+      // Handle the possibly incomplete final grid
+      block_count.x =
+          std::min(max_grid_dim, tot_block_count.x - block_x_offset);
+
+      // Compute the chunked parameters
+      size_t x_offset = block_x_offset * TILE_DIM;
+      size_t y_offset = block_y_offset * TILE_DIM;
+      size_t in_offset = x_offset + y_offset * in_stride;
+      size_t out_offset = y_offset + x_offset * out_stride;
+      size_t w = std::min(max_grid_dim * TILE_DIM, width - x_offset);
+      size_t h = std::min(max_grid_dim * TILE_DIM, height - y_offset);
+
+      dim3 block(TILE_DIM, BLOCK_ROWS);
+
+      // Specify grid decomposition
+      dim3 grid(round_up_pow2(block_count.x), round_up_pow2(block_count.y));
+
+      // Run the CUDA kernel
+      const void *d_in_offset =
+          static_cast<const dedisp_word *>(d_in) + in_offset;
+      const void *d_out_offset = static_cast<dedisp_word *>(d_out) + out_offset;
+
+      std::vector<const void *> parameters = {&d_in_offset,
+                                              &w,
+                                              &h,
+                                              &in_stride,
+                                              &out_stride,
+                                              &d_out_offset,
+                                              &(block_count.x),
+                                              &(block_count.y),
+                                              &in_nbits,
+                                              &scale};
+
+      stream.launchKernel(*(kernel.function), grid.x, grid.y, grid.z, block.x,
+                          block.y, block.z, 0, parameters);
+    } // end for block_x_offset
+  } // end for block_y_offset
+}
diff --git a/src/fdd/unpack/UnpackKernel.hpp b/src/fdd/unpack/UnpackKernel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ec64a49b73cab85e629e7423553fe1810ea0d48b
--- /dev/null
+++ b/src/fdd/unpack/UnpackKernel.hpp
@@ -0,0 +1,27 @@
+#ifndef DEDISP_FDD_UNPACK_UNPACK_KERNEL_HPP_
+#define DEDISP_FDD_UNPACK_UNPACK_KERNEL_HPP_
+
+#include "cudawrappers/cu.hpp"
+#include <GPUKernel.hpp>
+
+extern const char _binary_src_fdd_unpack_transpose_unpack_kernel_cu_start,
+    _binary_src_fdd_unpack_transpose_unpack_kernel_cu_end;
+
+class FDDKernelUnpack : public GPUKernel {
+public:
+  FDDKernelUnpack()
+      : GPUKernel(
+            "transpose_unpack_kernel.cu", "transpose_unpack_kernel_uint",
+            std::string(
+                reinterpret_cast<const char *>(
+                    &_binary_src_fdd_unpack_transpose_unpack_kernel_cu_start),
+                reinterpret_cast<const char *>(
+                    &_binary_src_fdd_unpack_transpose_unpack_kernel_cu_end))) {}
+
+  void run(const cu::DeviceMemory &d_in, size_t width, size_t height,
+           size_t in_stride, size_t out_stride, cu::DeviceMemory &d_out,
+           unsigned long in_nbits, unsigned long out_nbits, float scale,
+           cu::Stream &stream);
+};
+
+#endif // DEDISP_FDD_UNPACK_UNPACK_KERNEL_HPP_
\ No newline at end of file
diff --git a/src/fdd/unpack/kernel_constants.cuh b/src/fdd/unpack/kernel_constants.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..8ad63a47b9338dca773dea3d20ea5f6de9ff9939
--- /dev/null
+++ b/src/fdd/unpack/kernel_constants.cuh
@@ -0,0 +1,8 @@
+#ifndef DEDISP_FDD_UNPACK_KERNEL_CONSTANTS_CUH_
+#define DEDISP_FDD_UNPACK_KERNEL_CONSTANTS_CUH_
+
+#define TILE_DIM 32
+#define BLOCK_ROWS 8
+#define EXPANSION 4
+
+#endif // DEDISP_FDD_UNPACK_KERNEL_CONSTANTS_CUH_
\ No newline at end of file
diff --git a/src/fdd/unpack/unpack_kernel.cuh b/src/fdd/unpack/transpose_unpack_kernel.cu
similarity index 79%
rename from src/fdd/unpack/unpack_kernel.cuh
rename to src/fdd/unpack/transpose_unpack_kernel.cu
index e6ae13208bc0733f45fe3d44818c1c136f3671e9..c1542d7853f9872bfd63f598a984c95584179753 100644
--- a/src/fdd/unpack/unpack_kernel.cuh
+++ b/src/fdd/unpack/transpose_unpack_kernel.cu
@@ -1,9 +1,9 @@
-#define TILE_DIM 32
-#define BLOCK_ROWS 8
-#define EXPANSION 4
+#ifndef FDD_TRANSPOSE_UNPACK_KERNEL_H_
+#define FDD_TRANSPOSE_UNPACK_KERNEL_H_
+#include "kernel_constants.cuh"
 
 template <typename WordType>
-__global__ void
+__device__ inline void
 transpose_unpack_kernel(const WordType *in, size_t width, size_t height,
                         size_t in_stride, size_t out_stride, WordType *out,
                         size_t block_count_x, size_t block_count_y,
@@ -73,4 +73,15 @@ transpose_unpack_kernel(const WordType *in, size_t width, size_t height,
       }
     }
   }
-}
\ No newline at end of file
+}
+
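+// Unmangled entry point for the unsigned int instantiation; the symbol name
+// matches the one requested by FDDKernelUnpack at runtime.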
+extern "C" __global__ void transpose_unpack_kernel_uint(
+    const unsigned int *in, size_t width, size_t height, size_t in_stride,
+    size_t out_stride, unsigned int *out, size_t block_count_x,
+    size_t block_count_y, size_t in_nbits, float scale) {
+  transpose_unpack_kernel<unsigned int>(in, width, height, in_stride,
+                                        out_stride, out, block_count_x,
+                                        block_count_y, in_nbits, scale);
+}
+
+#endif // FDD_TRANSPOSE_UNPACK_KERNEL_H_
\ No newline at end of file
diff --git a/src/fdd/unpack/unpack.cu b/src/fdd/unpack/unpack.cu
deleted file mode 100644
index 3e6f0bb9b8b913c1fc1274b1ea66d658186eb3e8..0000000000000000000000000000000000000000
--- a/src/fdd/unpack/unpack.cu
+++ /dev/null
@@ -1,60 +0,0 @@
-#include <algorithm>
-
-#include "dedisp_types.h"
-
-#include "unpack_kernel.cuh"
-
-template <typename U> inline U round_up_pow2(const U &a) {
-  U r = a - 1;
-  for (unsigned long i = 1; i <= sizeof(U) * 8 / 2; i <<= 1)
-    r |= r >> i;
-  return r + 1;
-}
-
-template <typename U> inline U round_down_pow2(const U &a) {
-  return round_up_pow2(a + 1) / 2;
-}
-
-void transpose_unpack(const dedisp_word *d_in, size_t width, size_t height,
-                      size_t in_stride, size_t out_stride, dedisp_word *d_out,
-                      dedisp_size in_nbits, dedisp_size out_nbits, float scale,
-                      cudaStream_t stream) {
-  // Specify thread decomposition (uses up-rounded divisions)
-  dim3 tot_block_count((width - 1) / TILE_DIM + 1, (height - 1) / TILE_DIM + 1);
-
-  size_t max_grid_dim = round_down_pow2(32768);
-
-  // Partition the grid into chunks that the GPU can accept at once
-  for (size_t block_y_offset = 0; block_y_offset < tot_block_count.y;
-       block_y_offset += max_grid_dim) {
-    dim3 block_count;
-
-    // Handle the possibly incomplete final grid
-    block_count.y = std::min(max_grid_dim, tot_block_count.y - block_y_offset);
-
-    for (size_t block_x_offset = 0; block_x_offset < tot_block_count.x;
-         block_x_offset += max_grid_dim) {
-      // Handle the possibly incomplete final grid
-      block_count.x =
-          std::min(max_grid_dim, tot_block_count.x - block_x_offset);
-
-      // Compute the chunked parameters
-      size_t x_offset = block_x_offset * TILE_DIM;
-      size_t y_offset = block_y_offset * TILE_DIM;
-      size_t in_offset = x_offset + y_offset * in_stride;
-      size_t out_offset = y_offset + x_offset * out_stride;
-      size_t w = std::min(max_grid_dim * TILE_DIM, width - x_offset);
-      size_t h = std::min(max_grid_dim * TILE_DIM, height - y_offset);
-
-      dim3 block(TILE_DIM, BLOCK_ROWS);
-
-      // Specify grid decomposition
-      dim3 grid(round_up_pow2(block_count.x), round_up_pow2(block_count.y));
-
-      // Run the CUDA kernel
-      transpose_unpack_kernel<dedisp_word><<<grid, block, 0, stream>>>(
-          d_in + in_offset, w, h, in_stride, out_stride, d_out + out_offset,
-          block_count.x, block_count.y, in_nbits, scale);
-    } // end for block_x_offset
-  } // end for block_y_offset
-}
\ No newline at end of file
diff --git a/src/fdd/unpack/unpack.h b/src/fdd/unpack/unpack.h
deleted file mode 100644
index a5b327d27fa78a4c26ff0146aa2d5ba593c713b2..0000000000000000000000000000000000000000
--- a/src/fdd/unpack/unpack.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#include "dedisp_types.h"
-
-#include "cuda_runtime.h"
-
-void transpose_unpack(const dedisp_word *d_in, size_t width, size_t height,
-                      size_t in_stride, size_t out_stride, dedisp_word *d_out,
-                      dedisp_size in_nbits, dedisp_size out_nbits, float scale,
-                      cudaStream_t stream);
diff --git a/src/tdd/CMakeLists.txt b/src/tdd/CMakeLists.txt
index bd92e339abe2f415b5a449ae362f8a30798c0aed..5de8f2fcf19acaaec4e96f740eee448c3ee2fbc8 100644
--- a/src/tdd/CMakeLists.txt
+++ b/src/tdd/CMakeLists.txt
@@ -3,17 +3,27 @@ add_library(
   cinterface.cpp
   TDDPlan.cpp
   dedisperse/TDDKernel.cu
-  transpose/transpose.cu
+  transpose/TransposeKernel.cu
   $<TARGET_OBJECTS:common>
   $<TARGET_OBJECTS:plan>
   $<TARGET_OBJECTS:external>)
 
-target_include_directories(tdd PRIVATE ${CMAKE_SOURCE_DIR}/src)
+target_include_directories(
+  tdd
+  PRIVATE "${CMAKE_SOURCE_DIR}/src"
+  PRIVATE "${CMAKE_SOURCE_DIR}/src/tdd")
+
+include(GNUInstallDirs)
+list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_FULL_LIBDIR})
+
+target_compile_options(tdd PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math>)
 
-target_link_libraries(tdd CUDA::cudart CUDA::nvToolsExt CUDA::cuda_driver
-                      OpenMP::OpenMP_CXX)
+target_embed_source(tdd transpose/tdd_transpose_kernel.cu)
+target_embed_source(tdd dedisperse/tdd_dedisperse_kernel.cu)
 
-target_compile_options(tdd PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math>)
+target_link_libraries(
+  tdd PUBLIC OpenMP::OpenMP_CXX cudawrappers::cu cudawrappers::nvtx
+             cudawrappers::nvrtc Threads::Threads)
 
 set_target_properties(
   tdd
@@ -21,6 +31,15 @@ set_target_properties(
              SOVERSION ${DEDISP_VERSION_MAJOR}
              PUBLIC_HEADER TDDPlan.hpp)
 
+if(${DEDISP_BACKEND_HIP})
+  get_target_property(sources tdd SOURCES)
+  set_source_files_properties(${sources} PROPERTIES LANGUAGE HIP)
+endif()
+
+if(DEDISP_BENCHMARK_WITH_PMT)
+  target_link_libraries(tdd PUBLIC pmt)
+endif()
+
 install(
   TARGETS tdd
   LIBRARY DESTINATION lib
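
The target_embed_source() calls above (a helper shipped with cudawrappers) do not compile the kernels ahead of time; they link the raw .cu sources into the library as data so they can be handed to the runtime compiler later. Each embedded blob is delimited by linker-generated _binary_*_start/_end symbols, which the kernel wrapper classes in this patch convert back into a std::string. A minimal sketch of that recovery step, with a hypothetical symbol name:

    #include <string>

    // Linker-generated markers for an embedded source file; the real symbol
    // names are derived from the path passed to target_embed_source().
    extern const char _binary_example_kernel_cu_start, _binary_example_kernel_cu_end;

    // Rebuild the embedded kernel source as a string for runtime compilation.
    inline std::string embedded_kernel_source() {
      return std::string(&_binary_example_kernel_cu_start,
                         &_binary_example_kernel_cu_end);
    }
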
diff --git a/src/tdd/TDDPlan.cpp b/src/tdd/TDDPlan.cpp
index e14eb7cb52f7a1366819a89c5718d78dac57cef8..fc5c8003e80f0e74ce65abf6ad368eb58eca95d3 100644
--- a/src/tdd/TDDPlan.cpp
+++ b/src/tdd/TDDPlan.cpp
@@ -9,26 +9,32 @@
  * - Fusing unpack and transpose kernels in to one kernel,
  *   thus requiring only a single pass over the data for the same operations.
  */
+#include <cassert>
 #include <iostream>
 #include <mutex>
 #include <thread>
 
 #include "TDDPlan.hpp"
 
-#include "common/cuda/CU.h"
 #include "common/dedisp_strings.h"
 #include "common/helper.h"
+#include <cudawrappers/nvtx.hpp>
 
+#include "dedisp_types.h"
 #include "dedisperse/TDDKernel.hpp"
-#include "transpose/transpose.h"
+#include "tdd/transpose/TransposeKernel.hpp"
 
 #include "dedisp_error.hpp"
 
-#if defined(DEDISP_BENCHMARK)
+#ifdef DEDISP_BENCHMARK
 #include "external/Stopwatch.h"
 #include <fstream>
 #endif
 
+#ifdef HAVE_PMT
+#include <pmt.h>
+#endif
+
 namespace dedisp {
 
 // Constructor
@@ -54,9 +60,11 @@ unsigned long div_round_up(unsigned long a, unsigned long b) {
 dedisp_size TDDPlan::compute_gulp_size() { return 65536; }
 
 dedisp_size TDDPlan::compute_max_nchans() {
-  size_t const_mem_bytes = m_device->get_total_const_memory();
-  size_t bytes_per_chan = sizeof(dedisp_float) + sizeof(dedisp_bool);
-  size_t max_nr_channels = const_mem_bytes / bytes_per_chan;
+  const size_t const_mem_bytes = static_cast<size_t>(
+      m_device->getAttribute(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY));
+  const size_t bytes_per_chan = sizeof(dedisp_float) + sizeof(dedisp_bool);
+  const size_t max_nr_channels = const_mem_bytes / bytes_per_chan;
+
   return max_nr_channels;
 };
 
@@ -135,22 +143,22 @@ void TDDPlan::execute_guru(size_type nsamps, const byte_type *in,
   std::unique_ptr<Stopwatch> total_timer(Stopwatch::create());
   std::unique_ptr<Stopwatch> input_timer(Stopwatch::create());
   std::unique_ptr<Stopwatch> output_timer(Stopwatch::create());
+
+#if defined(HAVE_PMT)
+#if defined(__HIP__)
+  auto pmt_sensor = pmt::Create("rocm", m_device->getOrdinal());
+#else
+  auto pmt_sensor = pmt::Create("nvidia", m_device->getOrdinal());
+#endif
+#endif
+
   total_timer->Start();
   init_timer->Start();
 #endif
   // Annotate the initialization
-  cu::Marker initMarker("initialization", cu::Marker::red);
+  nvtx::Marker initMarker("initialization", nvtx::Marker::red);
   initMarker.start();
 
-  // Copy the lookup tables to constant memory on the device
-  cu::Marker constantMarker("copy_constant_memory", cu::Marker::yellow);
-  constantMarker.start();
-  m_kernel.copy_delay_table(d_delay_table, m_nchans * sizeof(dedisp_float), 0,
-                            *htodstream);
-  m_kernel.copy_killmask(d_killmask, m_nchans * sizeof(dedisp_bool), 0,
-                         *htodstream);
-  constantMarker.end();
-
   // Compute the problem decomposition
   dedisp_size nsamps_computed = nsamps - m_max_delay;
 
@@ -165,7 +173,7 @@ void TDDPlan::execute_guru(size_type nsamps, const byte_type *in,
   // Note: If desired, this could be rounded up, e.g., to a power of 2
   dedisp_size in_buf_stride_words = nchan_words;
   dedisp_size in_count_gulp_max = nsamps_gulp_max * in_buf_stride_words;
-  dedisp_size samps_per_thread = m_kernel.get_nsamps_per_thread();
+  dedisp_size samps_per_thread = tdd_kernel_dedisp.get_nsamps_per_thread();
 
   dedisp_size nsamps_padded_gulp_max =
       div_round_up(nsamps_computed_gulp_max, samps_per_thread) *
@@ -190,20 +198,27 @@ void TDDPlan::execute_guru(size_type nsamps, const byte_type *in,
   cu::DeviceMemory d_transposed(in_count_padded_gulp_max * sizeof(dedisp_word));
   cu::DeviceMemory d_out(out_count_gulp_max * sizeof(dedisp_word));
   cu::HostMemory h_out(out_count_gulp_max * sizeof(dedisp_word));
-  std::vector<cu::HostMemory> h_in_(2);
-  std::vector<cu::DeviceMemory> d_in_(2);
-  for (unsigned int i = 0; i < 2; i++) {
-    h_in_[i].resize(in_count_gulp_max * sizeof(dedisp_word));
-    d_in_[i].resize(in_count_gulp_max * sizeof(dedisp_word));
-  }
+  std::array<std::shared_ptr<cu::HostMemory>, 2> h_in_;
+  std::array<std::shared_ptr<cu::DeviceMemory>, 2> d_in_;
+
+  static_assert(h_in_.size() >= 2 && d_in_.size() >= 2,
+                "size of h_in_ and d_in_ must both be >= 2");
+
+  for (auto &elem : h_in_)
+    elem = std::make_shared<cu::HostMemory>(in_count_gulp_max *
+                                            sizeof(dedisp_word));
+
+  for (auto &elem : d_in_)
+    elem = std::make_shared<cu::DeviceMemory>(in_count_gulp_max *
+                                              sizeof(dedisp_word));
 
   struct JobData {
     dedisp_size gulp_samp_idx;
     dedisp_size nsamps_computed_gulp;
     dedisp_size nsamps_gulp;
     dedisp_size nsamps_padded_gulp;
-    void *h_in_ptr;
-    void *d_in_ptr;
+    std::shared_ptr<cu::HostMemory> h_in_ptr;
+    std::shared_ptr<cu::DeviceMemory> d_in_ptr;
     std::mutex output_lock;
     cu::Event inputStart, inputEnd;
     cu::Event preprocessingStart, preprocessingEnd;
@@ -256,9 +271,15 @@ void TDDPlan::execute_guru(size_type nsamps, const byte_type *in,
   initMarker.end();
 
   // Annotate the gulp loop
-  cu::ScopedMarker gulpMarker("gulp_loop", cu::Marker::black);
+  nvtx::Marker gulpMarker("gulp_loop", nvtx::Marker::black);
+  gulpMarker.start();
 
 #ifdef DEDISP_BENCHMARK
+#ifdef HAVE_PMT
+  pmt::State pmt_start, pmt_end;
+  pmt_start = pmt_sensor->Read();
+#endif
+
   // Measure the total time of the gulp loop
   cu::Event gulpStart, gulpEnd;
   htodstream->record(gulpStart);
@@ -279,46 +299,48 @@ void TDDPlan::execute_guru(size_type nsamps, const byte_type *in,
     auto &job = jobs[job_id];
 
     // Copy the input data for the current job
-    memcpy2D(job.h_in_ptr,                         // dst
+    memcpy2D(*job.h_in_ptr,                        // dst
              in_buf_stride_words * BYTES_PER_WORD, // dst stride
              in + job.gulp_samp_idx * in_stride,   // src
              in_stride,                            // src stride
              nchan_words * BYTES_PER_WORD,         // width bytes
              job.nsamps_gulp);                     // height
     htodstream->record(job.inputStart);
-    htodstream->memcpyHtoDAsync(job.d_in_ptr, // dst
-                                job.h_in_ptr, // src
+    htodstream->memcpyHtoDAsync(*job.d_in_ptr, // dst
+                                *job.h_in_ptr, // src
                                 nchan_words * job.nsamps_gulp *
                                     BYTES_PER_WORD); // size
     htodstream->record(job.inputEnd);
 
     // Transpose and unpack the words in the input
-    executestream->waitEvent(job.inputEnd);
+    executestream->wait(job.inputEnd);
     executestream->record(job.preprocessingStart);
-    tdd::transpose((dedisp_word *)job.d_in_ptr, nchan_words, job.nsamps_gulp,
-                   in_buf_stride_words, job.nsamps_padded_gulp,
-                   (dedisp_word *)d_transposed, *executestream);
+    tdd_kernel_transpose.run(*(job.d_in_ptr), nchan_words, job.nsamps_gulp,
+                             in_buf_stride_words, job.nsamps_padded_gulp,
+                             d_transposed, *executestream);
     executestream->record(job.preprocessingEnd);
-
     // Perform direct dedispersion without scrunching
     executestream->record(job.dedispersionStart);
-    m_kernel.launch(d_transposed,             // d_in
-                    job.nsamps_padded_gulp,   // in_stride
-                    job.nsamps_computed_gulp, // nsamps
-                    in_nbits,                 // in_nbits,
-                    m_nchans,                 // nchans
-                    1,                        // chan_stride
-                    d_dm_list,                // d_dm_list
-                    dm_count,                 // dm_count
-                    1,                        // dm_stride
-                    d_out,                    // d_out
-                    out_stride_gulp_samples,  // out_stride
-                    out_nbits,                // out_nbits
-                    *executestream);
+    tdd_kernel_dedisp.run(
+        d_transposed,             // d_in
+        job.nsamps_padded_gulp,   // in_stride
+        job.nsamps_computed_gulp, // nsamps
+        in_nbits,                 // in_nbits,
+        m_nchans,                 // nchans
+        1,                        // chan_stride
+        *d_dm_list,
+        dm_count,                // dm_count
+        1,                       // dm_stride
+        d_out,                   // d_out
+        out_stride_gulp_samples, // out_stride
+        out_nbits,               // out_nbits
+        h_delay_table.data(), h_delay_table.size() * sizeof(dedisp_float),
+        h_killmask.data(), h_killmask.size() * sizeof(dedisp_bool), *htodstream,
+        *executestream);
     executestream->record(job.dedispersionEnd);
 
     // Copy output back to host memory
-    dtohstream->waitEvent(job.dedispersionEnd);
+    dtohstream->wait(job.dedispersionEnd);
     dtohstream->record(job.outputStart);
     dtohstream->memcpyDtoHAsync(h_out,               // dst
                                 d_out,               // src
@@ -337,6 +359,10 @@ void TDDPlan::execute_guru(size_type nsamps, const byte_type *in,
   gulpEnd.synchronize();
   total_timer->Pause();
 
+#ifdef HAVE_PMT
+  pmt_end = pmt_sensor->Read();
+#endif
+
   // Accumulate dedispersion and memcopy time for all jobs
   for (auto &job : jobs) {
     input_timer->Add(job.inputEnd.elapsedTime(job.inputStart));
@@ -369,6 +395,12 @@ void TDDPlan::execute_guru(size_type nsamps, const byte_type *in,
             << std::endl;
   std::cout << total_time_str << total_timer->ToString() << " sec."
             << std::endl;
+#ifdef HAVE_PMT
+  std::cout << pmt_joules_str << pmt::PMT::joules(pmt_start, pmt_end) << " J"
+            << std::endl;
+  std::cout << pmt_watts_str << pmt::PMT::watts(pmt_start, pmt_end) << " W"
+            << std::endl;
+#endif
   std::cout << std::endl;
 
   // Compute number of operations performed
@@ -404,6 +436,8 @@ void TDDPlan::execute_guru(size_type nsamps, const byte_type *in,
             << std::endl;
   perf_file.close();
 #endif
+
+  gulpMarker.end();
 }
 
 } // end namespace dedisp
\ No newline at end of file
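
The HAVE_PMT additions above bracket the gulp loop with two Power Measurement Toolkit readings and report the derived energy and average power next to the existing timers. Stripped of the surrounding benchmark code, the pattern is roughly the following (the sensor name and device ordinal mirror the calls above; error handling is omitted):

    #include <iostream>
    #include <pmt.h>

    // Read the sensor before and after the timed region, then derive energy
    // and average power from the two states.
    void measure_example(int device_ordinal) {
      auto sensor = pmt::Create("nvidia", device_ordinal); // "rocm" for HIP builds
      pmt::State start = sensor->Read();
      // ... run the dedispersion gulp loop here ...
      pmt::State end = sensor->Read();
      std::cout << pmt::PMT::joules(start, end) << " J, "
                << pmt::PMT::watts(start, end) << " W" << std::endl;
    }
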
diff --git a/src/tdd/TDDPlan.hpp b/src/tdd/TDDPlan.hpp
index c9ec2ebee7c3677ad653971480f1a25af39c3cb8..5408223f672def0ac3ec88349c01b581e376504d 100644
--- a/src/tdd/TDDPlan.hpp
+++ b/src/tdd/TDDPlan.hpp
@@ -5,6 +5,8 @@
 #include "GPUPlan.hpp"
 #include "dedisperse/TDDKernel.hpp"
 
+#include "transpose/TransposeKernel.hpp"
+
 namespace dedisp {
 
 class TDDPlan : public GPUPlan {
@@ -36,13 +38,11 @@ public:
 private:
   // Size parameters
   dedisp_size m_gulp_size;
-
-  // DedispKernel
-  DedispKernel m_kernel;
-
   dedisp_size compute_gulp_size();
-
   dedisp_size compute_max_nchans();
+
+  TransposeKernelTDD tdd_kernel_transpose;
+  DedispKernel tdd_kernel_dedisp;
 };
 
 } // end namespace dedisp
\ No newline at end of file
diff --git a/src/tdd/cinterface.cpp b/src/tdd/cinterface.cpp
index effd49c577a12e6dd0138357411ba1f4fcce1ae9..d4fa2b123b2bff99dc3ecfbc20d7b519fc9aa909 100644
--- a/src/tdd/cinterface.cpp
+++ b/src/tdd/cinterface.cpp
@@ -3,7 +3,6 @@
 #include "tdd/TDDPlan.hpp"
 
 #include <cmath>
-#include <iostream>
 #include <memory>
 
 #ifdef __cplusplus
@@ -43,12 +42,6 @@ dedisp_error dedisp_create_plan(dedisp_plan *plan, dedisp_size nchans,
   std::cout << "dedisp_create_plan()" << std::endl;
 #endif
 
-  *plan = nullptr;
-
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   *plan = new dedisp_plan_struct();
   if (!plan) {
     throw_error(DEDISP_MEM_ALLOC_FAILED);
@@ -69,10 +62,6 @@ dedisp_error dedisp_set_gulp_size(dedisp_plan plan, dedisp_size gulp_size) {
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     static_cast<dedisp::TDDPlan *>(plan->plan_.get())->set_gulp_size(gulp_size);
   } catch (...) {
@@ -87,10 +76,6 @@ dedisp_size dedisp_get_gulp_size(dedisp_plan plan) {
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     return static_cast<dedisp::TDDPlan *>(plan->plan_.get())->get_gulp_size();
   } catch (...) {
@@ -104,10 +89,6 @@ dedisp_error dedisp_set_dm_list(dedisp_plan plan, const dedisp_float *dm_list,
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     plan->plan_->set_dm_list(dm_list, count);
   } catch (...) {
@@ -124,10 +105,6 @@ dedisp_error dedisp_generate_dm_list(dedisp_plan plan, dedisp_float dm_start,
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     plan->plan_->generate_dm_list(dm_start, dm_end, ti, tol);
   } catch (...) {
@@ -187,10 +164,6 @@ dedisp_error dedisp_set_killmask(dedisp_plan plan,
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     plan->plan_->set_killmask(killmask);
   } catch (...) {
@@ -296,10 +269,6 @@ dedisp_error dedisp_execute_guru(const dedisp_plan plan, dedisp_size nsamps,
     throw_error(DEDISP_INVALID_FLAG_COMBINATION);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     static_cast<dedisp::TDDPlan *>(plan->plan_.get())
         ->execute_guru(nsamps, in, in_nbits, in_stride, out, out_nbits,
@@ -324,10 +293,6 @@ dedisp_error dedisp_execute_adv(const dedisp_plan plan, dedisp_size nsamps,
     throw_error(DEDISP_INVALID_FLAG_COMBINATION);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     static_cast<dedisp::TDDPlan *>(plan->plan_.get())
         ->execute_adv(nsamps, in, in_nbits, in_stride, out, out_nbits,
@@ -348,10 +313,6 @@ dedisp_error dedisp_execute(const dedisp_plan plan, dedisp_size nsamps,
     throw_error(DEDISP_INVALID_PLAN);
   }
 
-  if (cudaGetLastError() != cudaSuccess) {
-    throw_error(DEDISP_PRIOR_GPU_ERROR);
-  }
-
   try {
     plan->plan_->execute(nsamps, in, in_nbits, out, out_nbits, flags);
   } catch (...) {
@@ -363,7 +324,7 @@ dedisp_error dedisp_execute(const dedisp_plan plan, dedisp_size nsamps,
 
 dedisp_error dedisp_sync(void) {
   try {
-    cu::checkError(cudaDeviceSynchronize());
+    cu::Context::synchronize();
   } catch (...) {
     throw_error(DEDISP_PRIOR_GPU_ERROR);
   }
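
With the plan and kernels now built on the cudawrappers driver-API classes, failures surface as C++ exceptions at the offending call, which is why the per-entry-point cudaGetLastError() probes can be dropped: the existing try/catch blocks already translate any exception into a dedisp error code. A reduced sketch of that shape, assuming the public dedisp.h header for the error enum (the real entry points call throw_error() rather than returning directly):

    #include <cudawrappers/cu.hpp>

    #include "dedisp.h" // public C API: dedisp_error, DEDISP_NO_ERROR, ...

    // Illustrative only: driver-API failures throw, and the catch clause maps
    // them onto the same error code the old sticky-error check produced.
    dedisp_error example_sync() {
      try {
        cu::Context::synchronize();
      } catch (...) {
        return DEDISP_PRIOR_GPU_ERROR;
      }
      return DEDISP_NO_ERROR;
    }
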
diff --git a/src/tdd/dedisperse/TDDKernel.cu b/src/tdd/dedisperse/TDDKernel.cu
index 3e483f27949d9eedebbd71d20a15a7a93253ba07..0d2fd9582f82bdf75d2a016115cedc0b1c997899 100644
--- a/src/tdd/dedisperse/TDDKernel.cu
+++ b/src/tdd/dedisperse/TDDKernel.cu
@@ -2,30 +2,16 @@
  * Time Domain Dedispersion (TDD)
  * is an optimized version of the original dedisp implementation.
  */
+#include "GPUKernel.hpp"
 #include "TDDKernel.hpp"
-#include "common/cuda/CU.h"
-#include "common/dedisp_error.hpp"
-#include "dedisperse_kernel.cuh"
+#include "dedisp_types.h"
+#include "tdd_dedisperse_kernel.cu"
+
+#include <cudawrappers/cu.hpp>
 
-// Kernel tuning parameters
 #define DEDISP_BLOCK_SIZE 256
 #define DEDISP_BLOCK_SAMPS 8
 
-/*
- * Helper functions
- */
-void DedispKernel::copy_delay_table(const void *src, size_t count,
-                                    size_t offset, cudaStream_t stream) {
-  cu::checkError(cudaMemcpyToSymbolAsync(c_delay_table, src, count, offset,
-                                         cudaMemcpyDeviceToDevice, stream));
-}
-
-void DedispKernel::copy_killmask(const void *src, size_t count, size_t offset,
-                                 cudaStream_t stream) {
-  cu::checkError(cudaMemcpyToSymbolAsync(c_killmask, src, count, offset,
-                                         cudaMemcpyDeviceToDevice, stream));
-}
-
 unsigned int DedispKernel::get_nsamps_per_thread() {
   return DEDISP_SAMPS_PER_THREAD;
 }
@@ -33,25 +19,23 @@ unsigned int DedispKernel::get_nsamps_per_thread() {
 /*
  * dedisperse routine
  */
-void DedispKernel::launch(const dedisp_word *d_in, dedisp_size in_stride,
-                          dedisp_size nsamps, dedisp_size in_nbits,
-                          dedisp_size nchans, dedisp_size chan_stride,
-                          const dedisp_float *d_dm_list, dedisp_size dm_count,
-                          dedisp_size dm_stride, dedisp_byte *d_out,
-                          dedisp_size out_stride, dedisp_size out_nbits,
-                          cudaStream_t stream) {
+void DedispKernel::run(const cu::DeviceMemory &d_in, dedisp_size in_stride,
+                       dedisp_size nsamps, dedisp_size in_nbits,
+                       dedisp_size nchans, dedisp_size chan_stride,
+                       const cu::DeviceMemory &d_dm_list, dedisp_size dm_count,
+                       dedisp_size dm_stride, const cu::DeviceMemory &d_out,
+                       dedisp_size out_stride, dedisp_size out_nbits,
+                       const void *delay_table, dedisp_size delay_table_size,
+                       const void *killmask, dedisp_size killmask_size,
+                       cu::Stream &htodstream, cu::Stream &executestream) {
+
   enum {
     BITS_PER_BYTE = 8,
-    BYTES_PER_WORD = sizeof(dedisp_word) / sizeof(dedisp_byte),
-    BLOCK_DIM_X = DEDISP_BLOCK_SAMPS,
-    BLOCK_DIM_Y = DEDISP_BLOCK_SIZE / DEDISP_BLOCK_SAMPS,
-    MAX_CUDA_GRID_SIZE_X = 65535,
-    MAX_CUDA_GRID_SIZE_Y = 65535
   };
 
   // Define thread decomposition
   // Note: Block dimensions x and y represent time samples and DMs respectively
-  dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
+  dim3 block(KERNEL_BLOCK_DIM_X, KERNEL_BLOCK_DIM_Y);
   // Note: Grid dimension x represents time samples. Dimension y represents DMs
 
   // Divide and round up
@@ -68,35 +52,36 @@ void DedispKernel::launch(const dedisp_word *d_in, dedisp_size in_stride,
   // Divide and round up
   dedisp_size nsamps_reduced = (nsamps - 1) / DEDISP_SAMPS_PER_THREAD + 1;
 
+  const std::vector<const void *> parameters = {
+      &d_in,     &nsamps,    &nsamps_reduced, &nsamp_blocks, &in_stride,
+      &dm_count, &dm_stride, &ndm_blocks,     &nchans,       &chan_stride,
+      &d_out,    &out_nbits, &out_stride,     &d_dm_list};
+
   // Execute the kernel
-#define DEDISP_CALL_KERNEL(NBITS)                                              \
-  dedisperse_kernel<NBITS, DEDISP_SAMPS_PER_THREAD, BLOCK_DIM_X, BLOCK_DIM_Y>  \
-      <<<grid, block, 0, stream>>>(d_in, nsamps, nsamps_reduced, nsamp_blocks, \
-                                   in_stride, dm_count, dm_stride, ndm_blocks, \
-                                   nchans, chan_stride, d_out, out_nbits,      \
-                                   out_stride, d_dm_list)
-  // Note: Here we dispatch dynamically on nbits for supported values
-  switch (in_nbits) {
-  case 1:
-    DEDISP_CALL_KERNEL(1);
-    break;
-  case 2:
-    DEDISP_CALL_KERNEL(2);
-    break;
-  case 4:
-    DEDISP_CALL_KERNEL(4);
-    break;
-  case 8:
-    DEDISP_CALL_KERNEL(8);
-    break;
-  case 16:
-    DEDISP_CALL_KERNEL(16);
-    break;
-  case 32:
-    DEDISP_CALL_KERNEL(32);
-    break;
-  default: /* should never be reached */
-    break;
+  std::ostringstream ss;
+  ss << "-DPARAM_IN_NBITS=" << in_nbits;
+
+  CompiledKernelInfo kernel;
+  bool did_recompile;
+  std::tie(kernel, did_recompile) = compile({ss.str()});
+  assertCompiled(kernel);
+
+  if (did_recompile) {
+    constant_memcpy_marker.start();
+    if (delay_table) {
+      htodstream.memcpyHtoDAsync(kernel.module->getGlobal("c_delay_table"),
+                                 delay_table, delay_table_size);
+      htodstream.synchronize();
+    }
+
+    if (killmask) {
+      htodstream.memcpyHtoDAsync(kernel.module->getGlobal("c_killmask"),
+                                 killmask, killmask_size);
+      htodstream.synchronize();
+    }
+    constant_memcpy_marker.end();
   }
-#undef DEDISP_CALL_KERNEL
+
+  executestream.launchKernel(*kernel.function, grid.x, grid.y, grid.z, block.x,
+                             block.y, block.z, 0, parameters);
 }
\ No newline at end of file
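
DedispKernel::run() above replaces the old compile-time switch over in_nbits with a single runtime specialization: the requested bit width is injected as a preprocessor define, and compile() reports via did_recompile whether a new module had to be built. A sketch of just the define construction (the same ostringstream pattern as above, wrapped in a hypothetical helper):

    #include <sstream>
    #include <string>
    #include <vector>

    // Build the define list that selects the kernel specialization. The kernel
    // source falls back to PARAM_IN_NBITS=16 when no define is given, so the
    // host side always passes an explicit value.
    std::vector<std::string> nbits_defines(unsigned in_nbits) {
      std::ostringstream ss;
      ss << "-DPARAM_IN_NBITS=" << in_nbits; // 1, 2, 4, 8, 16 or 32
      return {ss.str()};
    }

Because each recompilation produces a fresh module, the c_delay_table and c_killmask uploads are repeated whenever did_recompile is true: the tables live in that module's own constant memory rather than in symbols shared across modules.
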
diff --git a/src/tdd/dedisperse/TDDKernel.hpp b/src/tdd/dedisperse/TDDKernel.hpp
index 5f64e5d7f7ef4988ecf4df4a6b2fa01cf766792c..dd47d31f50ae2f51d6b332f636e8bd5f54309a58 100644
--- a/src/tdd/dedisperse/TDDKernel.hpp
+++ b/src/tdd/dedisperse/TDDKernel.hpp
@@ -1,29 +1,44 @@
-#ifndef DEDISP_KERNEL_H_INCLUDE_GUARD
-#define DEDISP_KERNEL_H_INCLUDE_GUARD
+#ifndef DEDISP_TDD_DEDISPERSE_TDD_KERNEL_HPP_
+#define DEDISP_TDD_DEDISPERSE_TDD_KERNEL_HPP_
 
-#include <cuda.h>
+#include <cudawrappers/cu.hpp>
+#include <cudawrappers/nvtx.hpp>
+
+#include <GPUKernel.hpp>
 
 #include "common/dedisp_types.h"
 
-class DedispKernel {
-public:
-  void copy_delay_table(const void *src, size_t count, size_t offset,
-                        cudaStream_t stream);
+extern unsigned char _binary_src_tdd_dedisperse_tdd_dedisperse_kernel_cu_start,
+    _binary_src_tdd_dedisperse_tdd_dedisperse_kernel_cu_end;
 
-  void copy_killmask(const void *src, size_t count, size_t offset,
-                     cudaStream_t stream);
+class DedispKernel : public GPUKernel {
+public:
+  DedispKernel()
+      : GPUKernel(
+            "tdd_dedisperse_kernel.cu", "dedisperse_kernel",
+            std::string(
+                reinterpret_cast<const char *>(
+                    &_binary_src_tdd_dedisperse_tdd_dedisperse_kernel_cu_start),
+                reinterpret_cast<const char *>(
+                    &_binary_src_tdd_dedisperse_tdd_dedisperse_kernel_cu_end))) {
+  }
 
   unsigned int get_nsamps_per_thread();
 
-  void launch(const dedisp_word *d_in, dedisp_size in_stride,
-              dedisp_size nsamps, dedisp_size in_nbits, dedisp_size nchans,
-              dedisp_size chan_stride, const dedisp_float *d_dm_list,
-              dedisp_size dm_count, dedisp_size dm_stride, dedisp_byte *d_out,
-              dedisp_size out_stride, dedisp_size out_nbits,
-              cudaStream_t stream = 0);
+  void run(const cu::DeviceMemory &d_in, dedisp_size in_stride,
+           dedisp_size nsamps, dedisp_size in_nbits, dedisp_size nchans,
+           dedisp_size chan_stride, const cu::DeviceMemory &d_dm_list,
+           dedisp_size dm_count, dedisp_size dm_stride,
+           const cu::DeviceMemory &d_out, dedisp_size out_stride,
+           dedisp_size out_nbits, const void *delay_table,
+           dedisp_size delay_table_size, const void *killmask,
+           dedisp_size killmask_size, cu::Stream &htodstream,
+           cu::Stream &executestream);
 
 private:
   dedisp_word *m_d_in = nullptr;
+  nvtx::Marker constant_memcpy_marker{"copy_constant_memory",
+                                      nvtx::Marker::yellow};
 };
 
-#endif // DEDISP_KERNEL_H_INCLUDE_GUARD
\ No newline at end of file
+#endif // DEDISP_TDD_DEDISPERSE_TDD_KERNEL_HPP_
\ No newline at end of file
diff --git a/src/tdd/dedisperse/dedisperse_kernel.cuh b/src/tdd/dedisperse/tdd_dedisperse_kernel.cu
similarity index 59%
rename from src/tdd/dedisperse/dedisperse_kernel.cuh
rename to src/tdd/dedisperse/tdd_dedisperse_kernel.cu
index a6d2cb44b02b4a85e5ab02aff5b0d3b2c74b53ed..cc8075ef91515cd46ccb1da0ae8f8179cf3ad952 100644
--- a/src/tdd/dedisperse/dedisperse_kernel.cuh
+++ b/src/tdd/dedisperse/tdd_dedisperse_kernel.cu
@@ -1,3 +1,6 @@
+#pragma once
+
+#include "common/dedisp_types.h"
 /*
  * Time Domain Dedispersion (TDD)
  * is an optimized version of the original dedisp implementation.
@@ -14,6 +17,14 @@ __constant__ dedisp_bool c_killmask[DEDISP_MAX_NCHANS];
 
 // Kernel tuning parameters
 #define DEDISP_SAMPS_PER_THREAD 2
+#define DEDISP_BLOCK_SIZE 256
+#define DEDISP_BLOCK_SAMPS 8
+// #define BITS_PER_BYTE 8
+#define BYTES_PER_WORD (sizeof(dedisp_word) / sizeof(dedisp_byte))
+#define KERNEL_BLOCK_DIM_X DEDISP_BLOCK_SAMPS
+#define KERNEL_BLOCK_DIM_Y (DEDISP_BLOCK_SIZE / DEDISP_BLOCK_SAMPS)
+#define MAX_CUDA_GRID_SIZE_X 65535
+#define MAX_CUDA_GRID_SIZE_Y 65535
 
 /*
  * Helper functions
@@ -87,14 +98,12 @@ set_out_val(dedisp_byte *d_out, dedisp_size out_nbits, dedisp_size idx,
 //       E.g., Words bracketed: (t0c0,t0c1,t0c2,t0c3), (t1c0,t1c1,t1c2,t1c3),...
 // Note: out_stride should be in units of samples
 template <int IN_NBITS, int SAMPS_PER_THREAD, int BLOCK_DIM_X, int BLOCK_DIM_Y>
-__global__ void dedisperse_kernel(const dedisp_word *d_in, dedisp_size nsamps,
-                                  dedisp_size nsamps_reduced,
-                                  dedisp_size nsamp_blocks, dedisp_size stride,
-                                  dedisp_size dm_count, dedisp_size dm_stride,
-                                  dedisp_size ndm_blocks, dedisp_size nchans,
-                                  dedisp_size chan_stride, dedisp_byte *d_out,
-                                  dedisp_size out_nbits, dedisp_size out_stride,
-                                  const dedisp_float *d_dm_list) {
+__device__ inline void dedisperse_kernel_impl(
+    const dedisp_word *d_in, dedisp_size nsamps, dedisp_size nsamps_reduced,
+    dedisp_size nsamp_blocks, dedisp_size stride, dedisp_size dm_count,
+    dedisp_size dm_stride, dedisp_size ndm_blocks, dedisp_size nchans,
+    dedisp_size chan_stride, dedisp_byte *d_out, dedisp_size out_nbits,
+    dedisp_size out_stride, const dedisp_float *d_dm_list) {
   // Compute compile-time constants
   enum {
     BITS_PER_BYTE = 8,
@@ -167,4 +176,105 @@ __global__ void dedisperse_kernel(const dedisp_word *d_in, dedisp_size nsamps,
     } // End of sample loop
 
   } // End of DM loop
-}
\ No newline at end of file
+}
+
+template <int IN_NBITS, int SAMPS_PER_THREAD, int BLOCK_DIM_X, int BLOCK_DIM_Y>
+__global__ inline void dedisperse_kernel_impl_2(
+    const dedisp_word *d_in, dedisp_size nsamps, dedisp_size nsamps_reduced,
+    dedisp_size nsamp_blocks, dedisp_size stride, dedisp_size dm_count,
+    dedisp_size dm_stride, dedisp_size ndm_blocks, dedisp_size nchans,
+    dedisp_size chan_stride, dedisp_byte *d_out, dedisp_size out_nbits,
+    dedisp_size out_stride, const dedisp_float *d_dm_list) {
+  // Compute compile-time constants
+  enum {
+    BITS_PER_BYTE = 8,
+    CHANS_PER_WORD = sizeof(dedisp_word) * BITS_PER_BYTE / IN_NBITS
+  };
+
+  // Compute the thread decomposition
+  dedisp_size samp_block = blockIdx.x;
+  dedisp_size dm_block = blockIdx.y % ndm_blocks;
+
+  dedisp_size samp_idx = samp_block * BLOCK_DIM_X + threadIdx.x;
+  dedisp_size dm_idx = dm_block * BLOCK_DIM_Y + threadIdx.y;
+  dedisp_size nsamp_threads = nsamp_blocks * BLOCK_DIM_X;
+
+  dedisp_size ndm_threads = ndm_blocks * BLOCK_DIM_Y;
+
+  // Iterate over grids of DMs
+  for (; dm_idx < dm_count; dm_idx += ndm_threads) {
+    // Look up the dispersion measure
+    dedisp_float dm = d_dm_list[dm_idx * dm_stride];
+
+    // Loop over samples
+    for (; samp_idx < nsamps_reduced; samp_idx += nsamp_threads) {
+      typedef typename SumType<IN_NBITS>::type sum_type;
+      sum_type sum[SAMPS_PER_THREAD];
+
+#pragma unroll
+      for (dedisp_size s = 0; s < SAMPS_PER_THREAD; ++s) {
+        sum[s] = 0;
+      }
+
+      // Loop over channel words
+      for (dedisp_size chan_word = 0; chan_word < nchans;
+           chan_word += CHANS_PER_WORD) {
+        // Pre-compute the memory offset
+        dedisp_size offset =
+            samp_idx * SAMPS_PER_THREAD + chan_word / CHANS_PER_WORD * stride;
+
+        // Loop over channel subwords
+        for (dedisp_size chan_sub = 0; chan_sub < CHANS_PER_WORD; ++chan_sub) {
+          dedisp_size chan_idx = (chan_word + chan_sub) * chan_stride;
+
+          // Look up the fractional delay
+          dedisp_float frac_delay = c_delay_table[chan_idx];
+          // Compute the integer delay
+          dedisp_size delay = __float2uint_rn(dm * frac_delay);
+
+// Loop over samples per thread
+// Note: Unrolled to ensure the sum[] array is stored in regs
+#pragma unroll
+          for (dedisp_size s = 0; s < SAMPS_PER_THREAD; ++s) {
+            // Grab the word containing the sample
+            dedisp_word sample = d_in[offset + s + delay];
+
+            // Extract the desired subword and accumulate
+            sum[s] += c_killmask[chan_idx] *
+                      extract_subword<IN_NBITS>(sample, chan_sub);
+          }
+        }
+      }
+
+      // Write sums to global mem
+      dedisp_size out_idx = (samp_idx * SAMPS_PER_THREAD + dm_idx * out_stride);
+#pragma unroll
+      for (dedisp_size s = 0; s < SAMPS_PER_THREAD; ++s) {
+        if (samp_idx * SAMPS_PER_THREAD + s < nsamps)
+          set_out_val<IN_NBITS>(d_out, out_nbits, out_idx + s, sum[s], nchans);
+      }
+
+    } // End of sample loop
+
+  } // End of DM loop
+}
+
+extern "C" {
+#ifndef PARAM_IN_NBITS
+#define PARAM_IN_NBITS 16
+#endif
+
+__global__ void dedisperse_kernel(const dedisp_word *d_in, dedisp_size nsamps,
+                                  dedisp_size nsamps_reduced,
+                                  dedisp_size nsamp_blocks, dedisp_size stride,
+                                  dedisp_size dm_count, dedisp_size dm_stride,
+                                  dedisp_size ndm_blocks, dedisp_size nchans,
+                                  dedisp_size chan_stride, dedisp_byte *d_out,
+                                  dedisp_size out_nbits, dedisp_size out_stride,
+                                  const dedisp_float *d_dm_list) {
+  dedisperse_kernel_impl<PARAM_IN_NBITS, DEDISP_SAMPS_PER_THREAD,
+                         KERNEL_BLOCK_DIM_X, KERNEL_BLOCK_DIM_Y>(
+      d_in, nsamps, nsamps_reduced, nsamp_blocks, stride, dm_count, dm_stride,
+      ndm_blocks, nchans, chan_stride, d_out, out_nbits, out_stride, d_dm_list);
+}
+}
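
The moved kernel keeps the original packed-input layout: each dedisp_word holds sizeof(word) * 8 / IN_NBITS channels, and extract_subword() pulls the individual channel values back out. A small host-side illustration of that packing arithmetic, assuming the usual 32-bit dedisp_word:

    #include <cstdint>

    // Channels packed per input word, mirroring the kernel's compile-time enum:
    // CHANS_PER_WORD = sizeof(dedisp_word) * 8 / IN_NBITS.
    constexpr unsigned chans_per_word(unsigned in_nbits) {
      return sizeof(std::uint32_t) * 8 / in_nbits;
    }

    static_assert(chans_per_word(1) == 32, "1-bit samples: 32 channels per word");
    static_assert(chans_per_word(8) == 4, "8-bit samples: 4 channels per word");
    static_assert(chans_per_word(32) == 1, "32-bit samples: 1 channel per word");
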
diff --git a/src/tdd/transpose/transpose.cu b/src/tdd/transpose/TransposeKernel.cu
similarity index 61%
rename from src/tdd/transpose/transpose.cu
rename to src/tdd/transpose/TransposeKernel.cu
index 3a1aebcc274f27048a5eedf03c5135038f505438..ca1528353aee426265d708998b9d4346158671af 100644
--- a/src/tdd/transpose/transpose.cu
+++ b/src/tdd/transpose/TransposeKernel.cu
@@ -4,11 +4,11 @@
  * This kernel is a fused version of the original transpose and unpack kernels.
  * Thus requiring only a single pass over the data for the same operations.
  */
-#include "dedisp_types.h"
-#include "transpose_kernel.cuh"
-#include <algorithm>
 
-namespace tdd {
+#include "TransposeKernel.hpp"
+#include "tdd_transpose_kernel.cu"
+#include <algorithm>
+#include <memory>
 
 template <typename U> inline U round_up_pow2(const U &a) {
   U r = a - 1;
@@ -21,9 +21,15 @@ template <typename U> inline U round_down_pow2(const U &a) {
   return round_up_pow2(a + 1) / 2;
 }
 
-void transpose(const dedisp_word *d_in, size_t width, size_t height,
-               size_t in_stride, size_t out_stride, dedisp_word *d_out,
-               cudaStream_t stream) {
+void TransposeKernelTDD::run(cu::DeviceMemory &d_in, size_t width,
+                             size_t height, size_t in_stride, size_t out_stride,
+                             cu::DeviceMemory &d_out, cu::Stream &stream) {
+
+  CompiledKernelInfo kernel;
+  bool did_recompile;
+  std::tie(kernel, did_recompile) = compile();
+  assertCompiled(kernel);
+
   // Specify thread decomposition (uses up-rounded divisions)
   dim3 tot_block_count((width - 1) / TILE_DIM + 1, (height - 1) / TILE_DIM + 1);
 
@@ -56,12 +62,20 @@ void transpose(const dedisp_word *d_in, size_t width, size_t height,
       // Specify grid decomposition
       dim3 grid(round_up_pow2(block_count.x), round_up_pow2(block_count.y));
 
-      // Run the CUDA kernel
-      transpose_kernel<dedisp_word><<<grid, block, 0, stream>>>(
-          d_in + in_offset, w, h, in_stride, out_stride, d_out + out_offset,
-          block_count.x, block_count.y);
+      const void *d_in_offset = static_cast<dedisp_word *>(d_in) + in_offset;
+      void *d_out_offset = static_cast<dedisp_word *>(d_out) + out_offset;
+
+      const std::vector<const void *> parameters = {&d_in_offset,
+                                                    &w,
+                                                    &h,
+                                                    &in_stride,
+                                                    &out_stride,
+                                                    &d_out_offset,
+                                                    &block_count.x,
+                                                    &block_count.y};
+
+      stream.launchKernel(*(kernel.function), grid.x, grid.y, grid.z, block.x,
+                          block.y, block.z, 0, parameters);
     } // end for block_x_offset
   } // end for block_y_offset
 }
-
-} // end namespace tdd
\ No newline at end of file
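
TransposeKernelTDD::run() above preserves the original chunked launch: the grid dimensions are rounded up to powers of two (the kernel culls the excess blocks), and the data is processed in tiles small enough to respect the hardware grid limits. Reusing the round_up_pow2() helper from this file, the rounding behaves as follows (a standalone check, not part of the patch):

    #include <cassert>
    #include <cstddef>

    // Same helper as in TransposeKernel.cu: round a value up to the next power
    // of two by smearing the highest set bit into all lower bit positions.
    template <typename U> inline U round_up_pow2(const U &a) {
      U r = a - 1;
      for (unsigned long i = 1; i <= sizeof(U) * 8 / 2; i <<= 1)
        r |= r >> i;
      return r + 1;
    }

    int main() {
      // A 33x5 block count is launched as a 64x8 grid; the excess blocks are
      // culled inside the kernel.
      assert(round_up_pow2<std::size_t>(33) == 64);
      assert(round_up_pow2<std::size_t>(5) == 8);
      return 0;
    }
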
diff --git a/src/tdd/transpose/TransposeKernel.hpp b/src/tdd/transpose/TransposeKernel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a7dc2c79e501b2e1ed541222fcfccf1703724e81
--- /dev/null
+++ b/src/tdd/transpose/TransposeKernel.hpp
@@ -0,0 +1,28 @@
+#ifndef DEDISP_TDD_TRANSPOSE_TRANSPOSE_KERNEL_HPP_
+#define DEDISP_TDD_TRANSPOSE_TRANSPOSE_KERNEL_HPP_
+
+#include <cudawrappers/cu.hpp>
+
+#include "common/dedisp_types.h"
+#include <GPUKernel.hpp>
+
+extern const char _binary_src_tdd_transpose_tdd_transpose_kernel_cu_start,
+    _binary_src_tdd_transpose_tdd_transpose_kernel_cu_end;
+
+class TransposeKernelTDD : public GPUKernel {
+public:
+  TransposeKernelTDD()
+      : GPUKernel(
+            "tdd_transpose_kernel.cu", "transpose_kernel_dedisp",
+            std::string(
+                reinterpret_cast<const char *>(
+                    &_binary_src_tdd_transpose_tdd_transpose_kernel_cu_start),
+                reinterpret_cast<const char *>(
+                    &_binary_src_tdd_transpose_tdd_transpose_kernel_cu_end))) {}
+
+  void run(cu::DeviceMemory &d_in, size_t width, size_t height,
+           size_t in_stride, size_t out_stride, cu::DeviceMemory &d_out,
+           cu::Stream &stream);
+};
+
+#endif // DEDISP_TDD_TRANSPOSE_TRANSPOSE_KERNEL_HPP_
\ No newline at end of file
diff --git a/src/tdd/transpose/transpose_kernel.cuh b/src/tdd/transpose/tdd_transpose_kernel.cu
similarity index 59%
rename from src/tdd/transpose/transpose_kernel.cuh
rename to src/tdd/transpose/tdd_transpose_kernel.cu
index d05504601af6bd86e1523a94adcf656c2aa1f728..74bc1451cfd5934c82718b712fde39f557f03ade 100644
--- a/src/tdd/transpose/transpose_kernel.cuh
+++ b/src/tdd/transpose/tdd_transpose_kernel.cu
@@ -5,13 +5,11 @@
 #define TILE_DIM 32
 #define BLOCK_ROWS 8
 
-namespace tdd {
-
 template <typename WordType>
-__global__ void transpose_kernel(const WordType *in, size_t width,
-                                 size_t height, size_t in_stride,
-                                 size_t out_stride, WordType *out,
-                                 size_t block_count_x, size_t block_count_y) {
+__device__ void
+transpose_kernel_dedisp_impl(const WordType *in, size_t width, size_t height,
+                             size_t in_stride, size_t out_stride, WordType *out,
+                             size_t block_count_x, size_t block_count_y) {
   __shared__ WordType tile[TILE_DIM][TILE_DIM];
 
   // Cull excess blocks (there may be many if we round up to a power of 2)
@@ -48,4 +46,18 @@ __global__ void transpose_kernel(const WordType *in, size_t width,
   }
 }
 
+extern "C" {
+#ifndef PARAM_DATA_TYPE
+#define PARAM_DATA_TYPE unsigned int
+#endif
+
+__global__ void transpose_kernel_dedisp(const unsigned int *in, size_t width,
+                                        size_t height, size_t in_stride,
+                                        size_t out_stride, unsigned int *out,
+                                        size_t block_count_x,
+                                        size_t block_count_y) {
+  transpose_kernel_dedisp_impl<PARAM_DATA_TYPE>(in, width, height, in_stride,
+                                                out_stride, out, block_count_x,
+                                                block_count_y);
+}
-} // end namespace tdd
\ No newline at end of file
+} // end extern "C"
diff --git a/src/tdd/transpose/transpose.h b/src/tdd/transpose/transpose.h
deleted file mode 100644
index cc3c83cbde75c779305332d626f89baec6165958..0000000000000000000000000000000000000000
--- a/src/tdd/transpose/transpose.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Time Domain Dedispersion (TDD)
- * is an optimized version of the original dedisp implementation.
- */
-#include "cuda_runtime.h"
-#include "dedisp_types.h"
-
-namespace tdd {
-
-void transpose(const dedisp_word *d_in, size_t width, size_t height,
-               size_t in_stride, size_t out_stride, dedisp_word *d_out,
-               cudaStream_t stream);
-
-}; // end namespace tdd
\ No newline at end of file