diff --git a/bin/test/CMakeLists.txt b/bin/test/CMakeLists.txt
index 4f69e95a5a8f3fbdb228ba6a1b4ba2bb54003c13..a4d948d67477d5e1b543af3083f72b2997b3d474 100644
--- a/bin/test/CMakeLists.txt
+++ b/bin/test/CMakeLists.txt
@@ -41,6 +41,21 @@ target_include_directories(testtdd PRIVATE
 target_link_libraries(testtdd tdd)
 
 
+################################################################################
+# test for the dedisp and tdd transpose kernels
+################################################################################
+add_executable(testtranspose
+    testtranspose.cpp
+)
+
+target_include_directories(testtranspose PRIVATE
+    ${CMAKE_SOURCE_DIR}/src
+    ${CMAKE_SOURCE_DIR}/src/common
+)
+
+target_link_libraries(testtranspose dedisp tdd)
+
+
 ################################################################################
 # test for fdd library
 ################################################################################
diff --git a/bin/test/testtranspose.cpp b/bin/test/testtranspose.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f03d14749444516834a123c2e3c92590426ff4f2
--- /dev/null
+++ b/bin/test/testtranspose.cpp
@@ -0,0 +1,215 @@
+#include <cstdlib>
+#include <cmath>
+#include <iostream>
+#include <random>
+#include <functional>
+#include <vector>
+
+#include "cuda/CU.h"
+#include "dedisp_types.h"
+#include "dedisp/transpose/transpose.hpp"
+#include "tdd/transpose/transpose.h"
+
+#define BITS_PER_BYTE 8
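+// Samples-per-thread padding granularity, assumed to match the dedisp kernels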
+#define DEDISP_SAMPS_PER_THREAD 2
+
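+// Integer division rounding up (assumes a > 0)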
+unsigned long div_round_up(unsigned long a, unsigned long b) {
+    return (a-1) / b + 1;
+}
+
+// Assume input is a 0 mean float and quantize to an unsigned 8-bit quantity
+dedisp_byte bytequant(dedisp_float f)
+{
+  dedisp_float v = f + 127.5f;
+  dedisp_byte r;
+  if (v > 255.0f) {
+    r = (dedisp_byte) 255;
+  } else if (v < 0.0f) {
+    r = (dedisp_byte) 0;
+  } else {
+    r = (dedisp_byte) roundf(v);
+  }
+  return r;
+}
+
+// Compute mean and standard deviation of a float array
+void calc_stats_float(dedisp_float *a, dedisp_size n, dedisp_float *mean, dedisp_float *sigma)
+{
+  // Use doubles to prevent rounding error
+  double sum = 0.0, sum2 = 0.0;
+  double mtmp = 0.0, vartmp;
+  double v;
+  dedisp_size i;
+
+  // Use corrected 2-pass algorithm from Numerical Recipes
+  sum = 0.0;
+  for (i = 0; i < n; i++) {
+    sum += a[i];
+  }
+  mtmp = sum/n;
+
+  sum = 0.0;
+  sum2 = 0.0;
+  for (i = 0; i < n; i++) {
+    v = a[i];
+    sum2 += (v-mtmp) * (v-mtmp);
+    sum += v-mtmp;
+  }
+  vartmp = (sum2 - (sum*sum) /n) / (n-1);
+  *mean = mtmp;
+  *sigma = sqrt(vartmp);
+
+  return;
+}
+
+
+int run_test(dedisp_size in_nbits)
+{
+    // Observation parameters
+    dedisp_size nchans           = 1024;
+    dedisp_float sampletime_base = 250.0E-6; // Base is 250 microsecond time samples
+    dedisp_float downsamp        = 1.0;
+    dedisp_float Tobs            = 30.0;    // Observation duration in seconds
+    dedisp_float dt              = downsamp*sampletime_base;     // s (0.25 ms sampling)
+    dedisp_size nsamps           = Tobs / dt;
+
+    // Data parameters
+    dedisp_size chans_per_word = sizeof(dedisp_word)*BITS_PER_BYTE / in_nbits;
+    dedisp_size nchan_words    = nchans / chans_per_word;
+    dedisp_float datarms       = 25.0;
+
+    // Dedispersion parameters
+    dedisp_float last_dm           = 102.6;
+    dedisp_float last_delay        = 0.92;
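+    // Maximum dispersion delay in samples; the +0.5 rounds to nearest on truncation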
+    dedisp_size max_delay          = last_dm * last_delay + 0.5;
+    dedisp_size nsamps_gulp        = 65536;
+    dedisp_size nsamps_computed    = nsamps - max_delay;
+    dedisp_size nsamps_padded_gulp = div_round_up(
+        nsamps_computed, DEDISP_SAMPS_PER_THREAD)
+        * DEDISP_SAMPS_PER_THREAD + max_delay;
+
+    // Print parameters
+    std::cout << " >>> Parameters" << std::endl;
+    std::cout << "in_nbits:           " << in_nbits << std::endl;
+    std::cout << "nsamps:             " << nsamps << std::endl;
+    std::cout << "nsamps_computed:    " << nsamps_computed << std::endl;
+    std::cout << "nsamps_gulp:        " << nsamps_gulp << std::endl;
+    std::cout << "nsamps_padded_gulp: " << nsamps_padded_gulp << std::endl;
+    std::cout << "nchans:             " << nchans << std::endl;
+    std::cout << "nchan_words:        " << nchan_words << std::endl;
+
+    // CUDA
+    cu::Stream stream;
+
+    // Initialize random number generator
+    auto random = std::bind(std::normal_distribution<float>(0, 1),
+                            std::mt19937(0));
+
+    // Create raw data
+    std::cout << " >>> Initializing input" << std::endl;
+    std::vector<dedisp_float> rawdata(nsamps_gulp * nchans);
+    for (unsigned int i = 0; i < rawdata.size(); i++) {
+        rawdata[i] = datarms * random();
+    }
+
+    // Create input data by quantizing the raw data
+    dedisp_float raw_mean, raw_sigma;
+    calc_stats_float(rawdata.data(), nsamps_gulp * nchans, &raw_mean, &raw_sigma);
+    std::cout << "input mean:         " << raw_mean << ", sigma: " << raw_sigma << std::endl;
+
+    std::vector<dedisp_byte> input(nsamps_gulp * nchans * (in_nbits / BITS_PER_BYTE));
+
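+    // Quantize the first nsamps_gulp * nchans bytes; for in_nbits > 8 the rest of
+    // the buffer stays zero, which is fine when comparing the two transposes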
+    for (unsigned int ns = 0; ns < nsamps_gulp; ns++) {
+        for (unsigned int nc = 0; nc < nchans; nc++) {
+            input[ns*nchans+nc] = bytequant(rawdata[ns*nchans+nc]);
+        }
+    }
+
+    // Copy the input to the GPU
+    size_t sizeof_input = input.size() * sizeof(dedisp_byte);
+    cu::DeviceMemory d_input(sizeof_input);
+    stream.memcpyHtoDAsync(d_input, input.data(), sizeof_input);
+
+    // Output buffers for the reference (dedisp) and test (tdd) results
+    std::vector<dedisp_word> output_ref(nsamps_padded_gulp * nchan_words);
+    std::vector<dedisp_word> output_tst(output_ref.size());
+    size_t sizeof_output = nsamps_padded_gulp * nchan_words * sizeof(dedisp_word);
+    cu::DeviceMemory d_output(sizeof_output);
+
+    // Dedisp reference
+    std::cout << " >>> Run dedisp transpose" << std::endl;
+    d_output.zero(stream);
+    transpose((dedisp_word *) d_input,
+              nchan_words, nsamps_gulp,
+              nchan_words, nsamps_padded_gulp,
+              (dedisp_word *) d_output,
+              stream);
+    stream.memcpyDtoHAsync(output_ref.data(), d_output, d_output.size());
+    stream.synchronize();
+
+    // TDD transpose kernel
+    std::cout << " >>> Run tdd transpose" << std::endl;
+    d_output.zero(stream);
+    tdd::transpose(
+        (dedisp_word *) d_input,
+        nchan_words, nsamps_gulp,
+        nchan_words, nsamps_padded_gulp,
+        (dedisp_word *) d_output,
+        stream);
+    stream.memcpyDtoHAsync(output_tst.data(), d_output, d_output.size());
+
+    // Wait for execution to finish
+    stream.synchronize();
+
+    // Compare results
+    std::cout << " >>> Comparing results" << std::endl;
+    unsigned long nnz_ref = 0;
+    unsigned long nnz_tst = 0;
+    unsigned long errors = 0;
+
+    for (unsigned long i = 0; i < (nsamps_padded_gulp * nchan_words); i++) {
+        dedisp_word ref = output_ref[i];
+        dedisp_word tst = output_tst[i];
+        if (ref != tst) {
+            if (errors < 10) {
+                std::cerr << "i = " << i << ": " << ref << " != " << tst << std::endl;
+            }
+            errors++;
+        }
+        nnz_ref += ref != 0;
+        nnz_tst += tst != 0;
+    }
+
+    // Report results
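+    // Every (sample, channel) byte was quantized from data with nonzero rms, and a
+    // word is zero only if all of its bytes quantize to zero, so we expect exactly
+    // (nchans * nsamps_gulp) / sizeof(dedisp_word) nonzero words in the filled region.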
+    unsigned long nnz_expected = (nchans * nsamps_gulp) / sizeof(dedisp_word);
+    bool pass = errors == 0 &&             // no errors
+                nnz_ref == nnz_expected && // correct number of nonzeros
+                nnz_ref == nnz_tst;        // same number of nonzeros
+    if (pass) {
+        std::cout << " -> TEST PASSED!" << std::endl << std::endl;
+        return EXIT_SUCCESS;
+    } else {
+        std::cerr << "number of errors: " << errors << std::endl;
+        std::cerr << "number of nonzeros in ref: " << nnz_ref << std::endl;
+        std::cerr << "number of nonzeros in tst: " << nnz_tst << std::endl;
+        std::cerr << " -> TEST FAILED!" << std::endl << std::endl;
+        return EXIT_FAILURE;
+    }
+}
+
+int main()
+{
+    int retval = 0;
+    retval |= run_test(8);
+    retval |= run_test(16);
+    retval |= run_test(32);
+    return retval;
+}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index eaf4c687dcea7cde6c6b4d53b008af6ff51ba3f5..1a533f25d4cd66899d7ccc398d05a7f798a482c0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,6 +10,10 @@ include_directories(${CUDA_TOOLKIT_INCLUDE})
 add_subdirectory(common)
 add_subdirectory(external)
 
+set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+
+# set the RPATH for the subsequent libraries
+set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
 
 ################################################################################
 # plan library
@@ -20,7 +24,6 @@ add_library(plan OBJECT Plan.cpp GPUPlan.cpp)
 ################################################################################
 # dedisp c-library
 ################################################################################
-
 add_subdirectory(clib)
 
 
diff --git a/src/Plan.cpp b/src/Plan.cpp
index 824e7663945fae254e3e198dd5698663cfcdbc74..3cec4d17c421e19a7c694671b61d332029b2748b 100644
--- a/src/Plan.cpp
+++ b/src/Plan.cpp
@@ -67,6 +67,9 @@ void Plan::generate_dm_list(
         double dm = ((b2*prev + std::sqrt(-a2*b2*prev2 + (a2+b2)*k)) / (a2+b2));
         dm_table.push_back(dm);
     }
+
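+    // Adaptive time resolution is not implemented: every DM keeps a dt factor of 1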
+    h_dt_factors.resize(dm_table.size(), 1);
 }
 
 void Plan::set_dm_list(
diff --git a/src/Plan.hpp b/src/Plan.hpp
index 9ce2c6589dffef5788bb572d298031ed2c905e0a..f354178983dd13f0d950665ab75e00c540b015d5 100644
--- a/src/Plan.hpp
+++ b/src/Plan.hpp
@@ -70,6 +70,7 @@ public:
     const float_type* get_dm_list()  const { return h_dm_list.data(); }
     const float_type* get_delay_table() const { return h_delay_table.data(); }
     const bool_type*  get_killmask() const { return h_killmask.data(); }
+    const dedisp_size* get_dt_factors() const { return h_dt_factors.data(); }
 
     void sync();
 
@@ -111,6 +112,7 @@ protected:
     std::vector<dedisp_float> h_dm_list;      // size = dm_count
     std::vector<dedisp_float> h_delay_table;  // size = nchans
     std::vector<dedisp_bool>  h_killmask;     // size = nchans
+    std::vector<dedisp_size>  h_dt_factors;   // size = dm_count
 };
 
 } // end namespace dedisp
diff --git a/src/clib/CMakeLists.txt b/src/clib/CMakeLists.txt
index 0fe192b6dbd8a55bbfcc4ba2cb6d78d0ddf34c5a..7da4bafbbf14575e0c6c9e12435ef44138ba3fb9 100644
--- a/src/clib/CMakeLists.txt
+++ b/src/clib/CMakeLists.txt
@@ -6,4 +6,17 @@ add_library(
 target_include_directories(
    clib
    PRIVATE ${CMAKE_SOURCE_DIR}/src
-)
\ No newline at end of file
+   PRIVATE ${CUDA_TOOLKIT_INCLUDE}
+)
+
+set_target_properties(
+    clib
+    PROPERTIES PUBLIC_HEADER dedisp.h
+)
+
+install(
+    TARGETS
+        clib
+    PUBLIC_HEADER
+        DESTINATION include
+)
diff --git a/src/clib/dedisp.cpp b/src/clib/dedisp.cpp
index 049858a93244101d86da400da371001547cfcb3a..612cdb60511e6dcd5b70f2f8ac2094ba186d0d19 100644
--- a/src/clib/dedisp.cpp
+++ b/src/clib/dedisp.cpp
@@ -9,6 +9,7 @@
 #include <memory> //for shared pointers
 #include <iostream>
 #include <typeinfo>
+#include <cmath>
 
 // Implementation specific CPP Plan interfaces
 #include <dedisp/DedispPlan.hpp>
@@ -266,8 +267,35 @@ dedisp_error dedisp_generate_dm_list(dedisp_plan plan,
 	return DEDISP_NO_ERROR;
 }
 
-/* Currently not implemented in C-interface:
-dedisp_float * dedisp_generate_dm_list_guru (dedisp_float dm_start, dedisp_float dm_end,
+void generate_dm_list(
+    std::vector<dedisp_float>& dm_table,
+    dedisp_float dm_start, dedisp_float dm_end,
+    double dt, double ti, double f0, double df,
+    dedisp_size nchans, double tol)
+{
+    // Note: This algorithm originates from Lina Levin
+    // Note: Computation done in double precision to match MB's code
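+    // Note: Mirrors Plan::generate_dm_list for the standalone guru C interface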
+
+    dt *= 1e6;
+    double f    = (f0 + ((nchans/2) - 0.5) * df) * 1e-3;
+    double tol2 = tol*tol;
+    double a    = 8.3 * df / (f*f*f);
+    double a2   = a*a;
+    double b2   = a2 * (double)(nchans*nchans / 16.0);
+    double c    = (dt*dt + ti*ti) * (tol2 - 1.0);
+
+    dm_table.push_back(dm_start);
+    while( dm_table.back() < dm_end ) {
+        double prev     = dm_table.back();
+        double prev2    = prev*prev;
+        double k        = c + tol2*a2*prev2;
+        double dm = ((b2*prev + std::sqrt(-a2*b2*prev2 + (a2+b2)*k)) / (a2+b2));
+        dm_table.push_back(dm);
+    }
+}
+
+dedisp_float * dedisp_generate_dm_list_guru(dedisp_float dm_start, dedisp_float dm_end,
             double dt, double ti, double f0, double df,
             dedisp_size nchans, double tol, dedisp_size * dm_count)
 {
@@ -279,7 +307,6 @@ dedisp_float * dedisp_generate_dm_list_guru (dedisp_float dm_start, dedisp_float
   *dm_count = dm_table.size();
   return &dm_table[0];
 }
-*/
 
 dedisp_error dedisp_set_device(int device_idx) {
 	//keep global copy of device_idx for usage in dedisp_create_plan()
@@ -590,32 +617,27 @@
 	}
 }
 
-/* Currently not implemented in C-interface:
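+// Adaptive time resolution ("scrunching") is not supported by this implementation;
+// the functions below are stubs that keep the original C API intact.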
 dedisp_error dedisp_enable_adaptive_dt(dedisp_plan  plan,
                                        dedisp_float pulse_width,
                                        dedisp_float tol)
 {
-	if( !plan ) { throw_error(DEDISP_INVALID_PLAN); }
-	plan->scrunching_enabled = true;
-	plan->pulse_width = pulse_width;
-	plan->scrunch_tol = tol;
-	return update_scrunch_list(plan);
+	return DEDISP_NOT_IMPLEMENTED_ERROR;
 }
+
 dedisp_error dedisp_disable_adaptive_dt(dedisp_plan plan) {
-	if( !plan ) { throw_error(DEDISP_INVALID_PLAN); }
-	plan->scrunching_enabled = false;
-	return update_scrunch_list(plan);
+	return DEDISP_NO_ERROR;
 }
+
 dedisp_bool dedisp_using_adaptive_dt(const dedisp_plan plan) {
-	if( !plan ) { throw_getter_error(DEDISP_INVALID_PLAN,false); }
-	return plan->scrunching_enabled;
+	return false;
 }
+
 const dedisp_size* dedisp_get_dt_factors(const dedisp_plan plan) {
-	if( !plan ) { throw_getter_error(DEDISP_INVALID_PLAN,0); }
-	if( 0 == plan->dm_count ) { throw_getter_error(DEDISP_NO_DM_LIST_SET,0); }
-	return &plan->scrunch_list[0];
+	if( !plan ) { throw_getter_error(DEDISP_INVALID_PLAN,0); }
+	return plan->ptr->get_dt_factors();
 }
-*/
 
 #ifdef __cplusplus
 }
diff --git a/src/clib/dedisp.h b/src/clib/dedisp.h
index 83d074a74e06f87fec859cb4df0d1bfafb0e8514..71c244d163d4ecab42627017428b65eb7671957f 100644
--- a/src/clib/dedisp.h
+++ b/src/clib/dedisp.h
@@ -90,6 +90,7 @@ typedef enum {
 	DEDISP_DEVICE_ALREADY_SET,
 	DEDISP_PRIOR_GPU_ERROR,
 	DEDISP_INTERNAL_GPU_ERROR,
+	DEDISP_NOT_IMPLEMENTED_ERROR,
 	DEDISP_UNKNOWN_ERROR
 } dedisp_error;
 /*! \enum dedisp_error
@@ -257,13 +258,10 @@ dedisp_error dedisp_generate_dm_list(dedisp_plan  plan,
                                      dedisp_float pulse_width,
                                      dedisp_float tol);
 
-/* Currently not implemented in C-interface:
 dedisp_float * dedisp_generate_dm_list_guru (dedisp_float dm_start, dedisp_float dm_end,
                                      double dt, double ti, double f0, double df,
                                      dedisp_size nchans, double tol, dedisp_size * dm_count);
 
-*/
-
 // Getters
 // -------
 /*! \p dedisp_get_max_delay gets the maximum delay (in samples) applied during
@@ -512,24 +510,23 @@ dedisp_error dedisp_sync(void);
  *  \param tol The smearing tolerance at which the time resolution is reduced.
  *           A typical value is 1.15, meaning a tolerance of 15%.
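+ *  \note Adaptive time resolution is not supported by this implementation;
+ *        this function returns \p DEDISP_NOT_IMPLEMENTED_ERROR.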
  */
-/*
 dedisp_error dedisp_enable_adaptive_dt(dedisp_plan  plan,
                                        dedisp_float pulse_width,
                                        dedisp_float tol);
-*/
+
 /*! \p dedisp_disable_adaptive_dt disables adaptive time resolution for \p plan.
  *  \param plan The plan for which to disable adaptive time resolution.
  */
-/*
 dedisp_error dedisp_disable_adaptive_dt(dedisp_plan plan);
-*/
+
 /*! \p dedisp_using_adaptive_dt returns whether \p plan has adaptive time
  *       resolution enabled.
  *  \param plan The plan for which to query adaptive time resolution.
  */
-/*
 dedisp_bool  dedisp_using_adaptive_dt(const dedisp_plan plan);
-*/
+
 /*! \p dedisp_get_dt_factors returns an array of length
  *       \p dedisp_get_dm_count(\p plan) containing the integer factors by which
  *       the time resolution (1/dt) is decreased for each DM. Note that the
@@ -541,9 +536,7 @@ dedisp_bool  dedisp_using_adaptive_dt(const dedisp_plan plan);
  *          \p dedisp_enable_adaptive_dt has not been called; in such cases the
  *          returned list will consist entirely of ones.
  */
-/*
 const dedisp_size* dedisp_get_dt_factors(const dedisp_plan plan);
-*/
 
 #ifdef __cplusplus
 } // closing brace for extern "C"
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 09a41fe55c429f75b6e5c64d0605287d6a84408b..8e6a927c87322684e24bf683d4b2a38c0cfc4afc 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -5,4 +5,25 @@ add_library(common SHARED
 
 target_link_libraries(common OpenMP::OpenMP_CXX)
 
+set_target_properties(
+    common PROPERTIES
+    VERSION ${DEDISP_VERSION}
+    SOVERSION ${DEDISP_VERSION_MAJOR}
+)
+
+install(
+    TARGETS
+        common
+    LIBRARY
+        DESTINATION lib
+)
+
+# Install dedisp_types.h in common subdirectory
+# for backwards compatibility with Heimdall
+install(
+    FILES
+        dedisp_types.h
+    DESTINATION include/common
+)
+
 add_subdirectory(cuda)
diff --git a/src/dedisp/CMakeLists.txt b/src/dedisp/CMakeLists.txt
index 276f085f8bc8dc20b5a2dee22181c72176ae1081..4f69d408c10c658690953f455105bada9b8b0d28 100644
--- a/src/dedisp/CMakeLists.txt
+++ b/src/dedisp/CMakeLists.txt
@@ -26,15 +26,11 @@ target_include_directories(
    PUBLIC ${CUDA_TOOLKIT_INCLUDE}
 )
 
-set_target_properties(
-    dedisp
-    PROPERTIES PUBLIC_HEADER DedispPlan.hpp
-)
-
 set_target_properties(
     dedisp PROPERTIES
     VERSION ${DEDISP_VERSION}
     SOVERSION ${DEDISP_VERSION_MAJOR}
+    PUBLIC_HEADER DedispPlan.hpp
 )
 
 install(
diff --git a/src/tdd/CMakeLists.txt b/src/tdd/CMakeLists.txt
index fd70d7ce77607fbc47df779cf12bfdecb244cdb5..5ff50566c5eb4e0e4a7ea1bd09c3b75adff19d2f 100644
--- a/src/tdd/CMakeLists.txt
+++ b/src/tdd/CMakeLists.txt
@@ -2,7 +2,7 @@ cuda_add_library(
     tdd SHARED
     TDDPlan.cpp
     dedisperse/TDDKernel.cu
-    unpack/unpack.cu
+    transpose/transpose.cu
     $<TARGET_OBJECTS:plan>
     $<TARGET_OBJECTS:external>
 )
@@ -19,15 +19,11 @@ target_include_directories(
     PUBLIC ${CUDA_TOOLKIT_INCLUDE}
 )
 
-set_target_properties(
-    tdd
-    PROPERTIES PUBLIC_HEADER TDDPlan.hpp
-)
-
 set_target_properties(
     tdd PROPERTIES
     VERSION ${DEDISP_VERSION}
     SOVERSION ${DEDISP_VERSION_MAJOR}
+    PUBLIC_HEADER TDDPlan.hpp
 )
 
 install(
diff --git a/src/tdd/TDDPlan.cpp b/src/tdd/TDDPlan.cpp
index f764890fc3fffaf9a2ecd7945d2bbb5b55fed905..8bd3bcc57a33063e4658cb6ad5b370e0c1d5f8d8 100644
--- a/src/tdd/TDDPlan.cpp
+++ b/src/tdd/TDDPlan.cpp
@@ -27,7 +27,7 @@
 #include "common/helper.h"
 
 #include "dedisperse/TDDKernel.hpp"
-#include "unpack/unpack.h"
+#include "transpose/transpose.h"
 
 #include "dedisp_error.hpp"
 
@@ -244,15 +244,5 @@ void TDDPlan::execute_guru(
     dedisp_size in_count_padded_gulp_max =
         nsamps_padded_gulp_max * in_buf_stride_words;
 
-    // TODO: Make this a parameter?
-    dedisp_size min_in_nbits = 0;
-    dedisp_size unpacked_in_nbits = std::max((int)in_nbits, (int)min_in_nbits);
-    dedisp_size unpacked_chans_per_word =
-        sizeof(dedisp_word)*BITS_PER_BYTE / unpacked_in_nbits;
-    dedisp_size unpacked_nchan_words = m_nchans / unpacked_chans_per_word;
-    dedisp_size unpacked_buf_stride_words = unpacked_nchan_words;
-    dedisp_size unpacked_count_padded_gulp_max =
-        nsamps_padded_gulp_max * unpacked_buf_stride_words;
-
     dedisp_size out_stride_gulp_samples  = nsamps_computed_gulp_max;
     dedisp_size out_stride_gulp_bytes    =
@@ -267,7 +257,6 @@
     std::cout << memory_alloc_str << std::endl;
 #endif
     cu::DeviceMemory d_transposed(in_count_padded_gulp_max * sizeof(dedisp_word));
-    cu::DeviceMemory d_unpacked(unpacked_count_padded_gulp_max * sizeof(dedisp_word));
     cu::DeviceMemory d_out(out_count_gulp_max * sizeof(dedisp_word));
     cu::HostMemory h_out(out_count_gulp_max * sizeof(dedisp_word));
     std::vector<cu::HostMemory> h_in_(2);
@@ -381,22 +370,21 @@
-        // Transpose and unpack the words in the input
+        // Transpose the words in the input
         executestream->waitEvent(job.inputEnd);
         executestream->record(job.preprocessingStart);
-        transpose_unpack(
+        tdd::transpose(
             (dedisp_word *) job.d_in_ptr,
             nchan_words, job.nsamps_gulp,
             in_buf_stride_words, job.nsamps_padded_gulp,
-            (dedisp_word *) d_unpacked,
-            in_nbits, unpacked_in_nbits,
+            (dedisp_word *) d_transposed,
             *executestream);
         executestream->record(job.preprocessingEnd);
 
         // Perform direct dedispersion without scrunching
         executestream->record(job.dedispersionStart);
         m_kernel.launch(
-            d_unpacked,               // d_in
+            d_transposed,             // d_in
             job.nsamps_padded_gulp,   // in_stride
             job.nsamps_computed_gulp, // nsamps
-            unpacked_in_nbits,        // in_nbits,
+            in_nbits,                 // in_nbits,
             m_nchans,                 // nchans
             1,                        // chan_stride
             d_dm_list,                // d_dm_list
diff --git a/src/tdd/unpack/unpack.cu b/src/tdd/transpose/transpose.cu
similarity index 71%
rename from src/tdd/unpack/unpack.cu
rename to src/tdd/transpose/transpose.cu
index 2c3dbdcb5aa781bfb3f927166812435604152853..ba24558e22a79db8d982ab8a5b0bb3d5f68b8276 100644
--- a/src/tdd/unpack/unpack.cu
+++ b/src/tdd/transpose/transpose.cu
@@ -8,7 +8,9 @@
 */
 #include <algorithm>
 #include "dedisp_types.h"
-#include "unpack_kernel.cuh"
+#include "transpose_kernel.cuh"
+
+namespace tdd {
 
 template<typename U>
 inline U round_up_pow2(const U& a) {
@@ -22,12 +24,11 @@ inline U round_down_pow2(const U& a) {
     return round_up_pow2(a+1)/2;
 }
 
-void transpose_unpack(
+void transpose(
     const dedisp_word* d_in,
     size_t width, size_t height,
     size_t in_stride, size_t out_stride,
     dedisp_word* d_out,
-    dedisp_size in_nbits, dedisp_size out_nbits,
     cudaStream_t stream)
 {
     // Specify thread decomposition (uses up-rounded divisions)
@@ -41,7 +42,6 @@ void transpose_unpack(
                 block_y_offset < tot_block_count.y;
                 block_y_offset += max_grid_dim)
     {
-
         dim3 block_count;
 
         // Handle the possibly incomplete final grid
@@ -69,26 +69,15 @@ void transpose_unpack(
                       round_up_pow2(block_count.y));
 
             // Run the CUDA kernel
-            #define CALL_KERNEL(OUT_NBITS) \
-            transpose_unpack_kernel<dedisp_word, OUT_NBITS><<<grid, block, 0, stream>>> \
-                (d_in + in_offset,      \
-                 w, h,                  \
-                 in_stride, out_stride, \
-                 d_out + out_offset,    \
-                 block_count.x,         \
-                 block_count.y,         \
-                 in_nbits);             \
-
-            // Dispatch dynamically on out_nbits for supported values
-            switch (out_nbits)
-            {
-                case 1:  CALL_KERNEL(1); break;
-                case 2:  CALL_KERNEL(2); break;
-                case 4:  CALL_KERNEL(4); break;
-                case 8:  CALL_KERNEL(8); break;
-                case 16: CALL_KERNEL(16); break;
-                default: /* should never be reached */ break;
-            }
+            transpose_kernel<dedisp_word><<<grid, block, 0, stream>>>
+                (d_in + in_offset,
+                 w, h,
+                 in_stride, out_stride,
+                 d_out + out_offset,
+                 block_count.x,
+                 block_count.y);
         } // end for block_x_offset
     } // end for block_y_offset
-}
\ No newline at end of file
+}
+
+} // end namespace tdd
diff --git a/src/tdd/unpack/unpack.h b/src/tdd/transpose/transpose.h
similarity index 85%
rename from src/tdd/unpack/unpack.h
rename to src/tdd/transpose/transpose.h
index 30b8abb4095d22ebcceaed53c464582e91043ea0..cd099058466fdcf7fa134d32c79b85a61b6c82a2 100644
--- a/src/tdd/unpack/unpack.h
+++ b/src/tdd/transpose/transpose.h
@@ -7,10 +7,13 @@
 #include "dedisp_types.h"
 #include "cuda_runtime.h"
 
-void transpose_unpack(
+namespace tdd {
+
+void transpose(
     const dedisp_word* d_in,
     size_t width, size_t height,
     size_t in_stride, size_t out_stride,
     dedisp_word* d_out,
-    dedisp_size in_nbits, dedisp_size out_nbits,
     cudaStream_t stream);
+
+} // end namespace tdd
diff --git a/src/tdd/transpose/transpose_kernel.cuh b/src/tdd/transpose/transpose_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a5cfda9fb2bd370e465b51a61ea40985f4bc22d4
--- /dev/null
+++ b/src/tdd/transpose/transpose_kernel.cuh
@@ -0,0 +1,60 @@
+/*
+* Copyright (C) 2021 ASTRON (Netherlands Institute for Radio Astronomy)
+* SPDX-License-Identifier: GPL-3.0-or-later
+* Time Domain Dedispersion (TDD)
+* is an optimized version of the original dedisp implementation.
+*/
+#pragma once
+
+#define TILE_DIM     32
+#define BLOCK_ROWS   8
+
+namespace tdd {
+
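+// Tiled transpose: reads a height x width matrix of words (row pitch in_stride)
+// and writes the width x height transpose (row pitch out_stride), staging
+// TILE_DIM x TILE_DIM tiles through shared memory.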
+template<typename WordType>
+__global__
+void transpose_kernel(
+    const WordType* in,
+    size_t width,
+    size_t height,
+    size_t in_stride,
+    size_t out_stride,
+    WordType* out,
+    size_t block_count_x,
+    size_t block_count_y)
+{
+    __shared__ WordType tile[TILE_DIM][TILE_DIM+1]; // +1 pad avoids shared memory bank conflicts
+
+    // Cull excess blocks (there may be many if we round up to a power of 2)
+    if( blockIdx.x >= block_count_x ||
+        blockIdx.y >= block_count_y ) {
+        return;
+    }
+
+    size_t index_in_x = blockIdx.x * TILE_DIM + threadIdx.x;
+    size_t index_in_y = blockIdx.y * TILE_DIM + threadIdx.y;
+    size_t index_in = index_in_x + (index_in_y)*in_stride;
+
+#pragma unroll
+    for( size_t i=0; i<TILE_DIM; i+=BLOCK_ROWS ) {
+        // TODO: Is it possible to cull some excess threads early?
+        if( index_in_x < width && index_in_y+i < height ) {
+            tile[threadIdx.y+i][threadIdx.x] = in[index_in+i*in_stride];
+        }
+    }
+
+    __syncthreads();
+
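+    // Write the tile back transposed: input block (x,y) lands at output block (y,x)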
+    size_t index_out_x = blockIdx.y * TILE_DIM + threadIdx.x;
+    // Avoid excess threads
+    if( index_out_x >= height ) return;
+    size_t index_out_y = blockIdx.x * TILE_DIM + threadIdx.y;
+    size_t index_out = index_out_x + (index_out_y)*out_stride;
+
+#pragma unroll
+    for( size_t i=0; i<TILE_DIM; i+=BLOCK_ROWS ) {
+        // Avoid excess threads
+        if( index_out_y+i < width ) {
+            out[index_out+i*out_stride] = tile[threadIdx.x][threadIdx.y+i];
+        }
+    }
+}
+
+} // end namespace tdd
diff --git a/src/tdd/unpack/unpack_kernel.cuh b/src/tdd/unpack/unpack_kernel.cuh
deleted file mode 100644
index 3729f234d9b6256a5e30ef0c30d6fdc2653ec8bf..0000000000000000000000000000000000000000
--- a/src/tdd/unpack/unpack_kernel.cuh
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
-* Copyright (C) 2021 ASTRON (Netherlands Institute for Radio Astronomy)
-* SPDX-License-Identifier: GPL-3.0-or-later
-* Time Domain Dedispersion (TDD)
-* is an optimized version of the original dedisp implementation.
-*/
-#define TILE_DIM     32
-#define BLOCK_ROWS   8
-
-template<typename WordType, unsigned int OUT_NBITS>
-__global__
-void transpose_unpack_kernel(
-    const WordType* in,
-    size_t width,
-    size_t height,
-    size_t in_stride,
-    size_t out_stride,
-    WordType* out,
-    size_t block_count_x,
-    size_t block_count_y,
-    size_t in_nbits)
-{
-    // The input data has dimensions height * width,
-    // with width corresponding to a number of channel words,
-    // and height corresponding to a number of observing channels.
-    // The output will have expansion times as many channel words,
-    // thus this kernel transforms:
-    // data[height][width] -> unpacked[width][height*expansion]
-    dedisp_size expansion = OUT_NBITS / in_nbits;
-    // the output stride therefore scales linearly with the expansion.
-    out_stride *= expansion;
-
-    // Shared memory buffer for the transposed input data
-    __shared__ WordType tile[TILE_DIM][TILE_DIM];
-
-    // Cull excess blocks
-    if (blockIdx.x >= block_count_x ||
-        blockIdx.y >= block_count_y)
-    {
-        return;
-    }
-
-    // Compute index in input matrix data[height][width]
-    size_t index_in_x = blockIdx.x * TILE_DIM + threadIdx.x;
-    size_t index_in_y = blockIdx.y * TILE_DIM + threadIdx.y;
-    size_t index_in   = index_in_x + (index_in_y)*in_stride;
-
-    // Transpose a tile of input into shared memory:
-    // data[nwords_tile][nsamps_tile] -> transposed[nsamps_tile][nwords_tile]
-    #pragma unroll
-    for (size_t i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-    {
-        if (index_in_x < width && index_in_y+i < height)
-        {
-            tile[threadIdx.x][threadIdx.y+i] = in[index_in+i*in_stride];
-        }
-    }
-
-    __syncthreads();
-
-    // Helper variables for data unpacking
-    int out_chans_per_word = sizeof(WordType)*8 / OUT_NBITS;
-    int in_chans_per_word = sizeof(WordType)*8 / in_nbits;
-    int norm = ((1l<<OUT_NBITS)-1) / ((1l<<in_nbits)-1);
-    WordType in_mask  = (1<<in_nbits)-1;
-    WordType out_mask = (1<<OUT_NBITS)-1;
-
-    // Unpack the inner dimension of the transposed tile:
-    // transposed[nsamps_tile][nwords_tile] -> unpacked[nsamps_tile][nwords_tile*expansion]
-    // Process one row (one samp) per iteration.
-    #pragma unroll
-    for (size_t i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-    {
-        // Map threads [0:tile_dim] to [0:tile_dim*expansion],
-        // to compute [nwords_tile*expansion] channel words.
-        unsigned int tile_width_out = TILE_DIM * expansion;
-        for (size_t j = 0; j < tile_width_out; j += blockDim.x)
-        {
-            // Compute index in output matrix unpacked[width][height*expansion]
-            size_t index_out_x = blockIdx.y * tile_width_out + (threadIdx.x + j);
-            size_t index_out_y = blockIdx.x * TILE_DIM + threadIdx.y;
-            size_t index_out   = index_out_x + (index_out_y)*out_stride;
-
-            // Avoid excess threads
-            if (index_out_x < out_stride && index_out_y+i < width)
-            {
-                // Construct an output channel word
-                WordType result = 0;
-                for (int k = 0; k < sizeof(WordType)*8; k += OUT_NBITS)
-                {
-                    int out_cw = threadIdx.x + j;
-                    int c = out_cw * out_chans_per_word + k/OUT_NBITS;
-                    int in_cw = c / in_chans_per_word;
-                    int in_k  = c % in_chans_per_word * in_nbits;
-                    WordType word = tile[threadIdx.y+i][in_cw];
-
-                    WordType val = (word >> in_k) & in_mask;
-                    result |= ((val * norm) & out_mask) << k;
-                }
-
-                // Write the result to device memory
-                out[index_out+i*out_stride] = result;
-            }
-        }
-    }
-}
\ No newline at end of file