diff --git a/bin/test/testtranspose.cpp b/bin/test/testtranspose.cpp
index c0cc686252ab56f933213dd59208d843909c3f58..f03d14749444516834a123c2e3c92590426ff4f2 100644
--- a/bin/test/testtranspose.cpp
+++ b/bin/test/testtranspose.cpp
@@ -7,7 +7,7 @@
 #include "cuda/CU.h"
 #include "dedisp_types.h"
 #include "dedisp/transpose/transpose.hpp"
-#include "tdd/unpack/unpack.h"
+#include "tdd/transpose/transpose.h"
 
 #define BITS_PER_BYTE 8
 #define DEDISP_SAMPS_PER_THREAD 2
@@ -73,7 +73,6 @@ int run_test(dedisp_size in_nbits)
     dedisp_size nsamps = Tobs / dt;
 
     // Data parameters
-    dedisp_size out_nbits = in_nbits; // no unpack
     dedisp_size chans_per_word = sizeof(dedisp_word)*BITS_PER_BYTE / in_nbits;
     dedisp_size nchan_words = nchans / chans_per_word;
     dedisp_float datarms = 25.0;
@@ -91,7 +90,6 @@ int run_test(dedisp_size in_nbits)
     // Print parameters
     std::cout << " >>> Parameters" << std::endl;
     std::cout << "in_nbits: " << in_nbits << std::endl;
-    std::cout << "out_nbits: " << out_nbits << std::endl;
     std::cout << "nsamps: " << nsamps << std::endl;
     std::cout << "nsamps_computed: " << nsamps_computed << std::endl;
     std::cout << "nsamps_gulp: " << nsamps_gulp << std::endl;
@@ -150,12 +148,11 @@ int run_test(dedisp_size in_nbits)
     // TDD fused kernel
     std::cout << " >>> Run tdd transpose" << std::endl;
     d_output.zero(stream);
-    transpose_unpack(
+    tdd::transpose(
         (dedisp_word *) d_input,
         nchan_words, nsamps_gulp,
         nchan_words, nsamps_padded_gulp,
         (dedisp_word *) d_output,
-        in_nbits, out_nbits,
         stream);
 
     stream.memcpyDtoHAsync(output_tst.data(), d_output, d_output.size());
diff --git a/src/tdd/CMakeLists.txt b/src/tdd/CMakeLists.txt
index b09dbfde8168a37e247c69eb152dea3beac4c0ee..5ff50566c5eb4e0e4a7ea1bd09c3b75adff19d2f 100644
--- a/src/tdd/CMakeLists.txt
+++ b/src/tdd/CMakeLists.txt
@@ -2,10 +2,7 @@ cuda_add_library(
     tdd SHARED
     TDDPlan.cpp
     dedisperse/TDDKernel.cu
-    # FIXME
-    unpack/unpack.cu
-    ../dedisp/unpack/unpack.cu
-    ../dedisp/transpose/transpose.cu
+    transpose/transpose.cu
     $<TARGET_OBJECTS:plan>
     $<TARGET_OBJECTS:external>
 )
diff --git a/src/tdd/TDDPlan.cpp b/src/tdd/TDDPlan.cpp
index 11bb02d3eed721cc98a3b9bd160c965770afec03..8bd3bcc57a33063e4658cb6ad5b370e0c1d5f8d8 100644
--- a/src/tdd/TDDPlan.cpp
+++ b/src/tdd/TDDPlan.cpp
@@ -27,13 +27,7 @@
 #include "common/helper.h"
 #include "dedisperse/TDDKernel.hpp"
 
-// FIXME: see below for an explanation
-#if 0
-#include "unpack/unpack.h"
-#else
-#include "../dedisp/transpose/transpose.hpp"
-#include "../dedisp/unpack/unpack.h"
-#endif
+#include "transpose/transpose.h"
 
 #include "dedisp_error.hpp"
 
@@ -250,15 +244,6 @@ void TDDPlan::execute_guru(
     dedisp_size in_count_padded_gulp_max =
         nsamps_padded_gulp_max * in_buf_stride_words;
 
-    // TODO: Make this a parameter?
-    dedisp_size min_in_nbits = 0;
-    dedisp_size unpacked_in_nbits = std::max((int)in_nbits, (int)min_in_nbits);
-    dedisp_size unpacked_chans_per_word =
-        sizeof(dedisp_word)*BITS_PER_BYTE / unpacked_in_nbits;
-    dedisp_size unpacked_nchan_words = m_nchans / unpacked_chans_per_word;
-    dedisp_size unpacked_buf_stride_words = unpacked_nchan_words;
-    dedisp_size unpacked_count_padded_gulp_max =
-        nsamps_padded_gulp_max * unpacked_buf_stride_words;
 
     dedisp_size out_stride_gulp_samples = nsamps_computed_gulp_max;
     dedisp_size out_stride_gulp_bytes =
@@ -273,7 +258,6 @@ void TDDPlan::execute_guru(
     std::cout << memory_alloc_str << std::endl;
 #endif
     cu::DeviceMemory d_transposed(in_count_padded_gulp_max * sizeof(dedisp_word));
-    cu::DeviceMemory d_unpacked(unpacked_count_padded_gulp_max * sizeof(dedisp_word));
     cu::DeviceMemory d_out(out_count_gulp_max * sizeof(dedisp_word));
     cu::HostMemory h_out(out_count_gulp_max * sizeof(dedisp_word));
     std::vector<cu::HostMemory> h_in_(2);
@@ -387,37 +371,21 @@ void TDDPlan::execute_guru(
             // Transpose and unpack the words in the input
             executestream->waitEvent(job.inputEnd);
             executestream->record(job.preprocessingStart);
-            #if 0
-            // The unpack part of the transpose_unpack function seems broken,
-            // in some cases (number of bits?) it does not produce the same output
-            // as the reference dedisp transpose + unpack kernels. TODO: debug.
-            transpose_unpack(
+            tdd::transpose(
                 (dedisp_word *) job.d_in_ptr,
                 nchan_words, job.nsamps_gulp,
                 in_buf_stride_words, job.nsamps_padded_gulp,
-                (dedisp_word *) d_unpacked,
-                in_nbits, unpacked_in_nbits,
+                (dedisp_word *) d_transposed,
                 *executestream);
-            #else
-            // Use the original dedisp transpose + unpack kernels.
-            transpose((dedisp_word *) job.d_in_ptr,
-                      nchan_words, job.nsamps_gulp,
-                      in_buf_stride_words, job.nsamps_padded_gulp,
-                      (dedisp_word *) d_transposed);
-
-            unpack(d_transposed, job.nsamps_padded_gulp, nchan_words,
-                   d_unpacked,
-                   in_nbits, unpacked_in_nbits);
-            #endif
             executestream->record(job.preprocessingEnd);
 
             // Perform direct dedispersion without scrunching
             executestream->record(job.dedispersionStart);
             m_kernel.launch(
-                d_unpacked,               // d_in
+                d_transposed,             // d_in
                 job.nsamps_padded_gulp,   // in_stride
                 job.nsamps_computed_gulp, // nsamps
-                unpacked_in_nbits,        // in_nbits,
+                in_nbits,                 // in_nbits,
                 m_nchans,                 // nchans
                 1,                        // chan_stride
                 d_dm_list,                // d_dm_list
diff --git a/src/tdd/unpack/unpack.cu b/src/tdd/transpose/transpose.cu
similarity index 70%
rename from src/tdd/unpack/unpack.cu
rename to src/tdd/transpose/transpose.cu
index 901659d32662ed3cd4991f161db4269d28ad5ca6..ba24558e22a79db8d982ab8a5b0bb3d5f68b8276 100644
--- a/src/tdd/unpack/unpack.cu
+++ b/src/tdd/transpose/transpose.cu
@@ -8,7 +8,9 @@
  */
 #include <algorithm>
 #include "dedisp_types.h"
-#include "unpack_kernel.cuh"
+#include "transpose_kernel.cuh"
+
+namespace tdd {
 
 template<typename U>
 inline U round_up_pow2(const U& a) {
@@ -22,12 +24,11 @@ inline U round_down_pow2(const U& a) {
     return round_up_pow2(a+1)/2;
 }
 
-void transpose_unpack(
+void transpose(
     const dedisp_word* d_in,
     size_t width, size_t height,
    size_t in_stride, size_t out_stride,
     dedisp_word* d_out,
-    dedisp_size in_nbits, dedisp_size out_nbits,
     cudaStream_t stream)
 {
     // Specify thread decomposition (uses up-rounded divisions)
@@ -41,7 +42,6 @@ void transpose_unpack(
          block_y_offset < tot_block_count.y;
          block_y_offset += max_grid_dim)
     {
-
         dim3 block_count;
 
         // Handle the possibly incomplete final grid
@@ -69,27 +69,15 @@ void transpose_unpack(
                       round_up_pow2(block_count.y));
 
            // Run the CUDA kernel
-#define CALL_KERNEL(OUT_NBITS) \
-    transpose_unpack_kernel<dedisp_word, OUT_NBITS><<<grid, block, 0, stream>>> \
-        (d_in + in_offset, \
-         w, h, \
-         in_stride, out_stride, \
-         d_out + out_offset, \
-         block_count.x, \
-         block_count.y, \
-         in_nbits); \
-
-            // Dispatch dynamically on out_nbits for supported values
-            switch (out_nbits)
-            {
-                case 1: CALL_KERNEL(1); break;
-                case 2: CALL_KERNEL(2); break;
-                case 4: CALL_KERNEL(4); break;
-                case 8: CALL_KERNEL(8); break;
-                case 16: CALL_KERNEL(16); break;
-                case 32: CALL_KERNEL(32); break;
-                default: /* should never be reached */ break;
-            }
+            transpose_kernel<dedisp_word><<<grid, block, 0, stream>>>
+                (d_in + in_offset,
+                 w, h,
+                 in_stride, out_stride,
+                 d_out + out_offset,
+                 block_count.x,
+                 block_count.y);
         } // end for block_x_offset
     } // end for block_y_offset
-}
\ No newline at end of file
+}
+
+} // end namespace tdd
\ No newline at end of file
diff --git a/src/tdd/unpack/unpack.h b/src/tdd/transpose/transpose.h
similarity index 85%
rename from src/tdd/unpack/unpack.h
rename to src/tdd/transpose/transpose.h
index 30b8abb4095d22ebcceaed53c464582e91043ea0..cd099058466fdcf7fa134d32c79b85a61b6c82a2 100644
--- a/src/tdd/unpack/unpack.h
+++ b/src/tdd/transpose/transpose.h
@@ -7,10 +7,13 @@
 #include "dedisp_types.h"
 #include "cuda_runtime.h"
 
-void transpose_unpack(
+namespace tdd {
+
+void transpose(
     const dedisp_word* d_in,
     size_t width, size_t height,
     size_t in_stride, size_t out_stride,
     dedisp_word* d_out,
-    dedisp_size in_nbits, dedisp_size out_nbits,
     cudaStream_t stream);
+
+}; // end namespace tdd
\ No newline at end of file
diff --git a/src/tdd/transpose/transpose_kernel.cuh b/src/tdd/transpose/transpose_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a5cfda9fb2bd370e465b51a61ea40985f4bc22d4
--- /dev/null
+++ b/src/tdd/transpose/transpose_kernel.cuh
@@ -0,0 +1,60 @@
+/*
+* Copyright (C) 2021 ASTRON (Netherlands Institute for Radio Astronomy)
+* SPDX-License-Identifier: GPL-3.0-or-later
+* Time Domain Dedispersion (TDD)
+* is an optimized version of the original dedisp implementation.
+*/
+#define TILE_DIM 32
+#define BLOCK_ROWS 8
+
+namespace tdd {
+
+template<typename WordType>
+__global__
+void transpose_kernel(
+    const WordType* in,
+    size_t width,
+    size_t height,
+    size_t in_stride,
+    size_t out_stride,
+    WordType* out,
+    size_t block_count_x,
+    size_t block_count_y)
+{
+    __shared__ WordType tile[TILE_DIM][TILE_DIM];
+
+    // Cull excess blocks (there may be many if we round up to a power of 2)
+    if( blockIdx.x >= block_count_x ||
+        blockIdx.y >= block_count_y ) {
+        return;
+    }
+
+    size_t index_in_x = blockIdx.x * TILE_DIM + threadIdx.x;
+    size_t index_in_y = blockIdx.y * TILE_DIM + threadIdx.y;
+    size_t index_in = index_in_x + (index_in_y)*in_stride;
+
+#pragma unroll
+    for( size_t i=0; i<TILE_DIM; i+=BLOCK_ROWS ) {
+        // TODO: Is it possible to cull some excess threads early?
+        if( index_in_x < width && index_in_y+i < height )
+            tile[threadIdx.y+i][threadIdx.x] = in[index_in+i*in_stride];
+    }
+
+    __syncthreads();
+
+    size_t index_out_x = blockIdx.y * TILE_DIM + threadIdx.x;
+    // Avoid excess threads
+    if( index_out_x >= height ) return;
+    size_t index_out_y = blockIdx.x * TILE_DIM + threadIdx.y;
+    size_t index_out = index_out_x + (index_out_y)*out_stride;
+
+#pragma unroll
+    for( size_t i=0; i<TILE_DIM; i+=BLOCK_ROWS ) {
+        // Avoid excess threads
+        if( index_out_y+i < width ) {
+            out[index_out+i*out_stride] = tile[threadIdx.x][threadIdx.y+i];
+        }
+    }
+}
+
+} // end namespace tdd
\ No newline at end of file
diff --git a/src/tdd/unpack/unpack_kernel.cuh b/src/tdd/unpack/unpack_kernel.cuh
deleted file mode 100644
index 3729f234d9b6256a5e30ef0c30d6fdc2653ec8bf..0000000000000000000000000000000000000000
--- a/src/tdd/unpack/unpack_kernel.cuh
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
-* Copyright (C) 2021 ASTRON (Netherlands Institute for Radio Astronomy)
-* SPDX-License-Identifier: GPL-3.0-or-later
-* Time Domain Dedispersion (TDD)
-* is an optimized version of the original dedisp implementation.
-*/
-#define TILE_DIM 32
-#define BLOCK_ROWS 8
-
-template<typename WordType, unsigned int OUT_NBITS>
-__global__
-void transpose_unpack_kernel(
-    const WordType* in,
-    size_t width,
-    size_t height,
-    size_t in_stride,
-    size_t out_stride,
-    WordType* out,
-    size_t block_count_x,
-    size_t block_count_y,
-    size_t in_nbits)
-{
-    // The input data has dimensions height * width,
-    // with width corresponding to a number of channel words,
-    // and height corresponding to a number of observing channels.
-    // The output will have expansion times as many channel words,
-    // thus this kernel transforms:
-    // data[height][width] -> unpacked[width][height*expansion]
-    dedisp_size expansion = OUT_NBITS / in_nbits;
-    // the output stride therefore scales linearly with the expansion.
-    out_stride *= expansion;
-
-    // Shared memory buffer for the transposed input data
-    __shared__ WordType tile[TILE_DIM][TILE_DIM];
-
-    // Cull excess blocks
-    if (blockIdx.x >= block_count_x ||
-        blockIdx.y >= block_count_y)
-    {
-        return;
-    }
-
-    // Compute index in input matrix data[height][width]
-    size_t index_in_x = blockIdx.x * TILE_DIM + threadIdx.x;
-    size_t index_in_y = blockIdx.y * TILE_DIM + threadIdx.y;
-    size_t index_in = index_in_x + (index_in_y)*in_stride;
-
-    // Transpose a tile of input into shared memory:
-    // data[nwords_tile][nsamps_tile] -> transposed[nsamps_tile][nwords_tile]
-    #pragma unroll
-    for (size_t i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-    {
-        if (index_in_x < width && index_in_y+i < height)
-        {
-            tile[threadIdx.x][threadIdx.y+i] = in[index_in+i*in_stride];
-        }
-    }
-
-    __syncthreads();
-
-    // Helper variables for data unpacking
-    int out_chans_per_word = sizeof(WordType)*8 / OUT_NBITS;
-    int in_chans_per_word = sizeof(WordType)*8 / in_nbits;
-    int norm = ((1l<<OUT_NBITS)-1) / ((1l<<in_nbits)-1);
-    WordType in_mask = (1<<in_nbits)-1;
-    WordType out_mask = (1<<OUT_NBITS)-1;
-
-    // Unpack the inner dimension of the transposed tile:
-    // transposed[nsamps_tile][nwords_tile] -> unpacked[nsamps_tile][nwords_tile*expansion]
-    // Process one row (one samp) per iteration.
-    #pragma unroll
-    for (size_t i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-    {
-        // Map threads [0:tile_dim] to [0:tile_dim*expansion],
-        // to compute [nwords_tile*expansion] channel words.
-        unsigned int tile_width_out = TILE_DIM * expansion;
-        for (size_t j = 0; j < tile_width_out; j += blockDim.x)
-        {
-            // Compute index in output matrix unpacked[width][height*expansion]
-            size_t index_out_x = blockIdx.y * tile_width_out + (threadIdx.x + j);
-            size_t index_out_y = blockIdx.x * TILE_DIM + threadIdx.y;
-            size_t index_out = index_out_x + (index_out_y)*out_stride;
-
-            // Avoid excess threads
-            if (index_out_x < out_stride && index_out_y+i < width)
-            {
-                // Construct an output channel word
-                WordType result = 0;
-                for (int k = 0; k < sizeof(WordType)*8; k += OUT_NBITS)
-                {
-                    int out_cw = threadIdx.x + j;
-                    int c = out_cw * out_chans_per_word + k/OUT_NBITS;
-                    int in_cw = c / in_chans_per_word;
-                    int in_k = c % in_chans_per_word * in_nbits;
-                    WordType word = tile[threadIdx.y+i][in_cw];
-
-                    WordType val = (word >> in_k) & in_mask;
-                    result |= ((val * norm) & out_mask) << k;
-                }
-
-                // Write the result to device memory
-                out[index_out+i*out_stride] = result;
-            }
-        }
-    }
-}
\ No newline at end of file
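
For reference, a minimal host-side sketch of driving the new tdd::transpose entry point. This is illustrative only, not part of the patch: the sizes below are hypothetical, plain CUDA runtime calls stand in for the cu:: wrappers used elsewhere in the tree, and error checking is omitted.

// Transpose a gulp of packed input words:
// data[nsamps][nchan_words] -> transposed[nchan_words][nsamps_padded],
// mirroring the call sites in testtranspose.cpp and TDDPlan.cpp above.
#include <vector>
#include <cuda_runtime.h>
#include "dedisp_types.h"
#include "tdd/transpose/transpose.h"

int main()
{
    size_t nchan_words   = 64;   // width: channel words per time sample
    size_t nsamps        = 8192; // height: time samples in this gulp
    size_t nsamps_padded = 8320; // out_stride: padded output row length

    std::vector<dedisp_word> h_in(nchan_words * nsamps, 0);

    dedisp_word *d_in, *d_out;
    cudaMalloc((void **) &d_in,  nchan_words * nsamps * sizeof(dedisp_word));
    cudaMalloc((void **) &d_out, nchan_words * nsamps_padded * sizeof(dedisp_word));
    cudaMemcpy(d_in, h_in.data(), h_in.size() * sizeof(dedisp_word),
               cudaMemcpyHostToDevice);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    tdd::transpose(d_in,
                   nchan_words, nsamps, // width, height
                   nchan_words,         // in_stride (words per input row)
                   nsamps_padded,       // out_stride (words per output row)
                   d_out,
                   stream);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}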