diff --git a/bin/test/testtranspose.cpp b/bin/test/testtranspose.cpp
index c0cc686252ab56f933213dd59208d843909c3f58..f03d14749444516834a123c2e3c92590426ff4f2 100644
--- a/bin/test/testtranspose.cpp
+++ b/bin/test/testtranspose.cpp
@@ -7,7 +7,7 @@
 #include "cuda/CU.h"
 #include "dedisp_types.h"
 #include "dedisp/transpose/transpose.hpp"
-#include "tdd/unpack/unpack.h"
+#include "tdd/transpose/transpose.h"
 
 #define BITS_PER_BYTE 8
 #define DEDISP_SAMPS_PER_THREAD 2
@@ -73,7 +73,6 @@ int run_test(dedisp_size in_nbits)
     dedisp_size nsamps = Tobs / dt;
 
     // Data parameters
-    dedisp_size out_nbits = in_nbits; // no unpack
     dedisp_size chans_per_word = sizeof(dedisp_word)*BITS_PER_BYTE / in_nbits;
     dedisp_size nchan_words = nchans / chans_per_word;
     dedisp_float datarms = 25.0;
@@ -91,7 +90,6 @@ int run_test(dedisp_size in_nbits)
     // Print parameters
     std::cout << " >>> Parameters" << std::endl;
     std::cout << "in_nbits: " << in_nbits << std::endl;
-    std::cout << "out_nbits: " << out_nbits << std::endl;
     std::cout << "nsamps: " << nsamps << std::endl;
     std::cout << "nsamps_computed: " << nsamps_computed << std::endl;
     std::cout << "nsamps_gulp: " << nsamps_gulp << std::endl;
@@ -150,12 +148,11 @@ int run_test(dedisp_size in_nbits)
     // TDD fused kernel
     std::cout << " >>> Run tdd transpose" << std::endl;
     d_output.zero(stream);
-    transpose_unpack(
+    tdd::transpose(
         (dedisp_word *) d_input,
         nchan_words, nsamps_gulp,
         nchan_words, nsamps_padded_gulp,
         (dedisp_word *) d_output,
-        in_nbits, out_nbits,
         stream);
 
     stream.memcpyDtoHAsync(output_tst.data(), d_output, d_output.size());
diff --git a/src/tdd/CMakeLists.txt b/src/tdd/CMakeLists.txt
index b09dbfde8168a37e247c69eb152dea3beac4c0ee..5ff50566c5eb4e0e4a7ea1bd09c3b75adff19d2f 100644
--- a/src/tdd/CMakeLists.txt
+++ b/src/tdd/CMakeLists.txt
@@ -2,10 +2,7 @@ cuda_add_library(
     tdd SHARED
     TDDPlan.cpp
     dedisperse/TDDKernel.cu
-    # FIXME
-    unpack/unpack.cu
-    ../dedisp/unpack/unpack.cu
-    ../dedisp/transpose/transpose.cu
+    transpose/transpose.cu
     $<TARGET_OBJECTS:plan>
     $<TARGET_OBJECTS:external>
 )
diff --git a/src/tdd/TDDPlan.cpp b/src/tdd/TDDPlan.cpp
index 11bb02d3eed721cc98a3b9bd160c965770afec03..8bd3bcc57a33063e4658cb6ad5b370e0c1d5f8d8 100644
--- a/src/tdd/TDDPlan.cpp
+++ b/src/tdd/TDDPlan.cpp
@@ -27,13 +27,7 @@
 #include "common/helper.h"
 #include "dedisperse/TDDKernel.hpp"
 
-// FIXME: see below for an explanation
-#if 0
-#include "unpack/unpack.h"
-#else
-#include "../dedisp/transpose/transpose.hpp"
-#include "../dedisp/unpack/unpack.h"
-#endif
+#include "transpose/transpose.h"
 
 #include "dedisp_error.hpp"
 
@@ -250,15 +244,6 @@ void TDDPlan::execute_guru(
     dedisp_size in_count_padded_gulp_max =
         nsamps_padded_gulp_max * in_buf_stride_words;
 
-    // TODO: Make this a parameter?
-    dedisp_size min_in_nbits = 0;
-    dedisp_size unpacked_in_nbits = std::max((int)in_nbits, (int)min_in_nbits);
-    dedisp_size unpacked_chans_per_word =
-        sizeof(dedisp_word)*BITS_PER_BYTE / unpacked_in_nbits;
-    dedisp_size unpacked_nchan_words = m_nchans / unpacked_chans_per_word;
-    dedisp_size unpacked_buf_stride_words = unpacked_nchan_words;
-    dedisp_size unpacked_count_padded_gulp_max =
-        nsamps_padded_gulp_max * unpacked_buf_stride_words;
 
     dedisp_size out_stride_gulp_samples = nsamps_computed_gulp_max;
     dedisp_size out_stride_gulp_bytes =
@@ -273,7 +258,6 @@ void TDDPlan::execute_guru(
     std::cout << memory_alloc_str << std::endl;
 #endif
     cu::DeviceMemory d_transposed(in_count_padded_gulp_max * sizeof(dedisp_word));
-    cu::DeviceMemory d_unpacked(unpacked_count_padded_gulp_max * sizeof(dedisp_word));
     cu::DeviceMemory d_out(out_count_gulp_max * sizeof(dedisp_word));
     cu::HostMemory h_out(out_count_gulp_max * sizeof(dedisp_word));
     std::vector<cu::HostMemory> h_in_(2);
@@ -387,37 +371,21 @@ void TDDPlan::execute_guru(
             // Transpose and unpack the words in the input
             executestream->waitEvent(job.inputEnd);
             executestream->record(job.preprocessingStart);
-            #if 0
-            // The unpack part of the transpose_unpack function seems broken,
-            // in some cases (number of bits?) it does not produce the same output
-            // as the reference dedisp transpose + unpack kernels. TODO: debug.
-            transpose_unpack(
+            tdd::transpose(
                 (dedisp_word *) job.d_in_ptr,
                 nchan_words, job.nsamps_gulp,
                 in_buf_stride_words, job.nsamps_padded_gulp,
-                (dedisp_word *) d_unpacked,
-                in_nbits, unpacked_in_nbits,
+                (dedisp_word *) d_transposed,
                 *executestream);
-            #else
-            // Use the original dedisp transpose + unpack kernels.
-            transpose((dedisp_word *) job.d_in_ptr,
-                      nchan_words, job.nsamps_gulp,
-                      in_buf_stride_words, job.nsamps_padded_gulp,
-                      (dedisp_word *) d_transposed);
-
-            unpack(d_transposed, job.nsamps_padded_gulp, nchan_words,
-                   d_unpacked,
-                   in_nbits, unpacked_in_nbits);
-            #endif
             executestream->record(job.preprocessingEnd);
 
             // Perform direct dedispersion without scrunching
             executestream->record(job.dedispersionStart);
             m_kernel.launch(
-                d_unpacked,               // d_in
+                d_transposed,             // d_in
                 job.nsamps_padded_gulp,   // in_stride
                 job.nsamps_computed_gulp, // nsamps
-                unpacked_in_nbits,        // in_nbits,
+                in_nbits,                 // in_nbits,
                 m_nchans,                 // nchans
                 1,                        // chan_stride
                 d_dm_list,                // d_dm_list
diff --git a/src/tdd/unpack/unpack.cu b/src/tdd/transpose/transpose.cu
similarity index 70%
rename from src/tdd/unpack/unpack.cu
rename to src/tdd/transpose/transpose.cu
index 901659d32662ed3cd4991f161db4269d28ad5ca6..ba24558e22a79db8d982ab8a5b0bb3d5f68b8276 100644
--- a/src/tdd/unpack/unpack.cu
+++ b/src/tdd/transpose/transpose.cu
@@ -8,7 +8,9 @@
  */
 #include <algorithm>
 #include "dedisp_types.h"
-#include "unpack_kernel.cuh"
+#include "transpose_kernel.cuh"
+
+namespace tdd {
 
 template<typename U>
 inline U round_up_pow2(const U& a) {
@@ -22,12 +24,11 @@ inline U round_down_pow2(const U& a) {
     return round_up_pow2(a+1)/2;
 }
 
-void transpose_unpack(
+void transpose(
     const dedisp_word* d_in,
     size_t width, size_t height,
    size_t in_stride, size_t out_stride,
     dedisp_word* d_out,
-    dedisp_size in_nbits, dedisp_size out_nbits,
     cudaStream_t stream)
 {
     // Specify thread decomposition (uses up-rounded divisions)
@@ -41,7 +42,6 @@ void transpose_unpack(
          block_y_offset < tot_block_count.y;
          block_y_offset += max_grid_dim)
     {
-
         dim3 block_count;
 
         // Handle the possibly incomplete final grid
@@ -69,27 +69,15 @@ void transpose_unpack(
                       round_up_pow2(block_count.y));
 
            // Run the CUDA kernel
-#define CALL_KERNEL(OUT_NBITS) \
-    transpose_unpack_kernel<dedisp_word, OUT_NBITS><<<grid, block, 0, stream>>> \
-        (d_in + in_offset, \
-         w, h, \
-         in_stride, out_stride, \
-         d_out + out_offset, \
-         block_count.x, \
-         block_count.y, \
-         in_nbits); \
-
-            // Dispatch dynamically on out_nbits for supported values
-            switch (out_nbits)
-            {
-                case 1: CALL_KERNEL(1); break;
-                case 2: CALL_KERNEL(2); break;
-                case 4: CALL_KERNEL(4); break;
-                case 8: CALL_KERNEL(8); break;
-                case 16: CALL_KERNEL(16); break;
-                case 32: CALL_KERNEL(32); break;
-                default: /* should never be reached */ break;
-            }
+            transpose_kernel<dedisp_word><<<grid, block, 0, stream>>>
+                (d_in + in_offset,
+                 w, h,
+                 in_stride, out_stride,
+                 d_out + out_offset,
+                 block_count.x,
+                 block_count.y);
         } // end for block_x_offset
     } // end for block_y_offset
-}
\ No newline at end of file
+}
+
+} // end namespace tdd
\ No newline at end of file
diff --git a/src/tdd/unpack/unpack.h b/src/tdd/transpose/transpose.h
similarity index 85%
rename from src/tdd/unpack/unpack.h
rename to src/tdd/transpose/transpose.h
index 30b8abb4095d22ebcceaed53c464582e91043ea0..cd099058466fdcf7fa134d32c79b85a61b6c82a2 100644
--- a/src/tdd/unpack/unpack.h
+++ b/src/tdd/transpose/transpose.h
@@ -7,10 +7,13 @@
 #include "dedisp_types.h"
 #include "cuda_runtime.h"
 
-void transpose_unpack(
+namespace tdd {
+
+void transpose(
     const dedisp_word* d_in,
     size_t width, size_t height,
     size_t in_stride, size_t out_stride,
     dedisp_word* d_out,
-    dedisp_size in_nbits, dedisp_size out_nbits,
     cudaStream_t stream);
+
+}; // end namespace tdd
\ No newline at end of file
diff --git a/src/tdd/transpose/transpose_kernel.cuh b/src/tdd/transpose/transpose_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a5cfda9fb2bd370e465b51a61ea40985f4bc22d4
--- /dev/null
+++ b/src/tdd/transpose/transpose_kernel.cuh
@@ -0,0 +1,60 @@
+/*
+* Copyright (C) 2021 ASTRON (Netherlands Institute for Radio Astronomy)
+* SPDX-License-Identifier: GPL-3.0-or-later
+* Time Domain Dedispersion (TDD)
+* is an optimized version of the original dedisp implementation.
+*/
+#define TILE_DIM 32
+#define BLOCK_ROWS 8
+
+namespace tdd {
+
+template<typename WordType>
+__global__
+void transpose_kernel(
+    const WordType* in,
+    size_t width,
+    size_t height,
+    size_t in_stride,
+    size_t out_stride,
+    WordType* out,
+    size_t block_count_x,
+    size_t block_count_y)
+{
+    __shared__ WordType tile[TILE_DIM][TILE_DIM];
+
+    // Cull excess blocks (there may be many if we round up to a power of 2)
+    if( blockIdx.x >= block_count_x ||
+        blockIdx.y >= block_count_y ) {
+        return;
+    }
+
+    size_t index_in_x = blockIdx.x * TILE_DIM + threadIdx.x;
+    size_t index_in_y = blockIdx.y * TILE_DIM + threadIdx.y;
+    size_t index_in = index_in_x + (index_in_y)*in_stride;
+
+#pragma unroll
+    for( size_t i=0; i<TILE_DIM; i+=BLOCK_ROWS ) {
+        // TODO: Is it possible to cull some excess threads early?
+        if( index_in_x < width && index_in_y+i < height )
+            tile[threadIdx.y+i][threadIdx.x] = in[index_in+i*in_stride];
+    }
+
+    __syncthreads();
+
+    size_t index_out_x = blockIdx.y * TILE_DIM + threadIdx.x;
+    // Avoid excess threads
+    if( index_out_x >= height ) return;
+    size_t index_out_y = blockIdx.x * TILE_DIM + threadIdx.y;
+    size_t index_out = index_out_x + (index_out_y)*out_stride;
+
+#pragma unroll
+    for( size_t i=0; i<TILE_DIM; i+=BLOCK_ROWS ) {
+        // Avoid excess threads
+        if( index_out_y+i < width ) {
+            out[index_out+i*out_stride] = tile[threadIdx.x][threadIdx.y+i];
+        }
+    }
+}
+
+} // end namespace tdd
\ No newline at end of file
diff --git a/src/tdd/unpack/unpack_kernel.cuh b/src/tdd/unpack/unpack_kernel.cuh
deleted file mode 100644
index 3729f234d9b6256a5e30ef0c30d6fdc2653ec8bf..0000000000000000000000000000000000000000
--- a/src/tdd/unpack/unpack_kernel.cuh
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
-* Copyright (C) 2021 ASTRON (Netherlands Institute for Radio Astronomy)
-* SPDX-License-Identifier: GPL-3.0-or-later
-* Time Domain Dedispersion (TDD)
-* is an optimized version of the original dedisp implementation.
-*/
-#define TILE_DIM 32
-#define BLOCK_ROWS 8
-
-template<typename WordType, unsigned int OUT_NBITS>
-__global__
-void transpose_unpack_kernel(
-    const WordType* in,
-    size_t width,
-    size_t height,
-    size_t in_stride,
-    size_t out_stride,
-    WordType* out,
-    size_t block_count_x,
-    size_t block_count_y,
-    size_t in_nbits)
-{
-    // The input data has dimensions height * width,
-    // with width corresponding to a number of channel words,
-    // and height corresponding to a number of observing channels.
-    // The output will have expansion times as many channel words,
-    // thus this kernel transforms:
-    // data[height][width] -> unpacked[width][height*expansion]
-    dedisp_size expansion = OUT_NBITS / in_nbits;
-    // the output stride therefore scales linearly with the expansion.
-    out_stride *= expansion;
-
-    // Shared memory buffer for the transposed input data
-    __shared__ WordType tile[TILE_DIM][TILE_DIM];
-
-    // Cull excess blocks
-    if (blockIdx.x >= block_count_x ||
-        blockIdx.y >= block_count_y)
-    {
-        return;
-    }
-
-    // Compute index in input matrix data[height][width]
-    size_t index_in_x = blockIdx.x * TILE_DIM + threadIdx.x;
-    size_t index_in_y = blockIdx.y * TILE_DIM + threadIdx.y;
-    size_t index_in = index_in_x + (index_in_y)*in_stride;
-
-    // Transpose a tile of input into shared memory:
-    // data[nwords_tile][nsamps_tile] -> transposed[nsamps_tile][nwords_tile]
-    #pragma unroll
-    for (size_t i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-    {
-        if (index_in_x < width && index_in_y+i < height)
-        {
-            tile[threadIdx.x][threadIdx.y+i] = in[index_in+i*in_stride];
-        }
-    }
-
-    __syncthreads();
-
-    // Helper variables for data unpacking
-    int out_chans_per_word = sizeof(WordType)*8 / OUT_NBITS;
-    int in_chans_per_word = sizeof(WordType)*8 / in_nbits;
-    int norm = ((1l<<OUT_NBITS)-1) / ((1l<<in_nbits)-1);
-    WordType in_mask = (1<<in_nbits)-1;
-    WordType out_mask = (1<<OUT_NBITS)-1;
-
-    // Unpack the inner dimension of the transposed tile:
-    // transposed[nsamps_tile][nwords_tile] -> unpacked[nsamps_tile][nwords_tile*expansion]
-    // Process one row (one samp) per iteration.
-    #pragma unroll
-    for (size_t i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-    {
-        // Map threads [0:tile_dim] to [0:tile_dim*expansion],
-        // to compute [nwords_tile*expansion] channel words.
-        unsigned int tile_width_out = TILE_DIM * expansion;
-        for (size_t j = 0; j < tile_width_out; j += blockDim.x)
-        {
-            // Compute index in output matrix unpacked[width][height*expansion]
-            size_t index_out_x = blockIdx.y * tile_width_out + (threadIdx.x + j);
-            size_t index_out_y = blockIdx.x * TILE_DIM + threadIdx.y;
-            size_t index_out = index_out_x + (index_out_y)*out_stride;
-
-            // Avoid excess threads
-            if (index_out_x < out_stride && index_out_y+i < width)
-            {
-                // Construct an output channel word
-                WordType result = 0;
-                for (int k = 0; k < sizeof(WordType)*8; k += OUT_NBITS)
-                {
-                    int out_cw = threadIdx.x + j;
-                    int c = out_cw * out_chans_per_word + k/OUT_NBITS;
-                    int in_cw = c / in_chans_per_word;
-                    int in_k = c % in_chans_per_word * in_nbits;
-                    WordType word = tile[threadIdx.y+i][in_cw];
-
-                    WordType val = (word >> in_k) & in_mask;
-                    result |= ((val * norm) & out_mask) << k;
-                }
-
-                // Write the result to device memory
-                out[index_out+i*out_stride] = result;
-            }
-        }
-    }
-}
\ No newline at end of file
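
For reference, a minimal host-side sketch of driving the new tdd::transpose entry point. This is illustrative only, not part of the patch: the sizes below are hypothetical, plain CUDA runtime calls stand in for the cu:: wrappers used elsewhere in the tree, and error checking is omitted.

// Transpose a gulp of packed input words:
// data[nsamps][nchan_words] -> transposed[nchan_words][nsamps_padded],
// mirroring the call sites in testtranspose.cpp and TDDPlan.cpp above.
#include <vector>
#include <cuda_runtime.h>
#include "dedisp_types.h"
#include "tdd/transpose/transpose.h"

int main()
{
    size_t nchan_words   = 64;   // width: channel words per time sample
    size_t nsamps        = 8192; // height: time samples in this gulp
    size_t nsamps_padded = 8320; // out_stride: padded output row length

    std::vector<dedisp_word> h_in(nchan_words * nsamps, 0);

    dedisp_word *d_in, *d_out;
    cudaMalloc((void **) &d_in,  nchan_words * nsamps * sizeof(dedisp_word));
    cudaMalloc((void **) &d_out, nchan_words * nsamps_padded * sizeof(dedisp_word));
    cudaMemcpy(d_in, h_in.data(), h_in.size() * sizeof(dedisp_word),
               cudaMemcpyHostToDevice);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    tdd::transpose(d_in,
                   nchan_words, nsamps, // width, height
                   nchan_words,         // in_stride (words per input row)
                   nsamps_padded,       // out_stride (words per output row)
                   d_out,
                   stream);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}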