diff --git a/bin/test/CMakeLists.txt b/bin/test/CMakeLists.txt
index 4f69e95a5a8f3fbdb228ba6a1b4ba2bb54003c13..a4d948d67477d5e1b543af3083f72b2997b3d474 100644
--- a/bin/test/CMakeLists.txt
+++ b/bin/test/CMakeLists.txt
@@ -41,6 +41,21 @@ target_include_directories(testtdd PRIVATE
 
 target_link_libraries(testtdd tdd)
 
+################################################################################
+# test for the tdd transpose kernel
+################################################################################
+add_executable(testtranspose
+    testtranspose.cpp
+)
+
+target_include_directories(testtranspose PRIVATE
+    ${CMAKE_SOURCE_DIR}/src
+    ${CMAKE_SOURCE_DIR}/src/common
+)
+
+target_link_libraries(testtranspose dedisp tdd)
+
+
 ################################################################################
 # test for fdd library
 ################################################################################
diff --git a/bin/test/testtranspose.cpp b/bin/test/testtranspose.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f03d14749444516834a123c2e3c92590426ff4f2
--- /dev/null
+++ b/bin/test/testtranspose.cpp
@@ -0,0 +1,206 @@
+#include <cstdlib>
+#include <cmath>
+#include <iostream>
+#include <random>
+#include <functional>
+#include <vector>
+
+#include "cuda/CU.h"
+#include "dedisp_types.h"
+#include "dedisp/transpose/transpose.hpp"
+#include "tdd/transpose/transpose.h"
+
+#define BITS_PER_BYTE 8
+#define DEDISP_SAMPS_PER_THREAD 2
+
+unsigned long div_round_up(unsigned long a, unsigned long b) {
+    return (a-1) / b + 1;
+}
+
+// Assume input is a zero-mean float and quantize to an unsigned 8-bit quantity
+dedisp_byte bytequant(dedisp_float f)
+{
+    dedisp_float v = f + 127.5f;
+    dedisp_byte r;
+    if (v > 255.0f) {
+        r = (dedisp_byte) 255;
+    } else if (v < 0.0f) {
+        r = (dedisp_byte) 0;
+    } else {
+        r = (dedisp_byte) roundf(v);
+    }
+    return r;
+}
+
+// Compute mean and standard deviation of a float array
+void calc_stats_float(dedisp_float *a, dedisp_size n, dedisp_float *mean, dedisp_float *sigma)
+{
+    // Use doubles to prevent rounding error
+    double sum = 0.0, sum2 = 0.0;
+    double mtmp = 0.0, vartmp;
+    double v;
+    dedisp_size i;
+
+    // Use corrected 2-pass algorithm from Numerical Recipes
+    sum = 0.0;
+    for (i = 0; i < n; i++) {
+        sum += a[i];
+    }
+    mtmp = sum/n;
+
+    sum = 0.0;
+    sum2 = 0.0;
+    for (i = 0; i < n; i++) {
+        v = a[i];
+        sum2 += (v-mtmp) * (v-mtmp);
+        sum += v-mtmp;
+    }
+    vartmp = (sum2 - (sum*sum) / n) / (n-1);
+    *mean = mtmp;
+    *sigma = sqrt(vartmp);
+
+    return;
+}
+
+
+int run_test(dedisp_size in_nbits)
+{
+    // Observation parameters
+    dedisp_size nchans = 1024;
+    dedisp_float sampletime_base = 250.0E-6; // Base is 250 microsecond time samples
+    dedisp_float downsamp = 1.0;
+    dedisp_float Tobs = 30.0; // Observation duration in seconds
+    dedisp_float dt = downsamp*sampletime_base; // s (0.25 ms sampling)
+    dedisp_size nsamps = Tobs / dt;
+
+    // Data parameters
+    dedisp_size chans_per_word = sizeof(dedisp_word)*BITS_PER_BYTE / in_nbits;
+    dedisp_size nchan_words = nchans / chans_per_word;
+    dedisp_float datarms = 25.0;
+
+    // Dedispersion parameters
+    dedisp_float last_dm = 102.6;
+    dedisp_float last_delay = 0.92;
+    dedisp_size max_delay = last_dm * last_delay + 0.5;
+    dedisp_size nsamps_gulp = 65536;
+    dedisp_size nsamps_computed = nsamps - max_delay;
+    dedisp_size nsamps_padded_gulp = div_round_up(
+        nsamps_computed, DEDISP_SAMPS_PER_THREAD)
+        * DEDISP_SAMPS_PER_THREAD + max_delay;
+
+    // Print parameters
+    std::cout << " >>> Parameters" << std::endl;
+    std::cout << "in_nbits: " << in_nbits << std::endl;
+    std::cout << "nsamps: " << nsamps << std::endl;
+    std::cout << "nsamps_computed: " << nsamps_computed << std::endl;
+    std::cout << "nsamps_gulp: " << nsamps_gulp << std::endl;
+    std::cout << "nsamps_padded_gulp: " << nsamps_padded_gulp << std::endl;
+    std::cout << "nchans: " << nchans << std::endl;
+    std::cout << "nchan_words: " << nchan_words << std::endl;
+
+    // CUDA
+    cu::Stream stream;
+
+    // Initialize random number generator
+    auto random = std::bind(std::normal_distribution<float>(0, 1),
+                            std::mt19937(0));
+
+    // Create raw data
+    std::cout << " >>> Initializing input" << std::endl;
+    std::vector<dedisp_float> rawdata(nsamps_gulp * nchans);
+    for (unsigned int i = 0; i < rawdata.size(); i++) {
+        rawdata[i] = datarms * random();
+    }
+
+    // Create input data by quantizing the raw data
+    dedisp_float raw_mean, raw_sigma;
+    calc_stats_float(rawdata.data(), nsamps_gulp * nchans, &raw_mean, &raw_sigma);
+
+    std::vector<dedisp_byte> input(nsamps_gulp * nchans * (in_nbits / BITS_PER_BYTE));
+
+    for (unsigned int ns = 0; ns < nsamps_gulp; ns++) {
+        for (unsigned int nc = 0; nc < nchans; nc++) {
+            input[ns*nchans+nc] = bytequant(rawdata[ns*nchans+nc]);
+        }
+    }
+
+    // Copy the input to the GPU
+    size_t sizeof_input = input.size() * sizeof(dedisp_byte);
+    cu::DeviceMemory d_input(sizeof_input);
+    stream.memcpyHtoDAsync(d_input, input.data(), sizeof_input);
+
+    // Output
+    std::vector<dedisp_word> output_ref(nsamps_padded_gulp * nchan_words);
+    std::vector<dedisp_word> output_tst(output_ref.size());
+    size_t sizeof_output = nsamps_padded_gulp * nchan_words * sizeof(dedisp_word);
+    cu::DeviceMemory d_output(sizeof_output);
+
+    // Dedisp reference
+    std::cout << " >>> Run dedisp transpose" << std::endl;
+    d_output.zero(stream);
+    transpose((dedisp_word *) d_input,
+              nchan_words, nsamps_gulp,
+              nchan_words, nsamps_padded_gulp,
+              (dedisp_word *) d_output,
+              stream);
+    stream.memcpyDtoHAsync(output_ref.data(), d_output, d_output.size());
+    stream.synchronize();
+
+    // TDD transpose kernel
+    std::cout << " >>> Run tdd transpose" << std::endl;
+    d_output.zero(stream);
+    tdd::transpose(
+        (dedisp_word *) d_input,
+        nchan_words, nsamps_gulp,
+        nchan_words, nsamps_padded_gulp,
+        (dedisp_word *) d_output,
+        stream);
+    stream.memcpyDtoHAsync(output_tst.data(), d_output, d_output.size());
+
+    // Wait for execution to finish
+    stream.synchronize();
+
+    // Compare results
+    std::cout << " >>> Comparing results" << std::endl;
+    unsigned long nnz_ref = 0;
+    unsigned long nnz_tst = 0;
+    unsigned long errors = 0;
+
+    for (unsigned long i = 0; i < (nsamps_padded_gulp * nchan_words); i++) {
+        dedisp_word ref = output_ref[i];
+        dedisp_word tst = output_tst[i];
+        if (ref != tst) {
+            if (errors < 10) {
+                std::cerr << "i = " << i << ": " << ref << " != " << tst << std::endl;
+            }
+            errors++;
+        }
+        nnz_ref += ref != 0;
+        nnz_tst += tst != 0;
+    }
+
+    // Report results
+    unsigned long nnz_expected = (nchans * nsamps_gulp) / sizeof(dedisp_word);
+    bool pass = errors == 0 &&             // no errors
+                nnz_ref == nnz_expected && // correct number of nonzeros
+                nnz_ref == nnz_tst;        // same number of nonzeros
+    if (pass) {
+        std::cout << " -> TEST PASSED!" << std::endl << std::endl;
+        return EXIT_SUCCESS;
+    } else {
+        std::cerr << "number of errors: " << errors << std::endl;
+        std::cerr << "number of nonzeros in ref: " << nnz_ref << std::endl;
+        std::cerr << "number of nonzeros in tst: " << nnz_tst << std::endl;
+        std::cerr << " -> TEST FAILED!" << std::endl << std::endl;
+        return EXIT_FAILURE;
+    }
+}
+
+int main(int argc, char *argv[])
+{
+    int retval = 0;
+    retval |= run_test(8);
+    retval |= run_test(16);
+    retval |= run_test(32);
+    return retval;
+}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index eaf4c687dcea7cde6c6b4d53b008af6ff51ba3f5..1a533f25d4cd66899d7ccc398d05a7f798a482c0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,6 +10,10 @@ include_directories(${CUDA_TOOLKIT_INCLUDE})
 add_subdirectory(common)
 add_subdirectory(external)
 
+set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+
+# set the RPATH for the subsequent libraries
+set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
 
 ################################################################################
 # plan library
@@ -20,7 +24,6 @@ add_library(plan OBJECT Plan.cpp GPUPlan.cpp)
 
 ################################################################################
 # dedisp c-library
 ################################################################################
-
 add_subdirectory(clib)
 
diff --git a/src/Plan.cpp b/src/Plan.cpp
index 824e7663945fae254e3e198dd5698663cfcdbc74..3cec4d17c421e19a7c694671b61d332029b2748b 100644
--- a/src/Plan.cpp
+++ b/src/Plan.cpp
@@ -67,6 +67,8 @@ void Plan::generate_dm_list(
         double dm = ((b2*prev + std::sqrt(-a2*b2*prev2 + (a2+b2)*k)) / (a2+b2));
         dm_table.push_back(dm);
     }
+
+    h_dt_factors.resize(dm_table.size(), 1);
 }
 
 void Plan::set_dm_list(
diff --git a/src/Plan.hpp b/src/Plan.hpp
index 9ce2c6589dffef5788bb572d298031ed2c905e0a..f354178983dd13f0d950665ab75e00c540b015d5 100644
--- a/src/Plan.hpp
+++ b/src/Plan.hpp
@@ -70,6 +70,7 @@ public:
     const float_type* get_dm_list() const { return h_dm_list.data(); }
     const float_type* get_delay_table() const { return h_delay_table.data(); }
     const bool_type* get_killmask() const { return h_killmask.data(); }
+    const dedisp_size* get_dt_factors() const { return h_dt_factors.data(); }
 
     void sync();
 
@@ -111,6 +112,7 @@ protected:
     std::vector<dedisp_float> h_dm_list;     // size = dm_count
     std::vector<dedisp_float> h_delay_table; // size = nchans
    std::vector<dedisp_bool>  h_killmask;    // size = nchans
+    std::vector<dedisp_size>  h_dt_factors;  // size = dm_count
 };
 
 } // end namespace dedisp
diff --git a/src/clib/CMakeLists.txt b/src/clib/CMakeLists.txt
index 0fe192b6dbd8a55bbfcc4ba2cb6d78d0ddf34c5a..7da4bafbbf14575e0c6c9e12435ef44138ba3fb9 100644
--- a/src/clib/CMakeLists.txt
+++ b/src/clib/CMakeLists.txt
@@ -6,4 +6,17 @@ add_library(
 target_include_directories(
   clib
   PRIVATE ${CMAKE_SOURCE_DIR}/src
-)
\ No newline at end of file
+  PRIVATE ${CUDA_TOOLKIT_INCLUDE}
+)
+
+set_target_properties(
+  clib
+  PROPERTIES PUBLIC_HEADER dedisp.h
+)
+
+install(
+  TARGETS
+    clib
+  PUBLIC_HEADER
+    DESTINATION include
+)
diff --git a/src/clib/dedisp.cpp b/src/clib/dedisp.cpp
index 049858a93244101d86da400da371001547cfcb3a..612cdb60511e6dcd5b70f2f8ac2094ba186d0d19 100644
--- a/src/clib/dedisp.cpp
+++ b/src/clib/dedisp.cpp
@@ -9,6 +9,7 @@
 #include <memory> //for shared pointers
 #include <iostream>
 #include <typeinfo>
+#include <cmath>
 
 // Implementation specific CPP Plan interfaces
 #include <dedisp/DedispPlan.hpp>
@@ -266,8 +267,34 @@ dedisp_error dedisp_generate_dm_list(dedisp_plan plan,
     return DEDISP_NO_ERROR;
 }
 
-/* Currently not implemented in C-interface:
-dedisp_float * dedisp_generate_dm_list_guru (dedisp_float dm_start, dedisp_float dm_end,
+static void generate_dm_list(
+    std::vector<dedisp_float>& dm_table,
+    dedisp_float dm_start, dedisp_float dm_end,
+    double dt, double ti, double f0, double df,
+    dedisp_size nchans, double tol)
+{
+    // Note: This algorithm originates from Lina Levin
+    // Note: Computation done in double precision to match MB's code
+
+    dt *= 1e6;
+    double f    = (f0 + ((nchans/2) - 0.5) * df) * 1e-3;
+    double tol2 = tol*tol;
+    double a    = 8.3 * df / (f*f*f);
+    double a2   = a*a;
+    double b2   = a2 * (double)(nchans*nchans / 16.0);
+    double c    = (dt*dt + ti*ti) * (tol2 - 1.0);
+
+    dm_table.push_back(dm_start);
+    while( dm_table.back() < dm_end ) {
+        double prev  = dm_table.back();
+        double prev2 = prev*prev;
+        double k     = c + tol2*a2*prev2;
+        double dm    = ((b2*prev + std::sqrt(-a2*b2*prev2 + (a2+b2)*k)) / (a2+b2));
+        dm_table.push_back(dm);
+    }
+}
+
+dedisp_float * dedisp_generate_dm_list_guru(dedisp_float dm_start, dedisp_float dm_end,
         double dt, double ti, double f0, double df,
         dedisp_size nchans, double tol, dedisp_size * dm_count)
 {
@@ -279,7 +306,6 @@ dedisp_float * dedisp_generate_dm_list_guru (dedisp_float dm_start, dedisp_float
     *dm_count = dm_table.size();
     return &dm_table[0];
 }
-*/
 
 dedisp_error dedisp_set_device(int device_idx) {
     //keep global copy of device_idx for usage in dedisp_create_plan()
@@ -590,32 +616,25 @@ const char* dedisp_get_error_string(dedisp_error error)
     }
 }
 
-/* Currently not implemented in C-interface:
 dedisp_error dedisp_enable_adaptive_dt(dedisp_plan plan, dedisp_float pulse_width, dedisp_float tol)
 {
-    if( !plan ) { throw_error(DEDISP_INVALID_PLAN); }
-    plan->scrunching_enabled = true;
-    plan->pulse_width = pulse_width;
-    plan->scrunch_tol = tol;
-    return update_scrunch_list(plan);
+    return DEDISP_NOT_IMPLEMENTED_ERROR;
 }
+
 dedisp_error dedisp_disable_adaptive_dt(dedisp_plan plan) {
-    if( !plan ) { throw_error(DEDISP_INVALID_PLAN); }
-    plan->scrunching_enabled = false;
-    return update_scrunch_list(plan);
+    return DEDISP_NO_ERROR;
 }
+
 dedisp_bool dedisp_using_adaptive_dt(const dedisp_plan plan) {
-    if( !plan ) { throw_getter_error(DEDISP_INVALID_PLAN,false); }
-    return plan->scrunching_enabled;
+    return false;
 }
+
 const dedisp_size* dedisp_get_dt_factors(const dedisp_plan plan) {
     if( !plan ) { throw_getter_error(DEDISP_INVALID_PLAN,0); }
-    if( 0 == plan->dm_count ) { throw_getter_error(DEDISP_NO_DM_LIST_SET,0); }
-    return &plan->scrunch_list[0];
+    return plan->ptr->get_dt_factors();
 }
-*/
 
 #ifdef __cplusplus
 }
diff --git a/src/clib/dedisp.h b/src/clib/dedisp.h
index 83d074a74e06f87fec859cb4df0d1bfafb0e8514..71c244d163d4ecab42627017428b65eb7671957f 100644
--- a/src/clib/dedisp.h
+++ b/src/clib/dedisp.h
@@ -90,6 +90,7 @@ typedef enum {
   DEDISP_DEVICE_ALREADY_SET,
   DEDISP_PRIOR_GPU_ERROR,
   DEDISP_INTERNAL_GPU_ERROR,
+  DEDISP_NOT_IMPLEMENTED_ERROR,
   DEDISP_UNKNOWN_ERROR
 } dedisp_error;
 /*! \enum dedisp_error
@@ -257,13 +258,10 @@ dedisp_error dedisp_generate_dm_list(dedisp_plan plan,
                                      dedisp_float pulse_width,
                                      dedisp_float tol);
 
-/* Currently not implemented in C-interface:
 dedisp_float * dedisp_generate_dm_list_guru (dedisp_float dm_start, dedisp_float dm_end,
         double dt, double ti, double f0, double df,
         dedisp_size nchans, double tol, dedisp_size * dm_count);
-*/
-
 
 // Getters
 // -------
/*! \p dedisp_get_max_delay gets the maximum delay (in samples) applied during
@@ -512,24 +510,23 @@ dedisp_error dedisp_sync(void);
  * \param tol The smearing tolerance at which the time resolution is reduced.
  *        A typical value is 1.15, meaning a tolerance of 15%.
+ *        Note: this implementation currently returns
+ *        \p DEDISP_NOT_IMPLEMENTED_ERROR and leaves the plan unchanged.
  */
-/*
 dedisp_error dedisp_enable_adaptive_dt(dedisp_plan plan,
                                        dedisp_float pulse_width,
                                        dedisp_float tol);
-*/
+
 /*! \p dedisp_disable_adaptive_dt disables adaptive time resolution for \p plan.
  * \param plan The plan for which to disable adaptive time resolution.
  */
-/*
 dedisp_error dedisp_disable_adaptive_dt(dedisp_plan plan);
-*/
+
 /*! \p dedisp_using_adaptive_dt returns whether \p plan has adaptive time
  *  resolution enabled.
  * \param plan The plan for which to query adaptive time resolution.
  */
-/*
 dedisp_bool dedisp_using_adaptive_dt(const dedisp_plan plan);
-*/
+
 /*! \p dedisp_get_dt_factors returns an array of length
  * \p dedisp_get_dm_count(\p plan) containing the integer factors by which
  * the time resolution (1/dt) is decreased for each DM. Note that the
@@ -541,9 +538,7 @@ dedisp_bool dedisp_using_adaptive_dt(const dedisp_plan plan);
  * \p dedisp_enable_adaptive_dt has not been called; in such cases the
  * returned list will consist entirely of ones.
  */
-/*
 const dedisp_size* dedisp_get_dt_factors(const dedisp_plan plan);
-*/
 
 #ifdef __cplusplus
 } // closing brace for extern "C"
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 09a41fe55c429f75b6e5c64d0605287d6a84408b..8e6a927c87322684e24bf683d4b2a38c0cfc4afc 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -5,4 +5,25 @@ add_library(common SHARED
 
 target_link_libraries(common OpenMP::OpenMP_CXX)
 
+set_target_properties(
+  common PROPERTIES
+  VERSION ${DEDISP_VERSION}
+  SOVERSION ${DEDISP_VERSION_MAJOR}
+)
+
+install(
+  TARGETS
+    common
+  LIBRARY
+    DESTINATION lib
+)
+
+# Install dedisp_types.h in common subdirectory
+# for backwards compatibility with Heimdall
+install(
+  FILES
+    dedisp_types.h
+  DESTINATION include/common
+)
+
 add_subdirectory(cuda)
diff --git a/src/dedisp/CMakeLists.txt b/src/dedisp/CMakeLists.txt
index 276f085f8bc8dc20b5a2dee22181c72176ae1081..4f69d408c10c658690953f455105bada9b8b0d28 100644
--- a/src/dedisp/CMakeLists.txt
+++ b/src/dedisp/CMakeLists.txt
@@ -26,15 +26,11 @@ target_include_directories(
   PUBLIC ${CUDA_TOOLKIT_INCLUDE}
 )
 
-set_target_properties(
-  dedisp
-  PROPERTIES PUBLIC_HEADER DedispPlan.hpp
-)
-
 set_target_properties(
   dedisp
   PROPERTIES VERSION ${DEDISP_VERSION}
   SOVERSION ${DEDISP_VERSION_MAJOR}
+  PUBLIC_HEADER DedispPlan.hpp
 )
 
 install(
diff --git a/src/tdd/CMakeLists.txt b/src/tdd/CMakeLists.txt
index fd70d7ce77607fbc47df779cf12bfdecb244cdb5..5ff50566c5eb4e0e4a7ea1bd09c3b75adff19d2f 100644
--- a/src/tdd/CMakeLists.txt
+++ b/src/tdd/CMakeLists.txt
@@ -2,7 +2,7 @@ cuda_add_library(
   tdd SHARED
   TDDPlan.cpp
   dedisperse/TDDKernel.cu
-  unpack/unpack.cu
+  transpose/transpose.cu
   $<TARGET_OBJECTS:plan>
   $<TARGET_OBJECTS:external>
 )
@@ -19,15 +19,11 @@ target_include_directories(
   PUBLIC ${CUDA_TOOLKIT_INCLUDE}
 )
 
-set_target_properties(
-  tdd
-  PROPERTIES PUBLIC_HEADER TDDPlan.hpp
-)
-
 set_target_properties(
   tdd
   PROPERTIES VERSION ${DEDISP_VERSION}
   SOVERSION ${DEDISP_VERSION_MAJOR}
+  PUBLIC_HEADER TDDPlan.hpp
 )
 
 install(
diff --git a/src/tdd/TDDPlan.cpp b/src/tdd/TDDPlan.cpp
index f764890fc3fffaf9a2ecd7945d2bbb5b55fed905..8bd3bcc57a33063e4658cb6ad5b370e0c1d5f8d8 100644
--- a/src/tdd/TDDPlan.cpp
+++ b/src/tdd/TDDPlan.cpp
@@ -27,7 +27,7 @@
 
 #include "common/helper.h"
 #include "dedisperse/TDDKernel.hpp"
-#include "unpack/unpack.h"
+#include "transpose/transpose.h"
 
 #include "dedisp_error.hpp"
 
@@ -244,15 +244,6 @@ void TDDPlan::execute_guru(
     dedisp_size in_count_padded_gulp_max =
         nsamps_padded_gulp_max * in_buf_stride_words;
 
-    // TODO: Make this a parameter?
-    dedisp_size min_in_nbits = 0;
-    dedisp_size unpacked_in_nbits = std::max((int)in_nbits, (int)min_in_nbits);
-    dedisp_size unpacked_chans_per_word =
-        sizeof(dedisp_word)*BITS_PER_BYTE / unpacked_in_nbits;
-    dedisp_size unpacked_nchan_words = m_nchans / unpacked_chans_per_word;
-    dedisp_size unpacked_buf_stride_words = unpacked_nchan_words;
-    dedisp_size unpacked_count_padded_gulp_max =
-        nsamps_padded_gulp_max * unpacked_buf_stride_words;
 
     dedisp_size out_stride_gulp_samples = nsamps_computed_gulp_max;
     dedisp_size out_stride_gulp_bytes =
@@ -267,7 +258,6 @@ void TDDPlan::execute_guru(
     std::cout << memory_alloc_str << std::endl;
 #endif
     cu::DeviceMemory d_transposed(in_count_padded_gulp_max * sizeof(dedisp_word));
-    cu::DeviceMemory d_unpacked(unpacked_count_padded_gulp_max * sizeof(dedisp_word));
     cu::DeviceMemory d_out(out_count_gulp_max * sizeof(dedisp_word));
     cu::HostMemory h_out(out_count_gulp_max * sizeof(dedisp_word));
     std::vector<cu::HostMemory> h_in_(2);
@@ -381,22 +371,21 @@ void TDDPlan::execute_guru(
-        // Transpose and unpack the words in the input
+        // Transpose the words in the input
         executestream->waitEvent(job.inputEnd);
         executestream->record(job.preprocessingStart);
-        transpose_unpack(
+        tdd::transpose(
             (dedisp_word *) job.d_in_ptr,
             nchan_words, job.nsamps_gulp,
             in_buf_stride_words, job.nsamps_padded_gulp,
-            (dedisp_word *) d_unpacked,
-            in_nbits, unpacked_in_nbits,
+            (dedisp_word *) d_transposed,
             *executestream);
         executestream->record(job.preprocessingEnd);
 
         // Perform direct dedispersion without scrunching
         executestream->record(job.dedispersionStart);
         m_kernel.launch(
-            d_unpacked,               // d_in
+            d_transposed,             // d_in
             job.nsamps_padded_gulp,   // in_stride
             job.nsamps_computed_gulp, // nsamps
-            unpacked_in_nbits,        // in_nbits,
+            in_nbits,                 // in_nbits,
             m_nchans,                 // nchans
             1,                        // chan_stride
             d_dm_list,                // d_dm_list
diff --git a/src/tdd/unpack/unpack.cu b/src/tdd/transpose/transpose.cu
similarity index 71%
rename from src/tdd/unpack/unpack.cu
rename to src/tdd/transpose/transpose.cu
index 2c3dbdcb5aa781bfb3f927166812435604152853..ba24558e22a79db8d982ab8a5b0bb3d5f68b8276 100644
--- a/src/tdd/unpack/unpack.cu
+++ b/src/tdd/transpose/transpose.cu
@@ -8,7 +8,9 @@
 */
 #include <algorithm>
 #include "dedisp_types.h"
-#include "unpack_kernel.cuh"
+#include "transpose_kernel.cuh"
+
+namespace tdd {
 
 template<typename U>
 inline U round_up_pow2(const U& a) {
@@ -22,12 +24,11 @@ inline U round_down_pow2(const U& a) {
     return round_up_pow2(a+1)/2;
 }
 
-void transpose_unpack(
+void transpose(
     const dedisp_word* d_in,
     size_t width, size_t height,
     size_t in_stride, size_t out_stride,
     dedisp_word* d_out,
-    dedisp_size in_nbits, dedisp_size out_nbits,
     cudaStream_t stream)
 {
     // Specify thread decomposition (uses up-rounded divisions)
@@ -41,7 +42,6 @@
         block_y_offset < tot_block_count.y;
         block_y_offset += max_grid_dim)
     {
-
         dim3 block_count;
 
         // Handle the possibly incomplete final grid
@@ -69,26 +69,15 @@
                    round_up_pow2(block_count.y));
 
         // Run the CUDA kernel
-        #define CALL_KERNEL(OUT_NBITS) \
-        transpose_unpack_kernel<dedisp_word, OUT_NBITS><<<grid, block, 0, stream>>> \
-            (d_in + in_offset, \
-             w, h, \
-             in_stride, out_stride, \
-             d_out + out_offset, \
-             block_count.x, \
-             block_count.y, \
-             in_nbits); \
-
-        // Dispatch dynamically on out_nbits for supported values
-        switch (out_nbits)
-        {
-            case 1: CALL_KERNEL(1); break;
-            case 2: CALL_KERNEL(2); break;
-            case 4: CALL_KERNEL(4); break;
-            case 8: CALL_KERNEL(8); break;
-            case 16: CALL_KERNEL(16); break;
-            default: /* should never be reached */ break;
-        }
+        transpose_kernel<dedisp_word><<<grid, block, 0, stream>>>
+            (d_in + in_offset,
+             w, h,
+             in_stride, out_stride,
+             d_out + out_offset,
+             block_count.x,
+             block_count.y);
     } // end for block_x_offset
     } // end for block_y_offset
-}
\ No newline at end of file
+}
+
+} // end namespace tdd
diff --git a/src/tdd/unpack/unpack.h b/src/tdd/transpose/transpose.h
similarity index 85%
rename from src/tdd/unpack/unpack.h
rename to src/tdd/transpose/transpose.h
index 30b8abb4095d22ebcceaed53c464582e91043ea0..cd099058466fdcf7fa134d32c79b85a61b6c82a2 100644
--- a/src/tdd/unpack/unpack.h
+++ b/src/tdd/transpose/transpose.h
@@ -7,10 +7,13 @@
 #include "dedisp_types.h"
 #include "cuda_runtime.h"
 
-void transpose_unpack(
+namespace tdd {
+
+void transpose(
     const dedisp_word* d_in,
     size_t width, size_t height,
     size_t in_stride, size_t out_stride,
     dedisp_word* d_out,
-    dedisp_size in_nbits, dedisp_size out_nbits,
     cudaStream_t stream);
+
+} // end namespace tdd
diff --git a/src/tdd/transpose/transpose_kernel.cuh b/src/tdd/transpose/transpose_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a5cfda9fb2bd370e465b51a61ea40985f4bc22d4
--- /dev/null
+++ b/src/tdd/transpose/transpose_kernel.cuh
@@ -0,0 +1,60 @@
+/*
+* Copyright (C) 2021 ASTRON (Netherlands Institute for Radio Astronomy)
+* SPDX-License-Identifier: GPL-3.0-or-later
+* Time Domain Dedispersion (TDD)
+* is an optimized version of the original dedisp implementation.
+*/
+#define TILE_DIM 32
+#define BLOCK_ROWS 8
+
+namespace tdd {
+
+template<typename WordType>
+__global__
+void transpose_kernel(
+    const WordType* in,
+    size_t width,
+    size_t height,
+    size_t in_stride,
+    size_t out_stride,
+    WordType* out,
+    size_t block_count_x,
+    size_t block_count_y)
+{
+    __shared__ WordType tile[TILE_DIM][TILE_DIM];
+
+    // Cull excess blocks (there may be many if we round up to a power of 2)
+    if( blockIdx.x >= block_count_x ||
+        blockIdx.y >= block_count_y ) {
+        return;
+    }
+
+    size_t index_in_x = blockIdx.x * TILE_DIM + threadIdx.x;
+    size_t index_in_y = blockIdx.y * TILE_DIM + threadIdx.y;
+    size_t index_in = index_in_x + (index_in_y)*in_stride;
+
+#pragma unroll
+    for( size_t i=0; i<TILE_DIM; i+=BLOCK_ROWS ) {
+        // TODO: Is it possible to cull some excess threads early?
+        if( index_in_x < width && index_in_y+i < height )
+            tile[threadIdx.y+i][threadIdx.x] = in[index_in+i*in_stride];
+    }
+
+    __syncthreads();
+
+    size_t index_out_x = blockIdx.y * TILE_DIM + threadIdx.x;
+    // Avoid excess threads
+    if( index_out_x >= height ) return;
+    size_t index_out_y = blockIdx.x * TILE_DIM + threadIdx.y;
+    size_t index_out = index_out_x + (index_out_y)*out_stride;
+
+#pragma unroll
+    for( size_t i=0; i<TILE_DIM; i+=BLOCK_ROWS ) {
+        // Avoid excess threads
+        if( index_out_y+i < width ) {
+            out[index_out+i*out_stride] = tile[threadIdx.x][threadIdx.y+i];
+        }
+    }
+}
+
+} // end namespace tdd
diff --git a/src/tdd/unpack/unpack_kernel.cuh b/src/tdd/unpack/unpack_kernel.cuh
deleted file mode 100644
index 3729f234d9b6256a5e30ef0c30d6fdc2653ec8bf..0000000000000000000000000000000000000000
--- a/src/tdd/unpack/unpack_kernel.cuh
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
-* Copyright (C) 2021 ASTRON (Netherlands Institute for Radio Astronomy)
-* SPDX-License-Identifier: GPL-3.0-or-later
-* Time Domain Dedispersion (TDD)
-* is an optimized version of the original dedisp implementation.
-*/
-#define TILE_DIM 32
-#define BLOCK_ROWS 8
-
-template<typename WordType, unsigned int OUT_NBITS>
-__global__
-void transpose_unpack_kernel(
-    const WordType* in,
-    size_t width,
-    size_t height,
-    size_t in_stride,
-    size_t out_stride,
-    WordType* out,
-    size_t block_count_x,
-    size_t block_count_y,
-    size_t in_nbits)
-{
-    // The input data has dimensions height * width,
-    // with width corresponding to a number of channel words,
-    // and height corresponding to a number of observing channels.
-    // The output will have expansion times as many channel words,
-    // thus this kernel transforms:
-    // data[height][width] -> unpacked[width][height*expansion]
-    dedisp_size expansion = OUT_NBITS / in_nbits;
-    // the output stride therefore scales linearly with the expansion.
-    out_stride *= expansion;
-
-    // Shared memory buffer for the transposed input data
-    __shared__ WordType tile[TILE_DIM][TILE_DIM];
-
-    // Cull excess blocks
-    if (blockIdx.x >= block_count_x ||
-        blockIdx.y >= block_count_y)
-    {
-        return;
-    }
-
-    // Compute index in input matrix data[height][width]
-    size_t index_in_x = blockIdx.x * TILE_DIM + threadIdx.x;
-    size_t index_in_y = blockIdx.y * TILE_DIM + threadIdx.y;
-    size_t index_in = index_in_x + (index_in_y)*in_stride;
-
-    // Transpose a tile of input into shared memory:
-    // data[nwords_tile][nsamps_tile] -> transposed[nsamps_tile][nwords_tile]
-    #pragma unroll
-    for (size_t i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-    {
-        if (index_in_x < width && index_in_y+i < height)
-        {
-            tile[threadIdx.x][threadIdx.y+i] = in[index_in+i*in_stride];
-        }
-    }
-
-    __syncthreads();
-
-    // Helper variables for data unpacking
-    int out_chans_per_word = sizeof(WordType)*8 / OUT_NBITS;
-    int in_chans_per_word = sizeof(WordType)*8 / in_nbits;
-    int norm = ((1l<<OUT_NBITS)-1) / ((1l<<in_nbits)-1);
-    WordType in_mask = (1<<in_nbits)-1;
-    WordType out_mask = (1<<OUT_NBITS)-1;
-
-    // Unpack the inner dimension of the transposed tile:
-    // transposed[nsamps_tile][nwords_tile] -> unpacked[nsamps_tile][nwords_tile*expansion]
-    // Process one row (one samp) per iteration.
-    #pragma unroll
-    for (size_t i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-    {
-        // Map threads [0:tile_dim] to [0:tile_dim*expansion],
-        // to compute [nwords_tile*expansion] channel words.
-        unsigned int tile_width_out = TILE_DIM * expansion;
-        for (size_t j = 0; j < tile_width_out; j += blockDim.x)
-        {
-            // Compute index in output matrix unpacked[width][height*expansion]
-            size_t index_out_x = blockIdx.y * tile_width_out + (threadIdx.x + j);
-            size_t index_out_y = blockIdx.x * TILE_DIM + threadIdx.y;
-            size_t index_out = index_out_x + (index_out_y)*out_stride;
-
-            // Avoid excess threads
-            if (index_out_x < out_stride && index_out_y+i < width)
-            {
-                // Construct an output channel word
-                WordType result = 0;
-                for (int k = 0; k < sizeof(WordType)*8; k += OUT_NBITS)
-                {
-                    int out_cw = threadIdx.x + j;
-                    int c = out_cw * out_chans_per_word + k/OUT_NBITS;
-                    int in_cw = c / in_chans_per_word;
-                    int in_k = c % in_chans_per_word * in_nbits;
-                    WordType word = tile[threadIdx.y+i][in_cw];
-
-                    WordType val = (word >> in_k) & in_mask;
-                    result |= ((val * norm) & out_mask) << k;
-                }
-
-                // Write the result to device memory
-                out[index_out+i*out_stride] = result;
-            }
-        }
-    }
-}
\ No newline at end of file
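
The renamed entry point keeps dedisp's transpose call convention: the input is read as a height x width matrix of dedisp_words with row stride in_stride, and its width x height transpose is written with row stride out_stride. A minimal sketch of a call site, assuming the cu:: wrappers already used by the test and by TDDPlan (run_transpose is a hypothetical helper, not part of the patch):

    #include "cuda/CU.h"
    #include "dedisp_types.h"
    #include "tdd/transpose/transpose.h"

    // Hypothetical helper (not in the patch): transpose nsamps x nchan_words
    // input words into an nchan_words x nsamps_padded output buffer.
    void run_transpose(cu::DeviceMemory& d_in, cu::DeviceMemory& d_out,
                       size_t nchan_words, size_t nsamps,
                       size_t nsamps_padded, cu::Stream& stream)
    {
        tdd::transpose(
            (dedisp_word *) d_in,        // input:  [nsamps][nchan_words]
            nchan_words, nsamps,         // width, height
            nchan_words, nsamps_padded,  // in_stride, out_stride
            (dedisp_word *) d_out,       // output: [nchan_words][nsamps_padded]
            stream);
    }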
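Both GPU paths in the test share the same tiling idea, so a bug common to both could go unnoticed. A CPU reference with the same index convention is cheap to add; this sketch (a hypothetical helper in standard C++ only) implements exactly the mapping out[y + x*out_stride] = in[x + y*in_stride] realized by transpose_kernel:

    #include <cstddef>
    #include <cstdint>

    // Hypothetical CPU reference (not in the patch): transpose a
    // height x width matrix of 32-bit words with independent strides.
    static void transpose_reference(const uint32_t* in,
                                    size_t width, size_t height,
                                    size_t in_stride, size_t out_stride,
                                    uint32_t* out)
    {
        for (size_t y = 0; y < height; y++) {     // input row (time sample)
            for (size_t x = 0; x < width; x++) {  // input column (channel word)
                out[y + x * out_stride] = in[x + y * in_stride];
            }
        }
    }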
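On the C side, dedisp_generate_dm_list_guru is now exposed without a plan, and the adaptive-dt entry points are stubs. A usage sketch follows; all numeric values are placeholders, and the units assume the convention visible in generate_dm_list (dt is passed in seconds and scaled to microseconds internally, so ti is taken in microseconds). The hunk does not show who owns the returned buffer, so copying the DMs out immediately is the safe pattern:

    #include <stdio.h>
    #include "dedisp.h"

    int main(void)
    {
        dedisp_size dm_count = 0;
        /* Placeholder parameters, for illustration only */
        dedisp_float *dm_list = dedisp_generate_dm_list_guru(
            0.0f, 100.0f,   /* dm_start, dm_end (pc cm^-3) */
            250e-6, 40.0,   /* dt (s), ti (us) */
            1581.0, -0.39,  /* f0, df (MHz) */
            1024, 1.25,     /* nchans, tol */
            &dm_count);
        printf("generated %lu trial DMs\n", (unsigned long) dm_count);

        /* Adaptive time resolution is stubbed out in this patch:
           dedisp_enable_adaptive_dt() returns DEDISP_NOT_IMPLEMENTED_ERROR
           and dedisp_get_dt_factors() yields a list consisting of ones. */
        return 0;
    }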
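One performance observation on transpose_kernel: the store phase reads tile[threadIdx.x][threadIdx.y+i], a column of the 32x32 tile, so with 32-bit words every lane of a warp hits the same shared-memory bank. The usual remedy is to pad the tile by one element; the sketch below (transpose_kernel_padded is hypothetical, not part of the patch) is otherwise identical in semantics to the patched kernel:

    #include <cstddef>

    #define TILE_DIM 32
    #define BLOCK_ROWS 8

    template<typename WordType>
    __global__ void transpose_kernel_padded(
        const WordType* in, size_t width, size_t height,
        size_t in_stride, size_t out_stride, WordType* out,
        size_t block_count_x, size_t block_count_y)
    {
        // One extra column of padding breaks the bank conflicts on column reads
        __shared__ WordType tile[TILE_DIM][TILE_DIM + 1];

        if (blockIdx.x >= block_count_x || blockIdx.y >= block_count_y) return;

        size_t x = blockIdx.x * TILE_DIM + threadIdx.x;
        size_t y = blockIdx.y * TILE_DIM + threadIdx.y;
        for (size_t i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
            if (x < width && y + i < height)
                tile[threadIdx.y + i][threadIdx.x] = in[x + (y + i) * in_stride];
        }
        __syncthreads();

        size_t ox = blockIdx.y * TILE_DIM + threadIdx.x;  // runs over height
        if (ox >= height) return;
        size_t oy = blockIdx.x * TILE_DIM + threadIdx.y;  // runs over width
        for (size_t i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
            if (oy + i < width)
                out[ox + (oy + i) * out_stride] = tile[threadIdx.x][threadIdx.y + i];
        }
    }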