diff --git a/CMakeLists.txt b/CMakeLists.txt index ce66efeaebe37f9d2caf82e22ef5de4c1408b93c..1a04d8921f8089cd375b0dbc03bf3c677e98c4b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,24 +7,38 @@ option(PREDICT_BUILD_TESTING "Build the test suite" ON) option(PREDICT_BUILD_BENCHMARK "Build the benchmark suite" OFF) option(CMAKE_EXPORT_COMPILE_COMMANDS "Export compile commands" ON) +# Disable Tracy by default to limit performance impact. +if(NOT DEFINED TRACY_ENABLE) + set(TRACY_ENABLE OFF) +endif() + # Casacore include(FetchContent) FetchContent_Declare( aocommon GIT_REPOSITORY https://gitlab.com/aroffringa/aocommon/ GIT_TAG master + GIT_SHALLOW TRUE GIT_PROGRESS TRUE) FetchContent_Populate(aocommon) set(CMAKE_MODULE_PATH ${aocommon_SOURCE_DIR}/CMake) +FetchContent_Declare( + tracy + GIT_REPOSITORY https://github.com/wolfpld/tracy.git + GIT_TAG master # Or pin to a specific version + GIT_SHALLOW true + GIT_PROGRESS true) +FetchContent_MakeAvailable(tracy) + # Include XTensor libraries. DP3 must include FetchXTensor directly (not # indirectly, e.g., via schaapcommon) since it uses 'add_compile_definitions'. set(XTENSOR_LIBRARIES xtl xsimd xtensor xtensor-blas) include(${aocommon_SOURCE_DIR}/CMake/FetchXTensor.cmake) set(PREDICT_BUILD_ARGUMENTS - $<$<CONFIG:Debug>:-g> $<$<CONFIG:Release>:-O3;-march=native > - $<$<CONFIG:RelWithDebInfo>:-O3;-g;-march=native >) + $<$<CONFIG:Debug>:-g -mavx2> $<$<CONFIG:Release>:-O3;-march=native -mavx2> + $<$<CONFIG:RelWithDebInfo>:-O3;-g;-march=native -mavx2>) # Find or fetch dependencies find_package(Casacore REQUIRED casa tables) @@ -48,8 +62,14 @@ target_include_directories( $<INSTALL_INTERFACE:include>) target_link_libraries( - predict PUBLIC xtensor xtensor-blas xsimd OpenMP::OpenMP_CXX - ${CASACORE_LIBRARIES} ${BLAS_LIBRARIES}) + predict + PUBLIC xtensor + xtensor-blas + xsimd + OpenMP::OpenMP_CXX + ${CASACORE_LIBRARIES} + ${BLAS_LIBRARIES} + Tracy::TracyClient) target_include_directories(predict PRIVATE ${CASACORE_INCLUDE_DIRS}) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 34f67ba24d2fb97bccd6f205bb5760ed16fb9e2f..55c07dbfee3ee8fde5ecb165858a2d1e5c7e4057 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -27,8 +27,14 @@ add_executable( target_compile_options(microbenchmarks PRIVATE ${PREDICT_BUILD_ARGUMENTS}) target_link_libraries( - microbenchmarks PRIVATE benchmark::benchmark predict # your main library - xtensor xtensor-blas xsimd ${CASACORE_LIBRARIES}) + microbenchmarks + PRIVATE benchmark::benchmark + predict + xtensor + xtensor-blas + xsimd + ${CASACORE_LIBRARIES} + Tracy::TracyClient) target_include_directories( microbenchmarks PRIVATE include ${CMAKE_SOURCE_DIR}/include - ${CASACORE_INCLUDE_DIRS}) + ${CASACORE_INCLUDE_DIRS} ${tracy_SOURCE_DIR}/include) diff --git a/benchmark/main.cpp b/benchmark/main.cpp index 661ee4dcf535c827aee9a4aa8021f0846af12b9d..00e8f3dfc6c07cdf0ca7c06ef2ff0e5023d32881 100644 --- a/benchmark/main.cpp +++ b/benchmark/main.cpp @@ -5,6 +5,8 @@ #include <benchmark/benchmark.h> #include <test/Common.h> +#include <tracy/Tracy.hpp> + void ParseCustomFlags(int argc, char **argv) { extern bool do_randomized_run; extern int randomized_run_seed; diff --git a/benchmark/phases.cpp b/benchmark/phases.cpp index b6d048526d1ed33f97988aa221caee24b0e6a1ce..42fba3d7db8bcbbb577a903681b3e94f26ebbc6d 100644 --- a/benchmark/phases.cpp +++ b/benchmark/phases.cpp @@ -1,7 +1,7 @@ #include "PointSourceCollection.h" #include "Predict.h" #include "PredictPlan.h" -#include "PredictPlanExec.h" +#include "PredictPlanExecCPU.h" #include <benchmark/benchmark.h> #include <random> #include <xtensor/xtensor.hpp> @@ -23,7 +23,7 @@ public: plan.nstations = n_stations; plan.nchannels = n_channels; - plan_exec = std::make_unique<PredictPlanExec>(plan); + plan_exec = std::make_unique<PredictPlanExecCPU>(plan); plan_exec->uvw = xt::xtensor<double, 2>({n_stations, 3}); for (size_t i = 0; i < n_stations; ++i) { @@ -43,13 +43,12 @@ public: void TearDown(benchmark::State &) override { plan_exec.reset(); } protected: - std::unique_ptr<PredictPlanExec> plan_exec; + std::unique_ptr<PredictPlanExecCPU> plan_exec; size_t n_stations; size_t n_channels; }; BENCHMARK_DEFINE_F(PhasesBenchmark, ComputePhases)(benchmark::State &state) { - for (auto _ : state) { plan_exec->ComputeStationPhases(); } diff --git a/benchmark/predict.cpp b/benchmark/predict.cpp index 9d771b3c78cc25844c310c61dfa0872f95f569f5..baa2400091c9d64c8535c9ac8a24aa1671cf8285 100644 --- a/benchmark/predict.cpp +++ b/benchmark/predict.cpp @@ -1,3 +1,4 @@ +#include "Directions.h" #include "GaussianSourceCollection.h" #include <random> @@ -53,7 +54,7 @@ BENCHMARK_DEFINE_F(PredictBenchmark, PointSource) sources.Add(predict_run->makeSource<PointSource>()); for (auto _ : state) { - predict_run->Run(sources); + predict_run->RunWithStrategy(sources, computation_strategy::SINGLE); } } @@ -67,6 +68,46 @@ BENCHMARK_REGISTER_F(PredictBenchmark, PointSource) ->ArgNames({"stat", "chan", "fullstokes", "freqsmear"}) ->Unit(benchmark::kMillisecond); +BENCHMARK_DEFINE_F(PredictBenchmark, PointSourceSIMD) +(benchmark::State &state) { + PointSourceCollection sources; + sources.Add(predict_run->makeSource<PointSource>()); + + for (auto _ : state) { + predict_run->RunWithStrategy(sources, computation_strategy::MULTI_SIMD); + } +} + +BENCHMARK_REGISTER_F(PredictBenchmark, PointSourceSIMD) + ->ArgsProduct({ + nstations, + nsources, + {false, true}, + {false, true}, + }) + ->ArgNames({"stat", "chan", "fullstokes", "freqsmear"}) + ->Unit(benchmark::kMillisecond); + +BENCHMARK_DEFINE_F(PredictBenchmark, PointSourceXSIMD) +(benchmark::State &state) { + PointSourceCollection sources; + sources.Add(predict_run->makeSource<PointSource>()); + + for (auto _ : state) { + predict_run->RunWithStrategy(sources, computation_strategy::XSIMD); + } +} + +BENCHMARK_REGISTER_F(PredictBenchmark, PointSourceXSIMD) + ->ArgsProduct({ + nstations, + nsources, + {false, true}, + {false, true}, + }) + ->ArgNames({"stat", "chan", "fullstokes", "freqsmear"}) + ->Unit(benchmark::kMillisecond); + BENCHMARK_DEFINE_F(PredictBenchmark, GaussianSource) (benchmark::State &state) { GaussianSourceCollection sources; diff --git a/benchmark/smearterms.cpp b/benchmark/smearterms.cpp index 52e55dc64c102b02e5d45da564d7df00e710e0f3..6b452c5910f9f5ede43dce1ac1b5dbea3c0160a3 100644 --- a/benchmark/smearterms.cpp +++ b/benchmark/smearterms.cpp @@ -1,3 +1,4 @@ +#include "Directions.h" #include <benchmark/benchmark.h> #include <xtensor/xtensor.hpp> @@ -26,7 +27,7 @@ public: run = MakePredictRun(test_reference_point, test_offset_point, n_stations, n_frequencies, false, true); - plan_exec_ptr = std::make_unique<PredictPlanExec>(run->plan); + plan_exec_ptr = std::make_unique<PredictPlanExecCPU>(run->plan); static std::mt19937 gen(-5); std::uniform_real_distribution<> phase_dist(-M_PI, M_PI); for (size_t t = 0; t < n_stations; t++) { @@ -37,19 +38,19 @@ public: protected: std::unique_ptr<PredictRun> run; - std::unique_ptr<PredictPlanExec> plan_exec_ptr; + std::unique_ptr<PredictPlanExecCPU> plan_exec_ptr; }; BENCHMARK_DEFINE_F(SmearTermsBenchmark, SmearTermBenchmarkXTensor) (benchmark::State &state) { for (auto _ : state) { - plan_exec_ptr->ComputeSmearTerms(); + plan_exec_ptr->ComputeSmearTerms<computation_strategy::XSIMD>(); } } BENCHMARK_DEFINE_F(SmearTermsBenchmark, SmearTermBenchmarkXOriginal) (benchmark::State &state) { - PredictPlanExec &sim_plan_exec = *plan_exec_ptr; + PredictPlanExecCPU &sim_plan_exec = *plan_exec_ptr; for (auto _ : state) { for (size_t bl_idx = 0; bl_idx < sim_plan_exec.baselines.size(); bl_idx++) { const auto &baseline = sim_plan_exec.baselines[bl_idx]; @@ -65,6 +66,13 @@ BENCHMARK_DEFINE_F(SmearTermsBenchmark, SmearTermBenchmarkXOriginal) } } +BENCHMARK_DEFINE_F(SmearTermsBenchmark, SmearTermBenchmarkSIMD) +(benchmark::State &state) { + for (auto _ : state) { + plan_exec_ptr->ComputeSmearTerms<computation_strategy::MULTI_SIMD>(); + } +} + BENCHMARK_REGISTER_F(SmearTermsBenchmark, SmearTermBenchmarkXTensor) ->ArgsProduct({n_stations, n_frequencies}) ->ArgNames({"#stations", "#frequencies"}) @@ -74,3 +82,8 @@ BENCHMARK_REGISTER_F(SmearTermsBenchmark, SmearTermBenchmarkXOriginal) ->ArgsProduct({n_stations, n_frequencies}) ->ArgNames({"#stations", "#frequencies"}) ->Unit(benchmark::kMillisecond); + +BENCHMARK_REGISTER_F(SmearTermsBenchmark, SmearTermBenchmarkSIMD) + ->ArgsProduct({n_stations, n_frequencies}) + ->ArgNames({"#stations", "#frequencies"}) + ->Unit(benchmark::kMillisecond); diff --git a/benchmark/spectrum_cross.cpp b/benchmark/spectrum_cross.cpp index 13def0baaa8bd11c2244f643fc8eae57bbff9813..92cc24a547b4cc8aca34e90c658534058111980b 100644 --- a/benchmark/spectrum_cross.cpp +++ b/benchmark/spectrum_cross.cpp @@ -29,9 +29,8 @@ public: spec.SetReferenceFlux({1.0, 0.0, 0.0, 0.0}); spec.SetPolarization(0.3, 0.2); - computed_spectrum = - std::make_unique<xt::xtensor<std::complex<double>, 2>>(); - computed_spectrum->resize({4, n_frequencies}); + computed_spectrum = std::make_unique<xt::xtensor<double, 3>>(); + computed_spectrum->resize({2, 4, n_frequencies}); } void TearDown(benchmark::State &) override { spectrum.reset(); @@ -40,7 +39,7 @@ public: protected: std::unique_ptr<Spectrum> spectrum; - std::unique_ptr<xt::xtensor<std::complex<double>, 2>> computed_spectrum; + std::unique_ptr<xt::xtensor<double, 3>> computed_spectrum; size_t n_frequencies; xt::xtensor<double, 1> frequencies; }; diff --git a/ci/das6/compile_and_run.sh b/ci/das6/compile_and_run.sh index 19030e9d935625beb7a7a2a8ffed8ae797c66a15..22283a5872653f3597ee0960a1d0714f0c91c78b 100755 --- a/ci/das6/compile_and_run.sh +++ b/ci/das6/compile_and_run.sh @@ -8,6 +8,16 @@ ARCHITECTURE=$1 COMPILER_VERSION=$2 echo "Running on compiler version: ${COMPILER_VERSION}, architecture: ${ARCHITECTURE}, hostname: $(hostname)" + +# A third argument may be passed. This is the benchmark filter +# Check if the filter args is available +if [ "$#" -ge 3 ]; then + echo "Filter: $3" + FILTER=$3 +else + FILTER="" +fi + BUILD_DIR=build-${COMPILER_VERSION}-${ARCHITECTURE} module purge module load spack/20250403 @@ -16,4 +26,4 @@ module load cmake casacore boost openblas cmake -B ${BUILD_DIR} . -DCMAKE_BUILD_TYPE=Release -DPREDICT_BUILD_BENCHMARK=ON make -C ${BUILD_DIR} -j -${BUILD_DIR}/benchmark/microbenchmarks --benchmark_out=results-${COMPILER_VERSION}-${ARCHITECTURE}.json --benchmark_out_format=json \ No newline at end of file +${BUILD_DIR}/benchmark/microbenchmarks --benchmark_filter=${FILTER} --benchmark_out=results-${COMPILER_VERSION}-${ARCHITECTURE}.json --benchmark_out_format=json --benchmark_min_warmup_time=1 \ No newline at end of file diff --git a/ci/summarize-results.py b/ci/summarize-results.py index 48c33bf456997c76964cb8e4722cf9b409a34b8c..1e7144a96fb33ec4e6256eaeabad4e66edb1985e 100644 --- a/ci/summarize-results.py +++ b/ci/summarize-results.py @@ -43,9 +43,21 @@ def create_summary_plot(metrics, outplot_name): grid.set_ylabels(f"CPU time({time_unit})") grid.set_xlabels("Parameter") grid.add_legend() - plt.setp(grid._legend.get_texts(), fontsize=9) - plt.xticks(rotation=90) + plt.setp(grid._legend.get_texts(), fontsize=8) + for ax in grid.axes.flat: + for label in ax.get_xticklabels(): + label.set_rotation(90) + label.set_fontsize(8) + + ax.set_title(ax.get_title(), fontsize=8) + compilers = sorted(metrics["compiler_version"].unique()) + archs = sorted(metrics["architecture"].unique()) + title = f"Predict compilers: [{', '.join(compilers)}], architectures: [{', '.join(archs)}]" + + grid.fig.suptitle(title, fontsize=14) + grid.fig.subplots_adjust(top=0.75) + grid.savefig(outplot_name + ".png") def main(): diff --git a/include/Directions.h b/include/Directions.h index f172324e00e452c3b90e0cdab430d72a716e5831..1f66b862da0a70ffdb33551cba7a27e3072fdd62 100644 --- a/include/Directions.h +++ b/include/Directions.h @@ -13,6 +13,7 @@ #include <ObjectCollection.h> #include <common/SmartVector.h> +#include <tracy/Tracy.hpp> #include <vector> #include <xsimd/xsimd.hpp> #include <xtensor/xmath.hpp> @@ -21,7 +22,7 @@ using Direction = dp3::base::Direction; -enum computation_strategy { SINGLE, MULTI, MULTI_SIMD }; +enum computation_strategy { SINGLE, XSIMD, MULTI_SIMD }; class Directions : ObjectCollection<Direction> { public: @@ -197,6 +198,7 @@ void radec2lmn::operator()(Arch, const Direction &reference, const C &ra, template <> inline void Directions::RaDec2Lmn<Directions::computation_strategy::MULTI_SIMD>( const Direction &reference, xt::xtensor<double, 2> &lmn) const { + ZoneScoped; xt::xtensor<double, 2> lmn_tmp; lmn_tmp.resize({3, ra.size()}); xsimd::dispatch(xsimd::radec2lmn{})(reference, ra, dec, lmn_tmp, diff --git a/include/PointSource.h b/include/PointSource.h index bc09b0b8e120191c1b4f693d8fac812e11ce9e6d..fb2bf3ec4f5afc1158721dfcff88b1825b5b7eba 100644 --- a/include/PointSource.h +++ b/include/PointSource.h @@ -31,7 +31,7 @@ public: void SetDirection(const Direction &position); void ComputeSpectrum(const xt::xtensor<double, 1> &frequencies, - xt::xtensor<std::complex<double>, 2> &result) const; + xt::xtensor<double, 3> &result) const; const Spectrum &GetSpectrum() const { return spectrum_; } Stokes GetStokes(double freq) const; diff --git a/include/Predict.h b/include/Predict.h index 0851580c2d1f822e53311ea108a0f52098e5f0ce..c5d57d4e01645dda8e2dcf52a4b4616e285998f0 100644 --- a/include/Predict.h +++ b/include/Predict.h @@ -10,10 +10,12 @@ #include <casacore/casa/Arrays/Cube.h> #include <casacore/measures/Measures.h> +#include <xtensor/xlayout.hpp> #include <xtensor/xtensor.hpp> +#include "Directions.h" #include "GaussianSourceCollection.h" -#include "PredictPlanExec.h" +#include "PredictPlanExecCPU.h" #include <Direction.h> #include <PointSource.h> #include <Stokes.h> @@ -122,10 +124,23 @@ inline void spectrum(const PointSource &component, size_t nChannel, class Predict { public: - void run(PredictPlanExec &plan, const PointSourceCollection &sources, - xt::xtensor<dcomplex, 3> &buffer) const; - void run(PredictPlanExec &plan, const GaussianSourceCollection &sources, - xt::xtensor<dcomplex, 3> &buffer) const; + void run(PredictPlanExecCPU &plan, const PointSourceCollection &sources, + xt::xtensor<double, 4, xt::layout_type::row_major> &buffer) const; + + void run(PredictPlanExecCPU &plan, const GaussianSourceCollection &sources, + xt::xtensor<double, 4, xt::layout_type::row_major> &buffer) const; + + void + runWithStrategy(PredictPlanExecCPU &plan, + const PointSourceCollection &sources, + xt::xtensor<double, 4, xt::layout_type::row_major> &buffer, + const computation_strategy strat); + + void + runWithStrategy(PredictPlanExecCPU &plan, + const GaussianSourceCollection &sources, + xt::xtensor<double, 4, xt::layout_type::row_major> &buffer, + const computation_strategy strat); }; /// @} diff --git a/include/PredictPlan.h b/include/PredictPlan.h index c5b68cc35f8bda76081f62a8a161cb7961b8d873..73d09837cceb2c9393d549f3d38e73e88ccc764a 100644 --- a/include/PredictPlan.h +++ b/include/PredictPlan.h @@ -14,6 +14,7 @@ #include <xtensor/xlayout.hpp> #include <xtensor/xtensor.hpp> +#include "Directions.h" #include "GaussianSourceCollection.h" #include "PointSourceCollection.h" @@ -42,11 +43,13 @@ struct PredictPlan { // standalone PredictPlan object virtual void Precompute(const PointSourceCollection &) {} - virtual void Compute(const PointSourceCollection &sources, - xt::xtensor<std::complex<double>, 3> &buffer) {} + virtual void + Compute(const PointSourceCollection &sources, + xt::xtensor<double, 4, xt::layout_type::row_major> &buffer) {} - virtual void Compute(const GaussianSourceCollection &sources, - xt::xtensor<std::complex<double>, 3> &buffer) {} + virtual void + Compute(const GaussianSourceCollection &sources, + xt::xtensor<double, 4, xt::layout_type::row_major> &buffer) {} }; #endif // PREDICT_PLAN_H_ diff --git a/include/PredictPlanExec.h b/include/PredictPlanExec.h deleted file mode 100644 index c9e79d853dfc7e6465ee595bd6cc17465f5155f6..0000000000000000000000000000000000000000 --- a/include/PredictPlanExec.h +++ /dev/null @@ -1,67 +0,0 @@ -// PredictPlanExec.h: PredictPlanExec -// -// Copyright (C) 2020 ASTRON (Netherlands Institute for Radio Astronomy) -// SPDX-License-Identifier: Apache2.0 - -/// \file -/// \brief PredictPlanExec - -#ifndef PREDICT_PLAN_EXEC_H_ -#define PREDICT_PLAN_EXEC_H_ - -#include "GaussianSourceCollection.h" -#include "PointSourceCollection.h" -#include "PredictPlan.h" -#include <casacore/measures/Measures.h> -#include <complex> -#include <xtensor/xtensor.hpp> -class PredictPlanExec : public PredictPlan { -public: - explicit PredictPlanExec(const PredictPlan &plan) : PredictPlan(plan) { - // Initialize the station phases and shifts - Initialize(); - } - - virtual void Precompute(const PointSourceCollection &sources) final override; - - virtual void - Compute(const PointSourceCollection &sources, - xt::xtensor<std::complex<double>, 3> &buffer) final override; - - virtual void - Compute(const GaussianSourceCollection &sources, - xt::xtensor<std::complex<double>, 3> &buffer) final override; - - void ComputeStationPhases(); - - const xt::xtensor<double, 1> &GetStationPhases() const { - return station_phases; - } - const xt::xtensor<std::complex<double>, 2> &GetShiftData() const { - return shift_data; - } - - void FillEulerMatrix(xt::xtensor<double, 2> &mat, const double ra, - const double dec); - - void ComputeSmearTerms(); - - const xt::xtensor<double, 2> &get_lmn() const { return lmn; } - const xt::xtensor<double, 2, xt::layout_type::column_major> &get_uvw() const { - return uvw; - } - const xt::xtensor<double, 1> &get_frequencies() const { return frequencies; } - - xt::xtensor<double, 2> lmn; - xt::xtensor<double, 1> station_phases; - xt::xtensor<std::complex<double>, 2> shift_data; - xt::xtensor<double, 2> smear_terms; - -private: - void Initialize(); - -private: - const double kCInv = 2.0 * M_PI / casacore::C::c; -}; - -#endif // PREDICT_PLAN_EXEC_H_ diff --git a/include/PredictPlanExecCPU.h b/include/PredictPlanExecCPU.h new file mode 100644 index 0000000000000000000000000000000000000000..d50c562c3c0dce7a37940a9f1c3467e8a614041a --- /dev/null +++ b/include/PredictPlanExecCPU.h @@ -0,0 +1,158 @@ +// PredictPlanExecCPU.h: PredictPlanExecCPU +// +// Copyright (C) 2020 ASTRON (Netherlands Institute for Radio Astronomy) +// SPDX-License-Identifier: Apache2.0 + +/// \file +/// \brief PredictPlanExecCPU + +#ifndef PREDICT_PLAN_EXEC_CPU_H_ +#define PREDICT_PLAN_EXEC_CPU_H_ + +#include <casacore/measures/Measures.h> +#include <xtensor/xtensor.hpp> +#include <xtensor/xview.hpp> + +#include "Directions.h" +#include "GaussianSourceCollection.h" +#include "PointSourceCollection.h" +#include "PredictPlan.h" + +class PredictPlanExecCPU : public PredictPlan { +public: + explicit PredictPlanExecCPU(const PredictPlan &plan) + : PredictPlan(plan), xtensor_dispatch_(*this) { + // Initialize the station phases and shifts + Initialize(); + } + + virtual void Precompute(const PointSourceCollection &sources) final override; + + virtual void Compute(const PointSourceCollection &sources, + xt::xtensor<double, 4, xt::layout_type::row_major> + &buffer) final override; + + virtual void Compute(const GaussianSourceCollection &sources, + xt::xtensor<double, 4, xt::layout_type::row_major> + &buffer) final override; + + void + ComputeWithTarget(const PointSourceCollection &sources, + xt::xtensor<double, 4, xt::layout_type::row_major> &buffer, + const computation_strategy strat); + + void ComputeStationPhases(); + + template <computation_strategy T = computation_strategy::XSIMD> + constexpr inline void ComputeSmearTerms() { + if constexpr (T == computation_strategy::MULTI_SIMD) { + ComputeSmearTermsSIMD(); + } else if constexpr (T == computation_strategy::XSIMD) { + ComputeSmearTermsXtensor(); + } else { + static_assert(T == computation_strategy::XSIMD || + T == computation_strategy::MULTI_SIMD, + "Unknown computation strategy"); + } + } + + void ComputeSmearTermsXtensor(); + void ComputeSmearTermsSIMD(); + + const xt::xtensor<double, 1> &GetStationPhases() const { + return station_phases; + } + + const xt::xtensor<double, 3> &GetShiftData() const { return shift_data; } + + void FillEulerMatrix(xt::xtensor<double, 2> &mat, const double ra, + const double dec); + + const xt::xtensor<double, 2> &GetLmn() const { return lmn; } + const xt::xtensor<double, 2, xt::layout_type::column_major> &GetUvw() const { + return uvw; + } + const xt::xtensor<double, 1> &GetFrequencies() const { return frequencies; } + + template <class E, class S, class Tag, class Arch> + void operator()(Arch, E &buffer_expr, const S &spectrum, Tag); + + struct XTensorDispatch { + template <class E, class S, class Tag, class Arch> + void operator()(Arch, E &buffer_expr, const S &spectrum, Tag) const; + + const PredictPlanExecCPU &parent; + + explicit XTensorDispatch(const PredictPlanExecCPU &parent) + : parent(parent) {} + }; + + XTensorDispatch xtensor_dispatch_; + + xt::xtensor<double, 2> lmn; + xt::xtensor<double, 1> station_phases; + xt::xtensor<double, 3> shift_data; + xt::xtensor<double, 2, xt::layout_type::row_major> smear_terms; + +private: + void Initialize(); + + template <computation_strategy T = computation_strategy::MULTI_SIMD, class E, + class S> + inline constexpr void ProcessPolarizationComponent(E &stoke_buffer_expr, + const S &stoke_spectrum) { + if constexpr (T == computation_strategy::SINGLE) { + ProcessPolarizationComponentSingle<E, S>(stoke_buffer_expr, + stoke_spectrum); + } else if constexpr (T == computation_strategy::XSIMD) { + ProcessPolarizationComponentXtensor<E, S>(stoke_buffer_expr, + stoke_spectrum); + } else if constexpr (T == computation_strategy::MULTI_SIMD) { + ProcessPolarizationComponentMultiSimd<E, S>(stoke_buffer_expr, + stoke_spectrum); + } else { + static_assert(T == computation_strategy::SINGLE || + T == computation_strategy::XSIMD || + T == computation_strategy::MULTI_SIMD, + "Unknown computation strategy"); + } + } + + template <class E, class S> + inline void ProcessPolarizationComponent(const computation_strategy T, + E &buffer_expr, const S &spectrum) { + switch (T) { + case computation_strategy::SINGLE: + ProcessPolarizationComponentSingle<E, S>(buffer_expr, spectrum); + break; + case computation_strategy::XSIMD: + ProcessPolarizationComponentXtensor<E, S>(buffer_expr, spectrum); + break; + case computation_strategy::MULTI_SIMD: + ProcessPolarizationComponentMultiSimd<E, S>(buffer_expr, spectrum); + break; + default: + break; + } + } + + template <class E, class S> + void ProcessPolarizationComponentSingle(E &stoke_buffer_expr, + const S &stoke_spectrum); + + template <class E, class S> + void ProcessPolarizationComponentMulti(E &stoke_buffer_expr, + const S &stoke_spectrum); + + template <class E, class S> + void ProcessPolarizationComponentMultiSimd(E &stoke_buffer_expr, + const S &stoke_spectrum); + template <class E, class S> + void ProcessPolarizationComponentXtensor(E &stoke_buffer_expr, + const S &stoke_spectrum); + +private: + const double kCInv = 2.0 * M_PI / casacore::C::c; +}; + +#endif // PREDICT_PLAN_EXEC_CPU_H_ diff --git a/include/Spectrum.h b/include/Spectrum.h index 3fabc95f515cd20dc63e1916640ae4e0c168fdf5..7cf2800eec8f91769bbeed0f0b7483e5b8a6923b 100644 --- a/include/Spectrum.h +++ b/include/Spectrum.h @@ -116,9 +116,9 @@ public: const Stokes &GetReferenceFlux() const { return reference_flux_; } - void EvaluateCrossCorrelations( - const xt::xtensor<double, 1> &frequencies, - xt::xtensor<std::complex<double>, 2> &result) const { + void EvaluateCrossCorrelations(const xt::xtensor<double, 1> &frequencies, + xt::xtensor<double, 3> &result, + const bool need_reshape = true) const { xt::xtensor<double, 1>::shape_type kSpectralCorrectionShape{ frequencies.size()}; xt::xtensor<double, 2>::shape_type kRotationCoefficientsShape{ @@ -128,7 +128,9 @@ public: xt::xtensor<double, 2> rotation_coefficients = xt::zeros<double>(kRotationCoefficientsShape); - result.resize({4, frequencies.size()}); + if (need_reshape) { + result.resize({2, 4, frequencies.size()}); + } if (spectral_terms_.size() > 0 && has_logarithmic_spectral_index_) { for (size_t f_idx = 0; f_idx < frequencies.size(); f_idx++) { @@ -166,14 +168,14 @@ public: auto Q = I * xt::view(rotation_coefficients, 0, xt::all()); auto U = I * xt::view(rotation_coefficients, 1, xt::all()); - xt::real(xt::view(result, CrossCorrelation::XX, xt::all())) = I + Q; - xt::imag(xt::view(result, CrossCorrelation::XX, xt::all())) = 0.0; - xt::real(xt::view(result, CrossCorrelation::XY, xt::all())) = U; - xt::imag(xt::view(result, CrossCorrelation::XY, xt::all())) = V; - xt::real(xt::view(result, CrossCorrelation::YX, xt::all())) = U; - xt::imag(xt::view(result, CrossCorrelation::YX, xt::all())) = -V; - xt::real(xt::view(result, CrossCorrelation::YY, xt::all())) = I - Q; - xt::imag(xt::view(result, CrossCorrelation::YY, xt::all())) = 0.0; + xt::view(result, 0, CrossCorrelation::XX, xt::all()) = I + Q; + xt::view(result, 1, CrossCorrelation::XX, xt::all()) = 0.0; + xt::view(result, 0, CrossCorrelation::XY, xt::all()) = U; + xt::view(result, 1, CrossCorrelation::XY, xt::all()) = V; + xt::view(result, 0, CrossCorrelation::YX, xt::all()) = U; + xt::view(result, 1, CrossCorrelation::YX, xt::all()) = -V; + xt::view(result, 0, CrossCorrelation::YY, xt::all()) = I - Q; + xt::view(result, 1, CrossCorrelation::YY, xt::all()) = 0.0; } private: diff --git a/include/test/Common.h b/include/test/Common.h index 62c305c80528a860caa0a753cf7a5c5d12865b84..9ab13c43844a758ee9064545c5d5056b0dda6caf 100644 --- a/include/test/Common.h +++ b/include/test/Common.h @@ -1,9 +1,10 @@ #ifndef PREDICT_BENCHMARK_COMMON_HPP #define PREDICT_BENCHMARK_COMMON_HPP +#include "Directions.h" #include "GaussianSourceCollection.h" #include "PointSourceCollection.h" -#include "PredictPlanExec.h" +#include "PredictPlanExecCPU.h" #include <complex> #include <random> #include <vector> @@ -55,26 +56,46 @@ struct PredictRun { plan.channel_widths = std::vector<double>(plan.nchannels, 1.0e6); plan.baselines.resize(plan.nbaselines); - buffer.resize({plan.nstokes, plan.nchannels, plan.nbaselines}); + buffer = xt::xtensor<double, 4, xt::layout_type::row_major>( + {2, plan.nstokes, plan.nbaselines, plan.nchannels}, 0.0); } void Run(const PointSourceCollection &sources) { - PredictPlanExec plan_exec{plan}; + PredictPlanExecCPU plan_exec{plan}; Predict pred; pred.run(plan_exec, sources, buffer); } void Run(const GaussianSourceCollection &sources) { - PredictPlanExec plan_exec{plan}; + PredictPlanExecCPU plan_exec{plan}; Predict pred; pred.run(plan_exec, sources, buffer); } + void RunWithStrategy(const PointSourceCollection &sources, + const computation_strategy strat) { + PredictPlanExecCPU plan_exec{plan}; + Predict pred; + pred.runWithStrategy(plan_exec, sources, buffer, strat); + } + + void RunWithStrategy(const GaussianSourceCollection &sources, + const computation_strategy strat) { + PredictPlanExecCPU plan_exec{plan}; + Predict pred; + pred.runWithStrategy(plan_exec, sources, buffer, strat); + } + + void Clean() { + buffer = xt::xtensor<double, 4, xt::layout_type::row_major>( + {2, plan.nstokes, plan.nbaselines, plan.nchannels}, 0.0); + } + template <typename SourceType> SourceType makeSource() const { return SourceType{offset_source, Stokes{1.0, 0.0, 0.0, 0.0}}; } - xt::xtensor<dcomplex, 3> buffer; + xt::xtensor<double, 4, xt::layout_type::row_major> buffer; Direction reference_point; Direction offset_source; diff --git a/run_bench.sh b/run_bench.sh new file mode 100755 index 0000000000000000000000000000000000000000..c651bae99763a5739e43eb6b5a996ba12c3fdb8c --- /dev/null +++ b/run_bench.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# Check if the script is being run with two or three arguments +if [ "$#" -lt 2 ] || [ "$#" -gt 3 ]; then + echo "Usage: $0 <architecture> <compiler_version> [--bench-only-sources]" + exit 1 +fi + +module load python + +ARCHITECTURE=$1 +COMPILER_VERSION=$2 +FILTER="" + +# Check if the --bench-only-sources flag is set. If so, append the Google benchmark filter +if [ "$3" == "--bench-only-sources" ]; then + BENCH_ONLY_SOURCES=true + FILTER="PredictBenchmark/PointSource PredictBenchmark/GaussianSource" +else + BENCH_ONLY_SOURCES=false +fi + +echo "Running on compiler version: ${COMPILER_VERSION}, architecture: ${ARCHITECTURE}, hostname: $(hostname), filter: ${FILTER}" + +sbatch --wait -C ${ARCHITECTURE} -o output.txt -e error.txt ci/das6/compile_and_run.sh ${ARCHITECTURE} ${COMPILER_VERSION} ${FILTER} +cat output.txt >&1 +cat error.txt >&2 + +python3 ci/summarize-results.py --filter PredictBenchmark/PointSource results*.json result-summary-pointsource +python3 ci/summarize-results.py --filter PredictBenchmark/GaussianSource results*.json result-summary-gaussian + +if [ "$BENCH_ONLY_SOURCES" == false ]; then + python3 ci/summarize-results.py --filter DirectionsBenchmark/Directions results*.json result-summary-directions + python3 ci/summarize-results.py --filter PhasesBenchmark/ComputePhases results*.json result-summary-phases + python3 ci/summarize-results.py --filter SpectrumBenchmark/Spectrum results*.json result-summary-spectrum + python3 ci/summarize-results.py --filter SmearTermsBenchmark/SmearTermBenchmark results*.json result-summary-smearterms +fi \ No newline at end of file diff --git a/src/PointSource.cpp b/src/PointSource.cpp index 4b1be08f09026ad15287420e4edf913de71ed20b..7b71cf1de851da8fbe15d932ac6c29fa999731ab 100644 --- a/src/PointSource.cpp +++ b/src/PointSource.cpp @@ -28,9 +28,8 @@ void PointSource::SetDirection(const Direction &direction) { direction_ = direction; } -void PointSource::ComputeSpectrum( - const xt::xtensor<double, 1> &frequencies, - xt::xtensor<std::complex<double>, 2> &result) const { +void PointSource::ComputeSpectrum(const xt::xtensor<double, 1> &frequencies, + xt::xtensor<double, 3> &result) const { spectrum_.EvaluateCrossCorrelations(frequencies, result); } diff --git a/src/Predict.cpp b/src/Predict.cpp index a05045a49f4583a27e98bba51fb83a1875b7835a..6bb6b912bd360f136b9676c0a8100e0ca58a3f88 100644 --- a/src/Predict.cpp +++ b/src/Predict.cpp @@ -10,29 +10,50 @@ #include <casacore/casa/BasicSL/Constants.h> #include <cmath> +#include <xtensor/xlayout.hpp> +#include <tracy/Tracy.hpp> + +#include "Directions.h" #include "GaussianSourceCollection.h" #include "PointSourceCollection.h" #include "Predict.h" -#include "PredictPlanExec.h" +#include "PredictPlanExecCPU.h" namespace {} // namespace namespace dp3 { namespace base { -void Predict::run(PredictPlanExec &plan, const PointSourceCollection &sources, - xt::xtensor<dcomplex, 3> &buffer) +void Predict::run(PredictPlanExecCPU &plan, + const PointSourceCollection &sources, + xt::xtensor<double, 4, xt::layout_type::row_major> &buffer) const { // buffer dimensions are (nCor, nFreq, nBaselines) plan.Precompute(sources); plan.Compute(sources, buffer); } -void Predict::run(PredictPlanExec &plan, +void Predict::run(PredictPlanExecCPU &plan, const GaussianSourceCollection &sources, - xt::xtensor<dcomplex, 3> &buffer) + xt::xtensor<double, 4, xt::layout_type::row_major> &buffer) const { // buffer dimensions are (nCor, nFreq, nBaselines) plan.Precompute(sources); plan.Compute(sources, buffer); } +void Predict::runWithStrategy( + PredictPlanExecCPU &plan, const PointSourceCollection &sources, + xt::xtensor<double, 4, xt::layout_type::row_major> &buffer, + const computation_strategy strat) { + plan.Precompute(sources); + plan.ComputeWithTarget(sources, buffer, strat); +} + +void Predict::runWithStrategy( + PredictPlanExecCPU &plan, const GaussianSourceCollection &sources, + xt::xtensor<double, 4, xt::layout_type::row_major> &buffer, + const computation_strategy strat) { + plan.Precompute(sources); + plan.ComputeWithTarget(sources, buffer, strat); +} + } // namespace base } // namespace dp3 diff --git a/src/PredictPlanExec.cpp b/src/PredictPlanExec.cpp deleted file mode 100644 index 4b91cef87ceaa30ff94e9489098a4fc8c315713a..0000000000000000000000000000000000000000 --- a/src/PredictPlanExec.cpp +++ /dev/null @@ -1,439 +0,0 @@ -#include "PredictPlanExec.h" - -#include "Directions.h" -#include "GaussianSourceCollection.h" -#include "PointSourceCollection.h" -#include "Spectrum.h" - -#include <xtensor-blas/xlinalg.hpp> -#include <xtensor/xadapt.hpp> -#include <xtensor/xindex_view.hpp> -#include <xtensor/xtensor.hpp> -#include <xtensor/xview.hpp> - -void PredictPlanExec::Initialize() { - // Initialize the station phases and shifts - station_phases.resize({nstations}); - shift_data.resize({nstations, nchannels}); - lmn.resize({nchannels, 3}); - uvw.resize({nstations, 3}); - frequencies.resize({nchannels}); - smear_terms = xt::ones<float>({baselines.size(), nchannels}); -} - -void PredictPlanExec::Precompute(const PointSourceCollection &sources) { - sources.direction_vector.RaDec2Lmn(reference, lmn); - ComputeStationPhases(); - if (correct_frequency_smearing) - ComputeSmearTerms(); -} - -/** - * Compute station phase shifts. - * - * \f[ \mathrm{stationphases}(p) = \frac{2\pi}{c}((u_p, v_p, w_p) \cdot (\ell, - * m, n)) \f] - * - * \f[ \mathrm{phases}(p) = e^{\mathrm{stationphases}(p)} \f] - * - * @param nStation Number of stations - * @param nChannel Number of channels - * @param lmn LMN coordinates of all sources. - * @param uvw Station UVW coordinates, matrix of shape (3, nSt) - * @param freq Channel frequencies, should be length nChannel - * @param shift Output matrix (2 for real,imag), shift per station, matrix of - * shape (3, nSt) - * @param stationPhases Output vector, store per station \f$(x_1,y_1)\f$ - */ -void PredictPlanExec::ComputeStationPhases() { - // Compute station phases - const xt::xtensor<double, 1> lmn_offset = {lmn[0], lmn[1], lmn[2] - 1.0}; - station_phases = xt::sum(uvw * lmn_offset * kCInv, 1); - - xt::xtensor<double, 1> phase_terms; - phase_terms.resize({nchannels}); - - for (size_t st = 0; st < nstations; ++st) { - phase_terms = station_phases(st) * frequencies; - - const auto sin_phase = xt::sin(phase_terms); - const auto cos_phase = xt::cos(phase_terms); - - // Assign to shifts as std::complex - for (size_t ch = 0; ch < nchannels; ++ch) { - shift_data(st, ch) = std::complex<double>(cos_phase(ch), sin_phase(ch)); - } - } -} - -void PredictPlanExec::FillEulerMatrix(xt::xtensor<double, 2> &mat, - const double ra, const double dec) { - double sinra = sin(ra); - double cosra = cos(ra); - double sindec = sin(dec); - double cosdec = cos(dec); - - mat(0, 0) = cosra; - mat(1, 0) = -sinra; - mat(2, 0) = 0; - mat(0, 1) = -sinra * sindec; - mat(1, 1) = -cosra * sindec; - mat(2, 1) = cosdec; - mat(0, 2) = sinra * cosdec; - mat(1, 2) = cosra * cosdec; - mat(2, 2) = sindec; -} - -void PredictPlanExec::Compute(const PointSourceCollection &sources, - xt::xtensor<std::complex<double>, 3> &buffer) { - xt::xtensor<std::complex<double>, 2> computed_spectrum; - - for (size_t source_index = 0; source_index < sources.Size(); ++source_index) { - const Spectrum &spectrum = sources.spectrums[source_index]; - spectrum.EvaluateCrossCorrelations(frequencies, computed_spectrum); - - // Use preallocated storage (outside for loops) - std::vector<double> temp_prod_real_I(nchannels); - std::vector<double> temp_prod_imag_I(nchannels); - - std::vector<double> temp_prod_real_Q; - std::vector<double> temp_prod_imag_Q; - std::vector<double> temp_prod_real_U; - std::vector<double> temp_prod_imag_U; - std::vector<double> temp_prod_real_V; - std::vector<double> temp_prod_imag_V; - - // If needed, preallocate storage for Q,U,V - if (!compute_stokes_I_only) { - temp_prod_real_Q.resize(nchannels); - temp_prod_imag_Q.resize(nchannels); - temp_prod_real_U.resize(nchannels); - temp_prod_imag_U.resize(nchannels); - temp_prod_real_V.resize(nchannels); - temp_prod_imag_V.resize(nchannels); - } - - // Compute visibilities. - // The following loop can be parallelized, but because there is already - // a parallelization over sources, this is not necessary. - // It used to be parallelized with a #pragma omp parallel for, but since - // the outer loop was also parallelized with omp, this had no effect - // (since omp by default doesn't parallelize nested loops). After the change - // to a ThreadPool, this would parallize each separate source, which started - // hundreds of threads on many-core machines. - for (size_t bl = 0; bl < baselines.size(); ++bl) { - const size_t p = baselines[bl].first; - const size_t q = baselines[bl].second; - - if (p != q) { - if (compute_stokes_I_only) { -#pragma GCC ivdep - for (size_t ch = 0; ch < nchannels; ++ch) { - // Note the notation: - // Each complex number is represented as (x + j*y) - // x: real part, y: imaginary part - // Subscripts _p and _q represent stations p and q, respectively - // Subscript _c represents channel (SpectrumBuffer) - const double x_p = (GetShiftData()(p, ch).real()); - const double y_p = (GetShiftData()(p, ch).imag()); - const double x_q = (GetShiftData()(q, ch).real()); - const double y_q = (GetShiftData()(q, ch).imag()); - const double x_c = computed_spectrum(StokesComponent::I, ch).real(); - const double y_c = computed_spectrum(StokesComponent::I, ch).imag(); - - // Compute baseline phase shift. - // Compute visibilities. - const double q_conj_p_real = (x_p) * (x_q) + (y_p) * (y_q); - const double q_conj_p_imag = (x_p) * (y_q) - (x_q) * (y_p); - temp_prod_real_I[ch] = q_conj_p_real * (x_c)-q_conj_p_imag * (y_c); - temp_prod_imag_I[ch] = - q_conj_p_real * (y_c) + q_conj_p_imag * (x_c); - } // Channels. - if (correct_frequency_smearing) { -#pragma GCC ivdep - for (size_t ch = 0; ch < nchannels; ++ch) { - buffer(StokesComponent::I, ch, bl) += - dcomplex(smear_terms(bl, ch) * temp_prod_real_I[ch], - smear_terms(bl, ch) * temp_prod_imag_I[ch]); - } - } else { -#pragma GCC ivdep - for (size_t ch = 0; ch < nchannels; ++ch) { - buffer(StokesComponent::I, ch, bl) += - dcomplex(temp_prod_real_I[ch], temp_prod_imag_I[ch]); - } - } - } else { -#pragma GCC ivdep - for (size_t ch = 0; ch < nchannels; ++ch) { - const double x_p = (GetShiftData()(p, ch).real()); - const double y_p = (GetShiftData()(p, ch).imag()); - const double x_q = (GetShiftData()(q, ch).real()); - const double y_q = (GetShiftData()(q, ch).imag()); - - const double x_c = computed_spectrum(StokesComponent::I, ch).real(); - const double y_c = computed_spectrum(StokesComponent::I, ch).imag(); - const double x_c2 = - computed_spectrum(StokesComponent::Q, ch).real(); - const double y_c2 = - computed_spectrum(StokesComponent::Q, ch).imag(); - const double x_c3 = - computed_spectrum(StokesComponent::U, ch).real(); - const double y_c3 = - computed_spectrum(StokesComponent::U, ch).imag(); - const double x_c4 = - computed_spectrum(StokesComponent::V, ch).real(); - const double y_c4 = - computed_spectrum(StokesComponent::V, ch).imag(); - - // Compute baseline phase shift. - // Compute visibilities. - const double q_conj_p_real = (x_p) * (x_q) + (y_p) * (y_q); - const double q_conj_p_imag = (x_p) * (y_q) - (x_q) * (y_p); - temp_prod_real_I[ch] = q_conj_p_real * (x_c)-q_conj_p_imag * (y_c); - temp_prod_imag_I[ch] = - q_conj_p_real * (y_c) + q_conj_p_imag * (x_c); - temp_prod_real_Q[ch] = - q_conj_p_real * (x_c2)-q_conj_p_imag * (y_c2); - temp_prod_imag_Q[ch] = - q_conj_p_real * (y_c2) + q_conj_p_imag * (x_c2); - temp_prod_real_U[ch] = - q_conj_p_real * (x_c3)-q_conj_p_imag * (y_c3); - temp_prod_imag_U[ch] = - q_conj_p_real * (y_c3) + q_conj_p_imag * (x_c3); - temp_prod_real_V[ch] = - q_conj_p_real * (x_c4)-q_conj_p_imag * (y_c4); - temp_prod_imag_V[ch] = - q_conj_p_real * (y_c4) + q_conj_p_imag * (x_c4); - } // Channels. - - if (correct_frequency_smearing) { -#pragma GCC ivdep - for (size_t ch = 0; ch < nchannels; ++ch) { - buffer(StokesComponent::I, ch, bl) += - dcomplex(smear_terms(bl, ch) * temp_prod_real_I[ch], - smear_terms(bl, ch) * temp_prod_imag_I[ch]); - buffer(StokesComponent::Q, ch, bl) += - dcomplex(smear_terms(bl, ch) * temp_prod_real_Q[ch], - smear_terms(bl, ch) * temp_prod_imag_Q[ch]); - buffer(StokesComponent::U, ch, bl) += - dcomplex(smear_terms(bl, ch) * temp_prod_real_U[ch], - smear_terms(bl, ch) * temp_prod_imag_U[ch]); - buffer(StokesComponent::V, ch, bl) += - dcomplex(smear_terms(bl, ch) * temp_prod_real_V[ch], - smear_terms(bl, ch) * temp_prod_imag_V[ch]); - } - } else { -#pragma GCC ivdep - for (size_t ch = 0; ch < nchannels; ++ch) { - buffer(StokesComponent::I, ch, bl) += - dcomplex(temp_prod_real_I[ch], temp_prod_imag_I[ch]); - buffer(StokesComponent::Q, ch, bl) += - dcomplex(temp_prod_real_Q[ch], temp_prod_imag_Q[ch]); - buffer(StokesComponent::U, ch, bl) += - dcomplex(temp_prod_real_U[ch], temp_prod_imag_U[ch]); - buffer(StokesComponent::V, ch, bl) += - dcomplex(temp_prod_real_V[ch], temp_prod_imag_V[ch]); - } - } - } - } - } // Baselines. - } -} - -void PredictPlanExec::Compute(const GaussianSourceCollection &sources, - xt::xtensor<std::complex<double>, 3> &buffer) { - const std::array<size_t, 2> kStationUvwShape = {get_uvw().shape(1), - get_uvw().shape(0)}; - xt::xtensor<std::complex<double>, 2> computed_spectrum; - xt::xtensor<double, 2> xtStationUvw; - xt::xtensor<double, 2> uvwShifted; - std::vector<double> angular_smear(nchannels); - - for (size_t source_index = 0; source_index < sources.Size(); ++source_index) { - const Spectrum &spectrum = sources.spectrums[source_index]; - const double position_angle = sources.position_angle[source_index]; - const double major_axis = sources.major_axis[source_index]; - const double minor_axis = sources.minor_axis[source_index]; - const bool position_angle_is_absolute = - sources.position_angle_is_absolute[source_index]; - - spectrum.EvaluateCrossCorrelations(frequencies, computed_spectrum); - - // Convert position angle from North over East to the angle used to - // rotate the right-handed UV-plane. - const double kPhi = M_PI_2 + position_angle + M_PI; - const double kCosPhi = cos(kPhi); - const double kSinPhi = sin(kPhi); - - xtStationUvw = xt::adapt(get_uvw().data(), get_uvw().size(), - xt::no_ownership(), kStationUvwShape); - - if (position_angle_is_absolute) { - // Correct for projection and rotation effects: phase shift u, v, w to - // position of the source for evaluating the gaussian - - xt::xtensor<double, 2> euler_matrix_phasecenter = - xt::zeros<double>({3, 3}); - - xt::xtensor<double, 2> euler_matrix_source = xt::zeros<double>({3, 3}); - - const Directions &directions = sources.direction_vector; - - FillEulerMatrix(euler_matrix_phasecenter, reference.ra, reference.dec); - FillEulerMatrix(euler_matrix_source, directions.ra[source_index], - directions.dec[source_index]); - - xt::xtensor<double, 2> euler_matrix = xt::linalg::dot( - xt::transpose(euler_matrix_source), euler_matrix_phasecenter); - uvwShifted = xt::linalg::dot(euler_matrix, xtStationUvw); - - } else { - uvwShifted = xtStationUvw; - } - - // Take care of the conversion of axis lengths from FWHM in radians to - // sigma. - const double kFwhm2Sigma = 1.0 / (2.0 * std::sqrt(2.0 * std::log(2.0))); - const double kUScale = major_axis * kFwhm2Sigma; - const double kVScale = minor_axis * kFwhm2Sigma; - const double kInvCSqr = 1.0 / (casacore::C::c * casacore::C::c); - - for (size_t bl = 0; bl < baselines.size(); ++bl) { - const size_t p = baselines[bl].first; - const size_t q = baselines[bl].second; - - if (p != q) { - double u = uvwShifted(0, q); - double v = uvwShifted(1, q); - - u -= uvwShifted(0, p); - v -= uvwShifted(1, p); - - // Rotate (u, v) by the position angle and scale with the major - // and minor axis lengths (FWHM in rad). - const double uPrime = kUScale * (u * kCosPhi - v * kSinPhi); - const double vPrime = kVScale * (u * kSinPhi + v * kCosPhi); - - // Compute uPrime^2 + vPrime^2 and pre-multiply with -2.0 * PI^2 - // / C^2. - const double uvPrime = - (-2.0 * M_PI * M_PI) * (uPrime * uPrime + vPrime * vPrime); - - for (size_t ch = 0; ch < nchannels; ++ch) { - const double lambda2 = - frequencies(ch) * frequencies(ch) * kInvCSqr * uvPrime; - angular_smear[ch] = exp(lambda2); - } - - if (compute_stokes_I_only) { -#pragma GCC ivdep - for (size_t ch = 0; ch < nchannels; ++ch) { - const double x_p = (GetShiftData()(p, ch).real()); - const double y_p = (GetShiftData()(p, ch).imag()); - const double x_q = (GetShiftData()(q, ch).real()); - const double y_q = (GetShiftData()(q, ch).imag()); - const double x_c = - (computed_spectrum(StokesComponent::I, ch).real()); - const double y_c = - (computed_spectrum(StokesComponent::I, ch).imag()); - - // Compute baseline phase shift. - // Compute visibilities. - const double q_conj_p_real = (x_p) * (x_q) + (y_p) * (y_q); - const double q_conj_p_imag = (x_p) * (y_q) - (x_q) * (y_p); - const double temp_prod_real_I = - q_conj_p_real * (x_c)-q_conj_p_imag * (y_c); - const double temp_prod_imag_I = - q_conj_p_real * (y_c) + q_conj_p_imag * (x_c); - buffer(StokesComponent::I, ch, bl) += dcomplex( - angular_smear[ch] * smear_terms(bl, ch) * temp_prod_real_I, - angular_smear[ch] * smear_terms(bl, ch) * temp_prod_imag_I); - } - } else { -#pragma GCC ivdep - for (size_t ch = 0; ch < nchannels; ++ch) { - const double x_p = (GetShiftData()(p, ch).real()); - const double y_p = (GetShiftData()(p, ch).imag()); - const double x_q = (GetShiftData()(q, ch).real()); - const double y_q = (GetShiftData()(q, ch).imag()); - const double x_c = - (computed_spectrum(StokesComponent::I, ch).real()); - const double y_c = - (computed_spectrum(StokesComponent::I, ch).imag()); - const double x_c2 = - (computed_spectrum(StokesComponent::Q, ch).real()); - const double y_c2 = - (computed_spectrum(StokesComponent::Q, ch).imag()); - const double x_c3 = - (computed_spectrum(StokesComponent::U, ch).real()); - const double y_c3 = - (computed_spectrum(StokesComponent::U, ch).imag()); - const double x_c4 = - (computed_spectrum(StokesComponent::V, ch).real()); - const double y_c4 = - (computed_spectrum(StokesComponent::V, ch).imag()); - - // Compute baseline phase shift. - // Compute visibilities. - const double q_conj_p_real = (x_p) * (x_q) + (y_p) * (y_q); - const double q_conj_p_imag = (x_p) * (y_q) - (x_q) * (y_p); - const double temp_prod_real_I = - q_conj_p_real * (x_c)-q_conj_p_imag * (y_c); - const double temp_prod_imag_I = - q_conj_p_real * (y_c) + q_conj_p_imag * (x_c); - const double temp_prod_real_Q = - q_conj_p_real * (x_c2)-q_conj_p_imag * (y_c2); - const double temp_prod_imag_Q = - q_conj_p_real * (y_c2) + q_conj_p_imag * (x_c2); - const double temp_prod_real_U = - q_conj_p_real * (x_c3)-q_conj_p_imag * (y_c3); - const double temp_prod_imag_U = - q_conj_p_real * (y_c3) + q_conj_p_imag * (x_c3); - const double temp_prod_real_V = - q_conj_p_real * (x_c4)-q_conj_p_imag * (y_c4); - const double temp_prod_imag_V = - q_conj_p_real * (y_c4) + q_conj_p_imag * (x_c4); - - // The following operations are memory-bound, unlike - // the compute-bound segment above. By merging these together, - // an improvement in speed is expected - buffer(StokesComponent::I, ch, bl) += dcomplex( - angular_smear[ch] * smear_terms(bl, ch) * temp_prod_real_I, - angular_smear[ch] * smear_terms(bl, ch) * temp_prod_imag_I); - buffer(StokesComponent::Q, ch, bl) += dcomplex( - angular_smear[ch] * smear_terms(bl, ch) * temp_prod_real_Q, - angular_smear[ch] * smear_terms(bl, ch) * temp_prod_imag_Q); - buffer(StokesComponent::U, ch, bl) += dcomplex( - angular_smear[ch] * smear_terms(bl, ch) * temp_prod_real_U, - angular_smear[ch] * smear_terms(bl, ch) * temp_prod_imag_U); - buffer(StokesComponent::V, ch, bl) += dcomplex( - angular_smear[ch] * smear_terms(bl, ch) * temp_prod_real_V, - angular_smear[ch] * smear_terms(bl, ch) * temp_prod_imag_V); - } - } - } - } // Baselines. - } -} -void PredictPlanExec::ComputeSmearTerms() { - // Compute the smear terms - size_t p = 0UL; - size_t q = 0UL; - std::vector<float> channel_floats(channel_widths.size()); - std::copy(channel_widths.begin(), channel_widths.end(), - channel_floats.begin()); - for (size_t bl = 0; bl < baselines.size(); ++bl) { - std::tie(p, q) = baselines[bl]; - const float phase_diff = - static_cast<float>(station_phases[q] - station_phases[p]); - if (abs(phase_diff) > 1.e-6) [[likely]] { -#pragma omp simd - for (size_t ch = 0; ch < nchannels; ch++) { - const float shift = phase_diff * channel_floats[ch] * 0.5f; - smear_terms(bl, ch) = std::fabs(sinf(shift) / shift); - } - } - } -} \ No newline at end of file diff --git a/src/PredictPlanExecCPU.cpp b/src/PredictPlanExecCPU.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f41d074b99e3d5f155b1d3fbc9aa07a27c81673e --- /dev/null +++ b/src/PredictPlanExecCPU.cpp @@ -0,0 +1,723 @@ +#include "PredictPlanExecCPU.h" + +#include "Directions.h" +#include "GaussianSourceCollection.h" +#include "PointSourceCollection.h" +#include "PredictPlan.h" +#include "Spectrum.h" + +#include <tracy/Tracy.hpp> + +#include <cstdint> +#include <immintrin.h> + +#include <iomanip> +#include <iostream> +#include <thread> +#include <xtensor-blas/xlinalg.hpp> +#include <xtensor/xadapt.hpp> +#include <xtensor/xindex_view.hpp> +#include <xtensor/xlayout.hpp> +#include <xtensor/xtensor.hpp> +#include <xtensor/xtensor_forward.hpp> +#include <xtensor/xview.hpp> + +void PredictPlanExecCPU::Initialize() { + // Initialize the station phases and shifts + station_phases.resize({nstations}); + shift_data.resize({2, nstations, nchannels}); + lmn.resize({nchannels, 3}); + uvw.resize({nstations, 3}); + frequencies.resize({nchannels}); + + smear_terms = xt::ones<float>({baselines.size(), nchannels}); +} + +void PredictPlanExecCPU::Precompute(const PointSourceCollection &sources) { + FrameMark; + { + // ZoneScoped; // Mark the scope with Tracy + + sources.direction_vector.RaDec2Lmn(reference, lmn); + ComputeStationPhases(); + + // Precompute smearing factors if needed + if (correct_frequency_smearing) { + ComputeSmearTerms(); + } + } +} + +/** + * Compute station phase shifts. + * + * \f[ \mathrm{stationphases}(p) = \frac{2\pi}{c}((u_p, v_p, w_p) \cdot (\ell, + * m, n)) \f] + * + * \f[ \mathrm{phases}(p) = e^{\mathrm{stationphases}(p)} \f] + * + * @param nStation Number of stations + * @param nChannel Number of channels + * @param lmn LMN coordinates of all sources. + * @param uvw Station UVW coordinates, matrix of shape (3, nSt) + * @param freq Channel frequencies, should be length nChannel + * @param shift Output matrix (2 for real,imag), shift per station, matrix of + * shape (3, nSt) + * @param stationPhases Output vector, store per station \f$(x_1,y_1)\f$ + */ +void PredictPlanExecCPU::ComputeStationPhases() { + // Compute station phases + const xt::xtensor<double, 1> lmn_offset = {lmn[0], lmn[1], lmn[2] - 1.0}; + station_phases = xt::sum(uvw * lmn_offset * kCInv, 1); + + xt::xtensor<double, 1> phase_terms; + phase_terms.resize({nchannels}); + + for (size_t st = 0; st < nstations; ++st) { + phase_terms = station_phases(st) * frequencies; + + const auto sin_phase = xt::sin(phase_terms); + const auto cos_phase = xt::cos(phase_terms); + + // Assign to shifts as std::complex + for (size_t ch = 0; ch < nchannels; ++ch) { + shift_data(0, st, ch) = cos_phase(ch); + shift_data(1, st, ch) = sin_phase(ch); + } + } +} + +void PredictPlanExecCPU::FillEulerMatrix(xt::xtensor<double, 2> &mat, + const double ra, const double dec) { + double sinra = sin(ra); + double cosra = cos(ra); + double sindec = sin(dec); + double cosdec = cos(dec); + + mat(0, 0) = cosra; + mat(1, 0) = -sinra; + mat(2, 0) = 0; + mat(0, 1) = -sinra * sindec; + mat(1, 1) = -cosra * sindec; + mat(2, 1) = cosdec; + mat(0, 2) = sinra * cosdec; + mat(1, 2) = cosra * cosdec; + mat(2, 2) = sindec; +} + +template <class E, class S> +void PredictPlanExecCPU::ProcessPolarizationComponentSingle( + E &stoke_buffer, const S &stoke_spectrum) { + // Use preallocated storage (outside for loops) + std::vector<double> temp_prod_real(nchannels); + std::vector<double> temp_prod_imag(nchannels); + + const auto &shift_data = GetShiftData(); // Cache reference + + for (size_t bl = 0; bl < baselines.size(); ++bl) { + const size_t p = baselines[bl].first; + const size_t q = baselines[bl].second; + + // Compute visibilities. + // The following loop can be parallelized, but because there is already + // a parallelization over sources, this is not necessary. + // It used to be parallelized with a #pragma omp parallel for, but since + // the outer loop was also parallelized with omp, this had no effect + // (since omp by default doesn't parallelize nested loops). After the + // change to a ThreadPool, this would parallize each separate source, + // which started hundreds of threads on many-core machines. + + for (size_t ch = 0; ch < nchannels; ++ch) { + // Note the notation: + // Each complex number is represented as (x + j*y) + // x: real part, y: imaginary part + // Subscripts _p and _q represent stations p and q, respectively + // Subscript _c represents channel (SpectrumBuffer) + const double x_p = (shift_data(0, p, ch)); + const double y_p = (shift_data(1, p, ch)); + const double x_q = (shift_data(0, q, ch)); + const double y_q = (shift_data(1, q, ch)); + const double x_c = stoke_spectrum(0, ch); + const double y_c = stoke_spectrum(1, ch); + + // Compute baseline phase shift. + // Compute visibilities. + const double q_conj_p_real = (x_p) * (x_q) + (y_p) * (y_q); + const double q_conj_p_imag = (x_p) * (y_q) - (x_q) * (y_p); + temp_prod_real[ch] = q_conj_p_real * (x_c)-q_conj_p_imag * (y_c); + temp_prod_imag[ch] = q_conj_p_real * (y_c) + q_conj_p_imag * (x_c); + } // Channels. + if (correct_frequency_smearing) { + for (size_t ch = 0; ch < nchannels; ++ch) { + const double smear_term = smear_terms(bl, ch); + stoke_buffer(0, bl, ch) += smear_term * temp_prod_real[ch]; + stoke_buffer(1, bl, ch) += smear_term * temp_prod_imag[ch]; + } + } else { + for (size_t ch = 0; ch < nchannels; ++ch) { + stoke_buffer(0, bl, ch) += temp_prod_real[ch]; + stoke_buffer(1, bl, ch) += temp_prod_imag[ch]; + } + } + } +} + +inline void store_and_add(double *const buffer, const __m256d &new_values) { + // Load existing values from buffer + __m256d existing = _mm256_loadu_pd(buffer); + __m256d result = _mm256_add_pd(existing, new_values); + _mm256_storeu_pd(buffer, result); +} + +template <class E, class S, class Tag, class Arch> +void PredictPlanExecCPU::XTensorDispatch::operator()(Arch, E &buffer_expr, + const S &spectrum, + Tag) const { + using b_type = xsimd::batch<double, Arch>; + size_t inc = b_type::size; + size_t size = parent.nchannels; + // size for which the vectorization is possible + size_t vec_size = size - size % inc; + + const auto &shift_data = parent.GetShiftData(); // Cache reference + + const double *const spectrum_real_ptr = &(spectrum.unchecked(0, 0)); + const double *const spectrum_imag_ptr = &(spectrum.unchecked(1, 0)); + + for (size_t bl = 0; bl < parent.baselines.size(); ++bl) { + double *const output_real_ptr = &(buffer_expr.unchecked(0, bl, 0)); + double *const output_imag_ptr = &(buffer_expr.unchecked(1, bl, 0)); + + for (size_t i = 0; i < vec_size; i += inc) { + const b_type spectrum_real = b_type::load(spectrum_real_ptr + i, Tag()); + const b_type spectrum_imag = b_type::load(spectrum_imag_ptr + i, Tag()); + + const size_t p = parent.baselines[bl].first; + const size_t q = parent.baselines[bl].second; + + // Load shift data for p and q + const b_type x_p = b_type::load(&shift_data.unchecked(0, p, i), Tag()); + const b_type y_p = b_type::load(&shift_data.unchecked(1, p, i), Tag()); + const b_type x_q = b_type::load(&shift_data.unchecked(0, q, i), Tag()); + const b_type y_q = b_type::load(&shift_data.unchecked(1, q, i), Tag()); + + const b_type real_part = + xsimd::add(xsimd::mul(x_p, x_q), xsimd::mul(y_p, y_q)); + const b_type imag_part = + xsimd::sub(xsimd::mul(x_p, y_q), xsimd::mul(x_q, y_p)); + + // Compute the product with the spectrum + const b_type temp_real = xsimd::sub(xsimd::mul(real_part, spectrum_real), + xsimd::mul(imag_part, spectrum_imag)); + const b_type temp_imag = xsimd::add(xsimd::mul(real_part, spectrum_imag), + xsimd::mul(imag_part, spectrum_real)); + const b_type buffer_real = b_type::load(output_real_ptr + i, Tag()); + const b_type buffer_imag = b_type::load(output_imag_ptr + i, Tag()); + + if (parent.correct_frequency_smearing) { + // Apply smearing terms + const b_type smear_terms_temp = + b_type::load(&parent.smear_terms.unchecked(bl, i), Tag()); + + xsimd::store( + output_real_ptr + i, + xsimd::add(xsimd::mul(temp_real, smear_terms_temp), buffer_real), + Tag()); + + xsimd::store( + output_imag_ptr + i, + xsimd::add(xsimd::mul(temp_imag, smear_terms_temp), buffer_imag), + Tag()); + } else { + xsimd::store(output_real_ptr + i, xsimd::add(temp_real, buffer_real), + Tag()); + + xsimd::store(output_imag_ptr + i, xsimd::add(temp_imag, buffer_imag), + Tag()); + } + } + } + + // Remaining part that cannot be vectorized + for (size_t i = vec_size; i < size; ++i) { + for (size_t bl = 0; bl < parent.baselines.size(); ++bl) { + const size_t p = parent.baselines[bl].first; + const size_t q = parent.baselines[bl].second; + + // Load shift data for p and q + const double x_p = shift_data(0, p, i); + const double y_p = shift_data(1, p, i); + const double x_q = shift_data(0, q, i); + const double y_q = shift_data(1, q, i); + + const double real_part = x_p * x_q + y_p * y_q; + const double imag_part = x_p * y_q - x_q * y_p; + + const double temp_real = + real_part * spectrum(0, i) - imag_part * spectrum(1, i); + const double temp_imag = + real_part * spectrum(1, i) + imag_part * spectrum(0, i); + const double buffer_real = buffer_expr(0, bl, i); + const double buffer_imag = buffer_expr(1, bl, i); + + if (parent.correct_frequency_smearing) { + // Apply smearing terms + const double smear_terms_temp = parent.smear_terms(bl, i); + + buffer_expr(0, bl, i) = temp_real * smear_terms_temp + buffer_real; + buffer_expr(1, bl, i) = temp_imag * smear_terms_temp + buffer_imag; + } else { + buffer_expr(0, bl, i) = temp_real + buffer_real; + buffer_expr(1, bl, i) = temp_imag + buffer_imag; + } + } + } +} + +template <class E, class S> +void PredictPlanExecCPU::ProcessPolarizationComponentXtensor( + E &buffer_expr, const S &spectrum) { + xsimd::dispatch(xtensor_dispatch_)(buffer_expr, spectrum, + xsimd::unaligned_mode()); +} + +template <class E, class S> +void PredictPlanExecCPU::ProcessPolarizationComponentMultiSimd( + E &buffer_expr, const S &spectrum) { + // Precompute number of channels that can be processed in parallel + const uint_fast32_t simd_channels = (nchannels / 4) * 4; + const uint_fast32_t total_vector_size = simd_channels * 4; + + // std::vector<double> channel_accum(simd_channels, 0.0); + + for (uint_fast32_t bl = 0; bl < baselines.size(); ++bl) { + const size_t p = baselines[bl].first; + const size_t q = baselines[bl].second; + + double real_accum = 0.0, imag_accum = 0.0; + + const auto &shift_data = GetShiftData(); // Cache reference + + uint_fast32_t ch = 0; + + const double *const smear_terms_ptr = &(smear_terms.unchecked(bl, 0)); + const double *const spectrum_real_ptr = &(spectrum.unchecked(0, 0)); + const double *const spectrum_imag_ptr = &(spectrum.unchecked(1, 0)); + double *const output_real_ptr = &(buffer_expr.unchecked(0, bl, 0)); + double *const output_imag_ptr = &(buffer_expr.unchecked(1, bl, 0)); + + if (correct_frequency_smearing) { + for (; ch < simd_channels; ch += 4) { + // Load 4 values from shift_data for p and q + __m256d x_p = _mm256_loadu_pd(&shift_data.unchecked(0, p, ch)); + __m256d y_p = _mm256_loadu_pd(&shift_data.unchecked(1, p, ch)); + __m256d x_q = _mm256_loadu_pd(&shift_data.unchecked(0, q, ch)); + __m256d y_q = _mm256_loadu_pd(&shift_data.unchecked(1, q, ch)); + __m256d x_c = _mm256_loadu_pd(spectrum_real_ptr + ch); + __m256d y_c = _mm256_loadu_pd(spectrum_imag_ptr + ch); + + // Complex multiply (q_conj * p) * stokes + // FIXME: maybe used a fused multiply add + __m256d real_part = + _mm256_add_pd(_mm256_mul_pd(x_p, x_q), _mm256_mul_pd(y_p, y_q)); + __m256d imag_part = + _mm256_sub_pd(_mm256_mul_pd(x_p, y_q), _mm256_mul_pd(x_q, y_p)); + + __m256d temp_real = _mm256_sub_pd(_mm256_mul_pd(real_part, x_c), + _mm256_mul_pd(imag_part, y_c)); + __m256d temp_imag = _mm256_add_pd(_mm256_mul_pd(real_part, y_c), + _mm256_mul_pd(imag_part, x_c)); + + __m256d smear_terms_temp = _mm256_loadu_pd(smear_terms_ptr + ch); + temp_real = _mm256_mul_pd(temp_real, smear_terms_temp); + temp_imag = _mm256_mul_pd(temp_imag, smear_terms_temp); + + store_and_add(output_real_ptr + ch, temp_real); + store_and_add(output_imag_ptr + ch, temp_imag); + } + } else { + for (; ch < simd_channels; ch += 4) { + // Load 4 values from shift_data for p and q + __m256d x_p = _mm256_loadu_pd(&shift_data.unchecked(0, p, ch)); + __m256d y_p = _mm256_loadu_pd(&shift_data.unchecked(1, p, ch)); + __m256d x_q = _mm256_loadu_pd(&shift_data.unchecked(0, q, ch)); + __m256d y_q = _mm256_loadu_pd(&shift_data.unchecked(1, q, ch)); + __m256d x_c = _mm256_loadu_pd(spectrum_real_ptr + ch); + __m256d y_c = _mm256_loadu_pd(spectrum_imag_ptr + ch); + + // Complex multiply (q_conj * p) * stokes + __m256d real_part = + _mm256_add_pd(_mm256_mul_pd(x_p, x_q), _mm256_mul_pd(y_p, y_q)); + __m256d imag_part = + _mm256_sub_pd(_mm256_mul_pd(x_p, y_q), _mm256_mul_pd(x_q, y_p)); + + __m256d temp_real = _mm256_sub_pd(_mm256_mul_pd(real_part, x_c), + _mm256_mul_pd(imag_part, y_c)); + __m256d temp_imag = _mm256_add_pd(_mm256_mul_pd(real_part, y_c), + _mm256_mul_pd(imag_part, x_c)); + + store_and_add(output_real_ptr + ch, temp_real); + store_and_add(output_imag_ptr + ch, temp_imag); + } + } + + for (; ch < nchannels; ++ch) { + const double x_p = (shift_data(0, p, ch)); + const double y_p = (shift_data(1, p, ch)); + const double x_q = (shift_data(0, q, ch)); + const double y_q = (shift_data(1, q, ch)); + const double x_c = spectrum(0, ch); + const double y_c = spectrum(1, ch); + + // Compute baseline phase shift. + // Compute visibilities. + const double q_conj_p_real = (x_p) * (x_q) + (y_p) * (y_q); + const double q_conj_p_imag = (x_p) * (y_q) - (x_q) * (y_p); + + const double temp_prod_real = q_conj_p_real * (x_c)-q_conj_p_imag * (y_c); + const double temp_prod_imag = + q_conj_p_real * (y_c) + q_conj_p_imag * (x_c); + + if (correct_frequency_smearing) { + const double smear_term = smear_terms(bl, ch); + // buffer_expr(bl, ch) += + // dcomplex{temp_prod_real * smear_term, temp_prod_imag * + // smear_term}; + buffer_expr(0, bl, ch) += (temp_prod_real * smear_term); + buffer_expr(1, bl, ch) += (temp_prod_imag * smear_term); + + } else { + // buffer_expr(bl, ch) += dcomplex{temp_prod_real, temp_prod_imag}; + buffer_expr(0, bl, ch) += temp_prod_real; + buffer_expr(1, bl, ch) += temp_prod_imag; + } + } + } +} + +void PredictPlanExecCPU::Compute( + const PointSourceCollection &sources, + xt::xtensor<double, 4, xt::layout_type::row_major> &buffer) { + xt::xtensor<double, 3> computed_spectrum({2, 4, frequencies.size()}); + + const auto spectrum_XX = xt::view(computed_spectrum, xt::all(), 0, xt::all()); + const auto spectrum_XY = xt::view(computed_spectrum, xt::all(), 1, xt::all()); + const auto spectrum_YX = xt::view(computed_spectrum, xt::all(), 2, xt::all()); + const auto spectrum_YY = xt::view(computed_spectrum, xt::all(), 3, xt::all()); + + auto buffer_XX = xt::view(buffer, xt::all(), 0, xt::all(), xt::all()); + auto buffer_XY = xt::view(buffer, xt::all(), 1, xt::all(), xt::all()); + auto buffer_YX = xt::view(buffer, xt::all(), 2, xt::all(), xt::all()); + auto buffer_YY = xt::view(buffer, xt::all(), 3, xt::all(), xt::all()); + + for (size_t source_index = 0; source_index < sources.Size(); ++source_index) { + const Spectrum &spectrum = sources.spectrums[source_index]; + spectrum.EvaluateCrossCorrelations(frequencies, computed_spectrum, false); + + if (compute_stokes_I_only) { + ProcessPolarizationComponent(buffer_XX, spectrum_XX); + } else { + ProcessPolarizationComponent(buffer_XX, spectrum_XX); + ProcessPolarizationComponent(buffer_XY, spectrum_XY); + ProcessPolarizationComponent(buffer_YX, spectrum_YX); + ProcessPolarizationComponent(buffer_YY, spectrum_YY); + } + + } // Baselines. +} + +void PredictPlanExecCPU::ComputeWithTarget( + const PointSourceCollection &sources, + xt::xtensor<double, 4, xt::layout_type::row_major> &buffer, + const computation_strategy strat) { + xt::xtensor<double, 3> computed_spectrum({2, 4, frequencies.size()}); + + const auto spectrum_XX = xt::view(computed_spectrum, xt::all(), 0, xt::all()); + const auto spectrum_XY = xt::view(computed_spectrum, xt::all(), 1, xt::all()); + const auto spectrum_YX = xt::view(computed_spectrum, xt::all(), 2, xt::all()); + const auto spectrum_YY = xt::view(computed_spectrum, xt::all(), 3, xt::all()); + + auto buffer_XX = xt::view(buffer, xt::all(), 0, xt::all(), xt::all()); + auto buffer_XY = xt::view(buffer, xt::all(), 1, xt::all(), xt::all()); + auto buffer_YX = xt::view(buffer, xt::all(), 2, xt::all(), xt::all()); + auto buffer_YY = xt::view(buffer, xt::all(), 3, xt::all(), xt::all()); + + for (size_t source_index = 0; source_index < sources.Size(); ++source_index) { + const Spectrum &spectrum = sources.spectrums[source_index]; + spectrum.EvaluateCrossCorrelations(frequencies, computed_spectrum, false); + + if (compute_stokes_I_only) { + ProcessPolarizationComponent(strat, buffer_XX, spectrum_XX); + } else { + ProcessPolarizationComponent(strat, buffer_XX, spectrum_XX); + ProcessPolarizationComponent(strat, buffer_XY, spectrum_XY); + ProcessPolarizationComponent(strat, buffer_YX, spectrum_YX); + ProcessPolarizationComponent(strat, buffer_YY, spectrum_YY); + } + + } // Baselines. +} + +void PredictPlanExecCPU::Compute( + const GaussianSourceCollection &sources, + xt::xtensor<double, 4, xt::layout_type::row_major> &buffer) { + const std::array<size_t, 2> kStationUvwShape = {GetUvw().shape(1), + GetUvw().shape(0)}; + xt::xtensor<double, 3> computed_spectrum; + xt::xtensor<double, 2> xtStationUvw; + xt::xtensor<double, 2> uvwShifted; + std::vector<double> angular_smear(nchannels); + + for (size_t source_index = 0; source_index < sources.Size(); ++source_index) { + const Spectrum &spectrum = sources.spectrums[source_index]; + const double position_angle = sources.position_angle[source_index]; + const double major_axis = sources.major_axis[source_index]; + const double minor_axis = sources.minor_axis[source_index]; + const bool position_angle_is_absolute = + sources.position_angle_is_absolute[source_index]; + + spectrum.EvaluateCrossCorrelations(frequencies, computed_spectrum); + + // Convert position angle from North over East to the angle used to + // rotate the right-handed UV-plane. + const double kPhi = M_PI_2 + position_angle + M_PI; + const double kCosPhi = cos(kPhi); + const double kSinPhi = sin(kPhi); + + xtStationUvw = xt::adapt(GetUvw().data(), GetUvw().size(), + xt::no_ownership(), kStationUvwShape); + + const auto &shift_data = GetShiftData(); // Cache reference + + if (position_angle_is_absolute) { + // Correct for projection and rotation effects: phase shift u, v, w to + // position of the source for evaluating the gaussian + + xt::xtensor<double, 2> euler_matrix_phasecenter = + xt::zeros<double>({3, 3}); + + xt::xtensor<double, 2> euler_matrix_source = xt::zeros<double>({3, 3}); + + const Directions &directions = sources.direction_vector; + + FillEulerMatrix(euler_matrix_phasecenter, reference.ra, reference.dec); + FillEulerMatrix(euler_matrix_source, directions.ra[source_index], + directions.dec[source_index]); + + xt::xtensor<double, 2> euler_matrix = xt::linalg::dot( + xt::transpose(euler_matrix_source), euler_matrix_phasecenter); + uvwShifted = xt::linalg::dot(euler_matrix, xtStationUvw); + + } else { + uvwShifted = xtStationUvw; + } + + // Take care of the conversion of axis lengths from FWHM in radians to + // sigma. + const double kFwhm2Sigma = 1.0 / (2.0 * std::sqrt(2.0 * std::log(2.0))); + const double kUScale = major_axis * kFwhm2Sigma; + const double kVScale = minor_axis * kFwhm2Sigma; + const double kInvCSqr = 1.0 / (casacore::C::c * casacore::C::c); + + for (size_t bl = 0; bl < baselines.size(); ++bl) { + const size_t p = baselines[bl].first; + const size_t q = baselines[bl].second; + + if (p != q) { + double u = uvwShifted(0, q); + double v = uvwShifted(1, q); + + u -= uvwShifted(0, p); + v -= uvwShifted(1, p); + + // Rotate (u, v) by the position angle and scale with the major + // and minor axis lengths (FWHM in rad). + const double uPrime = kUScale * (u * kCosPhi - v * kSinPhi); + const double vPrime = kVScale * (u * kSinPhi + v * kCosPhi); + + // Compute uPrime^2 + vPrime^2 and pre-multiply with -2.0 * PI^2 + // / C^2. + const double uvPrime = + (-2.0 * M_PI * M_PI) * (uPrime * uPrime + vPrime * vPrime); + + for (size_t ch = 0; ch < nchannels; ++ch) { + const double lambda2 = + frequencies(ch) * frequencies(ch) * kInvCSqr * uvPrime; + angular_smear[ch] = exp(lambda2); + } + + if (compute_stokes_I_only) { +#pragma GCC ivdep + for (size_t ch = 0; ch < nchannels; ++ch) { + const double x_p = (shift_data(0, p, ch)); + const double y_p = (shift_data(1, p, ch)); + const double x_q = (shift_data(0, q, ch)); + const double y_q = (shift_data(1, q, ch)); + const double x_c = (computed_spectrum(0, StokesComponent::I, ch)); + const double y_c = (computed_spectrum(1, StokesComponent::I, ch)); + + // Compute baseline phase shift. + // Compute visibilities. + const double q_conj_p_real = (x_p) * (x_q) + (y_p) * (y_q); + const double q_conj_p_imag = (x_p) * (y_q) - (x_q) * (y_p); + const double temp_prod_real_I = + q_conj_p_real * (x_c)-q_conj_p_imag * (y_c); + const double temp_prod_imag_I = + q_conj_p_real * (y_c) + q_conj_p_imag * (x_c); + + const double channel_angular_smear = angular_smear[ch]; + + buffer(0, CrossCorrelation::XX, bl, ch) += + channel_angular_smear * smear_terms(bl, ch) * temp_prod_real_I; + buffer(1, CrossCorrelation::XX, bl, ch) += + channel_angular_smear * smear_terms(bl, ch) * temp_prod_imag_I; + } + } else { +#pragma GCC ivdep + for (size_t ch = 0; ch < nchannels; ++ch) { + const double x_p = (shift_data(0, p, ch)); + const double y_p = (shift_data(1, p, ch)); + const double x_q = (shift_data(0, q, ch)); + const double y_q = (shift_data(1, q, ch)); + const double x_c = (computed_spectrum(0, CrossCorrelation::XX, ch)); + const double y_c = (computed_spectrum(1, CrossCorrelation::XX, ch)); + const double x_c2 = + (computed_spectrum(0, CrossCorrelation::XY, ch)); + const double y_c2 = + (computed_spectrum(1, CrossCorrelation::XY, ch)); + const double x_c3 = + (computed_spectrum(0, CrossCorrelation::YX, ch)); + const double y_c3 = + (computed_spectrum(1, CrossCorrelation::YX, ch)); + const double x_c4 = + (computed_spectrum(0, CrossCorrelation::YY, ch)); + const double y_c4 = + (computed_spectrum(1, CrossCorrelation::YY, ch)); + + // Compute baseline phase shift. + // Compute visibilities. + const double q_conj_p_real = (x_p) * (x_q) + (y_p) * (y_q); + const double q_conj_p_imag = (x_p) * (y_q) - (x_q) * (y_p); + const double temp_prod_real_I = + q_conj_p_real * (x_c)-q_conj_p_imag * (y_c); + const double temp_prod_imag_I = + q_conj_p_real * (y_c) + q_conj_p_imag * (x_c); + const double temp_prod_real_Q = + q_conj_p_real * (x_c2)-q_conj_p_imag * (y_c2); + const double temp_prod_imag_Q = + q_conj_p_real * (y_c2) + q_conj_p_imag * (x_c2); + const double temp_prod_real_U = + q_conj_p_real * (x_c3)-q_conj_p_imag * (y_c3); + const double temp_prod_imag_U = + q_conj_p_real * (y_c3) + q_conj_p_imag * (x_c3); + const double temp_prod_real_V = + q_conj_p_real * (x_c4)-q_conj_p_imag * (y_c4); + const double temp_prod_imag_V = + q_conj_p_real * (y_c4) + q_conj_p_imag * (x_c4); + + // The following operations are memory-bound, unlike + // the compute-bound segment above. By merging these together, + // an improvement in speed is expected + const double channel_angular_smear = angular_smear[ch]; + + buffer(0, CrossCorrelation::XX, bl, ch) += + channel_angular_smear * smear_terms(bl, ch) * temp_prod_real_I; + buffer(1, CrossCorrelation::XX, bl, ch) += + channel_angular_smear * smear_terms(bl, ch) * temp_prod_imag_I; + + buffer(0, CrossCorrelation::XY, bl, ch) += + channel_angular_smear * smear_terms(bl, ch) * temp_prod_real_Q; + buffer(1, CrossCorrelation::XY, bl, ch) += + channel_angular_smear * smear_terms(bl, ch) * temp_prod_imag_Q; + + buffer(0, CrossCorrelation::YX, bl, ch) += + channel_angular_smear * smear_terms(bl, ch) * temp_prod_real_U; + buffer(1, CrossCorrelation::YX, bl, ch) += + channel_angular_smear * smear_terms(bl, ch) * temp_prod_imag_U; + + buffer(0, CrossCorrelation::YY, bl, ch) += + channel_angular_smear * smear_terms(bl, ch) * temp_prod_real_V; + buffer(1, CrossCorrelation::YY, bl, ch) += + channel_angular_smear * smear_terms(bl, ch) * temp_prod_imag_V; + } + } + } + } // Baselines. + } +} + +void PredictPlanExecCPU::ComputeSmearTermsXtensor() { + // Compute the smear terms + size_t p = 0UL; + size_t q = 0UL; + std::vector<float> channel_floats(channel_widths.size()); + std::copy(channel_widths.begin(), channel_widths.end(), + channel_floats.begin()); + + for (size_t bl = 0; bl < baselines.size(); ++bl) { + std::tie(p, q) = baselines[bl]; + const float phase_diff = + static_cast<float>(station_phases[q] - station_phases[p]); + double *const smear_terms_ptr = &(smear_terms.unchecked(bl, 0)); + + if (abs(phase_diff) > 1.e-6) [[likely]] { +#pragma omp simd + for (size_t ch = 0; ch < nchannels; ++ch) { + const float shift = phase_diff * channel_floats[ch] * 0.5f; + *(smear_terms_ptr + ch) = std::fabs(sinf(shift) / shift); + } + } + } +} + +void PredictPlanExecCPU::ComputeSmearTermsSIMD() { + // Compute the smear terms + size_t p = 0UL; + size_t q = 0UL; + std::vector<double> channel_floats(channel_widths.size()); + std::copy(channel_widths.begin(), channel_widths.end(), + channel_floats.begin()); + + const size_t simd_channels = (nchannels / 4) * 4; + + for (size_t bl = 0; bl < baselines.size(); ++bl) { + std::tie(p, q) = baselines[bl]; + const double phase_diff = + static_cast<double>(station_phases[q] - station_phases[p]); + double *const smear_terms_ptr = &(smear_terms.unchecked(bl, 0)); + const double *const channel_floats_ptr = channel_floats.data(); + + if (abs(phase_diff) > 1.e-6) [[likely]] { + size_t ch = 0; + + __m256d muliplier_vec = _mm256_set1_pd(0.5); + __m256d phase_diff_vec = _mm256_set1_pd(phase_diff); + + for (; ch < simd_channels; ch += 4) { + __m256d channel_vec = _mm256_loadu_pd(channel_floats_ptr + ch); + __m256d shift_vec = _mm256_mul_pd( + muliplier_vec, _mm256_mul_pd(phase_diff_vec, channel_vec)); + // __m256d fabs_vec = _mm256_abs_pd(div_vec); + _mm256_storeu_pd(smear_terms_ptr + ch, shift_vec); + + const double val = *(smear_terms_ptr + ch + 0); + const double val1 = *(smear_terms_ptr + ch + 1); + const double val2 = *(smear_terms_ptr + ch + 2); + const double val3 = *(smear_terms_ptr + ch + 3); + *(smear_terms_ptr + ch + 0) = std::abs(std::sin(val)) / val; + + *(smear_terms_ptr + ch + 1) = std::abs(std::sin(val1)) / val1; + + *(smear_terms_ptr + ch + 2) = std::abs(std::sin(val2)) / val2; + + *(smear_terms_ptr + ch + 3) = std::abs(std::sin(val3)) / val3; + } + + for (; ch < nchannels; ++ch) { + const float shift = + static_cast<float>(phase_diff * channel_floats[ch] * 0.5); + *(smear_terms_ptr + ch) = std::fabs(sinf(shift) / shift); + } + } + } +} diff --git a/tests/test_predictplanexec.cpp b/tests/test_predictplanexeccpu.cpp similarity index 91% rename from tests/test_predictplanexec.cpp rename to tests/test_predictplanexeccpu.cpp index ea055206506995b207acd86ea7fe74a623c9dcf3..8f129dddb94898f07ae1f59d8c62fe1ac0b5ffa9 100644 --- a/tests/test_predictplanexec.cpp +++ b/tests/test_predictplanexeccpu.cpp @@ -3,13 +3,13 @@ // SPDX-License-Identifier: Apache2.0 #define BOOST_TEST_MODULE PREDICT_PLAN_EXEC_TEST -#include <PredictPlanExec.h> +#include <PredictPlanExecCPU.h> #include <boost/test/unit_test.hpp> #include <test/Common.h> -BOOST_AUTO_TEST_SUITE(PredictPlanExecTestSuite) +BOOST_AUTO_TEST_SUITE(PredictPlanExecCPUTestSuite) -BOOST_AUTO_TEST_CASE(test_PredictPlanExec) { BOOST_CHECK(true); } +BOOST_AUTO_TEST_CASE(test_PredictPlanExecCPU) { BOOST_CHECK(true); } BOOST_AUTO_TEST_CASE(compute_smear_terms) { static constexpr size_t test_n_stations = 4; @@ -26,7 +26,7 @@ BOOST_AUTO_TEST_CASE(compute_smear_terms) { PredictPlan plan; PointSourceCollection sources; - PredictPlanExec sim_plan_exec(run->plan); + PredictPlanExecCPU sim_plan_exec(run->plan); for (size_t s = 0; s < test_n_sources; ++s) { sources.Add(run->makeSource<PointSource>()); @@ -65,7 +65,7 @@ BOOST_AUTO_TEST_CASE(dont_compute_smear_terms) { PredictPlan plan; PointSourceCollection sources; - PredictPlanExec sim_plan_exec(run->plan); + PredictPlanExecCPU sim_plan_exec(run->plan); for (size_t s = 0; s < test_n_sources; ++s) { sources.Add(run->makeSource<PointSource>()); diff --git a/tests/test_simulator.cpp b/tests/test_simulator.cpp index 8b6f52bfeaaa6e60e7590e3d089c91ad646c98fe..ff309f690a2e79e3b13791a1702917a16430a82e 100644 --- a/tests/test_simulator.cpp +++ b/tests/test_simulator.cpp @@ -4,6 +4,7 @@ #include "Directions.h" #include "GaussianSourceCollection.h" #include "PointSourceCollection.h" +#include <boost/test/tools/old/interface.hpp> #include <xtensor/xtensor.hpp> #include "Predict.h" @@ -11,7 +12,7 @@ #include <Direction.h> #include <GaussianSource.h> #include <PointSource.h> -#include <PredictPlanExec.h> +#include <PredictPlanExecCPU.h> #include <Stokes.h> #include <xtensor/xtensor_forward.hpp> @@ -24,6 +25,13 @@ namespace dp3 { namespace base { namespace test { +inline double ConvToAbsComplex( + const xt::xtensor<double, 4, xt::layout_type::row_major> &buffer, + const size_t pol, const size_t bl, const size_t channel) { + return std::abs(std::complex<double>{buffer(0, pol, bl, channel), + buffer(1, pol, bl, channel)}); +} + struct PredictTestFixture { PredictTestFixture() : predict_run(nullptr) {} @@ -33,8 +41,8 @@ struct PredictTestFixture { test_n_channels, onlyI, freqsmear); } - static constexpr size_t test_n_stations = 4; - static constexpr size_t test_n_channels = 2; + size_t test_n_stations = 4; + size_t test_n_channels = 2; const Direction test_reference_point{0.5, 0.1}; const Direction test_offset_point{test_reference_point.ra + 0.02, test_reference_point.dec + 0.02}; @@ -52,9 +60,9 @@ BOOST_AUTO_TEST_CASE(pointsource_onlyI) { sources.Add(pointsource); predict_run->Run(sources); - BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 0, 0)), 1.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 1.0, 1.0e-3); // Channel 0 - BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 1, 0)), 1.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 1, 0), 1.0, 1.0e-3); // Channel 1 } @@ -66,21 +74,21 @@ BOOST_AUTO_TEST_CASE(pointsource_fullstokes) { sources.Add(pointsource); predict_run->Run(sources); - BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 0, 0)), 1.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 1.0, 1.0e-3); // Channel 0, XX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(1, 0, 0)), 0.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 1, 0, 0), 0.0, 1.0e-3); // Channel 0, XY - BOOST_CHECK_CLOSE(abs(predict_run->buffer(2, 0, 0)), 0.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 2, 0, 0), 0.0, 1.0e-3); // Channel 0, YX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(3, 0, 0)), 1.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 3, 0, 0), 1.0, 1.0e-3); // Channel 0, YY - BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 1, 0)), 1.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 1, 0), 1.0, 1.0e-3); // Channel 1, XX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(1, 1, 0)), 0.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 1, 1, 0), 0.0, 1.0e-3); // Channel 1, XY - BOOST_CHECK_CLOSE(abs(predict_run->buffer(2, 1, 0)), 0.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 2, 1, 0), 0.0, 1.0e-3); // Channel 1, YX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(3, 1, 0)), 1.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 3, 1, 0), 1.0, 1.0e-3); // Channel 1, YY } @@ -92,10 +100,86 @@ BOOST_AUTO_TEST_CASE(pointsource_onlyI_freqsmear) { sources.Add(pointsource); predict_run->Run(sources); - BOOST_CHECK_CLOSE(std::abs(predict_run->buffer(0, 0, test_n_stations - 1)), - 0.759154, 1.0e-3); - BOOST_CHECK_CLOSE(std::abs(predict_run->buffer(0, 1, test_n_stations - 1)), - 0.759154, 1.0e-3); + BOOST_CHECK_CLOSE( + ConvToAbsComplex(predict_run->buffer, 0, 0, test_n_channels - 1), 0.75915, + 1.0e-3); + BOOST_CHECK_CLOSE( + ConvToAbsComplex(predict_run->buffer, 1, 0, test_n_channels - 1), 0.75915, + 1.0e-3); +} + +BOOST_AUTO_TEST_CASE(pointsource_fullstokes_freqsmear_all_strategies) { + test_n_stations = 16; + test_n_channels = + 51; // Take an odd number of channels to verify boundary cases + SetupPredictRun(false, true); + PointSourceCollection sources; + + // Generate multiple point sources + const size_t num_sources = 4; + for (size_t i = 0; i < num_sources; ++i) { + auto pointsource = predict_run->makeSource<PointSource>(); + sources.Add(pointsource); + } + + // Define computation strategies to test + const std::vector<computation_strategy> strategies = { + computation_strategy::SINGLE, computation_strategy::MULTI_SIMD, + computation_strategy::XSIMD}; + + const auto ref_buffer = [&]() { + predict_run->RunWithStrategy(sources, computation_strategy::SINGLE); + return predict_run->buffer; + }(); + + for (const auto &strategy : strategies) { + predict_run->Clean(); + predict_run->RunWithStrategy(sources, strategy); + + BOOST_TEST_CONTEXT("Testing strategy: " << static_cast<int>(strategy)) { + for (size_t bl = 0; bl < predict_run->plan.nbaselines; ++bl) { + for (size_t ch = 0; ch < predict_run->plan.nchannels; ++ch) { + for (size_t pol = 0; pol < 4; ++pol) { + BOOST_CHECK_CLOSE( + ConvToAbsComplex(ref_buffer, pol, bl, ch), + ConvToAbsComplex(predict_run->buffer, pol, bl, ch), 1.0e-3); + } + } + } + } + } +} + +BOOST_AUTO_TEST_CASE(pointsource_fullstokes_freqsmear_xtensor) { + test_n_stations = 16; + test_n_channels = + 51; // Take a odd number of channels to verify boundary cases + SetupPredictRun(false, true); + PointSourceCollection sources; + + // Generate multiple point sources + const size_t num_sources = 4; + for (size_t i = 0; i < num_sources; ++i) { + auto pointsource = predict_run->makeSource<PointSource>(); + sources.Add(pointsource); + } + + predict_run->RunWithStrategy(sources, computation_strategy::SINGLE); + + const auto ref_buffer = predict_run->buffer; + predict_run->Clean(); + + predict_run->RunWithStrategy(sources, computation_strategy::XSIMD); + + for (size_t bl = 0; bl < predict_run->plan.nbaselines; ++bl) { + for (size_t ch = 0; ch < predict_run->plan.nchannels; ++ch) { + for (size_t pol = 0; pol < 4; ++pol) { + BOOST_CHECK_CLOSE(ConvToAbsComplex(ref_buffer, pol, bl, ch), + ConvToAbsComplex(predict_run->buffer, pol, bl, ch), + 1.0e-3); + } + } + } } BOOST_AUTO_TEST_CASE(gaussiansource_onlyI) { @@ -106,9 +190,9 @@ BOOST_AUTO_TEST_CASE(gaussiansource_onlyI) { sources.Add(gaussiansource); predict_run->Run(sources); - BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 0, 0)), 1.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 1.0, 1.0e-3); // Channel 0 - BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 1, 0)), 1.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 1), 1.0, 1.0e-3); // Channel 1 } @@ -120,26 +204,25 @@ BOOST_AUTO_TEST_CASE(gaussiansource_fullstokes) { sources.Add(gaussiansource); predict_run->Run(sources); - BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 0, 0)), 1.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 1.0, 1.0e-3); // Channel 0, XX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(1, 0, 0)), 0.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 1, 0, 0), 0.0, 1.0e-3); // Channel 0, XY - BOOST_CHECK_CLOSE(abs(predict_run->buffer(2, 0, 0)), 0.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 2, 0, 0), 0.0, 1.0e-3); // Channel 0, YX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(3, 0, 0)), 1.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 3, 0, 0), 1.0, 1.0e-3); // Channel 0, YY - BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 1, 0)), 1.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 1, 0), 1.0, 1.0e-3); // Channel 1, XX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(1, 1, 0)), 0.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 1, 1, 0), 0.0, 1.0e-3); // Channel 1, XY - BOOST_CHECK_CLOSE(abs(predict_run->buffer(2, 1, 0)), 0.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 2, 1, 0), 0.0, 1.0e-3); // Channel 1, YX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(3, 1, 0)), 1.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 3, 1, 0), 1.0, 1.0e-3); // Channel 1, YY } -BOOST_AUTO_TEST_CASE(gaussiansource_onlyI_freqsmear, - *boost::unit_test::expected_failures(2)) { +BOOST_AUTO_TEST_CASE(gaussiansource_onlyI_freqsmear) { SetupPredictRun(true, true); auto gaussiansource = predict_run->makeSource<GaussianSource>(); @@ -147,10 +230,10 @@ BOOST_AUTO_TEST_CASE(gaussiansource_onlyI_freqsmear, sources.Add(gaussiansource); predict_run->Run(sources); - BOOST_CHECK_CLOSE(std::abs(predict_run->buffer(0, 0, test_n_stations - 1)), - 0.75915, 1.0e-3); - BOOST_CHECK_CLOSE(std::abs(predict_run->buffer(0, 1, test_n_stations - 1)), - 0.75915, 1.0e-3); + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 0.75915, + 1.0e-3); // Channel 0 + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 1), 0.75915, + 1.0e-3); // Channel 1 } BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_empty_initialization) { @@ -177,9 +260,9 @@ BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_onlyI) { predict_run->Run(sources); - BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 0, 0)), 0.98917, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 0.98917, 1.0e-3); // Channel 0, XX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 1, 0)), 0.98900, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 1), 0.98900, 1.0e-3); // Channel 0, XY } @@ -197,26 +280,25 @@ BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_fullstokes) { predict_run->Run(sources); - BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 0, 0)), 0.98917, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 0.98917, 1.0e-3); // Channel 0, XX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(1, 0, 0)), 0.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 1, 0, 0), 0.0, 1.0e-3); // Channel 0, XY - BOOST_CHECK_CLOSE(abs(predict_run->buffer(2, 0, 0)), 0.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 2, 0, 0), 0.0, 1.0e-3); // Channel 0, YX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(3, 0, 0)), 0.98917, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 3, 0, 0), 0.98917, 1.0e-3); // Channel 0, YY - BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 1, 0)), 0.98900, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 1), 0.98900, 1.0e-3); // Channel 1, XX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(1, 1, 0)), 0.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 1, 0, 1), 0.0, 1.0e-3); // Channel 1, XY - BOOST_CHECK_CLOSE(abs(predict_run->buffer(2, 1, 0)), 0.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 2, 0, 1), 0.0, 1.0e-3); // Channel 1, YX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(3, 1, 0)), 0.98900, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 3, 0, 1), 0.98900, 1.0e-3); // Channel 1, YY } -BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_onlyI_freqsmear, - *boost::unit_test::expected_failures(2)) { +BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_onlyI_freqsmear) { SetupPredictRun(true, true); auto gaussiansource = predict_run->makeSource<GaussianSource>(); @@ -230,14 +312,13 @@ BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_onlyI_freqsmear, predict_run->Run(sources); - BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 0, 0)), 0.75093, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 0.75093, 1.0e-3); // Channel 0, XX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 1, 0)), 0.75081, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 1), 0.75081, 1.0e-3); // Channel 0, XY } -BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_fullstokes_freqsmear, - *boost::unit_test::expected_failures(4)) { +BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_fullstokes_freqsmear) { SetupPredictRun(false, true); auto gaussiansource = predict_run->makeSource<GaussianSource>(); @@ -251,21 +332,21 @@ BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_fullstokes_freqsmear, predict_run->Run(sources); - BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 0, 0)), 0.750934, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 0.750934, 1.0e-3); // Channel 0, XX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(1, 0, 0)), 0.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 1, 0, 0), 0.0, 1.0e-3); // Channel 0, XY - BOOST_CHECK_CLOSE(abs(predict_run->buffer(2, 0, 0)), 0.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 2, 0, 0), 0.0, 1.0e-3); // Channel 0, YX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(3, 0, 0)), 0.750934, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 3, 0, 0), 0.750934, 1.0e-3); // Channel 0, YY - BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 1, 0)), 0.750807, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 1), 0.750807, 1.0e-3); // Channel 1, XX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(1, 1, 0)), 0.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 1, 0, 1), 0.0, 1.0e-3); // Channel 1, XY - BOOST_CHECK_CLOSE(abs(predict_run->buffer(2, 1, 0)), 0.0, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 2, 0, 1), 0.0, 1.0e-3); // Channel 1, YX - BOOST_CHECK_CLOSE(abs(predict_run->buffer(3, 1, 0)), 0.750807, + BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 3, 0, 1), 0.750807, 1.0e-3); // Channel 1, YY } @@ -348,7 +429,8 @@ BOOST_AUTO_TEST_CASE(test_calc_phase) { plan.uvw = {{100.0, 200.0, 300.0}}; plan.frequencies = {1e8, 2e8}; - PredictPlanExec plan_exec{plan}; + PredictPlanExecCPU plan_exec{plan}; + plan_exec.correct_frequency_smearing = false; plan_exec.lmn = {{0.1, 0.2, 0.3}}; plan_exec.ComputeStationPhases(); @@ -359,17 +441,17 @@ BOOST_AUTO_TEST_CASE(test_calc_phase) { BOOST_CHECK_CLOSE(plan_exec.GetStationPhases()[0], expectedPhase, 1e-6); - const auto &freq = plan_exec.get_frequencies(); + const auto &freq = plan_exec.GetFrequencies(); double phase_term_0 = expectedPhase * freq[0]; double phase_term_1 = expectedPhase * freq[1]; const auto &shift = plan_exec.GetShiftData(); - BOOST_CHECK_CLOSE(shift(0, 0).real(), std::cos(phase_term_0), 1e-6); - BOOST_CHECK_CLOSE(shift(0, 1).imag(), std::sin(phase_term_1), 1e-6); - BOOST_CHECK_CLOSE(shift(1, 0).real(), std::cos(phase_term_0), 1e-6); - BOOST_CHECK_CLOSE(shift(1, 1).imag(), std::sin(phase_term_1), 1e-6); + BOOST_CHECK_CLOSE(shift(0, 0, 0), std::cos(phase_term_0), 1e-6); + BOOST_CHECK_CLOSE(shift(1, 0, 1), std::sin(phase_term_1), 1e-6); + BOOST_CHECK_CLOSE(shift(0, 1, 0), std::cos(phase_term_0), 1e-6); + BOOST_CHECK_CLOSE(shift(1, 1, 1), std::sin(phase_term_1), 1e-6); } BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/test_spectrum.cpp b/tests/test_spectrum.cpp index 10badfb3df0f39e90b90baf4ee09876255802d3a..6176cc8ca1c7b10f40c179a1198ec5ccf334684b 100644 --- a/tests/test_spectrum.cpp +++ b/tests/test_spectrum.cpp @@ -227,14 +227,14 @@ BOOST_FIXTURE_TEST_CASE(normal_log_with_rotation_cross, SpectrumFixture) { stokes_nu1 = point_source.GetStokes(frequencies[1]); stokes_nu2 = point_source.GetStokes(frequencies[2]); - xt::xtensor<std::complex<double>, 2> result_complex; + xt::xtensor<double, 3> result_complex; spectrum.EvaluateCrossCorrelations(frequencies, result_complex); - BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 0).real(), + BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 0, 0), stokes_nu0.I + stokes_nu0.Q, 1.e-3); - BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 1).real(), + BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 0, 1), stokes_nu1.I + stokes_nu1.Q, 1.e-3); - BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 2).real(), + BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 0, 2), stokes_nu2.I + stokes_nu2.Q, 1.e-3); } @@ -261,14 +261,14 @@ BOOST_FIXTURE_TEST_CASE(normal_with_rotation_cross, SpectrumFixture) { stokes_nu1 = point_source.GetStokes(frequencies[1]); stokes_nu2 = point_source.GetStokes(frequencies[2]); - xt::xtensor<std::complex<double>, 2> result_complex; + xt::xtensor<double, 3> result_complex; spectrum.EvaluateCrossCorrelations(frequencies, result_complex); - BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 0).real(), + BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 0, 0), stokes_nu0.I + stokes_nu0.Q, 1.e-3); - BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 1).real(), + BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 0, 1), stokes_nu1.I + stokes_nu1.Q, 1.e-3); - BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 2).real(), + BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 0, 2), stokes_nu2.I + stokes_nu2.Q, 1.e-3); }