diff --git a/CMakeLists.txt b/CMakeLists.txt
index ce66efeaebe37f9d2caf82e22ef5de4c1408b93c..1a04d8921f8089cd375b0dbc03bf3c677e98c4b4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,24 +7,38 @@ option(PREDICT_BUILD_TESTING "Build the test suite" ON)
 option(PREDICT_BUILD_BENCHMARK "Build the benchmark suite" OFF)
 option(CMAKE_EXPORT_COMPILE_COMMANDS "Export compile commands" ON)
 
+# Disable Tracy by default to limit performance impact.
+if(NOT DEFINED TRACY_ENABLE)
+  set(TRACY_ENABLE OFF)
+endif()
+
 # Casacore
 include(FetchContent)
 FetchContent_Declare(
   aocommon
   GIT_REPOSITORY https://gitlab.com/aroffringa/aocommon/
   GIT_TAG master
+  GIT_SHALLOW TRUE
   GIT_PROGRESS TRUE)
 FetchContent_Populate(aocommon)
 set(CMAKE_MODULE_PATH ${aocommon_SOURCE_DIR}/CMake)
 
+FetchContent_Declare(
+  tracy
+  GIT_REPOSITORY https://github.com/wolfpld/tracy.git
+  GIT_TAG master # Or pin to a specific version
+  GIT_SHALLOW true
+  GIT_PROGRESS true)
+FetchContent_MakeAvailable(tracy)
+
 # Include XTensor libraries. DP3 must include FetchXTensor directly (not
 # indirectly, e.g., via schaapcommon) since it uses 'add_compile_definitions'.
 set(XTENSOR_LIBRARIES xtl xsimd xtensor xtensor-blas)
 include(${aocommon_SOURCE_DIR}/CMake/FetchXTensor.cmake)
 
 set(PREDICT_BUILD_ARGUMENTS
-    $<$<CONFIG:Debug>:-g> $<$<CONFIG:Release>:-O3;-march=native >
-    $<$<CONFIG:RelWithDebInfo>:-O3;-g;-march=native >)
+    $<$<CONFIG:Debug>:-g -mavx2> $<$<CONFIG:Release>:-O3;-march=native -mavx2>
+    $<$<CONFIG:RelWithDebInfo>:-O3;-g;-march=native -mavx2>)
 
 # Find or fetch dependencies
 find_package(Casacore REQUIRED casa tables)
@@ -48,8 +62,14 @@ target_include_directories(
                  $<INSTALL_INTERFACE:include>)
 
 target_link_libraries(
-  predict PUBLIC xtensor xtensor-blas xsimd OpenMP::OpenMP_CXX
-                 ${CASACORE_LIBRARIES} ${BLAS_LIBRARIES})
+  predict
+  PUBLIC xtensor
+         xtensor-blas
+         xsimd
+         OpenMP::OpenMP_CXX
+         ${CASACORE_LIBRARIES}
+         ${BLAS_LIBRARIES}
+         Tracy::TracyClient)
 
 target_include_directories(predict PRIVATE ${CASACORE_INCLUDE_DIRS})
 
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 34f67ba24d2fb97bccd6f205bb5760ed16fb9e2f..55c07dbfee3ee8fde5ecb165858a2d1e5c7e4057 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -27,8 +27,14 @@ add_executable(
 target_compile_options(microbenchmarks PRIVATE ${PREDICT_BUILD_ARGUMENTS})
 
 target_link_libraries(
-  microbenchmarks PRIVATE benchmark::benchmark predict # your main library
-                          xtensor xtensor-blas xsimd ${CASACORE_LIBRARIES})
+  microbenchmarks
+  PRIVATE benchmark::benchmark
+          predict
+          xtensor
+          xtensor-blas
+          xsimd
+          ${CASACORE_LIBRARIES}
+          Tracy::TracyClient)
 target_include_directories(
   microbenchmarks PRIVATE include ${CMAKE_SOURCE_DIR}/include
-                          ${CASACORE_INCLUDE_DIRS})
+                          ${CASACORE_INCLUDE_DIRS} ${tracy_SOURCE_DIR}/include)
diff --git a/benchmark/main.cpp b/benchmark/main.cpp
index 661ee4dcf535c827aee9a4aa8021f0846af12b9d..00e8f3dfc6c07cdf0ca7c06ef2ff0e5023d32881 100644
--- a/benchmark/main.cpp
+++ b/benchmark/main.cpp
@@ -5,6 +5,8 @@
 #include <benchmark/benchmark.h>
 #include <test/Common.h>
 
+#include <tracy/Tracy.hpp>
+
 void ParseCustomFlags(int argc, char **argv) {
   extern bool do_randomized_run;
   extern int randomized_run_seed;
diff --git a/benchmark/phases.cpp b/benchmark/phases.cpp
index b6d048526d1ed33f97988aa221caee24b0e6a1ce..42fba3d7db8bcbbb577a903681b3e94f26ebbc6d 100644
--- a/benchmark/phases.cpp
+++ b/benchmark/phases.cpp
@@ -1,7 +1,7 @@
 #include "PointSourceCollection.h"
 #include "Predict.h"
 #include "PredictPlan.h"
-#include "PredictPlanExec.h"
+#include "PredictPlanExecCPU.h"
 #include <benchmark/benchmark.h>
 #include <random>
 #include <xtensor/xtensor.hpp>
@@ -23,7 +23,7 @@ public:
     plan.nstations = n_stations;
     plan.nchannels = n_channels;
 
-    plan_exec = std::make_unique<PredictPlanExec>(plan);
+    plan_exec = std::make_unique<PredictPlanExecCPU>(plan);
 
     plan_exec->uvw = xt::xtensor<double, 2>({n_stations, 3});
     for (size_t i = 0; i < n_stations; ++i) {
@@ -43,13 +43,12 @@ public:
   void TearDown(benchmark::State &) override { plan_exec.reset(); }
 
 protected:
-  std::unique_ptr<PredictPlanExec> plan_exec;
+  std::unique_ptr<PredictPlanExecCPU> plan_exec;
   size_t n_stations;
   size_t n_channels;
 };
 
 BENCHMARK_DEFINE_F(PhasesBenchmark, ComputePhases)(benchmark::State &state) {
-
   for (auto _ : state) {
     plan_exec->ComputeStationPhases();
   }
diff --git a/benchmark/predict.cpp b/benchmark/predict.cpp
index 9d771b3c78cc25844c310c61dfa0872f95f569f5..baa2400091c9d64c8535c9ac8a24aa1671cf8285 100644
--- a/benchmark/predict.cpp
+++ b/benchmark/predict.cpp
@@ -1,3 +1,4 @@
+#include "Directions.h"
 #include "GaussianSourceCollection.h"
 #include <random>
 
@@ -53,7 +54,7 @@ BENCHMARK_DEFINE_F(PredictBenchmark, PointSource)
   sources.Add(predict_run->makeSource<PointSource>());
 
   for (auto _ : state) {
-    predict_run->Run(sources);
+    predict_run->RunWithStrategy(sources, computation_strategy::SINGLE);
   }
 }
 
@@ -67,6 +68,46 @@ BENCHMARK_REGISTER_F(PredictBenchmark, PointSource)
     ->ArgNames({"stat", "chan", "fullstokes", "freqsmear"})
     ->Unit(benchmark::kMillisecond);
 
+BENCHMARK_DEFINE_F(PredictBenchmark, PointSourceSIMD)
+(benchmark::State &state) {
+  PointSourceCollection sources;
+  sources.Add(predict_run->makeSource<PointSource>());
+
+  for (auto _ : state) {
+    predict_run->RunWithStrategy(sources, computation_strategy::MULTI_SIMD);
+  }
+}
+
+BENCHMARK_REGISTER_F(PredictBenchmark, PointSourceSIMD)
+    ->ArgsProduct({
+        nstations,
+        nsources,
+        {false, true},
+        {false, true},
+    })
+    ->ArgNames({"stat", "chan", "fullstokes", "freqsmear"})
+    ->Unit(benchmark::kMillisecond);
+
+BENCHMARK_DEFINE_F(PredictBenchmark, PointSourceXSIMD)
+(benchmark::State &state) {
+  PointSourceCollection sources;
+  sources.Add(predict_run->makeSource<PointSource>());
+
+  for (auto _ : state) {
+    predict_run->RunWithStrategy(sources, computation_strategy::XSIMD);
+  }
+}
+
+BENCHMARK_REGISTER_F(PredictBenchmark, PointSourceXSIMD)
+    ->ArgsProduct({
+        nstations,
+        nsources,
+        {false, true},
+        {false, true},
+    })
+    ->ArgNames({"stat", "chan", "fullstokes", "freqsmear"})
+    ->Unit(benchmark::kMillisecond);
+
 BENCHMARK_DEFINE_F(PredictBenchmark, GaussianSource)
 (benchmark::State &state) {
   GaussianSourceCollection sources;
diff --git a/benchmark/smearterms.cpp b/benchmark/smearterms.cpp
index 52e55dc64c102b02e5d45da564d7df00e710e0f3..6b452c5910f9f5ede43dce1ac1b5dbea3c0160a3 100644
--- a/benchmark/smearterms.cpp
+++ b/benchmark/smearterms.cpp
@@ -1,3 +1,4 @@
+#include "Directions.h"
 #include <benchmark/benchmark.h>
 #include <xtensor/xtensor.hpp>
 
@@ -26,7 +27,7 @@ public:
     run = MakePredictRun(test_reference_point, test_offset_point, n_stations,
                          n_frequencies, false, true);
 
-    plan_exec_ptr = std::make_unique<PredictPlanExec>(run->plan);
+    plan_exec_ptr = std::make_unique<PredictPlanExecCPU>(run->plan);
     static std::mt19937 gen(-5);
     std::uniform_real_distribution<> phase_dist(-M_PI, M_PI);
     for (size_t t = 0; t < n_stations; t++) {
@@ -37,19 +38,19 @@ public:
 
 protected:
   std::unique_ptr<PredictRun> run;
-  std::unique_ptr<PredictPlanExec> plan_exec_ptr;
+  std::unique_ptr<PredictPlanExecCPU> plan_exec_ptr;
 };
 
 BENCHMARK_DEFINE_F(SmearTermsBenchmark, SmearTermBenchmarkXTensor)
 (benchmark::State &state) {
   for (auto _ : state) {
-    plan_exec_ptr->ComputeSmearTerms();
+    plan_exec_ptr->ComputeSmearTerms<computation_strategy::XSIMD>();
   }
 }
 
 BENCHMARK_DEFINE_F(SmearTermsBenchmark, SmearTermBenchmarkXOriginal)
 (benchmark::State &state) {
-  PredictPlanExec &sim_plan_exec = *plan_exec_ptr;
+  PredictPlanExecCPU &sim_plan_exec = *plan_exec_ptr;
   for (auto _ : state) {
     for (size_t bl_idx = 0; bl_idx < sim_plan_exec.baselines.size(); bl_idx++) {
       const auto &baseline = sim_plan_exec.baselines[bl_idx];
@@ -65,6 +66,13 @@ BENCHMARK_DEFINE_F(SmearTermsBenchmark, SmearTermBenchmarkXOriginal)
   }
 }
 
+BENCHMARK_DEFINE_F(SmearTermsBenchmark, SmearTermBenchmarkSIMD)
+(benchmark::State &state) {
+  for (auto _ : state) {
+    plan_exec_ptr->ComputeSmearTerms<computation_strategy::MULTI_SIMD>();
+  }
+}
+
 BENCHMARK_REGISTER_F(SmearTermsBenchmark, SmearTermBenchmarkXTensor)
     ->ArgsProduct({n_stations, n_frequencies})
     ->ArgNames({"#stations", "#frequencies"})
@@ -74,3 +82,8 @@ BENCHMARK_REGISTER_F(SmearTermsBenchmark, SmearTermBenchmarkXOriginal)
     ->ArgsProduct({n_stations, n_frequencies})
     ->ArgNames({"#stations", "#frequencies"})
     ->Unit(benchmark::kMillisecond);
+
+BENCHMARK_REGISTER_F(SmearTermsBenchmark, SmearTermBenchmarkSIMD)
+    ->ArgsProduct({n_stations, n_frequencies})
+    ->ArgNames({"#stations", "#frequencies"})
+    ->Unit(benchmark::kMillisecond);
diff --git a/benchmark/spectrum_cross.cpp b/benchmark/spectrum_cross.cpp
index 13def0baaa8bd11c2244f643fc8eae57bbff9813..92cc24a547b4cc8aca34e90c658534058111980b 100644
--- a/benchmark/spectrum_cross.cpp
+++ b/benchmark/spectrum_cross.cpp
@@ -29,9 +29,8 @@ public:
     spec.SetReferenceFlux({1.0, 0.0, 0.0, 0.0});
     spec.SetPolarization(0.3, 0.2);
 
-    computed_spectrum =
-        std::make_unique<xt::xtensor<std::complex<double>, 2>>();
-    computed_spectrum->resize({4, n_frequencies});
+    computed_spectrum = std::make_unique<xt::xtensor<double, 3>>();
+    computed_spectrum->resize({2, 4, n_frequencies});
   }
   void TearDown(benchmark::State &) override {
     spectrum.reset();
@@ -40,7 +39,7 @@ public:
 
 protected:
   std::unique_ptr<Spectrum> spectrum;
-  std::unique_ptr<xt::xtensor<std::complex<double>, 2>> computed_spectrum;
+  std::unique_ptr<xt::xtensor<double, 3>> computed_spectrum;
   size_t n_frequencies;
   xt::xtensor<double, 1> frequencies;
 };
diff --git a/ci/das6/compile_and_run.sh b/ci/das6/compile_and_run.sh
index 19030e9d935625beb7a7a2a8ffed8ae797c66a15..22283a5872653f3597ee0960a1d0714f0c91c78b 100755
--- a/ci/das6/compile_and_run.sh
+++ b/ci/das6/compile_and_run.sh
@@ -8,6 +8,16 @@ ARCHITECTURE=$1
 COMPILER_VERSION=$2
 
 echo "Running on compiler version: ${COMPILER_VERSION}, architecture: ${ARCHITECTURE}, hostname: $(hostname)"
+
+# A third argument may be passed. This is the benchmark filter
+# Check if the filter args is available
+if [ "$#" -ge 3 ]; then
+    echo "Filter: $3"
+    FILTER=$3
+else
+    FILTER=""
+fi
+
 BUILD_DIR=build-${COMPILER_VERSION}-${ARCHITECTURE}
 module purge
 module load spack/20250403
@@ -16,4 +26,4 @@ module load cmake casacore boost openblas
 cmake -B ${BUILD_DIR} . -DCMAKE_BUILD_TYPE=Release -DPREDICT_BUILD_BENCHMARK=ON
 make -C ${BUILD_DIR} -j
 
-${BUILD_DIR}/benchmark/microbenchmarks --benchmark_out=results-${COMPILER_VERSION}-${ARCHITECTURE}.json --benchmark_out_format=json
\ No newline at end of file
+${BUILD_DIR}/benchmark/microbenchmarks --benchmark_filter=${FILTER} --benchmark_out=results-${COMPILER_VERSION}-${ARCHITECTURE}.json --benchmark_out_format=json --benchmark_min_warmup_time=1
\ No newline at end of file
diff --git a/ci/summarize-results.py b/ci/summarize-results.py
index 48c33bf456997c76964cb8e4722cf9b409a34b8c..1e7144a96fb33ec4e6256eaeabad4e66edb1985e 100644
--- a/ci/summarize-results.py
+++ b/ci/summarize-results.py
@@ -43,9 +43,21 @@ def create_summary_plot(metrics, outplot_name):
     grid.set_ylabels(f"CPU time({time_unit})")
     grid.set_xlabels("Parameter")
     grid.add_legend()
-    plt.setp(grid._legend.get_texts(), fontsize=9)
-    plt.xticks(rotation=90)
+    plt.setp(grid._legend.get_texts(), fontsize=8)
+    for ax in grid.axes.flat:
+        for label in ax.get_xticklabels():
+            label.set_rotation(90)
+            label.set_fontsize(8)
+
+        ax.set_title(ax.get_title(), fontsize=8)
     
+    compilers = sorted(metrics["compiler_version"].unique())
+    archs = sorted(metrics["architecture"].unique())
+    title = f"Predict compilers: [{', '.join(compilers)}], architectures: [{', '.join(archs)}]"
+
+    grid.fig.suptitle(title, fontsize=14)
+    grid.fig.subplots_adjust(top=0.75) 
+
     grid.savefig(outplot_name + ".png")
 
 def main():
diff --git a/include/Directions.h b/include/Directions.h
index f172324e00e452c3b90e0cdab430d72a716e5831..1f66b862da0a70ffdb33551cba7a27e3072fdd62 100644
--- a/include/Directions.h
+++ b/include/Directions.h
@@ -13,6 +13,7 @@
 #include <ObjectCollection.h>
 #include <common/SmartVector.h>
 
+#include <tracy/Tracy.hpp>
 #include <vector>
 #include <xsimd/xsimd.hpp>
 #include <xtensor/xmath.hpp>
@@ -21,7 +22,7 @@
 
 using Direction = dp3::base::Direction;
 
-enum computation_strategy { SINGLE, MULTI, MULTI_SIMD };
+enum computation_strategy { SINGLE, XSIMD, MULTI_SIMD };
 
 class Directions : ObjectCollection<Direction> {
 public:
@@ -197,6 +198,7 @@ void radec2lmn::operator()(Arch, const Direction &reference, const C &ra,
 template <>
 inline void Directions::RaDec2Lmn<Directions::computation_strategy::MULTI_SIMD>(
     const Direction &reference, xt::xtensor<double, 2> &lmn) const {
+  ZoneScoped;
   xt::xtensor<double, 2> lmn_tmp;
   lmn_tmp.resize({3, ra.size()});
   xsimd::dispatch(xsimd::radec2lmn{})(reference, ra, dec, lmn_tmp,
diff --git a/include/PointSource.h b/include/PointSource.h
index bc09b0b8e120191c1b4f693d8fac812e11ce9e6d..fb2bf3ec4f5afc1158721dfcff88b1825b5b7eba 100644
--- a/include/PointSource.h
+++ b/include/PointSource.h
@@ -31,7 +31,7 @@ public:
   void SetDirection(const Direction &position);
 
   void ComputeSpectrum(const xt::xtensor<double, 1> &frequencies,
-                       xt::xtensor<std::complex<double>, 2> &result) const;
+                       xt::xtensor<double, 3> &result) const;
   const Spectrum &GetSpectrum() const { return spectrum_; }
 
   Stokes GetStokes(double freq) const;
diff --git a/include/Predict.h b/include/Predict.h
index 0851580c2d1f822e53311ea108a0f52098e5f0ce..c5d57d4e01645dda8e2dcf52a4b4616e285998f0 100644
--- a/include/Predict.h
+++ b/include/Predict.h
@@ -10,10 +10,12 @@
 #include <casacore/casa/Arrays/Cube.h>
 #include <casacore/measures/Measures.h>
 
+#include <xtensor/xlayout.hpp>
 #include <xtensor/xtensor.hpp>
 
+#include "Directions.h"
 #include "GaussianSourceCollection.h"
-#include "PredictPlanExec.h"
+#include "PredictPlanExecCPU.h"
 #include <Direction.h>
 #include <PointSource.h>
 #include <Stokes.h>
@@ -122,10 +124,23 @@ inline void spectrum(const PointSource &component, size_t nChannel,
 
 class Predict {
 public:
-  void run(PredictPlanExec &plan, const PointSourceCollection &sources,
-           xt::xtensor<dcomplex, 3> &buffer) const;
-  void run(PredictPlanExec &plan, const GaussianSourceCollection &sources,
-           xt::xtensor<dcomplex, 3> &buffer) const;
+  void run(PredictPlanExecCPU &plan, const PointSourceCollection &sources,
+           xt::xtensor<double, 4, xt::layout_type::row_major> &buffer) const;
+
+  void run(PredictPlanExecCPU &plan, const GaussianSourceCollection &sources,
+           xt::xtensor<double, 4, xt::layout_type::row_major> &buffer) const;
+
+  void
+  runWithStrategy(PredictPlanExecCPU &plan,
+                  const PointSourceCollection &sources,
+                  xt::xtensor<double, 4, xt::layout_type::row_major> &buffer,
+                  const computation_strategy strat);
+
+  void
+  runWithStrategy(PredictPlanExecCPU &plan,
+                  const GaussianSourceCollection &sources,
+                  xt::xtensor<double, 4, xt::layout_type::row_major> &buffer,
+                  const computation_strategy strat);
 };
 
 /// @}
diff --git a/include/PredictPlan.h b/include/PredictPlan.h
index c5b68cc35f8bda76081f62a8a161cb7961b8d873..73d09837cceb2c9393d549f3d38e73e88ccc764a 100644
--- a/include/PredictPlan.h
+++ b/include/PredictPlan.h
@@ -14,6 +14,7 @@
 #include <xtensor/xlayout.hpp>
 #include <xtensor/xtensor.hpp>
 
+#include "Directions.h"
 #include "GaussianSourceCollection.h"
 #include "PointSourceCollection.h"
 
@@ -42,11 +43,13 @@ struct PredictPlan {
   // standalone PredictPlan object
   virtual void Precompute(const PointSourceCollection &) {}
 
-  virtual void Compute(const PointSourceCollection &sources,
-                       xt::xtensor<std::complex<double>, 3> &buffer) {}
+  virtual void
+  Compute(const PointSourceCollection &sources,
+          xt::xtensor<double, 4, xt::layout_type::row_major> &buffer) {}
 
-  virtual void Compute(const GaussianSourceCollection &sources,
-                       xt::xtensor<std::complex<double>, 3> &buffer) {}
+  virtual void
+  Compute(const GaussianSourceCollection &sources,
+          xt::xtensor<double, 4, xt::layout_type::row_major> &buffer) {}
 };
 
 #endif // PREDICT_PLAN_H_
diff --git a/include/PredictPlanExec.h b/include/PredictPlanExec.h
deleted file mode 100644
index c9e79d853dfc7e6465ee595bd6cc17465f5155f6..0000000000000000000000000000000000000000
--- a/include/PredictPlanExec.h
+++ /dev/null
@@ -1,67 +0,0 @@
-// PredictPlanExec.h: PredictPlanExec
-//
-// Copyright (C) 2020 ASTRON (Netherlands Institute for Radio Astronomy)
-// SPDX-License-Identifier: Apache2.0
-
-/// \file
-/// \brief PredictPlanExec
-
-#ifndef PREDICT_PLAN_EXEC_H_
-#define PREDICT_PLAN_EXEC_H_
-
-#include "GaussianSourceCollection.h"
-#include "PointSourceCollection.h"
-#include "PredictPlan.h"
-#include <casacore/measures/Measures.h>
-#include <complex>
-#include <xtensor/xtensor.hpp>
-class PredictPlanExec : public PredictPlan {
-public:
-  explicit PredictPlanExec(const PredictPlan &plan) : PredictPlan(plan) {
-    // Initialize the station phases and shifts
-    Initialize();
-  }
-
-  virtual void Precompute(const PointSourceCollection &sources) final override;
-
-  virtual void
-  Compute(const PointSourceCollection &sources,
-          xt::xtensor<std::complex<double>, 3> &buffer) final override;
-
-  virtual void
-  Compute(const GaussianSourceCollection &sources,
-          xt::xtensor<std::complex<double>, 3> &buffer) final override;
-
-  void ComputeStationPhases();
-
-  const xt::xtensor<double, 1> &GetStationPhases() const {
-    return station_phases;
-  }
-  const xt::xtensor<std::complex<double>, 2> &GetShiftData() const {
-    return shift_data;
-  }
-
-  void FillEulerMatrix(xt::xtensor<double, 2> &mat, const double ra,
-                       const double dec);
-
-  void ComputeSmearTerms();
-
-  const xt::xtensor<double, 2> &get_lmn() const { return lmn; }
-  const xt::xtensor<double, 2, xt::layout_type::column_major> &get_uvw() const {
-    return uvw;
-  }
-  const xt::xtensor<double, 1> &get_frequencies() const { return frequencies; }
-
-  xt::xtensor<double, 2> lmn;
-  xt::xtensor<double, 1> station_phases;
-  xt::xtensor<std::complex<double>, 2> shift_data;
-  xt::xtensor<double, 2> smear_terms;
-
-private:
-  void Initialize();
-
-private:
-  const double kCInv = 2.0 * M_PI / casacore::C::c;
-};
-
-#endif // PREDICT_PLAN_EXEC_H_
diff --git a/include/PredictPlanExecCPU.h b/include/PredictPlanExecCPU.h
new file mode 100644
index 0000000000000000000000000000000000000000..d50c562c3c0dce7a37940a9f1c3467e8a614041a
--- /dev/null
+++ b/include/PredictPlanExecCPU.h
@@ -0,0 +1,158 @@
+// PredictPlanExecCPU.h: PredictPlanExecCPU
+//
+// Copyright (C) 2020 ASTRON (Netherlands Institute for Radio Astronomy)
+// SPDX-License-Identifier: Apache2.0
+
+/// \file
+/// \brief PredictPlanExecCPU
+
+#ifndef PREDICT_PLAN_EXEC_CPU_H_
+#define PREDICT_PLAN_EXEC_CPU_H_
+
+#include <casacore/measures/Measures.h>
+#include <xtensor/xtensor.hpp>
+#include <xtensor/xview.hpp>
+
+#include "Directions.h"
+#include "GaussianSourceCollection.h"
+#include "PointSourceCollection.h"
+#include "PredictPlan.h"
+
+class PredictPlanExecCPU : public PredictPlan {
+public:
+  explicit PredictPlanExecCPU(const PredictPlan &plan)
+      : PredictPlan(plan), xtensor_dispatch_(*this) {
+    // Initialize the station phases and shifts
+    Initialize();
+  }
+
+  virtual void Precompute(const PointSourceCollection &sources) final override;
+
+  virtual void Compute(const PointSourceCollection &sources,
+                       xt::xtensor<double, 4, xt::layout_type::row_major>
+                           &buffer) final override;
+
+  virtual void Compute(const GaussianSourceCollection &sources,
+                       xt::xtensor<double, 4, xt::layout_type::row_major>
+                           &buffer) final override;
+
+  void
+  ComputeWithTarget(const PointSourceCollection &sources,
+                    xt::xtensor<double, 4, xt::layout_type::row_major> &buffer,
+                    const computation_strategy strat);
+
+  void ComputeStationPhases();
+
+  template <computation_strategy T = computation_strategy::XSIMD>
+  constexpr inline void ComputeSmearTerms() {
+    if constexpr (T == computation_strategy::MULTI_SIMD) {
+      ComputeSmearTermsSIMD();
+    } else if constexpr (T == computation_strategy::XSIMD) {
+      ComputeSmearTermsXtensor();
+    } else {
+      static_assert(T == computation_strategy::XSIMD ||
+                        T == computation_strategy::MULTI_SIMD,
+                    "Unknown computation strategy");
+    }
+  }
+
+  void ComputeSmearTermsXtensor();
+  void ComputeSmearTermsSIMD();
+
+  const xt::xtensor<double, 1> &GetStationPhases() const {
+    return station_phases;
+  }
+
+  const xt::xtensor<double, 3> &GetShiftData() const { return shift_data; }
+
+  void FillEulerMatrix(xt::xtensor<double, 2> &mat, const double ra,
+                       const double dec);
+
+  const xt::xtensor<double, 2> &GetLmn() const { return lmn; }
+  const xt::xtensor<double, 2, xt::layout_type::column_major> &GetUvw() const {
+    return uvw;
+  }
+  const xt::xtensor<double, 1> &GetFrequencies() const { return frequencies; }
+
+  template <class E, class S, class Tag, class Arch>
+  void operator()(Arch, E &buffer_expr, const S &spectrum, Tag);
+
+  struct XTensorDispatch {
+    template <class E, class S, class Tag, class Arch>
+    void operator()(Arch, E &buffer_expr, const S &spectrum, Tag) const;
+
+    const PredictPlanExecCPU &parent;
+
+    explicit XTensorDispatch(const PredictPlanExecCPU &parent)
+        : parent(parent) {}
+  };
+
+  XTensorDispatch xtensor_dispatch_;
+
+  xt::xtensor<double, 2> lmn;
+  xt::xtensor<double, 1> station_phases;
+  xt::xtensor<double, 3> shift_data;
+  xt::xtensor<double, 2, xt::layout_type::row_major> smear_terms;
+
+private:
+  void Initialize();
+
+  template <computation_strategy T = computation_strategy::MULTI_SIMD, class E,
+            class S>
+  inline constexpr void ProcessPolarizationComponent(E &stoke_buffer_expr,
+                                                     const S &stoke_spectrum) {
+    if constexpr (T == computation_strategy::SINGLE) {
+      ProcessPolarizationComponentSingle<E, S>(stoke_buffer_expr,
+                                               stoke_spectrum);
+    } else if constexpr (T == computation_strategy::XSIMD) {
+      ProcessPolarizationComponentXtensor<E, S>(stoke_buffer_expr,
+                                                stoke_spectrum);
+    } else if constexpr (T == computation_strategy::MULTI_SIMD) {
+      ProcessPolarizationComponentMultiSimd<E, S>(stoke_buffer_expr,
+                                                  stoke_spectrum);
+    } else {
+      static_assert(T == computation_strategy::SINGLE ||
+                        T == computation_strategy::XSIMD ||
+                        T == computation_strategy::MULTI_SIMD,
+                    "Unknown computation strategy");
+    }
+  }
+
+  template <class E, class S>
+  inline void ProcessPolarizationComponent(const computation_strategy T,
+                                           E &buffer_expr, const S &spectrum) {
+    switch (T) {
+    case computation_strategy::SINGLE:
+      ProcessPolarizationComponentSingle<E, S>(buffer_expr, spectrum);
+      break;
+    case computation_strategy::XSIMD:
+      ProcessPolarizationComponentXtensor<E, S>(buffer_expr, spectrum);
+      break;
+    case computation_strategy::MULTI_SIMD:
+      ProcessPolarizationComponentMultiSimd<E, S>(buffer_expr, spectrum);
+      break;
+    default:
+      break;
+    }
+  }
+
+  template <class E, class S>
+  void ProcessPolarizationComponentSingle(E &stoke_buffer_expr,
+                                          const S &stoke_spectrum);
+
+  template <class E, class S>
+  void ProcessPolarizationComponentMulti(E &stoke_buffer_expr,
+                                         const S &stoke_spectrum);
+
+  template <class E, class S>
+  void ProcessPolarizationComponentMultiSimd(E &stoke_buffer_expr,
+                                             const S &stoke_spectrum);
+  template <class E, class S>
+  void ProcessPolarizationComponentXtensor(E &stoke_buffer_expr,
+                                           const S &stoke_spectrum);
+
+private:
+  const double kCInv = 2.0 * M_PI / casacore::C::c;
+};
+
+#endif // PREDICT_PLAN_EXEC_CPU_H_
diff --git a/include/Spectrum.h b/include/Spectrum.h
index 3fabc95f515cd20dc63e1916640ae4e0c168fdf5..7cf2800eec8f91769bbeed0f0b7483e5b8a6923b 100644
--- a/include/Spectrum.h
+++ b/include/Spectrum.h
@@ -116,9 +116,9 @@ public:
 
   const Stokes &GetReferenceFlux() const { return reference_flux_; }
 
-  void EvaluateCrossCorrelations(
-      const xt::xtensor<double, 1> &frequencies,
-      xt::xtensor<std::complex<double>, 2> &result) const {
+  void EvaluateCrossCorrelations(const xt::xtensor<double, 1> &frequencies,
+                                 xt::xtensor<double, 3> &result,
+                                 const bool need_reshape = true) const {
     xt::xtensor<double, 1>::shape_type kSpectralCorrectionShape{
         frequencies.size()};
     xt::xtensor<double, 2>::shape_type kRotationCoefficientsShape{
@@ -128,7 +128,9 @@ public:
     xt::xtensor<double, 2> rotation_coefficients =
         xt::zeros<double>(kRotationCoefficientsShape);
 
-    result.resize({4, frequencies.size()});
+    if (need_reshape) {
+      result.resize({2, 4, frequencies.size()});
+    }
 
     if (spectral_terms_.size() > 0 && has_logarithmic_spectral_index_) {
       for (size_t f_idx = 0; f_idx < frequencies.size(); f_idx++) {
@@ -166,14 +168,14 @@ public:
     auto Q = I * xt::view(rotation_coefficients, 0, xt::all());
     auto U = I * xt::view(rotation_coefficients, 1, xt::all());
 
-    xt::real(xt::view(result, CrossCorrelation::XX, xt::all())) = I + Q;
-    xt::imag(xt::view(result, CrossCorrelation::XX, xt::all())) = 0.0;
-    xt::real(xt::view(result, CrossCorrelation::XY, xt::all())) = U;
-    xt::imag(xt::view(result, CrossCorrelation::XY, xt::all())) = V;
-    xt::real(xt::view(result, CrossCorrelation::YX, xt::all())) = U;
-    xt::imag(xt::view(result, CrossCorrelation::YX, xt::all())) = -V;
-    xt::real(xt::view(result, CrossCorrelation::YY, xt::all())) = I - Q;
-    xt::imag(xt::view(result, CrossCorrelation::YY, xt::all())) = 0.0;
+    xt::view(result, 0, CrossCorrelation::XX, xt::all()) = I + Q;
+    xt::view(result, 1, CrossCorrelation::XX, xt::all()) = 0.0;
+    xt::view(result, 0, CrossCorrelation::XY, xt::all()) = U;
+    xt::view(result, 1, CrossCorrelation::XY, xt::all()) = V;
+    xt::view(result, 0, CrossCorrelation::YX, xt::all()) = U;
+    xt::view(result, 1, CrossCorrelation::YX, xt::all()) = -V;
+    xt::view(result, 0, CrossCorrelation::YY, xt::all()) = I - Q;
+    xt::view(result, 1, CrossCorrelation::YY, xt::all()) = 0.0;
   }
 
 private:
diff --git a/include/test/Common.h b/include/test/Common.h
index 62c305c80528a860caa0a753cf7a5c5d12865b84..9ab13c43844a758ee9064545c5d5056b0dda6caf 100644
--- a/include/test/Common.h
+++ b/include/test/Common.h
@@ -1,9 +1,10 @@
 #ifndef PREDICT_BENCHMARK_COMMON_HPP
 #define PREDICT_BENCHMARK_COMMON_HPP
 
+#include "Directions.h"
 #include "GaussianSourceCollection.h"
 #include "PointSourceCollection.h"
-#include "PredictPlanExec.h"
+#include "PredictPlanExecCPU.h"
 #include <complex>
 #include <random>
 #include <vector>
@@ -55,26 +56,46 @@ struct PredictRun {
     plan.channel_widths = std::vector<double>(plan.nchannels, 1.0e6);
 
     plan.baselines.resize(plan.nbaselines);
-    buffer.resize({plan.nstokes, plan.nchannels, plan.nbaselines});
+    buffer = xt::xtensor<double, 4, xt::layout_type::row_major>(
+        {2, plan.nstokes, plan.nbaselines, plan.nchannels}, 0.0);
   }
 
   void Run(const PointSourceCollection &sources) {
-    PredictPlanExec plan_exec{plan};
+    PredictPlanExecCPU plan_exec{plan};
     Predict pred;
     pred.run(plan_exec, sources, buffer);
   }
 
   void Run(const GaussianSourceCollection &sources) {
-    PredictPlanExec plan_exec{plan};
+    PredictPlanExecCPU plan_exec{plan};
     Predict pred;
     pred.run(plan_exec, sources, buffer);
   }
 
+  void RunWithStrategy(const PointSourceCollection &sources,
+                       const computation_strategy strat) {
+    PredictPlanExecCPU plan_exec{plan};
+    Predict pred;
+    pred.runWithStrategy(plan_exec, sources, buffer, strat);
+  }
+
+  void RunWithStrategy(const GaussianSourceCollection &sources,
+                       const computation_strategy strat) {
+    PredictPlanExecCPU plan_exec{plan};
+    Predict pred;
+    pred.runWithStrategy(plan_exec, sources, buffer, strat);
+  }
+
+  void Clean() {
+    buffer = xt::xtensor<double, 4, xt::layout_type::row_major>(
+        {2, plan.nstokes, plan.nbaselines, plan.nchannels}, 0.0);
+  }
+
   template <typename SourceType> SourceType makeSource() const {
     return SourceType{offset_source, Stokes{1.0, 0.0, 0.0, 0.0}};
   }
 
-  xt::xtensor<dcomplex, 3> buffer;
+  xt::xtensor<double, 4, xt::layout_type::row_major> buffer;
 
   Direction reference_point;
   Direction offset_source;
diff --git a/run_bench.sh b/run_bench.sh
new file mode 100755
index 0000000000000000000000000000000000000000..c651bae99763a5739e43eb6b5a996ba12c3fdb8c
--- /dev/null
+++ b/run_bench.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Check if the script is being run with two or three arguments
+if [ "$#" -lt 2 ] || [ "$#" -gt 3 ]; then
+    echo "Usage: $0 <architecture> <compiler_version> [--bench-only-sources]"
+    exit 1
+fi
+
+module load python
+
+ARCHITECTURE=$1
+COMPILER_VERSION=$2
+FILTER=""
+
+# Check if the --bench-only-sources flag is set. If so, append the Google benchmark filter
+if [ "$3" == "--bench-only-sources" ]; then
+    BENCH_ONLY_SOURCES=true
+    FILTER="PredictBenchmark/PointSource PredictBenchmark/GaussianSource"
+else
+    BENCH_ONLY_SOURCES=false
+fi
+
+echo "Running on compiler version: ${COMPILER_VERSION}, architecture: ${ARCHITECTURE}, hostname: $(hostname), filter: ${FILTER}"
+
+sbatch --wait -C ${ARCHITECTURE} -o output.txt -e error.txt ci/das6/compile_and_run.sh ${ARCHITECTURE} ${COMPILER_VERSION} ${FILTER}
+cat output.txt >&1
+cat error.txt >&2
+
+python3 ci/summarize-results.py --filter PredictBenchmark/PointSource results*.json result-summary-pointsource
+python3 ci/summarize-results.py --filter PredictBenchmark/GaussianSource results*.json result-summary-gaussian
+
+if [ "$BENCH_ONLY_SOURCES" == false ]; then
+    python3 ci/summarize-results.py --filter DirectionsBenchmark/Directions results*.json result-summary-directions
+    python3 ci/summarize-results.py --filter PhasesBenchmark/ComputePhases results*.json result-summary-phases
+    python3 ci/summarize-results.py --filter SpectrumBenchmark/Spectrum results*.json result-summary-spectrum
+    python3 ci/summarize-results.py --filter SmearTermsBenchmark/SmearTermBenchmark results*.json result-summary-smearterms
+fi
\ No newline at end of file
diff --git a/src/PointSource.cpp b/src/PointSource.cpp
index 4b1be08f09026ad15287420e4edf913de71ed20b..7b71cf1de851da8fbe15d932ac6c29fa999731ab 100644
--- a/src/PointSource.cpp
+++ b/src/PointSource.cpp
@@ -28,9 +28,8 @@ void PointSource::SetDirection(const Direction &direction) {
   direction_ = direction;
 }
 
-void PointSource::ComputeSpectrum(
-    const xt::xtensor<double, 1> &frequencies,
-    xt::xtensor<std::complex<double>, 2> &result) const {
+void PointSource::ComputeSpectrum(const xt::xtensor<double, 1> &frequencies,
+                                  xt::xtensor<double, 3> &result) const {
   spectrum_.EvaluateCrossCorrelations(frequencies, result);
 }
 
diff --git a/src/Predict.cpp b/src/Predict.cpp
index a05045a49f4583a27e98bba51fb83a1875b7835a..6bb6b912bd360f136b9676c0a8100e0ca58a3f88 100644
--- a/src/Predict.cpp
+++ b/src/Predict.cpp
@@ -10,29 +10,50 @@
 #include <casacore/casa/BasicSL/Constants.h>
 
 #include <cmath>
+#include <xtensor/xlayout.hpp>
 
+#include <tracy/Tracy.hpp>
+
+#include "Directions.h"
 #include "GaussianSourceCollection.h"
 #include "PointSourceCollection.h"
 #include "Predict.h"
-#include "PredictPlanExec.h"
+#include "PredictPlanExecCPU.h"
 namespace {} // namespace
 namespace dp3 {
 namespace base {
 
-void Predict::run(PredictPlanExec &plan, const PointSourceCollection &sources,
-                  xt::xtensor<dcomplex, 3> &buffer)
+void Predict::run(PredictPlanExecCPU &plan,
+                  const PointSourceCollection &sources,
+                  xt::xtensor<double, 4, xt::layout_type::row_major> &buffer)
     const { // buffer dimensions are (nCor, nFreq, nBaselines)
   plan.Precompute(sources);
   plan.Compute(sources, buffer);
 }
 
-void Predict::run(PredictPlanExec &plan,
+void Predict::run(PredictPlanExecCPU &plan,
                   const GaussianSourceCollection &sources,
-                  xt::xtensor<dcomplex, 3> &buffer)
+                  xt::xtensor<double, 4, xt::layout_type::row_major> &buffer)
     const { // buffer dimensions are (nCor, nFreq, nBaselines)
   plan.Precompute(sources);
   plan.Compute(sources, buffer);
 }
 
+void Predict::runWithStrategy(
+    PredictPlanExecCPU &plan, const PointSourceCollection &sources,
+    xt::xtensor<double, 4, xt::layout_type::row_major> &buffer,
+    const computation_strategy strat) {
+  plan.Precompute(sources);
+  plan.ComputeWithTarget(sources, buffer, strat);
+}
+
+void Predict::runWithStrategy(
+    PredictPlanExecCPU &plan, const GaussianSourceCollection &sources,
+    xt::xtensor<double, 4, xt::layout_type::row_major> &buffer,
+    const computation_strategy strat) {
+  plan.Precompute(sources);
+  plan.ComputeWithTarget(sources, buffer, strat);
+}
+
 } // namespace base
 } // namespace dp3
diff --git a/src/PredictPlanExec.cpp b/src/PredictPlanExec.cpp
deleted file mode 100644
index 4b91cef87ceaa30ff94e9489098a4fc8c315713a..0000000000000000000000000000000000000000
--- a/src/PredictPlanExec.cpp
+++ /dev/null
@@ -1,439 +0,0 @@
-#include "PredictPlanExec.h"
-
-#include "Directions.h"
-#include "GaussianSourceCollection.h"
-#include "PointSourceCollection.h"
-#include "Spectrum.h"
-
-#include <xtensor-blas/xlinalg.hpp>
-#include <xtensor/xadapt.hpp>
-#include <xtensor/xindex_view.hpp>
-#include <xtensor/xtensor.hpp>
-#include <xtensor/xview.hpp>
-
-void PredictPlanExec::Initialize() {
-  // Initialize the station phases and shifts
-  station_phases.resize({nstations});
-  shift_data.resize({nstations, nchannels});
-  lmn.resize({nchannels, 3});
-  uvw.resize({nstations, 3});
-  frequencies.resize({nchannels});
-  smear_terms = xt::ones<float>({baselines.size(), nchannels});
-}
-
-void PredictPlanExec::Precompute(const PointSourceCollection &sources) {
-  sources.direction_vector.RaDec2Lmn(reference, lmn);
-  ComputeStationPhases();
-  if (correct_frequency_smearing)
-    ComputeSmearTerms();
-}
-
-/**
- * Compute station phase shifts.
- *
- * \f[ \mathrm{stationphases}(p) = \frac{2\pi}{c}((u_p, v_p, w_p) \cdot (\ell,
- * m, n)) \f]
- *
- * \f[ \mathrm{phases}(p) = e^{\mathrm{stationphases}(p)} \f]
- *
- * @param nStation Number of stations
- * @param nChannel Number of channels
- * @param lmn LMN coordinates of all sources.
- * @param uvw Station UVW coordinates, matrix of shape (3, nSt)
- * @param freq Channel frequencies, should be length nChannel
- * @param shift Output matrix (2 for real,imag), shift per station, matrix of
- * shape (3, nSt)
- * @param stationPhases Output vector, store per station \f$(x_1,y_1)\f$
- */
-void PredictPlanExec::ComputeStationPhases() {
-  // Compute station phases
-  const xt::xtensor<double, 1> lmn_offset = {lmn[0], lmn[1], lmn[2] - 1.0};
-  station_phases = xt::sum(uvw * lmn_offset * kCInv, 1);
-
-  xt::xtensor<double, 1> phase_terms;
-  phase_terms.resize({nchannels});
-
-  for (size_t st = 0; st < nstations; ++st) {
-    phase_terms = station_phases(st) * frequencies;
-
-    const auto sin_phase = xt::sin(phase_terms);
-    const auto cos_phase = xt::cos(phase_terms);
-
-    // Assign to shifts as std::complex
-    for (size_t ch = 0; ch < nchannels; ++ch) {
-      shift_data(st, ch) = std::complex<double>(cos_phase(ch), sin_phase(ch));
-    }
-  }
-}
-
-void PredictPlanExec::FillEulerMatrix(xt::xtensor<double, 2> &mat,
-                                      const double ra, const double dec) {
-  double sinra = sin(ra);
-  double cosra = cos(ra);
-  double sindec = sin(dec);
-  double cosdec = cos(dec);
-
-  mat(0, 0) = cosra;
-  mat(1, 0) = -sinra;
-  mat(2, 0) = 0;
-  mat(0, 1) = -sinra * sindec;
-  mat(1, 1) = -cosra * sindec;
-  mat(2, 1) = cosdec;
-  mat(0, 2) = sinra * cosdec;
-  mat(1, 2) = cosra * cosdec;
-  mat(2, 2) = sindec;
-}
-
-void PredictPlanExec::Compute(const PointSourceCollection &sources,
-                              xt::xtensor<std::complex<double>, 3> &buffer) {
-  xt::xtensor<std::complex<double>, 2> computed_spectrum;
-
-  for (size_t source_index = 0; source_index < sources.Size(); ++source_index) {
-    const Spectrum &spectrum = sources.spectrums[source_index];
-    spectrum.EvaluateCrossCorrelations(frequencies, computed_spectrum);
-
-    // Use preallocated storage (outside for loops)
-    std::vector<double> temp_prod_real_I(nchannels);
-    std::vector<double> temp_prod_imag_I(nchannels);
-
-    std::vector<double> temp_prod_real_Q;
-    std::vector<double> temp_prod_imag_Q;
-    std::vector<double> temp_prod_real_U;
-    std::vector<double> temp_prod_imag_U;
-    std::vector<double> temp_prod_real_V;
-    std::vector<double> temp_prod_imag_V;
-
-    // If needed, preallocate storage for Q,U,V
-    if (!compute_stokes_I_only) {
-      temp_prod_real_Q.resize(nchannels);
-      temp_prod_imag_Q.resize(nchannels);
-      temp_prod_real_U.resize(nchannels);
-      temp_prod_imag_U.resize(nchannels);
-      temp_prod_real_V.resize(nchannels);
-      temp_prod_imag_V.resize(nchannels);
-    }
-
-    // Compute visibilities.
-    // The following loop can be parallelized, but because there is already
-    // a parallelization over sources, this is not necessary.
-    // It used to be parallelized with a #pragma omp parallel for, but since
-    // the outer loop was also parallelized with omp, this had no effect
-    // (since omp by default doesn't parallelize nested loops). After the change
-    // to a ThreadPool, this would parallize each separate source, which started
-    // hundreds of threads on many-core machines.
-    for (size_t bl = 0; bl < baselines.size(); ++bl) {
-      const size_t p = baselines[bl].first;
-      const size_t q = baselines[bl].second;
-
-      if (p != q) {
-        if (compute_stokes_I_only) {
-#pragma GCC ivdep
-          for (size_t ch = 0; ch < nchannels; ++ch) {
-            // Note the notation:
-            // Each complex number is represented as (x + j*y)
-            // x: real part, y: imaginary part
-            // Subscripts _p and _q represent stations p and q, respectively
-            // Subscript _c represents channel (SpectrumBuffer)
-            const double x_p = (GetShiftData()(p, ch).real());
-            const double y_p = (GetShiftData()(p, ch).imag());
-            const double x_q = (GetShiftData()(q, ch).real());
-            const double y_q = (GetShiftData()(q, ch).imag());
-            const double x_c = computed_spectrum(StokesComponent::I, ch).real();
-            const double y_c = computed_spectrum(StokesComponent::I, ch).imag();
-
-            // Compute baseline phase shift.
-            // Compute visibilities.
-            const double q_conj_p_real = (x_p) * (x_q) + (y_p) * (y_q);
-            const double q_conj_p_imag = (x_p) * (y_q) - (x_q) * (y_p);
-            temp_prod_real_I[ch] = q_conj_p_real * (x_c)-q_conj_p_imag * (y_c);
-            temp_prod_imag_I[ch] =
-                q_conj_p_real * (y_c) + q_conj_p_imag * (x_c);
-          } // Channels.
-          if (correct_frequency_smearing) {
-#pragma GCC ivdep
-            for (size_t ch = 0; ch < nchannels; ++ch) {
-              buffer(StokesComponent::I, ch, bl) +=
-                  dcomplex(smear_terms(bl, ch) * temp_prod_real_I[ch],
-                           smear_terms(bl, ch) * temp_prod_imag_I[ch]);
-            }
-          } else {
-#pragma GCC ivdep
-            for (size_t ch = 0; ch < nchannels; ++ch) {
-              buffer(StokesComponent::I, ch, bl) +=
-                  dcomplex(temp_prod_real_I[ch], temp_prod_imag_I[ch]);
-            }
-          }
-        } else {
-#pragma GCC ivdep
-          for (size_t ch = 0; ch < nchannels; ++ch) {
-            const double x_p = (GetShiftData()(p, ch).real());
-            const double y_p = (GetShiftData()(p, ch).imag());
-            const double x_q = (GetShiftData()(q, ch).real());
-            const double y_q = (GetShiftData()(q, ch).imag());
-
-            const double x_c = computed_spectrum(StokesComponent::I, ch).real();
-            const double y_c = computed_spectrum(StokesComponent::I, ch).imag();
-            const double x_c2 =
-                computed_spectrum(StokesComponent::Q, ch).real();
-            const double y_c2 =
-                computed_spectrum(StokesComponent::Q, ch).imag();
-            const double x_c3 =
-                computed_spectrum(StokesComponent::U, ch).real();
-            const double y_c3 =
-                computed_spectrum(StokesComponent::U, ch).imag();
-            const double x_c4 =
-                computed_spectrum(StokesComponent::V, ch).real();
-            const double y_c4 =
-                computed_spectrum(StokesComponent::V, ch).imag();
-
-            // Compute baseline phase shift.
-            // Compute visibilities.
-            const double q_conj_p_real = (x_p) * (x_q) + (y_p) * (y_q);
-            const double q_conj_p_imag = (x_p) * (y_q) - (x_q) * (y_p);
-            temp_prod_real_I[ch] = q_conj_p_real * (x_c)-q_conj_p_imag * (y_c);
-            temp_prod_imag_I[ch] =
-                q_conj_p_real * (y_c) + q_conj_p_imag * (x_c);
-            temp_prod_real_Q[ch] =
-                q_conj_p_real * (x_c2)-q_conj_p_imag * (y_c2);
-            temp_prod_imag_Q[ch] =
-                q_conj_p_real * (y_c2) + q_conj_p_imag * (x_c2);
-            temp_prod_real_U[ch] =
-                q_conj_p_real * (x_c3)-q_conj_p_imag * (y_c3);
-            temp_prod_imag_U[ch] =
-                q_conj_p_real * (y_c3) + q_conj_p_imag * (x_c3);
-            temp_prod_real_V[ch] =
-                q_conj_p_real * (x_c4)-q_conj_p_imag * (y_c4);
-            temp_prod_imag_V[ch] =
-                q_conj_p_real * (y_c4) + q_conj_p_imag * (x_c4);
-          } // Channels.
-
-          if (correct_frequency_smearing) {
-#pragma GCC ivdep
-            for (size_t ch = 0; ch < nchannels; ++ch) {
-              buffer(StokesComponent::I, ch, bl) +=
-                  dcomplex(smear_terms(bl, ch) * temp_prod_real_I[ch],
-                           smear_terms(bl, ch) * temp_prod_imag_I[ch]);
-              buffer(StokesComponent::Q, ch, bl) +=
-                  dcomplex(smear_terms(bl, ch) * temp_prod_real_Q[ch],
-                           smear_terms(bl, ch) * temp_prod_imag_Q[ch]);
-              buffer(StokesComponent::U, ch, bl) +=
-                  dcomplex(smear_terms(bl, ch) * temp_prod_real_U[ch],
-                           smear_terms(bl, ch) * temp_prod_imag_U[ch]);
-              buffer(StokesComponent::V, ch, bl) +=
-                  dcomplex(smear_terms(bl, ch) * temp_prod_real_V[ch],
-                           smear_terms(bl, ch) * temp_prod_imag_V[ch]);
-            }
-          } else {
-#pragma GCC ivdep
-            for (size_t ch = 0; ch < nchannels; ++ch) {
-              buffer(StokesComponent::I, ch, bl) +=
-                  dcomplex(temp_prod_real_I[ch], temp_prod_imag_I[ch]);
-              buffer(StokesComponent::Q, ch, bl) +=
-                  dcomplex(temp_prod_real_Q[ch], temp_prod_imag_Q[ch]);
-              buffer(StokesComponent::U, ch, bl) +=
-                  dcomplex(temp_prod_real_U[ch], temp_prod_imag_U[ch]);
-              buffer(StokesComponent::V, ch, bl) +=
-                  dcomplex(temp_prod_real_V[ch], temp_prod_imag_V[ch]);
-            }
-          }
-        }
-      }
-    } // Baselines.
-  }
-}
-
-void PredictPlanExec::Compute(const GaussianSourceCollection &sources,
-                              xt::xtensor<std::complex<double>, 3> &buffer) {
-  const std::array<size_t, 2> kStationUvwShape = {get_uvw().shape(1),
-                                                  get_uvw().shape(0)};
-  xt::xtensor<std::complex<double>, 2> computed_spectrum;
-  xt::xtensor<double, 2> xtStationUvw;
-  xt::xtensor<double, 2> uvwShifted;
-  std::vector<double> angular_smear(nchannels);
-
-  for (size_t source_index = 0; source_index < sources.Size(); ++source_index) {
-    const Spectrum &spectrum = sources.spectrums[source_index];
-    const double position_angle = sources.position_angle[source_index];
-    const double major_axis = sources.major_axis[source_index];
-    const double minor_axis = sources.minor_axis[source_index];
-    const bool position_angle_is_absolute =
-        sources.position_angle_is_absolute[source_index];
-
-    spectrum.EvaluateCrossCorrelations(frequencies, computed_spectrum);
-
-    // Convert position angle from North over East to the angle used to
-    // rotate the right-handed UV-plane.
-    const double kPhi = M_PI_2 + position_angle + M_PI;
-    const double kCosPhi = cos(kPhi);
-    const double kSinPhi = sin(kPhi);
-
-    xtStationUvw = xt::adapt(get_uvw().data(), get_uvw().size(),
-                             xt::no_ownership(), kStationUvwShape);
-
-    if (position_angle_is_absolute) {
-      // Correct for projection and rotation effects: phase shift u, v, w to
-      // position of the source for evaluating the gaussian
-
-      xt::xtensor<double, 2> euler_matrix_phasecenter =
-          xt::zeros<double>({3, 3});
-
-      xt::xtensor<double, 2> euler_matrix_source = xt::zeros<double>({3, 3});
-
-      const Directions &directions = sources.direction_vector;
-
-      FillEulerMatrix(euler_matrix_phasecenter, reference.ra, reference.dec);
-      FillEulerMatrix(euler_matrix_source, directions.ra[source_index],
-                      directions.dec[source_index]);
-
-      xt::xtensor<double, 2> euler_matrix = xt::linalg::dot(
-          xt::transpose(euler_matrix_source), euler_matrix_phasecenter);
-      uvwShifted = xt::linalg::dot(euler_matrix, xtStationUvw);
-
-    } else {
-      uvwShifted = xtStationUvw;
-    }
-
-    // Take care of the conversion of axis lengths from FWHM in radians to
-    // sigma.
-    const double kFwhm2Sigma = 1.0 / (2.0 * std::sqrt(2.0 * std::log(2.0)));
-    const double kUScale = major_axis * kFwhm2Sigma;
-    const double kVScale = minor_axis * kFwhm2Sigma;
-    const double kInvCSqr = 1.0 / (casacore::C::c * casacore::C::c);
-
-    for (size_t bl = 0; bl < baselines.size(); ++bl) {
-      const size_t p = baselines[bl].first;
-      const size_t q = baselines[bl].second;
-
-      if (p != q) {
-        double u = uvwShifted(0, q);
-        double v = uvwShifted(1, q);
-
-        u -= uvwShifted(0, p);
-        v -= uvwShifted(1, p);
-
-        // Rotate (u, v) by the position angle and scale with the major
-        // and minor axis lengths (FWHM in rad).
-        const double uPrime = kUScale * (u * kCosPhi - v * kSinPhi);
-        const double vPrime = kVScale * (u * kSinPhi + v * kCosPhi);
-
-        // Compute uPrime^2 + vPrime^2 and pre-multiply with -2.0 * PI^2
-        // / C^2.
-        const double uvPrime =
-            (-2.0 * M_PI * M_PI) * (uPrime * uPrime + vPrime * vPrime);
-
-        for (size_t ch = 0; ch < nchannels; ++ch) {
-          const double lambda2 =
-              frequencies(ch) * frequencies(ch) * kInvCSqr * uvPrime;
-          angular_smear[ch] = exp(lambda2);
-        }
-
-        if (compute_stokes_I_only) {
-#pragma GCC ivdep
-          for (size_t ch = 0; ch < nchannels; ++ch) {
-            const double x_p = (GetShiftData()(p, ch).real());
-            const double y_p = (GetShiftData()(p, ch).imag());
-            const double x_q = (GetShiftData()(q, ch).real());
-            const double y_q = (GetShiftData()(q, ch).imag());
-            const double x_c =
-                (computed_spectrum(StokesComponent::I, ch).real());
-            const double y_c =
-                (computed_spectrum(StokesComponent::I, ch).imag());
-
-            // Compute baseline phase shift.
-            // Compute visibilities.
-            const double q_conj_p_real = (x_p) * (x_q) + (y_p) * (y_q);
-            const double q_conj_p_imag = (x_p) * (y_q) - (x_q) * (y_p);
-            const double temp_prod_real_I =
-                q_conj_p_real * (x_c)-q_conj_p_imag * (y_c);
-            const double temp_prod_imag_I =
-                q_conj_p_real * (y_c) + q_conj_p_imag * (x_c);
-            buffer(StokesComponent::I, ch, bl) += dcomplex(
-                angular_smear[ch] * smear_terms(bl, ch) * temp_prod_real_I,
-                angular_smear[ch] * smear_terms(bl, ch) * temp_prod_imag_I);
-          }
-        } else {
-#pragma GCC ivdep
-          for (size_t ch = 0; ch < nchannels; ++ch) {
-            const double x_p = (GetShiftData()(p, ch).real());
-            const double y_p = (GetShiftData()(p, ch).imag());
-            const double x_q = (GetShiftData()(q, ch).real());
-            const double y_q = (GetShiftData()(q, ch).imag());
-            const double x_c =
-                (computed_spectrum(StokesComponent::I, ch).real());
-            const double y_c =
-                (computed_spectrum(StokesComponent::I, ch).imag());
-            const double x_c2 =
-                (computed_spectrum(StokesComponent::Q, ch).real());
-            const double y_c2 =
-                (computed_spectrum(StokesComponent::Q, ch).imag());
-            const double x_c3 =
-                (computed_spectrum(StokesComponent::U, ch).real());
-            const double y_c3 =
-                (computed_spectrum(StokesComponent::U, ch).imag());
-            const double x_c4 =
-                (computed_spectrum(StokesComponent::V, ch).real());
-            const double y_c4 =
-                (computed_spectrum(StokesComponent::V, ch).imag());
-
-            // Compute baseline phase shift.
-            // Compute visibilities.
-            const double q_conj_p_real = (x_p) * (x_q) + (y_p) * (y_q);
-            const double q_conj_p_imag = (x_p) * (y_q) - (x_q) * (y_p);
-            const double temp_prod_real_I =
-                q_conj_p_real * (x_c)-q_conj_p_imag * (y_c);
-            const double temp_prod_imag_I =
-                q_conj_p_real * (y_c) + q_conj_p_imag * (x_c);
-            const double temp_prod_real_Q =
-                q_conj_p_real * (x_c2)-q_conj_p_imag * (y_c2);
-            const double temp_prod_imag_Q =
-                q_conj_p_real * (y_c2) + q_conj_p_imag * (x_c2);
-            const double temp_prod_real_U =
-                q_conj_p_real * (x_c3)-q_conj_p_imag * (y_c3);
-            const double temp_prod_imag_U =
-                q_conj_p_real * (y_c3) + q_conj_p_imag * (x_c3);
-            const double temp_prod_real_V =
-                q_conj_p_real * (x_c4)-q_conj_p_imag * (y_c4);
-            const double temp_prod_imag_V =
-                q_conj_p_real * (y_c4) + q_conj_p_imag * (x_c4);
-
-            // The following operations are memory-bound, unlike
-            // the compute-bound segment above. By merging these together,
-            // an improvement in speed is expected
-            buffer(StokesComponent::I, ch, bl) += dcomplex(
-                angular_smear[ch] * smear_terms(bl, ch) * temp_prod_real_I,
-                angular_smear[ch] * smear_terms(bl, ch) * temp_prod_imag_I);
-            buffer(StokesComponent::Q, ch, bl) += dcomplex(
-                angular_smear[ch] * smear_terms(bl, ch) * temp_prod_real_Q,
-                angular_smear[ch] * smear_terms(bl, ch) * temp_prod_imag_Q);
-            buffer(StokesComponent::U, ch, bl) += dcomplex(
-                angular_smear[ch] * smear_terms(bl, ch) * temp_prod_real_U,
-                angular_smear[ch] * smear_terms(bl, ch) * temp_prod_imag_U);
-            buffer(StokesComponent::V, ch, bl) += dcomplex(
-                angular_smear[ch] * smear_terms(bl, ch) * temp_prod_real_V,
-                angular_smear[ch] * smear_terms(bl, ch) * temp_prod_imag_V);
-          }
-        }
-      }
-    } // Baselines.
-  }
-}
-void PredictPlanExec::ComputeSmearTerms() {
-  // Compute the smear terms
-  size_t p = 0UL;
-  size_t q = 0UL;
-  std::vector<float> channel_floats(channel_widths.size());
-  std::copy(channel_widths.begin(), channel_widths.end(),
-            channel_floats.begin());
-  for (size_t bl = 0; bl < baselines.size(); ++bl) {
-    std::tie(p, q) = baselines[bl];
-    const float phase_diff =
-        static_cast<float>(station_phases[q] - station_phases[p]);
-    if (abs(phase_diff) > 1.e-6) [[likely]] {
-#pragma omp simd
-      for (size_t ch = 0; ch < nchannels; ch++) {
-        const float shift = phase_diff * channel_floats[ch] * 0.5f;
-        smear_terms(bl, ch) = std::fabs(sinf(shift) / shift);
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/src/PredictPlanExecCPU.cpp b/src/PredictPlanExecCPU.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f41d074b99e3d5f155b1d3fbc9aa07a27c81673e
--- /dev/null
+++ b/src/PredictPlanExecCPU.cpp
@@ -0,0 +1,723 @@
+#include "PredictPlanExecCPU.h"
+
+#include "Directions.h"
+#include "GaussianSourceCollection.h"
+#include "PointSourceCollection.h"
+#include "PredictPlan.h"
+#include "Spectrum.h"
+
+#include <tracy/Tracy.hpp>
+
+#include <cstdint>
+#include <immintrin.h>
+
+#include <iomanip>
+#include <iostream>
+#include <thread>
+#include <xtensor-blas/xlinalg.hpp>
+#include <xtensor/xadapt.hpp>
+#include <xtensor/xindex_view.hpp>
+#include <xtensor/xlayout.hpp>
+#include <xtensor/xtensor.hpp>
+#include <xtensor/xtensor_forward.hpp>
+#include <xtensor/xview.hpp>
+
+void PredictPlanExecCPU::Initialize() {
+  // Initialize the station phases and shifts
+  station_phases.resize({nstations});
+  shift_data.resize({2, nstations, nchannels});
+  lmn.resize({nchannels, 3});
+  uvw.resize({nstations, 3});
+  frequencies.resize({nchannels});
+
+  smear_terms = xt::ones<float>({baselines.size(), nchannels});
+}
+
+void PredictPlanExecCPU::Precompute(const PointSourceCollection &sources) {
+  FrameMark;
+  {
+    // ZoneScoped; // Mark the scope with Tracy
+
+    sources.direction_vector.RaDec2Lmn(reference, lmn);
+    ComputeStationPhases();
+
+    // Precompute smearing factors if needed
+    if (correct_frequency_smearing) {
+      ComputeSmearTerms();
+    }
+  }
+}
+
+/**
+ * Compute station phase shifts.
+ *
+ * \f[ \mathrm{stationphases}(p) = \frac{2\pi}{c}((u_p, v_p, w_p) \cdot (\ell,
+ * m, n)) \f]
+ *
+ * \f[ \mathrm{phases}(p) = e^{\mathrm{stationphases}(p)} \f]
+ *
+ * @param nStation Number of stations
+ * @param nChannel Number of channels
+ * @param lmn LMN coordinates of all sources.
+ * @param uvw Station UVW coordinates, matrix of shape (3, nSt)
+ * @param freq Channel frequencies, should be length nChannel
+ * @param shift Output matrix (2 for real,imag), shift per station, matrix of
+ * shape (3, nSt)
+ * @param stationPhases Output vector, store per station \f$(x_1,y_1)\f$
+ */
+void PredictPlanExecCPU::ComputeStationPhases() {
+  // Compute station phases
+  const xt::xtensor<double, 1> lmn_offset = {lmn[0], lmn[1], lmn[2] - 1.0};
+  station_phases = xt::sum(uvw * lmn_offset * kCInv, 1);
+
+  xt::xtensor<double, 1> phase_terms;
+  phase_terms.resize({nchannels});
+
+  for (size_t st = 0; st < nstations; ++st) {
+    phase_terms = station_phases(st) * frequencies;
+
+    const auto sin_phase = xt::sin(phase_terms);
+    const auto cos_phase = xt::cos(phase_terms);
+
+    // Assign to shifts as std::complex
+    for (size_t ch = 0; ch < nchannels; ++ch) {
+      shift_data(0, st, ch) = cos_phase(ch);
+      shift_data(1, st, ch) = sin_phase(ch);
+    }
+  }
+}
+
+void PredictPlanExecCPU::FillEulerMatrix(xt::xtensor<double, 2> &mat,
+                                         const double ra, const double dec) {
+  double sinra = sin(ra);
+  double cosra = cos(ra);
+  double sindec = sin(dec);
+  double cosdec = cos(dec);
+
+  mat(0, 0) = cosra;
+  mat(1, 0) = -sinra;
+  mat(2, 0) = 0;
+  mat(0, 1) = -sinra * sindec;
+  mat(1, 1) = -cosra * sindec;
+  mat(2, 1) = cosdec;
+  mat(0, 2) = sinra * cosdec;
+  mat(1, 2) = cosra * cosdec;
+  mat(2, 2) = sindec;
+}
+
+template <class E, class S>
+void PredictPlanExecCPU::ProcessPolarizationComponentSingle(
+    E &stoke_buffer, const S &stoke_spectrum) {
+  // Use preallocated storage (outside for loops)
+  std::vector<double> temp_prod_real(nchannels);
+  std::vector<double> temp_prod_imag(nchannels);
+
+  const auto &shift_data = GetShiftData(); // Cache reference
+
+  for (size_t bl = 0; bl < baselines.size(); ++bl) {
+    const size_t p = baselines[bl].first;
+    const size_t q = baselines[bl].second;
+
+    // Compute visibilities.
+    // The following loop can be parallelized, but because there is already
+    // a parallelization over sources, this is not necessary.
+    // It used to be parallelized with a #pragma omp parallel for, but since
+    // the outer loop was also parallelized with omp, this had no effect
+    // (since omp by default doesn't parallelize nested loops). After the
+    // change to a ThreadPool, this would parallize each separate source,
+    // which started hundreds of threads on many-core machines.
+
+    for (size_t ch = 0; ch < nchannels; ++ch) {
+      // Note the notation:
+      // Each complex number is represented as (x + j*y)
+      // x: real part, y: imaginary part
+      // Subscripts _p and _q represent stations p and q, respectively
+      // Subscript _c represents channel (SpectrumBuffer)
+      const double x_p = (shift_data(0, p, ch));
+      const double y_p = (shift_data(1, p, ch));
+      const double x_q = (shift_data(0, q, ch));
+      const double y_q = (shift_data(1, q, ch));
+      const double x_c = stoke_spectrum(0, ch);
+      const double y_c = stoke_spectrum(1, ch);
+
+      // Compute baseline phase shift.
+      // Compute visibilities.
+      const double q_conj_p_real = (x_p) * (x_q) + (y_p) * (y_q);
+      const double q_conj_p_imag = (x_p) * (y_q) - (x_q) * (y_p);
+      temp_prod_real[ch] = q_conj_p_real * (x_c)-q_conj_p_imag * (y_c);
+      temp_prod_imag[ch] = q_conj_p_real * (y_c) + q_conj_p_imag * (x_c);
+    } // Channels.
+    if (correct_frequency_smearing) {
+      for (size_t ch = 0; ch < nchannels; ++ch) {
+        const double smear_term = smear_terms(bl, ch);
+        stoke_buffer(0, bl, ch) += smear_term * temp_prod_real[ch];
+        stoke_buffer(1, bl, ch) += smear_term * temp_prod_imag[ch];
+      }
+    } else {
+      for (size_t ch = 0; ch < nchannels; ++ch) {
+        stoke_buffer(0, bl, ch) += temp_prod_real[ch];
+        stoke_buffer(1, bl, ch) += temp_prod_imag[ch];
+      }
+    }
+  }
+}
+
+inline void store_and_add(double *const buffer, const __m256d &new_values) {
+  // Load existing values from buffer
+  __m256d existing = _mm256_loadu_pd(buffer);
+  __m256d result = _mm256_add_pd(existing, new_values);
+  _mm256_storeu_pd(buffer, result);
+}
+
+template <class E, class S, class Tag, class Arch>
+void PredictPlanExecCPU::XTensorDispatch::operator()(Arch, E &buffer_expr,
+                                                     const S &spectrum,
+                                                     Tag) const {
+  using b_type = xsimd::batch<double, Arch>;
+  size_t inc = b_type::size;
+  size_t size = parent.nchannels;
+  // size for which the vectorization is possible
+  size_t vec_size = size - size % inc;
+
+  const auto &shift_data = parent.GetShiftData(); // Cache reference
+
+  const double *const spectrum_real_ptr = &(spectrum.unchecked(0, 0));
+  const double *const spectrum_imag_ptr = &(spectrum.unchecked(1, 0));
+
+  for (size_t bl = 0; bl < parent.baselines.size(); ++bl) {
+    double *const output_real_ptr = &(buffer_expr.unchecked(0, bl, 0));
+    double *const output_imag_ptr = &(buffer_expr.unchecked(1, bl, 0));
+
+    for (size_t i = 0; i < vec_size; i += inc) {
+      const b_type spectrum_real = b_type::load(spectrum_real_ptr + i, Tag());
+      const b_type spectrum_imag = b_type::load(spectrum_imag_ptr + i, Tag());
+
+      const size_t p = parent.baselines[bl].first;
+      const size_t q = parent.baselines[bl].second;
+
+      // Load shift data for p and q
+      const b_type x_p = b_type::load(&shift_data.unchecked(0, p, i), Tag());
+      const b_type y_p = b_type::load(&shift_data.unchecked(1, p, i), Tag());
+      const b_type x_q = b_type::load(&shift_data.unchecked(0, q, i), Tag());
+      const b_type y_q = b_type::load(&shift_data.unchecked(1, q, i), Tag());
+
+      const b_type real_part =
+          xsimd::add(xsimd::mul(x_p, x_q), xsimd::mul(y_p, y_q));
+      const b_type imag_part =
+          xsimd::sub(xsimd::mul(x_p, y_q), xsimd::mul(x_q, y_p));
+
+      // Compute the product with the spectrum
+      const b_type temp_real = xsimd::sub(xsimd::mul(real_part, spectrum_real),
+                                          xsimd::mul(imag_part, spectrum_imag));
+      const b_type temp_imag = xsimd::add(xsimd::mul(real_part, spectrum_imag),
+                                          xsimd::mul(imag_part, spectrum_real));
+      const b_type buffer_real = b_type::load(output_real_ptr + i, Tag());
+      const b_type buffer_imag = b_type::load(output_imag_ptr + i, Tag());
+
+      if (parent.correct_frequency_smearing) {
+        // Apply smearing terms
+        const b_type smear_terms_temp =
+            b_type::load(&parent.smear_terms.unchecked(bl, i), Tag());
+
+        xsimd::store(
+            output_real_ptr + i,
+            xsimd::add(xsimd::mul(temp_real, smear_terms_temp), buffer_real),
+            Tag());
+
+        xsimd::store(
+            output_imag_ptr + i,
+            xsimd::add(xsimd::mul(temp_imag, smear_terms_temp), buffer_imag),
+            Tag());
+      } else {
+        xsimd::store(output_real_ptr + i, xsimd::add(temp_real, buffer_real),
+                     Tag());
+
+        xsimd::store(output_imag_ptr + i, xsimd::add(temp_imag, buffer_imag),
+                     Tag());
+      }
+    }
+  }
+
+  // Remaining part that cannot be vectorized
+  for (size_t i = vec_size; i < size; ++i) {
+    for (size_t bl = 0; bl < parent.baselines.size(); ++bl) {
+      const size_t p = parent.baselines[bl].first;
+      const size_t q = parent.baselines[bl].second;
+
+      // Load shift data for p and q
+      const double x_p = shift_data(0, p, i);
+      const double y_p = shift_data(1, p, i);
+      const double x_q = shift_data(0, q, i);
+      const double y_q = shift_data(1, q, i);
+
+      const double real_part = x_p * x_q + y_p * y_q;
+      const double imag_part = x_p * y_q - x_q * y_p;
+
+      const double temp_real =
+          real_part * spectrum(0, i) - imag_part * spectrum(1, i);
+      const double temp_imag =
+          real_part * spectrum(1, i) + imag_part * spectrum(0, i);
+      const double buffer_real = buffer_expr(0, bl, i);
+      const double buffer_imag = buffer_expr(1, bl, i);
+
+      if (parent.correct_frequency_smearing) {
+        // Apply smearing terms
+        const double smear_terms_temp = parent.smear_terms(bl, i);
+
+        buffer_expr(0, bl, i) = temp_real * smear_terms_temp + buffer_real;
+        buffer_expr(1, bl, i) = temp_imag * smear_terms_temp + buffer_imag;
+      } else {
+        buffer_expr(0, bl, i) = temp_real + buffer_real;
+        buffer_expr(1, bl, i) = temp_imag + buffer_imag;
+      }
+    }
+  }
+}
+
+template <class E, class S>
+void PredictPlanExecCPU::ProcessPolarizationComponentXtensor(
+    E &buffer_expr, const S &spectrum) {
+  xsimd::dispatch(xtensor_dispatch_)(buffer_expr, spectrum,
+                                     xsimd::unaligned_mode());
+}
+
+template <class E, class S>
+void PredictPlanExecCPU::ProcessPolarizationComponentMultiSimd(
+    E &buffer_expr, const S &spectrum) {
+  // Precompute number of channels that can be processed in parallel
+  const uint_fast32_t simd_channels = (nchannels / 4) * 4;
+  const uint_fast32_t total_vector_size = simd_channels * 4;
+
+  // std::vector<double> channel_accum(simd_channels, 0.0);
+
+  for (uint_fast32_t bl = 0; bl < baselines.size(); ++bl) {
+    const size_t p = baselines[bl].first;
+    const size_t q = baselines[bl].second;
+
+    double real_accum = 0.0, imag_accum = 0.0;
+
+    const auto &shift_data = GetShiftData(); // Cache reference
+
+    uint_fast32_t ch = 0;
+
+    const double *const smear_terms_ptr = &(smear_terms.unchecked(bl, 0));
+    const double *const spectrum_real_ptr = &(spectrum.unchecked(0, 0));
+    const double *const spectrum_imag_ptr = &(spectrum.unchecked(1, 0));
+    double *const output_real_ptr = &(buffer_expr.unchecked(0, bl, 0));
+    double *const output_imag_ptr = &(buffer_expr.unchecked(1, bl, 0));
+
+    if (correct_frequency_smearing) {
+      for (; ch < simd_channels; ch += 4) {
+        // Load 4 values from shift_data for p and q
+        __m256d x_p = _mm256_loadu_pd(&shift_data.unchecked(0, p, ch));
+        __m256d y_p = _mm256_loadu_pd(&shift_data.unchecked(1, p, ch));
+        __m256d x_q = _mm256_loadu_pd(&shift_data.unchecked(0, q, ch));
+        __m256d y_q = _mm256_loadu_pd(&shift_data.unchecked(1, q, ch));
+        __m256d x_c = _mm256_loadu_pd(spectrum_real_ptr + ch);
+        __m256d y_c = _mm256_loadu_pd(spectrum_imag_ptr + ch);
+
+        // Complex multiply (q_conj * p) * stokes
+        // FIXME: maybe used a fused multiply add
+        __m256d real_part =
+            _mm256_add_pd(_mm256_mul_pd(x_p, x_q), _mm256_mul_pd(y_p, y_q));
+        __m256d imag_part =
+            _mm256_sub_pd(_mm256_mul_pd(x_p, y_q), _mm256_mul_pd(x_q, y_p));
+
+        __m256d temp_real = _mm256_sub_pd(_mm256_mul_pd(real_part, x_c),
+                                          _mm256_mul_pd(imag_part, y_c));
+        __m256d temp_imag = _mm256_add_pd(_mm256_mul_pd(real_part, y_c),
+                                          _mm256_mul_pd(imag_part, x_c));
+
+        __m256d smear_terms_temp = _mm256_loadu_pd(smear_terms_ptr + ch);
+        temp_real = _mm256_mul_pd(temp_real, smear_terms_temp);
+        temp_imag = _mm256_mul_pd(temp_imag, smear_terms_temp);
+
+        store_and_add(output_real_ptr + ch, temp_real);
+        store_and_add(output_imag_ptr + ch, temp_imag);
+      }
+    } else {
+      for (; ch < simd_channels; ch += 4) {
+        // Load 4 values from shift_data for p and q
+        __m256d x_p = _mm256_loadu_pd(&shift_data.unchecked(0, p, ch));
+        __m256d y_p = _mm256_loadu_pd(&shift_data.unchecked(1, p, ch));
+        __m256d x_q = _mm256_loadu_pd(&shift_data.unchecked(0, q, ch));
+        __m256d y_q = _mm256_loadu_pd(&shift_data.unchecked(1, q, ch));
+        __m256d x_c = _mm256_loadu_pd(spectrum_real_ptr + ch);
+        __m256d y_c = _mm256_loadu_pd(spectrum_imag_ptr + ch);
+
+        // Complex multiply (q_conj * p) * stokes
+        __m256d real_part =
+            _mm256_add_pd(_mm256_mul_pd(x_p, x_q), _mm256_mul_pd(y_p, y_q));
+        __m256d imag_part =
+            _mm256_sub_pd(_mm256_mul_pd(x_p, y_q), _mm256_mul_pd(x_q, y_p));
+
+        __m256d temp_real = _mm256_sub_pd(_mm256_mul_pd(real_part, x_c),
+                                          _mm256_mul_pd(imag_part, y_c));
+        __m256d temp_imag = _mm256_add_pd(_mm256_mul_pd(real_part, y_c),
+                                          _mm256_mul_pd(imag_part, x_c));
+
+        store_and_add(output_real_ptr + ch, temp_real);
+        store_and_add(output_imag_ptr + ch, temp_imag);
+      }
+    }
+
+    for (; ch < nchannels; ++ch) {
+      const double x_p = (shift_data(0, p, ch));
+      const double y_p = (shift_data(1, p, ch));
+      const double x_q = (shift_data(0, q, ch));
+      const double y_q = (shift_data(1, q, ch));
+      const double x_c = spectrum(0, ch);
+      const double y_c = spectrum(1, ch);
+
+      // Compute baseline phase shift.
+      // Compute visibilities.
+      const double q_conj_p_real = (x_p) * (x_q) + (y_p) * (y_q);
+      const double q_conj_p_imag = (x_p) * (y_q) - (x_q) * (y_p);
+
+      const double temp_prod_real = q_conj_p_real * (x_c)-q_conj_p_imag * (y_c);
+      const double temp_prod_imag =
+          q_conj_p_real * (y_c) + q_conj_p_imag * (x_c);
+
+      if (correct_frequency_smearing) {
+        const double smear_term = smear_terms(bl, ch);
+        // buffer_expr(bl, ch) +=
+        //     dcomplex{temp_prod_real * smear_term, temp_prod_imag *
+        //     smear_term};
+        buffer_expr(0, bl, ch) += (temp_prod_real * smear_term);
+        buffer_expr(1, bl, ch) += (temp_prod_imag * smear_term);
+
+      } else {
+        // buffer_expr(bl, ch) += dcomplex{temp_prod_real, temp_prod_imag};
+        buffer_expr(0, bl, ch) += temp_prod_real;
+        buffer_expr(1, bl, ch) += temp_prod_imag;
+      }
+    }
+  }
+}
+
+void PredictPlanExecCPU::Compute(
+    const PointSourceCollection &sources,
+    xt::xtensor<double, 4, xt::layout_type::row_major> &buffer) {
+  xt::xtensor<double, 3> computed_spectrum({2, 4, frequencies.size()});
+
+  const auto spectrum_XX = xt::view(computed_spectrum, xt::all(), 0, xt::all());
+  const auto spectrum_XY = xt::view(computed_spectrum, xt::all(), 1, xt::all());
+  const auto spectrum_YX = xt::view(computed_spectrum, xt::all(), 2, xt::all());
+  const auto spectrum_YY = xt::view(computed_spectrum, xt::all(), 3, xt::all());
+
+  auto buffer_XX = xt::view(buffer, xt::all(), 0, xt::all(), xt::all());
+  auto buffer_XY = xt::view(buffer, xt::all(), 1, xt::all(), xt::all());
+  auto buffer_YX = xt::view(buffer, xt::all(), 2, xt::all(), xt::all());
+  auto buffer_YY = xt::view(buffer, xt::all(), 3, xt::all(), xt::all());
+
+  for (size_t source_index = 0; source_index < sources.Size(); ++source_index) {
+    const Spectrum &spectrum = sources.spectrums[source_index];
+    spectrum.EvaluateCrossCorrelations(frequencies, computed_spectrum, false);
+
+    if (compute_stokes_I_only) {
+      ProcessPolarizationComponent(buffer_XX, spectrum_XX);
+    } else {
+      ProcessPolarizationComponent(buffer_XX, spectrum_XX);
+      ProcessPolarizationComponent(buffer_XY, spectrum_XY);
+      ProcessPolarizationComponent(buffer_YX, spectrum_YX);
+      ProcessPolarizationComponent(buffer_YY, spectrum_YY);
+    }
+
+  } // Baselines.
+}
+
+void PredictPlanExecCPU::ComputeWithTarget(
+    const PointSourceCollection &sources,
+    xt::xtensor<double, 4, xt::layout_type::row_major> &buffer,
+    const computation_strategy strat) {
+  xt::xtensor<double, 3> computed_spectrum({2, 4, frequencies.size()});
+
+  const auto spectrum_XX = xt::view(computed_spectrum, xt::all(), 0, xt::all());
+  const auto spectrum_XY = xt::view(computed_spectrum, xt::all(), 1, xt::all());
+  const auto spectrum_YX = xt::view(computed_spectrum, xt::all(), 2, xt::all());
+  const auto spectrum_YY = xt::view(computed_spectrum, xt::all(), 3, xt::all());
+
+  auto buffer_XX = xt::view(buffer, xt::all(), 0, xt::all(), xt::all());
+  auto buffer_XY = xt::view(buffer, xt::all(), 1, xt::all(), xt::all());
+  auto buffer_YX = xt::view(buffer, xt::all(), 2, xt::all(), xt::all());
+  auto buffer_YY = xt::view(buffer, xt::all(), 3, xt::all(), xt::all());
+
+  for (size_t source_index = 0; source_index < sources.Size(); ++source_index) {
+    const Spectrum &spectrum = sources.spectrums[source_index];
+    spectrum.EvaluateCrossCorrelations(frequencies, computed_spectrum, false);
+
+    if (compute_stokes_I_only) {
+      ProcessPolarizationComponent(strat, buffer_XX, spectrum_XX);
+    } else {
+      ProcessPolarizationComponent(strat, buffer_XX, spectrum_XX);
+      ProcessPolarizationComponent(strat, buffer_XY, spectrum_XY);
+      ProcessPolarizationComponent(strat, buffer_YX, spectrum_YX);
+      ProcessPolarizationComponent(strat, buffer_YY, spectrum_YY);
+    }
+
+  } // Baselines.
+}
+
+void PredictPlanExecCPU::Compute(
+    const GaussianSourceCollection &sources,
+    xt::xtensor<double, 4, xt::layout_type::row_major> &buffer) {
+  const std::array<size_t, 2> kStationUvwShape = {GetUvw().shape(1),
+                                                  GetUvw().shape(0)};
+  xt::xtensor<double, 3> computed_spectrum;
+  xt::xtensor<double, 2> xtStationUvw;
+  xt::xtensor<double, 2> uvwShifted;
+  std::vector<double> angular_smear(nchannels);
+
+  for (size_t source_index = 0; source_index < sources.Size(); ++source_index) {
+    const Spectrum &spectrum = sources.spectrums[source_index];
+    const double position_angle = sources.position_angle[source_index];
+    const double major_axis = sources.major_axis[source_index];
+    const double minor_axis = sources.minor_axis[source_index];
+    const bool position_angle_is_absolute =
+        sources.position_angle_is_absolute[source_index];
+
+    spectrum.EvaluateCrossCorrelations(frequencies, computed_spectrum);
+
+    // Convert position angle from North over East to the angle used to
+    // rotate the right-handed UV-plane.
+    const double kPhi = M_PI_2 + position_angle + M_PI;
+    const double kCosPhi = cos(kPhi);
+    const double kSinPhi = sin(kPhi);
+
+    xtStationUvw = xt::adapt(GetUvw().data(), GetUvw().size(),
+                             xt::no_ownership(), kStationUvwShape);
+
+    const auto &shift_data = GetShiftData(); // Cache reference
+
+    if (position_angle_is_absolute) {
+      // Correct for projection and rotation effects: phase shift u, v, w to
+      // position of the source for evaluating the gaussian
+
+      xt::xtensor<double, 2> euler_matrix_phasecenter =
+          xt::zeros<double>({3, 3});
+
+      xt::xtensor<double, 2> euler_matrix_source = xt::zeros<double>({3, 3});
+
+      const Directions &directions = sources.direction_vector;
+
+      FillEulerMatrix(euler_matrix_phasecenter, reference.ra, reference.dec);
+      FillEulerMatrix(euler_matrix_source, directions.ra[source_index],
+                      directions.dec[source_index]);
+
+      xt::xtensor<double, 2> euler_matrix = xt::linalg::dot(
+          xt::transpose(euler_matrix_source), euler_matrix_phasecenter);
+      uvwShifted = xt::linalg::dot(euler_matrix, xtStationUvw);
+
+    } else {
+      uvwShifted = xtStationUvw;
+    }
+
+    // Take care of the conversion of axis lengths from FWHM in radians to
+    // sigma.
+    const double kFwhm2Sigma = 1.0 / (2.0 * std::sqrt(2.0 * std::log(2.0)));
+    const double kUScale = major_axis * kFwhm2Sigma;
+    const double kVScale = minor_axis * kFwhm2Sigma;
+    const double kInvCSqr = 1.0 / (casacore::C::c * casacore::C::c);
+
+    for (size_t bl = 0; bl < baselines.size(); ++bl) {
+      const size_t p = baselines[bl].first;
+      const size_t q = baselines[bl].second;
+
+      if (p != q) {
+        double u = uvwShifted(0, q);
+        double v = uvwShifted(1, q);
+
+        u -= uvwShifted(0, p);
+        v -= uvwShifted(1, p);
+
+        // Rotate (u, v) by the position angle and scale with the major
+        // and minor axis lengths (FWHM in rad).
+        const double uPrime = kUScale * (u * kCosPhi - v * kSinPhi);
+        const double vPrime = kVScale * (u * kSinPhi + v * kCosPhi);
+
+        // Compute uPrime^2 + vPrime^2 and pre-multiply with -2.0 * PI^2
+        // / C^2.
+        const double uvPrime =
+            (-2.0 * M_PI * M_PI) * (uPrime * uPrime + vPrime * vPrime);
+
+        for (size_t ch = 0; ch < nchannels; ++ch) {
+          const double lambda2 =
+              frequencies(ch) * frequencies(ch) * kInvCSqr * uvPrime;
+          angular_smear[ch] = exp(lambda2);
+        }
+
+        if (compute_stokes_I_only) {
+#pragma GCC ivdep
+          for (size_t ch = 0; ch < nchannels; ++ch) {
+            const double x_p = (shift_data(0, p, ch));
+            const double y_p = (shift_data(1, p, ch));
+            const double x_q = (shift_data(0, q, ch));
+            const double y_q = (shift_data(1, q, ch));
+            const double x_c = (computed_spectrum(0, StokesComponent::I, ch));
+            const double y_c = (computed_spectrum(1, StokesComponent::I, ch));
+
+            // Compute baseline phase shift.
+            // Compute visibilities.
+            const double q_conj_p_real = (x_p) * (x_q) + (y_p) * (y_q);
+            const double q_conj_p_imag = (x_p) * (y_q) - (x_q) * (y_p);
+            const double temp_prod_real_I =
+                q_conj_p_real * (x_c)-q_conj_p_imag * (y_c);
+            const double temp_prod_imag_I =
+                q_conj_p_real * (y_c) + q_conj_p_imag * (x_c);
+
+            const double channel_angular_smear = angular_smear[ch];
+
+            buffer(0, CrossCorrelation::XX, bl, ch) +=
+                channel_angular_smear * smear_terms(bl, ch) * temp_prod_real_I;
+            buffer(1, CrossCorrelation::XX, bl, ch) +=
+                channel_angular_smear * smear_terms(bl, ch) * temp_prod_imag_I;
+          }
+        } else {
+#pragma GCC ivdep
+          for (size_t ch = 0; ch < nchannels; ++ch) {
+            const double x_p = (shift_data(0, p, ch));
+            const double y_p = (shift_data(1, p, ch));
+            const double x_q = (shift_data(0, q, ch));
+            const double y_q = (shift_data(1, q, ch));
+            const double x_c = (computed_spectrum(0, CrossCorrelation::XX, ch));
+            const double y_c = (computed_spectrum(1, CrossCorrelation::XX, ch));
+            const double x_c2 =
+                (computed_spectrum(0, CrossCorrelation::XY, ch));
+            const double y_c2 =
+                (computed_spectrum(1, CrossCorrelation::XY, ch));
+            const double x_c3 =
+                (computed_spectrum(0, CrossCorrelation::YX, ch));
+            const double y_c3 =
+                (computed_spectrum(1, CrossCorrelation::YX, ch));
+            const double x_c4 =
+                (computed_spectrum(0, CrossCorrelation::YY, ch));
+            const double y_c4 =
+                (computed_spectrum(1, CrossCorrelation::YY, ch));
+
+            // Compute baseline phase shift.
+            // Compute visibilities.
+            const double q_conj_p_real = (x_p) * (x_q) + (y_p) * (y_q);
+            const double q_conj_p_imag = (x_p) * (y_q) - (x_q) * (y_p);
+            const double temp_prod_real_I =
+                q_conj_p_real * (x_c)-q_conj_p_imag * (y_c);
+            const double temp_prod_imag_I =
+                q_conj_p_real * (y_c) + q_conj_p_imag * (x_c);
+            const double temp_prod_real_Q =
+                q_conj_p_real * (x_c2)-q_conj_p_imag * (y_c2);
+            const double temp_prod_imag_Q =
+                q_conj_p_real * (y_c2) + q_conj_p_imag * (x_c2);
+            const double temp_prod_real_U =
+                q_conj_p_real * (x_c3)-q_conj_p_imag * (y_c3);
+            const double temp_prod_imag_U =
+                q_conj_p_real * (y_c3) + q_conj_p_imag * (x_c3);
+            const double temp_prod_real_V =
+                q_conj_p_real * (x_c4)-q_conj_p_imag * (y_c4);
+            const double temp_prod_imag_V =
+                q_conj_p_real * (y_c4) + q_conj_p_imag * (x_c4);
+
+            // The following operations are memory-bound, unlike
+            // the compute-bound segment above. By merging these together,
+            // an improvement in speed is expected
+            const double channel_angular_smear = angular_smear[ch];
+
+            buffer(0, CrossCorrelation::XX, bl, ch) +=
+                channel_angular_smear * smear_terms(bl, ch) * temp_prod_real_I;
+            buffer(1, CrossCorrelation::XX, bl, ch) +=
+                channel_angular_smear * smear_terms(bl, ch) * temp_prod_imag_I;
+
+            buffer(0, CrossCorrelation::XY, bl, ch) +=
+                channel_angular_smear * smear_terms(bl, ch) * temp_prod_real_Q;
+            buffer(1, CrossCorrelation::XY, bl, ch) +=
+                channel_angular_smear * smear_terms(bl, ch) * temp_prod_imag_Q;
+
+            buffer(0, CrossCorrelation::YX, bl, ch) +=
+                channel_angular_smear * smear_terms(bl, ch) * temp_prod_real_U;
+            buffer(1, CrossCorrelation::YX, bl, ch) +=
+                channel_angular_smear * smear_terms(bl, ch) * temp_prod_imag_U;
+
+            buffer(0, CrossCorrelation::YY, bl, ch) +=
+                channel_angular_smear * smear_terms(bl, ch) * temp_prod_real_V;
+            buffer(1, CrossCorrelation::YY, bl, ch) +=
+                channel_angular_smear * smear_terms(bl, ch) * temp_prod_imag_V;
+          }
+        }
+      }
+    } // Baselines.
+  }
+}
+
+void PredictPlanExecCPU::ComputeSmearTermsXtensor() {
+  // Compute the smear terms
+  size_t p = 0UL;
+  size_t q = 0UL;
+  std::vector<float> channel_floats(channel_widths.size());
+  std::copy(channel_widths.begin(), channel_widths.end(),
+            channel_floats.begin());
+
+  for (size_t bl = 0; bl < baselines.size(); ++bl) {
+    std::tie(p, q) = baselines[bl];
+    const float phase_diff =
+        static_cast<float>(station_phases[q] - station_phases[p]);
+    double *const smear_terms_ptr = &(smear_terms.unchecked(bl, 0));
+
+    if (abs(phase_diff) > 1.e-6) [[likely]] {
+#pragma omp simd
+      for (size_t ch = 0; ch < nchannels; ++ch) {
+        const float shift = phase_diff * channel_floats[ch] * 0.5f;
+        *(smear_terms_ptr + ch) = std::fabs(sinf(shift) / shift);
+      }
+    }
+  }
+}
+
+void PredictPlanExecCPU::ComputeSmearTermsSIMD() {
+  // Compute the smear terms
+  size_t p = 0UL;
+  size_t q = 0UL;
+  std::vector<double> channel_floats(channel_widths.size());
+  std::copy(channel_widths.begin(), channel_widths.end(),
+            channel_floats.begin());
+
+  const size_t simd_channels = (nchannels / 4) * 4;
+
+  for (size_t bl = 0; bl < baselines.size(); ++bl) {
+    std::tie(p, q) = baselines[bl];
+    const double phase_diff =
+        static_cast<double>(station_phases[q] - station_phases[p]);
+    double *const smear_terms_ptr = &(smear_terms.unchecked(bl, 0));
+    const double *const channel_floats_ptr = channel_floats.data();
+
+    if (abs(phase_diff) > 1.e-6) [[likely]] {
+      size_t ch = 0;
+
+      __m256d muliplier_vec = _mm256_set1_pd(0.5);
+      __m256d phase_diff_vec = _mm256_set1_pd(phase_diff);
+
+      for (; ch < simd_channels; ch += 4) {
+        __m256d channel_vec = _mm256_loadu_pd(channel_floats_ptr + ch);
+        __m256d shift_vec = _mm256_mul_pd(
+            muliplier_vec, _mm256_mul_pd(phase_diff_vec, channel_vec));
+        // __m256d fabs_vec = _mm256_abs_pd(div_vec);
+        _mm256_storeu_pd(smear_terms_ptr + ch, shift_vec);
+
+        const double val = *(smear_terms_ptr + ch + 0);
+        const double val1 = *(smear_terms_ptr + ch + 1);
+        const double val2 = *(smear_terms_ptr + ch + 2);
+        const double val3 = *(smear_terms_ptr + ch + 3);
+        *(smear_terms_ptr + ch + 0) = std::abs(std::sin(val)) / val;
+
+        *(smear_terms_ptr + ch + 1) = std::abs(std::sin(val1)) / val1;
+
+        *(smear_terms_ptr + ch + 2) = std::abs(std::sin(val2)) / val2;
+
+        *(smear_terms_ptr + ch + 3) = std::abs(std::sin(val3)) / val3;
+      }
+
+      for (; ch < nchannels; ++ch) {
+        const float shift =
+            static_cast<float>(phase_diff * channel_floats[ch] * 0.5);
+        *(smear_terms_ptr + ch) = std::fabs(sinf(shift) / shift);
+      }
+    }
+  }
+}
diff --git a/tests/test_predictplanexec.cpp b/tests/test_predictplanexeccpu.cpp
similarity index 91%
rename from tests/test_predictplanexec.cpp
rename to tests/test_predictplanexeccpu.cpp
index ea055206506995b207acd86ea7fe74a623c9dcf3..8f129dddb94898f07ae1f59d8c62fe1ac0b5ffa9 100644
--- a/tests/test_predictplanexec.cpp
+++ b/tests/test_predictplanexeccpu.cpp
@@ -3,13 +3,13 @@
 // SPDX-License-Identifier: Apache2.0
 
 #define BOOST_TEST_MODULE PREDICT_PLAN_EXEC_TEST
-#include <PredictPlanExec.h>
+#include <PredictPlanExecCPU.h>
 #include <boost/test/unit_test.hpp>
 #include <test/Common.h>
 
-BOOST_AUTO_TEST_SUITE(PredictPlanExecTestSuite)
+BOOST_AUTO_TEST_SUITE(PredictPlanExecCPUTestSuite)
 
-BOOST_AUTO_TEST_CASE(test_PredictPlanExec) { BOOST_CHECK(true); }
+BOOST_AUTO_TEST_CASE(test_PredictPlanExecCPU) { BOOST_CHECK(true); }
 
 BOOST_AUTO_TEST_CASE(compute_smear_terms) {
   static constexpr size_t test_n_stations = 4;
@@ -26,7 +26,7 @@ BOOST_AUTO_TEST_CASE(compute_smear_terms) {
 
   PredictPlan plan;
   PointSourceCollection sources;
-  PredictPlanExec sim_plan_exec(run->plan);
+  PredictPlanExecCPU sim_plan_exec(run->plan);
 
   for (size_t s = 0; s < test_n_sources; ++s) {
     sources.Add(run->makeSource<PointSource>());
@@ -65,7 +65,7 @@ BOOST_AUTO_TEST_CASE(dont_compute_smear_terms) {
 
   PredictPlan plan;
   PointSourceCollection sources;
-  PredictPlanExec sim_plan_exec(run->plan);
+  PredictPlanExecCPU sim_plan_exec(run->plan);
 
   for (size_t s = 0; s < test_n_sources; ++s) {
     sources.Add(run->makeSource<PointSource>());
diff --git a/tests/test_simulator.cpp b/tests/test_simulator.cpp
index 8b6f52bfeaaa6e60e7590e3d089c91ad646c98fe..ff309f690a2e79e3b13791a1702917a16430a82e 100644
--- a/tests/test_simulator.cpp
+++ b/tests/test_simulator.cpp
@@ -4,6 +4,7 @@
 #include "Directions.h"
 #include "GaussianSourceCollection.h"
 #include "PointSourceCollection.h"
+#include <boost/test/tools/old/interface.hpp>
 #include <xtensor/xtensor.hpp>
 
 #include "Predict.h"
@@ -11,7 +12,7 @@
 #include <Direction.h>
 #include <GaussianSource.h>
 #include <PointSource.h>
-#include <PredictPlanExec.h>
+#include <PredictPlanExecCPU.h>
 #include <Stokes.h>
 #include <xtensor/xtensor_forward.hpp>
 
@@ -24,6 +25,13 @@ namespace dp3 {
 namespace base {
 namespace test {
 
+inline double ConvToAbsComplex(
+    const xt::xtensor<double, 4, xt::layout_type::row_major> &buffer,
+    const size_t pol, const size_t bl, const size_t channel) {
+  return std::abs(std::complex<double>{buffer(0, pol, bl, channel),
+                                       buffer(1, pol, bl, channel)});
+}
+
 struct PredictTestFixture {
   PredictTestFixture() : predict_run(nullptr) {}
 
@@ -33,8 +41,8 @@ struct PredictTestFixture {
                        test_n_channels, onlyI, freqsmear);
   }
 
-  static constexpr size_t test_n_stations = 4;
-  static constexpr size_t test_n_channels = 2;
+  size_t test_n_stations = 4;
+  size_t test_n_channels = 2;
   const Direction test_reference_point{0.5, 0.1};
   const Direction test_offset_point{test_reference_point.ra + 0.02,
                                     test_reference_point.dec + 0.02};
@@ -52,9 +60,9 @@ BOOST_AUTO_TEST_CASE(pointsource_onlyI) {
   sources.Add(pointsource);
   predict_run->Run(sources);
 
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 0, 0)), 1.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 1.0,
                     1.0e-3); // Channel 0
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 1, 0)), 1.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 1, 0), 1.0,
                     1.0e-3); // Channel 1
 }
 
@@ -66,21 +74,21 @@ BOOST_AUTO_TEST_CASE(pointsource_fullstokes) {
   sources.Add(pointsource);
   predict_run->Run(sources);
 
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 0, 0)), 1.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 1.0,
                     1.0e-3); // Channel 0, XX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(1, 0, 0)), 0.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 1, 0, 0), 0.0,
                     1.0e-3); // Channel 0, XY
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(2, 0, 0)), 0.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 2, 0, 0), 0.0,
                     1.0e-3); // Channel 0, YX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(3, 0, 0)), 1.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 3, 0, 0), 1.0,
                     1.0e-3); // Channel 0, YY
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 1, 0)), 1.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 1, 0), 1.0,
                     1.0e-3); // Channel 1, XX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(1, 1, 0)), 0.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 1, 1, 0), 0.0,
                     1.0e-3); // Channel 1, XY
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(2, 1, 0)), 0.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 2, 1, 0), 0.0,
                     1.0e-3); // Channel 1, YX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(3, 1, 0)), 1.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 3, 1, 0), 1.0,
                     1.0e-3); // Channel 1, YY
 }
 
@@ -92,10 +100,86 @@ BOOST_AUTO_TEST_CASE(pointsource_onlyI_freqsmear) {
   sources.Add(pointsource);
   predict_run->Run(sources);
 
-  BOOST_CHECK_CLOSE(std::abs(predict_run->buffer(0, 0, test_n_stations - 1)),
-                    0.759154, 1.0e-3);
-  BOOST_CHECK_CLOSE(std::abs(predict_run->buffer(0, 1, test_n_stations - 1)),
-                    0.759154, 1.0e-3);
+  BOOST_CHECK_CLOSE(
+      ConvToAbsComplex(predict_run->buffer, 0, 0, test_n_channels - 1), 0.75915,
+      1.0e-3);
+  BOOST_CHECK_CLOSE(
+      ConvToAbsComplex(predict_run->buffer, 1, 0, test_n_channels - 1), 0.75915,
+      1.0e-3);
+}
+
+BOOST_AUTO_TEST_CASE(pointsource_fullstokes_freqsmear_all_strategies) {
+  test_n_stations = 16;
+  test_n_channels =
+      51; // Take an odd number of channels to verify boundary cases
+  SetupPredictRun(false, true);
+  PointSourceCollection sources;
+
+  // Generate multiple point sources
+  const size_t num_sources = 4;
+  for (size_t i = 0; i < num_sources; ++i) {
+    auto pointsource = predict_run->makeSource<PointSource>();
+    sources.Add(pointsource);
+  }
+
+  // Define computation strategies to test
+  const std::vector<computation_strategy> strategies = {
+      computation_strategy::SINGLE, computation_strategy::MULTI_SIMD,
+      computation_strategy::XSIMD};
+
+  const auto ref_buffer = [&]() {
+    predict_run->RunWithStrategy(sources, computation_strategy::SINGLE);
+    return predict_run->buffer;
+  }();
+
+  for (const auto &strategy : strategies) {
+    predict_run->Clean();
+    predict_run->RunWithStrategy(sources, strategy);
+
+    BOOST_TEST_CONTEXT("Testing strategy: " << static_cast<int>(strategy)) {
+      for (size_t bl = 0; bl < predict_run->plan.nbaselines; ++bl) {
+        for (size_t ch = 0; ch < predict_run->plan.nchannels; ++ch) {
+          for (size_t pol = 0; pol < 4; ++pol) {
+            BOOST_CHECK_CLOSE(
+                ConvToAbsComplex(ref_buffer, pol, bl, ch),
+                ConvToAbsComplex(predict_run->buffer, pol, bl, ch), 1.0e-3);
+          }
+        }
+      }
+    }
+  }
+}
+
+BOOST_AUTO_TEST_CASE(pointsource_fullstokes_freqsmear_xtensor) {
+  test_n_stations = 16;
+  test_n_channels =
+      51; // Take a odd number of channels to verify boundary cases
+  SetupPredictRun(false, true);
+  PointSourceCollection sources;
+
+  // Generate multiple point sources
+  const size_t num_sources = 4;
+  for (size_t i = 0; i < num_sources; ++i) {
+    auto pointsource = predict_run->makeSource<PointSource>();
+    sources.Add(pointsource);
+  }
+
+  predict_run->RunWithStrategy(sources, computation_strategy::SINGLE);
+
+  const auto ref_buffer = predict_run->buffer;
+  predict_run->Clean();
+
+  predict_run->RunWithStrategy(sources, computation_strategy::XSIMD);
+
+  for (size_t bl = 0; bl < predict_run->plan.nbaselines; ++bl) {
+    for (size_t ch = 0; ch < predict_run->plan.nchannels; ++ch) {
+      for (size_t pol = 0; pol < 4; ++pol) {
+        BOOST_CHECK_CLOSE(ConvToAbsComplex(ref_buffer, pol, bl, ch),
+                          ConvToAbsComplex(predict_run->buffer, pol, bl, ch),
+                          1.0e-3);
+      }
+    }
+  }
 }
 
 BOOST_AUTO_TEST_CASE(gaussiansource_onlyI) {
@@ -106,9 +190,9 @@ BOOST_AUTO_TEST_CASE(gaussiansource_onlyI) {
   sources.Add(gaussiansource);
   predict_run->Run(sources);
 
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 0, 0)), 1.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 1.0,
                     1.0e-3); // Channel 0
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 1, 0)), 1.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 1), 1.0,
                     1.0e-3); // Channel 1
 }
 
@@ -120,26 +204,25 @@ BOOST_AUTO_TEST_CASE(gaussiansource_fullstokes) {
   sources.Add(gaussiansource);
   predict_run->Run(sources);
 
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 0, 0)), 1.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 1.0,
                     1.0e-3); // Channel 0, XX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(1, 0, 0)), 0.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 1, 0, 0), 0.0,
                     1.0e-3); // Channel 0, XY
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(2, 0, 0)), 0.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 2, 0, 0), 0.0,
                     1.0e-3); // Channel 0, YX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(3, 0, 0)), 1.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 3, 0, 0), 1.0,
                     1.0e-3); // Channel 0, YY
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 1, 0)), 1.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 1, 0), 1.0,
                     1.0e-3); // Channel 1, XX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(1, 1, 0)), 0.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 1, 1, 0), 0.0,
                     1.0e-3); // Channel 1, XY
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(2, 1, 0)), 0.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 2, 1, 0), 0.0,
                     1.0e-3); // Channel 1, YX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(3, 1, 0)), 1.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 3, 1, 0), 1.0,
                     1.0e-3); // Channel 1, YY
 }
 
-BOOST_AUTO_TEST_CASE(gaussiansource_onlyI_freqsmear,
-                     *boost::unit_test::expected_failures(2)) {
+BOOST_AUTO_TEST_CASE(gaussiansource_onlyI_freqsmear) {
   SetupPredictRun(true, true);
   auto gaussiansource = predict_run->makeSource<GaussianSource>();
 
@@ -147,10 +230,10 @@ BOOST_AUTO_TEST_CASE(gaussiansource_onlyI_freqsmear,
   sources.Add(gaussiansource);
   predict_run->Run(sources);
 
-  BOOST_CHECK_CLOSE(std::abs(predict_run->buffer(0, 0, test_n_stations - 1)),
-                    0.75915, 1.0e-3);
-  BOOST_CHECK_CLOSE(std::abs(predict_run->buffer(0, 1, test_n_stations - 1)),
-                    0.75915, 1.0e-3);
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 0.75915,
+                    1.0e-3); // Channel 0
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 1), 0.75915,
+                    1.0e-3); // Channel 1
 }
 
 BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_empty_initialization) {
@@ -177,9 +260,9 @@ BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_onlyI) {
 
   predict_run->Run(sources);
 
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 0, 0)), 0.98917,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 0.98917,
                     1.0e-3); // Channel 0, XX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 1, 0)), 0.98900,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 1), 0.98900,
                     1.0e-3); // Channel 0, XY
 }
 
@@ -197,26 +280,25 @@ BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_fullstokes) {
 
   predict_run->Run(sources);
 
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 0, 0)), 0.98917,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 0.98917,
                     1.0e-3); // Channel 0, XX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(1, 0, 0)), 0.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 1, 0, 0), 0.0,
                     1.0e-3); // Channel 0, XY
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(2, 0, 0)), 0.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 2, 0, 0), 0.0,
                     1.0e-3); // Channel 0, YX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(3, 0, 0)), 0.98917,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 3, 0, 0), 0.98917,
                     1.0e-3); // Channel 0, YY
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 1, 0)), 0.98900,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 1), 0.98900,
                     1.0e-3); // Channel 1, XX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(1, 1, 0)), 0.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 1, 0, 1), 0.0,
                     1.0e-3); // Channel 1, XY
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(2, 1, 0)), 0.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 2, 0, 1), 0.0,
                     1.0e-3); // Channel 1, YX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(3, 1, 0)), 0.98900,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 3, 0, 1), 0.98900,
                     1.0e-3); // Channel 1, YY
 }
 
-BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_onlyI_freqsmear,
-                     *boost::unit_test::expected_failures(2)) {
+BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_onlyI_freqsmear) {
   SetupPredictRun(true, true);
   auto gaussiansource = predict_run->makeSource<GaussianSource>();
 
@@ -230,14 +312,13 @@ BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_onlyI_freqsmear,
 
   predict_run->Run(sources);
 
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 0, 0)), 0.75093,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 0.75093,
                     1.0e-3); // Channel 0, XX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 1, 0)), 0.75081,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 1), 0.75081,
                     1.0e-3); // Channel 0, XY
 }
 
-BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_fullstokes_freqsmear,
-                     *boost::unit_test::expected_failures(4)) {
+BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_fullstokes_freqsmear) {
   SetupPredictRun(false, true);
   auto gaussiansource = predict_run->makeSource<GaussianSource>();
 
@@ -251,21 +332,21 @@ BOOST_AUTO_TEST_CASE(gaussiansource_absolute_angle_fullstokes_freqsmear,
 
   predict_run->Run(sources);
 
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 0, 0)), 0.750934,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 0), 0.750934,
                     1.0e-3); // Channel 0, XX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(1, 0, 0)), 0.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 1, 0, 0), 0.0,
                     1.0e-3); // Channel 0, XY
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(2, 0, 0)), 0.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 2, 0, 0), 0.0,
                     1.0e-3); // Channel 0, YX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(3, 0, 0)), 0.750934,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 3, 0, 0), 0.750934,
                     1.0e-3); // Channel 0, YY
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(0, 1, 0)), 0.750807,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 0, 0, 1), 0.750807,
                     1.0e-3); // Channel 1, XX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(1, 1, 0)), 0.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 1, 0, 1), 0.0,
                     1.0e-3); // Channel 1, XY
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(2, 1, 0)), 0.0,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 2, 0, 1), 0.0,
                     1.0e-3); // Channel 1, YX
-  BOOST_CHECK_CLOSE(abs(predict_run->buffer(3, 1, 0)), 0.750807,
+  BOOST_CHECK_CLOSE(ConvToAbsComplex(predict_run->buffer, 3, 0, 1), 0.750807,
                     1.0e-3); // Channel 1, YY
 }
 
@@ -348,7 +429,8 @@ BOOST_AUTO_TEST_CASE(test_calc_phase) {
   plan.uvw = {{100.0, 200.0, 300.0}};
   plan.frequencies = {1e8, 2e8};
 
-  PredictPlanExec plan_exec{plan};
+  PredictPlanExecCPU plan_exec{plan};
+  plan_exec.correct_frequency_smearing = false;
   plan_exec.lmn = {{0.1, 0.2, 0.3}};
 
   plan_exec.ComputeStationPhases();
@@ -359,17 +441,17 @@ BOOST_AUTO_TEST_CASE(test_calc_phase) {
 
   BOOST_CHECK_CLOSE(plan_exec.GetStationPhases()[0], expectedPhase, 1e-6);
 
-  const auto &freq = plan_exec.get_frequencies();
+  const auto &freq = plan_exec.GetFrequencies();
 
   double phase_term_0 = expectedPhase * freq[0];
   double phase_term_1 = expectedPhase * freq[1];
 
   const auto &shift = plan_exec.GetShiftData();
 
-  BOOST_CHECK_CLOSE(shift(0, 0).real(), std::cos(phase_term_0), 1e-6);
-  BOOST_CHECK_CLOSE(shift(0, 1).imag(), std::sin(phase_term_1), 1e-6);
-  BOOST_CHECK_CLOSE(shift(1, 0).real(), std::cos(phase_term_0), 1e-6);
-  BOOST_CHECK_CLOSE(shift(1, 1).imag(), std::sin(phase_term_1), 1e-6);
+  BOOST_CHECK_CLOSE(shift(0, 0, 0), std::cos(phase_term_0), 1e-6);
+  BOOST_CHECK_CLOSE(shift(1, 0, 1), std::sin(phase_term_1), 1e-6);
+  BOOST_CHECK_CLOSE(shift(0, 1, 0), std::cos(phase_term_0), 1e-6);
+  BOOST_CHECK_CLOSE(shift(1, 1, 1), std::sin(phase_term_1), 1e-6);
 }
 
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/tests/test_spectrum.cpp b/tests/test_spectrum.cpp
index 10badfb3df0f39e90b90baf4ee09876255802d3a..6176cc8ca1c7b10f40c179a1198ec5ccf334684b 100644
--- a/tests/test_spectrum.cpp
+++ b/tests/test_spectrum.cpp
@@ -227,14 +227,14 @@ BOOST_FIXTURE_TEST_CASE(normal_log_with_rotation_cross, SpectrumFixture) {
   stokes_nu1 = point_source.GetStokes(frequencies[1]);
   stokes_nu2 = point_source.GetStokes(frequencies[2]);
 
-  xt::xtensor<std::complex<double>, 2> result_complex;
+  xt::xtensor<double, 3> result_complex;
   spectrum.EvaluateCrossCorrelations(frequencies, result_complex);
 
-  BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 0).real(),
+  BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 0, 0),
                              stokes_nu0.I + stokes_nu0.Q, 1.e-3);
-  BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 1).real(),
+  BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 0, 1),
                              stokes_nu1.I + stokes_nu1.Q, 1.e-3);
-  BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 2).real(),
+  BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 0, 2),
                              stokes_nu2.I + stokes_nu2.Q, 1.e-3);
 }
 
@@ -261,14 +261,14 @@ BOOST_FIXTURE_TEST_CASE(normal_with_rotation_cross, SpectrumFixture) {
   stokes_nu1 = point_source.GetStokes(frequencies[1]);
   stokes_nu2 = point_source.GetStokes(frequencies[2]);
 
-  xt::xtensor<std::complex<double>, 2> result_complex;
+  xt::xtensor<double, 3> result_complex;
   spectrum.EvaluateCrossCorrelations(frequencies, result_complex);
 
-  BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 0).real(),
+  BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 0, 0),
                              stokes_nu0.I + stokes_nu0.Q, 1.e-3);
-  BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 1).real(),
+  BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 0, 1),
                              stokes_nu1.I + stokes_nu1.Q, 1.e-3);
-  BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 2).real(),
+  BOOST_CHECK_CLOSE_FRACTION(result_complex(0, 0, 2),
                              stokes_nu2.I + stokes_nu2.Q, 1.e-3);
 }