First implementation of performance tests

d9f199f9 · Mattia Mancini · a908a076 · d9f199f9 · d9f199f9 · d9f199f9
Commit d9f199f9 authored Jun 7, 2024 by Mattia Mancini
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,11 +72,11 @@ option(BUILD_TESTING "Include tests in the build" OFF)
 set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/CMake)

 add_compile_options(
+  -fopenmp
  -Wall
  -Wnon-virtual-dtor
  -Wzero-as-null-pointer-constant
  -Wduplicated-branches
-  -Wundef
  -Wvla
  -Wpointer-arith
  -Wextra
@@ -226,7 +226,7 @@ endif()
 # Prevent accidentally finding old BoostConfig.cmake file from casapy
 set(Boost_NO_BOOST_CMAKE ON)
 unset(BOOST_MINIMUM_VERSION)
-set(BOOST_COMPONENTS "filesystem;program_options;system")
+set(BOOST_COMPONENTS "filesystem;program_options;system;unit_test_framework")
 if(BUILD_TESTING)
  # Boost 1.59 introduced BOOST_TEST. Many tests use this feature.
  set(BOOST_MINIMUM_VERSION 1.59)
@@ -687,6 +687,34 @@ target_link_libraries(showsourcedb ${SOURCEDB_LIBRARIES})
 add_executable(msoverview base/msoverview.cc base/MS.cc)
 target_link_libraries(msoverview ${CASACORE_LIBRARIES})

+include(FetchContent)
+
+
+set(BENCHMARK_ENABLE_GTEST_TESTS
+    OFF
+    CACHE INTERNAL "Download GTest sources")
+
+FetchContent_Declare(
+  googlebenchmark
+  GIT_REPOSITORY https://github.com/google/benchmark.git
+  GIT_TAG v1.8.3 # Optionally specify the exact tag or commit to fetch
+  CMAKE_ARGS "-DBENCHMARK_DOWNLOAD_DEPENDENCIES=True")
+
+# Make the fetched content available.
+FetchContent_MakeAvailable(googlebenchmark)
+
+set(COMPILER_FLAGS "-O3;-march=native;-ggdb;-fno-omit-frame-pointer;-fopenmp")
+# Add the benchmark executable
+file(GLOB BENCHMARK_SOURCES "steps/test/performance/*.cc")
+add_executable(microbenchmarks ${BENCHMARK_SOURCES} ${KERNEL_SOURCES}) 
+# Link against Google Benchmark
+target_link_libraries(microbenchmarks benchmark::benchmark)
+target_include_directories(microbenchmarks PRIVATE PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${EVERYBEAM_INCLUDE_DIR})
+target_link_libraries(microbenchmarks LIBDP3 ${HDF5_LIBRARIES} ${EVERYBEAM_LIB})
+target_compile_options(microbenchmarks PUBLIC ${COMPILER_FLAGS})
+
+target_compile_options(LIBDP3 PUBLIC -fno-omit-frame-pointer PUBLIC -ggdb3)
+
 install(TARGETS DP3 makesourcedb showsourcedb msoverview DESTINATION bin)

 # Install a script that warns users that DP3 is the new name of the executable

--- a/steps/ApplyBeam.cc
+++ b/steps/ApplyBeam.cc
@@ -282,9 +282,6 @@ void ApplyBeam::applyBeam(const DPInfo& info, double time, T* data0,
  // Get the beam values for each station.
  const size_t n_channels = info.chanFreqs().size();
  const size_t n_baselines = info.nbaselines();
-  //  std::vector<std::array<std::complex<double>, 4>> beam_values_normal;
-  // beam_values_normal.resize(beam_values.size());
-
  std::unique_ptr<everybeam::pointresponse::PointResponse> point_response =
      telescope->GetPointResponse(time);

@@ -300,14 +297,6 @@ void ApplyBeam::applyBeam(const DPInfo& info, double time, T* data0,
      case everybeam::CorrectionMode::kElement:
        // Fill beam_values for channel ch
        for (size_t st = 0; st < n_stations; ++st) {
-          /*
-          auto val = point_response->Response(
-              mode, station_indices[st], info.chanFreqs()[ch], srcdir, mutex);
-          beam_values_normal[n_channels * st + ch][0] = val[0];
-          beam_values_normal[n_channels * st + ch][1] = val[1];
-          beam_values_normal[n_channels * st + ch][2] = val[2];
-          beam_values_normal[n_channels * st + ch][3] = val[3];
-          */
          beam_values[n_channels * st + ch] = point_response->Response(
              mode, station_indices[st], info.chanFreqs()[ch], srcdir, mutex);
          if (invert) {
@@ -343,58 +332,250 @@ void ApplyBeam::applyBeam(const DPInfo& info, double time, T* data0,
    // Apply beam for channel ch on all baselines
    // For mode=ARRAY_FACTOR, too much work is done here because we know
    // that r and l are diagonal
-    std::array<size_t, 1> shape{n_baselines * n_channels * 4};
-    xt::xtensor<T, 1> data_tensor(shape);
-    memcpy(data_tensor.data(), data0, shape[0] * sizeof(T));
    for (size_t bl = 0; bl < n_baselines; ++bl) {
-      T* data = data_tensor.data();
-      data += bl * 4 * n_channels +
-              ch * 4;  // data0 + bl * 4 * n_channels + ch * 4;
-      // const aocommon::MC2x2F mat(data);
-      const aocommon::MC2x2F mat(data0 + bl * 4 * n_channels + ch * 4);
-      // std::array<size_t, 2> shape = {2, 2};
-      /*
-      auto mat = xt::adapt(data, 4U, xt::no_ownership(), shape);
-      xt::xtensor<std::complex<double>, 2> mat(shape);
-      mat(0, 0) = data[0];
-      mat(0, 1) = data[1];
-      mat(1, 0) = data[2];
-      mat(1, 1) = data[3];
-      */
+      T* data = data0 + bl * 4 * n_channels + ch * 4;
+      const aocommon::MC2x2F mat(data);
      // If the beam is the same for all stations (i.e. when n_stations =
      // 1), all baselines will have the same beam values
      size_t index_left =
          (n_stations == 1 ? ch : n_channels * info.getAnt1()[bl] + ch);
      size_t index_right =
          (n_stations == 1 ? ch : n_channels * info.getAnt2()[bl] + ch);
-      /*
-            auto left = xt::adapt(beam_values_normal[index_left].data(), 4U,
-                                  xt::no_ownership(), shape);
-            auto right = xt::adapt(beam_values_normal[index_right].data(), 4U,
-                                   xt::no_ownership(), shape);

-            auto result = xt::linalg::dot(
-                left, xt::linalg::dot(xt::transpose(xt::conj(mat)), right));
-      */
      const aocommon::MC2x2F left(beam_values[index_left]);
      const aocommon::MC2x2F right(beam_values[index_right]);
      const aocommon::MC2x2F result = left * mat.MultiplyHerm(right);
-      // const aocommon::MC2x2 result{res.data()};
+
      result.AssignTo(data);
+      if (doUpdateWeights) {
+        ApplyCal::ApplyWeights(left, right,
+                               weight0 + bl * 4 * n_channels + ch * 4);
+      }
+    }
+  }
+}
+
+template <typename T, typename D>
+void assign(xt::xtensor<T, 3> mat, D other_mat, size_t st) {
+  mat(st, 0, 0) = other_mat[0];
+  mat(st, 0, 1) = other_mat[1];
+  mat(st, 1, 0) = other_mat[2];
+  mat(st, 1, 1) = other_mat[3];
+}
+
+void compute_beam(everybeam::pointresponse::PointResponse& point_response,
+                  std::vector<aocommon::MC2x2>& beam_values,
+                  xt::xtensor<std::complex<float>, 3>& left,
+                  xt::xtensor<std::complex<float>, 3>& right, bool invert,
+                  const std::vector<size_t>& station_indices, size_t n_stations,
+                  everybeam::CorrectionMode mode, size_t n_channels, size_t ch,
+                  everybeam::vector3r_t srcdir, double frequency,
+                  std::mutex* mutex) {
+  switch (mode) {
+    case everybeam::CorrectionMode::kFull:
+    case everybeam::CorrectionMode::kElement:
+      // Fill beam_values for channel ch
+      for (size_t st = 0; st < n_stations; ++st) {
+        const aocommon::MC2x2 tmp = point_response.Response(
+            mode, station_indices[st], frequency, srcdir, mutex);
+        beam_values[n_channels * st + ch] = tmp;
+        left(st, 0, 0) = tmp[0];
+        left(st, 0, 1) = tmp[1];
+        left(st, 1, 0) = tmp[2];
+        left(st, 1, 1) = tmp[3];
+        xt::view(right, st, xt::all(), xt::all()) =
+            xt::transpose(xt::conj(xt::view(left, st, xt::all(), xt::all())));

+        if (invert) {
+          // Terminate if the matrix is not invertible.
+          [[maybe_unused]] bool status =
+              beam_values[n_channels * st + ch].Invert();
+          assert(status);
+        }
+      }
+      break;
+    case everybeam::CorrectionMode::kArrayFactor: {
+      aocommon::MC2x2 af_tmp;
+      // Fill beam_values for channel ch
+      for (size_t st = 0; st < n_stations; ++st) {
+        af_tmp = point_response.Response(mode, station_indices[st], frequency,
+                                         srcdir, mutex);
+
+        if (invert) {
+          af_tmp[0] = 1. / af_tmp[0];
+          af_tmp[3] = 1. / af_tmp[3];
+        }
+        assign(left, af_tmp, st);
+        xt::view(right, st, xt::all(), xt::all()) =
+            xt::transpose(xt::conj(xt::view(left, st, xt::all(), xt::all())));
+        beam_values[n_channels * st + ch] = af_tmp;
+      }
+      break;
+    }
+    case everybeam::CorrectionMode::kNone:  // this should not happen
+      for (size_t st = 0; st < n_stations; ++st) {
+        beam_values[n_channels * st + ch] = aocommon::MC2x2::Unity();
        /*
-      mat(0, 0) = result(0, 0);
-      mat(1, 0) = result(1, 0);
-      mat(0, 1) = result(0, 1);
-      mat(1, 1) = result(1, 1);
+        left[st] = aocommon::MC2x2F::Unity();
+        right[st] = aocommon::MC2x2F::Unity();
        */
+      }
+      break;
+  }
+}
+
+inline void multiply_mat(const float* a, const float* b, float* c, float sign) {
+  c[0] += sign * (a[0] * b[0] + a[1] * b[2]);
+  c[1] += sign * (a[0] * b[1] + a[1] * b[3]);
+  c[2] += sign * (a[2] * b[0] + a[3] * b[2]);
+  c[3] += sign * (a[2] * b[1] + a[3] * b[3]);
+}
+
+inline void multiply_mat_cmpl(const float* a, const float* b, float* c,
+                              float sign) {
+  c[0] += sign * (a[0] * b[0] + a[1] * b[2]);
+  c[1] += sign * (a[0] * b[1] + a[1] * b[3]);
+  c[2] += sign * (a[2] * b[0] + a[3] * b[2]);
+  c[3] += sign * (a[2] * b[1] + a[3] * b[3]);
+}
+
+void multiply_complex(const std::complex<float>* a,
+                      const std::complex<float>* b, std::complex<float>* c) {
+  const float a_real[] = {a[0].real(), a[1].real(), a[2].real(), a[3].real()};
+  const float b_real[] = {b[0].real(), b[1].real(), b[2].real(), b[3].real()};
+  const float a_imag[] = {a[0].imag(), a[1].imag(), a[2].imag(), a[3].imag()};
+  const float b_imag[] = {b[0].imag(), b[1].imag(), b[2].imag(), b[3].imag()};
+
+  float c_real[4] = {0, 0, 0, 0};
+  float c_imag[4] = {0, 0, 0, 0};
+
+  multiply_mat(a_real, b_real, c_real, 1.0f);
+  multiply_mat(a_imag, b_imag, c_real, -1.0f);
+  multiply_mat(a_real, b_imag, c_imag, 1.0f);
+  multiply_mat(a_imag, b_real, c_imag, 1.0f);
+
+  c[0] = {c_real[0], c_imag[0]};
+  c[1] = {c_real[1], c_imag[1]};
+  c[2] = {c_real[2], c_imag[2]};
+  c[3] = {c_real[3], c_imag[3]};
+}
+
+void multiply_complex_batch(const std::complex<float>* a,
+                            const std::complex<float>* b,
+                            std::complex<float>* c, size_t batch_size) {
+  float* a_ptr = reinterpret_cast<float*>(const_cast<std::complex<float>*>(a));
+  float* b_ptr = reinterpret_cast<float*>(const_cast<std::complex<float>*>(b));
+  float* c_ptr = reinterpret_cast<float*>(c);
+#pragma omp simd
+  for (size_t s = 0; s < batch_size; s++) {
+    const float a_00_real = a_ptr[s * 8 + 0];
+    const float a_00_imag = a_ptr[s * 8 + 1];
+    const float a_01_real = a_ptr[s * 8 + 2];
+    const float a_01_imag = a_ptr[s * 8 + 3];
+    const float a_10_real = a_ptr[s * 8 + 4];
+    const float a_10_imag = a_ptr[s * 8 + 5];
+    const float a_11_real = a_ptr[s * 8 + 6];
+    const float a_11_imag = a_ptr[s * 8 + 7];
+    const float b_00_real = b_ptr[s * 8 + 0];
+    const float b_00_imag = b_ptr[s * 8 + 1];
+    const float b_01_real = b_ptr[s * 8 + 2];
+    const float b_01_imag = b_ptr[s * 8 + 3];
+    const float b_10_real = b_ptr[s * 8 + 4];
+    const float b_10_imag = b_ptr[s * 8 + 5];
+    const float b_11_real = b_ptr[s * 8 + 6];
+    const float b_11_imag = b_ptr[s * 8 + 7];
+
+    c_ptr[s * 8 + 0] = a_00_real * b_00_real + a_01_real * b_10_real;
+    c_ptr[s * 8 + 0] -= a_00_imag * b_00_imag + a_01_imag * b_10_imag;
+    c_ptr[s * 8 + 1] = a_00_real * b_00_imag + a_01_real * b_10_imag;
+    c_ptr[s * 8 + 1] += a_00_imag * b_00_real + a_01_imag * b_10_real;
+    c_ptr[s * 8 + 2] = a_00_real * b_01_real + a_01_real * b_11_real;
+    c_ptr[s * 8 + 2] -= a_00_imag * b_01_imag + a_01_imag * b_11_imag;
+    c_ptr[s * 8 + 3] = a_00_real * b_01_imag + a_01_real * b_11_imag;
+    c_ptr[s * 8 + 3] += a_00_imag * b_01_real + a_01_imag * b_11_real;
+    c_ptr[s * 8 + 4] = a_10_real * b_00_real + a_11_real * b_10_real;
+    c_ptr[s * 8 + 4] -= a_10_imag * b_00_imag + a_11_imag * b_10_imag;
+    c_ptr[s * 8 + 5] = a_10_real * b_00_imag + a_11_real * b_10_imag;
+    c_ptr[s * 8 + 5] += a_10_imag * b_00_real + a_11_imag * b_10_real;
+    c_ptr[s * 8 + 6] = a_10_real * b_01_real + a_11_real * b_11_real;
+    c_ptr[s * 8 + 6] -= a_10_imag * b_01_imag + a_11_imag * b_11_imag;
+    c_ptr[s * 8 + 7] = a_10_real * b_01_imag + a_11_real * b_11_imag;
+    c_ptr[s * 8 + 7] += a_10_imag * b_01_real + a_11_imag * b_11_real;
+  }
+}
+
+template <typename T>
+void apply_beam(xt::xtensor<std::complex<float>, 3>& left_beam_values,
+                xt::xtensor<std::complex<float>, 3>& right_beam_values,
+                T* data0, const std::vector<int>& ant1,
+                const std::vector<int>& ant2, size_t n_baselines,
+                size_t n_stations, size_t n_channels) {
+  // Apply beam for channel ch on all baselines
+  // For mode=ARRAY_FACTOR, too much work is done here because we know
+  // that r and l are diagonal
+
+  const std::array<size_t, 3> shape = {n_channels, 2, 2};
+  xt::xtensor<std::complex<float>, 3> tmp(shape);
+  for (size_t bl = 0; bl < n_baselines; ++bl) {
+    const int ant1_bl = ant1[bl];
+    const int ant2_bl = ant2[bl];
+
+    T* mat = data0 + bl * 4 * n_channels;
+    // If the beam is the same for all stations (i.e. when n_stations =
+    // 1), all baselines will have the same beam values
+    const size_t index_left = ant1_bl * n_channels;
+    const size_t index_right = ant2_bl * n_channels;
+
+    multiply_complex_batch(mat, right_beam_values.data() + index_right,
+                           tmp.data(), n_channels);
+    multiply_complex_batch(left_beam_values.data() + index_left, tmp.data(),
+                           mat, n_channels);
+  }
+}
+
+template <typename T>
+void ApplyBeam::applyBeamFast(const DPInfo& info, double time, T* data0,
+                              float* weight0,
+                              const everybeam::vector3r_t& srcdir,
+                              const everybeam::telescope::Telescope* telescope,
+                              std::vector<aocommon::MC2x2>& beam_values,
+                              bool invert, everybeam::CorrectionMode mode,
+                              bool doUpdateWeights, std::mutex* mutex) {
+  // Get the beam values for each station.
+  const size_t n_channels = info.chanFreqs().size();
+  const size_t n_baselines = info.nbaselines();
+
+  std::unique_ptr<everybeam::pointresponse::PointResponse> point_response =
+      telescope->GetPointResponse(time);
+
+  const std::vector<size_t> station_indices =
+      base::SelectStationIndices(*telescope, info.antennaNames());
+
+  const size_t n_stations = station_indices.size();
+  const std::vector<double>& channel_frequencies = info.chanFreqs();
+  const std::vector<int>& ant1 = info.getAnt1();
+  const std::vector<int>& ant2 = info.getAnt2();
+  std::array<size_t, 3> shape = {n_stations * n_channels, 2, 2};
+  xt::xtensor<std::complex<float>, 3> left(shape), right(shape);
+
+  // Apply the beam values of both stations to the ApplyBeamed data.
+  for (size_t ch = 0; ch < n_channels; ++ch) {
+    const double channel_frequency = channel_frequencies[ch];
+    compute_beam(*point_response, beam_values, left, right, invert,
+                 station_indices, n_stations, mode, n_channels, ch, srcdir,
+                 channel_frequency, mutex);
+  }
+  // Apply beam for channel ch on all baselines
+  // For mode=ARRAY_FACTOR, too much work is done here because we know
+  // that r and l are diagonal
+  apply_beam(left, right, data0, ant1, ant2, n_baselines, n_stations,
+             n_channels);
+
  if (doUpdateWeights) {
    // ApplyCal::ApplyWeights(left, right,
    //                        weight0 + bl * 4 * n_channels + ch * 4);
  }
 }
-  }
-}

 template void ApplyBeam::applyBeam(
    const DPInfo& info, double time, std::complex<double>* data0,
@@ -410,6 +591,13 @@ template void ApplyBeam::applyBeam(
    std::vector<aocommon::MC2x2>& beam_values, bool invert,
    everybeam::CorrectionMode mode, bool doUpdateWeights, std::mutex* mutex);

+template void ApplyBeam::applyBeamFast(
+    const DPInfo& info, double time, std::complex<float>* data0, float* weight0,
+    const everybeam::vector3r_t& srcdir,
+    const everybeam::telescope::Telescope* telescope,
+    std::vector<aocommon::MC2x2>& beam_values, bool invert,
+    everybeam::CorrectionMode mode, bool doUpdateWeights, std::mutex* mutex);
+
 void ApplyBeam::applyBeam(const DPInfo& info, double time,
                          std::complex<double>* data0, float* weight0,
                          const everybeam::vector3r_t& srcdir,

--- a/steps/ApplyBeam.h
+++ b/steps/ApplyBeam.h
@@ -85,6 +85,14 @@ class ApplyBeam : public Step {
                        everybeam::CorrectionMode mode,
                        bool doUpdateWeights = false,
                        std::mutex* mutex = nullptr);
+  template <typename T>
+  static void applyBeamFast(const base::DPInfo& info, double time, T* data0,
+                            float* weight0, const everybeam::vector3r_t& srcdir,
+                            const everybeam::telescope::Telescope* telescope,
+                            std::vector<aocommon::MC2x2>& beamValues,
+                            bool invert, everybeam::CorrectionMode mode,
+                            bool doUpdateWeights = false,
+                            std::mutex* mutex = nullptr);
  // This method applies the beam for processing when parallelizing over
  // baselines. Because the beam is a per-antenna effect, this requires
  // synchronisation, which is performed with the provided barrier.

--- a/steps/test/performance/applybeam.cc
+++ b/steps/test/performance/applybeam.cc
+#include "../../ApplyBeam.h"
+#include <benchmark/benchmark.h>
+#include <xtensor/xcomplex.hpp>
+#include <base/DPInfo.cc>
+#include <vector>
+#include <random>
+#include <casacore/ms/MeasurementSets.h>
+#include <casacore/tables/Tables/ArrayColumn.h>
+#include <casacore/tables/Tables/ScalarColumn.h>
+#include <casacore/tables/Tables/Table.h>
+#include <base/Telescope.h>
+#include <EveryBeam/pointresponse/pointresponse.h>
+#include <steps/ApplyBeam.h>
+#include <random>
+
+#include <cstdlib>  // For getenv
+
+std::string get_env_var(const std::string& key) {
+  const char* val = std::getenv(key.c_str());
+  if (val == nullptr) {
+    return "";  // Environment variable not found
+  }
+  return std::string(val);
+}
+
+using dp3::steps::ApplyBeam;
+using dp3::steps::Step;
+
+std::unique_ptr<dp3::base::DPInfo> GetDPInfoFromMS(std::string ms_path) {
+  casacore::MeasurementSet ms(ms_path);
+
+  casacore::Table antenna_table(ms.antenna());
+  casacore::Table spectral_window(ms.spectralWindow());
+  casacore::Table data_table(ms);
+
+  casacore::ScalarColumn<casacore::String> antenna_names_column(antenna_table,
+                                                                "NAME");
+  casacore::ScalarColumn<casacore::Double> antenna_diameters_column(
+      antenna_table, "DISH_DIAMETER");
+
+  casacore::ArrayColumn<casacore::Double> antenna_positions_column(
+      antenna_table, "POSITION");
+
+  casacore::ArrayColumn<casacore::Double> channels_frequencies_column(
+      spectral_window, "CHAN_FREQ");
+  casacore::ArrayColumn<casacore::Double> channels_width_column(spectral_window,
+                                                                "CHAN_WIDTH");
+  casacore::ScalarColumn<casacore::Int> antenna1_column(data_table, "ANTENNA1");
+  casacore::ScalarColumn<casacore::Int> antenna2_column(data_table, "ANTENNA2");
+
+  const size_t n_channels = channels_width_column.shape(0)[0];
+  const size_t n_antennas = antenna_names_column.nrow();
+  const size_t n_correlations = n_antennas * (n_antennas + 1) / 2;
+  std::unique_ptr<dp3::base::DPInfo> info = std::make_unique<dp3::base::DPInfo>(
+      n_correlations, n_channels, 0U, "HBA_DUAL");
+
+  std::vector<casacore::MPosition> antenna_positions;
+  std::vector<int> ant1 = antenna1_column
+                              .getColumnRange(casacore::Slicer(
+                                  casacore::IPosition(1, 0),
+                                  casacore::IPosition(1, n_correlations - 1),
+                                  casacore::Slicer::endIsLast))
+                              .tovector();
+  std::vector<int> ant2 = antenna2_column
+                              .getColumnRange(casacore::Slicer(
+                                  casacore::IPosition(1, 0),
+                                  casacore::IPosition(1, n_correlations - 1),
+                                  casacore::Slicer::endIsLast))
+                              .tovector();
+
+  std::vector<string> antenna_names;
+
+  for (size_t i = 0; i < n_antennas; i++) {
+    antenna_names.push_back(antenna_names_column(i));
+    std::vector<double> position = antenna_positions_column(i).tovector();
+    antenna_positions.push_back(casacore::MPosition(
+        casacore::MVPosition{position[0], position[1], position[2]},
+        casacore::MPosition::ITRF));
+  }
+  info->setAntennas(antenna_names,
+                    antenna_diameters_column.getColumn().tovector(),
+                    antenna_positions, ant1, ant2);
+  info->setChannels(channels_frequencies_column.getColumn().tovector(),
+                    channels_width_column.getColumn().tovector(),
+                    channels_width_column.getColumn().tovector());
+  return info;
+}
+
+template <typename T>
+void ReadData(std::vector<T>& data, const dp3::base::DPInfo& info,
+              std::string ms_path) {
+  casacore::Table data_table(ms_path);
+  casacore::ArrayColumn<casacore::Complex> data_column(data_table, "DATA");
+  const size_t kNCorrelations = info.ncorr();
+  const size_t kNChannels = info.nchan();
+  const size_t kNPolarizations = 4;
+  data.resize(kNChannels * kNCorrelations * kNPolarizations);
+  std::vector<casacore::Complex> data_vec =
+      data_column.getColumnCells(casacore::RefRows(0, kNCorrelations - 1))
+          .tovector();
+  std::copy(data_vec.begin(), data_vec.end(), data.begin());
+}
+
+class LoadMeasurementSet : public benchmark::Fixture {
+ public:
+  void SetUp(::benchmark::State& state) {
+    std::string ms_path = get_env_var("MS");
+    if (ms_path.compare("") == 0) {
+      throw std::runtime_error(
+          "Please set environment variable MS to point to a valid measurement "
+          "set");
+    }
+    info = GetDPInfoFromMS(ms_path);
+    telescope = dp3::base::GetTelescope(
+        ms_path, everybeam::ElementResponseModel::kDefault, false);
+
+    beam_values.resize(info->nantenna() * info->nchan());
+
+    ReadData(data, *info, ms_path);
+    weights.resize(data.size());
+    std::fill(weights.begin(), weights.end(), 1.0);
+    size_t size_in_bytes = sizeof(std::complex<float>) * data.size();
+    data_ptr = static_cast<std::complex<float>*>(
+        aligned_alloc(sizeof(aocommon::MC2x2F), size_in_bytes));
+    beam_values.resize(info->nantenna() * info->nchan());
+    memcpy(data_ptr, data.data(), size_in_bytes);
+    time_u = 5221188346.770621;
+    telescope->SetTime(time_u);
+
+    // Create a random device to seed the random number generator
+    std::random_device rd;
+    // Initialize a Mersenne Twister random number generator
+    std::mt19937 gen(rd());
+
+    // Define the range for the random number
+    std::uniform_real_distribution<> distr(-1.0f, 1.0f);
+    const float x = distr(gen);
+    const float y = distr(gen);
+    const float z = sqrt(1.0f - (x * x) - (y * y));
+    pointing = {x, y, z};
+  }
+  void TearDown(::benchmark::State& state) {}
+
+  std::vector<casacore::Complex> data;
+  std::vector<float> weights;
+  std::vector<aocommon::MC2x2> beam_values;
+
+  std::unique_ptr<everybeam::telescope::Telescope> telescope;
+  std::unique_ptr<dp3::base::DPInfo> info;
+  std::complex<float>* data_ptr;
+  std::mutex mut;
+  double time_u;
+  everybeam::vector3r_t pointing;
+};
+
+BENCHMARK_DEFINE_F(LoadMeasurementSet, ApplyBeamReference)
+(benchmark::State& state) {
+  for (auto _ : state) {
+    dp3::steps::ApplyBeam::applyBeam(
+        *info, time_u, data_ptr, weights.data(), pointing, telescope.get(),
+        beam_values, false, everybeam::CorrectionMode::kFull, false, &mut);
+  }
+}
+
+BENCHMARK_DEFINE_F(LoadMeasurementSet, ApplyBeamBatch)
+(benchmark::State& state) {
+  for (auto _ : state) {
+    dp3::steps::ApplyBeam::applyBeamFast(
+        *info, time_u, data_ptr, weights.data(), pointing, telescope.get(),
+        beam_values, false, everybeam::CorrectionMode::kFull, false, &mut);
+  }
+}
+
+BENCHMARK_REGISTER_F(LoadMeasurementSet, ApplyBeamReference)->Iterations(100);
+BENCHMARK_REGISTER_F(LoadMeasurementSet, ApplyBeamBatch)->Iterations(100);
+
+BENCHMARK_MAIN();