From 81b60be587777e03baadb3a586f7f56de7c111d0 Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Thu, 6 Feb 2025 14:29:58 +0100
Subject: [PATCH 01/18] Add schaapspack and gracehopper runs

---
 .gitlab-ci.yml                    | 20 ++++++++++++++++++++
 CMakeLists.txt                    |  9 +++++++++
 ci/das6/compile_and_run_native.sh | 20 ++++++++++++++++++++
 3 files changed, 49 insertions(+)
 create mode 100644 ci/das6/compile_and_run_native.sh

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d11a266..d1d46da 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -96,6 +96,26 @@ performance-jetson:   # This job runs in the test stage.
     access: all
     expire_in: 1 days
 
+performance-gracehopper:   # Benchmark job submitted to the ghq partition via sbatch.
+  allow_failure: true
+  tags:
+    - das6-gpu
+  stage: benchmark    # It only starts when the job in the build stage completes successfully.
+  script:
+    - sbatch --wait -p ghq -o output.txt -e error.txt ci/das6/compile_and_run_native.sh ghq_arm64
+    - cat output.txt >&1
+    - cat error.txt >&2
+
+  artifacts:
+    paths:
+      - ./results*.json
+      - ./output.txt
+      - ./error.txt
+      - ./*.tar
+    when: always
+    access: all
+    expire_in: 1 days
+
 performance-generic:
   stage: benchmark
   image: "$CI_REGISTRY_IMAGE:latest"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 73d5509..153e159 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,6 +49,15 @@ FetchContent_Declare(
 
 FetchContent_Populate(aocommon)
 
+# Make schaapspack available
+FetchContent_Declare(
+  schaapcommon
+  GIT_REPOSITORY git@git.astron.nl:RD/schaapcommon.git
+  GIT_TAG master)
+
+FetchContent_Populate(schaapcommon)
+
+
 set(COMPILER_FLAGS "-O3;-march=native;-ggdb;")
 
 # List all kernel code
diff --git a/ci/das6/compile_and_run_native.sh b/ci/das6/compile_and_run_native.sh
new file mode 100644
index 0000000..4125cdc
--- /dev/null
+++ b/ci/das6/compile_and_run_native.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+set -e
+# Configure, build and run the microbenchmarks natively on a DAS-6 node.
+# Usage: compile_and_run_native.sh <architecture-label>
+
+ARCHITECTURE=${1:?usage: compile_and_run_native.sh <architecture-label>}
+COMPILER_VERSION=$(gcc -dumpfullversion -dumpversion)
+
+# The compiler version and architecture label name the build dir and outputs.
+echo RUNNING ON ${COMPILER_VERSION} AND ${ARCHITECTURE}
+BUILD_DIR=build-${COMPILER_VERSION}-${ARCHITECTURE}
+
+cmake -B "${BUILD_DIR}" . -DCMAKE_BUILD_TYPE=Release
+
+make -C "${BUILD_DIR}" -j
+
+tar -cvf "asm-${ARCHITECTURE}-${COMPILER_VERSION}.tar" "${BUILD_DIR}"/*.s
+
+"${BUILD_DIR}/microbenchmarks" --benchmark_out="results-${COMPILER_VERSION}-${ARCHITECTURE}.json" --benchmark_out_format=json
\ No newline at end of file
-- 
GitLab


From e038b2a35a4e2465c8be64b9d06b93b2012ab106 Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Thu, 6 Feb 2025 17:07:03 +0100
Subject: [PATCH 02/18] Fix build and add benchmark test

---
 CMakeLists.txt                   |  18 +--
 benchmarks/convolution.cpp       |  62 ++++++++++
 code/convolution.h               |  26 +++++
 code/convolution_reference.cpp   |   8 ++
 code/convolution_serial_fftw.cpp | 190 +++++++++++++++++++++++++++++++
 5 files changed, 296 insertions(+), 8 deletions(-)
 create mode 100644 benchmarks/convolution.cpp
 create mode 100644 code/convolution.h
 create mode 100644 code/convolution_reference.cpp
 create mode 100644 code/convolution_serial_fftw.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 153e159..fdbe25d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,9 +45,10 @@ include(Catch)
 FetchContent_Declare(
   aocommon
   GIT_REPOSITORY https://gitlab.com/aroffringa/aocommon.git
-  GIT_TAG master)
+  GIT_TAG master
+  EXCLUDE_FROM_ALL)
 
-FetchContent_Populate(aocommon)
+FetchContent_MakeAvailable(aocommon)
 
 # Make schaapspack available
 FetchContent_Declare(
@@ -55,8 +56,8 @@ FetchContent_Declare(
   GIT_REPOSITORY git@git.astron.nl:RD/schaapcommon.git
   GIT_TAG master)
 
-FetchContent_Populate(schaapcommon)
-
+FetchContent_MakeAvailable(schaapcommon)
+target_include_directories(schaapcommon PUBLIC ${aocommon_SOURCE_DIR}/include)
 
 set(COMPILER_FLAGS "-O3;-march=native;-ggdb;")
 
@@ -73,8 +74,9 @@ find_package(OpenMP)
 
 # Link against Google Benchmark
 target_link_libraries(microbenchmarks PRIVATE benchmark::benchmark)
-target_include_directories(microbenchmarks PRIVATE ${aocommon_SOURCE_DIR}/include)
+target_include_directories(microbenchmarks PRIVATE ${aocommon_SOURCE_DIR}/include ${schaapcommon_SOURCE_DIR}/include)
 target_include_directories(microbenchmarks PRIVATE code)
+target_link_libraries(microbenchmarks PRIVATE schaapcommon)
 target_compile_options(microbenchmarks PUBLIC ${COMPILER_FLAGS})
 if(OpenMP_CXX_FOUND)
     target_link_libraries(microbenchmarks PRIVATE OpenMP::OpenMP_CXX)
@@ -88,15 +90,15 @@ list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
 catch_discover_tests(unittests WORKING_DIRECTORY)
 target_link_libraries(unittests PRIVATE Catch2::Catch2WithMain)
 target_include_directories(unittests PRIVATE code)
-target_include_directories(unittests PRIVATE ${aocommon_SOURCE_DIR}/include)
-
+target_include_directories(unittests PRIVATE ${aocommon_SOURCE_DIR}/include  ${schaapcommon_SOURCE_DIR}/include)
+target_link_libraries(unittests PRIVATE schaapcommon)
 target_compile_options(unittests PUBLIC ${COMPILER_FLAGS})
 
 foreach(KERNEL_SOURCE ${KERNEL_SOURCES})
 
 get_filename_component(KERNEL_NAME ${KERNEL_SOURCE} NAME_WE)
 add_precompile_target(TARGET_NAME ${KERNEL_NAME} 
-                        INCLUDE_DIRS code ${aocommon_SOURCE_DIR}/include
+                        INCLUDE_DIRS code ${aocommon_SOURCE_DIR}/include ${schaapcommon_SOURCE_DIR}/include
                         SOURCES ${KERNEL_SOURCE}
                         COMPILER_FLAGS ${COMPILER_FLAGS})
 endforeach()
diff --git a/benchmarks/convolution.cpp b/benchmarks/convolution.cpp
new file mode 100644
index 0000000..ae82e9f
--- /dev/null
+++ b/benchmarks/convolution.cpp
@@ -0,0 +1,62 @@
+#include <benchmark/benchmark.h>
+#include <convolution.h>
+#include <memory>
+#include <vector>
+
+namespace {
+
+class InitializeInput : public benchmark::Fixture {
+ public:
+  void SetUp(::benchmark::State& state) {
+    size_t width = state.range(0);
+    size_t height = state.range(1);
+    image = std::make_unique<std::vector<float>>(width * height);
+    kernel = std::make_unique<std::vector<float>>(width * height);
+
+    Initialize(image->data(), width, height);
+    Initialize(kernel->data(), width, height);
+  }
+  void TearDown(::benchmark::State& state) {
+    image.reset();
+    kernel.reset();
+  }
+
+  std::unique_ptr<std::vector<float>> image;
+  std::unique_ptr<std::vector<float>> kernel;
+};
+}  // namespace
+
+// Reference standard
+BENCHMARK_DEFINE_F(InitializeInput, ConvolveReference)
+(benchmark::State& state) {
+  for (auto _ : state) {
+    ConvolveReference(image->data(), kernel->data(), state.range(0),
+                      state.range(1));
+  }
+}
+BENCHMARK_REGISTER_F(InitializeInput, ConvolveReference)
+    ->Args({64, 32})
+    ->Args({128, 64})
+    ->Args({256, 126})
+    ->Args({512, 754})
+    ->Args({1024, 124})
+    ->Args({2048, 1000})
+    ->Args({4096, 5000});
+
+// FFTW serial standard
+BENCHMARK_DEFINE_F(InitializeInput, ConvolveSerial)
+(benchmark::State& state) {
+  for (auto _ : state) {
+    ConvolveSerial(image->data(), kernel->data(), state.range(0),
+                   state.range(1));
+  }
+}
+
+BENCHMARK_REGISTER_F(InitializeInput, ConvolveSerial)
+    ->Args({64, 32})
+    ->Args({128, 64})
+    ->Args({256, 126})
+    ->Args({512, 754})
+    ->Args({1024, 124})
+    ->Args({2048, 1000})
+    ->Args({4096, 5000});
diff --git a/code/convolution.h b/code/convolution.h
new file mode 100644
index 0000000..66e8c8c
--- /dev/null
+++ b/code/convolution.h
@@ -0,0 +1,26 @@
+#ifndef CONVOLUTION_H_
+#define CONVOLUTION_H_
+
+#include <complex>
+#include <iomanip>
+#include <iostream>
+#include <random>
+
+inline void Initialize(float* a, const size_t width, const size_t height) {
+  // Fill a with width * height uniform random floats in [-1, 1); fixed seed.
+  std::seed_seq seed({42});
+  std::mt19937 gen(seed);
+  std::uniform_real_distribution<float> dis(-1.0, 1.0);
+  const size_t linear_size = width * height;
+  for (size_t i = 0; i < linear_size; i++) {
+    a[i] = dis(gen);
+  }
+}
+
+// Convolves image with a kernel of the same size; the result overwrites image.
+void ConvolveReference(float* image, const float* kernel, size_t width,
+                       size_t height);
+
+void ConvolveSerial(float* image, const float* kernel, size_t width,
+                    size_t height);
+#endif
diff --git a/code/convolution_reference.cpp b/code/convolution_reference.cpp
new file mode 100644
index 0000000..e83c8d0
--- /dev/null
+++ b/code/convolution_reference.cpp
@@ -0,0 +1,8 @@
+#include "convolution.h"
+
+#include <schaapcommon/math/convolution.h>
+
+void ConvolveReference(float* image, const float* kernel, size_t width,
+                       size_t height) {
+  schaapcommon::math::Convolve(image, kernel, width, height);
+}
\ No newline at end of file
diff --git a/code/convolution_serial_fftw.cpp b/code/convolution_serial_fftw.cpp
new file mode 100644
index 0000000..e097730
--- /dev/null
+++ b/code/convolution_serial_fftw.cpp
@@ -0,0 +1,190 @@
+#include "convolution.h"
+#include <fftw3.h>
+#include <algorithm>
+// Partially unroll rows/columns with a factor of kUnroll
+constexpr size_t kUnroll = 4;
+
+// With kUnroll > 1, the temporary buffers need to be aligned
+// for FFTW to work correctly.
+constexpr size_t kAlignment = 64;
+
+size_t RoundUp(size_t a, size_t b) { return ((a + b - 1) / b) * b; }  // smallest multiple of b that is >= a
+
+void FftR2CComposite(fftwf_plan plan_r2c, fftwf_plan plan_c2c,
+                     size_t image_height, size_t image_width, const float* in,
+                     fftwf_complex* out) {
+  const size_t complex_width = image_width / 2 + 1;
+  const size_t complex_size = image_height * complex_width;
+  // 2D real-to-complex FFT built from 1D row (r2c) and column (c2c) FFTs.
+  fftwf_complex* temp1 = fftwf_alloc_complex(complex_size);
+  // Pass 1: transform each row in the scratch row temp2, store it in temp1.
+  fftwf_complex* temp2 = fftwf_alloc_complex(complex_width);
+  float* temp2_ptr = reinterpret_cast<float*>(temp2);
+  for (size_t y = 0; y < image_height; y++) {
+    float* temp1_ptr = reinterpret_cast<float*>(&temp1[y * complex_width]);
+    std::copy_n(&in[y * image_width], image_width, temp2_ptr);
+    fftwf_execute_dft_r2c(plan_r2c, temp2_ptr, temp2);
+    std::copy_n(temp2_ptr, 2 * complex_width, temp1_ptr);
+  }
+  fftwf_free(temp2);
+
+  // Partially kUnroll over columns
+  size_t padded_height = RoundUp(image_height, kAlignment);
+  temp2 = fftwf_alloc_complex(kUnroll * padded_height);
+
+  for (size_t x = 0; x < complex_width; x += kUnroll) {
+    // Copy input
+    for (size_t y = 0; y < image_height; y++) {
+      for (size_t i = 0; i < kUnroll; i++) {
+        if ((x + i) < complex_width) {
+          float* temp1_ptr =
+              reinterpret_cast<float*>(&temp1[y * complex_width + x + i]);
+          float* temp2_ptr =
+              reinterpret_cast<float*>(&temp2[i * padded_height + y]);
+          std::copy_n(temp1_ptr, 2, temp2_ptr);
+        }
+      }
+    }
+
+    // Perform 1D FFT over the columns that exist; skip the unused unroll tail
+    for (size_t i = 0; i < kUnroll && (x + i) < complex_width; i++) {
+      fftwf_complex* temp2_ptr = &temp2[i * padded_height];
+      fftwf_execute_dft(plan_c2c, temp2_ptr, temp2_ptr);
+    }
+
+    // Transpose output
+    for (size_t y = 0; y < image_height; y++) {
+      for (size_t i = 0; i < kUnroll; i++) {
+        if ((x + i) < complex_width) {
+          float* temp2_ptr =
+              reinterpret_cast<float*>(&temp2[i * padded_height + y]);
+          float* out_ptr =
+              reinterpret_cast<float*>(&out[y * complex_width + x + i]);
+          std::copy_n(temp2_ptr, 2, out_ptr);
+        }
+      }
+    }
+  }
+
+  fftwf_free(temp2);
+  fftwf_free(temp1);
+}
+
+void FftC2RComposite(fftwf_plan plan_c2c, fftwf_plan plan_c2r,
+                     size_t image_height, size_t image_width,
+                     const fftwf_complex* in, float* out) {
+  const size_t complex_width = image_width / 2 + 1;
+
+  size_t padded_height = RoundUp(image_height, kAlignment);
+  size_t padded_size = padded_height * complex_width;
+  fftwf_complex* temp1 = fftwf_alloc_complex(padded_size);
+
+  for (size_t x = 0; x < complex_width; x += kUnroll) {
+    // Transpose input
+    for (size_t y = 0; y < image_height; y++) {
+      for (size_t i = 0; i < kUnroll; i++) {
+        if ((x + i) < complex_width) {
+          const float* in_ptr =
+              reinterpret_cast<const float*>(&in[y * complex_width + x + i]);
+          float* temp1_ptr =
+              reinterpret_cast<float*>(&temp1[(x + i) * padded_height + y]);
+          std::copy_n(in_ptr, 2, temp1_ptr);
+        }
+      }
+    }
+
+    // Perform 1D C2C FFT over columns
+    for (size_t i = 0; i < kUnroll; i++) {
+      if ((x + i) < complex_width) {
+        fftwf_complex* temp1_ptr = &temp1[(x + i) * padded_height];
+        fftwf_execute_dft(plan_c2c, temp1_ptr, temp1_ptr);
+      }
+    }
+  }
+
+  size_t paddedWidth = RoundUp(complex_width, kAlignment);
+  fftwf_complex* temp2 = fftwf_alloc_complex(kUnroll * paddedWidth);
+
+  for (size_t y = 0; y < image_height; y += kUnroll) {
+    // Transpose input
+    for (size_t x = 0; x < complex_width; x++) {
+      for (size_t i = 0; i < kUnroll; i++) {
+        if ((y + i) < image_height) {
+          float* temp1_ptr =
+              reinterpret_cast<float*>(&temp1[x * padded_height + y + i]);
+          float* temp2_ptr =
+              reinterpret_cast<float*>(&temp2[i * paddedWidth + x]);
+          std::copy_n(temp1_ptr, 2, temp2_ptr);
+        }
+      }
+    }
+
+    // Perform 1D C2R FFT over rows
+    for (size_t i = 0; i < kUnroll; i++) {
+      if ((y + i) < image_height) {
+        fftwf_complex* temp2_ptr = &temp2[i * paddedWidth];
+        fftwf_execute_dft_c2r(plan_c2r, temp2_ptr,
+                              reinterpret_cast<float*>(temp2_ptr));
+      }
+    }
+
+    // Copy output
+    for (size_t i = 0; i < kUnroll; i++) {
+      if ((y + i) < image_height) {
+        float* temp2_ptr = reinterpret_cast<float*>(&temp2[i * paddedWidth]);
+        std::copy_n(temp2_ptr, image_width, &out[(y + i) * image_width]);
+      }
+    }
+  }
+
+  fftwf_free(temp2);
+
+  fftwf_free(temp1);
+}
+
+void ConvolveSerial(float* image, const float* kernel, size_t image_width,
+                    size_t image_height) {
+  const size_t image_size = image_width * image_height;
+  const size_t complex_width = image_width / 2 + 1;
+  const size_t complex_size = complex_width * image_height;
+  float* temp_data = fftwf_alloc_real(image_size);
+  fftwf_complex* fft_image_data = fftwf_alloc_complex(complex_size);
+  fftwf_complex* fft_kernel_data = fftwf_alloc_complex(complex_size);
+
+  fftwf_plan plan_r2c =
+      fftwf_plan_dft_r2c_1d(image_width, nullptr, nullptr, FFTW_ESTIMATE);
+  fftwf_plan plan_c2c_forward = fftwf_plan_dft_1d(
+      image_height, nullptr, nullptr, FFTW_FORWARD, FFTW_ESTIMATE);
+  fftwf_plan plan_c2c_backward = fftwf_plan_dft_1d(
+      image_height, nullptr, nullptr, FFTW_BACKWARD, FFTW_ESTIMATE);
+  fftwf_plan plan_c2r =
+      fftwf_plan_dft_c2r_1d(image_width, nullptr, nullptr, FFTW_ESTIMATE);
+
+  FftR2CComposite(plan_r2c, plan_c2c_forward, image_height, image_width, image,
+                  fft_image_data);
+
+  std::copy_n(kernel, image_size, temp_data);
+  FftR2CComposite(plan_r2c, plan_c2c_forward, image_height, image_width,
+                  temp_data, fft_kernel_data);
+
+  const float fact = 1.0 / image_size;
+  for (size_t y = 0; y != image_height; ++y) {
+    for (size_t x = 0; x != complex_width; ++x) {
+      const size_t i = y * complex_width + x;
+      reinterpret_cast<std::complex<float>*>(fft_image_data)[i] *=
+          fact * reinterpret_cast<std::complex<float>*>(fft_kernel_data)[i];
+    }
+  }
+
+  FftC2RComposite(plan_c2c_backward, plan_c2r, image_height, image_width,
+                  fft_image_data, image);
+
+  fftwf_free(fft_image_data);
+  fftwf_free(fft_kernel_data);
+  fftwf_free(temp_data);
+
+  fftwf_destroy_plan(plan_r2c);
+  fftwf_destroy_plan(plan_c2c_forward);
+  fftwf_destroy_plan(plan_c2c_backward);
+  fftwf_destroy_plan(plan_c2r);
+}
\ No newline at end of file
-- 
GitLab


From 21f86690899ad17e18156251dba4c6d395364c54 Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Thu, 6 Feb 2025 17:32:50 +0100
Subject: [PATCH 03/18] Add tests

---
 code/convolution.h        |  1 +
 test/helpers.cpp          | 49 ++++++++++++++++++++++++++++++++++++++-
 test/helpers.h            |  7 ++++++
 test/test_convolution.cpp | 37 +++++++++++++++++++++++++++++
 4 files changed, 93 insertions(+), 1 deletion(-)
 create mode 100644 test/test_convolution.cpp

diff --git a/code/convolution.h b/code/convolution.h
index 66e8c8c..5e0ceb4 100644
--- a/code/convolution.h
+++ b/code/convolution.h
@@ -23,4 +23,5 @@ void ConvolveReference(float* image, const float* kernel, size_t width,
 
 void ConvolveSerial(float* image, const float* kernel, size_t width,
                     size_t height);
+
 #endif
diff --git a/test/helpers.cpp b/test/helpers.cpp
index 60cabef..9b4b04b 100644
--- a/test/helpers.cpp
+++ b/test/helpers.cpp
@@ -16,6 +16,14 @@ void compareSingle(const std::vector<T>& lv, const std::vector<T>& rv,
   REQUIRE_THAT(lv, Catch::Matchers::WithinAbs(rv, precision));
 }
 
+template <typename T>
+void compareMulti(const std::vector<T>& lv, const std::vector<T>& rv,
+                  float precision) {
+  for (size_t idx = 0; idx < lv.size(); idx++) {
+    REQUIRE_THAT(lv[idx], Catch::Matchers::WithinAbs(rv[idx], precision));
+  }
+}
+
 template <>
 void compareSingle(const std::vector<std::complex<float>>& lv,
                    const std::vector<std::complex<float>>& rv,
@@ -49,7 +57,7 @@ void compareArrays(const std::string& test, unsigned line, std::array<T, N> lhs,
 
   std::stringstream ss;
   ss << "Expected : \n";
-  for (size_t idx = 0; idx < N; idx++) {
+  for (size_t idx = 0; idx < lv.size(); idx++) {
     ss << valueToString(lhs[idx]) << "\t";
   }
 
@@ -59,14 +67,53 @@ void compareArrays(const std::string& test, unsigned line, std::array<T, N> lhs,
   }
   ss << "\n";
   INFO("Reason: \n" << ss.str());
+
   compareSingle(lv, rv, precision);
 }
 
+template <typename T>
+void compareVectors(const std::string& test, unsigned line, std::vector<T> lhs,
+                    std::vector<T> rhs, float precision) {
+  INFO("Test case [" << test << "] failed at line "
+                     << line);  // Reported only if REQUIRE fails
+  // lhs holds the expected values, rhs the obtained ones.
+  std::stringstream ss;
+  if (lhs.size() != rhs.size()) {
+    ss << " Size mismatch\n";
+    ss << "Expected size : " << lhs.size() << "\n";
+    ss << "Obtained size : " << rhs.size() << "\n";
+    UNSCOPED_INFO("Reason: \n" << ss.str());  // must outlive this if-block
+  }
+  // Abort on a size mismatch: the loops below index rhs up to lhs.size().
+  REQUIRE(lhs.size() == rhs.size());
+  const size_t N = lhs.size();
+  ss << "Expected : \n";
+  for (size_t idx = 0; idx < N; idx++) {
+    ss << valueToString(lhs[idx]) << "\t";
+  }
+
+  ss << "\nObtained : \n";
+  for (size_t idx = 0; idx < N; idx++) {
+    ss << valueToString(rhs[idx]) << "\t";
+  }
+  ss << "\n";
+  INFO("Reason: \n" << ss.str());
+  compareMulti(lhs, rhs, precision);
+}
+
 template void compareArrays(const std::string& test, unsigned line,
                             std::array<std::complex<float>, 4ul> lhs,
                             std::array<std::complex<float>, 4ul> rhs,
                             float precision);
 
+template void compareVectors(const std::string& test, unsigned line,
+                             std::vector<float> lhs, std::vector<float> rhs,
+                             float precision);
+
+template void compareVectors(const std::string& test, unsigned line,
+                             std::vector<double> lhs, std::vector<double> rhs,
+                             float precision);
+
 void AssertEqual(const aocommon::Matrix4x4& a, const aocommon::Matrix4x4& b,
                  float precision) {
   for (size_t i = 0; i < 16; i++) {
diff --git a/test/helpers.h b/test/helpers.h
index 83f306f..f3ce809 100644
--- a/test/helpers.h
+++ b/test/helpers.h
@@ -14,6 +14,9 @@
 #define COMPARE_ARRAYS(lhs, rhs, precision)                                    \
   compareArrays(Catch::getResultCapture().getCurrentTestName(), __LINE__, lhs, \
                 rhs, precision)
+#define COMPARE_VECTORS(lhs, rhs, precision)                               \
+  compareVectors(Catch::getResultCapture().getCurrentTestName(), __LINE__, \
+                 lhs, rhs, precision)
 
 template <typename T>
 void compareSingle(const std::vector<T>& lv, const std::vector<T>& rv,
@@ -32,6 +35,10 @@ template <typename T, size_t N>
 void compareArrays(const std::string& test, unsigned line, std::array<T, N> lhs,
                    std::array<T, N> rhs, float precision);
 
+template <typename T>
+void compareVectors(const std::string& test, unsigned line, std::vector<T> lhs,
+                    std::vector<T> rhs, float precision);
+
 void AssertEqual(const aocommon::Matrix4x4& a, const aocommon::Matrix4x4& b,
                  float precision);
 
diff --git a/test/test_convolution.cpp b/test/test_convolution.cpp
new file mode 100644
index 0000000..4227e34
--- /dev/null
+++ b/test/test_convolution.cpp
@@ -0,0 +1,37 @@
+#include <convolution.h>
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+
+#include "helpers.h"
+
+TEST_CASE("test convolution", "[float]") {
+  // This setup is repeated for every SECTION below (3 times in total)
+  size_t width = 16;
+  size_t height = 32;
+
+  std::vector<float> image(width * height);
+  std::vector<float> expected_image(width * height);
+
+  std::vector<float> kernel(width * height);
+  Initialize(image.data(), width, height);
+  std::copy(image.begin(), image.end(), expected_image.begin());
+  Initialize(kernel.data(), width, height);
+
+  ConvolveReference(expected_image.data(), kernel.data(), width, height);
+
+  SECTION("test correctness of reference implementation") {
+    ConvolveReference(image.data(), kernel.data(), width, height);
+    COMPARE_VECTORS(expected_image, image, 1.e-5);
+  }
+
+  SECTION("test correctness of reference implementation twice") {
+    ConvolveReference(image.data(), kernel.data(), width, height);
+    COMPARE_VECTORS(expected_image, image, 1.e-5);
+  }
+
+  SECTION("test correctness of serial implementation") {
+    ConvolveSerial(image.data(), kernel.data(), width, height);
+    COMPARE_VECTORS(expected_image, image, 1.e-5);
+  }
+}
\ No newline at end of file
-- 
GitLab


From 09fc57641cd73355524fb48419acc53c1056735e Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Thu, 6 Feb 2025 17:45:56 +0100
Subject: [PATCH 04/18] Use HTTPS instead of SSH to clone schaapcommon

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fdbe25d..26c5e45 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,7 +53,7 @@ FetchContent_MakeAvailable(aocommon)
 # Make schaapspack available
 FetchContent_Declare(
   schaapcommon
-  GIT_REPOSITORY git@git.astron.nl:RD/schaapcommon.git
+  GIT_REPOSITORY https://git.astron.nl/RD/schaapcommon.git
   GIT_TAG master)
 
 FetchContent_MakeAvailable(schaapcommon)
-- 
GitLab


From 247218cff598ce95e5ed5264c1e158692952559c Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Thu, 6 Feb 2025 17:48:37 +0100
Subject: [PATCH 05/18] Add missing dependency

---
 docker/Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 67a2ef8..a60a0d8 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -9,6 +9,7 @@ RUN apt-get update -qq &&\
     git \
     libblas-dev liblapack-dev \
     libboost-date-time-dev \
+    libbost-filesystem-dev \
     libboost-test-dev \
     libboost-dev \
     libcfitsio-dev \
-- 
GitLab


From ac9acc622bee1834db0a622e7605a4ed71306c64 Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Thu, 6 Feb 2025 17:49:34 +0100
Subject: [PATCH 06/18] Add fftw3

---
 docker/Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index a60a0d8..4f51d02 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -18,6 +18,7 @@ RUN apt-get update -qq &&\
     libhdf5-dev \
     libopenmpi-dev \
     libpython3-dev \
+    libfftw3-dev \
     pkg-config \
     python3-dev python3-numpy \
     python3-sphinx \
-- 
GitLab


From 0f361b708b7dc360f73009666c978b6fbe08eca1 Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Thu, 6 Feb 2025 17:51:33 +0100
Subject: [PATCH 07/18] Fix name of library

---
 docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 4f51d02..6b72250 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -9,7 +9,7 @@ RUN apt-get update -qq &&\
     git \
     libblas-dev liblapack-dev \
     libboost-date-time-dev \
-    libbost-filesystem-dev \
+    libboost-filesystem-dev \
     libboost-test-dev \
     libboost-dev \
     libcfitsio-dev \
-- 
GitLab


From 1131c9a79e460da3b7a062a8950159f2106ec0e9 Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Thu, 6 Feb 2025 19:56:14 +0100
Subject: [PATCH 08/18] Force pull policy

---
 .gitlab-ci.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d1d46da..28fda75 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -25,7 +25,9 @@ docker-base:
       - docker/Dockerfile
 
 .build_docker:
-  image: "$CI_REGISTRY_IMAGE:latest"
+  image:
+    name: "$CI_REGISTRY_IMAGE:latest"
+    pull_policy: always
   before_script:
     - cmake -B build . -DCMAKE_BUILD_TYPE=Release
     - make -C build -j
@@ -33,7 +35,9 @@ docker-base:
 
 build-job:       # This job runs in the build stage, which runs first.
   stage: build
-  image: "$CI_REGISTRY_IMAGE:latest"
+  image:
+    name: "$CI_REGISTRY_IMAGE:latest"
+    pull_policy: always
   script:
     - cmake -B build . -DCMAKE_BUILD_TYPE=Release
     - make -C build -j
-- 
GitLab


From 5ae0648f742bf7dda19cafb2228d3af07f5fc92b Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Thu, 6 Feb 2025 19:59:13 +0100
Subject: [PATCH 09/18] Summarize convolution results

---
 .gitlab-ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 28fda75..d4e9793 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -155,4 +155,5 @@ collect-performance:
   - python3 ci/summarize-results.py --filter MatrixMultiplication results*.json result-summary-matrix-multiplication
   - python3 ci/summarize-results.py --filter HermitianSquare results*.json result-summary-hermitian-square
   - python3 ci/summarize-results.py --filter KroneckerSquare results*.json result-summary-kronecker-square
+  - python3 ci/summarize-results.py --filter Convolution results*.json result-summary-convolution
   
-- 
GitLab


From 5ea6c8fb99c1d840c8890e335b346afddd4b1c36 Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Thu, 6 Feb 2025 20:01:31 +0100
Subject: [PATCH 10/18] Nodes need boost

---
 ci/das6/compile_and_run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/das6/compile_and_run.sh b/ci/das6/compile_and_run.sh
index 5d89c01..3641f71 100755
--- a/ci/das6/compile_and_run.sh
+++ b/ci/das6/compile_and_run.sh
@@ -11,7 +11,7 @@ COMPILER_VERSION=$2
 echo RUNNING ON ${COMPILER_VERSION} AND ${ARCHITECTURE}
 BUILD_DIR=build-${COMPILER_VERSION}-${ARCHITECTURE}
 module load spack/${COMPILER_VERSION}
-module load cmake
+module load cmake boost
 
 cmake -B ${BUILD_DIR} . -DCMAKE_BUILD_TYPE=Release
 
-- 
GitLab


From a4e5c2ca5de48f44df701e08439fda963f829fae Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Thu, 6 Feb 2025 20:07:28 +0100
Subject: [PATCH 11/18] Add more dependencies

---
 ci/das6/compile_and_run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/das6/compile_and_run.sh b/ci/das6/compile_and_run.sh
index 3641f71..4b8175d 100755
--- a/ci/das6/compile_and_run.sh
+++ b/ci/das6/compile_and_run.sh
@@ -11,7 +11,7 @@ COMPILER_VERSION=$2
 echo RUNNING ON ${COMPILER_VERSION} AND ${ARCHITECTURE}
 BUILD_DIR=build-${COMPILER_VERSION}-${ARCHITECTURE}
 module load spack/${COMPILER_VERSION}
-module load cmake boost
+module load cmake boost casacore fftw3
 
 cmake -B ${BUILD_DIR} . -DCMAKE_BUILD_TYPE=Release
 
-- 
GitLab


From bc4fd251874b2bcb640171defb78a685bbc184b5 Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Thu, 6 Feb 2025 20:13:13 +0100
Subject: [PATCH 12/18] Fix wrong module name

---
 ci/das6/compile_and_run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/das6/compile_and_run.sh b/ci/das6/compile_and_run.sh
index 4b8175d..34d5ef9 100755
--- a/ci/das6/compile_and_run.sh
+++ b/ci/das6/compile_and_run.sh
@@ -11,7 +11,7 @@ COMPILER_VERSION=$2
 echo RUNNING ON ${COMPILER_VERSION} AND ${ARCHITECTURE}
 BUILD_DIR=build-${COMPILER_VERSION}-${ARCHITECTURE}
 module load spack/${COMPILER_VERSION}
-module load cmake boost casacore fftw3
+module load cmake boost casacore fftw
 
 cmake -B ${BUILD_DIR} . -DCMAKE_BUILD_TYPE=Release
 
-- 
GitLab


From 0e46a84507c32c4bc8d94848f760e6809c428815 Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Fri, 7 Feb 2025 09:00:50 +0100
Subject: [PATCH 13/18] Add hdf5

---
 ci/das6/compile_and_run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/das6/compile_and_run.sh b/ci/das6/compile_and_run.sh
index 34d5ef9..9838a5d 100755
--- a/ci/das6/compile_and_run.sh
+++ b/ci/das6/compile_and_run.sh
@@ -11,7 +11,7 @@ COMPILER_VERSION=$2
 echo RUNNING ON ${COMPILER_VERSION} AND ${ARCHITECTURE}
 BUILD_DIR=build-${COMPILER_VERSION}-${ARCHITECTURE}
 module load spack/${COMPILER_VERSION}
-module load cmake boost casacore fftw
+module load cmake boost casacore fftw hdf5
 
 cmake -B ${BUILD_DIR} . -DCMAKE_BUILD_TYPE=Release
 
-- 
GitLab


From 6de58f42860c3f481b2ecbc60381ae6017ee04f0 Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Fri, 7 Feb 2025 09:16:58 +0100
Subject: [PATCH 14/18] Add explicitly fftw3

---
 CMakeLists.txt | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 26c5e45..f694c8f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,12 +71,14 @@ file(GLOB TEST_SOURCES "test/*.cpp")
 add_executable(unittests ${TEST_SOURCES} ${KERNEL_SOURCES})
 
 find_package(OpenMP)
+find_package(PkgConfig REQUIRED)
+pkg_search_module(FFTW REQUIRED IMPORTED_TARGET fftw3f fftw3)  # sources call fftwf_* (single precision); fftw3 kept as fallback
 
 # Link against Google Benchmark
 target_link_libraries(microbenchmarks PRIVATE benchmark::benchmark)
 target_include_directories(microbenchmarks PRIVATE ${aocommon_SOURCE_DIR}/include ${schaapcommon_SOURCE_DIR}/include)
 target_include_directories(microbenchmarks PRIVATE code)
-target_link_libraries(microbenchmarks PRIVATE schaapcommon)
+target_link_libraries(microbenchmarks PRIVATE schaapcommon PkgConfig::FFTW)
 target_compile_options(microbenchmarks PUBLIC ${COMPILER_FLAGS})
 if(OpenMP_CXX_FOUND)
     target_link_libraries(microbenchmarks PRIVATE OpenMP::OpenMP_CXX)
@@ -91,7 +93,7 @@ catch_discover_tests(unittests WORKING_DIRECTORY)
 target_link_libraries(unittests PRIVATE Catch2::Catch2WithMain)
 target_include_directories(unittests PRIVATE code)
 target_include_directories(unittests PRIVATE ${aocommon_SOURCE_DIR}/include  ${schaapcommon_SOURCE_DIR}/include)
-target_link_libraries(unittests PRIVATE schaapcommon)
+target_link_libraries(unittests PRIVATE schaapcommon PkgConfig::FFTW)
 target_compile_options(unittests PUBLIC ${COMPILER_FLAGS})
 
 foreach(KERNEL_SOURCE ${KERNEL_SOURCES})
-- 
GitLab


From b590d541623bbe866c5457fdeafae072d6c0b940 Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Fri, 7 Feb 2025 09:48:31 +0100
Subject: [PATCH 15/18] Add include to precompile

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f694c8f..cac174a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -100,7 +100,7 @@ foreach(KERNEL_SOURCE ${KERNEL_SOURCES})
 
 get_filename_component(KERNEL_NAME ${KERNEL_SOURCE} NAME_WE)
 add_precompile_target(TARGET_NAME ${KERNEL_NAME} 
-                        INCLUDE_DIRS code ${aocommon_SOURCE_DIR}/include ${schaapcommon_SOURCE_DIR}/include
+                        INCLUDE_DIRS code ${aocommon_SOURCE_DIR}/include ${schaapcommon_SOURCE_DIR}/include ${FFTW_INCLUDE_DIRS}
                         SOURCES ${KERNEL_SOURCE}
                         COMPILER_FLAGS ${COMPILER_FLAGS})
 endforeach()
-- 
GitLab


From 070d07dbabcc3d0ea759f5071e382bd943e9eca6 Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Fri, 7 Feb 2025 10:31:23 +0100
Subject: [PATCH 16/18] Add cfitsio

---
 ci/das6/compile_and_run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/das6/compile_and_run.sh b/ci/das6/compile_and_run.sh
index 9838a5d..295945e 100755
--- a/ci/das6/compile_and_run.sh
+++ b/ci/das6/compile_and_run.sh
@@ -11,7 +11,7 @@ COMPILER_VERSION=$2
 echo RUNNING ON ${COMPILER_VERSION} AND ${ARCHITECTURE}
 BUILD_DIR=build-${COMPILER_VERSION}-${ARCHITECTURE}
 module load spack/${COMPILER_VERSION}
-module load cmake boost casacore fftw hdf5
+module load cmake boost casacore fftw hdf5 cfitsio
 
 cmake -B ${BUILD_DIR} . -DCMAKE_BUILD_TYPE=Release
 
-- 
GitLab


From 7a250ed25e1e8afc144cd32970b750987da21186 Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Fri, 7 Feb 2025 10:42:00 +0100
Subject: [PATCH 17/18] Fix naming

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d4e9793..5f614c6 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -155,5 +155,5 @@ collect-performance:
   - python3 ci/summarize-results.py --filter MatrixMultiplication results*.json result-summary-matrix-multiplication
   - python3 ci/summarize-results.py --filter HermitianSquare results*.json result-summary-hermitian-square
   - python3 ci/summarize-results.py --filter KroneckerSquare results*.json result-summary-kronecker-square
-  - python3 ci/summarize-results.py --filter Convolution results*.json result-summary-convolution
+  - python3 ci/summarize-results.py --filter Convolve results*.json result-summary-convolution
   
-- 
GitLab


From 75da170fbd79219faf2a70f4f9a22a47fd75a9dd Mon Sep 17 00:00:00 2001
From: mancini <mancini@astron.nl>
Date: Fri, 7 Feb 2025 11:08:22 +0100
Subject: [PATCH 18/18] Reduce the number of tests

---
 benchmarks/convolution.cpp | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/benchmarks/convolution.cpp b/benchmarks/convolution.cpp
index ae82e9f..d1dbb4a 100644
--- a/benchmarks/convolution.cpp
+++ b/benchmarks/convolution.cpp
@@ -35,11 +35,6 @@ BENCHMARK_DEFINE_F(InitializeInput, ConvolveReference)
   }
 }
 BENCHMARK_REGISTER_F(InitializeInput, ConvolveReference)
-    ->Args({64, 32})
-    ->Args({128, 64})
-    ->Args({256, 126})
-    ->Args({512, 754})
-    ->Args({1024, 124})
     ->Args({2048, 1000})
     ->Args({4096, 5000});
 
@@ -53,10 +48,5 @@ BENCHMARK_DEFINE_F(InitializeInput, ConvolveSerial)
 }
 
 BENCHMARK_REGISTER_F(InitializeInput, ConvolveSerial)
-    ->Args({64, 32})
-    ->Args({128, 64})
-    ->Args({256, 126})
-    ->Args({512, 754})
-    ->Args({1024, 124})
     ->Args({2048, 1000})
     ->Args({4096, 5000});
-- 
GitLab