diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d11a266f967e9699e0bccf0472fef125ad6ae421..5f614c6fb34f649ee1dfe73a5cdf2834b02b9851 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -25,7 +25,9 @@ docker-base: - docker/Dockerfile .build_docker: - image: "$CI_REGISTRY_IMAGE:latest" + image: + name: "$CI_REGISTRY_IMAGE:latest" + pull_policy: always before_script: - cmake -B build . -DCMAKE_BUILD_TYPE=Release - make -C build -j @@ -33,7 +35,9 @@ docker-base: build-job: # This job runs in the build stage, which runs first. stage: build - image: "$CI_REGISTRY_IMAGE:latest" + image: + name: "$CI_REGISTRY_IMAGE:latest" + pull_policy: always script: - cmake -B build . -DCMAKE_BUILD_TYPE=Release - make -C build -j @@ -96,6 +100,26 @@ performance-jetson: # This job runs in the test stage. access: all expire_in: 1 days +performance-gracehopper: # This job runs in the test stage. + allow_failure: true + tags: + - das6-gpu + stage: benchmark # It only starts when the job in the build stage completes successfully. + script: + - sbatch --wait -p ghq -o output.txt -e error.txt ci/das6/compile_and_run_native_sh ghq_arm64 + - cat output.txt >&1 + - cat error.txt >&2 + + artifacts: + paths: + - ./results*.json + - ./output.txt + - ./error.txt + - ./*.tar + when: always + access: all + expire_in: 1 days + performance-generic: stage: benchmark image: "$CI_REGISTRY_IMAGE:latest" @@ -131,4 +155,5 @@ collect-performance: - python3 ci/summarize-results.py --filter MatrixMultiplication results*.json result-summary-matrix-multiplication - python3 ci/summarize-results.py --filter HermitianSquare results*.json result-summary-hermitian-square - python3 ci/summarize-results.py --filter KroneckerSquare results*.json result-summary-kronecker-square + - python3 ci/summarize-results.py --filter Convolve results*.json result-summary-convolution diff --git a/CMakeLists.txt b/CMakeLists.txt index 73d5509823c9b275c2517ed7fae9649b55ec0962..cac174ac8fc69d09e34e5956e4c07f36dbd801c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,9 +45,19 @@ include(Catch) FetchContent_Declare( aocommon GIT_REPOSITORY https://gitlab.com/aroffringa/aocommon.git + GIT_TAG master + EXCLUDE_FROM_ALL) + +FetchContent_MakeAvailable(aocommon) + +# Make schaapspack available +FetchContent_Declare( + schaapcommon + GIT_REPOSITORY https://git.astron.nl/RD/schaapcommon.git GIT_TAG master) -FetchContent_Populate(aocommon) +FetchContent_MakeAvailable(schaapcommon) +target_include_directories(schaapcommon PUBLIC ${aocommon_SOURCE_DIR}/include) set(COMPILER_FLAGS "-O3;-march=native;-ggdb;") @@ -61,11 +71,14 @@ file(GLOB TEST_SOURCES "test/*.cpp") add_executable(unittests ${TEST_SOURCES} ${KERNEL_SOURCES}) find_package(OpenMP) +find_package(PkgConfig REQUIRED) +pkg_search_module(FFTW REQUIRED fftw3 IMPORTED_TARGET) # Link against Google Benchmark target_link_libraries(microbenchmarks PRIVATE benchmark::benchmark) -target_include_directories(microbenchmarks PRIVATE ${aocommon_SOURCE_DIR}/include) +target_include_directories(microbenchmarks PRIVATE ${aocommon_SOURCE_DIR}/include ${schaapcommon_SOURCE_DIR}/include) target_include_directories(microbenchmarks PRIVATE code) +target_link_libraries(microbenchmarks PRIVATE schaapcommon PkgConfig::FFTW) target_compile_options(microbenchmarks PUBLIC ${COMPILER_FLAGS}) if(OpenMP_CXX_FOUND) target_link_libraries(microbenchmarks PRIVATE OpenMP::OpenMP_CXX) @@ -79,15 +92,15 @@ list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras) catch_discover_tests(unittests WORKING_DIRECTORY) target_link_libraries(unittests PRIVATE Catch2::Catch2WithMain) target_include_directories(unittests PRIVATE code) -target_include_directories(unittests PRIVATE ${aocommon_SOURCE_DIR}/include) - +target_include_directories(unittests PRIVATE ${aocommon_SOURCE_DIR}/include ${schaapcommon_SOURCE_DIR}/include) +target_link_libraries(unittests PRIVATE schaapcommon PkgConfig::FFTW) target_compile_options(unittests PUBLIC ${COMPILER_FLAGS}) foreach(KERNEL_SOURCE ${KERNEL_SOURCES}) get_filename_component(KERNEL_NAME ${KERNEL_SOURCE} NAME_WE) add_precompile_target(TARGET_NAME ${KERNEL_NAME} - INCLUDE_DIRS code ${aocommon_SOURCE_DIR}/include + INCLUDE_DIRS code ${aocommon_SOURCE_DIR}/include ${schaapcommon_SOURCE_DIR}/include ${FFTW_INCLUDE_DIRS} SOURCES ${KERNEL_SOURCE} COMPILER_FLAGS ${COMPILER_FLAGS}) endforeach() diff --git a/benchmarks/convolution.cpp b/benchmarks/convolution.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d1dbb4a91aab9b384a4038b0f4db35092e7e3576 --- /dev/null +++ b/benchmarks/convolution.cpp @@ -0,0 +1,52 @@ +#include <benchmark/benchmark.h> +#include <convolution.h> +#include <memory> +#include <vector> + +namespace { + +class InitializeInput : public benchmark::Fixture { + public: + void SetUp(::benchmark::State& state) { + size_t width = state.range(0); + size_t height = state.range(1); + image = std::make_unique<std::vector<float>>(width * height); + kernel = std::make_unique<std::vector<float>>(width * height); + + Initialize(image->data(), width, height); + Initialize(kernel->data(), width, height); + } + void TearDown(::benchmark::State& state) { + image.reset(); + kernel.reset(); + } + + std::unique_ptr<std::vector<float>> image; + std::unique_ptr<std::vector<float>> kernel; +}; +} // namespace + +// Reference standard +BENCHMARK_DEFINE_F(InitializeInput, ConvolveReference) +(benchmark::State& state) { + for (auto _ : state) { + ConvolveReference(image->data(), kernel->data(), state.range(0), + state.range(1)); + } +} +BENCHMARK_REGISTER_F(InitializeInput, ConvolveReference) + ->Args({2048, 1000}) + ->Args({4096, 5000}); + +// FFTW serial standard +BENCHMARK_DEFINE_F(InitializeInput, ConvolveSerial) +(benchmark::State& state) { + for (auto _ : state) { + ConvolveSerial(image->data(), kernel->data(), state.range(0), + state.range(1)); + } +} + +BENCHMARK_REGISTER_F(InitializeInput, ConvolveSerial) + ->Args({2048, 1000}) + ->Args({4096, 5000}); diff --git a/ci/das6/compile_and_run.sh b/ci/das6/compile_and_run.sh index 5d89c013ffd5e1a249a3ae1969e4025811288e6b..295945ed5fa92fb65c2b2f18cae6ea635f57b613 100755 --- a/ci/das6/compile_and_run.sh +++ b/ci/das6/compile_and_run.sh @@ -11,7 +11,7 @@ COMPILER_VERSION=$2 echo RUNNING ON ${COMPILER_VERSION} AND ${ARCHITECTURE} BUILD_DIR=build-${COMPILER_VERSION}-${ARCHITECTURE} module load spack/${COMPILER_VERSION} -module load cmake +module load cmake boost casacore fftw hdf5 cfitsio cmake -B ${BUILD_DIR} . -DCMAKE_BUILD_TYPE=Release diff --git a/ci/das6/compile_and_run_native.sh b/ci/das6/compile_and_run_native.sh new file mode 100644 index 0000000000000000000000000000000000000000..4125cdc9cd3125349c26fc6ed38c208fdc997061 --- /dev/null +++ b/ci/das6/compile_and_run_native.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -e +# compile the code and run it on das6 +# Specify the compiler version and architecture + +ARCHITECTURE=$1 +COMPILER_VERSION=$(gcc --version | head -n 1 | awk '{print $4}') + + +echo RUNNING ON ${COMPILER_VERSION} AND ${ARCHITECTURE} +BUILD_DIR=build-${COMPILER_VERSION}-${ARCHITECTURE} + +cmake -B ${BUILD_DIR} . -DCMAKE_BUILD_TYPE=Release + +make -C ${BUILD_DIR} -j + +tar -cvf asm-${ARCHITECTURE}-${COMPILER_VERSION}.tar ${BUILD_DIR}/*.s + +${BUILD_DIR}/microbenchmarks --benchmark_out=results-${COMPILER_VERSION}-${ARCHITECTURE}.json --benchmark_out_format=json \ No newline at end of file diff --git a/code/convolution.h b/code/convolution.h new file mode 100644 index 0000000000000000000000000000000000000000..5e0ceb4582c03e27d1064a3eb4b1c1e27c423aba --- /dev/null +++ b/code/convolution.h @@ -0,0 +1,27 @@ +#ifndef CONVOLUTION_H_ +#define CONVOLUTION_H_ + +#include <complex> +#include <iomanip> +#include <iostream> +#include <random> + +inline void Initialize(float* a, const size_t width, const size_t height) { + // Initialize matrices with random complex values + std::seed_seq seed({42}); + std::mt19937 gen(seed); + std::uniform_real_distribution<float> dis(-1.0, 1.0); + const size_t linear_size = width * height; + for (int i = 0; i < linear_size; i++) { + a[i] = dis(gen); + } +} + +// Function to perform matrix multiplication for 2x2 complex matrices +void ConvolveReference(float* image, const float* kernel, size_t width, + size_t height); + +void ConvolveSerial(float* image, const float* kernel, size_t width, + size_t height); + +#endif diff --git a/code/convolution_reference.cpp b/code/convolution_reference.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e83c8d080cbd8578c2dc5d51270e0a09048e2c04 --- /dev/null +++ b/code/convolution_reference.cpp @@ -0,0 +1,8 @@ +#include "convolution.h" + +#include <schaapcommon/math/convolution.h> + +void ConvolveReference(float* image, const float* kernel, size_t width, + size_t height) { + schaapcommon::math::Convolve(image, kernel, width, height); +} \ No newline at end of file diff --git a/code/convolution_serial_fftw.cpp b/code/convolution_serial_fftw.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e097730f93b3318c67a35c55cdc53618eac39cb4 --- /dev/null +++ b/code/convolution_serial_fftw.cpp @@ -0,0 +1,190 @@ +#include "convolution.h" +#include <fftw3.h> +#include <algorithm> +// Partially unroll rows/columns with a factor of kUnroll +constexpr size_t kUnroll = 4; + +// With kUnroll > 1, the temporary buffers need to be aligned +// for FFTW to work correctly. +constexpr size_t kAlignment = 64; + +size_t RoundUp(size_t a, size_t b) { return ((a + b) / b) * b; } + +void FftR2CComposite(fftwf_plan plan_r2c, fftwf_plan plan_c2c, + size_t image_height, size_t image_width, const float* in, + fftwf_complex* out) { + const size_t complex_width = image_width / 2 + 1; + const size_t complex_size = image_height * complex_width; + + fftwf_complex* temp1 = fftwf_alloc_complex(complex_size); + + fftwf_complex* temp2 = fftwf_alloc_complex(complex_width); + float* temp2_ptr = reinterpret_cast<float*>(temp2); + for (size_t y = 0; y < image_height; y++) { + float* temp1_ptr = reinterpret_cast<float*>(&temp1[y * complex_width]); + std::copy_n(&in[y * image_width], image_width, temp2_ptr); + fftwf_execute_dft_r2c(plan_r2c, temp2_ptr, temp2); + std::copy_n(temp2_ptr, 2 * complex_width, temp1_ptr); + } + fftwf_free(temp2); + + // Partially kUnroll over columns + size_t padded_height = RoundUp(image_height, kAlignment); + temp2 = fftwf_alloc_complex(kUnroll * padded_height); + + for (size_t x = 0; x < complex_width; x += kUnroll) { + // Copy input + for (size_t y = 0; y < image_height; y++) { + for (size_t i = 0; i < kUnroll; i++) { + if ((x + i) < complex_width) { + float* temp1_ptr = + reinterpret_cast<float*>(&temp1[y * complex_width + x + i]); + float* temp2_ptr = + reinterpret_cast<float*>(&temp2[i * padded_height + y]); + std::copy_n(temp1_ptr, 2, temp2_ptr); + } + } + } + + // Perform 1D FFT over columns + for (size_t i = 0; i < kUnroll; i++) { + fftwf_complex* temp2_ptr = &temp2[i * padded_height]; + fftwf_execute_dft(plan_c2c, temp2_ptr, temp2_ptr); + } + + // Transpose output + for (size_t y = 0; y < image_height; y++) { + for (size_t i = 0; i < kUnroll; i++) { + if ((x + i) < complex_width) { + float* temp2_ptr = + reinterpret_cast<float*>(&temp2[i * padded_height + y]); + float* out_ptr = + reinterpret_cast<float*>(&out[y * complex_width + x + i]); + std::copy_n(temp2_ptr, 2, out_ptr); + } + } + } + } + + fftwf_free(temp2); + fftwf_free(temp1); +} + +void FftC2RComposite(fftwf_plan plan_c2c, fftwf_plan plan_c2r, + size_t image_height, size_t image_width, + const fftwf_complex* in, float* out) { + const size_t complex_width = image_width / 2 + 1; + + size_t padded_height = RoundUp(image_height, kAlignment); + size_t padded_size = padded_height * complex_width; + fftwf_complex* temp1 = fftwf_alloc_complex(padded_size); + + for (size_t x = 0; x < complex_width; x += kUnroll) { + // Transpose input + for (size_t y = 0; y < image_height; y++) { + for (size_t i = 0; i < kUnroll; i++) { + if ((x + i) < complex_width) { + const float* in_ptr = + reinterpret_cast<const float*>(&in[y * complex_width + x + i]); + float* temp1_ptr = + reinterpret_cast<float*>(&temp1[(x + i) * padded_height + y]); + std::copy_n(in_ptr, 2, temp1_ptr); + } + } + } + + // Perform 1D C2C FFT over columns + for (size_t i = 0; i < kUnroll; i++) { + if ((x + i) < complex_width) { + fftwf_complex* temp1_ptr = &temp1[(x + i) * padded_height]; + fftwf_execute_dft(plan_c2c, temp1_ptr, temp1_ptr); + } + } + } + + size_t paddedWidth = RoundUp(complex_width, kAlignment); + fftwf_complex* temp2 = fftwf_alloc_complex(kUnroll * paddedWidth); + + for (size_t y = 0; y < image_height; y += kUnroll) { + // Transpose input + for (size_t x = 0; x < complex_width; x++) { + for (size_t i = 0; i < kUnroll; i++) { + if ((y + i) < image_height) { + float* temp1_ptr = + reinterpret_cast<float*>(&temp1[x * padded_height + y + i]); + float* temp2_ptr = + reinterpret_cast<float*>(&temp2[i * paddedWidth + x]); + std::copy_n(temp1_ptr, 2, temp2_ptr); + } + } + } + + // Perform 1D C2R FFT over rows + for (size_t i = 0; i < kUnroll; i++) { + if ((y + i) < image_height) { + fftwf_complex* temp2_ptr = &temp2[i * paddedWidth]; + fftwf_execute_dft_c2r(plan_c2r, temp2_ptr, + reinterpret_cast<float*>(temp2_ptr)); + } + } + + // Copy output + for (size_t i = 0; i < kUnroll; i++) { + if ((y + i) < image_height) { + float* temp2_ptr = reinterpret_cast<float*>(&temp2[i * paddedWidth]); + std::copy_n(temp2_ptr, image_width, &out[(y + i) * image_width]); + } + } + } + + fftwf_free(temp2); + + fftwf_free(temp1); +} + +void ConvolveSerial(float* image, const float* kernel, size_t image_width, + size_t image_height) { + const size_t image_size = image_width * image_height; + const size_t complex_width = image_width / 2 + 1; + const size_t complex_size = complex_width * image_height; + float* temp_data = fftwf_alloc_real(image_size); + fftwf_complex* fft_image_data = fftwf_alloc_complex(complex_size); + fftwf_complex* fft_kernel_data = fftwf_alloc_complex(complex_size); + + fftwf_plan plan_r2c = + fftwf_plan_dft_r2c_1d(image_width, nullptr, nullptr, FFTW_ESTIMATE); + fftwf_plan plan_c2c_forward = fftwf_plan_dft_1d( + image_height, nullptr, nullptr, FFTW_FORWARD, FFTW_ESTIMATE); + fftwf_plan plan_c2c_backward = fftwf_plan_dft_1d( + image_height, nullptr, nullptr, FFTW_BACKWARD, FFTW_ESTIMATE); + fftwf_plan plan_c2r = + fftwf_plan_dft_c2r_1d(image_width, nullptr, nullptr, FFTW_ESTIMATE); + + FftR2CComposite(plan_r2c, plan_c2c_forward, image_height, image_width, image, + fft_image_data); + + std::copy_n(kernel, image_size, temp_data); + FftR2CComposite(plan_r2c, plan_c2c_forward, image_height, image_width, + temp_data, fft_kernel_data); + + const float fact = 1.0 / image_size; + for (size_t y = 0; y != image_height; ++y) { + for (size_t x = 0; x != complex_width; ++x) { + const size_t i = y * complex_width + x; + reinterpret_cast<std::complex<float>*>(fft_image_data)[i] *= + fact * reinterpret_cast<std::complex<float>*>(fft_kernel_data)[i]; + } + } + + FftC2RComposite(plan_c2c_backward, plan_c2r, image_height, image_width, + fft_image_data, image); + + fftwf_free(fft_image_data); + fftwf_free(fft_kernel_data); + fftwf_free(temp_data); + + fftwf_destroy_plan(plan_r2c); + fftwf_destroy_plan(plan_c2c_forward); + fftwf_destroy_plan(plan_c2c_backward); + fftwf_destroy_plan(plan_c2r); +} \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile index 67a2ef8ced00cd29f5b828431a2daddc12ea2362..6b722504440a008a04fadee1a38c185dd1e1e354 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -9,6 +9,7 @@ RUN apt-get update -qq &&\ git \ libblas-dev liblapack-dev \ libboost-date-time-dev \ + libboost-filesystem-dev \ libboost-test-dev \ libboost-dev \ libcfitsio-dev \ @@ -17,6 +18,7 @@ RUN apt-get update -qq &&\ libhdf5-dev \ libopenmpi-dev \ libpython3-dev \ + libfftw3-dev \ pkg-config \ python3-dev python3-numpy \ python3-sphinx \ diff --git a/test/helpers.cpp b/test/helpers.cpp index 60cabefaf0126cae7bd1aa937f81e0c40e0cc61d..9b4b04bc83d1e1b4d5e1260bce6e79d2d92d14bb 100644 --- a/test/helpers.cpp +++ b/test/helpers.cpp @@ -16,6 +16,14 @@ void compareSingle(const std::vector<T>& lv, const std::vector<T>& rv, REQUIRE_THAT(lv, Catch::Matchers::WithinAbs(rv, precision)); } +template <typename T> +void compareMulti(const std::vector<T>& lv, const std::vector<T>& rv, + float precision) { + for (size_t idx = 0; idx < lv.size(); idx++) { + REQUIRE_THAT(lv[idx], Catch::Matchers::WithinAbs(rv[idx], precision)); + } +} + template <> void compareSingle(const std::vector<std::complex<float>>& lv, const std::vector<std::complex<float>>& rv, @@ -49,7 +57,7 @@ void compareArrays(const std::string& test, unsigned line, std::array<T, N> lhs, std::stringstream ss; ss << "Expected : \n"; - for (size_t idx = 0; idx < N; idx++) { + for (size_t idx = 0; idx < lv.size(); idx++) { ss << valueToString(lhs[idx]) << "\t"; } @@ -59,14 +67,53 @@ void compareArrays(const std::string& test, unsigned line, std::array<T, N> lhs, } ss << "\n"; INFO("Reason: \n" << ss.str()); + compareSingle(lv, rv, precision); } +template <typename T> +void compareVectors(const std::string& test, unsigned line, std::vector<T> lhs, + std::vector<T> rhs, float precision) { + INFO("Test case [" << test << "] failed at line " + << line); // Reported only if REQUIRE fails + + std::stringstream ss; + if (lhs.size() != rhs.size()) { + ss << " Size mismatch\n"; + ss << "Expected size : " << lhs.size() << "\n"; + ss << "Obtained size : " << rhs.size() << "\n"; + INFO("Reason: \n" << ss.str()); + } + + CHECK(lhs.size() == rhs.size()); + const size_t N = lhs.size(); + ss << "Expected : \n"; + for (size_t idx = 0; idx < N; idx++) { + ss << valueToString(lhs[idx]) << "\t"; + } + + ss << "\nObtained : \n"; + for (size_t idx = 0; idx < N; idx++) { + ss << valueToString(rhs[idx]) << "\t"; + } + ss << "\n"; + INFO("Reason: \n" << ss.str()); + compareMulti(lhs, rhs, precision); +} + template void compareArrays(const std::string& test, unsigned line, std::array<std::complex<float>, 4ul> lhs, std::array<std::complex<float>, 4ul> rhs, float precision); +template void compareVectors(const std::string& test, unsigned line, + std::vector<float> lhs, std::vector<float> rhs, + float precision); + +template void compareVectors(const std::string& test, unsigned line, + std::vector<double> lhs, std::vector<double> rhs, + float precision); + void AssertEqual(const aocommon::Matrix4x4& a, const aocommon::Matrix4x4& b, float precision) { for (size_t i = 0; i < 16; i++) { diff --git a/test/helpers.h b/test/helpers.h index 83f306f74f8abfdb9ecbc9fc3fc544e3ca63aa2a..f3ce80914a73bc51418eeff5ca8436b3c6489b54 100644 --- a/test/helpers.h +++ b/test/helpers.h @@ -14,6 +14,9 @@ #define COMPARE_ARRAYS(lhs, rhs, precision) \ compareArrays(Catch::getResultCapture().getCurrentTestName(), __LINE__, lhs, \ rhs, precision) +#define COMPARE_VECTORS(lhs, rhs, precision) \ + compareVectors(Catch::getResultCapture().getCurrentTestName(), __LINE__, \ + lhs, rhs, precision) template <typename T> void compareSingle(const std::vector<T>& lv, const std::vector<T>& rv, @@ -32,6 +35,10 @@ template <typename T, size_t N> void compareArrays(const std::string& test, unsigned line, std::array<T, N> lhs, std::array<T, N> rhs, float precision); +template <typename T> +void compareVectors(const std::string& test, unsigned line, std::vector<T> lhs, + std::vector<T> rhs, float precision); + void AssertEqual(const aocommon::Matrix4x4& a, const aocommon::Matrix4x4& b, float precision); diff --git a/test/test_convolution.cpp b/test/test_convolution.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4227e3442a4319bc385d77350c60f6c65797e938 --- /dev/null +++ b/test/test_convolution.cpp @@ -0,0 +1,37 @@ +#include <convolution.h> + +#include <catch2/catch_test_macros.hpp> +#include <catch2/matchers/catch_matchers_floating_point.hpp> + +#include "helpers.h" + +TEST_CASE("test convolution", "[float]") { + // This setup will be done 4 times in total, once for each section + size_t width = 16; + size_t height = 32; + + std::vector<float> image(width * height); + std::vector<float> expected_image(width * height); + + std::vector<float> kernel(width * height); + Initialize(image.data(), width, height); + std::copy(image.begin(), image.end(), expected_image.begin()); + Initialize(kernel.data(), width, height); + + ConvolveReference(expected_image.data(), kernel.data(), width, height); + + SECTION("test correctness of reference implementation") { + ConvolveReference(image.data(), kernel.data(), width, height); + COMPARE_VECTORS(expected_image, image, 1.e-5); + } + + SECTION("test correctness of reference implementation twice") { + ConvolveReference(image.data(), kernel.data(), width, height); + COMPARE_VECTORS(expected_image, image, 1.e-5); + } + + SECTION("test correctness of serial implementation") { + ConvolveSerial(image.data(), kernel.data(), width, height); + COMPARE_VECTORS(expected_image, image, 1.e-5); + } +} \ No newline at end of file