Merge branch 'add_test_for_matrix' into 'main'

Add test for matrix. See merge request !3

Merge branch 'add_test_for_matrix' into 'main'
a418699e · Mattia Mancini · 0883e58a · 9a1cd4db · a418699e · a418699e
Commit a418699e authored May 14, 2024 by Mattia Mancini
--- a/.gitignore
+++ b/.gitignore
 .vscode
 build
+tmp
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
 stages:          # List of stages for jobs, and their order of execution
+  - prepare
  - build
  - test
+  - benchmark
+  - summarize

+# Builds the CI base image from docker/Dockerfile and pushes it to the project
+# registry. Only runs when docker/Dockerfile changes.
+docker-base:
+  stage: prepare
+  # Use the official docker image.
+  image: docker:stable
+  tags:
+    - dind
+  before_script:
+    # Feed the password through stdin so it does not show up in the process list
+    - echo "$CI_REGISTRY_PASSWORD" | docker login -u "$CI_REGISTRY_USER" --password-stdin "$CI_REGISTRY"
+  # The image is always published as ":latest", whichever branch triggers the job.
+  script:
+    - cd docker
+    - docker build --pull -t "$CI_REGISTRY_IMAGE:latest" .
+    - docker push "$CI_REGISTRY_IMAGE:latest"
+  only:
+    changes:
+      - docker/Dockerfile
+
+# Hidden template job: configures and builds the project (Release) inside the
+# CI image before the extending job's own script runs. Running the benchmark
+# is left to the extending job so that it is executed only once.
+.build_docker:
+  image: "$CI_REGISTRY_IMAGE:latest"
+  before_script:
+    - cmake -B build . -DCMAKE_BUILD_TYPE=Release
+    - make -C build -j

 build-job:       # This job runs in the build stage, which runs first.
  stage: build
-  before_script:
-    - module load spack
-    - module load cmake
-  tags:
-    - das6-gpu
+  # Runs in the project's CI image rather than on the das6-gpu runner.
+  image: "$CI_REGISTRY_IMAGE:latest"
+  # Release build followed by one run of the benchmark binary.
+  # NOTE(review): results-generic.json is written here but this job declares
+  # no artifacts, so the file is not kept - confirm that is intended.
+  script:
+    - cmake -B build . -DCMAKE_BUILD_TYPE=Release
+    - make -C build -j
+    - build/microbenchmarks --benchmark_out=results-generic.json --benchmark_out_format=json
+
+# Configures and builds the project in the CI image, then runs the unit-test
+# binary; the job fails if build/unittests exits with a non-zero status.
+unittests: 
+  stage: test
+  image: "$CI_REGISTRY_IMAGE:latest"
  script:
    - cmake -B build .
-    - make -C build
+    - make -C build -j
+    - build/unittests

-collect-performance:   # This job runs in the test stage.
+performance:   # This job runs in the benchmark stage.
+  # One job per (COMPILER_VERSION, ARCHITECTURE) combination: 2 x 2 = 4 jobs.
+  parallel:
+    matrix:
+      - COMPILER_VERSION:
+        - 9.4.0
+        - 12.2.0
+        ARCHITECTURE:
+        - zen2
+        - haswell
  tags:
    - das6-gpu
-  stage: test    # It only starts when the job in the build stage completes successfully.
-  dependencies:
-    - build-job
-  before_script:
-    - module load spack
-    - module load cmake 
+  stage: benchmark    # It only starts when the jobs in the earlier stages complete successfully.
+  # Submit the build-and-benchmark script as a batch job constrained to
+  # ${ARCHITECTURE}; --wait blocks until it finishes. The captured stdout and
+  # stderr files are then replayed into the CI log.
+  # NOTE(review): if sbatch exits non-zero the two `cat` lines are presumably
+  # skipped, so the batch output would not reach the log - confirm.
  script:
-    - cmake -B build .
-    - make -C build
-    - build/microbenchmarks --benchmark_out=results.json --benchmark_out_format=json
+    - sbatch --wait -C ${ARCHITECTURE} -o output -e error ci/das6/compile_and_run.sh ${ARCHITECTURE} ${COMPILER_VERSION}
+    - cat output >&1
+    - cat error >&2
  artifacts:
    paths:
-        - ./results.json
+      - ./results*.json
    untracked: false
    when: on_success
    access: all
    expire_in: 1 days

+# Benchmark run inside the CI image. The build itself comes from the
+# before_script inherited from .build_docker; the script below runs the
+# benchmark binary and the resulting JSON is kept as an artifact for one day.
+performance-generic:
+  stage: benchmark
+  image: "$CI_REGISTRY_IMAGE:latest"
+  extends:
+  - .build_docker
+  script:
+    - build/microbenchmarks --benchmark_out=results-generic.json --benchmark_out_format=json
+  artifacts:
+    paths:
+      - ./results*.json
+    untracked: false
+    when: on_success
+    access: all
+    expire_in: 1 days
+
+# Combines the benchmark JSON files into result-summary.json and
+# result-summary.png, keeping only benchmarks whose name contains
+# "MatrixMultiplication".
+collect-performance:
+  stage: summarize
+  image: "$CI_REGISTRY_IMAGE:latest"
+  # Only artifacts of the `performance` jobs are fetched.
+  # NOTE(review): performance-generic is not listed, so results-generic.json
+  # is presumably not part of the summary - confirm that is intended.
+  dependencies:
+  - performance
+  artifacts:
+    paths:
+      - ./results*.json
+      - ./result*.png
+  script:
+  - python3 ci/summarize-results.py --filter MatrixMultiplication results*.json result-summary
\ No newline at end of file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,12 @@ set(CMAKE_CXX_STANDARD_REQUIRED True)
 set(BENCHMARK_ENABLE_GTEST_TESTS
    OFF
    CACHE INTERNAL "Download GTest sources")
+
+# Turn on Google Benchmark's libpfm option; it must be set before the
+# benchmark project is fetched and configured further down.
+set(BENCHMARK_ENABLE_LIBPFM ON CACHE INTERNAL "Enable perf")
+
 include(FetchContent)
+# CTest enables testing for this project and defines BUILD_TESTING.
+include(CTest)
+
 FetchContent_Declare(
  googlebenchmark
  GIT_REPOSITORY https://github.com/google/benchmark.git
@@ -18,9 +23,50 @@ FetchContent_Declare(
 # Make the fetched content available.
 FetchContent_MakeAvailable(googlebenchmark)

+# Fetch Catch2 (v3) for the unit tests. FetchContent is already included above,
+# so it is not included a second time here.
+FetchContent_Declare(
+  Catch2
+  GIT_REPOSITORY https://github.com/catchorg/Catch2.git
+  GIT_TAG v3.4.0)
+
+FetchContent_MakeAvailable(Catch2)
+
+# Catch.cmake (which defines catch_discover_tests) lives in Catch2's extras
+# directory; make sure it is on the module path before including it.
+list(APPEND CMAKE_MODULE_PATH "${catch2_SOURCE_DIR}/extras")
+include(Catch)
+
+# BENCHMARK_ENABLE_GTEST_TESTS is already cached as OFF before Google Benchmark
+# is fetched, so it is not set again here.
+
+# Make aocommon available; its include/ directory is added to the targets
+# below through ${aocommon_SOURCE_DIR}.
+# NOTE(review): GIT_TAG follows the master branch, so the fetched revision can
+# change between configures - consider pinning a commit or release tag.
+FetchContent_Declare(
+  aocommon
+  GIT_REPOSITORY https://gitlab.com/aroffringa/aocommon.git
+  GIT_TAG master)
+
+FetchContent_MakeAvailable(aocommon)
+
+
+
 # Add the benchmark executable
-add_executable(microbenchmarks benchmarks/matrix_multiplication.cpp)
+# Every .cpp file under benchmarks/ is compiled into the benchmark binary.
+# CONFIGURE_DEPENDS makes the build re-check the glob, so newly added files
+# are picked up without a manual re-configure.
+file(GLOB BENCHMARK_SOURCES CONFIGURE_DEPENDS
+     "${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/*.cpp")
+add_executable(microbenchmarks ${BENCHMARK_SOURCES})
+
+# Likewise, every .cpp file under test/ goes into the unit-test binary.
+file(GLOB TEST_SOURCES CONFIGURE_DEPENDS
+     "${CMAKE_CURRENT_SOURCE_DIR}/test/*.cpp")
+add_executable(unittests ${TEST_SOURCES})

 # Link against Google Benchmark
 target_link_libraries(microbenchmarks benchmark::benchmark)
+# Headers: aocommon's include/ directory and the project's own code/ directory
+target_include_directories(microbenchmarks PRIVATE ${aocommon_SOURCE_DIR}/include)
+target_include_directories(microbenchmarks PRIVATE code)
+# NOTE(review): -march=native ties the binary to the CPU of the build host.
 target_compile_options(microbenchmarks PUBLIC "-O3;-march=native;-ggdb;")
+
+
+# Configure the unit-test executable: Catch2 supplies main(); headers come
+# from code/ and from aocommon's include/ directory.
+target_link_libraries(unittests PRIVATE Catch2::Catch2WithMain)
+target_include_directories(
+  unittests PRIVATE code "${aocommon_SOURCE_DIR}/include")
+# Same flags as the benchmark binary. PRIVATE because an executable has no
+# consumers to propagate options to.
+target_compile_options(unittests PRIVATE -O3 -march=native -ggdb)
+
+# Register every Catch2 test case with CTest. No WORKING_DIRECTORY is given, so
+# the module's default working directory applies.
+list(APPEND CMAKE_MODULE_PATH "${catch2_SOURCE_DIR}/extras")
+catch_discover_tests(unittests)
--- a/benchmarks/matrix_multiplication.cpp
+++ b/benchmarks/matrix_multiplication.cpp
 #include <benchmark/benchmark.h>
-
-#include <complex>
-#include <new>  // For std::align_val_t
-#include <random>
-
-// Function to perform matrix multiplication for 2x2 complex matrices
-void matrixMultiply(const std::complex<float>* A, const std::complex<float>* B,
-                    std::complex<float>* C) {
-  for (int i = 0; i < 2; ++i) {
-    for (int j = 0; j < 2; ++j) {
-      std::complex<float> sum = 0.0f;
-      for (int k = 0; k < 2; ++k) {
-        sum += A[i * 2 + k] * B[k * 2 + j];
-      }
-      C[i * 2 + j] = sum;
-    }
-  }
-}
-
-inline void multiply_mat(const float* a, const float* b, float* c, float sign) {
-  c[0] += sign * (a[0] * b[0] + a[1] * b[2]);
-  c[1] += sign * (a[0] * b[1] + a[1] * b[3]);
-  c[2] += sign * (a[2] * b[0] + a[3] * b[2]);
-  c[3] += sign * (a[2] * b[1] + a[3] * b[3]);
-}
-
-void matrixMultiplyNaive(const std::complex<float>* a,
-                         const std::complex<float>* b, std::complex<float>* c) {
-  const float a_real[] = {a[0].real(), a[1].real(), a[2].real(), a[3].real()};
-  const float b_real[] = {b[0].real(), b[1].real(), b[2].real(), b[3].real()};
-  const float a_imag[] = {a[0].imag(), a[1].imag(), a[2].imag(), a[3].imag()};
-  const float b_imag[] = {b[0].imag(), b[1].imag(), b[2].imag(), b[3].imag()};
-
-  float c_real[4] = {0, 0, 0, 0};
-  float c_imag[4] = {0, 0, 0, 0};
-
-  multiply_mat(a_real, b_real, c_real, 1.0f);
-  multiply_mat(a_imag, b_imag, c_real, -1.0f);
-  multiply_mat(a_real, b_imag, c_imag, 1.0f);
-  multiply_mat(a_imag, b_real, c_imag, 1.0f);
-
-  c[0] = {c_real[0], c_imag[0]};
-  c[1] = {c_real[1], c_imag[1]};
-  c[2] = {c_real[2], c_imag[2]};
-  c[3] = {c_real[3], c_imag[3]};
-}
-// Initialize matrices with random complex values
-std::random_device rd;
-std::mt19937 gen(rd());
-std::uniform_real_distribution<float> dis(-1.0, 1.0);
-
-inline void Initialize(std::complex<float>* a) {
-  for (int i = 0; i < 4; ++i) {
-    a[i] = std::complex<float>(dis(gen), dis(gen));
-  }
-}
+#include <matrix_multiplication.h>

 class InitializeInput : public benchmark::Fixture {
 public:
@@ -96,4 +41,19 @@ BENCHMARK_F(InitializeInput, MatrixMultiplicationNaive)
  }
 }

+// Benchmarks matrixMultiplyAoCommon, the variant built on aocommon's AVX
+// matrix type: every benchmark iteration performs one multiplication.
+// A, B and C are presumably members of the InitializeInput fixture, which is
+// defined outside this hunk - TODO confirm.
+BENCHMARK_F(InitializeInput, MatrixMultiplicationAOAvx)
+(benchmark::State& state) {
+  for (auto _ : state) {
+    matrixMultiplyAoCommon(A, B, C);
+  }
+}
+
+// Benchmarks matrixMultiplyAVX2, the hand-written AVX2 variant: every
+// benchmark iteration performs one multiplication.
+// NOTE(review): the result in C is never read back inside the loop; confirm
+// the compiler cannot hoist or drop the repeated call.
+BENCHMARK_F(InitializeInput, MatrixMultiplicationAvx2)
+(benchmark::State& state) {
+  for (auto _ : state) {
+    matrixMultiplyAVX2(A, B, C);
+  }
+}
 BENCHMARK_MAIN();
--- a/ci/das6/compile_and_run.sh
+++ b/ci/das6/compile_and_run.sh
+#!/bin/bash
+
+set -e
+# compile the code and run it on das6
+# Specify the compiler version and architecture
+
+ARCHITECTURE=$1
+COMPILER_VERSION=$2
+
+
+echo RUNNING ON ${COMPILER_VERSION} AND ${ARCHITECTURE}
+BUILD_DIR=build-${COMPILER_VERSION}-${ARCHITECTURE}
+module load spack/${COMPILER_VERSION}
+module load cmake
+module load boost
+module load casacore
+cmake -B ${BUILD_DIR} . -DCMAKE_BUILD_TYPE=Release
+make -C ${BUILD_DIR} -j
+${BUILD_DIR}/microbenchmarks --benchmark_out=results-${COMPILER_VERSION}-${ARCHITECTURE}.json --benchmark_out_format=json
\ No newline at end of file
--- a/ci/summarize-results.py
+++ b/ci/summarize-results.py
+import json
+import os
+from argparse import ArgumentParser
+import seaborn
+import pandas
+import matplotlib.pyplot as plt
+
+seaborn.set_theme(palette="flare", style="whitegrid")
+
+def parse_args():
+    parser = ArgumentParser(description="Combine benchmark metrics from Google Benchmarks Framework")
+    parser.add_argument("files", nargs="+", help="Metrics")
+    parser.add_argument("output", help="Combined metrics json")
+    parser.add_argument("--filter", help="Filter tests by name")
+    return parser.parse_args()
+
+def read_and_combine_json(files, filter):
+    results = []
+    for f_name in files:
+        basename = os.path.basename(f_name)
+        with open(f_name, 'r') as f_stream:
+            result_obj = json.load(f_stream)
+            results_normalized = result_obj["benchmarks"]
+            for result_normalized in results_normalized:
+                if filter  and filter not in result_normalized["name"]:
+                    continue
+                result_normalized["context"] = result_obj["context"]
+                result_normalized["compiler_version"], result_normalized["architecture"] = basename.replace("results-", "").replace(".json", "").split("-")
+                results.append(result_normalized)
+    return results
+
+def store_combined(outfile, obj):
+    with open(outfile + ".json", "w") as f_stream:
+        json.dump(obj, f_stream, indent=4)
+
+def create_summary_plot(metrics, outplot_name):
+    time_unit = metrics.time_unit[0]
+    
+    grid = seaborn.FacetGrid(metrics, col="architecture")
+    grid.map(seaborn.barplot, "compiler_version", "cpu_time", "name")
+    grid.set_ylabels(f"CPU time({time_unit})")
+    grid.add_legend()
+    grid.savefig(outplot_name + ".png")
+
+def main():
+    args = parse_args()
+    metrics_results = read_and_combine_json(args.files, args.filter)
+    metrics_dataframe= pandas.DataFrame(metrics_results)
+    create_summary_plot(metrics_dataframe, args.output)
+    store_combined(args.output, metrics_results)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
--- a/code/matrix_multiplication.h
+++ b/code/matrix_multiplication.h
+#include <aocommon/avx/MatrixComplexFloat2x2.h>
+
+#include <complex>
+#include <iomanip>
+#include <iostream>
+#include <new>  // For std::align_val_t
+#include <random>
+
+// Random source used by Initialize(): a Mersenne Twister seeded from
+// std::random_device, and a uniform float distribution constructed with the
+// bounds -1.0 and 1.0.
+// NOTE(review): these are non-inline globals defined in a header without an
+// include guard; including the header from two translation units of the same
+// target would presumably give duplicate-definition errors - confirm.
+std::random_device rd;
+std::mt19937 gen(rd());
+std::uniform_real_distribution<float> dis(-1.0, 1.0);
+
+// Fills the four entries starting at `a` with random complex values; the real
+// and imaginary parts are both drawn from `dis` using the generator `gen`.
+inline void Initialize(std::complex<float>* a) {
+  std::complex<float>* const end = a + 4;
+  for (std::complex<float>* entry = a; entry != end; ++entry) {
+    *entry = std::complex<float>(dis(gen), dis(gen));
+  }
+}
+
+// Reference implementation of the 2x2 complex matrix product C = A * B.
+// A, B and C each point to four elements stored row by row
+// (element (i, j) is at index i * 2 + j).
+// Declared inline because the definition lives in a header.
+inline void matrixMultiply(const std::complex<float>* A,
+                           const std::complex<float>* B,
+                           std::complex<float>* C) {
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      std::complex<float> sum = 0.0f;
+      for (int k = 0; k < 2; ++k) {
+        sum += A[i * 2 + k] * B[k * 2 + j];
+      }
+      C[i * 2 + j] = sum;
+    }
+  }
+}
+
+// Accumulates sign * (a * b) into c, where a, b and c are real 2x2 matrices
+// stored row by row in arrays of four floats.
+inline void multiply_mat(const float* a, const float* b, float* c, float sign) {
+  for (int row = 0; row < 2; ++row) {
+    const float* a_row = a + 2 * row;
+    for (int col = 0; col < 2; ++col) {
+      c[2 * row + col] += sign * (a_row[0] * b[col] + a_row[1] * b[2 + col]);
+    }
+  }
+}
+
+// 2x2 complex matrix product c = a * b computed from four real 2x2 products:
+//   real(c) = real(a) * real(b) - imag(a) * imag(b)
+//   imag(c) = real(a) * imag(b) + imag(a) * real(b)
+// a, b and c each point to four elements stored row by row.
+// Declared inline because the definition lives in a header.
+inline void matrixMultiplyNaive(const std::complex<float>* a,
+                                const std::complex<float>* b,
+                                std::complex<float>* c) {
+  // Split the inputs into separate real and imaginary 2x2 matrices
+  const float a_real[] = {a[0].real(), a[1].real(), a[2].real(), a[3].real()};
+  const float b_real[] = {b[0].real(), b[1].real(), b[2].real(), b[3].real()};
+  const float a_imag[] = {a[0].imag(), a[1].imag(), a[2].imag(), a[3].imag()};
+  const float b_imag[] = {b[0].imag(), b[1].imag(), b[2].imag(), b[3].imag()};
+
+  float c_real[4] = {0, 0, 0, 0};
+  float c_imag[4] = {0, 0, 0, 0};
+
+  multiply_mat(a_real, b_real, c_real, 1.0f);
+  multiply_mat(a_imag, b_imag, c_real, -1.0f);  // i * i = -1
+  multiply_mat(a_real, b_imag, c_imag, 1.0f);
+  multiply_mat(a_imag, b_real, c_imag, 1.0f);
+
+  for (int idx = 0; idx < 4; ++idx) {
+    c[idx] = {c_real[idx], c_imag[idx]};
+  }
+}
+
+// 2x2 complex matrix product c = a * b delegated to
+// aocommon::avx::MatrixComplexFloat2x2: both inputs are copied into that type,
+// multiplied with its operator*, and the four result elements are copied out.
+// Declared inline because the definition lives in a header.
+inline void matrixMultiplyAoCommon(const std::complex<float>* a,
+                                   const std::complex<float>* b,
+                                   std::complex<float>* c) {
+  const aocommon::avx::MatrixComplexFloat2x2 A(a[0], a[1], a[2], a[3]);
+  const aocommon::avx::MatrixComplexFloat2x2 B(b[0], b[1], b[2], b[3]);
+
+  const aocommon::avx::MatrixComplexFloat2x2 C = A * B;
+  for (int idx = 0; idx < 4; ++idx) {
+    c[idx] = C[idx];
+  }
+}
+
+// 2x2 complex matrix product c = a * b using AVX2/FMA intrinsics.
+//
+// Each matrix is viewed as eight floats: re0, im0, re1, im1, re2, im2, re3,
+// im3. Every result element is the sum of two complex products, and every
+// complex product x * y is formed lane-wise as
+//   even lane: re(x) * re(y) - im(x) * im(y)
+//   odd lane:  re(x) * im(y) + im(x) * re(y)
+// which is what _mm256_fmaddsub_ps yields when given broadcast re(x), y as
+// stored, and broadcast im(x) times y with real/imaginary parts swapped.
+//
+// The pointers need no particular alignment. Declared inline because the
+// definition lives in a header.
+inline void matrixMultiplyAVX2(const std::complex<float>* a,
+                               const std::complex<float>* b,
+                               std::complex<float>* c) {
+  const float* a_ptr = reinterpret_cast<const float*>(a);
+  const float* b_ptr = reinterpret_cast<const float*>(b);
+  float* c_ptr = reinterpret_cast<float*>(c);
+  // Unaligned loads: callers pass ordinary arrays that are not guaranteed to
+  // sit on a 32-byte boundary.
+  const __m256 a_m = _mm256_loadu_ps(a_ptr);
+  const __m256 b_m = _mm256_loadu_ps(b_ptr);
+
+  // _mm256_set_epi32 takes its arguments from the highest lane to the lowest.
+  // First product per row: re(a0) / re(a2) broadcast, b0 and b1 as stored.
+  const __m256i a_1_ind = _mm256_set_epi32(4, 4, 4, 4, 0, 0, 0, 0);
+  const __m256i b_1_ind = _mm256_set_epi32(3, 2, 1, 0, 3, 2, 1, 0);
+
+  // Second product per row: re(a1) / re(a3) broadcast, b2 and b3 as stored.
+  const __m256i a_2_ind = _mm256_set_epi32(6, 6, 6, 6, 2, 2, 2, 2);
+  const __m256i b_2_ind = _mm256_set_epi32(7, 6, 5, 4, 7, 6, 5, 4);
+
+  // im(a0) / im(a2) broadcast, b0 and b1 with real and imaginary swapped.
+  const __m256i a_3_ind = _mm256_set_epi32(5, 5, 5, 5, 1, 1, 1, 1);
+  const __m256i b_3_ind = _mm256_set_epi32(2, 3, 0, 1, 2, 3, 0, 1);
+
+  // im(a1) / im(a3) broadcast, b2 and b3 with real and imaginary swapped.
+  const __m256i a_4_ind = _mm256_set_epi32(7, 7, 7, 7, 3, 3, 3, 3);
+  const __m256i b_4_ind = _mm256_set_epi32(6, 7, 4, 5, 6, 7, 4, 5);
+
+  const __m256 a_1 = _mm256_permutevar8x32_ps(a_m, a_1_ind);
+  const __m256 b_1 = _mm256_permutevar8x32_ps(b_m, b_1_ind);
+
+  const __m256 a_2 = _mm256_permutevar8x32_ps(a_m, a_2_ind);
+  const __m256 b_2 = _mm256_permutevar8x32_ps(b_m, b_2_ind);
+
+  const __m256 a_3 = _mm256_permutevar8x32_ps(a_m, a_3_ind);
+  const __m256 b_3 = _mm256_permutevar8x32_ps(b_m, b_3_ind);
+
+  const __m256 a_4 = _mm256_permutevar8x32_ps(a_m, a_4_ind);
+  const __m256 b_4 = _mm256_permutevar8x32_ps(b_m, b_4_ind);
+
+  // fmaddsub: even lanes compute x * y - z, odd lanes compute x * y + z
+  const __m256 c_p1 = _mm256_fmaddsub_ps(a_1, b_1, _mm256_mul_ps(a_3, b_3));
+  const __m256 c_p2 = _mm256_fmaddsub_ps(a_2, b_2, _mm256_mul_ps(a_4, b_4));
+  const __m256 c_m = _mm256_add_ps(c_p1, c_p2);
+  _mm256_storeu_ps(c_ptr, c_m);
+}
\ No newline at end of file
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
+# Base image for the CI jobs: compilers, CMake, casacore and the libraries the
+# project builds against, plus the Python packages used by
+# ci/summarize-results.py.
+# NOTE(review): ubuntu:latest is a moving tag; consider pinning a release.
+FROM ubuntu:latest
+
+# Install everything in a single layer and drop the apt package lists
+# afterwards so they do not end up in the image.
+RUN apt-get update -qq &&\
+    export DEBIAN_FRONTEND=noninteractive && apt-get install -y -qq \
+    casacore-data casacore-dev casacore-tools \
+    cmake \
+    g++ \
+    g++-12 \
+    git \
+    libblas-dev liblapack-dev \
+    libboost-date-time-dev \
+    libboost-test-dev \
+    libboost-dev \
+    libcfitsio-dev \
+    libfftw3-dev \
+    libgsl-dev \
+    libhdf5-dev \
+    libopenmpi-dev \
+    libpython3-dev \
+    pkg-config \
+    python3-dev python3-numpy \
+    python3-sphinx \
+    python3-pip \
+    python3-seaborn \
+    python3-pandas \
+    && rm -rf /var/lib/apt/lists/*
\ No newline at end of file
--- a/test/helpers.h
+++ b/test/helpers.h
+#ifndef HELPERS
+
+#define HELPERS
+
+#include <array>
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+#include <complex>
+#include <cstddef>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// Compares two std::array values element-wise through compareArrays(),
+// forwarding the name of the running Catch2 test case and the line number of
+// the call site so that a failure can be traced back to where it was raised.
+#define COMPARE_ARRAYS(lhs, rhs, precision)                                    \
+  compareArrays(Catch::getResultCapture().getCurrentTestName(), __LINE__, lhs, \
+                rhs, precision)
+
+// Requires that lv and rv have the same length and that every pair of
+// elements differs by at most `precision` in absolute value.
+// WithinAbs matches a single floating-point value, so the vectors are walked
+// element by element rather than being handed to the matcher as a whole.
+template <typename T>
+void compareSingle(const std::vector<T>& lv, const std::vector<T>& rv,
+                   float precision) {
+  REQUIRE(lv.size() == rv.size());
+  for (size_t idx = 0; idx < lv.size(); idx++) {
+    REQUIRE_THAT(lv[idx], Catch::Matchers::WithinAbs(rv[idx], precision));
+  }
+}
+
+// Specialisation for complex values: real and imaginary parts are compared
+// separately, each within `precision` in absolute value.
+// Declared inline because a full specialisation defined in a header is an
+// ordinary function definition.
+template <>
+inline void compareSingle(const std::vector<std::complex<float>>& lv,
+                          const std::vector<std::complex<float>>& rv,
+                          float precision) {
+  for (size_t idx = 0; idx < lv.size(); idx++) {
+    const auto le = lv[idx];
+    const auto re = rv[idx];
+
+    REQUIRE_THAT(le.real(), Catch::Matchers::WithinAbs(re.real(), precision));
+    REQUIRE_THAT(le.imag(), Catch::Matchers::WithinAbs(re.imag(), precision));
+  }
+}
+
+// Formats a value for failure messages using std::to_string.
+template <typename T>
+std::string valueToString(const T& value) {
+  return std::to_string(value);
+}
+
+// Overload for complex values, formatted as "<real>, <imag>j".
+// Declared inline because the definition lives in a header.
+inline std::string valueToString(const std::complex<float>& value) {
+  return std::to_string(value.real()) + ", " + std::to_string(value.imag()) +
+         "j";
+}
+
+// Compares two fixed-size arrays element-wise with compareSingle(). Before
+// comparing, it records the test name, the call-site line and a dump of both
+// arrays as Catch2 INFO messages.
+template <typename T, size_t N>
+void compareArrays(const std::string& test, unsigned line, std::array<T, N> lhs,
+                   std::array<T, N> rhs, float precision) {
+  // Both INFO messages are reported only if an assertion below fails
+  INFO("Test case [" << test << "] failed at line " << line);
+
+  std::ostringstream report;
+  report << "Expected : \n";
+  for (const T& element : lhs) {
+    report << valueToString(element) << "\t";
+  }
+  report << "\nObtained : \n";
+  for (const T& element : rhs) {
+    report << valueToString(element) << "\t";
+  }
+  report << "\n";
+  INFO("Reason: \n" << report.str());
+
+  const std::vector<T> expected(lhs.begin(), lhs.end());
+  const std::vector<T> obtained(rhs.begin(), rhs.end());
+  compareSingle(expected, obtained, precision);
+}
+
+#endif
\ No newline at end of file
--- a/test/test_matrix_multiplication.cpp
+++ b/test/test_matrix_multiplication.cpp
+#include <matrix_multiplication.h>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "helpers.h"
+
+// Checks each optimised 2x2 complex multiplication against the reference
+// matrixMultiply() on randomly initialised inputs.
+TEST_CASE("test complex matrix multiplication", "[float]") {
+  // Catch2 re-runs this setup once for each SECTION below, i.e. three times
+  // in total.
+  // The arrays are 32-byte aligned so that 256-bit aligned AVX loads and
+  // stores on their storage are valid.
+  alignas(32) std::array<std::complex<float>, 4> A;
+  alignas(32) std::array<std::complex<float>, 4> B;
+  alignas(32) std::array<std::complex<float>, 4> C;
+
+  std::array<std::complex<float>, 4> C_expected;
+
+  Initialize(A.data());
+  Initialize(B.data());
+
+  matrixMultiply(A.data(), B.data(), C_expected.data());
+
+  // NOTE(review): the inputs are seeded from std::random_device, so a failure
+  // may not reproduce; the absolute tolerance of 1e-6 is close to
+  // single-precision rounding error for these magnitudes.
+  SECTION("test correctness of naive implementation") {
+    matrixMultiplyNaive(A.data(), B.data(), C.data());
+
+    COMPARE_ARRAYS(C_expected, C, 1.e-6);
+  }
+
+  SECTION("test correctness of aocommon implementation") {
+    matrixMultiplyAoCommon(A.data(), B.data(), C.data());
+
+    COMPARE_ARRAYS(C_expected, C, 1.e-6);
+  }
+
+  SECTION("test correctness of avx2 implementation") {
+    matrixMultiplyAVX2(A.data(), B.data(), C.data());
+
+    COMPARE_ARRAYS(C_expected, C, 1.e-6);
+  }
+}
\ No newline at end of file
--- a/test/tests.cpp
+++ b/test/tests.cpp
+#define CATCH_CONFIG_MAIN  // Catch2 v2 idiom asking Catch to provide a main().
+                           // NOTE(review): the build links
+                           // Catch2::Catch2WithMain, which presumably already
+                           // supplies main() - confirm this define is needed.
+#include <catch2/catch_all.hpp>
\ No newline at end of file