Skip to content
Snippets Groups Projects
Commit a418699e authored by Mattia Mancini's avatar Mattia Mancini
Browse files

Merge branch 'add_test_for_matrix' into 'main'

Add test for matrix

See merge request !3
parents 0883e58a 9a1cd4db
No related branches found
No related tags found
1 merge request!3Add test for matrix
Pipeline #81796 passed
.vscode
build
tmp
stages: # List of stages for jobs, and their order of execution
- prepare
- build
- test
- benchmark
- summarize
docker-base:
stage: prepare
before_script:
- docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD" $CI_REGISTRY
# Use the official docker image.
image: docker:stable
tags:
- dind
# NOTE: the image is always built and pushed as "$CI_REGISTRY_IMAGE:latest",
# regardless of the branch; no per-branch (commit ref slug) tag is produced.
script:
- cd docker
- docker build --pull -t "$CI_REGISTRY_IMAGE:latest" .
- docker push "$CI_REGISTRY_IMAGE:latest"
only:
changes:
- docker/Dockerfile
.build_docker:
image: "$CI_REGISTRY_IMAGE:latest"
before_script:
- cmake -B build . -DCMAKE_BUILD_TYPE=Release
- make -C build -j
- build/microbenchmarks --benchmark_out=results-generic.json --benchmark_out_format=json
build-job: # This job runs in the build stage, which runs first.
stage: build
before_script:
- module load spack
- module load cmake
tags:
- das6-gpu
image: "$CI_REGISTRY_IMAGE:latest"
script:
- cmake -B build . -DCMAKE_BUILD_TYPE=Release
- make -C build -j
- build/microbenchmarks --benchmark_out=results-generic.json --benchmark_out_format=json
unittests:
stage: test
image: "$CI_REGISTRY_IMAGE:latest"
script:
- cmake -B build .
- make -C build
- make -C build -j
- build/unittests
collect-performance: # This job runs in the test stage.
performance: # This job runs in the benchmark stage.
parallel:
matrix:
- COMPILER_VERSION:
- 9.4.0
- 12.2.0
ARCHITECTURE:
- zen2
- haswell
tags:
- das6-gpu
stage: test # It only starts when the job in the build stage completes successfully.
dependencies:
- build-job
before_script:
- module load spack
- module load cmake
stage: benchmark # It only starts when the jobs in the preceding test stage complete successfully.
script:
- cmake -B build .
- make -C build
- build/microbenchmarks --benchmark_out=results.json --benchmark_out_format=json
- sbatch --wait -C ${ARCHITECTURE} -o output -e error ci/das6/compile_and_run.sh ${ARCHITECTURE} ${COMPILER_VERSION}
- cat output >&1
- cat error >&2
artifacts:
paths:
- ./results.json
- ./results*.json
untracked: false
when: on_success
access: all
expire_in: 1 days
performance-generic:
stage: benchmark
image: "$CI_REGISTRY_IMAGE:latest"
extends:
- .build_docker
script:
- build/microbenchmarks --benchmark_out=results-generic.json --benchmark_out_format=json
artifacts:
paths:
- ./results*.json
untracked: false
when: on_success
access: all
expire_in: 1 days
collect-performance:
stage: summarize
image: "$CI_REGISTRY_IMAGE:latest"
dependencies:
- performance
artifacts:
paths:
- ./results*.json
- ./result*.png
script:
- python3 ci/summarize-results.py --filter MatrixMultiplication results*.json result-summary
\ No newline at end of file
......@@ -8,7 +8,12 @@ set(CMAKE_CXX_STANDARD_REQUIRED True)
set(BENCHMARK_ENABLE_GTEST_TESTS
OFF
CACHE INTERNAL "Download GTest sources")
set(BENCHMARK_ENABLE_LIBPFM ON CACHE INTERNAL "Enable perf")
include(FetchContent)
include(CTest)
FetchContent_Declare(
googlebenchmark
GIT_REPOSITORY https://github.com/google/benchmark.git
......@@ -18,9 +23,50 @@ FetchContent_Declare(
# Make the fetched content available.
FetchContent_MakeAvailable(googlebenchmark)
Include(FetchContent)
FetchContent_Declare(
Catch2
GIT_REPOSITORY https://github.com/catchorg/Catch2.git
GIT_TAG v3.4.0 # or a later release
)
FetchContent_MakeAvailable(Catch2)
include(Catch)
set(BENCHMARK_ENABLE_GTEST_TESTS
OFF
CACHE INTERNAL "Download GTest sources")
# Make aocommon available
FetchContent_Declare(
aocommon
GIT_REPOSITORY https://gitlab.com/aroffringa/aocommon.git
GIT_TAG master)
FetchContent_MakeAvailable(aocommon)
# Add the benchmark executable
add_executable(microbenchmarks benchmarks/matrix_multiplication.cpp)
file(GLOB BENCHMARK_SOURCES "benchmarks/*.cpp")
add_executable(microbenchmarks ${BENCHMARK_SOURCES})
file(GLOB TEST_SOURCES "test/*.cpp")
add_executable(unittests ${TEST_SOURCES})
# Link against Google Benchmark
target_link_libraries(microbenchmarks benchmark::benchmark)
target_include_directories(microbenchmarks PRIVATE ${aocommon_SOURCE_DIR}/include)
target_include_directories(microbenchmarks PRIVATE code)
target_compile_options(microbenchmarks PUBLIC "-O3;-march=native;-ggdb;")
list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
catch_discover_tests(unittests WORKING_DIRECTORY)
target_link_libraries(unittests PRIVATE Catch2::Catch2WithMain)
target_include_directories(unittests PRIVATE code)
target_include_directories(unittests PRIVATE ${aocommon_SOURCE_DIR}/include)
target_compile_options(unittests PUBLIC "-O3;-march=native;-ggdb;")
#include <benchmark/benchmark.h>
#include <complex>
#include <new> // For std::align_val_t
#include <random>
// Reference product C = A * B of two 2x2 complex matrices, each stored as
// four consecutive elements in row-major order.
void matrixMultiply(const std::complex<float>* A, const std::complex<float>* B,
                    std::complex<float>* C) {
  constexpr int kDim = 2;
  for (int row = 0; row < kDim; ++row) {
    for (int col = 0; col < kDim; ++col) {
      // Dot product of row `row` of A with column `col` of B.
      std::complex<float> acc = 0.0f;
      for (int inner = 0; inner < kDim; ++inner) {
        acc += A[row * kDim + inner] * B[inner * kDim + col];
      }
      C[row * kDim + col] = acc;
    }
  }
}
// Accumulates sign * (a * b) into c, where a, b and c are real 2x2 matrices
// stored as four floats in row-major order.
inline void multiply_mat(const float* a, const float* b, float* c, float sign) {
  for (int row = 0; row < 2; ++row) {
    for (int col = 0; col < 2; ++col) {
      c[2 * row + col] +=
          sign * (a[2 * row] * b[col] + a[2 * row + 1] * b[2 + col]);
    }
  }
}
// Complex 2x2 product c = a * b computed from four real 2x2 products on the
// separated real and imaginary parts.
void matrixMultiplyNaive(const std::complex<float>* a,
                         const std::complex<float>* b, std::complex<float>* c) {
  float a_re[4];
  float a_im[4];
  float b_re[4];
  float b_im[4];
  // Split the inputs into real and imaginary planes before writing any output.
  for (int idx = 0; idx < 4; ++idx) {
    a_re[idx] = a[idx].real();
    a_im[idx] = a[idx].imag();
    b_re[idx] = b[idx].real();
    b_im[idx] = b[idx].imag();
  }
  float c_re[4] = {0, 0, 0, 0};
  float c_im[4] = {0, 0, 0, 0};
  // Re(c) = Re(a) Re(b) - Im(a) Im(b)
  multiply_mat(a_re, b_re, c_re, 1.0f);
  multiply_mat(a_im, b_im, c_re, -1.0f);
  // Im(c) = Re(a) Im(b) + Im(a) Re(b)
  multiply_mat(a_re, b_im, c_im, 1.0f);
  multiply_mat(a_im, b_re, c_im, 1.0f);
  for (int idx = 0; idx < 4; ++idx) {
    c[idx] = {c_re[idx], c_im[idx]};
  }
}
// Random source shared by Initialize(): a Mersenne Twister seeded from
// std::random_device, sampled through a uniform float distribution
// constructed with the bounds -1.0 and 1.0.
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> dis(-1.0, 1.0);
// Fills the four elements starting at `a` with random complex values.
inline void Initialize(std::complex<float>* a) {
  std::complex<float>* const last = a + 4;
  for (std::complex<float>* slot = a; slot != last; ++slot) {
    *slot = std::complex<float>(dis(gen), dis(gen));
  }
}
#include <matrix_multiplication.h>
class InitializeInput : public benchmark::Fixture {
public:
......@@ -96,4 +41,19 @@ BENCHMARK_F(InitializeInput, MatrixMultiplicationNaive)
}
}
// Using aocommon avx implementation
// Benchmarks matrixMultiplyAoCommon: one call per benchmark iteration.
// NOTE(review): A, B and C are presumably the operand/result buffers owned by
// the InitializeInput fixture - confirm against the fixture definition.
BENCHMARK_F(InitializeInput, MatrixMultiplicationAOAvx)
(benchmark::State& state) {
for (auto _ : state) {
matrixMultiplyAoCommon(A, B, C);
}
}
// Using direct avx2 implementation
// Benchmarks matrixMultiplyAVX2: one call per benchmark iteration.
// NOTE(review): A, B and C are presumably the operand/result buffers owned by
// the InitializeInput fixture - confirm against the fixture definition.
BENCHMARK_F(InitializeInput, MatrixMultiplicationAvx2)
(benchmark::State& state) {
for (auto _ : state) {
matrixMultiplyAVX2(A, B, C);
}
}
BENCHMARK_MAIN();
#!/bin/bash
# Compile the code and run the micro-benchmarks on das6.
#
# Usage: compile_and_run.sh <architecture> <compiler_version>
#
# Builds into build-<compiler_version>-<architecture> and writes the benchmark
# results to results-<compiler_version>-<architecture>.json.
set -e

# Fail early with a clear message instead of loading "spack/" and building in
# "build--" when an argument is missing.
if [ "$#" -lt 2 ]; then
  echo "Usage: $0 <architecture> <compiler_version>" >&2
  exit 1
fi

# Specify the compiler version and architecture
ARCHITECTURE="$1"
COMPILER_VERSION="$2"

echo "RUNNING ON ${COMPILER_VERSION} AND ${ARCHITECTURE}"

BUILD_DIR="build-${COMPILER_VERSION}-${ARCHITECTURE}"

module load "spack/${COMPILER_VERSION}"
module load cmake
module load boost
module load casacore

cmake -B "${BUILD_DIR}" . -DCMAKE_BUILD_TYPE=Release
make -C "${BUILD_DIR}" -j
"${BUILD_DIR}/microbenchmarks" --benchmark_out="results-${COMPILER_VERSION}-${ARCHITECTURE}.json" --benchmark_out_format=json
\ No newline at end of file
import json
import os
from argparse import ArgumentParser
import seaborn
import pandas
import matplotlib.pyplot as plt
seaborn.set_theme(palette="flare", style="whitegrid")
def parse_args():
    """Parse the command line.

    Returns a namespace with ``files`` (one or more input metric files),
    ``output`` (base name of the combined outputs) and ``filter`` (optional
    substring used to select benchmarks by name, ``None`` when not given).
    """
    cli = ArgumentParser(
        description="Combine benchmark metrics from Google Benchmarks Framework"
    )
    argument_specs = (
        ("files", {"nargs": "+", "help": "Metrics"}),
        ("output", {"help": "Combined metrics json"}),
        ("--filter", {"help": "Filter tests by name"}),
    )
    for flag, options in argument_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()
def _labels_from_filename(f_name):
    """Derive (compiler_version, architecture) from a results file name.

    Files are expected to be named ``results-<compiler_version>-<architecture>.json``.
    When the name carries no ``-<architecture>`` part (e.g. ``results-generic.json``)
    the whole label is used as the compiler version and the architecture is
    reported as ``"unknown"`` instead of raising.
    """
    label = os.path.basename(f_name)
    if label.startswith("results-"):
        label = label[len("results-"):]
    if label.endswith(".json"):
        label = label[:-len(".json")]
    compiler_version, separator, architecture = label.partition("-")
    if not separator or not architecture:
        return label, "unknown"
    return compiler_version, architecture


def read_and_combine_json(files, filter):
    """Load Google Benchmark JSON files and flatten them into one list.

    Args:
        files: paths of the JSON result files to read.
        filter: optional substring; when truthy, only benchmarks whose
            ``name`` contains it are kept.

    Returns:
        A list with one dict per retained benchmark entry. Each entry is the
        original benchmark record extended with the file's ``context`` and
        with ``compiler_version`` / ``architecture`` taken from the file name.
    """
    results = []
    for f_name in files:
        # The labels depend only on the file, so compute them once per file.
        compiler_version, architecture = _labels_from_filename(f_name)
        with open(f_name, 'r') as f_stream:
            result_obj = json.load(f_stream)
        for result_normalized in result_obj["benchmarks"]:
            if filter and filter not in result_normalized["name"]:
                continue
            result_normalized["context"] = result_obj["context"]
            result_normalized["compiler_version"] = compiler_version
            result_normalized["architecture"] = architecture
            results.append(result_normalized)
    return results
def store_combined(outfile, obj):
    """Write ``obj`` as JSON (4-space indent) to ``<outfile>.json``."""
    target_path = outfile + ".json"
    serialized = json.dumps(obj, indent=4)
    with open(target_path, "w") as sink:
        sink.write(serialized)
def create_summary_plot(metrics, outplot_name):
    """Save a bar-plot summary of the metrics to ``<outplot_name>.png``.

    One facet is drawn per ``architecture`` value; within a facet,
    ``cpu_time`` is plotted against ``compiler_version`` with the benchmark
    ``name`` passed as the third plotting variable. The y-axis label uses the
    ``time_unit`` found at index 0 of ``metrics``.
    """
    unit = metrics.time_unit[0]
    facets = seaborn.FacetGrid(metrics, col="architecture")
    facets.map(seaborn.barplot, "compiler_version", "cpu_time", "name")
    facets.set_ylabels(f"CPU time({unit})")
    facets.add_legend()
    facets.savefig(outplot_name + ".png")
def main():
    """Entry point: combine the input metric files, plot them, then store the combined JSON."""
    cli_args = parse_args()
    combined = read_and_combine_json(cli_args.files, cli_args.filter)
    # The plot is produced before the combined JSON is written.
    create_summary_plot(pandas.DataFrame(combined), cli_args.output)
    store_combined(cli_args.output, combined)
if __name__ == '__main__':
main()
\ No newline at end of file
#include <aocommon/avx/MatrixComplexFloat2x2.h>
#include <complex>
#include <iomanip>
#include <iostream>
#include <new> // For std::align_val_t
#include <random>
// Random number machinery used by Initialize(): a Mersenne Twister engine
// seeded from std::random_device and a uniform float distribution constructed
// with the bounds -1.0 and 1.0.
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> dis(-1.0, 1.0);
// Overwrites the four matrix elements starting at `a` with random complex
// values.
inline void Initialize(std::complex<float>* a) {
  int remaining = 4;
  while (remaining-- > 0) {
    *a++ = std::complex<float>(dis(gen), dis(gen));
  }
}
// Reference matrix multiplication for 2x2 complex matrices: C = A * B.
// A, B and C each point to four elements stored in row-major order.
// Declared inline because this definition lives in a header; without it,
// including the header from more than one translation unit of the same
// program causes multiple-definition link errors.
inline void matrixMultiply(const std::complex<float>* A,
                           const std::complex<float>* B,
                           std::complex<float>* C) {
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 2; ++j) {
      // Dot product of row i of A with column j of B.
      std::complex<float> sum = 0.0f;
      for (int k = 0; k < 2; ++k) {
        sum += A[i * 2 + k] * B[k * 2 + j];
      }
      C[i * 2 + j] = sum;
    }
  }
}
// Accumulates sign * (a * b) into c, where a, b and c are real 2x2 matrices
// stored as four floats in row-major order.
inline void multiply_mat(const float* a, const float* b, float* c, float sign) {
  c[0] += sign * (a[0] * b[0] + a[1] * b[2]);
  c[1] += sign * (a[0] * b[1] + a[1] * b[3]);
  c[2] += sign * (a[2] * b[0] + a[3] * b[2]);
  c[3] += sign * (a[2] * b[1] + a[3] * b[3]);
}
// Complex 2x2 product c = a * b computed from four real 2x2 products on the
// separated real and imaginary parts.
// Declared inline because this definition lives in a header; without it,
// including the header from more than one translation unit of the same
// program causes multiple-definition link errors.
inline void matrixMultiplyNaive(const std::complex<float>* a,
                                const std::complex<float>* b,
                                std::complex<float>* c) {
  // Split the inputs into real and imaginary planes.
  const float a_real[] = {a[0].real(), a[1].real(), a[2].real(), a[3].real()};
  const float b_real[] = {b[0].real(), b[1].real(), b[2].real(), b[3].real()};
  const float a_imag[] = {a[0].imag(), a[1].imag(), a[2].imag(), a[3].imag()};
  const float b_imag[] = {b[0].imag(), b[1].imag(), b[2].imag(), b[3].imag()};
  float c_real[4] = {0, 0, 0, 0};
  float c_imag[4] = {0, 0, 0, 0};
  // Re(c) = Re(a) Re(b) - Im(a) Im(b)
  multiply_mat(a_real, b_real, c_real, 1.0f);
  multiply_mat(a_imag, b_imag, c_real, -1.0f);
  // Im(c) = Re(a) Im(b) + Im(a) Re(b)
  multiply_mat(a_real, b_imag, c_imag, 1.0f);
  multiply_mat(a_imag, b_real, c_imag, 1.0f);
  for (int idx = 0; idx < 4; ++idx) {
    c[idx] = {c_real[idx], c_imag[idx]};
  }
}
// Multiplies two 2x2 complex matrices (four elements each) using aocommon's
// MatrixComplexFloat2x2 type and copies the four resulting elements into c.
// Declared inline because this definition lives in a header; without it,
// including the header from more than one translation unit of the same
// program causes multiple-definition link errors.
inline void matrixMultiplyAoCommon(const std::complex<float>* a,
                                   const std::complex<float>* b,
                                   std::complex<float>* c) {
  const aocommon::avx::MatrixComplexFloat2x2 A(a[0], a[1], a[2], a[3]);
  const aocommon::avx::MatrixComplexFloat2x2 B(b[0], b[1], b[2], b[3]);
  const aocommon::avx::MatrixComplexFloat2x2 C = A * B;
  for (int idx = 0; idx < 4; ++idx) {
    c[idx] = C[idx];
  }
}
// Multiplies two 2x2 complex<float> matrices with AVX2/FMA intrinsics: c = a * b.
// Each matrix is four complex elements in row-major order, i.e. eight
// interleaved floats (re0, im0, re1, im1, re2, im2, re3, im3) filling one
// 256-bit register.
//
// The buffers need no particular alignment: unaligned loads/stores are used
// because callers pass plain arrays (e.g. std::array) that are not guaranteed
// to be 32-byte aligned, which the aligned load/store intrinsics require.
//
// Declared inline because this definition lives in a header; without it,
// including the header from more than one translation unit of the same
// program causes multiple-definition link errors.
inline void matrixMultiplyAVX2(const std::complex<float>* a,
                               const std::complex<float>* b,
                               std::complex<float>* c) {
  const float* a_ptr = reinterpret_cast<const float*>(a);
  const float* b_ptr = reinterpret_cast<const float*>(b);
  float* c_ptr = reinterpret_cast<float*>(c);
  const __m256 a_m = _mm256_loadu_ps(a_ptr);
  const __m256 b_m = _mm256_loadu_ps(b_ptr);

  // _mm256_set_epi32 lists lanes from highest (7) to lowest (0).
  // a_1/a_3: real/imag part of a[0] in lanes 0-3 and of a[2] in lanes 4-7.
  // a_2/a_4: real/imag part of a[1] in lanes 0-3 and of a[3] in lanes 4-7.
  // b_1/b_2: first/second row of b repeated in both halves.
  // b_3/b_4: the same rows with real and imaginary parts swapped per element.
  const __m256i a_1_ind = _mm256_set_epi32(4, 4, 4, 4, 0, 0, 0, 0);
  const __m256i b_1_ind = _mm256_set_epi32(3, 2, 1, 0, 3, 2, 1, 0);
  const __m256i a_2_ind = _mm256_set_epi32(6, 6, 6, 6, 2, 2, 2, 2);
  const __m256i b_2_ind = _mm256_set_epi32(7, 6, 5, 4, 7, 6, 5, 4);
  const __m256i a_3_ind = _mm256_set_epi32(5, 5, 5, 5, 1, 1, 1, 1);
  const __m256i b_3_ind = _mm256_set_epi32(2, 3, 0, 1, 2, 3, 0, 1);
  const __m256i a_4_ind = _mm256_set_epi32(7, 7, 7, 7, 3, 3, 3, 3);
  const __m256i b_4_ind = _mm256_set_epi32(6, 7, 4, 5, 6, 7, 4, 5);

  const __m256 a_1 = _mm256_permutevar8x32_ps(a_m, a_1_ind);
  const __m256 b_1 = _mm256_permutevar8x32_ps(b_m, b_1_ind);
  const __m256 a_2 = _mm256_permutevar8x32_ps(a_m, a_2_ind);
  const __m256 b_2 = _mm256_permutevar8x32_ps(b_m, b_2_ind);
  const __m256 a_3 = _mm256_permutevar8x32_ps(a_m, a_3_ind);
  const __m256 b_3 = _mm256_permutevar8x32_ps(b_m, b_3_ind);
  const __m256 a_4 = _mm256_permutevar8x32_ps(a_m, a_4_ind);
  const __m256 b_4 = _mm256_permutevar8x32_ps(b_m, b_4_ind);

  // fmaddsub subtracts in even lanes and adds in odd lanes, which yields
  // re = ar*br - ai*bi and im = ar*bi + ai*br for every complex product.
  // c_p1 holds a[0]*b[0], a[0]*b[1], a[2]*b[0], a[2]*b[1];
  // c_p2 holds a[1]*b[2], a[1]*b[3], a[3]*b[2], a[3]*b[3].
  const __m256 c_p1 = _mm256_fmaddsub_ps(a_1, b_1, _mm256_mul_ps(a_3, b_3));
  const __m256 c_p2 = _mm256_fmaddsub_ps(a_2, b_2, _mm256_mul_ps(a_4, b_4));
  const __m256 c_m = _mm256_add_ps(c_p1, c_p2);
  _mm256_storeu_ps(c_ptr, c_m);
}
\ No newline at end of file
# Base image used by the CI jobs: compilers, CMake, casacore and the numerical
# libraries needed to build the benchmarks, plus the Python packages used to
# summarize the results.
FROM ubuntu:latest
# The apt package lists are removed in the same layer so they do not end up in
# the final image.
RUN apt-get update -qq &&\
    export DEBIAN_FRONTEND=noninteractive && apt-get install -y -qq \
    casacore-data casacore-dev casacore-tools \
    cmake \
    g++ \
    g++-12 \
    git \
    libblas-dev liblapack-dev \
    libboost-date-time-dev \
    libboost-test-dev \
    libboost-dev \
    libcfitsio-dev \
    libfftw3-dev \
    libgsl-dev \
    libhdf5-dev \
    libopenmpi-dev \
    libpython3-dev \
    pkg-config \
    python3-dev python3-numpy \
    python3-sphinx \
    python3-pip \
    python3-seaborn \
    python3-pandas &&\
    rm -rf /var/lib/apt/lists/*
\ No newline at end of file
#ifndef HELPERS
#define HELPERS
#include <array>
#include <catch2/matchers/catch_matchers_floating_point.hpp>
#include <string>
#include <vector>
// Convenience wrapper around compareArrays() that forwards the name of the
// currently running Catch2 test case and the line number of the call site,
// followed by the two arrays and the comparison precision.
#define COMPARE_ARRAYS(lhs, rhs, precision) \
compareArrays(Catch::getResultCapture().getCurrentTestName(), __LINE__, lhs, \
rhs, precision)
// Requires every element of lv to be within `precision` (absolute) of the
// element of rv at the same index.
// The comparison is done element by element because Catch::Matchers::WithinAbs
// takes scalar values; passing whole vectors to it does not compile once the
// template is instantiated.
template <typename T>
void compareSingle(const std::vector<T>& lv, const std::vector<T>& rv,
                   float precision) {
  for (size_t idx = 0; idx < lv.size(); idx++) {
    REQUIRE_THAT(lv[idx], Catch::Matchers::WithinAbs(rv[idx], precision));
  }
}
// Specialization for complex values: real and imaginary parts are compared
// separately, each with the same absolute precision.
// An explicit specialization is an ordinary function definition, so it is
// marked inline to keep this header safe to include from several translation
// units.
template <>
inline void compareSingle(const std::vector<std::complex<float>>& lv,
                          const std::vector<std::complex<float>>& rv,
                          float precision) {
  for (size_t idx = 0; idx < lv.size(); idx++) {
    const auto le = lv[idx];
    const auto re = rv[idx];
    REQUIRE_THAT(le.real(), Catch::Matchers::WithinAbs(re.real(), precision));
    REQUIRE_THAT(le.imag(), Catch::Matchers::WithinAbs(re.imag(), precision));
  }
}
// Formats a value for the failure report using std::to_string.
template <typename T>
std::string valueToString(const T& value) {
  return std::to_string(value);
}
// Overload for complex values, formatted as "<real>, <imag>j".
// Marked inline: a non-template function defined in a header would otherwise
// produce multiple-definition link errors when the header is included from
// more than one translation unit.
inline std::string valueToString(const std::complex<float>& value) {
  return std::to_string(value.real()) + ", " + std::to_string(value.imag()) +
         "j";
}
// Compares two fixed-size arrays element-wise via compareSingle().
// Two INFO messages are registered first (test name and line, then both
// arrays rendered as text) so they accompany a failing assertion.
template <typename T, size_t N>
void compareArrays(const std::string& test, unsigned line, std::array<T, N> lhs,
                   std::array<T, N> rhs, float precision) {
  INFO("Test case [" << test << "] failed at line "
                     << line);  // Reported only if REQUIRE fails

  // Render both arrays, tab-separated, for the failure report.
  std::stringstream report;
  report << "Expected : \n";
  for (const T& value : lhs) {
    report << valueToString(value) << "\t";
  }
  report << "\nObtained : \n";
  for (const T& value : rhs) {
    report << valueToString(value) << "\t";
  }
  report << "\n";
  INFO("Reason: \n" << report.str());

  // compareSingle() works on vectors, so copy the arrays over.
  const std::vector<T> expected(lhs.begin(), lhs.end());
  const std::vector<T> obtained(rhs.begin(), rhs.end());
  compareSingle(expected, obtained, precision);
}
#endif
\ No newline at end of file
#include <matrix_multiplication.h>
#include <catch2/catch_test_macros.hpp>
#include "helpers.h"
// Checks each optimized implementation against the reference matrixMultiply()
// on randomly initialized 2x2 complex matrices.
TEST_CASE("test complex matrix multiplication", "[float]") {
  // This setup will be done 3 times in total, once for each section.
  // The buffers are 32-byte aligned so they are valid inputs for
  // implementations that use aligned 256-bit loads and stores.
  alignas(32) std::array<std::complex<float>, 4> A;
  alignas(32) std::array<std::complex<float>, 4> B;
  alignas(32) std::array<std::complex<float>, 4> C;
  alignas(32) std::array<std::complex<float>, 4> C_expected;
  Initialize(A.data());
  Initialize(B.data());
  matrixMultiply(A.data(), B.data(), C_expected.data());
  SECTION("test correctness of naive implementation") {
    matrixMultiplyNaive(A.data(), B.data(), C.data());
    COMPARE_ARRAYS(C_expected, C, 1.e-6);
  }
  SECTION("test correctness of aocommon implementation") {
    matrixMultiplyAoCommon(A.data(), B.data(), C.data());
    COMPARE_ARRAYS(C_expected, C, 1.e-6);
  }
  SECTION("test correctness of avx2 implementation") {
    matrixMultiplyAVX2(A.data(), B.data(), C.data());
    COMPARE_ARRAYS(C_expected, C, 1.e-6);
  }
}
\ No newline at end of file
// NOTE(review): CATCH_CONFIG_MAIN is the Catch2 v2 way of requesting a
// generated main(). This project fetches Catch2 v3.4.0 and links
// Catch2::Catch2WithMain, which supplies main(), so this define appears to be
// a leftover with no effect - confirm before removing.
#define CATCH_CONFIG_MAIN // This tells Catch to provide a main() - only do
// this in one cpp file
#include <catch2/catch_all.hpp>
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment