Skip to content
Snippets Groups Projects
Commit 35b0cf31 authored by Mattia Mancini's avatar Mattia Mancini
Browse files

Merge branch 'test_hermitian_implementation' into 'main'

Test hermitian implementation

See merge request !13
parents 631b5b33 c486faba
No related branches found
No related tags found
1 merge request!13Test hermitian implementation
Pipeline #106634 passed with warnings
Showing
with 679 additions and 49 deletions
...@@ -84,6 +84,7 @@ performance-jetson: # This job runs in the test stage. ...@@ -84,6 +84,7 @@ performance-jetson: # This job runs in the test stage.
- sbatch --wait -p jetsonq -w orin02 -o output.txt -e error.txt ci/das6/compile_and_run_jetson.sh jetson_arm64 11.4.0 - sbatch --wait -p jetsonq -w orin02 -o output.txt -e error.txt ci/das6/compile_and_run_jetson.sh jetson_arm64 11.4.0
- cat output.txt >&1 - cat output.txt >&1
- cat error.txt >&2 - cat error.txt >&2
allow_failure: true
artifacts: artifacts:
paths: paths:
...@@ -127,4 +128,7 @@ collect-performance: ...@@ -127,4 +128,7 @@ collect-performance:
- ./*.tar - ./*.tar
script: script:
- ls -la - ls -la
- python3 ci/summarize-results.py --filter MatrixMultiplication results*.json result-summary - python3 ci/summarize-results.py --filter MatrixMultiplication results*.json result-summary-matrix-multiplication
- python3 ci/summarize-results.py --filter HermitianSquare results*.json result-summary-hermitian-square
- python3 ci/summarize-results.py --filter KroneckerSquare results*.json result-summary-kronecker-square
...@@ -50,6 +50,7 @@ FetchContent_Declare( ...@@ -50,6 +50,7 @@ FetchContent_Declare(
FetchContent_Populate(aocommon) FetchContent_Populate(aocommon)
set(COMPILER_FLAGS "-O3;-march=native;-ggdb;") set(COMPILER_FLAGS "-O3;-march=native;-ggdb;")
# List all kernel code # List all kernel code
file(GLOB KERNEL_SOURCES "code/*.cpp") file(GLOB KERNEL_SOURCES "code/*.cpp")
# Add the benchmark executable # Add the benchmark executable
...@@ -59,12 +60,17 @@ add_executable(microbenchmarks ${BENCHMARK_SOURCES} ${KERNEL_SOURCES}) ...@@ -59,12 +60,17 @@ add_executable(microbenchmarks ${BENCHMARK_SOURCES} ${KERNEL_SOURCES})
file(GLOB TEST_SOURCES "test/*.cpp") file(GLOB TEST_SOURCES "test/*.cpp")
add_executable(unittests ${TEST_SOURCES} ${KERNEL_SOURCES}) add_executable(unittests ${TEST_SOURCES} ${KERNEL_SOURCES})
find_package(OpenMP)
# Link against Google Benchmark # Link against Google Benchmark
target_link_libraries(microbenchmarks benchmark::benchmark) target_link_libraries(microbenchmarks PRIVATE benchmark::benchmark)
target_include_directories(microbenchmarks PRIVATE ${aocommon_SOURCE_DIR}/include) target_include_directories(microbenchmarks PRIVATE ${aocommon_SOURCE_DIR}/include)
target_include_directories(microbenchmarks PRIVATE code) target_include_directories(microbenchmarks PRIVATE code)
target_compile_options(microbenchmarks PUBLIC ${COMPILER_FLAGS}) target_compile_options(microbenchmarks PUBLIC ${COMPILER_FLAGS})
if(OpenMP_CXX_FOUND)
target_link_libraries(microbenchmarks PRIVATE OpenMP::OpenMP_CXX)
target_link_libraries(unittests PRIVATE OpenMP::OpenMP_CXX)
endif()
include(precompile) include(precompile)
list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras) list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
......
#include <benchmark/benchmark.h>
#include <hermitian_square.h>
#include <iostream>
namespace {
class InitializeInput : public benchmark::Fixture {
public:
void SetUp(::benchmark::State& state) {
A = std::make_unique<aocommon::Matrix4x4>();
InitializeInput(*A);
}
void TearDown(::benchmark::State& state) { A.reset(); }
std::unique_ptr<aocommon::Matrix4x4> A;
};
} // namespace
// Reference standard
BENCHMARK_F(InitializeInput, HermitianSquare)
(benchmark::State& state) {
for (auto _ : state) {
HermitianSquare(*A);
}
}
BENCHMARK_F(InitializeInput, HermitianSquareRefactored)
(benchmark::State& state) {
for (auto _ : state) {
HermitianSquareRefactored(*A);
}
}
BENCHMARK_F(InitializeInput, HermitianSquareNaiveSIMD)
(benchmark::State& state) {
for (auto _ : state) {
HermitianSquareNaiveSIMD(*A);
}
}
BENCHMARK_F(InitializeInput, HermitianSquareNaiveSIMDv2)
(benchmark::State& state) {
for (auto _ : state) {
HermitianSquareNaiveSIMDv2(*A);
}
}
#include <benchmark/benchmark.h>
#include <kronecker_square.h>
#include <iostream>
namespace {
class InitializeInput : public benchmark::Fixture {
public:
void SetUp(::benchmark::State& state) {
A = std::make_unique<aocommon::MC2x2>();
B = std::make_unique<aocommon::MC2x2>();
InitializeInput(*A);
InitializeInput(*B);
}
void TearDown(::benchmark::State& state) {
A.reset();
B.reset();
}
std::unique_ptr<aocommon::MC2x2> A, B;
};
} // namespace
// Reference implementation
BENCHMARK_F(InitializeInput, KroneckerSquareReference)
(benchmark::State& state) {
for (auto _ : state) {
aocommon::HMC4x4 r = aocommon::HMC4x4::Zero();
for (size_t i = 0; i < 100000000; i++) {
r += KroneckerSquareReference(*A, *B);
}
}
}
// Fused implementation
BENCHMARK_F(InitializeInput, KroneckerSquareFused)
(benchmark::State& state) {
for (auto _ : state) {
aocommon::HMC4x4 r = aocommon::HMC4x4::Zero();
for (size_t i = 0; i < 100000000; i++) {
r += KroneckerSquareFused(*A, *B);
}
}
}
#include <benchmark/benchmark.h>
BENCHMARK_MAIN();
...@@ -109,5 +109,3 @@ BENCHMARK_F(InitializeInput, MatrixMultiplicationSSE) ...@@ -109,5 +109,3 @@ BENCHMARK_F(InitializeInput, MatrixMultiplicationSSE)
} }
} }
#endif #endif
BENCHMARK_MAIN();
...@@ -5,7 +5,7 @@ import seaborn ...@@ -5,7 +5,7 @@ import seaborn
import pandas import pandas
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
seaborn.set_theme(palette="flare", style="whitegrid") seaborn.set_theme(palette="bright", style="whitegrid")
def parse_args(): def parse_args():
parser = ArgumentParser(description="Combine benchmark metrics from Google Benchmarks Framework") parser = ArgumentParser(description="Combine benchmark metrics from Google Benchmarks Framework")
......
...@@ -8,9 +8,10 @@ macro (add_precompile_target) ...@@ -8,9 +8,10 @@ macro (add_precompile_target)
list(TRANSFORM apt_INCLUDE_DIRS PREPEND -I) list(TRANSFORM apt_INCLUDE_DIRS PREPEND -I)
add_custom_command(OUTPUT ${apt_TARGET_NAME}.s add_custom_command(OUTPUT ${apt_TARGET_NAME}.s
COMMAND ${CMAKE_CXX_COMPILER} -std=c++${CMAKE_CXX_STANDARD} ${CMAKE_CXX_COMPLERS_ARG} ${apt_COMPILER_FLAGS} ${apt_SOURCES} ${apt_INCLUDE_DIRS} -S -o ${apt_TARGET_NAME}.s COMMAND ${CMAKE_CXX_COMPILER} -std=c++${CMAKE_CXX_STANDARD} ${CMAKE_CXX_COMPLERS_ARG} ${apt_COMPILER_FLAGS} ${apt_SOURCES} ${apt_INCLUDE_DIRS} -S -fverbose-asm -Wa,aslh -o ${apt_TARGET_NAME}.s
DEPENDS ${apt_SOURCES} DEPENDS ${apt_SOURCES}
) )
add_custom_target(precompile_${apt_TARGET_NAME} ALL add_custom_target(precompile_${apt_TARGET_NAME} ALL
DEPENDS ${apt_TARGET_NAME}.s DEPENDS ${apt_TARGET_NAME}.s
COMMENT "Adding precompile target ${KERNEL_NAME} with source ${KERNEL_SOURCE}") COMMENT "Adding precompile target ${KERNEL_NAME} with source ${KERNEL_SOURCE}")
......
#include "hermitian_square.h"
aocommon::HMatrix4x4 HermitianSquareNaiveSIMD(const aocommon::Matrix4x4& mat) {
const double* mat_pointer = reinterpret_cast<const double*>(&mat[0]);
double tmp_real_col0[4];
double tmp_real_col1[4];
double tmp_real_col2[4];
double tmp_real_col3[4];
double tmp_imag_col0[4];
double tmp_imag_col1[4];
double tmp_imag_col2[4];
double tmp_imag_col3[4];
double c1c0_r[4], c2c1_r[4], c2c0_r[4], c3c2_r[4], c3c1_r[4], c3c0_r[4];
double c1c0_i[4], c2c1_i[4], c2c0_i[4], c3c2_i[4], c3c1_i[4], c3c0_i[4];
double tmp_real_uf[6] = {0, 0, 0, 0, 0, 0};
double tmp_imag_uf[6] = {0, 0, 0, 0, 0, 0};
double tmp_diag[4];
memset(tmp_diag, 0, 4 * sizeof(double));
#pragma omp simd
for (int k = 0; k < 4; k++) {
tmp_real_col0[k] = mat_pointer[2 * (4 * k + 0)];
tmp_real_col1[k] = mat_pointer[2 * (4 * k + 1)];
tmp_real_col2[k] = mat_pointer[2 * (4 * k + 2)];
tmp_real_col3[k] = mat_pointer[2 * (4 * k + 3)];
tmp_imag_col0[k] = mat_pointer[2 * (4 * k + 0) + 1];
tmp_imag_col1[k] = mat_pointer[2 * (4 * k + 1) + 1];
tmp_imag_col2[k] = mat_pointer[2 * (4 * k + 2) + 1];
tmp_imag_col3[k] = mat_pointer[2 * (4 * k + 3) + 1];
}
#pragma omp simd
for (int k = 0; k < 4; k++) {
tmp_diag[0] += ((tmp_real_col0[k] * tmp_real_col0[k]) +
(tmp_imag_col0[k] * tmp_imag_col0[k]));
tmp_diag[1] += ((tmp_real_col1[k] * tmp_real_col1[k]) +
(tmp_imag_col1[k] * tmp_imag_col1[k]));
tmp_diag[2] += ((tmp_real_col2[k] * tmp_real_col2[k]) +
(tmp_imag_col2[k] * tmp_imag_col2[k]));
tmp_diag[3] += ((tmp_real_col3[k] * tmp_real_col3[k]) +
(tmp_imag_col3[k] * tmp_imag_col3[k]));
}
#pragma omp simd
for (int k = 0; k < 4; k++) {
c1c0_r[k] = tmp_real_col1[k] * tmp_real_col0[k] +
tmp_imag_col1[k] * tmp_imag_col0[k];
c2c0_r[k] = tmp_real_col2[k] * tmp_real_col0[k] +
tmp_imag_col2[k] * tmp_imag_col0[k];
c2c1_r[k] = tmp_real_col2[k] * tmp_real_col1[k] +
tmp_imag_col2[k] * tmp_imag_col1[k];
c3c0_r[k] = tmp_real_col3[k] * tmp_real_col0[k] +
tmp_imag_col3[k] * tmp_imag_col0[k];
c3c1_r[k] = tmp_real_col3[k] * tmp_real_col1[k] +
tmp_imag_col3[k] * tmp_imag_col1[k];
c3c2_r[k] = tmp_real_col3[k] * tmp_real_col2[k] +
tmp_imag_col3[k] * tmp_imag_col2[k];
c1c0_i[k] = tmp_real_col1[k] * tmp_imag_col0[k] -
tmp_imag_col1[k] * tmp_real_col0[k];
c2c0_i[k] = tmp_real_col2[k] * tmp_imag_col0[k] -
tmp_imag_col2[k] * tmp_real_col0[k];
c2c1_i[k] = tmp_real_col2[k] * tmp_imag_col1[k] -
tmp_imag_col2[k] * tmp_real_col1[k];
c3c0_i[k] = tmp_real_col3[k] * tmp_imag_col0[k] -
tmp_imag_col3[k] * tmp_real_col0[k];
c3c1_i[k] = tmp_real_col3[k] * tmp_imag_col1[k] -
tmp_imag_col3[k] * tmp_real_col1[k];
c3c2_i[k] = tmp_real_col3[k] * tmp_imag_col2[k] -
tmp_imag_col3[k] * tmp_real_col2[k];
}
for (int k = 0; k < 4; k++) {
tmp_real_uf[0] += c1c0_r[k];
tmp_real_uf[1] += c2c0_r[k];
tmp_real_uf[2] += c2c1_r[k];
tmp_real_uf[3] += c3c0_r[k];
tmp_real_uf[4] += c3c1_r[k];
tmp_real_uf[5] += c3c2_r[k];
tmp_imag_uf[0] += c1c0_i[k];
tmp_imag_uf[1] += c2c0_i[k];
tmp_imag_uf[2] += c2c1_i[k];
tmp_imag_uf[3] += c3c0_i[k];
tmp_imag_uf[4] += c3c1_i[k];
tmp_imag_uf[5] += c3c2_i[k];
}
return {
tmp_diag[0],
0.0,
0.0,
0.0,
{tmp_real_uf[0], tmp_imag_uf[0]},
tmp_diag[1],
0.0,
0.0,
{tmp_real_uf[1], tmp_imag_uf[1]},
{tmp_real_uf[2], tmp_imag_uf[2]},
tmp_diag[2],
0.0,
{tmp_real_uf[3], tmp_imag_uf[3]},
{tmp_real_uf[4], tmp_imag_uf[4]},
{tmp_real_uf[5], tmp_imag_uf[5]},
tmp_diag[3],
};
}
#include "hermitian_square.h"
#if __AVX__
#include <immintrin.h>
inline double reduce_single(const double col[4]) {
__m256d vec = _mm256_load_pd(col);
__m128d low = _mm256_extractf128_pd(vec, 0);
__m128d high = _mm256_extractf128_pd(vec, 1);
__m128d sum = _mm_add_pd(low, high);
sum = _mm_hadd_pd(sum, sum);
return _mm_cvtsd_f64(sum);
}
#else
inline double reduce_single(const double col[4]) {
double sum = col[0];
#pragma unroll
for (size_t i = 0; i < 4; i++) {
sum += col[i];
}
return sum;
}
#endif
aocommon::HMatrix4x4 HermitianSquareNaiveSIMDv2(
const aocommon::Matrix4x4& mat) {
const double* mat_pointer = reinterpret_cast<const double*>(&mat[0]);
double tmp_real_col0[4];
double tmp_real_col1[4];
double tmp_real_col2[4];
double tmp_real_col3[4];
double tmp_imag_col0[4];
double tmp_imag_col1[4];
double tmp_imag_col2[4];
double tmp_imag_col3[4];
double c1c0_r[4], c2c1_r[4], c2c0_r[4], c3c2_r[4], c3c1_r[4], c3c0_r[4];
double c1c0_i[4], c2c1_i[4], c2c0_i[4], c3c2_i[4], c3c1_i[4], c3c0_i[4];
double tmp_real_uf[6] = {0., 0., 0., 0., 0., 0.};
double tmp_imag_uf[6] = {0., 0., 0., 0., 0., 0.};
double tmp_diag[4] = {0., 0., 0., 0.};
memset(tmp_diag, 0, 4 * sizeof(double));
#pragma omp simd
for (int k = 0; k < 4; k++) {
tmp_real_col0[k] = mat_pointer[2 * (4 * k + 0)];
tmp_real_col1[k] = mat_pointer[2 * (4 * k + 1)];
tmp_real_col2[k] = mat_pointer[2 * (4 * k + 2)];
tmp_real_col3[k] = mat_pointer[2 * (4 * k + 3)];
tmp_imag_col0[k] = mat_pointer[2 * (4 * k + 0) + 1];
tmp_imag_col1[k] = mat_pointer[2 * (4 * k + 1) + 1];
tmp_imag_col2[k] = mat_pointer[2 * (4 * k + 2) + 1];
tmp_imag_col3[k] = mat_pointer[2 * (4 * k + 3) + 1];
}
#pragma omp simd
for (int k = 0; k < 4; k++) {
tmp_diag[0] += ((tmp_real_col0[k] * tmp_real_col0[k]) +
(tmp_imag_col0[k] * tmp_imag_col0[k]));
tmp_diag[1] += ((tmp_real_col1[k] * tmp_real_col1[k]) +
(tmp_imag_col1[k] * tmp_imag_col1[k]));
tmp_diag[2] += ((tmp_real_col2[k] * tmp_real_col2[k]) +
(tmp_imag_col2[k] * tmp_imag_col2[k]));
tmp_diag[3] += ((tmp_real_col3[k] * tmp_real_col3[k]) +
(tmp_imag_col3[k] * tmp_imag_col3[k]));
}
#pragma omp simd
for (int k = 0; k < 4; k++) {
c1c0_r[k] = tmp_real_col1[k] * tmp_real_col0[k] +
tmp_imag_col1[k] * tmp_imag_col0[k];
c2c0_r[k] = tmp_real_col2[k] * tmp_real_col0[k] +
tmp_imag_col2[k] * tmp_imag_col0[k];
c2c1_r[k] = tmp_real_col2[k] * tmp_real_col1[k] +
tmp_imag_col2[k] * tmp_imag_col1[k];
c3c0_r[k] = tmp_real_col3[k] * tmp_real_col0[k] +
tmp_imag_col3[k] * tmp_imag_col0[k];
c3c1_r[k] = tmp_real_col3[k] * tmp_real_col1[k] +
tmp_imag_col3[k] * tmp_imag_col1[k];
c3c2_r[k] = tmp_real_col3[k] * tmp_real_col2[k] +
tmp_imag_col3[k] * tmp_imag_col2[k];
c1c0_i[k] = tmp_real_col1[k] * tmp_imag_col0[k] -
tmp_imag_col1[k] * tmp_real_col0[k];
c2c0_i[k] = tmp_real_col2[k] * tmp_imag_col0[k] -
tmp_imag_col2[k] * tmp_real_col0[k];
c2c1_i[k] = tmp_real_col2[k] * tmp_imag_col1[k] -
tmp_imag_col2[k] * tmp_real_col1[k];
c3c0_i[k] = tmp_real_col3[k] * tmp_imag_col0[k] -
tmp_imag_col3[k] * tmp_real_col0[k];
c3c1_i[k] = tmp_real_col3[k] * tmp_imag_col1[k] -
tmp_imag_col3[k] * tmp_real_col1[k];
c3c2_i[k] = tmp_real_col3[k] * tmp_imag_col2[k] -
tmp_imag_col3[k] * tmp_real_col2[k];
}
tmp_real_uf[0] = reduce_single(c1c0_r);
tmp_real_uf[1] = reduce_single(c2c0_r);
tmp_real_uf[2] = reduce_single(c2c1_r);
tmp_real_uf[3] = reduce_single(c3c0_r);
tmp_real_uf[4] = reduce_single(c3c1_r);
tmp_real_uf[5] = reduce_single(c3c2_r);
tmp_imag_uf[0] = reduce_single(c1c0_i);
tmp_imag_uf[1] = reduce_single(c2c0_i);
tmp_imag_uf[2] = reduce_single(c2c1_i);
tmp_imag_uf[3] = reduce_single(c3c0_i);
tmp_imag_uf[4] = reduce_single(c3c1_i);
tmp_imag_uf[5] = reduce_single(c3c2_i);
return {
tmp_diag[0],
0.0,
0.0,
0.0,
{tmp_real_uf[0], tmp_imag_uf[0]},
tmp_diag[1],
0.0,
0.0,
{tmp_real_uf[1], tmp_imag_uf[1]},
{tmp_real_uf[2], tmp_imag_uf[2]},
tmp_diag[2],
0.0,
{tmp_real_uf[3], tmp_imag_uf[3]},
{tmp_real_uf[4], tmp_imag_uf[4]},
{tmp_real_uf[5], tmp_imag_uf[5]},
tmp_diag[3],
};
}
\ No newline at end of file
#ifndef HERMITIAN_SQUARE_H
#define HERMITIAN_SQUARE_H
#include <aocommon/hmatrix4x4.h>
#include <aocommon/matrix4x4.h>
#include <iostream>
#include <random>
inline void Initialize(aocommon::Matrix4x4& mat) {
// Initialize matrices with random complex values
std::seed_seq seed({42});
std::mt19937 gen(seed);
std::uniform_real_distribution<double> dis(-1.0, 1.0);
for (int i = 0; i < 16; i++) {
mat[i] = std::complex<double>(dis(gen), dis(gen));
}
}
aocommon::HMatrix4x4 HermitianSquare(const aocommon::Matrix4x4& mat);
aocommon::HMatrix4x4 HermitianSquareRefactored(const aocommon::Matrix4x4& mat);
aocommon::HMatrix4x4 HermitianSquareNaiveSIMD(const aocommon::Matrix4x4& mat);
aocommon::HMatrix4x4 HermitianSquareNaiveSIMDv2(const aocommon::Matrix4x4& mat);
#endif
\ No newline at end of file
#include "hermitian_square.h"
aocommon::HMatrix4x4 HermitianSquare(const aocommon::Matrix4x4& mat) {
return mat.HermitianSquare();
}
template <typename T>
using cmplx = std::complex<T>;
aocommon::HMatrix4x4 HermitianSquareRefactored(const aocommon::Matrix4x4& mat) {
auto N = [](std::complex<double> z) -> double { return std::norm(z); };
/*
Mat is a complex matrix indexed as
0 1 2 3
4 5 6 7
8 9 10 11
12 13 14 15
norm = r*r + i*i #ops = 3
conj = r +ji -> r - ji #ops = 0
complex multiplication (CM) = (a + ib ) * (c + i d )
-> a*c - b*d + i (a d + bc) -> 4 mult 1 sum 1 sub -> #ops 6
complex conjugate multiplication (CCM) = (a - ib ) * (c + i d )
-> a*c + b*d + i (-bc + ad) -> 4 mult 1 sum 1 sub -> #ops 6
On the diagonal there are 4 norms + 3 addition -> #ops 15
there are 4 values in the diagonal so 60 operations
Off diagonal 4 complex multiplication -> #ops 24
24 * 6 off diagonal element #ops 144
144 + 60 -> 204 operations per matrix
*/
const cmplx<double> R00 = std::norm(mat[0]) + std::norm(mat[4]) +
std::norm(mat[8]) + std::norm(mat[12]);
const cmplx<double> R01 = 0.0;
const cmplx<double> R02 = 0.0;
const cmplx<double> R03 = 0.0;
const cmplx<double> R10 =
std::conj(mat[1]) * mat[0] + std::conj(mat[5]) * mat[4] +
std::conj(mat[9]) * mat[8] + std::conj(mat[13]) * mat[12];
const cmplx<double> R11 = std::norm(mat[1]) + std::norm(mat[5]) +
std::norm(mat[9]) + std::norm(mat[13]);
const cmplx<double> R12 = 0.0;
const cmplx<double> R13 = 0.0;
const cmplx<double> R20 =
std::conj(mat[2]) * mat[0] + std::conj(mat[6]) * mat[4] +
std::conj(mat[10]) * mat[8] + std::conj(mat[14]) * mat[12];
const cmplx<double> R21 =
std::conj(mat[2]) * mat[1] + std::conj(mat[6]) * mat[5] +
std::conj(mat[10]) * mat[9] + std::conj(mat[14]) * mat[13];
const cmplx<double> R22 = std::norm(mat[2]) + std::norm(mat[6]) +
std::norm(mat[10]) + std::norm(mat[14]);
const cmplx<double> R23 = 0.0;
const cmplx<double> R30 =
std::conj(mat[3]) * mat[0] + std::conj(mat[7]) * mat[4] +
std::conj(mat[11]) * mat[8] + std::conj(mat[15]) * mat[12];
const cmplx<double> R31 =
std::conj(mat[3]) * mat[1] + std::conj(mat[7]) * mat[5] +
std::conj(mat[11]) * mat[9] + std::conj(mat[15]) * mat[13];
const cmplx<double> R32 =
std::conj(mat[3]) * mat[2] + std::conj(mat[7]) * mat[6] +
std::conj(mat[11]) * mat[10] + std::conj(mat[15]) * mat[14];
const cmplx<double> R33 = std::norm(mat[3]) + std::norm(mat[7]) +
std::norm(mat[11]) + std::norm(mat[15]);
return {R00, R01, R02, R03, R10, R11, R12, R13,
R20, R21, R22, R23, R30, R31, R32, R33};
}
#ifndef KRONECKER_SQUARE_H
#define KRONECKER_SQUARE_H
#include <aocommon/hmatrix4x4.h>
#include <aocommon/matrix4x4.h>
#include <iostream>
#include <random>
inline void Initialize(aocommon::MC2x2& a) {
// Initialize matrices with random complex values
std::seed_seq seed({42});
std::mt19937 gen(seed);
std::uniform_real_distribution<double> dis(-1.0, 1.0);
for (size_t i = 0; i < 4; i++) {
a.Set(i, std::complex<double>(dis(gen), dis(gen)));
}
}
aocommon::HMC4x4 KroneckerSquareReference(aocommon::MC2x2& left,
aocommon::MC2x2& right);
aocommon::HMC4x4 KroneckerSquareFused(aocommon::MC2x2& left,
aocommon::MC2x2& right);
#endif // KRONECKER_SQUARE_H
\ No newline at end of file
#include "kronecker_square.h"
// This code is taken from
// https://gitlab.com/aroffringa/wsclean/-/merge_requests/772
aocommon::HMC4x4 KroneckerSquareFused(aocommon::MC2x2& a, aocommon::MC2x2& b) {
using T = const std::complex<double>;
using RT = const double;
using std::conj;
using std::norm;
// Calculate a^H a. Because the result is Hermitian, some shortcuts
// can be made.
RT p00 = norm(a.Get(0)) + norm(a.Get(2));
T p01 = conj(a.Get(0)) * a.Get(1) + conj(a.Get(2)) * a.Get(3);
RT p11 = norm(a.Get(1)) + norm(a.Get(3));
// Calculate b^H b.
RT q00 = norm(b.Get(0)) + norm(b.Get(2));
T q10 = conj(b.Get(1)) * b.Get(0) + conj(b.Get(3)) * b.Get(2); // = conj(q01)
RT q11 = norm(b.Get(1)) + norm(b.Get(3));
// Calculate the Kronecker product of p^T and q
RT m00 = p00 * q00;
T m10 = p00 * q10;
RT m11 = p00 * q11;
T m20 = p01 * q00;
T m21 = p01 * conj(q10);
RT m22 = p11 * q00;
T m30 = p01 * q10;
T m31 = p01 * q11;
T m32 = p11 * q10;
RT m33 = p11 * q11;
return aocommon::HMC4x4::FromData(
{m00, m10.real(), m10.imag(), m11, m20.real(), m20.imag(), m21.real(),
m21.imag(), m22, m30.real(), m30.imag(), m31.real(), m31.imag(),
m32.real(), m32.imag(), m33});
}
#include "kronecker_square.h"
aocommon::HMC4x4 KroneckerSquareReference(aocommon::MC2x2& left,
aocommon::MC2x2& right) {
return aocommon::HMC4x4::KroneckerProduct(left.HermitianSquare().Transpose(),
right.HermitianSquare());
}
\ No newline at end of file
...@@ -10,8 +10,8 @@ ...@@ -10,8 +10,8 @@
inline void Initialize(std::complex<float>* a) { inline void Initialize(std::complex<float>* a) {
// Initialize matrices with random complex values // Initialize matrices with random complex values
std::random_device rd; std::seed_seq seed({42});
std::mt19937 gen(rd()); std::mt19937 gen(seed);
std::uniform_real_distribution<float> dis(-1.0, 1.0); std::uniform_real_distribution<float> dis(-1.0, 1.0);
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
......
#include "helpers.h"
#include <aocommon/matrix4x4.h>
#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers_floating_point.hpp>
#include <iostream>
#define COMPARE_ARRAYS(lhs, rhs, precision) \
compareArrays(Catch::getResultCapture().getCurrentTestName(), __LINE__, lhs, \
rhs, precision)
template <typename T>
void compareSingle(const std::vector<T>& lv, const std::vector<T>& rv,
float precision) {
REQUIRE_THAT(lv, Catch::Matchers::WithinAbs(rv, precision));
}
template <>
void compareSingle(const std::vector<std::complex<float>>& lv,
const std::vector<std::complex<float>>& rv,
float precision) {
for (size_t idx = 0; idx < lv.size(); idx++) {
const auto le = lv[idx];
const auto re = rv[idx];
REQUIRE_THAT(le.real(), Catch::Matchers::WithinAbs(re.real(), precision));
REQUIRE_THAT(le.imag(), Catch::Matchers::WithinAbs(re.imag(), precision));
}
}
template <typename T>
std::string valueToString(const T& value) {
return std::to_string(value);
}
std::string valueToString(const std::complex<float>& value) {
return std::to_string(value.real()) + " " + std::to_string(value.imag()) +
"j";
}
template <typename T, size_t N>
void compareArrays(const std::string& test, unsigned line, std::array<T, N> lhs,
std::array<T, N> rhs, float precision) {
std::vector<T> lv(lhs.begin(), lhs.end());
std::vector<T> rv(rhs.begin(), rhs.end());
INFO("Test case [" << test << "] failed at line "
<< line); // Reported only if REQUIRE fails
std::stringstream ss;
ss << "Expected : \n";
for (size_t idx = 0; idx < N; idx++) {
ss << valueToString(lhs[idx]) << "\t";
}
ss << "\nObtained : \n";
for (size_t idx = 0; idx < N; idx++) {
ss << valueToString(rhs[idx]) << "\t";
}
ss << "\n";
INFO("Reason: \n" << ss.str());
compareSingle(lv, rv, precision);
}
template void compareArrays(const std::string& test, unsigned line,
std::array<std::complex<float>, 4ul> lhs,
std::array<std::complex<float>, 4ul> rhs,
float precision);
void AssertEqual(const aocommon::Matrix4x4& a, const aocommon::Matrix4x4& b,
float precision) {
for (size_t i = 0; i < 16; i++) {
REQUIRE_THAT(a[i].real(),
Catch::Matchers::WithinAbs(b[i].real(), precision));
REQUIRE_THAT(a[i].imag(),
Catch::Matchers::WithinAbs(b[i].imag(), precision));
}
}
void AssertEqual(const aocommon::HMatrix4x4& a, const aocommon::HMatrix4x4& b,
float precision) {
for (size_t i = 0; i < 16; i++) {
REQUIRE_THAT(a[i].real(),
Catch::Matchers::WithinAbs(b[i].real(), precision));
REQUIRE_THAT(a[i].imag(),
Catch::Matchers::WithinAbs(b[i].imag(), precision));
}
}
void PrintMatrix(const aocommon::Matrix4x4& mat) {
for (size_t j = 0; j < 4; j++) {
for (size_t i = 0; i < 3; i++) std::cout << mat[i + j * 4] << "\t";
std::cout << mat[3 + j * 4] << std::endl;
}
}
void PrintMatrix(const aocommon::HMatrix4x4& mat) {
PrintMatrix(mat.ToMatrix());
}
\ No newline at end of file
#ifndef HELPERS #ifndef HELPERS_H
#define HELPERS #define HELPERS_H
#include <aocommon/hmatrix4x4.h>
#include <aocommon/matrix4x4.h>
#include <array> #include <array>
#include <catch2/matchers/catch_matchers_floating_point.hpp> #include <catch2/matchers/catch_matchers_floating_point.hpp>
#include <complex>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -13,54 +17,27 @@ ...@@ -13,54 +17,27 @@
template <typename T> template <typename T>
void compareSingle(const std::vector<T>& lv, const std::vector<T>& rv, void compareSingle(const std::vector<T>& lv, const std::vector<T>& rv,
float precision) { float precision);
REQUIRE_THAT(lv, Catch::Matchers::WithinAbs(rv, precision));
}
template <> template <>
void compareSingle(const std::vector<std::complex<float>>& lv, void compareSingle(const std::vector<std::complex<float>>& lv,
const std::vector<std::complex<float>>& rv, const std::vector<std::complex<float>>& rv, float precision);
float precision) {
for (size_t idx = 0; idx < lv.size(); idx++) {
const auto le = lv[idx];
const auto re = rv[idx];
REQUIRE_THAT(le.real(), Catch::Matchers::WithinAbs(re.real(), precision));
REQUIRE_THAT(le.imag(), Catch::Matchers::WithinAbs(re.imag(), precision));
}
}
template <typename T> template <typename T>
std::string valueToString(const T& value) { std::string valueToString(const T& value);
return std::to_string(value);
}
std::string valueToString(const std::complex<float>& value) { std::string valueToString(const std::complex<float>& value);
return std::to_string(value.real()) + " " + std::to_string(value.imag()) +
"j";
}
template <typename T, size_t N> template <typename T, size_t N>
void compareArrays(const std::string& test, unsigned line, std::array<T, N> lhs, void compareArrays(const std::string& test, unsigned line, std::array<T, N> lhs,
std::array<T, N> rhs, float precision) { std::array<T, N> rhs, float precision);
std::vector<T> lv(lhs.begin(), lhs.end());
std::vector<T> rv(rhs.begin(), rhs.end()); void AssertEqual(const aocommon::Matrix4x4& a, const aocommon::Matrix4x4& b,
INFO("Test case [" << test << "] failed at line " float precision);
<< line); // Reported only if REQUIRE fails
std::stringstream ss; void AssertEqual(const aocommon::HMatrix4x4& a, const aocommon::HMatrix4x4& b,
ss << "Expected : \n"; float precision);
for (size_t idx = 0; idx < N; idx++) {
ss << valueToString(lhs[idx]) << "\t";
}
ss << "\nObtained : \n"; void PrintMatrix(const aocommon::Matrix4x4& mat);
for (size_t idx = 0; idx < N; idx++) {
ss << valueToString(rhs[idx]) << "\t";
}
ss << "\n";
INFO("Reason: \n" << ss.str());
compareSingle(lv, rv, precision);
}
#endif #endif // HELPERS_H
\ No newline at end of file \ No newline at end of file
#include <hermitian_square.h>
#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers_floating_point.hpp>
#include "helpers.h"
TEST_CASE("test hermitian square", "[double]") {
// This setup will be done 4 times in total, once for each section
aocommon::Matrix4x4 A;
aocommon::HMatrix4x4 HA;
aocommon::HMatrix4x4 HA_expected;
Initialize(A);
HA_expected = HermitianSquare(A);
SECTION("test correctness of refactored implementation") {
HA = HermitianSquareRefactored(A);
CHECK(HA_expected == HA);
}
SECTION("test correctness of nasty implementation") {
HA = HermitianSquareNaiveSIMD(A);
AssertEqual(HA_expected, HA, 1.e-6);
}
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment