diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 42f5bb2aa7841bbe9286d72489eef01785df5b83..d11a266f967e9699e0bccf0472fef125ad6ae421 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -84,6 +84,7 @@ performance-jetson:   # This job runs in the test stage.
     - sbatch --wait -p jetsonq -w orin02 -o output.txt -e error.txt ci/das6/compile_and_run_jetson.sh jetson_arm64 11.4.0
     - cat output.txt >&1
     - cat error.txt >&2
+  allow_failure: true
     
   artifacts:
     paths:
@@ -127,4 +128,7 @@ collect-performance:
       - ./*.tar
   script:
   - ls -la 
-  - python3 ci/summarize-results.py --filter MatrixMultiplication results*.json result-summary
+  - python3 ci/summarize-results.py --filter MatrixMultiplication results*.json result-summary-matrix-multiplication
+  - python3 ci/summarize-results.py --filter HermitianSquare results*.json result-summary-hermitian-square
+  - python3 ci/summarize-results.py --filter KroneckerSquare results*.json result-summary-kronecker-square
+  
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 20257d1d36c341b5b34364e06a2363b4a8176068..73d5509823c9b275c2517ed7fae9649b55ec0962 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,6 +50,7 @@ FetchContent_Declare(
 FetchContent_Populate(aocommon)
 
 set(COMPILER_FLAGS "-O3;-march=native;-ggdb;")
+
 # List all kernel code
 file(GLOB KERNEL_SOURCES "code/*.cpp")
 # Add the benchmark executable
@@ -59,12 +60,17 @@ add_executable(microbenchmarks ${BENCHMARK_SOURCES} ${KERNEL_SOURCES})
 file(GLOB TEST_SOURCES "test/*.cpp")
 add_executable(unittests ${TEST_SOURCES} ${KERNEL_SOURCES})
 
+find_package(OpenMP)
 
 # Link against Google Benchmark
-target_link_libraries(microbenchmarks benchmark::benchmark)
+target_link_libraries(microbenchmarks PRIVATE benchmark::benchmark)
 target_include_directories(microbenchmarks PRIVATE ${aocommon_SOURCE_DIR}/include)
 target_include_directories(microbenchmarks PRIVATE code)
 target_compile_options(microbenchmarks PUBLIC ${COMPILER_FLAGS})
+if(OpenMP_CXX_FOUND)
+    target_link_libraries(microbenchmarks PRIVATE OpenMP::OpenMP_CXX)
+    target_link_libraries(unittests PRIVATE OpenMP::OpenMP_CXX)
+endif()
 
 include(precompile)
 list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
diff --git a/benchmarks/hermitian_square.cpp b/benchmarks/hermitian_square.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ffffa74e32634649693ff56c11b74e0642093b1a
--- /dev/null
+++ b/benchmarks/hermitian_square.cpp
@@ -0,0 +1,47 @@
+#include <benchmark/benchmark.h>
+#include <hermitian_square.h>
+
+#include <iostream>
+
+namespace {
+
+class InitializeInput : public benchmark::Fixture {
+ public:
+  void SetUp(::benchmark::State& state) {
+    A = std::make_unique<aocommon::Matrix4x4>();
+    InitializeInput(*A);
+  }
+  void TearDown(::benchmark::State& state) { A.reset(); }
+
+  std::unique_ptr<aocommon::Matrix4x4> A;
+};
+}  // namespace
+
+// Reference standard
+BENCHMARK_F(InitializeInput, HermitianSquare)
+(benchmark::State& state) {
+  for (auto _ : state) {
+    HermitianSquare(*A);
+  }
+}
+
+BENCHMARK_F(InitializeInput, HermitianSquareRefactored)
+(benchmark::State& state) {
+  for (auto _ : state) {
+    HermitianSquareRefactored(*A);
+  }
+}
+
+BENCHMARK_F(InitializeInput, HermitianSquareNaiveSIMD)
+(benchmark::State& state) {
+  for (auto _ : state) {
+    HermitianSquareNaiveSIMD(*A);
+  }
+}
+
+BENCHMARK_F(InitializeInput, HermitianSquareNaiveSIMDv2)
+(benchmark::State& state) {
+  for (auto _ : state) {
+    HermitianSquareNaiveSIMDv2(*A);
+  }
+}
diff --git a/benchmarks/kronecker_square.cpp b/benchmarks/kronecker_square.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f7e95a96cbd0e00dea82ff373577daa5af3d255a
--- /dev/null
+++ b/benchmarks/kronecker_square.cpp
@@ -0,0 +1,45 @@
+#include <benchmark/benchmark.h>
+#include <kronecker_square.h>
+
+#include <iostream>
+
+namespace {
+
+class InitializeInput : public benchmark::Fixture {
+ public:
+  void SetUp(::benchmark::State& state) {
+    A = std::make_unique<aocommon::MC2x2>();
+    B = std::make_unique<aocommon::MC2x2>();
+    InitializeInput(*A);
+    InitializeInput(*B);
+  }
+  void TearDown(::benchmark::State& state) {
+    A.reset();
+    B.reset();
+  }
+
+  std::unique_ptr<aocommon::MC2x2> A, B;
+};
+}  // namespace
+
+// Reference implementation
+BENCHMARK_F(InitializeInput, KroneckerSquareReference)
+(benchmark::State& state) {
+  for (auto _ : state) {
+    aocommon::HMC4x4 r = aocommon::HMC4x4::Zero();
+    for (size_t i = 0; i < 100000000; i++) {
+      r += KroneckerSquareReference(*A, *B);
+    }
+  }
+}
+
+// Fused implementation
+BENCHMARK_F(InitializeInput, KroneckerSquareFused)
+(benchmark::State& state) {
+  for (auto _ : state) {
+    aocommon::HMC4x4 r = aocommon::HMC4x4::Zero();
+    for (size_t i = 0; i < 100000000; i++) {
+      r += KroneckerSquareFused(*A, *B);
+        }
+  }
+}
diff --git a/benchmarks/main.cpp b/benchmarks/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..71fefa047228720dd970aefbfca70030a737f9a4
--- /dev/null
+++ b/benchmarks/main.cpp
@@ -0,0 +1,3 @@
+#include <benchmark/benchmark.h>
+
+BENCHMARK_MAIN();
diff --git a/benchmarks/matrix_multiplication.cpp b/benchmarks/matrix_multiplication.cpp
index fb0f0f7b3276f4959e7e7665a9dc6d70c5bdd0fb..75a058de04e513b0b3869881e575e5c882b3602e 100644
--- a/benchmarks/matrix_multiplication.cpp
+++ b/benchmarks/matrix_multiplication.cpp
@@ -109,5 +109,3 @@ BENCHMARK_F(InitializeInput, MatrixMultiplicationSSE)
   }
 }
 #endif
-
-BENCHMARK_MAIN();
diff --git a/ci/summarize-results.py b/ci/summarize-results.py
index ab99375ac8df9bc330049f81abc0c6ef1959c6f3..549852a04a7001041c47c7760987e2ee07bb75e2 100644
--- a/ci/summarize-results.py
+++ b/ci/summarize-results.py
@@ -5,7 +5,7 @@ import seaborn
 import pandas
 import matplotlib.pyplot as plt
 
-seaborn.set_theme(palette="flare", style="whitegrid")
+seaborn.set_theme(palette="bright", style="whitegrid")
 
 def parse_args():
     parser = ArgumentParser(description="Combine benchmark metrics from Google Benchmarks Framework")
diff --git a/cmake/precompile.cmake b/cmake/precompile.cmake
index 097e36d9b1f0d89869ff9839df305f0bd48f0788..cfc0ce61fb190fed6149d71a475876bf4f439680 100644
--- a/cmake/precompile.cmake
+++ b/cmake/precompile.cmake
@@ -8,9 +8,10 @@ macro (add_precompile_target)
     list(TRANSFORM apt_INCLUDE_DIRS PREPEND -I)
 
     add_custom_command(OUTPUT ${apt_TARGET_NAME}.s
-                    COMMAND ${CMAKE_CXX_COMPILER} -std=c++${CMAKE_CXX_STANDARD} ${CMAKE_CXX_COMPLERS_ARG} ${apt_COMPILER_FLAGS} ${apt_SOURCES} ${apt_INCLUDE_DIRS} -S -o ${apt_TARGET_NAME}.s
+                    COMMAND ${CMAKE_CXX_COMPILER} -std=c++${CMAKE_CXX_STANDARD} ${CMAKE_CXX_COMPLERS_ARG} ${apt_COMPILER_FLAGS} ${apt_SOURCES} ${apt_INCLUDE_DIRS} -S -fverbose-asm -Wa,aslh -o ${apt_TARGET_NAME}.s
                     DEPENDS ${apt_SOURCES}
                     )
+    
     add_custom_target(precompile_${apt_TARGET_NAME} ALL
                   DEPENDS ${apt_TARGET_NAME}.s
                   COMMENT "Adding precompile target ${KERNEL_NAME} with source ${KERNEL_SOURCE}")
diff --git a/code/hermitian_naive_simd.cpp b/code/hermitian_naive_simd.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..48d52d1880fcdc37d02e50b607a1423494e9429d
--- /dev/null
+++ b/code/hermitian_naive_simd.cpp
@@ -0,0 +1,114 @@
+#include "hermitian_square.h"
+
+aocommon::HMatrix4x4 HermitianSquareNaiveSIMD(const aocommon::Matrix4x4& mat) {
+  const double* mat_pointer = reinterpret_cast<const double*>(&mat[0]);
+  double tmp_real_col0[4];
+  double tmp_real_col1[4];
+  double tmp_real_col2[4];
+  double tmp_real_col3[4];
+
+  double tmp_imag_col0[4];
+  double tmp_imag_col1[4];
+  double tmp_imag_col2[4];
+  double tmp_imag_col3[4];
+
+  double c1c0_r[4], c2c1_r[4], c2c0_r[4], c3c2_r[4], c3c1_r[4], c3c0_r[4];
+  double c1c0_i[4], c2c1_i[4], c2c0_i[4], c3c2_i[4], c3c1_i[4], c3c0_i[4];
+  double tmp_real_uf[6] = {0, 0, 0, 0, 0, 0};
+  double tmp_imag_uf[6] = {0, 0, 0, 0, 0, 0};
+  double tmp_diag[4];
+
+  memset(tmp_diag, 0, 4 * sizeof(double));
+
+#pragma omp simd
+  for (int k = 0; k < 4; k++) {
+    tmp_real_col0[k] = mat_pointer[2 * (4 * k + 0)];
+    tmp_real_col1[k] = mat_pointer[2 * (4 * k + 1)];
+    tmp_real_col2[k] = mat_pointer[2 * (4 * k + 2)];
+    tmp_real_col3[k] = mat_pointer[2 * (4 * k + 3)];
+
+    tmp_imag_col0[k] = mat_pointer[2 * (4 * k + 0) + 1];
+    tmp_imag_col1[k] = mat_pointer[2 * (4 * k + 1) + 1];
+    tmp_imag_col2[k] = mat_pointer[2 * (4 * k + 2) + 1];
+    tmp_imag_col3[k] = mat_pointer[2 * (4 * k + 3) + 1];
+  }
+
+#pragma omp simd
+  for (int k = 0; k < 4; k++) {
+    tmp_diag[0] += ((tmp_real_col0[k] * tmp_real_col0[k]) +
+                    (tmp_imag_col0[k] * tmp_imag_col0[k]));
+
+    tmp_diag[1] += ((tmp_real_col1[k] * tmp_real_col1[k]) +
+                    (tmp_imag_col1[k] * tmp_imag_col1[k]));
+
+    tmp_diag[2] += ((tmp_real_col2[k] * tmp_real_col2[k]) +
+                    (tmp_imag_col2[k] * tmp_imag_col2[k]));
+
+    tmp_diag[3] += ((tmp_real_col3[k] * tmp_real_col3[k]) +
+                    (tmp_imag_col3[k] * tmp_imag_col3[k]));
+  }
+
+#pragma omp simd
+  for (int k = 0; k < 4; k++) {
+    c1c0_r[k] = tmp_real_col1[k] * tmp_real_col0[k] +
+                tmp_imag_col1[k] * tmp_imag_col0[k];
+    c2c0_r[k] = tmp_real_col2[k] * tmp_real_col0[k] +
+                tmp_imag_col2[k] * tmp_imag_col0[k];
+    c2c1_r[k] = tmp_real_col2[k] * tmp_real_col1[k] +
+                tmp_imag_col2[k] * tmp_imag_col1[k];
+    c3c0_r[k] = tmp_real_col3[k] * tmp_real_col0[k] +
+                tmp_imag_col3[k] * tmp_imag_col0[k];
+    c3c1_r[k] = tmp_real_col3[k] * tmp_real_col1[k] +
+                tmp_imag_col3[k] * tmp_imag_col1[k];
+    c3c2_r[k] = tmp_real_col3[k] * tmp_real_col2[k] +
+                tmp_imag_col3[k] * tmp_imag_col2[k];
+
+    c1c0_i[k] = tmp_real_col1[k] * tmp_imag_col0[k] -
+                tmp_imag_col1[k] * tmp_real_col0[k];
+    c2c0_i[k] = tmp_real_col2[k] * tmp_imag_col0[k] -
+                tmp_imag_col2[k] * tmp_real_col0[k];
+    c2c1_i[k] = tmp_real_col2[k] * tmp_imag_col1[k] -
+                tmp_imag_col2[k] * tmp_real_col1[k];
+    c3c0_i[k] = tmp_real_col3[k] * tmp_imag_col0[k] -
+                tmp_imag_col3[k] * tmp_real_col0[k];
+    c3c1_i[k] = tmp_real_col3[k] * tmp_imag_col1[k] -
+                tmp_imag_col3[k] * tmp_real_col1[k];
+    c3c2_i[k] = tmp_real_col3[k] * tmp_imag_col2[k] -
+                tmp_imag_col3[k] * tmp_real_col2[k];
+  }
+
+  for (int k = 0; k < 4; k++) {
+    tmp_real_uf[0] += c1c0_r[k];
+    tmp_real_uf[1] += c2c0_r[k];
+    tmp_real_uf[2] += c2c1_r[k];
+    tmp_real_uf[3] += c3c0_r[k];
+    tmp_real_uf[4] += c3c1_r[k];
+    tmp_real_uf[5] += c3c2_r[k];
+
+    tmp_imag_uf[0] += c1c0_i[k];
+    tmp_imag_uf[1] += c2c0_i[k];
+    tmp_imag_uf[2] += c2c1_i[k];
+    tmp_imag_uf[3] += c3c0_i[k];
+    tmp_imag_uf[4] += c3c1_i[k];
+    tmp_imag_uf[5] += c3c2_i[k];
+  }
+
+  return {
+      tmp_diag[0],
+      0.0,
+      0.0,
+      0.0,
+      {tmp_real_uf[0], tmp_imag_uf[0]},
+      tmp_diag[1],
+      0.0,
+      0.0,
+      {tmp_real_uf[1], tmp_imag_uf[1]},
+      {tmp_real_uf[2], tmp_imag_uf[2]},
+      tmp_diag[2],
+      0.0,
+      {tmp_real_uf[3], tmp_imag_uf[3]},
+      {tmp_real_uf[4], tmp_imag_uf[4]},
+      {tmp_real_uf[5], tmp_imag_uf[5]},
+      tmp_diag[3],
+  };
+}
diff --git a/code/hermitian_naive_simd_v2.cpp b/code/hermitian_naive_simd_v2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..46031dce05ab1c664cc2644e896a251175bcda90
--- /dev/null
+++ b/code/hermitian_naive_simd_v2.cpp
@@ -0,0 +1,134 @@
+
+#include "hermitian_square.h"
+
+#if __AVX__
+#include <immintrin.h>
+inline double reduce_single(const double col[4]) {
+  __m256d vec = _mm256_load_pd(col);
+  __m128d low = _mm256_extractf128_pd(vec, 0);
+  __m128d high = _mm256_extractf128_pd(vec, 1);
+  __m128d sum = _mm_add_pd(low, high);
+  sum = _mm_hadd_pd(sum, sum);
+  return _mm_cvtsd_f64(sum);
+}
+#else
+inline double reduce_single(const double col[4]) {
+  double sum = col[0];
+#pragma unroll
+  for (size_t i = 0; i < 4; i++) {
+    sum += col[i];
+  }
+  return sum;
+}
+#endif
+
+aocommon::HMatrix4x4 HermitianSquareNaiveSIMDv2(
+    const aocommon::Matrix4x4& mat) {
+  const double* mat_pointer = reinterpret_cast<const double*>(&mat[0]);
+  double tmp_real_col0[4];
+  double tmp_real_col1[4];
+  double tmp_real_col2[4];
+  double tmp_real_col3[4];
+
+  double tmp_imag_col0[4];
+  double tmp_imag_col1[4];
+  double tmp_imag_col2[4];
+  double tmp_imag_col3[4];
+
+  double c1c0_r[4], c2c1_r[4], c2c0_r[4], c3c2_r[4], c3c1_r[4], c3c0_r[4];
+  double c1c0_i[4], c2c1_i[4], c2c0_i[4], c3c2_i[4], c3c1_i[4], c3c0_i[4];
+  double tmp_real_uf[6] = {0., 0., 0., 0., 0., 0.};
+  double tmp_imag_uf[6] = {0., 0., 0., 0., 0., 0.};
+  double tmp_diag[4] = {0., 0., 0., 0.};
+
+  memset(tmp_diag, 0, 4 * sizeof(double));
+
+#pragma omp simd
+  for (int k = 0; k < 4; k++) {
+    tmp_real_col0[k] = mat_pointer[2 * (4 * k + 0)];
+    tmp_real_col1[k] = mat_pointer[2 * (4 * k + 1)];
+    tmp_real_col2[k] = mat_pointer[2 * (4 * k + 2)];
+    tmp_real_col3[k] = mat_pointer[2 * (4 * k + 3)];
+
+    tmp_imag_col0[k] = mat_pointer[2 * (4 * k + 0) + 1];
+    tmp_imag_col1[k] = mat_pointer[2 * (4 * k + 1) + 1];
+    tmp_imag_col2[k] = mat_pointer[2 * (4 * k + 2) + 1];
+    tmp_imag_col3[k] = mat_pointer[2 * (4 * k + 3) + 1];
+  }
+
+#pragma omp simd
+  for (int k = 0; k < 4; k++) {
+    tmp_diag[0] += ((tmp_real_col0[k] * tmp_real_col0[k]) +
+                    (tmp_imag_col0[k] * tmp_imag_col0[k]));
+
+    tmp_diag[1] += ((tmp_real_col1[k] * tmp_real_col1[k]) +
+                    (tmp_imag_col1[k] * tmp_imag_col1[k]));
+
+    tmp_diag[2] += ((tmp_real_col2[k] * tmp_real_col2[k]) +
+                    (tmp_imag_col2[k] * tmp_imag_col2[k]));
+
+    tmp_diag[3] += ((tmp_real_col3[k] * tmp_real_col3[k]) +
+                    (tmp_imag_col3[k] * tmp_imag_col3[k]));
+  }
+#pragma omp simd
+  for (int k = 0; k < 4; k++) {
+    c1c0_r[k] = tmp_real_col1[k] * tmp_real_col0[k] +
+                tmp_imag_col1[k] * tmp_imag_col0[k];
+    c2c0_r[k] = tmp_real_col2[k] * tmp_real_col0[k] +
+                tmp_imag_col2[k] * tmp_imag_col0[k];
+    c2c1_r[k] = tmp_real_col2[k] * tmp_real_col1[k] +
+                tmp_imag_col2[k] * tmp_imag_col1[k];
+    c3c0_r[k] = tmp_real_col3[k] * tmp_real_col0[k] +
+                tmp_imag_col3[k] * tmp_imag_col0[k];
+    c3c1_r[k] = tmp_real_col3[k] * tmp_real_col1[k] +
+                tmp_imag_col3[k] * tmp_imag_col1[k];
+    c3c2_r[k] = tmp_real_col3[k] * tmp_real_col2[k] +
+                tmp_imag_col3[k] * tmp_imag_col2[k];
+
+    c1c0_i[k] = tmp_real_col1[k] * tmp_imag_col0[k] -
+                tmp_imag_col1[k] * tmp_real_col0[k];
+    c2c0_i[k] = tmp_real_col2[k] * tmp_imag_col0[k] -
+                tmp_imag_col2[k] * tmp_real_col0[k];
+    c2c1_i[k] = tmp_real_col2[k] * tmp_imag_col1[k] -
+                tmp_imag_col2[k] * tmp_real_col1[k];
+    c3c0_i[k] = tmp_real_col3[k] * tmp_imag_col0[k] -
+                tmp_imag_col3[k] * tmp_real_col0[k];
+    c3c1_i[k] = tmp_real_col3[k] * tmp_imag_col1[k] -
+                tmp_imag_col3[k] * tmp_real_col1[k];
+    c3c2_i[k] = tmp_real_col3[k] * tmp_imag_col2[k] -
+                tmp_imag_col3[k] * tmp_real_col2[k];
+  }
+
+  tmp_real_uf[0] = reduce_single(c1c0_r);
+  tmp_real_uf[1] = reduce_single(c2c0_r);
+  tmp_real_uf[2] = reduce_single(c2c1_r);
+  tmp_real_uf[3] = reduce_single(c3c0_r);
+  tmp_real_uf[4] = reduce_single(c3c1_r);
+  tmp_real_uf[5] = reduce_single(c3c2_r);
+
+  tmp_imag_uf[0] = reduce_single(c1c0_i);
+  tmp_imag_uf[1] = reduce_single(c2c0_i);
+  tmp_imag_uf[2] = reduce_single(c2c1_i);
+  tmp_imag_uf[3] = reduce_single(c3c0_i);
+  tmp_imag_uf[4] = reduce_single(c3c1_i);
+  tmp_imag_uf[5] = reduce_single(c3c2_i);
+
+  return {
+      tmp_diag[0],
+      0.0,
+      0.0,
+      0.0,
+      {tmp_real_uf[0], tmp_imag_uf[0]},
+      tmp_diag[1],
+      0.0,
+      0.0,
+      {tmp_real_uf[1], tmp_imag_uf[1]},
+      {tmp_real_uf[2], tmp_imag_uf[2]},
+      tmp_diag[2],
+      0.0,
+      {tmp_real_uf[3], tmp_imag_uf[3]},
+      {tmp_real_uf[4], tmp_imag_uf[4]},
+      {tmp_real_uf[5], tmp_imag_uf[5]},
+      tmp_diag[3],
+  };
+}
\ No newline at end of file
diff --git a/code/hermitian_square.h b/code/hermitian_square.h
new file mode 100644
index 0000000000000000000000000000000000000000..48e2cee830b6aa97d2940b7ee1e6b4bd44cd5d9b
--- /dev/null
+++ b/code/hermitian_square.h
@@ -0,0 +1,26 @@
+#ifndef HERMITIAN_SQUARE_H
+#define HERMITIAN_SQUARE_H
+
+#include <aocommon/hmatrix4x4.h>
+#include <aocommon/matrix4x4.h>
+
+#include <iostream>
+#include <random>
+
+inline void Initialize(aocommon::Matrix4x4& mat) {
+  // Initialize matrices with random complex values
+  std::seed_seq seed({42});
+  std::mt19937 gen(seed);
+  std::uniform_real_distribution<double> dis(-1.0, 1.0);
+
+  for (int i = 0; i < 16; i++) {
+    mat[i] = std::complex<double>(dis(gen), dis(gen));
+  }
+}
+
+aocommon::HMatrix4x4 HermitianSquare(const aocommon::Matrix4x4& mat);
+aocommon::HMatrix4x4 HermitianSquareRefactored(const aocommon::Matrix4x4& mat);
+aocommon::HMatrix4x4 HermitianSquareNaiveSIMD(const aocommon::Matrix4x4& mat);
+aocommon::HMatrix4x4 HermitianSquareNaiveSIMDv2(const aocommon::Matrix4x4& mat);
+
+#endif
\ No newline at end of file
diff --git a/code/hermitian_square_reference.cpp b/code/hermitian_square_reference.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..50f51402be21141efb7ffa82862509ad7eb79c2f
--- /dev/null
+++ b/code/hermitian_square_reference.cpp
@@ -0,0 +1,75 @@
+#include "hermitian_square.h"
+
+aocommon::HMatrix4x4 HermitianSquare(const aocommon::Matrix4x4& mat) {
+  return mat.HermitianSquare();
+}
+
+template <typename T>
+using cmplx = std::complex<T>;
+
+aocommon::HMatrix4x4 HermitianSquareRefactored(const aocommon::Matrix4x4& mat) {
+  auto N = [](std::complex<double> z) -> double { return std::norm(z); };
+
+  /*
+    Mat is a complex matrix indexed as
+     0  1  2  3
+     4  5  6  7
+     8  9 10 11
+    12 13 14 15
+
+    norm = r*r + i*i #ops = 3
+    conj = r +ji -> r - ji #ops = 0
+    complex multiplication (CM) = (a + ib ) * (c + i d )
+    -> a*c - b*d + i (a d +  bc) -> 4 mult 1 sum 1 sub -> #ops 6
+    complex conjugate multiplication (CCM) = (a - ib ) * (c + i d )
+    -> a*c + b*d + i (-bc + ad) -> 4 mult 1 sum 1 sub -> #ops 6
+
+
+    On the diagonal there are 4 norms + 3 addition -> #ops 15
+    there are 4 values in the diagonal so 60 operations
+
+    Off diagonal 4 complex multiplication -> #ops 24
+    24 * 6 off diagonal element #ops 144
+
+    144 + 60 -> 204 operations per matrix
+  */
+
+  const cmplx<double> R00 = std::norm(mat[0]) + std::norm(mat[4]) +
+                            std::norm(mat[8]) + std::norm(mat[12]);
+  const cmplx<double> R01 = 0.0;
+  const cmplx<double> R02 = 0.0;
+  const cmplx<double> R03 = 0.0;
+
+  const cmplx<double> R10 =
+      std::conj(mat[1]) * mat[0] + std::conj(mat[5]) * mat[4] +
+      std::conj(mat[9]) * mat[8] + std::conj(mat[13]) * mat[12];
+  const cmplx<double> R11 = std::norm(mat[1]) + std::norm(mat[5]) +
+                            std::norm(mat[9]) + std::norm(mat[13]);
+  const cmplx<double> R12 = 0.0;
+  const cmplx<double> R13 = 0.0;
+
+  const cmplx<double> R20 =
+      std::conj(mat[2]) * mat[0] + std::conj(mat[6]) * mat[4] +
+      std::conj(mat[10]) * mat[8] + std::conj(mat[14]) * mat[12];
+  const cmplx<double> R21 =
+      std::conj(mat[2]) * mat[1] + std::conj(mat[6]) * mat[5] +
+      std::conj(mat[10]) * mat[9] + std::conj(mat[14]) * mat[13];
+  const cmplx<double> R22 = std::norm(mat[2]) + std::norm(mat[6]) +
+                            std::norm(mat[10]) + std::norm(mat[14]);
+  const cmplx<double> R23 = 0.0;
+
+  const cmplx<double> R30 =
+      std::conj(mat[3]) * mat[0] + std::conj(mat[7]) * mat[4] +
+      std::conj(mat[11]) * mat[8] + std::conj(mat[15]) * mat[12];
+  const cmplx<double> R31 =
+      std::conj(mat[3]) * mat[1] + std::conj(mat[7]) * mat[5] +
+      std::conj(mat[11]) * mat[9] + std::conj(mat[15]) * mat[13];
+  const cmplx<double> R32 =
+      std::conj(mat[3]) * mat[2] + std::conj(mat[7]) * mat[6] +
+      std::conj(mat[11]) * mat[10] + std::conj(mat[15]) * mat[14];
+  const cmplx<double> R33 = std::norm(mat[3]) + std::norm(mat[7]) +
+                            std::norm(mat[11]) + std::norm(mat[15]);
+
+  return {R00, R01, R02, R03, R10, R11, R12, R13,
+          R20, R21, R22, R23, R30, R31, R32, R33};
+}
diff --git a/code/kronecker_square.h b/code/kronecker_square.h
new file mode 100644
index 0000000000000000000000000000000000000000..6bae7737b16046d880e867f94a56e55a12c476f0
--- /dev/null
+++ b/code/kronecker_square.h
@@ -0,0 +1,27 @@
+#ifndef KRONECKER_SQUARE_H
+#define KRONECKER_SQUARE_H
+
+#include <aocommon/hmatrix4x4.h>
+#include <aocommon/matrix4x4.h>
+
+#include <iostream>
+#include <random>
+
+inline void Initialize(aocommon::MC2x2& a) {
+  // Initialize matrices with random complex values
+  std::seed_seq seed({42});
+  std::mt19937 gen(seed);
+  std::uniform_real_distribution<double> dis(-1.0, 1.0);
+
+  for (size_t i = 0; i < 4; i++) {
+    a.Set(i, std::complex<double>(dis(gen), dis(gen)));
+  }
+}
+
+aocommon::HMC4x4 KroneckerSquareReference(aocommon::MC2x2& left,
+                                          aocommon::MC2x2& right);
+
+aocommon::HMC4x4 KroneckerSquareFused(aocommon::MC2x2& left,
+                                      aocommon::MC2x2& right);
+
+#endif  // KRONECKER_SQUARE_H
\ No newline at end of file
diff --git a/code/kronecker_square_fused.cpp b/code/kronecker_square_fused.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7acfa1faec490208a709965d331bdf6758776e13
--- /dev/null
+++ b/code/kronecker_square_fused.cpp
@@ -0,0 +1,40 @@
+
+#include "kronecker_square.h"
+
+// This code is taken from
+// https://gitlab.com/aroffringa/wsclean/-/merge_requests/772
+
+aocommon::HMC4x4 KroneckerSquareFused(aocommon::MC2x2& a, aocommon::MC2x2& b) {
+  using T = const std::complex<double>;
+  using RT = const double;
+  using std::conj;
+  using std::norm;
+
+  // Calculate a^H a. Because the result is Hermitian, some shortcuts
+  // can be made.
+  RT p00 = norm(a.Get(0)) + norm(a.Get(2));
+  T p01 = conj(a.Get(0)) * a.Get(1) + conj(a.Get(2)) * a.Get(3);
+  RT p11 = norm(a.Get(1)) + norm(a.Get(3));
+
+  // Calculate b^H b.
+  RT q00 = norm(b.Get(0)) + norm(b.Get(2));
+  T q10 = conj(b.Get(1)) * b.Get(0) + conj(b.Get(3)) * b.Get(2);  // = conj(q01)
+  RT q11 = norm(b.Get(1)) + norm(b.Get(3));
+
+  // Calculate the Kronecker product of p^T and q
+  RT m00 = p00 * q00;
+  T m10 = p00 * q10;
+  RT m11 = p00 * q11;
+  T m20 = p01 * q00;
+  T m21 = p01 * conj(q10);
+  RT m22 = p11 * q00;
+  T m30 = p01 * q10;
+  T m31 = p01 * q11;
+  T m32 = p11 * q10;
+  RT m33 = p11 * q11;
+
+  return aocommon::HMC4x4::FromData(
+      {m00, m10.real(), m10.imag(), m11, m20.real(), m20.imag(), m21.real(),
+       m21.imag(), m22, m30.real(), m30.imag(), m31.real(), m31.imag(),
+       m32.real(), m32.imag(), m33});
+}
diff --git a/code/kroneker_square_reference.cpp b/code/kroneker_square_reference.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f1080224379585ffe2d87c5ffce22da4cb5aec1f
--- /dev/null
+++ b/code/kroneker_square_reference.cpp
@@ -0,0 +1,7 @@
+#include "kronecker_square.h"
+
+aocommon::HMC4x4 KroneckerSquareReference(aocommon::MC2x2& left,
+                                          aocommon::MC2x2& right) {
+  return aocommon::HMC4x4::KroneckerProduct(left.HermitianSquare().Transpose(),
+                                            right.HermitianSquare());
+}
\ No newline at end of file
diff --git a/code/kroneker_square_reference_simd.cpp b/code/kroneker_square_reference_simd.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/matrix_multiplication.h b/code/matrix_multiplication.h
index 3867383cf531af65bef6fbab2696235834cd480c..4f51072d638a28ceb74a100901f8da210a94dbba 100644
--- a/code/matrix_multiplication.h
+++ b/code/matrix_multiplication.h
@@ -10,8 +10,8 @@
 
 inline void Initialize(std::complex<float>* a) {
   // Initialize matrices with random complex values
-  std::random_device rd;
-  std::mt19937 gen(rd());
+  std::seed_seq seed({42});
+  std::mt19937 gen(seed);
   std::uniform_real_distribution<float> dis(-1.0, 1.0);
 
   for (int i = 0; i < 4; i++) {
diff --git a/test/helpers.cpp b/test/helpers.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..60cabefaf0126cae7bd1aa937f81e0c40e0cc61d
--- /dev/null
+++ b/test/helpers.cpp
@@ -0,0 +1,98 @@
+#include "helpers.h"
+
+#include <aocommon/matrix4x4.h>
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+#include <iostream>
+
+#define COMPARE_ARRAYS(lhs, rhs, precision)                                    \
+  compareArrays(Catch::getResultCapture().getCurrentTestName(), __LINE__, lhs, \
+                rhs, precision)
+
+template <typename T>
+void compareSingle(const std::vector<T>& lv, const std::vector<T>& rv,
+                   float precision) {
+  REQUIRE_THAT(lv, Catch::Matchers::WithinAbs(rv, precision));
+}
+
+template <>
+void compareSingle(const std::vector<std::complex<float>>& lv,
+                   const std::vector<std::complex<float>>& rv,
+                   float precision) {
+  for (size_t idx = 0; idx < lv.size(); idx++) {
+    const auto le = lv[idx];
+    const auto re = rv[idx];
+
+    REQUIRE_THAT(le.real(), Catch::Matchers::WithinAbs(re.real(), precision));
+    REQUIRE_THAT(le.imag(), Catch::Matchers::WithinAbs(re.imag(), precision));
+  }
+}
+
+template <typename T>
+std::string valueToString(const T& value) {
+  return std::to_string(value);
+}
+
+std::string valueToString(const std::complex<float>& value) {
+  return std::to_string(value.real()) + " " + std::to_string(value.imag()) +
+         "j";
+}
+
+template <typename T, size_t N>
+void compareArrays(const std::string& test, unsigned line, std::array<T, N> lhs,
+                   std::array<T, N> rhs, float precision) {
+  std::vector<T> lv(lhs.begin(), lhs.end());
+  std::vector<T> rv(rhs.begin(), rhs.end());
+  INFO("Test case [" << test << "] failed at line "
+                     << line);  // Reported only if REQUIRE fails
+
+  std::stringstream ss;
+  ss << "Expected : \n";
+  for (size_t idx = 0; idx < N; idx++) {
+    ss << valueToString(lhs[idx]) << "\t";
+  }
+
+  ss << "\nObtained : \n";
+  for (size_t idx = 0; idx < N; idx++) {
+    ss << valueToString(rhs[idx]) << "\t";
+  }
+  ss << "\n";
+  INFO("Reason: \n" << ss.str());
+  compareSingle(lv, rv, precision);
+}
+
+template void compareArrays(const std::string& test, unsigned line,
+                            std::array<std::complex<float>, 4ul> lhs,
+                            std::array<std::complex<float>, 4ul> rhs,
+                            float precision);
+
+void AssertEqual(const aocommon::Matrix4x4& a, const aocommon::Matrix4x4& b,
+                 float precision) {
+  for (size_t i = 0; i < 16; i++) {
+    REQUIRE_THAT(a[i].real(),
+                 Catch::Matchers::WithinAbs(b[i].real(), precision));
+    REQUIRE_THAT(a[i].imag(),
+                 Catch::Matchers::WithinAbs(b[i].imag(), precision));
+  }
+}
+
+void AssertEqual(const aocommon::HMatrix4x4& a, const aocommon::HMatrix4x4& b,
+                 float precision) {
+  for (size_t i = 0; i < 16; i++) {
+    REQUIRE_THAT(a[i].real(),
+                 Catch::Matchers::WithinAbs(b[i].real(), precision));
+    REQUIRE_THAT(a[i].imag(),
+                 Catch::Matchers::WithinAbs(b[i].imag(), precision));
+  }
+}
+void PrintMatrix(const aocommon::Matrix4x4& mat) {
+  for (size_t j = 0; j < 4; j++) {
+    for (size_t i = 0; i < 3; i++) std::cout << mat[i + j * 4] << "\t";
+    std::cout << mat[3 + j * 4] << std::endl;
+  }
+}
+
+void PrintMatrix(const aocommon::HMatrix4x4& mat) {
+  PrintMatrix(mat.ToMatrix());
+}
\ No newline at end of file
diff --git a/test/helpers.h b/test/helpers.h
index 1939b9ba0f4c8c872ee3031a9657cf565f60fda8..83f306f74f8abfdb9ecbc9fc3fc544e3ca63aa2a 100644
--- a/test/helpers.h
+++ b/test/helpers.h
@@ -1,9 +1,13 @@
-#ifndef HELPERS
+#ifndef HELPERS_H
 
-#define HELPERS
+#define HELPERS_H
+
+#include <aocommon/hmatrix4x4.h>
+#include <aocommon/matrix4x4.h>
 
 #include <array>
 #include <catch2/matchers/catch_matchers_floating_point.hpp>
+#include <complex>
 #include <string>
 #include <vector>
 
@@ -13,54 +17,27 @@
 
 template <typename T>
 void compareSingle(const std::vector<T>& lv, const std::vector<T>& rv,
-                   float precision) {
-  REQUIRE_THAT(lv, Catch::Matchers::WithinAbs(rv, precision));
-}
+                   float precision);
 
 template <>
 void compareSingle(const std::vector<std::complex<float>>& lv,
-                   const std::vector<std::complex<float>>& rv,
-                   float precision) {
-  for (size_t idx = 0; idx < lv.size(); idx++) {
-    const auto le = lv[idx];
-    const auto re = rv[idx];
-
-    REQUIRE_THAT(le.real(), Catch::Matchers::WithinAbs(re.real(), precision));
-    REQUIRE_THAT(le.imag(), Catch::Matchers::WithinAbs(re.imag(), precision));
-  }
-}
+                   const std::vector<std::complex<float>>& rv, float precision);
 
 template <typename T>
-std::string valueToString(const T& value) {
-  return std::to_string(value);
-}
+std::string valueToString(const T& value);
 
-std::string valueToString(const std::complex<float>& value) {
-  return std::to_string(value.real()) + " " + std::to_string(value.imag()) +
-         "j";
-}
+std::string valueToString(const std::complex<float>& value);
 
 template <typename T, size_t N>
 void compareArrays(const std::string& test, unsigned line, std::array<T, N> lhs,
-                   std::array<T, N> rhs, float precision) {
-  std::vector<T> lv(lhs.begin(), lhs.end());
-  std::vector<T> rv(rhs.begin(), rhs.end());
-  INFO("Test case [" << test << "] failed at line "
-                     << line);  // Reported only if REQUIRE fails
+                   std::array<T, N> rhs, float precision);
+
+void AssertEqual(const aocommon::Matrix4x4& a, const aocommon::Matrix4x4& b,
+                 float precision);
 
-  std::stringstream ss;
-  ss << "Expected : \n";
-  for (size_t idx = 0; idx < N; idx++) {
-    ss << valueToString(lhs[idx]) << "\t";
-  }
+void AssertEqual(const aocommon::HMatrix4x4& a, const aocommon::HMatrix4x4& b,
+                 float precision);
 
-  ss << "\nObtained : \n";
-  for (size_t idx = 0; idx < N; idx++) {
-    ss << valueToString(rhs[idx]) << "\t";
-  }
-  ss << "\n";
-  INFO("Reason: \n" << ss.str());
-  compareSingle(lv, rv, precision);
-}
+void PrintMatrix(const aocommon::Matrix4x4& mat);
 
-#endif
\ No newline at end of file
+#endif  // HELPERS_H
\ No newline at end of file
diff --git a/test/test_hermitian_square.cpp b/test/test_hermitian_square.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..459b616c6a3252afa5a1550f34e58e75fbe3f864
--- /dev/null
+++ b/test/test_hermitian_square.cpp
@@ -0,0 +1,28 @@
+#include <hermitian_square.h>
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+
+#include "helpers.h"
+
+TEST_CASE("test hermitian square", "[double]") {
+  // This setup will be done 4 times in total, once for each section
+  aocommon::Matrix4x4 A;
+  aocommon::HMatrix4x4 HA;
+  aocommon::HMatrix4x4 HA_expected;
+
+  Initialize(A);
+
+  HA_expected = HermitianSquare(A);
+
+  SECTION("test correctness of refactored implementation") {
+    HA = HermitianSquareRefactored(A);
+    CHECK(HA_expected == HA);
+  }
+
+  SECTION("test correctness of nasty implementation") {
+    HA = HermitianSquareNaiveSIMD(A);
+
+    AssertEqual(HA_expected, HA, 1.e-6);
+  }
+}
\ No newline at end of file
diff --git a/test/test_kronecker_square.cpp b/test/test_kronecker_square.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5ef68380de56a9e5c4575bc34a0d36e9cedec23c
--- /dev/null
+++ b/test/test_kronecker_square.cpp
@@ -0,0 +1,26 @@
+#include <kronecker_square.h>
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+
+#include "helpers.h"
+
+TEST_CASE("test kronecker square matrix multiplication", "[double]") {
+  // This setup will be done 4 times in total, once for each section
+  aocommon::MC2x2 A;
+  aocommon::MC2x2 B;
+  aocommon::Matrix4x4 C;
+
+  aocommon::Matrix4x4 C_expected;
+
+  Initialize(A);
+  Initialize(B);
+
+  C_expected = KroneckerSquareReference(A, B).ToMatrix();
+
+  SECTION("test correctness of fused implementation") {
+    C = KroneckerSquareFused(A, B).ToMatrix();
+
+    AssertEqual(C_expected, C, 1.e-6);
+  }
+}
\ No newline at end of file