Skip to content
Snippets Groups Projects
Commit f40fbb56 authored by Mattia Mancini
Browse files

Remove OpenMP implementation

parent 7e5ea4c4
No related branches found
No related tags found
1 merge request: !11 "Add batch approach for matrix multiplication"
Pipeline #84016 passed
...@@ -49,7 +49,7 @@ FetchContent_Declare( ...@@ -49,7 +49,7 @@ FetchContent_Declare(
FetchContent_Populate(aocommon) FetchContent_Populate(aocommon)
set(COMPILER_FLAGS "-O3;-march=native;-ggdb;-fopenmp") set(COMPILER_FLAGS "-O3;-march=native;-ggdb")
# List all kernel code # List all kernel code
file(GLOB KERNEL_SOURCES "code/*.cpp") file(GLOB KERNEL_SOURCES "code/*.cpp")
# Add the benchmark executable # Add the benchmark executable
......
...@@ -146,14 +146,6 @@ BENCHMARK_DEFINE_F(InitializeInputBatch, BatchMatrixMultiplicationAOCommon) ...@@ -146,14 +146,6 @@ BENCHMARK_DEFINE_F(InitializeInputBatch, BatchMatrixMultiplicationAOCommon)
} }
} }
// Benchmark body for the InitializeInputBatch fixture: each timed iteration
// calls matrixMultiplyRealComplexSIMD on the fixture's A, B and C buffers,
// using the benchmark argument state.range(0) as the batch size.
BENCHMARK_DEFINE_F(InitializeInputBatch,
BatchMatrixMultiplicationRealComplexOpenMP)
(benchmark::State& state) {
for (auto _ : state) {
matrixMultiplyRealComplexSIMD(A, B, C, state.range(0));
}
}
BENCHMARK_DEFINE_F(InitializeInputBatch, BatchMatrixMultiplicationRealComplex) BENCHMARK_DEFINE_F(InitializeInputBatch, BatchMatrixMultiplicationRealComplex)
(benchmark::State& state) { (benchmark::State& state) {
for (auto _ : state) { for (auto _ : state) {
...@@ -170,8 +162,4 @@ BENCHMARK_REGISTER_F(InitializeInputBatch, BatchMatrixMultiplicationAOCommon) ...@@ -170,8 +162,4 @@ BENCHMARK_REGISTER_F(InitializeInputBatch, BatchMatrixMultiplicationAOCommon)
BENCHMARK_REGISTER_F(InitializeInputBatch, BatchMatrixMultiplicationRealComplex) BENCHMARK_REGISTER_F(InitializeInputBatch, BatchMatrixMultiplicationRealComplex)
->Range(8, 512); ->Range(8, 512);
// Register the OpenMP SIMD batch benchmark; Range(8, 512) supplies the
// batch-size argument that the benchmark body reads via state.range(0).
BENCHMARK_REGISTER_F(InitializeInputBatch,
BatchMatrixMultiplicationRealComplexOpenMP)
->Range(8, 512);
BENCHMARK_MAIN(); BENCHMARK_MAIN();
#include "matrix_multiplication.h" #include "matrix_multiplication.h"
/**
 * Multiplies a batch of 2x2 complex matrices element by element of the batch:
 * c[s] = a[s] * b[s] for s in [0, batch_size).
 *
 * Every matrix is stored as 4 consecutive std::complex<float> values in the
 * order (0,0), (0,1), (1,0), (1,1), i.e. 8 consecutive floats with real and
 * imaginary parts interleaved.
 *
 * @param a          Left-hand operands, 4 * batch_size complex values.
 * @param b          Right-hand operands, 4 * batch_size complex values.
 * @param c          Output buffer, 4 * batch_size complex values.
 * @param batch_size Number of 2x2 matrices to multiply; 0 leaves c untouched.
 */
void matrixMultiplyRealComplexSIMD(const std::complex<float>* a,
                                   const std::complex<float>* b,
                                   std::complex<float>* c, size_t batch_size) {
  // std::complex<float> arrays may be accessed as arrays of float
  // (real, imag, real, imag, ...). The inputs are only read, so keep them
  // const instead of casting the qualifier away.
  const float* a_ptr = reinterpret_cast<const float*>(a);
  const float* b_ptr = reinterpret_cast<const float*>(b);
  float* c_ptr = reinterpret_cast<float*>(c);

#pragma omp simd
  for (size_t s = 0; s < batch_size; s++) {
    // 4 complex entries per matrix -> 8 floats per batch element.
    const size_t offset = s * 8;

    // Load both operands completely before writing any output value.
    const float a_00_real = a_ptr[offset + 0];
    const float a_00_imag = a_ptr[offset + 1];
    const float a_01_real = a_ptr[offset + 2];
    const float a_01_imag = a_ptr[offset + 3];
    const float a_10_real = a_ptr[offset + 4];
    const float a_10_imag = a_ptr[offset + 5];
    const float a_11_real = a_ptr[offset + 6];
    const float a_11_imag = a_ptr[offset + 7];

    const float b_00_real = b_ptr[offset + 0];
    const float b_00_imag = b_ptr[offset + 1];
    const float b_01_real = b_ptr[offset + 2];
    const float b_01_imag = b_ptr[offset + 3];
    const float b_10_real = b_ptr[offset + 4];
    const float b_10_imag = b_ptr[offset + 5];
    const float b_11_real = b_ptr[offset + 6];
    const float b_11_imag = b_ptr[offset + 7];

    // Real part:      sum(re * re) - sum(im * im)
    // Imaginary part: sum(re * im) + sum(im * re)
    // c(0,0) = a(0,0) * b(0,0) + a(0,1) * b(1,0)
    c_ptr[offset + 0] = (a_00_real * b_00_real + a_01_real * b_10_real) -
                        (a_00_imag * b_00_imag + a_01_imag * b_10_imag);
    c_ptr[offset + 1] = (a_00_real * b_00_imag + a_01_real * b_10_imag) +
                        (a_00_imag * b_00_real + a_01_imag * b_10_real);

    // c(0,1) = a(0,0) * b(0,1) + a(0,1) * b(1,1)
    c_ptr[offset + 2] = (a_00_real * b_01_real + a_01_real * b_11_real) -
                        (a_00_imag * b_01_imag + a_01_imag * b_11_imag);
    c_ptr[offset + 3] = (a_00_real * b_01_imag + a_01_real * b_11_imag) +
                        (a_00_imag * b_01_real + a_01_imag * b_11_real);

    // c(1,0) = a(1,0) * b(0,0) + a(1,1) * b(1,0)
    c_ptr[offset + 4] = (a_10_real * b_00_real + a_11_real * b_10_real) -
                        (a_10_imag * b_00_imag + a_11_imag * b_10_imag);
    c_ptr[offset + 5] = (a_10_real * b_00_imag + a_11_real * b_10_imag) +
                        (a_10_imag * b_00_real + a_11_imag * b_10_real);

    // c(1,1) = a(1,0) * b(0,1) + a(1,1) * b(1,1)
    c_ptr[offset + 6] = (a_10_real * b_01_real + a_11_real * b_11_real) -
                        (a_10_imag * b_01_imag + a_11_imag * b_11_imag);
    c_ptr[offset + 7] = (a_10_real * b_01_imag + a_11_real * b_11_imag) +
                        (a_10_imag * b_01_real + a_11_imag * b_11_real);
  }
}
void matrixMultiplyRealComplex(const std::complex<float>* a, void matrixMultiplyRealComplex(const std::complex<float>* a,
const std::complex<float>* b, const std::complex<float>* b,
std::complex<float>* c, size_t batch_size) { std::complex<float>* c, size_t batch_size) {
......
...@@ -35,11 +35,6 @@ TEST_CASE("test complex matrix multiplication", "[float]") { ...@@ -35,11 +35,6 @@ TEST_CASE("test complex matrix multiplication", "[float]") {
COMPARE_ARRAYS(C_expected, C, 1.e-6); COMPARE_ARRAYS(C_expected, C, 1.e-6);
} }
SECTION("test correctness of batch implementation") {
matrixMultiplyRealComplexSIMD(A.data(), B.data(), C.data(), 1);
COMPARE_ARRAYS(C_expected, C, 1.e-6);
}
#if defined(__AVX__) #if defined(__AVX__)
SECTION("test correctness of avx implementation") { SECTION("test correctness of avx implementation") {
matrixMultiplyAVX(A.data(), B.data(), C.data()); matrixMultiplyAVX(A.data(), B.data(), C.data());
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment