diff --git a/CMakeLists.txt b/CMakeLists.txt index 13a27024756e2d05f1007408ea66fd61685165f9..ea8b5b3d37a86aa774ed4e5bf83375d86f3ba13d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,7 +49,7 @@ FetchContent_Declare( FetchContent_Populate(aocommon) -set(COMPILER_FLAGS "-O3;-march=native;-ggdb;-fopenmp") +set(COMPILER_FLAGS "-O3;-march=native;-ggdb") # List all kernel code file(GLOB KERNEL_SOURCES "code/*.cpp") # Add the benchmark executable diff --git a/benchmarks/matrix_multiplication.cpp b/benchmarks/matrix_multiplication.cpp index e3cbe4e746650255d5c9ad3bc8f08b5025df7e1f..1615afba26ac0b021bfba17c0aa6181ef099af62 100644 --- a/benchmarks/matrix_multiplication.cpp +++ b/benchmarks/matrix_multiplication.cpp @@ -146,14 +146,6 @@ BENCHMARK_DEFINE_F(InitializeInputBatch, BatchMatrixMultiplicationAOCommon) } } -BENCHMARK_DEFINE_F(InitializeInputBatch, - BatchMatrixMultiplicationRealComplexOpenMP) -(benchmark::State& state) { - for (auto _ : state) { - matrixMultiplyRealComplexSIMD(A, B, C, state.range(0)); - } -} - BENCHMARK_DEFINE_F(InitializeInputBatch, BatchMatrixMultiplicationRealComplex) (benchmark::State& state) { for (auto _ : state) { @@ -170,8 +162,4 @@ BENCHMARK_REGISTER_F(InitializeInputBatch, BatchMatrixMultiplicationAOCommon) BENCHMARK_REGISTER_F(InitializeInputBatch, BatchMatrixMultiplicationRealComplex) ->Range(8, 512); -BENCHMARK_REGISTER_F(InitializeInputBatch, - BatchMatrixMultiplicationRealComplexOpenMP) - ->Range(8, 512); - BENCHMARK_MAIN(); diff --git a/code/matrix_multiplication_batch_real_complex.cpp b/code/matrix_multiplication_batch_real_complex.cpp index c99607640708761aa06e3a2c0440b20781dc6e80..925c5b6f5570c1e8e9e7b1fdb0d1f2bbd4bfb26c 100644 --- a/code/matrix_multiplication_batch_real_complex.cpp +++ b/code/matrix_multiplication_batch_real_complex.cpp @@ -1,50 +1,5 @@ #include "matrix_multiplication.h" -void matrixMultiplyRealComplexSIMD(const std::complex<float>* a, - const std::complex<float>* b, - std::complex<float>* c, size_t batch_size) { - float* a_ptr = reinterpret_cast<float*>(const_cast<std::complex<float>*>(a)); - float* b_ptr = reinterpret_cast<float*>(const_cast<std::complex<float>*>(b)); - float* c_ptr = reinterpret_cast<float*>(c); - -#pragma omp simd - for (size_t s = 0; s < batch_size; s++) { - const float a_00_real = a_ptr[s * 8 + 0]; - const float a_00_imag = a_ptr[s * 8 + 1]; - const float a_01_real = a_ptr[s * 8 + 2]; - const float a_01_imag = a_ptr[s * 8 + 3]; - const float a_10_real = a_ptr[s * 8 + 4]; - const float a_10_imag = a_ptr[s * 8 + 5]; - const float a_11_real = a_ptr[s * 8 + 6]; - const float a_11_imag = a_ptr[s * 8 + 7]; - const float b_00_real = b_ptr[s * 8 + 0]; - const float b_00_imag = b_ptr[s * 8 + 1]; - const float b_01_real = b_ptr[s * 8 + 2]; - const float b_01_imag = b_ptr[s * 8 + 3]; - const float b_10_real = b_ptr[s * 8 + 4]; - const float b_10_imag = b_ptr[s * 8 + 5]; - const float b_11_real = b_ptr[s * 8 + 6]; - const float b_11_imag = b_ptr[s * 8 + 7]; - - c_ptr[s * 8 + 0] = a_00_real * b_00_real + a_01_real * b_10_real; - c_ptr[s * 8 + 0] -= a_00_imag * b_00_imag + a_01_imag * b_10_imag; - c_ptr[s * 8 + 1] = a_00_real * b_00_imag + a_01_real * b_10_imag; - c_ptr[s * 8 + 1] += a_00_imag * b_00_real + a_01_imag * b_10_real; - c_ptr[s * 8 + 2] = a_00_real * b_01_real + a_01_real * b_11_real; - c_ptr[s * 8 + 2] -= a_00_imag * b_01_imag + a_01_imag * b_11_imag; - c_ptr[s * 8 + 3] = a_00_real * b_01_imag + a_01_real * b_11_imag; - c_ptr[s * 8 + 3] += a_00_imag * b_01_real + a_01_imag * b_11_real; - c_ptr[s * 8 + 4] = a_10_real * b_00_real + a_11_real * b_10_real; - c_ptr[s * 8 + 4] -= a_10_imag * b_00_imag + a_11_imag * b_10_imag; - c_ptr[s * 8 + 5] = a_10_real * b_00_imag + a_11_real * b_10_imag; - c_ptr[s * 8 + 5] += a_10_imag * b_00_real + a_11_imag * b_10_real; - c_ptr[s * 8 + 6] = a_10_real * b_01_real + a_11_real * b_11_real; - c_ptr[s * 8 + 6] -= a_10_imag * b_01_imag + a_11_imag * b_11_imag; - c_ptr[s * 8 + 7] = a_10_real * b_01_imag + a_11_real * b_11_imag; - c_ptr[s * 8 + 7] += a_10_imag * b_01_real + a_11_imag * b_11_real; - } -} - void matrixMultiplyRealComplex(const std::complex<float>* a, const std::complex<float>* b, std::complex<float>* c, size_t batch_size) { diff --git a/test/test_matrix_multiplication.cpp b/test/test_matrix_multiplication.cpp index c9709db6a70bf11effc7de12edcea729e08a2c8e..28ac6de690b11aa222a35cac4696a0f829691e3a 100644 --- a/test/test_matrix_multiplication.cpp +++ b/test/test_matrix_multiplication.cpp @@ -35,11 +35,6 @@ TEST_CASE("test complex matrix multiplication", "[float]") { COMPARE_ARRAYS(C_expected, C, 1.e-6); } - SECTION("test correctness of batch implementation") { - matrixMultiplyRealComplexSIMD(A.data(), B.data(), C.data(), 1); - - COMPARE_ARRAYS(C_expected, C, 1.e-6); - } #if defined(__AVX__) SECTION("test correctness of avx implementation") { matrixMultiplyAVX(A.data(), B.data(), C.data());