Skip to content
Snippets Groups Projects
Commit 8c061688 authored by Bram Veenboer's avatar Bram Veenboer Committed by Bram Veenboer
Browse files

Add matrixMultiplyAVX2b

Same as matrixMultiplyAVX2
parent a418699e
No related branches found
No related tags found
1 merge request: !4 Add matrixMultiplyAVX2b
......@@ -56,4 +56,12 @@ BENCHMARK_F(InitializeInput, MatrixMultiplicationAvx2)
matrixMultiplyAVX2(A, B, C);
}
}
// Benchmark for the matrixMultiplyAVX2b implementation.
// Every iteration of the benchmark's timing loop performs exactly one call to
// matrixMultiplyAVX2b with the operands A, B and C (presumably provided by the
// InitializeInput fixture -- confirm against its definition).
BENCHMARK_F(InitializeInput, MatrixMultiplicationAvx2b)
(benchmark::State& state) {
for (auto _ : state) {
matrixMultiplyAVX2b(A, B, C);
}
}
BENCHMARK_MAIN();
......@@ -110,3 +110,46 @@ void matrixMultiplyAVX2(const std::complex<float>* a,
__m256 c_m = _mm256_add_ps(c_p1, c_p2);
_mm256_store_ps(c_ptr, c_m);
}
/**
 * Multiply two 2x2 single-precision complex matrices with AVX2 + FMA.
 *
 * Each operand is four consecutive std::complex<float> values, viewed as
 * eight packed floats [re0, im0, re1, im1, re2, im2, re3, im3]. The result is
 *
 *   c[0] = a[0]*b[0] + a[1]*b[2]      c[1] = a[0]*b[1] + a[1]*b[3]
 *   c[2] = a[2]*b[0] + a[3]*b[2]      c[3] = a[2]*b[1] + a[3]*b[3]
 *
 * i.e. the product of row-major 2x2 matrices.
 *
 * @param a Left operand, 4 complex values. No alignment requirement.
 * @param b Right operand, 4 complex values. No alignment requirement.
 * @param c Output, 4 complex values. No alignment requirement. Both inputs
 *          are fully loaded before the store, so c may alias a or b.
 */
void matrixMultiplyAVX2b(const std::complex<float>* a,
                         const std::complex<float>* b, std::complex<float>* c) {
  // std::complex<float> may be accessed as float[2] {real, imag}; the inputs
  // are only read, so const is preserved instead of being cast away.
  const float* a_ptr = reinterpret_cast<const float*>(a);
  const float* b_ptr = reinterpret_cast<const float*>(b);
  float* c_ptr = reinterpret_cast<float*>(c);

  // Unaligned loads: the pointer types only guarantee the alignment of
  // std::complex<float>, not the 32 bytes that _mm256_load_ps requires.
  const __m256 a_m = _mm256_loadu_ps(a_ptr);
  const __m256 b_m = _mm256_loadu_ps(b_ptr);

  // Permutation indices, listed from lane 0 to lane 7. The lower four lanes
  // produce c[0], c[1]; the upper four lanes produce c[2], c[3].
  // Real parts of a[0] | a[2], and b[0], b[1] as stored.
  const __m256i a_1_ind = _mm256_setr_epi32(0, 0, 0, 0, 4, 4, 4, 4);
  const __m256i b_1_ind = _mm256_setr_epi32(0, 1, 2, 3, 0, 1, 2, 3);
  // Real parts of a[1] | a[3], and b[2], b[3] as stored.
  const __m256i a_2_ind = _mm256_setr_epi32(2, 2, 2, 2, 6, 6, 6, 6);
  const __m256i b_2_ind = _mm256_setr_epi32(4, 5, 6, 7, 4, 5, 6, 7);
  // Imaginary parts of a[0] | a[2], and b[0], b[1] with re/im swapped.
  const __m256i a_3_ind = _mm256_setr_epi32(1, 1, 1, 1, 5, 5, 5, 5);
  const __m256i b_3_ind = _mm256_setr_epi32(1, 0, 3, 2, 1, 0, 3, 2);
  // Imaginary parts of a[1] | a[3], and b[2], b[3] with re/im swapped.
  const __m256i a_4_ind = _mm256_setr_epi32(3, 3, 3, 3, 7, 7, 7, 7);
  const __m256i b_4_ind = _mm256_setr_epi32(5, 4, 7, 6, 5, 4, 7, 6);

  // Sign pattern for the imaginary-part products: negative in the real
  // (even) lanes because im*im contributes -1, positive in the odd lanes.
  const __m256 inv =
      _mm256_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f);

  const __m256 a_1 = _mm256_permutevar8x32_ps(a_m, a_1_ind);
  const __m256 b_1 = _mm256_permutevar8x32_ps(b_m, b_1_ind);
  const __m256 a_2 = _mm256_permutevar8x32_ps(a_m, a_2_ind);
  const __m256 b_2 = _mm256_permutevar8x32_ps(b_m, b_2_ind);
  const __m256 a_3 = _mm256_permutevar8x32_ps(a_m, a_3_ind);
  const __m256 a_3_inv = _mm256_mul_ps(a_3, inv);
  const __m256 b_3 = _mm256_permutevar8x32_ps(b_m, b_3_ind);
  const __m256 a_4 = _mm256_permutevar8x32_ps(a_m, a_4_ind);
  const __m256 a_4_inv = _mm256_mul_ps(a_4, inv);
  const __m256 b_4 = _mm256_permutevar8x32_ps(b_m, b_4_ind);

  // Accumulate the four partial products in a fixed order.
  __m256 c_m = _mm256_mul_ps(a_1, b_1);
  c_m = _mm256_fmadd_ps(a_2, b_2, c_m);
  c_m = _mm256_fmadd_ps(a_3_inv, b_3, c_m);
  c_m = _mm256_fmadd_ps(a_4_inv, b_4, c_m);

  // Unaligned store, for the same reason as the loads above.
  _mm256_storeu_ps(c_ptr, c_m);
}
\ No newline at end of file
......@@ -34,4 +34,10 @@ TEST_CASE("test complex matrix multiplication", "[float]") {
COMPARE_ARRAYS(C_expected, C, 1.e-6);
}
// Correctness check for matrixMultiplyAVX2b: the function is called on the
// raw data of A and B and writes into C, which is then compared with
// C_expected through COMPARE_ARRAYS using 1.e-6 as the tolerance argument.
// NOTE(review): C_expected is presumably the reference product set up earlier
// in this TEST_CASE -- confirm against the enclosing test body.
SECTION("test correctness of avx2b implementation") {
matrixMultiplyAVX2b(A.data(), B.data(), C.data());
COMPARE_ARRAYS(C_expected, C, 1.e-6);
}
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment