Skip to content
Snippets Groups Projects
Commit 78b92033 authored by Bram Veenboer, committed by Bram Veenboer
Browse files

Simplify using _mm256_permute_ps

parent b25351c9
No related branches found
No related tags found
1 merge request: !4 "Add matrixMultiplyAVX2b"
...@@ -120,28 +120,19 @@ void matrixMultiplyAVX2b(const std::complex<float>* a, ...@@ -120,28 +120,19 @@ void matrixMultiplyAVX2b(const std::complex<float>* a,
__m256 a_m = _mm256_load_ps(a_ptr); __m256 a_m = _mm256_load_ps(a_ptr);
__m256 b_m = _mm256_load_ps(b_ptr); __m256 b_m = _mm256_load_ps(b_ptr);
__m256i a_1_ind = _mm256_set_epi32(4, 4, 4, 4, 0, 0, 0, 0); __m256 a_1 = _mm256_permute_ps(a_m, _MM_SHUFFLE(0, 0, 0, 0));
__m256i b_1_ind = _mm256_set_epi32(3, 2, 1, 0, 3, 2, 1, 0); __m256 a_2 = _mm256_permute_ps(a_m, _MM_SHUFFLE(2, 2, 2, 2));
__m256 a_3 = _mm256_permute_ps(a_m, _MM_SHUFFLE(1, 1, 1, 1));
__m256 a_4 = _mm256_permute_ps(a_m, _MM_SHUFFLE(3, 3, 3, 3));
__m256i a_2_ind = _mm256_set_epi32(6, 6, 6, 6, 2, 2, 2, 2); __m256i b_1_ind = _mm256_set_epi32(3, 2, 1, 0, 3, 2, 1, 0);
__m256i b_2_ind = _mm256_set_epi32(7, 6, 5, 4, 7, 6, 5, 4); __m256i b_2_ind = _mm256_set_epi32(7, 6, 5, 4, 7, 6, 5, 4);
__m256i a_3_ind = _mm256_set_epi32(5, 5, 5, 5, 1, 1, 1, 1);
__m256i b_3_ind = _mm256_set_epi32(2, 3, 0, 1, 2, 3, 0, 1); __m256i b_3_ind = _mm256_set_epi32(2, 3, 0, 1, 2, 3, 0, 1);
__m256i a_4_ind = _mm256_set_epi32(7, 7, 7, 7, 3, 3, 3, 3);
__m256i b_4_ind = _mm256_set_epi32(6, 7, 4, 5, 6, 7, 4, 5); __m256i b_4_ind = _mm256_set_epi32(6, 7, 4, 5, 6, 7, 4, 5);
__m256 a_1 = _mm256_permutevar8x32_ps(a_m, a_1_ind);
__m256 b_1 = _mm256_permutevar8x32_ps(b_m, b_1_ind); __m256 b_1 = _mm256_permutevar8x32_ps(b_m, b_1_ind);
__m256 a_2 = _mm256_permutevar8x32_ps(a_m, a_2_ind);
__m256 b_2 = _mm256_permutevar8x32_ps(b_m, b_2_ind); __m256 b_2 = _mm256_permutevar8x32_ps(b_m, b_2_ind);
__m256 a_3 = _mm256_permutevar8x32_ps(a_m, a_3_ind);
__m256 b_3 = _mm256_permutevar8x32_ps(b_m, b_3_ind); __m256 b_3 = _mm256_permutevar8x32_ps(b_m, b_3_ind);
__m256 a_4 = _mm256_permutevar8x32_ps(a_m, a_4_ind);
__m256 b_4 = _mm256_permutevar8x32_ps(b_m, b_4_ind); __m256 b_4 = _mm256_permutevar8x32_ps(b_m, b_4_ind);
__m256 c_m = _mm256_mul_ps(a_1, b_1); __m256 c_m = _mm256_mul_ps(a_1, b_1);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment