diff --git a/benchmark/predict.cpp b/benchmark/predict.cpp index 2b175ef17d23dd2b015f75d08c1f15b9588869df..978bb579fca81bfc4896bd789a6d3f8cfa59b1d1 100644 --- a/benchmark/predict.cpp +++ b/benchmark/predict.cpp @@ -62,6 +62,7 @@ BENCHMARK_REGISTER_F(PredictBenchmark, PointSource) nstations, nsources, {false, true}, + {false, true}, }) ->ArgNames({"stat", "chan", "fullstokes", "freqsmear"}) ->Unit(benchmark::kMillisecond); diff --git a/src/PredictPlanExec.cpp b/src/PredictPlanExec.cpp index 85d15dfc2f48339230b865444ce628c92b6510ca..daab337a167fd1a725ddee88475b0a011f509cdf 100644 --- a/src/PredictPlanExec.cpp +++ b/src/PredictPlanExec.cpp @@ -208,71 +208,83 @@ void PredictPlanExec::ProcessPolarizationComponentMultiSimd(E &buffer_expr, const uint_fast32_t simd_channels = (nchannels / 4) * 4; const uint_fast32_t total_vector_size = simd_channels * 4; + // std::vector<double> channel_accum(simd_channels, 0.0); + for (uint_fast32_t bl = 0; bl < baselines.size(); ++bl) { const size_t p = baselines[bl].first; const size_t q = baselines[bl].second; + double real_accum = 0.0, imag_accum = 0.0; + const auto &shift_data = GetShiftDataComplexSeperated(); // Cache reference + alignas(32) double r[4], i[4]; uint_fast32_t ch = 0; - for (; ch < simd_channels; ch += 4) { - // Load 4 values from shift_data for p and q - // __m256d x_p = _mm256_set_pd( - // shift_data(p, ch + 3).real(), shift_data(p, ch + 2).real(), - // shift_data(p, ch + 1).real(), shift_data(p, ch + 0).real()); - - // __m256d y_p = _mm256_set_pd( - // shift_data(p, ch + 3).imag(), shift_data(p, ch + 2).imag(), - // shift_data(p, ch + 1).imag(), shift_data(p, ch + 0).imag()); - - // __m256d x_q = _mm256_set_pd( - // shift_data(q, ch + 3).real(), shift_data(q, ch + 2).real(), - // shift_data(q, ch + 1).real(), shift_data(q, ch + 0).real()); - - // __m256d y_q = _mm256_set_pd( - // shift_data(q, ch + 3).imag(), shift_data(q, ch + 2).imag(), - // shift_data(q, ch + 1).imag(), shift_data(q, ch + 0).imag()); - - //
__m256d x_c = - // _mm256_set_pd(spectrum(ch + 3).real(), spectrum(ch + 2).real(), - // spectrum(ch + 1).real(), spectrum(ch + 0).real()); - - // __m256d y_c = - // _mm256_set_pd(spectrum(ch + 3).imag(), spectrum(ch + 2).imag(), - // spectrum(ch + 1).imag(), spectrum(ch + 0).imag()); - - __m256d x_p = _mm256_loadu_pd(&shift_data(0, p, ch + 0)); - __m256d y_p = _mm256_loadu_pd(&shift_data(1, p, ch + 0)); - __m256d x_q = _mm256_loadu_pd(&shift_data(0, q, ch + 0)); - __m256d y_q = _mm256_loadu_pd(&shift_data(1, q, ch + 0)); - __m256d x_c = _mm256_loadu_pd(&spectrum(0, ch + 0)); - __m256d y_c = _mm256_loadu_pd(&spectrum(1, ch + 0)); - - // Complex multiply (q_conj * p) * stokes - __m256d real_part = - _mm256_add_pd(_mm256_mul_pd(x_p, x_q), _mm256_mul_pd(y_p, y_q)); - __m256d imag_part = - _mm256_sub_pd(_mm256_mul_pd(x_p, y_q), _mm256_mul_pd(x_q, y_p)); - - __m256d temp_real = _mm256_sub_pd(_mm256_mul_pd(real_part, x_c), - _mm256_mul_pd(imag_part, y_c)); - __m256d temp_imag = _mm256_add_pd(_mm256_mul_pd(real_part, y_c), - _mm256_mul_pd(imag_part, x_c)); - if (correct_frequency_smearing) { + if (correct_frequency_smearing) { + for (; ch < simd_channels; ch += 4) { + // Load 4 values from shift_data for p and q + __m256d x_p = _mm256_loadu_pd(&shift_data(0, p, ch)); + __m256d y_p = _mm256_loadu_pd(&shift_data(1, p, ch)); + __m256d x_q = _mm256_loadu_pd(&shift_data(0, q, ch)); + __m256d y_q = _mm256_loadu_pd(&shift_data(1, q, ch)); + __m256d x_c = _mm256_loadu_pd(&spectrum(0, ch)); + __m256d y_c = _mm256_loadu_pd(&spectrum(1, ch)); __m256d smear_terms_temp = _mm256_loadu_pd(&smear_terms[ch]); + + // Complex multiply (q_conj * p) * stokes + __m256d real_part = + _mm256_add_pd(_mm256_mul_pd(x_p, x_q), _mm256_mul_pd(y_p, y_q)); + __m256d imag_part = + _mm256_sub_pd(_mm256_mul_pd(x_p, y_q), _mm256_mul_pd(x_q, y_p)); + + __m256d temp_real = _mm256_sub_pd(_mm256_mul_pd(real_part, x_c), + _mm256_mul_pd(imag_part, y_c)); + __m256d temp_imag = _mm256_add_pd(_mm256_mul_pd(real_part,
y_c), + _mm256_mul_pd(imag_part, x_c)); + temp_real = _mm256_mul_pd(temp_real, smear_terms_temp); temp_imag = _mm256_mul_pd(temp_imag, smear_terms_temp); - } - alignas(32) double r[4], i[4]; - _mm256_store_pd(r, temp_real); - _mm256_store_pd(i, temp_imag); + _mm256_store_pd(r, temp_real); + _mm256_store_pd(i, temp_imag); - for (uint_fast8_t k = 0; k < 4; ++k) { - buffer_expr(bl, ch + k) += dcomplex{r[k], i[k]}; + buffer_expr(bl, ch + 0) += dcomplex{r[0], i[0]}; + buffer_expr(bl, ch + 1) += dcomplex{r[1], i[1]}; + buffer_expr(bl, ch + 2) += dcomplex{r[2], i[2]}; + buffer_expr(bl, ch + 3) += dcomplex{r[3], i[3]}; + } + } else { + for (; ch < simd_channels; ch += 4) { + // Load 4 values from shift_data for p and q + __m256d x_p = _mm256_loadu_pd(&shift_data(0, p, ch)); + __m256d y_p = _mm256_loadu_pd(&shift_data(1, p, ch)); + __m256d x_q = _mm256_loadu_pd(&shift_data(0, q, ch)); + __m256d y_q = _mm256_loadu_pd(&shift_data(1, q, ch)); + __m256d x_c = _mm256_loadu_pd(&spectrum(0, ch)); + __m256d y_c = _mm256_loadu_pd(&spectrum(1, ch)); + + // Complex multiply (q_conj * p) * stokes + __m256d real_part = + _mm256_add_pd(_mm256_mul_pd(x_p, x_q), _mm256_mul_pd(y_p, y_q)); + __m256d imag_part = + _mm256_sub_pd(_mm256_mul_pd(x_p, y_q), _mm256_mul_pd(x_q, y_p)); + + __m256d temp_real = _mm256_sub_pd(_mm256_mul_pd(real_part, x_c), + _mm256_mul_pd(imag_part, y_c)); + __m256d temp_imag = _mm256_add_pd(_mm256_mul_pd(real_part, y_c), + _mm256_mul_pd(imag_part, x_c)); + + _mm256_store_pd(r, temp_real); + _mm256_store_pd(i, temp_imag); + + buffer_expr(bl, ch + 0) += dcomplex{r[0], i[0]}; + buffer_expr(bl, ch + 1) += dcomplex{r[1], i[1]}; + buffer_expr(bl, ch + 2) += dcomplex{r[2], i[2]}; + buffer_expr(bl, ch + 3) += dcomplex{r[3], i[3]}; } } + for (; ch < nchannels; ++ch) { const double x_p = (shift_data(0, p, ch)); const double y_p = (shift_data(1, p, ch));