Skip to content
Snippets Groups Projects
Commit e1ccfe57 authored by Wiebe van Breukelen's avatar Wiebe van Breukelen
Browse files

Hoist conditional branching in hotspot

parent f5df38ce
Branches
No related tags found
2 merge requests!33Draft: GPU smearterms,!30Add SIMD-based optimizations and refactored PredictPlanExecCPU
Pipeline #119164 failed
......@@ -62,6 +62,7 @@ BENCHMARK_REGISTER_F(PredictBenchmark, PointSource)
nstations,
nsources,
{false, true},
{false, true},
})
->ArgNames({"stat", "chan", "fullstokes", "freqsmear"})
->Unit(benchmark::kMillisecond);
......
......@@ -208,45 +208,61 @@ void PredictPlanExec::ProcessPolarizationComponentMultiSimd(E &buffer_expr,
const uint_fast32_t simd_channels = (nchannels / 4) * 4;
const uint_fast32_t total_vector_size = simd_channels * 4;
// std::vector<double> channel_accum(simd_channels, 0.0);
for (uint_fast32_t bl = 0; bl < baselines.size(); ++bl) {
const size_t p = baselines[bl].first;
const size_t q = baselines[bl].second;
double real_accum = 0.0, imag_accum = 0.0;
const auto &shift_data = GetShiftDataComplexSeperated(); // Cache reference
alignas(32) double r[4], i[4];
uint_fast32_t ch = 0;
if (correct_frequency_smearing) {
for (; ch < simd_channels; ch += 4) {
// Load 4 values from shift_data for p and q
// __m256d x_p = _mm256_set_pd(
// shift_data(p, ch + 3).real(), shift_data(p, ch + 2).real(),
// shift_data(p, ch + 1).real(), shift_data(p, ch + 0).real());
// __m256d y_p = _mm256_set_pd(
// shift_data(p, ch + 3).imag(), shift_data(p, ch + 2).imag(),
// shift_data(p, ch + 1).imag(), shift_data(p, ch + 0).imag());
__m256d x_p = _mm256_loadu_pd(&shift_data(0, p, ch));
__m256d y_p = _mm256_loadu_pd(&shift_data(1, p, ch));
__m256d x_q = _mm256_loadu_pd(&shift_data(0, q, ch));
__m256d y_q = _mm256_loadu_pd(&shift_data(1, q, ch));
__m256d x_c = _mm256_loadu_pd(&spectrum(0, ch));
__m256d y_c = _mm256_loadu_pd(&spectrum(1, ch));
__m256d smear_terms_temp = _mm256_loadu_pd(&smear_terms[ch]);
// __m256d x_q = _mm256_set_pd(
// shift_data(q, ch + 3).real(), shift_data(q, ch + 2).real(),
// shift_data(q, ch + 1).real(), shift_data(q, ch + 0).real());
// Complex multiply (q_conj * p) * stokes
__m256d real_part =
_mm256_add_pd(_mm256_mul_pd(x_p, x_q), _mm256_mul_pd(y_p, y_q));
__m256d imag_part =
_mm256_sub_pd(_mm256_mul_pd(x_p, y_q), _mm256_mul_pd(x_q, y_p));
// __m256d y_q = _mm256_set_pd(
// shift_data(q, ch + 3).imag(), shift_data(q, ch + 2).imag(),
// shift_data(q, ch + 1).imag(), shift_data(q, ch + 0).imag());
__m256d temp_real = _mm256_sub_pd(_mm256_mul_pd(real_part, x_c),
_mm256_mul_pd(imag_part, y_c));
__m256d temp_imag = _mm256_add_pd(_mm256_mul_pd(real_part, y_c),
_mm256_mul_pd(imag_part, x_c));
// __m256d x_c =
// _mm256_set_pd(spectrum(ch + 3).real(), spectrum(ch + 2).real(),
// spectrum(ch + 1).real(), spectrum(ch + 0).real());
temp_real = _mm256_mul_pd(temp_real, smear_terms_temp);
temp_imag = _mm256_mul_pd(temp_imag, smear_terms_temp);
// __m256d y_c =
// _mm256_set_pd(spectrum(ch + 3).imag(), spectrum(ch + 2).imag(),
// spectrum(ch + 1).imag(), spectrum(ch + 0).imag());
_mm256_store_pd(r, temp_real);
_mm256_store_pd(i, temp_imag);
__m256d x_p = _mm256_loadu_pd(&shift_data(0, p, ch + 0));
__m256d y_p = _mm256_loadu_pd(&shift_data(1, p, ch + 0));
__m256d x_q = _mm256_loadu_pd(&shift_data(0, q, ch + 0));
__m256d y_q = _mm256_loadu_pd(&shift_data(1, q, ch + 0));
__m256d x_c = _mm256_loadu_pd(&spectrum(0, ch + 0));
__m256d y_c = _mm256_loadu_pd(&spectrum(1, ch + 0));
buffer_expr(bl, ch + 0) += dcomplex{r[0], i[0]};
buffer_expr(bl, ch + 1) += dcomplex{r[1], i[1]};
buffer_expr(bl, ch + 2) += dcomplex{r[2], i[2]};
buffer_expr(bl, ch + 3) += dcomplex{r[3], i[3]};
}
} else {
for (; ch < simd_channels; ch += 4) {
// Load 4 values from shift_data for p and q
__m256d x_p = _mm256_loadu_pd(&shift_data(0, p, ch));
__m256d y_p = _mm256_loadu_pd(&shift_data(1, p, ch));
__m256d x_q = _mm256_loadu_pd(&shift_data(0, q, ch));
__m256d y_q = _mm256_loadu_pd(&shift_data(1, q, ch));
__m256d x_c = _mm256_loadu_pd(&spectrum(0, ch));
__m256d y_c = _mm256_loadu_pd(&spectrum(1, ch));
// Complex multiply (q_conj * p) * stokes
__m256d real_part =
......@@ -259,20 +275,16 @@ void PredictPlanExec::ProcessPolarizationComponentMultiSimd(E &buffer_expr,
__m256d temp_imag = _mm256_add_pd(_mm256_mul_pd(real_part, y_c),
_mm256_mul_pd(imag_part, x_c));
if (correct_frequency_smearing) {
__m256d smear_terms_temp = _mm256_loadu_pd(&smear_terms[ch]);
temp_real = _mm256_mul_pd(temp_real, smear_terms_temp);
temp_imag = _mm256_mul_pd(temp_imag, smear_terms_temp);
}
alignas(32) double r[4], i[4];
_mm256_store_pd(r, temp_real);
_mm256_store_pd(i, temp_imag);
for (uint_fast8_t k = 0; k < 4; ++k) {
buffer_expr(bl, ch + k) += dcomplex{r[k], i[k]};
buffer_expr(bl, ch + 0) += dcomplex{r[0], i[0]};
buffer_expr(bl, ch + 1) += dcomplex{r[1], i[1]};
buffer_expr(bl, ch + 2) += dcomplex{r[2], i[2]};
buffer_expr(bl, ch + 3) += dcomplex{r[3], i[3]};
}
}
for (; ch < nchannels; ++ch) {
const double x_p = (shift_data(0, p, ch));
const double y_p = (shift_data(1, p, ch));
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment