diff --git a/benchmark/predict.cpp b/benchmark/predict.cpp
index 2b175ef17d23dd2b015f75d08c1f15b9588869df..978bb579fca81bfc4896bd789a6d3f8cfa59b1d1 100644
--- a/benchmark/predict.cpp
+++ b/benchmark/predict.cpp
@@ -62,6 +62,7 @@ BENCHMARK_REGISTER_F(PredictBenchmark, PointSource)
         nstations,
         nsources,
         {false, true},
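+        // "freqsmear": enable/disable the frequency-smearing correction.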
+        {false, true},
     })
     ->ArgNames({"stat", "chan", "fullstokes", "freqsmear"})
     ->Unit(benchmark::kMillisecond);
diff --git a/src/PredictPlanExec.cpp b/src/PredictPlanExec.cpp
index 85d15dfc2f48339230b865444ce628c92b6510ca..daab337a167fd1a725ddee88475b0a011f509cdf 100644
--- a/src/PredictPlanExec.cpp
+++ b/src/PredictPlanExec.cpp
@@ -208,71 +208,83 @@ void PredictPlanExec::ProcessPolarizationComponentMultiSimd(E &buffer_expr,
   const uint_fast32_t simd_channels = (nchannels / 4) * 4;
   const uint_fast32_t total_vector_size = simd_channels * 4;
 
   for (uint_fast32_t bl = 0; bl < baselines.size(); ++bl) {
     const size_t p = baselines[bl].first;
     const size_t q = baselines[bl].second;
 
+    double real_accum = 0.0, imag_accum = 0.0;
+
     const auto &shift_data = GetShiftDataComplexSeperated(); // Cache reference
 
+    alignas(32) double r[4], i[4];
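+    // r/i: scratch lanes for writing each 4-wide SIMD result back to buffer_expr.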
     uint_fast32_t ch = 0;
-    for (; ch < simd_channels; ch += 4) {
-      // Load 4 values from shift_data for p and q
-      // __m256d x_p = _mm256_set_pd(
-      //     shift_data(p, ch + 3).real(), shift_data(p, ch + 2).real(),
-      //     shift_data(p, ch + 1).real(), shift_data(p, ch + 0).real());
-
-      // __m256d y_p = _mm256_set_pd(
-      //     shift_data(p, ch + 3).imag(), shift_data(p, ch + 2).imag(),
-      //     shift_data(p, ch + 1).imag(), shift_data(p, ch + 0).imag());
-
-      // __m256d x_q = _mm256_set_pd(
-      //     shift_data(q, ch + 3).real(), shift_data(q, ch + 2).real(),
-      //     shift_data(q, ch + 1).real(), shift_data(q, ch + 0).real());
-
-      // __m256d y_q = _mm256_set_pd(
-      //     shift_data(q, ch + 3).imag(), shift_data(q, ch + 2).imag(),
-      //     shift_data(q, ch + 1).imag(), shift_data(q, ch + 0).imag());
-
-      // __m256d x_c =
-      //     _mm256_set_pd(spectrum(ch + 3).real(), spectrum(ch + 2).real(),
-      //                   spectrum(ch + 1).real(), spectrum(ch + 0).real());
-
-      // __m256d y_c =
-      //     _mm256_set_pd(spectrum(ch + 3).imag(), spectrum(ch + 2).imag(),
-      //                   spectrum(ch + 1).imag(), spectrum(ch + 0).imag());
-
-      __m256d x_p = _mm256_loadu_pd(&shift_data(0, p, ch + 0));
-      __m256d y_p = _mm256_loadu_pd(&shift_data(1, p, ch + 0));
-      __m256d x_q = _mm256_loadu_pd(&shift_data(0, q, ch + 0));
-      __m256d y_q = _mm256_loadu_pd(&shift_data(1, q, ch + 0));
-      __m256d x_c = _mm256_loadu_pd(&spectrum(0, ch + 0));
-      __m256d y_c = _mm256_loadu_pd(&spectrum(1, ch + 0));
-
-      // Complex multiply (q_conj * p) * stokes
-      __m256d real_part =
-          _mm256_add_pd(_mm256_mul_pd(x_p, x_q), _mm256_mul_pd(y_p, y_q));
-      __m256d imag_part =
-          _mm256_sub_pd(_mm256_mul_pd(x_p, y_q), _mm256_mul_pd(x_q, y_p));
-
-      __m256d temp_real = _mm256_sub_pd(_mm256_mul_pd(real_part, x_c),
-                                        _mm256_mul_pd(imag_part, y_c));
-      __m256d temp_imag = _mm256_add_pd(_mm256_mul_pd(real_part, y_c),
-                                        _mm256_mul_pd(imag_part, x_c));
 
-      if (correct_frequency_smearing) {
+    if (correct_frequency_smearing) {
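+      // The smearing check is hoisted out of the channel loop: the SIMD body
+      // is duplicated for the two cases so each stays branch-free per channel.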
+      for (; ch < simd_channels; ch += 4) {
+        // Load 4 channels of the station p/q phase shifts and the source spectrum
+        __m256d x_p = _mm256_loadu_pd(&shift_data(0, p, ch));
+        __m256d y_p = _mm256_loadu_pd(&shift_data(1, p, ch));
+        __m256d x_q = _mm256_loadu_pd(&shift_data(0, q, ch));
+        __m256d y_q = _mm256_loadu_pd(&shift_data(1, q, ch));
+        __m256d x_c = _mm256_loadu_pd(&spectrum(0, ch));
+        __m256d y_c = _mm256_loadu_pd(&spectrum(1, ch));
         __m256d smear_terms_temp = _mm256_loadu_pd(&smear_terms[ch]);
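+        // smear_terms_temp: 4 per-channel smearing factors, applied after the multiply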
+
+        // Complex multiply: conj(shift_p) * shift_q * spectrum (Stokes term)
+        __m256d real_part =
+            _mm256_add_pd(_mm256_mul_pd(x_p, x_q), _mm256_mul_pd(y_p, y_q));
+        __m256d imag_part =
+            _mm256_sub_pd(_mm256_mul_pd(x_p, y_q), _mm256_mul_pd(x_q, y_p));
+
+        __m256d temp_real = _mm256_sub_pd(_mm256_mul_pd(real_part, x_c),
+                                          _mm256_mul_pd(imag_part, y_c));
+        __m256d temp_imag = _mm256_add_pd(_mm256_mul_pd(real_part, y_c),
+                                          _mm256_mul_pd(imag_part, x_c));
+
         temp_real = _mm256_mul_pd(temp_real, smear_terms_temp);
         temp_imag = _mm256_mul_pd(temp_imag, smear_terms_temp);
-      }
 
-      alignas(32) double r[4], i[4];
-      _mm256_store_pd(r, temp_real);
-      _mm256_store_pd(i, temp_imag);
+        _mm256_store_pd(r, temp_real);
+        _mm256_store_pd(i, temp_imag);
 
-      for (uint_fast8_t k = 0; k < 4; ++k) {
-        buffer_expr(bl, ch + k) += dcomplex{r[k], i[k]};
+        buffer_expr(bl, ch + 0) += dcomplex{r[0], i[0]};
+        buffer_expr(bl, ch + 1) += dcomplex{r[1], i[1]};
+        buffer_expr(bl, ch + 2) += dcomplex{r[2], i[2]};
+        buffer_expr(bl, ch + 3) += dcomplex{r[3], i[3]};
+      }
+    } else {
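+      // Same SIMD kernel as above, minus the per-channel smearing multiply.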
+      for (; ch < simd_channels; ch += 4) {
+        // Load 4 channels of the station p/q phase shifts and the source spectrum
+        __m256d x_p = _mm256_loadu_pd(&shift_data(0, p, ch));
+        __m256d y_p = _mm256_loadu_pd(&shift_data(1, p, ch));
+        __m256d x_q = _mm256_loadu_pd(&shift_data(0, q, ch));
+        __m256d y_q = _mm256_loadu_pd(&shift_data(1, q, ch));
+        __m256d x_c = _mm256_loadu_pd(&spectrum(0, ch));
+        __m256d y_c = _mm256_loadu_pd(&spectrum(1, ch));
+
+        // Complex multiply: conj(shift_p) * shift_q * spectrum (Stokes term)
+        __m256d real_part =
+            _mm256_add_pd(_mm256_mul_pd(x_p, x_q), _mm256_mul_pd(y_p, y_q));
+        __m256d imag_part =
+            _mm256_sub_pd(_mm256_mul_pd(x_p, y_q), _mm256_mul_pd(x_q, y_p));
+
+        __m256d temp_real = _mm256_sub_pd(_mm256_mul_pd(real_part, x_c),
+                                          _mm256_mul_pd(imag_part, y_c));
+        __m256d temp_imag = _mm256_add_pd(_mm256_mul_pd(real_part, y_c),
+                                          _mm256_mul_pd(imag_part, x_c));
+
+        _mm256_store_pd(r, temp_real);
+        _mm256_store_pd(i, temp_imag);
+
+        buffer_expr(bl, ch + 0) += dcomplex{r[0], i[0]};
+        buffer_expr(bl, ch + 1) += dcomplex{r[1], i[1]};
+        buffer_expr(bl, ch + 2) += dcomplex{r[2], i[2]};
+        buffer_expr(bl, ch + 3) += dcomplex{r[3], i[3]};
       }
     }
+
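+    // Scalar tail for the channels left over when nchannels is not a multiple of 4.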
     for (; ch < nchannels; ++ch) {
       const double x_p = (shift_data(0, p, ch));
       const double y_p = (shift_data(1, p, ch));