From a68a6730eb33fbfdbcf723dda17d399a95c25cc1 Mon Sep 17 00:00:00 2001
From: Mark de Wever <mark.dewever@stcorp.nl>
Date: Fri, 4 Nov 2022 12:48:28 +0000
Subject: [PATCH] RAP-100 Reduce memory usage scalar solver

---
 ddecal/gain_solvers/DiagonalSolver.cc  | 79 ++++++++++++++---------
 ddecal/gain_solvers/DiagonalSolver.h   | 11 ++++
 ddecal/gain_solvers/FullJonesSolver.cc | 80 ++++++++++++++---------
 ddecal/gain_solvers/FullJonesSolver.h  | 10 +++
 ddecal/gain_solvers/ScalarSolver.cc    | 88 +++++++++++++++-----------
 ddecal/gain_solvers/ScalarSolver.h     | 10 +++
 6 files changed, 180 insertions(+), 98 deletions(-)

diff --git a/ddecal/gain_solvers/DiagonalSolver.cc b/ddecal/gain_solvers/DiagonalSolver.cc
index b55ca5e95..9144b1fbc 100644
--- a/ddecal/gain_solvers/DiagonalSolver.cc
+++ b/ddecal/gain_solvers/DiagonalSolver.cc
@@ -23,33 +23,11 @@ DiagonalSolver::SolveResult DiagonalSolver::Solve(
   assert(solutions.size() == NChannelBlocks());
 
   PrepareConstraints();
-
-  std::vector<std::vector<DComplex>> next_solutions(NChannelBlocks());
-
   SolveResult result;
 
-  // Model matrix 2 x ant x [2N x D] and visibility vector 2 x ant x [2N],
-  // for each channelblock. The following loop allocates all structures
-  std::vector<std::vector<Matrix>> g_times_cs(NChannelBlocks());
-  std::vector<std::vector<std::vector<Complex>>> vs(NChannelBlocks());
-  for (size_t ch_block = 0; ch_block != NChannelBlocks(); ++ch_block) {
-    const SolveData::ChannelBlockData& channelBlock =
-        data.ChannelBlock(ch_block);
-    next_solutions[ch_block].resize(NDirections() * NAntennas() * 2);
-    g_times_cs[ch_block].reserve(NAntennas() * 2);
-    vs[ch_block].reserve(NAntennas() * 2);
-
-    for (size_t ant = 0; ant != NAntennas(); ++ant) {
-      // Model matrix [(2N) x D] and visibility vector [2N]
-      const size_t n_visibilities = channelBlock.NAntennaVisibilities(ant);
-      const size_t m = 2 * n_visibilities;
-      const size_t n = NDirections();
-      for (size_t p = 0; p != 2; ++p) {
-        g_times_cs[ch_block].emplace_back(m, n);
-        vs[ch_block].emplace_back(std::max(m, n));
-      }
-    }
-  }
+  std::vector<std::vector<DComplex>> next_solutions(NChannelBlocks());
+  for (std::vector<DComplex>& next_solution : next_solutions)
+    next_solution.resize(NDirections() * NAntennas() * 2);
 
   ///
   /// Start iterating
@@ -64,17 +42,29 @@ DiagonalSolver::SolveResult DiagonalSolver::Solve(
 
   double avg_squared_diff = 1.0E4;
 
+  // Using more threads wastes CPU and memory resources.
+  const size_t n_threads = std::min(NChannelBlocks(), GetNThreads());
+  // For each thread:
+  // - Model matrix 2 x ant x [2N x D]
+  // - Visibility vector 2 x ant x [2N]
+  std::vector<std::vector<Matrix>> thread_g_times_cs(n_threads);
+  std::vector<std::vector<std::vector<Complex>>> thread_vs(n_threads);
+
+  aocommon::ParallelFor<size_t> loop(n_threads);
   do {
     MakeSolutionsFinite2Pol(solutions);
 
-    ParallelFor<size_t> loop(GetNThreads());
     loop.Run(0, NChannelBlocks(),
              [&](size_t ch_block, [[maybe_unused]] size_t thread) {
-               const SolveData::ChannelBlockData& channelBlock =
+               const SolveData::ChannelBlockData& channel_block =
                    data.ChannelBlock(ch_block);
-               PerformIteration(channelBlock, g_times_cs[ch_block],
-                                vs[ch_block], solutions[ch_block],
-                                next_solutions[ch_block]);
+
+               std::vector<Matrix>& g_times_cs = thread_g_times_cs[thread];
+               std::vector<std::vector<Complex>>& vs = thread_vs[thread];
+               InitializeModelMatrix(channel_block, g_times_cs, vs);
+
+               PerformIteration(channel_block, g_times_cs, vs,
+                                solutions[ch_block], next_solutions[ch_block]);
              });
 
     Step(solutions, next_solutions);
@@ -204,5 +194,34 @@ void DiagonalSolver::PerformIteration(
   }
 }
 
+void DiagonalSolver::InitializeModelMatrix(
+    const SolveData::ChannelBlockData& channel_block_data,
+    std::vector<Matrix>& g_times_cs,
+    std::vector<std::vector<Complex>>& vs) const {
+  assert(g_times_cs.empty() == vs.empty());
+  if (g_times_cs.empty()) {
+    // Executed the first iteration only.
+    g_times_cs.reserve(NAntennas() * 2);
+    vs.reserve(NAntennas() * 2);
+  } else {
+    // Note clear() does not modify the capacity.
+    // See https://en.cppreference.com/w/cpp/container/vector/clear
+    g_times_cs.clear();
+    vs.clear();
+  }
+
+  // Update the size of the model matrix and initialize them to zero.
+  for (size_t ant = 0; ant != NAntennas(); ++ant) {
+    // Model matrix [(2N) x D] and visibility vector [2N]
+    const size_t n_visibilities = channel_block_data.NAntennaVisibilities(ant);
+    const size_t m = n_visibilities * 2;
+    const size_t n = NDirections();
+    for (size_t p = 0; p != 2; ++p) {
+      g_times_cs.emplace_back(m, n);
+      vs.emplace_back(std::max(m, n));
+    }
+  }
+}
+
 }  // namespace ddecal
 }  // namespace dp3
diff --git a/ddecal/gain_solvers/DiagonalSolver.h b/ddecal/gain_solvers/DiagonalSolver.h
index dc4a78780..2b00188af 100644
--- a/ddecal/gain_solvers/DiagonalSolver.h
+++ b/ddecal/gain_solvers/DiagonalSolver.h
@@ -26,6 +26,17 @@ class DiagonalSolver final : public SolverBase {
                         std::vector<std::vector<Complex>>& vs,
                         const std::vector<DComplex>& solutions,
                         std::vector<DComplex>& next_solutions);
+
+  /**
+   * Initialize the model matrix for a channel block.
+   *
+   * The number of elements in the model matrix depends on the number of
+   * antenna visibilities in the corresponding channel block.
+   */
+  void InitializeModelMatrix(
+      const SolveData::ChannelBlockData& channel_block_data,
+      std::vector<Matrix>& g_times_cs,
+      std::vector<std::vector<Complex>>& vs) const;
 };
 
 }  // namespace ddecal
diff --git a/ddecal/gain_solvers/FullJonesSolver.cc b/ddecal/gain_solvers/FullJonesSolver.cc
index 660754f0b..b4f316b41 100644
--- a/ddecal/gain_solvers/FullJonesSolver.cc
+++ b/ddecal/gain_solvers/FullJonesSolver.cc
@@ -51,33 +51,11 @@ FullJonesSolver::SolveResult FullJonesSolver::Solve(
   assert(solutions.size() == NChannelBlocks());
 
   PrepareConstraints();
-
-  std::vector<std::vector<DComplex>> next_solutions(NChannelBlocks());
-
   SolveResult result;
 
-  // Dimensions for each channelblock:
-  // - Model matrix: n_antennas x [2N x 2D]
-  // - Visibility matrix: n_antennas x [2N x 2]
-  // The following loop allocates all structures:
-  std::vector<std::vector<Matrix>> g_times_cs(NChannelBlocks());
-  std::vector<std::vector<Matrix>> vs(NChannelBlocks());
-  for (size_t ch_block = 0; ch_block != NChannelBlocks(); ++ch_block) {
-    const SolveData::ChannelBlockData& channel_block_data =
-        data.ChannelBlock(ch_block);
+  std::vector<std::vector<DComplex>> next_solutions(NChannelBlocks());
+  for (size_t ch_block = 0; ch_block != NChannelBlocks(); ++ch_block)
     next_solutions[ch_block].resize(NDirections() * NAntennas() * 4);
-    g_times_cs[ch_block].reserve(NAntennas());
-    vs[ch_block].reserve(NAntennas());
-
-    for (size_t ant = 0; ant != NAntennas(); ++ant) {
-      // Model matrix [2N x 2D] and visibility matrix [2N x 2]
-      const size_t m = channel_block_data.NAntennaVisibilities(ant) * 2;
-      const size_t n = NDirections() * 2;
-      const size_t n_rhs = 2;
-      g_times_cs[ch_block].emplace_back(m, n);
-      vs[ch_block].emplace_back(std::max(m, n), n_rhs);
-    }
-  }
 
   ///
   /// Start iterating
@@ -90,16 +68,29 @@ FullJonesSolver::SolveResult FullJonesSolver::Solve(
   std::vector<double> step_magnitudes;
   step_magnitudes.reserve(GetMaxIterations());
 
+  // Using more threads wastes CPU and memory resources.
+  const size_t n_threads = std::min(NChannelBlocks(), GetNThreads());
+  // For each thread:
+  // - Model matrix: n_antennas x [2N x 2D]
+  // - Visibility matrix: n_antennas x [2N x 2]
+  std::vector<std::vector<Matrix>> thread_g_times_cs(n_threads);
+  std::vector<std::vector<Matrix>> thread_vs(n_threads);
+
+  aocommon::ParallelFor<size_t> loop(GetNThreads());
   do {
     MakeSolutionsFinite4Pol(solutions);
 
-    aocommon::ParallelFor<size_t> loop(GetNThreads());
-    loop.Run(0, NChannelBlocks(),
-             [&](size_t ch_block, [[maybe_unused]] size_t thread) {
-               PerformIteration(data.ChannelBlock(ch_block),
-                                g_times_cs[ch_block], vs[ch_block],
-                                solutions[ch_block], next_solutions[ch_block]);
-             });
+    loop.Run(0, NChannelBlocks(), [&](size_t ch_block, size_t thread) {
+      const SolveData::ChannelBlockData& channel_block =
+          data.ChannelBlock(ch_block);
+
+      std::vector<Matrix>& g_times_cs = thread_g_times_cs[thread];
+      std::vector<Matrix>& vs = thread_vs[thread];
+      InitializeModelMatrix(channel_block, g_times_cs, vs);
+
+      PerformIteration(channel_block, g_times_cs, vs, solutions[ch_block],
+                       next_solutions[ch_block]);
+    });
 
     Step(solutions, next_solutions);
 
@@ -257,5 +248,32 @@ void FullJonesSolver::PerformIteration(
   }
 }
 
+void FullJonesSolver::InitializeModelMatrix(
+    const SolveData::ChannelBlockData& channel_block_data,
+    std::vector<Matrix>& g_times_cs, std::vector<Matrix>& vs) const {
+  assert(g_times_cs.empty() == vs.empty());
+  if (g_times_cs.empty()) {
+    // Executed the first iteration only.
+    g_times_cs.reserve(NAntennas());
+    vs.reserve(NAntennas());
+  } else {
+    // Note clear() does not modify the capacity.
+    // See https://en.cppreference.com/w/cpp/container/vector/clear
+    g_times_cs.clear();
+    vs.clear();
+  }
+
+  // Update the size of the model matrix and initialize them to zero.
+  for (size_t ant = 0; ant != NAntennas(); ++ant) {
+    // Model matrix [2N x 2D] and visibility matrix [2N x 2]
+    const size_t n_visibilities = channel_block_data.NAntennaVisibilities(ant);
+    const size_t m = n_visibilities * 2;
+    const size_t n = NDirections() * 2;
+    const size_t n_rhs = 2;
+    g_times_cs.emplace_back(m, n);
+    vs.emplace_back(std::max(m, n), n_rhs);
+  }
+}
+
 }  // namespace ddecal
 }  // namespace dp3
diff --git a/ddecal/gain_solvers/FullJonesSolver.h b/ddecal/gain_solvers/FullJonesSolver.h
index aecf1038b..70b1e9d2a 100644
--- a/ddecal/gain_solvers/FullJonesSolver.h
+++ b/ddecal/gain_solvers/FullJonesSolver.h
@@ -24,6 +24,16 @@ class FullJonesSolver final : public SolverBase {
                         std::vector<Matrix>& vs,
                         const std::vector<DComplex>& solutions,
                         std::vector<DComplex>& next_solutions);
+
+  /**
+   * Initialize the model matrix for a channel block.
+   *
+   * The number of elements in the model matrix depends on the number of
+   * antenna visibilities in the corresponding channel block.
+   */
+  void InitializeModelMatrix(
+      const SolveData::ChannelBlockData& channel_block_data,
+      std::vector<Matrix>& g_times_cs, std::vector<Matrix>& vs) const;
 };
 
 }  // namespace ddecal
diff --git a/ddecal/gain_solvers/ScalarSolver.cc b/ddecal/gain_solvers/ScalarSolver.cc
index 7c57f7ffa..1a8c8ba9f 100644
--- a/ddecal/gain_solvers/ScalarSolver.cc
+++ b/ddecal/gain_solvers/ScalarSolver.cc
@@ -20,33 +20,11 @@ ScalarSolver::SolveResult ScalarSolver::Solve(
   assert(solutions.size() == NChannelBlocks());
 
   PrepareConstraints();
-
-  std::vector<std::vector<DComplex>> next_solutions(NChannelBlocks());
-
   SolveResult result;
 
-  // Model matrix ant x [N x D] and visibility vector ant x [N x 1],
-  // for each channelblock
-  // The following loop allocates all structures
-  std::vector<std::vector<Matrix>> g_times_cs(NChannelBlocks());
-  std::vector<std::vector<Matrix>> vs(NChannelBlocks());
-  for (size_t chBlock = 0; chBlock != NChannelBlocks(); ++chBlock) {
-    const SolveData::ChannelBlockData& channelBlock =
-        data.ChannelBlock(chBlock);
-    next_solutions[chBlock].resize(NDirections() * NAntennas());
-    g_times_cs[chBlock].reserve(NAntennas());
-    vs[chBlock].reserve(NAntennas());
-
-    for (size_t ant = 0; ant != NAntennas(); ++ant) {
-      // Model matrix [N x D] and visibility vector [N x 1]
-      const size_t n_visibilities = channelBlock.NAntennaVisibilities(ant);
-      const size_t m = n_visibilities * 4;
-      const size_t n = NDirections();
-      const size_t nrhs = 1;
-      g_times_cs[chBlock].emplace_back(m, n);
-      vs[chBlock].emplace_back(std::max(m, n), nrhs);
-    }
-  }
+  std::vector<std::vector<DComplex>> next_solutions(NChannelBlocks());
+  for (std::vector<DComplex>& next_solution : next_solutions)
+    next_solution.resize(NDirections() * NAntennas());
 
   ///
   /// Start iterating
@@ -61,17 +39,29 @@ ScalarSolver::SolveResult ScalarSolver::Solve(
 
   double avg_squared_diff = 1.0E4;
 
+  // Using more threads wastes CPU and memory resources.
+  const size_t n_threads = std::min(NChannelBlocks(), GetNThreads());
+  // For each thread:
+  // - Model matrix ant x [N x D] and
+  // - Visibility vector ant x [N x 1]
+  std::vector<std::vector<Matrix>> thread_g_times_cs(n_threads);
+  std::vector<std::vector<Matrix>> thread_vs(n_threads);
+
+  aocommon::ParallelFor<size_t> loop(n_threads);
   do {
     MakeSolutionsFinite1Pol(solutions);
 
-    aocommon::ParallelFor<size_t> loop(GetNThreads());
-    loop.Run(0, NChannelBlocks(),
-             [&](size_t chBlock, [[maybe_unused]] size_t thread) {
-               const SolveData::ChannelBlockData& channelBlock =
-                   data.ChannelBlock(chBlock);
-               PerformIteration(channelBlock, g_times_cs[chBlock], vs[chBlock],
-                                solutions[chBlock], next_solutions[chBlock]);
-             });
+    loop.Run(0, NChannelBlocks(), [&](size_t ch_block, size_t thread) {
+      const SolveData::ChannelBlockData& channel_block =
+          data.ChannelBlock(ch_block);
+
+      std::vector<Matrix>& g_times_cs = thread_g_times_cs[thread];
+      std::vector<Matrix>& vs = thread_vs[thread];
+      InitializeModelMatrix(channel_block, g_times_cs, vs);
+
+      PerformIteration(channel_block, g_times_cs, vs, solutions[ch_block],
+                       next_solutions[ch_block]);
+    });
 
     Step(solutions, next_solutions);
 
@@ -112,10 +102,6 @@ void ScalarSolver::PerformIteration(const SolveData::ChannelBlockData& cb_data,
                                     std::vector<Matrix>& vs,
                                     const std::vector<DComplex>& solutions,
                                     std::vector<DComplex>& next_solutions) {
-  for (size_t ant = 0; ant != NAntennas(); ++ant) {
-    g_times_cs[ant].SetZero();
-    vs[ant].SetZero();
-  }
   const size_t n_visibilities = cb_data.NVisibilities();
   const size_t p1_to_p2[4] = {0, 2, 1, 3};
 
@@ -196,5 +182,33 @@ void ScalarSolver::PerformIteration(const SolveData::ChannelBlockData& cb_data,
   }
 }
 
+void ScalarSolver::InitializeModelMatrix(
+    const SolveData::ChannelBlockData& channel_block_data,
+    std::vector<Matrix>& g_times_cs, std::vector<Matrix>& vs) const {
+  assert(g_times_cs.empty() == vs.empty());
+  if (g_times_cs.empty()) {
+    // Executed the first iteration only.
+    g_times_cs.reserve(NAntennas());
+    vs.reserve(NAntennas());
+  } else {
+    // Note clear() does not modify the capacity.
+    // See https://en.cppreference.com/w/cpp/container/vector/clear
+    g_times_cs.clear();
+    vs.clear();
+  }
+
+  // Update the size of the model matrix and initialize them to zero.
+  for (size_t ant = 0; ant != NAntennas(); ++ant) {
+    // Model matrix [N x D] and visibility vector [N x 1]
+    const size_t n_visibilities = channel_block_data.NAntennaVisibilities(ant);
+    const size_t m = n_visibilities * 4;
+    const size_t n = NDirections();
+    const size_t nrhs = 1;
+
+    g_times_cs.emplace_back(m, n);
+    vs.emplace_back(std::max(m, n), nrhs);
+  }
+}
+
 }  // namespace ddecal
 }  // namespace dp3
diff --git a/ddecal/gain_solvers/ScalarSolver.h b/ddecal/gain_solvers/ScalarSolver.h
index 62fba9e10..9c48015ae 100644
--- a/ddecal/gain_solvers/ScalarSolver.h
+++ b/ddecal/gain_solvers/ScalarSolver.h
@@ -24,6 +24,16 @@ class ScalarSolver final : public SolverBase {
                         std::vector<Matrix>& vs,
                         const std::vector<DComplex>& solutions,
                         std::vector<DComplex>& next_solutions);
+
+  /**
+   * Initialize the model matrix for a channel block.
+   *
+   * The number of elements in the model matrix depends on the number of
+   * antenna visibilities in the corresponding channel block.
+   */
+  void InitializeModelMatrix(
+      const SolveData::ChannelBlockData& channel_block_data,
+      std::vector<Matrix>& g_times_cs, std::vector<Matrix>& vs) const;
 };
 
 }  // namespace ddecal
-- 
GitLab