From fbbf87bbbc34f387be9f7dceccf8da55479d3883 Mon Sep 17 00:00:00 2001
From: Auke Klazema <klazema@astron.nl>
Date: Mon, 21 Jan 2019 09:32:12 +0000
Subject: [PATCH] Task SW-581: Cherry Picked revision 40995 from trunk to
 release to have the changes in the cobalt_functions.sh in production

---
 .../GPUProc/src/scripts/cobalt_functions.sh   | 64 +++++++++++++------
 1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh b/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh
index f570a5b3d81..bc4981a6738 100755
--- a/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh
+++ b/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh
@@ -54,29 +54,57 @@ function read_cluster_model {
       HEADNODE=head.cep4.control.lofar
       SLURM_PARTITION=cpu
       SLURM_RESERVATION=cobalt
-      # Get the nodes in the cobalt reservation. The reservation must be active.
-      RESVNODES=$(ssh $HEADNODE scontrol show res -o $SLURM_RESERVATION  | perl -n -e 'm/Nodes=(.*?) .*State=ACTIVE/ ? print STDOUT $1 : die "No active cobalt reservation found"')
-      if [ $? -eq 0 ]; then
-        echo "Active reservation '$SLURM_RESERVATION' found, get online nodes in the reservation"
-        SINFO_FLAGS="--responding --states=idle,mixed,alloc,reserved -n $RESVNODES"
-      else 
-        echo "WARNING: No reservation '$SLURM_RESERVATION' found, defaulting to all online nodes in partition '$SLURM_PARTITION'"
-        SINFO_FLAGS="--responding --states=idle,mixed,alloc --partition=$SLURM_PARTITION"
+      RESVCACHE=$LOFARROOT/var/run/slurmresv.cache
+      COMPCACHE=$LOFARROOT/var/run/compnodes.cache
+
+      # Get the reserved CEP4 nodes for output writing. Try three methods in order of precedence:
+      # 1. Get nodes from the cobalt slurm reservation (must have state ACTIVE)
+      # 2. Read the node list from the cache file $RESVCACHE (written on the last successful query)
+      # 3. Default to the fixed node set cpu[40-44]
+      echo "Reading the slurm '$SLURM_RESERVATION' reservation.."
+      RESVNODES=$(ssh $HEADNODE scontrol show res -o $SLURM_RESERVATION  | \
+                  perl -n -e 'm/Nodes=(.*?) .*State=ACTIVE/ ? print STDOUT $1 : die "WARNING: No active reservation found\n"')
+      if [ -n "$RESVNODES" ]; then
+        # save in cache
+        cat <<-CAT > $RESVCACHE 
+		echo "Cache created at $(date)"
+		RESVNODES="$RESVNODES"
+	CAT
+      elif [ -s $RESVCACHE ]; then
+        echo "Reading the cache file '$RESVCACHE'"
+        source $RESVCACHE
+      else
+        echo "WARNING: No reserved nodes and no cache file found, using defaults"
+        RESVNODES="cpu[40-44]"
       fi
+      echo "Reserved nodes: $RESVNODES"
+
+      # Check which reserved nodes are online. Try three methods in order of precedence:
+      # 1. Query slurm for the node status (sinfo), excluding draining nodes
+      # 2. Read the node list from the cache file $COMPCACHE (written on the last successful query)
+      # 3. Default to the fixed node set cpu40..cpu44
+      echo "Checking online status"
+      SINFO_FLAGS="--responding --states=idle,mixed,alloc,reserved -n $RESVNODES"
       COMPUTENODES="$(ssh $HEADNODE sinfo --format=%n.cep4.infiniband.lofar,%T --noheader --sort=N $SINFO_FLAGS | fgrep -v ,draining | cut -f1 -d,)"
-      # OLD COMPUTENODES="`ssh $HEADNODE sinfo --responding --states=idle,mixed,alloc --format=%n.cep4.infiniband.lofar,%T --noheader --partition=$SLURM_PARTITION --sort=N | fgrep -v ,draining | cut -f1 -d,`"
-      if [ -z "$COMPUTENODES" ]; then
-        echo "ERROR: Could not obtain list of available CEP4 nodes. Defaulting to all."
-        COMPUTENODES="`seq -f "cpu%02.0f.cep4.infiniband.lofar" 1 47`"
+      if [ -n "$COMPUTENODES" ]; then
+        # save in cache
+        cat <<-CAT > $COMPCACHE 
+		echo "Cache created at $(date)"
+		COMPUTENODES="$COMPUTENODES"
+	CAT
+      elif [ -s $COMPCACHE ]; then
+        echo "Reading the cache file '$COMPCACHE'"
+        source $COMPCACHE
+      else
+        echo "WARNING: No active nodes and no cache file found, using defaults"
+        COMPUTENODES="`seq -f "cpu%02.0f.cep4.infiniband.lofar" 40 44`"
       fi
+      echo -e "Nodes used for output writing:\n${COMPUTENODES}"
 
-      GLOBALFS_DIR=/data
-
-      #SLURM=true
-      SLURM=false # Don't use SLURM for now, let's get it working without it first
       GLOBALFS=true
-      DOCKER=false # disabled as outputproc is too slow on docker 1.9.1 (#9522)
-
+      GLOBALFS_DIR=/data
+      SLURM=false    # Don't use SLURM for now, let's get it working without it first
+      DOCKER=false   # disabled as outputproc is too slow on docker 1.9.1 (#9522)
       OUTPUTPROC_ROOT="`echo '/opt/outputproc-${LOFAR_TAG}' | docker-template`"
       ;;
     DRAGNET)
-- 
GitLab