From f4ba80d2d43fe48a3179bb382652a26f099fb777 Mon Sep 17 00:00:00 2001
From: Jan David Mol <mol@astron.nl>
Date: Sat, 27 Aug 2016 14:32:33 +0000
Subject: [PATCH] Task #9522: Ask SLURM which nodes are available on CEP4

---
 .../GPUProc/src/scripts/cobalt_functions.sh       | 11 +++++++----
 RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh |  4 ++++
 RTCP/Cobalt/GPUProc/src/scripts/startBGL.sh       | 15 ---------------
 3 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh b/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh
index 301a14087ab..0d73077f3da 100755
--- a/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh
+++ b/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh
@@ -52,9 +52,11 @@ function read_cluster_model {
   case "${CLUSTER_NAME}" in
     CEP4)
       HEADNODE=head01.cep4.control.lofar
-      #COMPUTENODES="`seq -f "cpu%02.0f.cep4" 1 50`"
-      COMPUTENODES="`seq -f "cpu%02.0f.cep4" 1 26` `seq -f "cpu%02.0f.cep4" 30 35` `seq -f "cpu%02.0f.cep4" 37 39`"
-      NRCOMPUTENODES=50
+      COMPUTENODES="`ssh $HEADNODE sinfo --responding --states=idle,mixed,alloc --format=%n --noheader --partition=cobalt --sort=N | awk '{ print $1 ".cep4"; }'`"
+      if [ -z "$COMPUTENODES" ]; then
+        echo "ERROR: Could not obtain list of available CEP4 nodes. Defaulting to all."
+        COMPUTENODES="`seq -f "cpu%02.0f.cep4" 1 50`"
+      fi
 
       GLOBALFS_DIR=/data
 
@@ -66,11 +68,12 @@ function read_cluster_model {
     *)
       HEADNODE=lhn001.cep2.lofar
       COMPUTENODES="`seq -f "locus%02.0f" 1 94`"
-      NRCOMPUTENODES=94
 
       SLURM=false
       GLOBALFS=false
       DOCKER=false
       ;;
   esac
+
+  NRCOMPUTENODES=`echo $COMPUTENODES | wc -w`
 }
diff --git a/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh b/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh
index 2333b3230c1..dd0c8b1fc6c 100755
--- a/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh
+++ b/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh
@@ -252,6 +252,10 @@ read_cluster_model
 
 # Determine list of outputProc hosts for various purposes
 if $SLURM; then
+  # TODO: Start SLURM job, wait for resources, record node list
+  echo "ERROR: SLURM resource allocation is not supported"
+  exit 1
+
   # Expand node list into something usable
   NODE_LIST="`ssh $HEADNODE scontrol show hostnames $SLURM_JOB_NODELIST`"
 else
diff --git a/RTCP/Cobalt/GPUProc/src/scripts/startBGL.sh b/RTCP/Cobalt/GPUProc/src/scripts/startBGL.sh
index 99e90f9aae2..5103ddc82f6 100755
--- a/RTCP/Cobalt/GPUProc/src/scripts/startBGL.sh
+++ b/RTCP/Cobalt/GPUProc/src/scripts/startBGL.sh
@@ -63,21 +63,6 @@ mkfifo -m 0660 "$COMMANDPIPE" || true
 # Construct command line
 COMMAND="env LOFARENV=$LOFARENV runObservation.sh -P $PIDFILE -o Cobalt.commandStream=file:$COMMANDPIPE $PARSET"
 
-# Process cluster requirements
-read_cluster_model
-
-if $SLURM; then
-  # We need to issue "salloc" on the target cluster, and once the resources
-  # are available, the job should first SSH back here to start the observation.
-
-  # Note that we need to marshall some SLURM environment variables as well, hence the use of bash.
-  for s in SLURM_JOB_ID SLURM_JOB_NODELIST SLURM_JOB_NUM_NODES; do
-    SLURM_VARS+=" $s=\$$s"
-  done
-
-  COMMAND="ssh -tt $HEADNODE salloc -N $NRCOMPUTENODES -J $OBSID bash -c 'ssh `hostname -f` -tt $SLURM_VARS $COMMAND'"
-fi
-
 # Start observation in the background
 echo "Starting $COMMAND"
 $COMMAND > $LOGFILE 2>&1 </dev/null &
-- 
GitLab
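
Note (editor's sketch, not part of the patch): the node-discovery logic this patch adds to read_cluster_model can be exercised on its own before applying it. The stand-alone script below repeats that logic verbatim; the head node name head01.cep4.control.lofar and the SLURM partition "cobalt" are taken from the patch and assume a CEP4-like deployment with passwordless SSH to the head node.

    #!/bin/bash
    # Sketch: query SLURM for the CEP4 nodes Cobalt may use, mirroring the
    # logic the patch adds to read_cluster_model. HEADNODE and the "cobalt"
    # partition are assumptions lifted from the patch itself.
    HEADNODE=head01.cep4.control.lofar

    # Ask SLURM (via the head node) for responding nodes that can accept
    # work: idle, mixed (partially allocated), or fully allocated.
    COMPUTENODES="`ssh $HEADNODE sinfo --responding --states=idle,mixed,alloc --format=%n --noheader --partition=cobalt --sort=N | awk '{ print $1 ".cep4"; }'`"

    # Fall back to the full static list of 50 nodes if the query returned
    # nothing (head node unreachable, partition empty, etc.).
    if [ -z "$COMPUTENODES" ]; then
      echo "ERROR: Could not obtain list of available CEP4 nodes. Defaulting to all."
      COMPUTENODES="`seq -f "cpu%02.0f.cep4" 1 50`"
    fi

    # Derive the node count from the list instead of hard-coding it, as the
    # patch now does for every cluster.
    NRCOMPUTENODES=`echo $COMPUTENODES | wc -w`
    echo "Using $NRCOMPUTENODES nodes: $COMPUTENODES"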