From f4ba80d2d43fe48a3179bb382652a26f099fb777 Mon Sep 17 00:00:00 2001
From: Jan David Mol <mol@astron.nl>
Date: Sat, 27 Aug 2016 14:32:33 +0000
Subject: [PATCH] Task #9522: Ask SLURM which nodes are available on CEP4

---
 .../GPUProc/src/scripts/cobalt_functions.sh       | 11 +++++++----
 RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh |  4 ++++
 RTCP/Cobalt/GPUProc/src/scripts/startBGL.sh       | 15 ---------------
 3 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh b/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh
index 301a14087ab..0d73077f3da 100755
--- a/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh
+++ b/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh
@@ -52,9 +52,11 @@ function read_cluster_model {
   case "${CLUSTER_NAME}" in
     CEP4)
       HEADNODE=head01.cep4.control.lofar
-      #COMPUTENODES="`seq -f "cpu%02.0f.cep4" 1 50`"
-      COMPUTENODES="`seq -f "cpu%02.0f.cep4" 1 26` `seq -f "cpu%02.0f.cep4" 30 35` `seq -f "cpu%02.0f.cep4" 37 39`"
-      NRCOMPUTENODES=50
+      COMPUTENODES="`ssh $HEADNODE sinfo --responding --states=idle,mixed,alloc --format=%n --noheader --partition=cobalt --sort=N | awk '{ print $1 ".cep4"; }'`"
+      if [ -z "$COMPUTENODES" ]; then
+        echo "ERROR: Could not obtain list of available CEP4 nodes. Defaulting to all."
+        COMPUTENODES="`seq -f "cpu%02.0f.cep4" 1 50`"
+      fi
 
       GLOBALFS_DIR=/data
 
@@ -66,11 +68,12 @@ function read_cluster_model {
     *)
       HEADNODE=lhn001.cep2.lofar
       COMPUTENODES="`seq -f "locus%02.0f" 1 94`"
-      NRCOMPUTENODES=94
 
       SLURM=false
       GLOBALFS=false
       DOCKER=false
       ;;
   esac
+
+  NRCOMPUTENODES=`echo $COMPUTENODES | wc -w`
 }
diff --git a/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh b/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh
index 2333b3230c1..dd0c8b1fc6c 100755
--- a/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh
+++ b/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh
@@ -252,6 +252,10 @@ read_cluster_model
 
 # Determine list of outputProc hosts for various purposes
 if $SLURM; then
+  # TODO: Start SLURM job, wait for resources, record node list
+  echo "ERROR: SLURM resource allocation is not supported"
+  exit 1
+
   # Expand node list into something usable
   NODE_LIST="`ssh $HEADNODE scontrol show hostnames $SLURM_JOB_NODELIST`"
 else
diff --git a/RTCP/Cobalt/GPUProc/src/scripts/startBGL.sh b/RTCP/Cobalt/GPUProc/src/scripts/startBGL.sh
index 99e90f9aae2..5103ddc82f6 100755
--- a/RTCP/Cobalt/GPUProc/src/scripts/startBGL.sh
+++ b/RTCP/Cobalt/GPUProc/src/scripts/startBGL.sh
@@ -63,21 +63,6 @@ mkfifo -m 0660 "$COMMANDPIPE" || true
 # Construct command line
 COMMAND="env LOFARENV=$LOFARENV runObservation.sh -P $PIDFILE -o Cobalt.commandStream=file:$COMMANDPIPE $PARSET"
 
-# Process cluster requirements
-read_cluster_model
-
-if $SLURM; then
-  # We need to issue "salloc" on the target cluster, and once the resources
-  # are available, the job should first SSH back here to start the observation.
-
-  # Note that we need to marshall some SLURM environment variables as well, hence the use of bash.
-  for s in SLURM_JOB_ID SLURM_JOB_NODELIST SLURM_JOB_NUM_NODES; do
-    SLURM_VARS+=" $s=\$$s"
-  done
-
-  COMMAND="ssh -tt $HEADNODE salloc -N $NRCOMPUTENODES -J $OBSID bash -c 'ssh `hostname -f` -tt $SLURM_VARS $COMMAND'"
-fi
-
 # Start observation in the background
 echo "Starting $COMMAND"
 $COMMAND > $LOGFILE 2>&1 </dev/null &
-- 
GitLab
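
Note (editor's sketch, not part of the patch): the node-discovery logic this patch adds to read_cluster_model can be exercised on its own before applying it. The stand-alone script below repeats that logic verbatim; the head node name head01.cep4.control.lofar and the SLURM partition "cobalt" are taken from the patch and assume a CEP4-like deployment with passwordless SSH to the head node.

    #!/bin/bash
    # Sketch: query SLURM for the CEP4 nodes Cobalt may use, mirroring the
    # logic the patch adds to read_cluster_model. HEADNODE and the "cobalt"
    # partition are assumptions lifted from the patch itself.
    HEADNODE=head01.cep4.control.lofar

    # Ask SLURM (via the head node) for responding nodes that can accept
    # work: idle, mixed (partially allocated), or fully allocated.
    COMPUTENODES="`ssh $HEADNODE sinfo --responding --states=idle,mixed,alloc --format=%n --noheader --partition=cobalt --sort=N | awk '{ print $1 ".cep4"; }'`"

    # Fall back to the full static list of 50 nodes if the query returned
    # nothing (head node unreachable, partition empty, etc.).
    if [ -z "$COMPUTENODES" ]; then
      echo "ERROR: Could not obtain list of available CEP4 nodes. Defaulting to all."
      COMPUTENODES="`seq -f "cpu%02.0f.cep4" 1 50`"
    fi

    # Derive the node count from the list instead of hard-coding it, as the
    # patch now does for every cluster.
    NRCOMPUTENODES=`echo $COMPUTENODES | wc -w`
    echo "Using $NRCOMPUTENODES nodes: $COMPUTENODES"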