From fbbf87bbbc34f387be9f7dceccf8da55479d3883 Mon Sep 17 00:00:00 2001 From: Auke Klazema <klazema@astron.nl> Date: Mon, 21 Jan 2019 09:32:12 +0000 Subject: [PATCH] Task SW-581: Cherry Picked revision 40995 from trunk to release to have the changes in the cobalt_functions.sh in production --- .../GPUProc/src/scripts/cobalt_functions.sh | 64 +++++++++++++------ 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh b/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh index f570a5b3d81..bc4981a6738 100755 --- a/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh +++ b/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh @@ -54,29 +54,57 @@ function read_cluster_model { HEADNODE=head.cep4.control.lofar SLURM_PARTITION=cpu SLURM_RESERVATION=cobalt - # Get the nodes in the cobalt reservation. The reservation must be active. - RESVNODES=$(ssh $HEADNODE scontrol show res -o $SLURM_RESERVATION | perl -n -e 'm/Nodes=(.*?) .*State=ACTIVE/ ? print STDOUT $1 : die "No active cobalt reservation found"') - if [ $? -eq 0 ]; then - echo "Active reservation '$SLURM_RESERVATION' found, get online nodes in the reservation" - SINFO_FLAGS="--responding --states=idle,mixed,alloc,reserved -n $RESVNODES" - else - echo "WARNING: No reservation '$SLURM_RESERVATION' found, defaulting to all online nodes in partition '$SLURM_PARTITION'" - SINFO_FLAGS="--responding --states=idle,mixed,alloc --partition=$SLURM_PARTITION" + RESVCACHE=$LOFARROOT/var/run/slurmresv.cache + COMPCACHE=$LOFARROOT/var/run/compnodes.cache + + # Get the reserved CEP4 nodes for output writing. Try three methods in order of precedence: + # 1. Get nodes from the cobalt slurm reservation (must have state active) + # 2. Read a cache file with the node list + # 3. Default to a particular set of nodes + echo "Reading the slurm '$SLURM_RESERVATION' reservation.." + RESVNODES=$(ssh $HEADNODE scontrol show res -o $SLURM_RESERVATION | \ + perl -n -e 'm/Nodes=(.*?) .*State=ACTIVE/ ? print STDOUT $1 : die "WARNING: No active reservation found\n"') + if [ -n "$RESVNODES" ]; then + # save in cache + cat <<-CAT > $RESVCACHE + echo "Cache created at $(date)" + RESVNODES="$RESVNODES" + CAT + elif [ -s $RESVCACHE ]; then + echo "Reading the cache file '$RESVCACHE'" + source $RESVCACHE + else + echo "WARNING: No reserved nodes and no cache file found, using defaults" + RESVNODES="cpu[40-44]" fi + echo "Reserved nodes: $RESVNODES" + + # Checking online status: try three methods in order of precedence: + # 1. Check slurm for the node status (sinfo) + # 2. Read a cache file with the node list + # 3. Default to a particular set of nodes + echo "Checking online status" + SINFO_FLAGS="--responding --states=idle,mixed,alloc,reserved -n $RESVNODES" COMPUTENODES="$(ssh $HEADNODE sinfo --format=%n.cep4.infiniband.lofar,%T --noheader --sort=N $SINFO_FLAGS | fgrep -v ,draining | cut -f1 -d,)" - # OLD COMPUTENODES="`ssh $HEADNODE sinfo --responding --states=idle,mixed,alloc --format=%n.cep4.infiniband.lofar,%T --noheader --partition=$SLURM_PARTITION --sort=N | fgrep -v ,draining | cut -f1 -d,`" - if [ -z "$COMPUTENODES" ]; then - echo "ERROR: Could not obtain list of available CEP4 nodes. Defaulting to all." - COMPUTENODES="`seq -f "cpu%02.0f.cep4.infiniband.lofar" 1 47`" + if [ -n "$COMPUTENODES" ]; then + # save in cache + cat <<-CAT > $COMPCACHE + echo "Cache created at $(date)" + COMPUTENODES="$COMPUTENODES" + CAT + elif [ -s $COMPCACHE ]; then + echo "Reading the cache file '$COMPCACHE'" + source $COMPCACHE + else + echo "WARNING: No active nodes and no cache file found, using defaults" + COMPUTENODES="`seq -f "cpu%02.0f.cep4.infiniband.lofar" 40 44`" fi + echo -e "Nodes used for output writing:\n${COMPUTENODES}" - GLOBALFS_DIR=/data - - #SLURM=true - SLURM=false # Don't use SLURM for now, let's get it working without it first GLOBALFS=true - DOCKER=false # disabled as outputproc is too slow on docker 1.9.1 (#9522) - + GLOBALFS_DIR=/data + SLURM=false # Don't use SLURM for now, let's get it working without it first + DOCKER=false # disabled as outputproc is too slow on docker 1.9.1 (#9522) OUTPUTPROC_ROOT="`echo '/opt/outputproc-${LOFAR_TAG}' | docker-template`" ;; DRAGNET) -- GitLab