Commit f4ba80d2 authored by Jan David Mol

Task #9522: Ask SLURM which nodes are available on CEP4

parent f816b9af
@@ -52,9 +52,11 @@ function read_cluster_model {
case "${CLUSTER_NAME}" in
CEP4)
HEADNODE=head01.cep4.control.lofar
#COMPUTENODES="`seq -f "cpu%02.0f.cep4" 1 50`"
COMPUTENODES="`seq -f "cpu%02.0f.cep4" 1 26` `seq -f "cpu%02.0f.cep4" 30 35` `seq -f "cpu%02.0f.cep4" 37 39`"
NRCOMPUTENODES=50
COMPUTENODES="`ssh $HEADNODE sinfo --responding --states=idle,mixed,alloc --format=%n --noheader --partition=cobalt --sort=N | awk '{ print $1 ".cep4"; }'`"
if [ -z "$COMPUTENODES" ]; then
echo "ERROR: Could not obtain list of available CEP4 nodes. Defaulting to all."
COMPUTENODES="`seq -f "cpu%02.0f.cep4" 1 50`"
fi
GLOBALFS_DIR=/data
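
For reference, a minimal sketch of the node query this hunk introduces, runnable from a host with SSH access to the CEP4 headnode. The partition name, node states and ".cep4" suffix are taken from the diff; the output shown is purely illustrative:

    # Ask SLURM for responding nodes in the "cobalt" partition that are idle,
    # mixed or allocated, print bare hostnames, and append the cluster domain.
    ssh head01.cep4.control.lofar \
        sinfo --responding --states=idle,mixed,alloc \
              --format=%n --noheader --partition=cobalt --sort=N \
      | awk '{ print $1 ".cep4"; }'
    # Illustrative output:
    #   cpu01.cep4
    #   cpu02.cep4
    #   ...
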
@@ -66,11 +68,12 @@ function read_cluster_model {
*)
HEADNODE=lhn001.cep2.lofar
COMPUTENODES="`seq -f "locus%02.0f" 1 94`"
NRCOMPUTENODES=94
SLURM=false
GLOBALFS=false
DOCKER=false
;;
esac
NRCOMPUTENODES=`echo $COMPUTENODES | wc -w`
}
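
With this hunk the node count is no longer hard-coded per cluster; it is derived from whatever list COMPUTENODES ended up holding. A trivial sketch of the word-count idiom, with an illustrative value:

    COMPUTENODES="cpu01.cep4 cpu02.cep4 cpu03.cep4"    # illustrative list
    NRCOMPUTENODES=`echo $COMPUTENODES | wc -w`        # yields 3
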
@@ -252,6 +252,10 @@ read_cluster_model
# Determine list of outputProc hosts for various purposes
if $SLURM; then
# TODO: Start SLURM job, wait for resources, record node list
echo "ERROR: SLURM resource allocation is not supported"
exit 1
# Expand node list into something usable
NODE_LIST="`ssh $HEADNODE scontrol show hostnames $SLURM_JOB_NODELIST`"
else
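
For context, `scontrol show hostnames` (still present after the early exit above) expands SLURM's compressed node-list syntax into one hostname per line. A minimal illustration, using a hypothetical node list in place of $SLURM_JOB_NODELIST:

    # Hypothetical compressed node list; real CEP4 lists look similar.
    ssh head01.cep4.control.lofar scontrol show hostnames "cpu[01-03,05]"
    # cpu01
    # cpu02
    # cpu03
    # cpu05
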
@@ -63,21 +63,6 @@ mkfifo -m 0660 "$COMMANDPIPE" || true
# Construct command line
COMMAND="env LOFARENV=$LOFARENV runObservation.sh -P $PIDFILE -o Cobalt.commandStream=file:$COMMANDPIPE $PARSET"
# Process cluster requirements
read_cluster_model
if $SLURM; then
# We need to issue "salloc" on the target cluster, and once the resources
# are available, the job should first SSH back here to start the observation.
# Note that we need to marshall some SLURM environment variables as well, hence the use of bash.
for s in SLURM_JOB_ID SLURM_JOB_NODELIST SLURM_JOB_NUM_NODES; do
SLURM_VARS+=" $s=\$$s"
done
COMMAND="ssh -tt $HEADNODE salloc -N $NRCOMPUTENODES -J $OBSID bash -c 'ssh `hostname -f` -tt $SLURM_VARS $COMMAND'"
fi
# Start observation in the background
echo "Starting $COMMAND"
$COMMAND > $LOGFILE 2>&1 </dev/null &
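
The last hunk removes the code that requested an allocation via salloc on the headnode and then SSH'd back to start the observation; this matches the earlier hunk, which now rejects the SLURM path outright. The subtle part of the removed code was marshalling SLURM's job variables across that second SSH hop. A stripped-down sketch of just the marshalling loop (not the full removed command):

    # Build KEY=$KEY pairs; the backslash keeps the '$' literal so the
    # variables expand later, inside the allocation, not at this point.
    SLURM_VARS=""
    for s in SLURM_JOB_ID SLURM_JOB_NODELIST SLURM_JOB_NUM_NODES; do
      SLURM_VARS+=" $s=\$$s"
    done
    echo "$SLURM_VARS"
    # -> SLURM_JOB_ID=$SLURM_JOB_ID SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST SLURM_JOB_NUM_NODES=$SLURM_JOB_NUM_NODES
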