Patch summary (free-form text ahead of the first "diff --git" header; not part of any hunk):
  * cobalt_functions.sh, read_cluster_model: introduces a CEP4 flag (true in the case branch just before the default "*)" branch, false in the default branch) and switches DOCKER off in that same non-default branch, citing #9522.
  * runObservation.sh, "if $SLURM" branch: adds SLURM allocation, a wait-for-RUNNING poll and a node-list lookup, all via ssh to $HEADNODE. These lines follow the existing "exit 1", so they look unreachable for now -- TODO confirm against the full script.
    The srun command is handed to ssh as one double-quoted string so that the remote shell receives the bash -c '...' keep-alive loop as a single argument; the node list obtained from sacct is then expanded with "scontrol show hostnames" and suffixed with ".cep4".
  * runObservation.sh, clean_up: cancels the SLURM job named $OBSID when $SLURM is true, before the existing SIGTERM handling.
  * runObservation.sh: when $CEP4 is true, derives TAG via docker-template and prefixes the outputProc command line with "source /opt/outputproc-$TAG/lofarinit.sh".

diff --git a/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh b/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh index c9f3c01b7a0244a53774dc196033561641726dd1..8dd6353f0ab533d895051268f2412a91352e30c4 100755 --- a/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh +++ b/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh @@ -63,7 +63,9 @@ function read_cluster_model { #SLURM=true SLURM=false # Don't use SLURM for now, let's get it working without it first GLOBALFS=true - DOCKER=true + DOCKER=false # disabled as outputproc is too slow on docker 1.9.1 (#9522) + + CEP4=true ;; *) HEADNODE=lhn001.cep2.lofar @@ -72,6 +74,8 @@ function read_cluster_model { SLURM=false GLOBALFS=false DOCKER=false + + CEP4=false ;; esac diff --git a/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh b/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh index dd0c8b1fc6cfb09fc55351857431935ad39bbb44..92bb651a47230e13d23b79a01386c47822ccd12f 100755 --- a/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh +++ b/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh @@ -256,8 +256,19 @@ if $SLURM; then echo "ERROR: SLURM resource allocation is not supported" exit 1 + # Allocate resources + # TODO: Start outputProc here + ssh $HEADNODE "srun -N $NRCOMPUTENODES -c 1 --job-name=$OBSID bash -c 'while sleep 1; do :; done'" & + + # Wait for allocation + while [ "`ssh $HEADNODE sacct --starttime=now --noheader --parsable2 --format=state --name=$OBSID | tail -n 1`" != "RUNNING" ]; do sleep 1; done + + # Obtain node list + NODE_LIST="`ssh $HEADNODE sacct --starttime=now --noheader --parsable2 --format=nodelist --name=$OBSID | tail -n 1`" + # Expand node list into something usable - NODE_LIST="`ssh $HEADNODE scontrol show hostnames $SLURM_JOB_NODELIST`" + # TODO: move ".cep4" to cluster model + NODE_LIST="`ssh $HEADNODE scontrol show hostnames $NODE_LIST | awk '{ print $1 ".cep4"; }'`" else # Derive host list from parset NODE_LIST=$(getOutputProcHosts $PARSET) @@ -308,6 +319,11 @@ 
PID_LIST_FILE="$LOFARROOT/var/run/outputProc-$OBSERVATIONID.pids" function clean_up { EXIT_STATE=$1 PID_LIST=$2 + + if $SLURM; then + echo "[children] Cancelling SLURM allocation" + ssh $HEADNODE scancel --jobname=$OBSID + fi echo "[children] Sending SIGTERM" # THe kill statements might be called with an empty argument. This will @@ -351,6 +367,11 @@ if $DOCKER; then OUTPUTPROC_CMDLINE="docker-run-slurm.sh --rm --cpu-shares=24576 --cap-add=sys_nice --cap-add=sys_admin -u `id -u $SSH_USER_NAME` --net=host -v $GLOBALFS_DIR:$GLOBALFS_DIR lofar-outputproc:$TAG bash -c \"sudo echo 950000 > /sys/fs/cgroup/cpu/cpu.rt_runtime_us; $OUTPUTPROC_CMDLINE\"" fi +if $CEP4; then + TAG="`echo '${LOFAR_TAG}' | docker-template`" + OUTPUTPROC_CMDLINE="source /opt/outputproc-$TAG/lofarinit.sh; $OUTPUTPROC_CMDLINE" +fi + echo "[outputProc] command line = $OUTPUTPROC_CMDLINE" if ! $DUMMY_RUN; then