From 08ca7f05274cdeefe2eb0204b4c5e7be86ebd1f2 Mon Sep 17 00:00:00 2001
From: Jan David Mol <mol@astron.nl>
Date: Thu, 1 Sep 2016 20:08:53 +0000
Subject: [PATCH] Task #9522: Run outputproc from /opt/outputproc-$BRANCH/ on
 CEP4 instead of through Docker, to fix data loss that occurred when
 outputProc ran in Docker while the Docker daemon was busy

---
 .../GPUProc/src/scripts/cobalt_functions.sh   |  6 ++++-
 .../GPUProc/src/scripts/runObservation.sh     | 23 ++++++++++++++++++-
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh b/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh
index c9f3c01b7a0..8dd6353f0ab 100755
--- a/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh
+++ b/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh
@@ -63,7 +63,9 @@ function read_cluster_model {
       #SLURM=true
       SLURM=false # Don't use SLURM for now, let's get it working without it first
       GLOBALFS=true
-      DOCKER=true
+      DOCKER=false # disabled as outputproc is too slow on docker 1.9.1 (#9522)
+
+      CEP4=true
       ;;
     *)
       HEADNODE=lhn001.cep2.lofar
@@ -72,6 +74,8 @@ function read_cluster_model {
       SLURM=false
       GLOBALFS=false
       DOCKER=false
+
+      CEP4=false
       ;;
   esac
 
diff --git a/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh b/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh
index dd0c8b1fc6c..92bb651a472 100755
--- a/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh
+++ b/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh
@@ -256,8 +256,19 @@ if $SLURM; then
   echo "ERROR: SLURM resource allocation is not supported"
   exit 1
 
+  # Allocate resources: an idle loop under srun holds the allocation (quoted as ONE word, since ssh re-parses the command remotely)
+  # TODO: Start outputProc here
+  ssh $HEADNODE srun -N $NRCOMPUTENODES -c 1 --job-name=$OBSID "bash -c 'while sleep 1; do :; done'" &
+
+  # Wait for allocation: poll sacct until the last listed job named $OBSID reports state RUNNING
+  while [ "`ssh $HEADNODE sacct --starttime=now --noheader --parsable2 --format=state --name=$OBSID | tail -n 1`" != "RUNNING" ]; do sleep 1; done
+
+  # Obtain node list (the nodelist field sacct reports for that job)
+  NODE_LIST="`ssh $HEADNODE sacct --starttime=now --noheader --parsable2 --format=nodelist --name=$OBSID | tail -n 1`"
+
   # Expand node list into something usable
-  NODE_LIST="`ssh $HEADNODE scontrol show hostnames $SLURM_JOB_NODELIST`"
+  # TODO: move ".cep4" to cluster model
+  NODE_LIST="`ssh $HEADNODE scontrol show hostnames $NODE_LIST | awk '{ print $1 ".cep4"; }'`"
 else
   # Derive host list from parset
   NODE_LIST=$(getOutputProcHosts $PARSET)
@@ -308,6 +319,11 @@ PID_LIST_FILE="$LOFARROOT/var/run/outputProc-$OBSERVATIONID.pids"
 function clean_up { 
   EXIT_STATE=$1
   PID_LIST=$2
+
+  if $SLURM; then
+    echo "[children] Cancelling SLURM allocation"
+    ssh $HEADNODE scancel --jobname=$OBSID
+  fi
   
   echo "[children] Sending SIGTERM" 
   # THe kill statements might be called with an empty argument. This will 
@@ -351,6 +367,11 @@ if $DOCKER; then
   OUTPUTPROC_CMDLINE="docker-run-slurm.sh --rm --cpu-shares=24576 --cap-add=sys_nice --cap-add=sys_admin -u `id -u $SSH_USER_NAME` --net=host -v $GLOBALFS_DIR:$GLOBALFS_DIR lofar-outputproc:$TAG bash -c \"sudo echo 950000 > /sys/fs/cgroup/cpu/cpu.rt_runtime_us; $OUTPUTPROC_CMDLINE\""
 fi
 
+if ${CEP4}; then  # CEP4: run outputProc from the /opt/outputproc-$TAG install by sourcing that tree's lofarinit.sh first
+  TAG="$(echo '${LOFAR_TAG}' | docker-template)"
+  OUTPUTPROC_CMDLINE="source /opt/outputproc-${TAG}/lofarinit.sh; ${OUTPUTPROC_CMDLINE}"
+fi
+
 echo "[outputProc] command line = $OUTPUTPROC_CMDLINE"
 
 if ! $DUMMY_RUN; then
-- 
GitLab