diff --git a/.gitattributes b/.gitattributes index e3fa62677708f0dce0d86c373df1b7919158aad1..cca9e7a6878beab4a7ca8941fceb03db454ad844 100644 --- a/.gitattributes +++ b/.gitattributes @@ -4463,6 +4463,7 @@ RTCP/Cobalt/GPUProc/src/scripts/CobaltVersions.sh eol=lf RTCP/Cobalt/GPUProc/src/scripts/Cobalt_build.sh eol=lf RTCP/Cobalt/GPUProc/src/scripts/Cobalt_install.sh eol=lf RTCP/Cobalt/GPUProc/src/scripts/bw_monitor.sh eol=lf +RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh eol=lf RTCP/Cobalt/GPUProc/src/scripts/cobaltswitch -text RTCP/Cobalt/GPUProc/src/scripts/record_raw_data.sh -text RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh eol=lf diff --git a/RTCP/Cobalt/GPUProc/src/CMakeLists.txt b/RTCP/Cobalt/GPUProc/src/CMakeLists.txt index ce590e0681dc2ff9c200dc87d6c02c7ef6b7988a..0545e880834f5ef817aa29baee4066f408ab2a54 100644 --- a/RTCP/Cobalt/GPUProc/src/CMakeLists.txt +++ b/RTCP/Cobalt/GPUProc/src/CMakeLists.txt @@ -124,6 +124,7 @@ lofar_add_bin_program(gpu_load gpu_load.cc) # install scripts used to run an observation under bin lofar_add_bin_scripts( + scripts/cobalt_functions.sh scripts/CobaltControl.sh scripts/runObservation.sh scripts/startstopkeys.sh diff --git a/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh b/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh new file mode 100755 index 0000000000000000000000000000000000000000..b9477c38e67ec5378c6668516d876f0c3905db7f --- /dev/null +++ b/RTCP/Cobalt/GPUProc/src/scripts/cobalt_functions.sh @@ -0,0 +1,66 @@ +# Helper functions shared by runObservation.sh and startBGL.sh; this file is meant to be sourced. +function addlogprefix { # copy stdin to stdout, prefixing each line with <script>@<host> and a timestamp + ME="`basename -- "$0" .sh`@`hostname`" + while IFS= read -r LINE + do + echo "$ME" "`date "+%F %T.%3N"`" "$LINE" + done +} + +# +# The following functions assume that $PARSET is set. +# + +function getkey { # getkey KEY [DEFAULT]: print the last value of KEY in $PARSET, or DEFAULT if that is empty + KEY=$1 + DEFAULT=$2 + + # grab the last key matching "^$KEY=", ignoring spaces. 
+ VALUE=`<"$PARSET" perl -ne '/^'"$KEY"'\s*=\s*"?(.*?)"?\s*$/ || next; print "$1\n";' | tail -n 1` + + if [ "$VALUE" == "" ] + then + echo "$DEFAULT" + else + echo "$VALUE" + fi +} + +function setkey { # setkey KEY VAL: comment out existing KEY lines in $PARSET, then append "KEY = VAL" + KEY=$1 + VAL=$2 + + # In case already there, comment all out to avoid stale warnings. Then append. + KEYESC=`echo "$KEY" | sed -r -e "s/([\.[])/\\\\\\\\\1/g"` # escape '.' '[' chars in keys with enough '\' + sed -i --follow-symlinks -r -e "s/^([[:blank:]]*$KEYESC[[:blank:]]*=)/#\1/g" "$PARSET" + echo "$KEY = $VAL" >> "$PARSET" +} + +function parse_cluster_description { # sets CLUSTER_NAME, HEADNODE, NRCOMPUTENODES, SLURM, GLOBALFS and DOCKER + PROCESSING_CLUSTER=$(getkey Observation.Cluster.ProcessingCluster.clusterName "") + + # Hack to derive required properties (cluster model) from cluster name. + case "${PROCESSING_CLUSTER}" in + CEP4|cep4) + CLUSTER_NAME=cep4 + + HEADNODE=head01.cep4.control.lofar + NRCOMPUTENODES=50 + + SLURM=true + GLOBALFS=true + DOCKER=true + ;; + *) + CLUSTER_NAME=cep2 + + HEADNODE=lhn001.cep2.lofar + NRCOMPUTENODES=94 + + SLURM=false + GLOBALFS=false + DOCKER=false + ;; + esac +} + diff --git a/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh b/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh index ecc5fb80b211f9f00d9b8fd692cfe2d62b0641e4..60ee69bdb9d472ea7dff6f8158fed07a82c4f6f8 100755 --- a/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh +++ b/RTCP/Cobalt/GPUProc/src/scripts/runObservation.sh @@ -10,37 +10,14 @@ ######## Functions ######## +source cobalt_functions.sh || exit 1 # getkey, setkey and parse_cluster_description are required below + function error { echo -e "$@" >&2 sendback_state 1 exit 1 } -function getkey { - KEY=$1 - DEFAULT=$2 - - # grab the last key matching "^$KEY=", ignoring spaces. - VALUE=`<$PARSET perl -ne '/^'$KEY'\s*=\s*"?(.*?)"?\s*$/ || next; print "$1\n";' | tail -n 1` - - if [ "$VALUE" == "" ] - then - echo "$DEFAULT" - else - echo "$VALUE" - fi -} - -function setkey { - KEY=$1 - VAL=$2 - - # In case already there, comment all out to avoid stale warnings. Then append. - KEYESC=`echo "$KEY" | sed -r -e "s/([\.[])/\\\\\\\\\1/g"` # escape '.' 
'[' chars in keys with enough '\' - sed -i --follow-symlinks -r -e "s/^([[:blank:]]*$KEYESC[[:blank:]]*=)/#\1/g" "$PARSET" - echo "$KEY = $VAL" >> "$PARSET" -} - function usage { echo -e \ "\nUsage: $0 [-A] [-B] [-C] [-F] [-P pidfile] [-l nprocs] [-p] [-o KEY=VALUE] [-x KEY=VALUE] PARSET"\ @@ -273,6 +250,7 @@ then setkey Cobalt.OutputProc.StaticMetaDataDirectory "$LOFARROOT/etc" setkey Cobalt.FinalMetaDataGatherer.database.host localhost setkey Cobalt.PVSSGateway.host "" + setkey Observation.Cluster.ProcessingCluster.clusterName "" # Redirect UDP/TCP input streams to any interface on the local machine sed 's/udp:[^:]*:/udp:0:/g' -i $PARSET @@ -335,6 +313,18 @@ SSH_PRIVATE_KEY=$(getkey Cobalt.OutputProc.sshPrivateKey) OUTPUT_PROC_EXECUTABLE=$(getkey Cobalt.OutputProc.executable) OBSERVATIONID=$(getkey Observation.ObsID 0) +parse_cluster_description + +# Determine list of outputProc hosts for various purposes +if $SLURM; then + # Expand node list into something usable + NODE_LIST="`ssh $HEADNODE scontrol show hostnames $SLURM_JOB_NODELIST`" +else + # Derive host list from parset + NODE_LIST=$(getOutputProcHosts $PARSET) +fi +echo "Node list: $NODE_LIST" + # If parameters are found in the parset create a key_string for ssh command if [ "$SSH_PRIVATE_KEY" != "" ] then @@ -352,6 +342,55 @@ ssh -l $SSH_USER_NAME $KEY_STRING "localhost" "/bin/true" || error "Failed to cr # a file containing the PID of these processes PID_LIST_FILE="$LOFARROOT/var/run/outputProc-$OBSERVATIONID.pids" +# If we're using a global file system, we need to specify which nodes +# process which files. We also support a preallocation. +# +# We replace the clustername in the parset by going round-robin over +# the node list for this job. 
+ +if $GLOBALFS; then + # Update locations in parset: move the original aside and regenerate $PARSET from that copy + mv -fT "$PARSET" "$PARSET.allocate-globalFS" + + <"$PARSET.allocate-globalFS" >"$PARSET" perl -e ' + @hosts = qw('"$NODE_LIST"'); + + while (<>) { + if (/^ + (Observation.DataProducts.Output_[A-Za-z]+.locations) # key + \s*=\s* + \[(.*?)\] # value + /x) { + + # output location key -> replace hostnames + ($key, $locations) = ($1, $2); @newlocations = (); # list assignment needs parentheses; start every key with an empty list + + # locations are of the format "locus001:/dir, locus002:/dir, ..." + foreach $loc (split /,/, $locations) { + # split off directory + ($host, $dir) = split /:/, $loc, 2; + + # replace hostname iff it matches our cluster (case-insensitively; CLUSTER_NAME is stored in lower case) + if ($host =~ /^\s*'"$CLUSTER_NAME"'\s*$/i) { + # determine new host (rotate @hosts) + $host = shift @hosts; + push @hosts, $host; + } + + # add new location to the list + push @newlocations, join(":", $host, $dir); + } + + # print key with new value + printf "%s=[%s]\n", $key, join(",", @newlocations); + } else { + # print any other parset key verbatim + print; + } + } + ' +fi + # Function clean_up will clean op all PID in the # PID_LIST_FILE and the seperately supplied additional list of PIDs @@ -391,21 +430,40 @@ trap 'clean_up 1' SIGTERM SIGINT SIGQUIT SIGHUP echo "outputProc processes are appended to the file: $PID_LIST_FILE" touch $PID_LIST_FILE -LIST_OF_HOSTS=$(getOutputProcHosts $PARSET) -RANK=0 -VARS="QUEUE_PREFIX=$QUEUE_PREFIX" # Variables to forward to outputProc -for HOST in $LIST_OF_HOSTS -do - COMMAND="ssh -tt -l $SSH_USER_NAME $KEY_STRING $SSH_USER_NAME@$HOST $VARS $OUTPUT_PROC_EXECUTABLE $OBSERVATIONID $RANK" +# Construct full command line for outputProc +OUTPUTPROC_VARS="QUEUE_PREFIX=$QUEUE_PREFIX" # Variables to forward to outputProc +OUTPUTPROC_CMDLINE="$OUTPUTPROC_VARS $OUTPUT_PROC_EXECUTABLE $OBSERVATIONID" + +# Wrap command line with Docker if required +if $DOCKER; then + # TODO: Derive these + DATADIR="/data" + TAG="9048" + + OUTPUTPROC_CMDLINE="docker run -it -e LUSER=`id -u $SSH_USER_NAME` --net=host -v $DATADIR:$DATADIR lofar-outputproc:$TAG 
bash -c \"$OUTPUTPROC_CMDLINE\"" +fi + +if $SLURM; then + # The nodes we need (and can use) are part of this job + COMMAND="srun -N $SLURM_JOB_NUM_NODES $OUTPUTPROC_CMDLINE" echo "Starting $COMMAND" - # keep a counter to allow determination of the rank (needed for binding to rtcp) - RANK=$(($RANK + 1)) - - command_retry "$COMMAND" & # Start retrying function in the background - PID=$! # get the pid - + + eval "exec $COMMAND" & # re-parse so the quoted bash -c argument stays one word; exec keeps the background pid that of srun + PID=$! + echo -n "$PID " >> $PID_LIST_FILE # Save the pid for cleanup -done +else + for HOST in $NODE_LIST + do + COMMAND="ssh -tt -l $SSH_USER_NAME $KEY_STRING $SSH_USER_NAME@$HOST $OUTPUTPROC_CMDLINE" + echo "Starting $COMMAND" + + command_retry "$COMMAND" & # Start retrying function in the background + PID=$! # get the pid + + echo -n "$PID " >> $PID_LIST_FILE # Save the pid for cleanup + done +fi # ************************************ # Start rtcp diff --git a/RTCP/Cobalt/GPUProc/src/scripts/startBGL.sh b/RTCP/Cobalt/GPUProc/src/scripts/startBGL.sh index 3b33efdaa7feb4fb1e0775d5c38fc4b01f30551c..c6a769c8acae6fb2876cc8627f95e67902e2d03e 100755 --- a/RTCP/Cobalt/GPUProc/src/scripts/startBGL.sh +++ b/RTCP/Cobalt/GPUProc/src/scripts/startBGL.sh @@ -28,13 +28,7 @@ LOGFILE=$LOFARROOT/var/log/rtcp-$OBSID.log # The FIFO used for communication with rtcp COMMANDPIPE=$LOFARROOT/var/run/rtcp-$OBSID.pipe -function addlogprefix { - ME="`basename "$0" .sh`@`hostname`" - while read LINE - do - echo "$ME" "`date "+%F %T.%3N"`" "$LINE" - done -} +source cobalt_functions.sh || exit 1 # addlogprefix and parse_cluster_description are required below ( # Always print a header, to match errors to observations @@ -55,6 +49,7 @@ function error { [ -n "$PARSET" ] || error "No parset provided" [ -f "$PARSET" -a -r "$PARSET" ] || error "Cannot read parset: $PARSET" +# Export a copy of the parset to the TBB software TBB_PARSET=/globalhome/lofarsystem/log/L$OBSID.parset echo "Copying parset to $TBB_PARSET for postprocessing" cp "$PARSET" "$TBB_PARSET" || true @@ -64,10 +59,30 @@ ln -sfT $TBB_PARSET /globalhome/lofarsystem/log/latest || true [ -e 
"$COMMANDPIPE" ] && rm -f "$COMMANDPIPE" mkfifo -m 0660 "$COMMANDPIPE" || true +# Construct command line (PARAMS must be assigned before CMDLINE is built from it) +PARAMS="-P $PIDFILE -o Cobalt.commandStream=file:$COMMANDPIPE $PARSET" +CMDLINE="runObservation.sh $PARAMS" + +# Process cluster requirements +parse_cluster_description + +if $SLURM; then + # We need to issue "salloc" on the target cluster, and once the resources + # are available, the job should first SSH back here to start the observation. + + # Note that we need to marshall some SLURM environment variables as well, hence the use of bash. + for s in SLURM_JOB_ID SLURM_JOB_NODELIST SLURM_JOB_NUM_NODES; do + SLURM_VARS+=" $s=\$$s" + done + + COMMAND="ssh -tt $HEADNODE salloc -N $NRCOMPUTENODES bash -c 'ssh `hostname -f` -tt $SLURM_VARS $CMDLINE'" +else + COMMAND="$CMDLINE" +fi + # Start observation in the background -PARAMS="-P $PIDFILE -o Cobalt.commandStream=file:$COMMANDPIPE $PARSET" -echo "Starting runObservation.sh $PARAMS" -runObservation.sh $PARAMS > $LOGFILE 2>&1 </dev/null & +echo "Starting $COMMAND" +$COMMAND > $LOGFILE 2>&1 </dev/null & PID=$! echo "PID: $PID"