-
Marcel Loose authored
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
swlevel 19.66 KiB
#!/bin/bash
#
# swlevel : bring software on node in certain runlevel
#
# Copyright (C) 2006
# ASTRON (Netherlands Foundation for Research in Astronomy)
# P.O.Box 2, 7990 AA Dwingeloo, The Netherlands, softwaresupport@astron.nl
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# Syntax: swlevel [ 0 | 1 | 2 | 3 | 4 | 5 | 6 ]
#
# $Id$
#
# Installation root: fall back to the default value until all MAC
# controlled systems provide $LOFARROOT themselves.
: "${LOFARROOT:=/opt/lofar}"

# Locations derived from the installation root.
BINDIR="${LOFARROOT}/bin"
SBINDIR="${LOFARROOT}/sbin"
LOGDIR="${LOFARROOT}/var/log"
ETCDIR="${LOFARROOT}/etc"
LEVELTABLE="${ETCDIR}/swlevel.conf"

# Make sure all files are user/group writeable (needed for Int.
# Stations)
umask 002
#
# SyntaxError [msg]
#
# Print the optional error message followed by the usage text and leave the
# script. The exit status is 0, also when an error message was given.
#
SyntaxError()
{
  local msg=$1

  [ -z "${msg}" ] || echo "ERROR: ${msg}"
  # The synopsis lists every option accepted by the getopts string in
  # handle_args, including -h and -p/-P.
  cat <<EOF

Syntax: $(basename -- "$0") [-h] [-s/S/p/P/v/V/u/U] [-i x] [-r/q processname] [-l][ 0 |1 |2 |3 |4 |5 |6 ]
-h: Show this help text, exit
-s: Show current level, exit (-S: only print number)
-p: Print last set level from logfile (-P: only print number)
-v: Show running LOFAR version, exit (-V: only print version)
-u: Show users owning running processes (-U: same)
-i: Load RSP firmware from image [x] (default image is 1)
-l: Set to level as provided (optional)
-q: Quit process with name processname
-r: Run process with name processname

Levels:
0: Stop all lofar software
1: Run Lofar daemons and PVSS
2: Run Lowlevel hardware drivers
3: Run Calculation services
4: Run Hardware and software monitors
5: Run System Health Management
6: MAC is controlling the software

EOF
  exit 0
}
# Commands that may hang are run through the run_timeout.sh helper (not
# defined in this file): its first argument is the timeout in seconds, the
# next arguments are the command and its parameters.
#
# findImage station
#
# Find which image to load on a given station; uses file
# ${ETCDIR}/RSPImage.conf. The first whitespace separated field of a line is
# compared with the station name, the second field is taken as image number.
#
# Sets the global $image to the image number found.
# Leaves the script with exit status 1 when the file does not exist, when
# the station is not listed, or when the listed image is not a number > 0.
#
findImage()
{
  local first second rest

  findstation=$1
  RSPImageFile=${ETCDIR}/RSPImage.conf
  if [ ! -e "$RSPImageFile" ]; then
    echo "Cannot find Image file ${ETCDIR}/RSPImage.conf"
    exit 1
  fi

  image=0
  # read does the field splitting itself; the extra test also handles a
  # last line that is not terminated by a newline.
  while read -r first second rest || [ -n "$first" ]
  do
    if [ "$first" == "$findstation" ]; then
      image=$second
      break
    fi
  done < "$RSPImageFile"

  # Guard the numeric test below against an empty or non-numeric field
  case "$image" in
    ''|*[!0-9]*)
      echo "Invalid image number '$image' for station $findstation in file ${ETCDIR}/RSPImage.conf"
      exit 1
      ;;
  esac
  if [ "$image" -eq 0 ]; then
    echo "Could not find station $findstation in file ${ETCDIR}/RSPImage.conf"
    exit 1
  fi
  return 0
}
#
# selectImage
#
# Load image $image on all RSP boards of this station.
# The number of boards is read from ${ETCDIR}/RemoteStation.conf (RSPBOARDS)
# and the number of the first board from the fifth byte of
# RSPDriver.MAC_ADDR_0 in ${ETCDIR}/RSPDriver.conf.
# When $image is empty and no image was forced ($imageForced is 0) the image
# is looked up with findImage for the short hostname.
#
# Sets boardError to 1 and collects the boards in errorBoards when a board
# could not be queried or loaded.
#
selectImage()
{
  let nrRSPs=$(grep RSPBOARDS ${ETCDIR}/RemoteStation.conf | cut -d'=' -f2 | sed 's/ //g')
  let offset=0x$(grep RSPDriver.MAC_ADDR_0 ${ETCDIR}/RSPDriver.conf | cut -d'=' -f2 | cut -d':' -f5)

  # Assume no errors with board communication
  boardError=0
  errorBoards=""

  # Make sure we have an image number in parameter $image
  if [ -z $image ] && [ $imageForced -eq 0 ]; then
    findImage $(hostname -s)
  fi

  board=0
  while [ $board -lt $nrRSPs ]
  do
    # Two digit hex number of this board, counted from the first board
    printf -v boardHex '%02x' $(( offset + board ))
    # Uncomment next lines only for testing purposes!
    #if [ "$boardHex" == "03" ]; then
    #  boardHex="1F"
    #fi

    # get major version of active firmware on RSPboard $board
    rsu=$(run_timeout.sh 5 sudo ${SBINDIR}/rsuctl3 -m 10:fa:00:00:$boardHex:00 -qV 2>&1 \
          | grep BP | cut -d':' -f2 | sed 's/ //g' | cut -d'.' -f1)

    if (( ${#rsu} == 1 )); then
      echo "Loading image $image on RSPboard $boardHex ..."
      if ! run_timeout.sh 5 sudo ${SBINDIR}/rsuctl3_reset -m 10:fa:00:00:$boardHex:00 -q -x -p $image >/dev/null 2>&1
      then
        boardError=1
        errorBoards+="${boardHex},"
      fi
    else
      # If not a single number, something weird must have happened
      echo "RSPboard $boardHex: Error requesting active firmware version (communication error)"
      boardError=1
      errorBoards+="${boardHex},"
    fi

    # Next board
    board=$(( board + 1 ))
  done

  if [ $boardError -eq 1 ]; then
    echo "Board(s) $errorBoards have a communication problem; try reset the 48V"
  fi
}
#
# check_images
#
# Check if RSP images and TBB images are properly loaded, and start TBB
# recording if they are.
# Both checks poll once per second, for at most $timeout (60) rounds.
# Leaves the script with exit status 1 when the RSP images did not come up
# in time; a TBB failure is only reported.
#
check_images()
{
  if [ -e $BINDIR/rspctl ]; then
    # First make sure RSP images are properly loaded: they count as loaded
    # once the version output has no match for the pattern "0.0" anymore.
    echo "Waiting for RSP and TBB images to be initialized"
    timeout=60
    s=0
    while (( s < timeout ))
    do
      ( rspctl --version | grep "0.0" ) >& /dev/null
      rsu_ready=$?
      if [ $rsu_ready == 1 ]; then
        echo "RSP Images are loaded"
        break
      fi
      sleep 1
      s=$(( s + 1 ))
    done
    if [ $s == $timeout ]; then
      echo "Could not load RSP images in time; Reset RSP boards"
      exit 1
    fi
  fi

  if [ -e $BINDIR/tbbctl ]; then
    # Now make sure TBB images are properly loaded
    echo "Waiting for TBB images to be initialized"
    timeout=60
    s=0
    while (( s < timeout ))
    do
      tbb_respons=$(tbbctl --version)

      ( echo $tbb_respons | grep "\ V\ " ) >& /dev/null
      tbb_ready=$?
      if [ $tbb_ready -eq 0 ]; then
        sleep 10 # additional delay to let TBB boards end their init phase
        echo "TBB Images are loaded"
        break
      fi

      ( echo $tbb_respons | grep "TBBDriver is NOT responding" ) >& /dev/null
      tbb_down=$?
      if [ $tbb_down -eq 0 ]; then
        echo "TBBDriver is not responding; cannot continue start of TBBs"
        # Pretend the wait timed out, this triggers the message below
        s=$timeout
        break
      fi

      sleep 1
      s=$(( s + 1 ))
    done

    if [ $s == $timeout ]; then
      echo "Could not load TBB images; Reset TBB boards"
    elif [ -e $SBINDIR/startTBB.sh ]; then
      # Start TBB recording mode
      $SBINDIR/startTBB.sh
    fi
  fi
}
#
# start_prog prog [xASROOT [xWITHMPI]]
#
# Start the program when it exists in $BINDIR and is not running yet.
# The second and third argument carry a one-character prefix (goto_level
# passes them as x$asroot and x$withmpi); the prefix is stripped here, so
# a flag counts as set when anything follows the prefix.
# Reads the globals BINDIR, LOGDIR, logProgToSkip and user.
#
start_prog()
{
# make arguments readable
prog=$1
# ${N:1} drops the first character (the prefix) of the argument
asroot=${2:1}
# NOTE(review): withmpi is assigned but not used any further in this function
withmpi=${3:1}
# special check for logging-daemons
[ $prog == $logProgToSkip ] && return
# check existence: either the program itself or a wrapper script <prog>.sh
[ -x $BINDIR/$prog ] || [ -x $BINDIR/${prog}.sh ] || return
# if it is a shell script call the script
if [ -f $BINDIR/${prog}.sh ]; then
$BINDIR/${prog}.sh start
return
fi
# Check if program is already running
/sbin/pidof -x ${prog} 1>/dev/null 2>&1
if [ $? -ne 0 ]; then
# timestamp used in the name of the stdout file of the program
curdate=`date +%Y%m%dT%H%M%S`
# PVSS needs special treatment
if [ "$prog" = "PVSS00pmon" ]; then
echo Starting $prog
start_pvss2 1>/dev/null 2>&1 &
sleep 3
elif [ "$prog" = "SASGateway" ]; then
# Foreign stations not under central control should not
# connect to the SAS database; this prevents SAS
# maintenance etc.
if [ "$user" = "lofarsys" ]; then
echo Starting $prog
rm -f $LOGDIR/$prog.log*.? 1>/dev/null 2>&1
$BINDIR/$prog 1>>${LOGDIR}/${prog}.stdout.${curdate} 2>&1 &
else
echo "Local use, not starting $prog"
fi
else
if [ -n "$asroot" ]; then
# program must be started through sudo
echo Starting $prog
sudo rm -f $LOGDIR/$prog.log.? 1>/dev/null 2>&1
if [ "$prog" = "RSPDriver" ]; then
# load the images first; selectImage reports problems in $boardError
selectImage
if [ $boardError -eq 1 ]; then
# NOTE: leaves the current shell; goto_level calls start_prog inside
# a ( ... ) subshell, so there only that subshell ends
exit
fi
fi
if [ "$prog" = "TBBDriver" ]; then
# Check if RSPDriver is already running; if not, do not start either!
/sbin/pidof RSPDriver 1>/dev/null 2>&1
if [ $? -ne 0 ]; then
echo "RSPDriver not running, so not starting TBBDriver either"
exit
fi
fi
# sudo -b runs the program in the background
sudo -b $BINDIR/$prog 1>>$LOGDIR/$prog.stdout.${curdate} 2>&1
if [ "$prog" = "TBBDriver" ]; then
# wait for the RSP and TBB images and start TBB recording
check_images
fi
else
# start as the current user, in the background
echo Starting $prog
rm -f $LOGDIR/$prog.log*.? 1>/dev/null 2>&1
$BINDIR/$prog 1>>$LOGDIR/$prog.stdout.${curdate} 2>&1 &
fi
fi
# wait 0.25 s, then show the process line(s) of the program
usleep 250000
ps -ef | grep -v grep | egrep '[0-9][0-9] [a-zA-Z0-9/_.]*/'${prog}
fi
}
#
# stop_prog prog [xASROOT [xWITHMPI]]
#
# Stop the program when it is running.
# The second and third argument carry a one-character prefix (goto_level
# passes them as x$asroot and x$withmpi); the prefix is stripped here, so
# a flag counts as set when anything follows the prefix.
# Processes are stopped in steps: kill, kill -9, and for the users user0
# and lofarsys the same two steps again through sudo.
# Reads the globals BINDIR, logProgToSkip and user.
#
stop_prog()
{
# make arguments readable
prog=$1
# ${N:1} drops the first character (the prefix) of the argument
asroot=${2:1}
withmpi=${3:1}
# from here on $asroot is either empty or the command prefix "sudo"
[ ! -z "$asroot" ] && asroot=sudo
# special check for logging-daemons
[ $prog == $logProgToSkip ] && return
# check existence: either the program itself or a wrapper script <prog>.sh
[ -x $BINDIR/$prog ] || [ -x $BINDIR/${prog}.sh ] || return
# if it is a shell script call the script
if [ -f $BINDIR/${prog}.sh ]; then
$BINDIR/${prog}.sh stop
return
fi
# get processlist; nothing to do when the program is not running
/sbin/pidof -x ${prog} 1>/dev/null 2>&1
if [ $? -ne 0 ]; then
return
fi
# PVSS needs special treatment
if [ "$prog" = "PVSS00pmon" ]; then
echo "Stopping PVSS database"
start_pvss2 -stopWait
return
fi
# first handle mpi programs
if [ ! -z "$withmpi" ]; then
$asroot cexec :0-11 killall -9 ${prog}
return
fi
# first try normal kill
for pid in `/sbin/pidof -x ${prog}`
do
echo "Softly killing ${prog}(${pid})"
$asroot kill $pid 1>/dev/null 2>&1
usleep 500000
done
# when normal kill did not work, kill it with -9
# (pidof is evaluated again, so only processes that are still alive are hit)
for pid in `/sbin/pidof -x ${prog}`
do
echo "Hard killing ${prog}(${pid})"
$asroot kill -9 $pid 1>/dev/null 2>&1
usleep 500000
done
# if user0 or lofarsys, try normal kill as root
for pid in `/sbin/pidof -x ${prog}`
do
if [ "$user" == "user0" -o "$user" == "lofarsys" ]; then
sudo kill $pid 1>/dev/null 2>&1
usleep 50000
fi
done
# if user0 or lofarsys, try hard kill as root
for pid in `/sbin/pidof -x ${prog}`
do
if [ "$user" == "user0" -o "$user" == "lofarsys" ]; then
sudo kill -9 $pid 1>/dev/null 2>&1
usleep 50000
fi
done
# if still alive, write a message
for pid in `/sbin/pidof -x ${prog}`
do
echo -n "Could not kill ${prog}(${pid}); "
if [ "$user" == "user0" -o "$user" == "lofarsys" ]; then
echo "tried it as root as well, giving up..."
else
echo "probably run by another user, contact your system administrator"
fi
done
}
#
# status_prog
#
# Show the status of all programs listed in $LEVELTABLE that are installed
# in $BINDIR: one line per program with its process ID(s) or DOWN.
#
# Globals read:    LEVELTABLE, BINDIR, logProgToSkip, show_users, level
# Globals written: highest_level_running - level of the last listed program
#                                          that was found running (else 0)
#                  missing - programs of a level <= $level that are down
#                  toomany - programs with more than one instance that
#                            should have only one
# The result lists are rebuilt on every call, so calling this function
# more than once does not produce duplicate entries.
#
status_prog()
{
  local -a list pid obsid pid_user
  local line levelnr prog prevlevel apid i

  echo
  echo Status of all software level:
  highest_level_running=0
  prevlevel=1
  missing=""
  toomany=""

  # Strip comments and empty lines; the remaining entries are split on
  # whitespace on purpose (one table entry per word).
  list=( $(cut -d"#" -f1 "$LEVELTABLE" | awk 'NF>0') )
  for line in "${list[@]}"
  do
    # expected process and swlevel it should run in
    levelnr=$(echo "$line" | cut -d":" -f1)
    prog=$(echo "$line" | cut -d":" -f6)
    pid=("")

    # skip entries without a program name
    [ -n "$prog" ] || continue
    # special check for logging-daemons
    [ "$prog" == "$logProgToSkip" ] && continue
    # check existence
    [ -x "$BINDIR/$prog" ] || [ -x "$BINDIR/${prog}.sh" ] || continue

    if [ "$prevlevel" -ne "$levelnr" ]; then
      echo "---"
      prevlevel=$levelnr
    fi

    # if it is a shell script call the script
    if [ -f "$BINDIR/${prog}.sh" ]; then
      "$BINDIR/${prog}.sh" status "$levelnr"
      continue
    fi

    # find out the processID(s) of the possibly running process; a single
    # pidof call avoids a process ending between a check and the listing
    obsid=()
    pid_user=()
    pid=( $(/sbin/pidof -x "${prog}" 2>/dev/null) )
    if [ ${#pid[@]} -gt 0 ]; then
      i=0
      for apid in "${pid[@]}"
      do
        # text between { and } on the command line of the process
        obsid[i]=$(ps -p "$apid" --no-heading -o command | awk -F{ '{print $2}' | awk -F} '{print $1}')
        if [ "$show_users" -eq 1 ]; then
          pid_user[i]=$(ps -p "$apid" -o user=)
        fi
        i=$(( i + 1 ))
      done
      # If a program is running in a level higher than the level
      # that should be active, raise the active level to indicate
      # this.
      highest_level_running=$levelnr
    else
      pid=("DOWN")
    fi

    if [ "$pid" != "DOWN" ] && [ ${#obsid[0]} != 0 ]; then
      echo ${levelnr}:${prog}:${pid[*]}:${obsid[*]} | awk -F: '{ printf "%s : %-25s %s [ObsID: %s]\n", $1, $2, $3, $4 }'
    elif [ "$pid" != "DOWN" ] && [ "${show_users}" -eq 1 ]; then
      echo ${levelnr}:${prog}:${pid[*]}:${pid_user[*]} | awk -F: '{ printf "%s : %-25s %s [%s]\n", $1, $2, $3, $4 }'
    else
      echo ${levelnr}:${prog}:${pid[*]} | awk -F: '{ printf "%s : %-25s %s\n", $1, $2, $3}'
    fi

    # Some Checks
    # Controllers must have one instance, only. Some programs may have more instances.
    if [ ${#pid[@]} -ge 2 ]; then
      case "$prog" in
        ObservationControl|PythonControl|OnlineControl)
          ;;
        *)
          toomany="$toomany ${prog}[$levelnr]"
          ;;
      esac
    fi

    # Check for missing controllers
    if [ "$pid" = "DOWN" ] || [ "$pid" = "0" ]; then
      if [ "$levelnr" -le "$level" ]; then
        if [ "$levelnr" -le 5 ]; then
          missing="$missing ${prog}[$levelnr]"
        else
          case "$prog" in
            # LCU level 6 has two permanent controllers running,
            # MCU level 6 must have MACScheduler running
            StationControl|ClockControl|MACScheduler)
              missing="$missing ${prog}[$levelnr]"
              ;;
          esac
        fi
      fi
    fi
  done

  echo "---"
  # The lists start with a space; quoting keeps names like prog[1] from
  # being expanded as a glob pattern.
  if [ -n "$missing" ]; then
    echo "Missing :${missing}"
  fi
  if [ -n "$toomany" ]; then
    echo "Too many:${toomany}"
  fi
}
#
# goto_level levelnr
#
# Bring the software to level $levelnr: first stop the programs of all
# levels above it (from level 6 downwards), then start the programs of
# level 1 up to and including $levelnr.
# The programs per level are taken from $LEVELTABLE; of each line the part
# before a '#' is used, and fields 4, 5 and 6 (separated by ':') are passed
# on as asroot flag, withmpi flag and program name.
#
goto_level()
{
#first power down to new level
newlevel=$1
# set rcumode to 0 (power save) when entering level 1
if [ ${newlevel} -le 1 ]; then
# level that was stored by a previous run, -1 when unknown
if [ -e /tmp/level.admin ]; then
curlevel=`cat /tmp/level.admin`
else
curlevel=-1
fi
if [ ${curlevel} -ge 2 ]; then
/sbin/pidof RSPDriver 1>/dev/null 2>&1
if [ $? -eq 0 ]; then
# status is the exit status of grep: 1 when the version output of rspctl
# has no match for "0.0"
status=`( rspctl --version | grep "0.0" ) >& /dev/null; echo $?`
if [ $status == 1 ]; then
echo "set rcumode to 0 (power save)"
rspctl --rcumode=0 1>/dev/null 2>&1
# Wait for setting to take effect before killing RSPDriver
sleep 2
else
echo "Beware: NOT going to rcumode 0 as images are still being initialized"
fi
fi
if [ -e $SBINDIR/stopTBB.sh ]; then
echo "Stopping TBB recording mode"
$SBINDIR/stopTBB.sh
fi
fi
fi
# stop the programs of level 6 down to newlevel+1; only lines containing
# ":d:" are used and tac makes that the table is handled bottom-up
for (( l=6 ; l>newlevel ; l-- ))
do
tac $LEVELTABLE | cut -d"#" -f1 | awk '{ if (NF>0) print $0 }' | \
grep "^${l}:" | grep ":d:" | while read line
do
(
asroot=`echo $line | cut -d":" -f4`
withmpi=`echo $line | cut -d":" -f5`
program=`echo $line | cut -d":" -f6`
# the x prefix keeps an empty flag from disappearing as argument
stop_prog $program x$asroot x$withmpi
) <&- # cant have programs reading from stdin
# as that would mess up 'while read line'
done
done
# then power up to new level; only lines containing ":u:" are used
for (( l=1 ; l<=newlevel ; l++ ))
do
# Start programs for level $l
cat $LEVELTABLE | cut -d"#" -f1 | awk '{ if (NF>0) print $0 }' | grep "^${l}:" | grep ":u:" | while read line
do
# NOTE: start_prog runs in a subshell, so an exit in start_prog only
# ends that subshell and the loop continues with the next program
(
asroot=`echo $line | cut -d":" -f4`
withmpi=`echo $line | cut -d":" -f5`
program=`echo $line | cut -d":" -f6`
start_prog $program x$asroot x$withmpi
) <&- # cant have programs reading from stdin
# as that would mess up 'while read line'
done
done
}
#
# show_level [s|S]
#
# Show the currently set level and always leave the script.
# The level is read from /tmp/level.admin, raised to the highest level that
# status_prog found running, and negated when status_prog reported missing
# programs. Without /tmp/level.admin the level is -1.
#   S        : only print the number
#   s        : print the number with a leading text
#   no value : as s, followed by the status list of status_prog
# Exit status: 0 for s/S when the level is known, else the level itself.
#
show_level()
{
  if [ ! -e /tmp/level.admin ]; then
    level=-1
    if [ "$1" != "S" ]; then
      echo "Currently set level unknown"
    fi
  else
    level=$(cat /tmp/level.admin)
    status_prog >& /dev/null
    if [ $highest_level_running -gt $level ]; then
      # something runs above the stored level: evaluate against that level
      level=$highest_level_running
      status_prog >& /dev/null
    fi
    if [ -n "$missing" ]; then
      let level=0-$level
    fi
    [ "$1" = "S" ] || echo -n "Currently set level is "
    echo $level
    case "$1" in
      s|S)
        exit 0
        ;;
    esac
  fi

  # argument -s/-S only returns level, no list
  if [ -z "$1" ]; then
    status_prog
  fi
  exit $level
}
#
# print_level [p|P]
#
# Print the level that was last stored in /tmp/level.admin and always leave
# the script.
#   P : only print the number (nothing at all when no level was stored)
#   p : print the number with a leading text
# Exit status: 0 for p/P when a level was stored; the level itself when
# called without p/P; -1 when no level was stored.
#
print_level()
{
  if [ -e /tmp/level.admin ]; then
    level=$(cat /tmp/level.admin)
    if [ "$1" != "P" ]; then
      echo -n "Last set level is "
    fi
    echo "$level"
    if [ "$1" = "P" ] || [ "$1" = "p" ]; then
      exit 0
    fi
  else
    level=-1
    # -P must only print a number, so stay silent here
    if [ "$1" != "P" ]; then
      echo "Last set level unknown"
    fi
  fi
  exit $level
}
#
# show_lofar_version [v|V]
#
# Print the contents of $LOFARROOT/Version.txt and leave the script.
#   v           : print the version with a leading text, or a message when
#                 the file does not exist
#   other value : only print the version, nothing when the file is missing
# Exit status: 0 when a version was found, else -1.
#
show_lofar_version()
{
  version="-1"
  if [ -e $LOFARROOT/Version.txt ]; then
    version=$(cat $LOFARROOT/Version.txt)
    if [ "$1" = "v" ]; then
      printf '%s' "Current LOFAR version is "
    fi
    echo $version
  elif [ "$1" = "v" ]; then
    echo "Current LOFAR version unknown"
  fi

  case "$version" in
    -1)
      exit $version
      ;;
    *)
      exit 0
      ;;
  esac
}
#
# handle_args [options] [level]
#
# Handle the command line arguments of the script.
# The options -u/-U, -s/-S, -p/-P, -v/-V, -h, -q and -r are carried out
# directly and leave the script. Otherwise the following globals are set:
#   imageForced, image : set by -i x
#   level              : from -l x, else from the first remaining argument,
#                        else -1
# Users other than lofarsys are limited to level 3.
#
handle_args()
{
  # getopts must start at the first argument of this function
  local OPTIND=1
  local flag

  # Handle arguments
  if [ ${#} -gt 1 ]; then
    if [[ $1 != \-* ]]; then
      echo "Warning: all arguments except level $1 will be ignored"
    fi
  fi

  while getopts "hUuSsPpVvi:l:q:r:" flag
  do
    case "$flag" in
      [uU])
        show_users=1
        show_level
        ;;
      [sS])
        show_level $flag
        ;;
      [pP])
        print_level $flag
        ;;
      [vV])
        show_lofar_version $flag
        ;;
      i)
        imageForced=1
        image=$OPTARG
        # This is needed to be able to retrieve the requested swlevel
        # when it is not provided with option -l
        shift $((OPTIND-1)); OPTIND=1
        ;;
      q)
        procesname=$OPTARG
        stop_prog "$procesname"
        exit
        ;;
      r)
        procesname=$OPTARG
        start_prog "$procesname"
        exit
        ;;
      l)
        level=$OPTARG
        ;;
      h)
        SyntaxError
        ;;
      *)
        exit
        ;;
    esac
  done

  if [ -z "$level" ]; then
    if [ -n "$1" ]; then
      level=$1
    else
      level=-1
    fi
  fi

  # Only compare numerically when the level is a number; any other value
  # is left untouched for the caller to reject.
  if [ "$user" != "lofarsys" ] && [[ "$level" =~ ^-?[0-9]+$ ]] && [ "$level" -gt 3 ]; then
    echo "Will only start up to level 3 as this appears to be local use"
    level=3
  fi
  return
}
#
# MAIN
#
# Without arguments only the current level is shown. Otherwise the
# arguments are handled, the software is brought to the requested level,
# the resulting status is shown and the level is stored in /tmp/level.admin
# and logged in ${LOGDIR}/swlevel.log. The exit status is the level.
#

# Find out if we are running on a PVSS system
# Note: on PVSS systems LoggingClient must be ignored,
# On non-PVSS system LoggingProcessor.
logProgToSkip=LoggingProcessor
if [ -f "${BINDIR}/PVSS00pmon" ]; then
  logProgToSkip=LoggingClient
fi

# All users can ask for current level
show_users=0
if [ -z "$1" ]; then
  show_level
fi

user=$(id | cut -d'(' -f2 | cut -d')' -f1)
group=$(groups | awk '{print $1}')
imageForced=0
# "$@" passes every argument on unchanged, also when it holds whitespace
handle_args "$@"

# All other options that act on the station status are for lofarsys only
# Don't allow root to run swlevel because all logfile get root access.
if [ "$LOFARROOT" == "/opt/lofar" ] && [ "$user" != "lofarsys" ] && [ "$group" != "local" ]; then
  echo "swlevel must be run by user lofarsys or group local members!"
  exit
fi

# Only the levels 0 up to 6 are valid
case "$level" in
  0|1|2|3|4|5|6)
    ;;
  *)
    SyntaxError
    ;;
esac

# first power down to this level
echo Going to level $level
cwd=$(pwd)
cd "${BINDIR}"
goto_level $level
cd "${cwd}"

status_prog
if [ "$highest_level_running" -gt "$level" ]; then
  echo "Could not go to level $level. Level is $highest_level_running"
fi

# save for later
echo $level > /tmp/level.admin
date=$(date '+%Y-%m-%d %H:%M:%S')
# quoted, as the leading [...] would otherwise be taken as a glob pattern
echo "[${date}]:$0 $*" >> "${LOGDIR}/swlevel.log"
exit $level