Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
swlevel 19.66 KiB
#!/bin/bash
#
# swlevel : bring software on node in certain runlevel
#
# Copyright (C) 2006
# ASTRON (Netherlands Foundation for Research in Astronomy)
# P.O.Box 2, 7990 AA Dwingeloo, The Netherlands, softwaresupport@astron.nl
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# Syntax: swlevel [ 0 | 1 | 2 | 3 | 4 | 5 | 6 ]
#
# $Id$
#

# Fall back to the default installation root until all MAC controlled
# systems provide $LOFARROOT themselves.
: "${LOFARROOT:=/opt/lofar}"

# Directories and files used below, all relative to the installation root.
BINDIR="${LOFARROOT}/bin"
SBINDIR="${LOFARROOT}/sbin"
LOGDIR="${LOFARROOT}/var/log"
ETCDIR="${LOFARROOT}/etc"
LEVELTABLE="${ETCDIR}/swlevel.conf"

# Everything created from here on must be writable by user and group
# (needed for Int. Stations).
umask 002

#
# SyntaxError [msg]
#
# Print the usage text on stdout, preceded by "ERROR: msg" when a message
# is given, and end the script with exit status 0.
#
SyntaxError()
{
  Msg=$1

  if [ -n "${Msg}" ]; then
    echo "ERROR: ${Msg}"
  fi

  # One argument per output line; the empty strings are the blank lines.
  printf '%s\n' \
    "" \
    "Syntax: $(basename $0) [-s/S/v/V/u/U] [-i x] [-r/q processname] [-l][ 0 |1 |2 |3 |4 |5 |6 ]" \
    "-s: Show current level, exit (-S: only print number)" \
    "-p: Print last set level from logfile (-P: only print number)" \
    "-v: Show running LOFAR version, exit (-V: only print version)" \
    "-u: Show users owning running processes (-U: same)" \
    "-i: Load RSP firmware from image [x] (default image is 1)" \
    "-l: Set to level as provided (optional)" \
    "-q: Quit process with name processname" \
    "-r: Run process with name processname" \
    "" \
    "Levels:" \
    "0:  Stop all lofar software" \
    "1:  Run Lofar daemons and PVSS" \
    "2:  Run Lowlevel hardware drivers" \
    "3:  Run Calculation services" \
    "4:  Run Hardware and software monitors" \
    "5:  Run System Health Management" \
    "6:  MAC is controlling the software" \
    ""
  exit 0
}

# Commands that need a time limit are run through the run_timeout.sh helper
# (not defined in this file): its first argument is the timeout in seconds,
# the next arguments are the command and its parameters.


# Find which image to load on a given station; uses file
# ${ETCDIR}/RSPImage.conf
#
# Arguments: $1 - station name, matched against the first column of the file
# Returns:   $image (global) holds the image number from the second column
#
# Ends the script with exit status 1 when the file does not exist or when it
# holds no usable (numeric, non-zero) image number for the station.

findImage()
{
       findstation=$1
       RSPImageFile=${ETCDIR}/RSPImage.conf
       if [ ! -e "$RSPImageFile" ] ; then
          echo "Cannot find Image file ${RSPImageFile}"
          exit 1
       fi
       image=0
       local first second rest
       # Let read split the columns on whitespace; the extra test keeps a
       # last line without trailing newline from being skipped.
       while read -r first second rest || [ -n "$first" ]
       do
         if [ "$first" == "$findstation" ]; then
            image=$second
            break
         fi
       done < "$RSPImageFile"
       # A missing, non-numeric or zero image number all mean that no image
       # is known for this station; comparing them with -eq would make the
       # test itself fail and let the caller continue without an image.
       if ! [[ $image =~ ^[0-9]+$ ]] || (( 10#$image == 0 )); then
           echo "Could not find station $findstation in file ${RSPImageFile}"
           exit 1
       fi
       return 0
}

#
# selectImage(); load an image on the RSP boards
#
# The number of boards is read from the RSPBOARDS line in
# ${ETCDIR}/RemoteStation.conf; the address of the first board is the fifth
# ':'-separated field of RSPDriver.MAC_ADDR_0 in ${ETCDIR}/RSPDriver.conf,
# read as a hexadecimal number.
# When $image is empty and no image was forced, findImage looks it up for
# this host.
# Sets $boardError to 1 and collects the addresses of failing boards in
# $errorBoards when a board does not answer or cannot be loaded.
#
selectImage()
{
  let nrRSPs=$(grep RSPBOARDS ${ETCDIR}/RemoteStation.conf | cut -d'=' -f2 | sed 's/ //g')
  let offset=0x$(grep RSPDriver.MAC_ADDR_0 ${ETCDIR}/RSPDriver.conf | cut -d'=' -f2 | cut -d':' -f5)
  let board=0

  # Assume no errors with board communication
  boardError=0
  errorBoards=""

  # Make sure we have an image number in parameter $image
  if [ -z $image ]; then
    if [ $imageForced -eq 0 ]; then
      findImage $(hostname -s)
    fi
  fi

  while [ $board -lt $nrRSPs ]
  do
    # Two-digit hex address of this board, counted from the first board
    printf -v boardHex '%02x' $(( offset + board ))
    # Uncomment next lines only for testing purposes!
    #if [ "$boardHex" == "03" ]; then
    #   boardHex="1F"
    #fi

    # get major version of active firmware on RSPboard $board
    rsu=$(run_timeout.sh 5 sudo ${SBINDIR}/rsuctl3 -m 10:fa:00:00:$boardHex:00 -qV 2>&1 | grep BP | cut -d':' -f2 | sed 's/ //g' | cut -d'.' -f1)

    if [ ${#rsu} -eq 1 ]; then
      echo "Loading image $image on RSPboard $boardHex ..."
      if ! run_timeout.sh 5 sudo ${SBINDIR}/rsuctl3_reset -m 10:fa:00:00:$boardHex:00 -q -x -p $image 1>/dev/null 2>&1; then
        boardError=1
        errorBoards="${errorBoards}${boardHex},"
      fi
    else
      # If not a single number, something weird must have happened
      echo "RSPboard $boardHex: Error requesting active firmware version (communication error)"
      boardError=1
      errorBoards="${errorBoards}${boardHex},"
    fi

    # Next board
    let board+=1
  done

  if [ $boardError -eq 1 ]; then
    echo "Board(s) $errorBoards have a communication problem; try reset the 48V"
  fi
}

#
# Check if RSP images and TBB images are properly loaded, and start TBB
# recording if they are
#
# Each check is only done when the matching control program exists in
# $BINDIR and polls once per second for at most 60 rounds.
# Ends the script with exit status 1 when the RSP images do not come up in
# time; a TBB failure only produces a message.
#
check_images()
{
  if [ -e $BINDIR/rspctl ]; then
    # First make sure RSP images are properly loaded
    # Introduce a timeout of 60 sec for images to initialize
    echo "Waiting for RSP and TBB images to be initialized"
    timeout=60
    for (( s=0 ; s<timeout; s++ ))
    do
      # grep status 1 means "0.0" no longer shows up in the version output
      { rspctl --version | grep "0.0"; } >& /dev/null
      rsu_ready=$?
      if [[ $rsu_ready == 1 ]]; then
        echo "RSP Images are loaded"
        break
      fi
      sleep 1
    done
    if (( s == timeout )); then
      echo "Could not load RSP images in time; Reset RSP boards"
      exit 1
    fi
  fi

  if [ -e $BINDIR/tbbctl ]; then
    # Now make sure TBB images are properly loaded
    # Introduce a timeout of 60 sec for images to initialize
    echo "Waiting for TBB images to be initialized"
    timeout=60
    for (( s=0 ; s<timeout; s++ ))
    do
      tbb_respons=$(tbbctl --version)

      { echo $tbb_respons | grep "\ V\ "; } >& /dev/null
      tbb_ready=$?
      if [ $tbb_ready -eq 0 ]; then
        sleep 10 # additional delay to let TBB boards end their init phase
        echo "TBB Images are loaded"
        break
      fi

      { echo $tbb_respons | grep "TBBDriver is NOT responding"; } >& /dev/null
      tbb_down=$?
      if [ $tbb_down -eq 0 ]; then
        echo "TBBDriver is not responding; cannot continue start of TBBs"
        # Pretend the timeout expired so the message below is printed
        s=$timeout
        break
      fi
      sleep 1
    done
    if (( s == timeout )); then
      echo "Could not load TBB images; Reset TBB boards"
    else
      # Start TBB recording mode
      if [ -e $SBINDIR/startTBB.sh ]; then
        $SBINDIR/startTBB.sh
      fi
    fi
  fi
}

#
# Start the program when it exists
#
# Arguments:
#   $1 - name of the program (looked up in $BINDIR)
#   $2 - root flag with one leading character that is stripped here
#        (goto_level passes "x" + the field from the level table); any
#        remaining text means: start through sudo
#   $3 - mpi flag, passed the same way; read but not used in this function
#
# Nothing is done for the logging daemon named in $logProgToSkip, for
# programs that are not installed, or for programs that pidof reports as
# already running. May end the whole script (exit) when the RSP boards
# cannot be loaded or when TBBDriver is requested without RSPDriver running.
#
start_prog()
{
	# make arguments readable
	prog=$1
	asroot=${2:1}
	withmpi=${3:1}

	# special check for logging-daemons
	[ $prog == $logProgToSkip ] && return
	
	# check existence of the binary or of a wrapper script
	[ -x $BINDIR/$prog ] || [ -x $BINDIR/${prog}.sh ] || return

	# if it is a shell script call the script
	# (the wrapper gets "start" as its only argument)
	if [ -f $BINDIR/${prog}.sh ]; then
		$BINDIR/${prog}.sh start
		return
	fi

	# Check if program is already running
	/sbin/pidof -x ${prog} 1>/dev/null 2>&1
	if [ $? -ne 0 ]; then
		# timestamp used in the name of the stdout file of the new process
		curdate=`date +%Y%m%dT%H%M%S`
		# PVSS needs special treatment
		if [ "$prog" = "PVSS00pmon" ]; then 
		    echo Starting $prog
		    start_pvss2 1>/dev/null 2>&1 &
		    sleep 3
		elif [ "$prog" = "SASGateway" ]; then  
		    # Foreign stations not under central control should not
		    # connect to the SAS database; this prevents SAS main-
		    # tenance etc.
		    if [ "$user" = "lofarsys" ]; then
			echo Starting $prog
			rm -f $LOGDIR/$prog.log*.? 1>/dev/null 2>&1
			$BINDIR/$prog 1>>${LOGDIR}/${prog}.stdout.${curdate} 2>&1 &
                    else
                        echo "Local use, not starting $prog"
                    fi		    
  		else
		    if [ -n "$asroot" ]; then
		       	echo Starting $prog
		       	sudo rm -f $LOGDIR/$prog.log.? 1>/dev/null 2>&1
		       	# RSPDriver: load the firmware image first and give up
		       	# when any board reported an error
		       	if [ "$prog" = "RSPDriver" ]; then
			   selectImage
			   if [ $boardError -eq 1 ]; then
	 			exit
			   fi
		       	fi
                        if [ "$prog" = "TBBDriver" ]; then 
			   # Check if RSPDriver is already running; if not, do not start either!
			   /sbin/pidof RSPDriver 1>/dev/null 2>&1
			   if [ $? -ne 0 ]; then
				echo "RSPDriver not running, so not starting TBBDriver either"
				exit
			   fi
                        fi
		       	# sudo -b puts the program in the background itself
		       	sudo -b $BINDIR/$prog 1>>$LOGDIR/$prog.stdout.${curdate} 2>&1
                       # after starting TBBDriver, wait for the images and
                       # start TBB recording
                       if [ "$prog" = "TBBDriver" ]; then 
                           check_images
		       fi
		    else

			echo Starting $prog
			rm -f $LOGDIR/$prog.log*.? 1>/dev/null 2>&1
			$BINDIR/$prog 1>>$LOGDIR/$prog.stdout.${curdate} 2>&1 &
		    fi
		fi
		# short pause, then show the process list entry of the program
		usleep 250000
		ps -ef | grep -v grep | egrep '[0-9][0-9] [a-zA-Z0-9/_.]*/'${prog}
	fi
}

#
# Stop the program when it is running
#
# Arguments:
#   $1 - name of the program (looked up in $BINDIR)
#   $2 - root flag behind one leading character that is stripped here; any
#        remaining text means: kill through sudo
#   $3 - mpi flag, passed the same way; when set the program is killed on
#        the nodes addressed by "cexec :0-11"
#
# The return status is that of the last command run, as before; the -q
# option of swlevel exits with it.
#
stop_prog()
{
  # make arguments readable
  prog=$1
  asroot=${2:1}
  withmpi=${3:1}
  if [ -n "$asroot" ]; then
    asroot=sudo
  fi

  # special check for logging-daemons
  if [ $prog == $logProgToSkip ]; then
    return 0
  fi

  # nothing to stop when neither binary nor wrapper script is installed
  [ -x $BINDIR/$prog ] || [ -x $BINDIR/${prog}.sh ] || return

  # a wrapper script stops the program itself
  if [ -f $BINDIR/${prog}.sh ]; then
    $BINDIR/${prog}.sh stop
    return
  fi

  # not running: done
  if ! /sbin/pidof -x ${prog} 1>/dev/null 2>&1; then
    return 0
  fi

  # PVSS needs special treatment
  if [ "$prog" = "PVSS00pmon" ]; then
    echo "Stopping PVSS database"
    start_pvss2 -stopWait
    return
  fi

  # mpi programs are killed on the nodes in one go
  if [ -n "$withmpi" ]; then
    $asroot cexec :0-11 killall -9 ${prog}
    return
  fi

  # first try normal kill
  for pid in $(/sbin/pidof -x ${prog})
  do
    echo "Softly killing ${prog}(${pid})"
    $asroot kill $pid 1>/dev/null 2>&1
    usleep 500000
  done

  # when normal kill did not work, kill it with -9
  for pid in $(/sbin/pidof -x ${prog})
  do
    echo "Hard killing ${prog}(${pid})"
    $asroot kill -9 $pid 1>/dev/null 2>&1
    usleep 500000
  done

  # if user0 or lofarsys, try normal kill as root
  for pid in $(/sbin/pidof -x ${prog})
  do
    case "$user" in
      user0|lofarsys)
        sudo kill $pid 1>/dev/null 2>&1
        usleep 50000
        ;;
    esac
  done

  # if user0 or lofarsys, try hard kill as root
  for pid in $(/sbin/pidof -x ${prog})
  do
    case "$user" in
      user0|lofarsys)
        sudo kill -9 $pid 1>/dev/null 2>&1
        usleep 50000
        ;;
    esac
  done

  # if still alive, write a message
  for pid in $(/sbin/pidof -x ${prog})
  do
    echo -n "Could not kill ${prog}(${pid}); "
    case "$user" in
      user0|lofarsys)
        echo "tried it as root as well, giving up..."
        ;;
      *)
        echo "probably run by another user, contact your system administrator"
        ;;
    esac
  done

}

#
# show status of program
#
# Prints one line for every installed program listed in $LEVELTABLE with
# its level, name and process ids (or DOWN), followed by the list of
# missing programs and of programs that run more than once.
#
# Globals read:    LEVELTABLE, BINDIR, logProgToSkip, show_users, level
# Globals written: highest_level_running - level of the last listed program
#                                          that was found running
#                  missing               - " prog[level]" for every program
#                                          that should run in $level but is
#                                          down
#                  toomany               - " prog[level]" for every program
#                                          with more than one instance
#
status_prog()
{
  echo
  echo Status of all software level:
  highest_level_running=0
  prevlevel=1
  # Every scan starts with empty result lists; show_level runs this function
  # several times and would otherwise report each program more than once.
  missing=""
  toomany=""
  list=( $(cat $LEVELTABLE | cut -d"#" -f1 | awk '{ if (NF>0) print $0 }') )
  for line in ${list[@]}
  do
    # expected process and swlevel it should run in
    levelnr=$(echo "$line" | cut -d":" -f1)
    prog=$(echo "$line" | cut -d":" -f6)
    pid=("")

    # special check for logging-daemons
    [ "$prog" == "$logProgToSkip" ] && continue

    # check existence
    [ -x $BINDIR/$prog ] || [ -x $BINDIR/${prog}.sh ] || continue

    # separator line between the levels
    if [ $prevlevel -ne $levelnr ]; then
      echo "---"
      prevlevel=$levelnr
    fi

    # if it is a shell script call the script
    if [ -f $BINDIR/${prog}.sh ]; then
      $BINDIR/${prog}.sh status $levelnr
      continue
    fi

    # find out the processID of the possibly (running) process; a single
    # pidof call so the list cannot change between test and use
    obsid=()
    pid_user=()
    pid=( $(/sbin/pidof -x ${prog} 2>/dev/null) )
    if [ ${#pid[@]} -gt 0 ]; then
      i=0
      for apid in ${pid[@]}
      do
        # text between { and } on the command line of the process
        obsid[i]=$(ps -p $apid --no-heading -o command | awk -F{ '{print $2}' | awk -F} '{print $1}')
        if [ $show_users -eq 1 ]; then
          pid_user[i]=$(ps -p $apid -o user=)
        fi
        i=$((i+1))
      done
      # If a program is running in a level higher than the level
      # that should be active, raise the active level to indicate
      # this.
      highest_level_running=$levelnr
    else
      pid="DOWN"
    fi

    if [ "$pid" != "DOWN" ] && [ ${#obsid[0]} != 0 ]; then
      echo ${levelnr}:${prog}:${pid[*]}:${obsid[*]} | awk -F: '{ printf "%s : %-25s %s [ObsID: %s]\n", $1, $2, $3, $4 }'
    elif [ "$pid" != "DOWN" ] && [ ${show_users} -eq 1 ]; then
      echo ${levelnr}:${prog}:${pid[*]}:${pid_user[*]} | awk -F: '{ printf "%s : %-25s %s [%s]\n", $1, $2, $3, $4 }'
    else
      echo ${levelnr}:${prog}:${pid[*]} | awk -F: '{ printf "%s : %-25s %s\n", $1, $2, $3}'
    fi

    # Some Checks
    # Controllers must have one instance, only. Some programs may have more instances.
    if [ ${#pid[@]} -ge 2 ]; then
      if [ "$prog" != "ObservationControl" \
           -a "$prog" != "PythonControl" \
           -a "$prog" != "OnlineControl" ]; then
        toomany="$toomany ${prog}[$levelnr]"
      fi
    fi

    # Check for missing controllers
    if [ "$pid" = "DOWN" -o "$pid" = "0" ]; then
      if [ $levelnr -le $level ]; then
        if [ $levelnr -le 5 ]; then
          missing="$missing ${prog}[$levelnr]"
        else
          # LCU level 6 has two permanent controllers running
          if [ "$prog" == "StationControl" \
               -o "$prog" == "ClockControl" ]; then
            missing="$missing ${prog}[$levelnr]"
          fi
          # MCU level 6 must have MACScheduler running
          if [ "$prog" == "MACScheduler" ]; then
            missing="$missing ${prog}[$levelnr]"
          fi
        fi
      fi
    fi
  done
  echo "---"
  # Quoted so that entries such as prog[1] are never taken as a file pattern
  if [ "$missing" ]; then
    echo "Missing :${missing}"
  fi
  if [ "$toomany" ]; then
    echo "Too many:${toomany}"
  fi
}

#
# goto_level levelnr
#
# Bring the software to level $1: first stop, from level 6 downwards, the
# programs of all levels above the new one (lines of $LEVELTABLE holding
# ":d:"), then start, from level 1 upwards, the programs of all levels up
# to and including the new one (lines holding ":u:").
# Fields 4, 5 and 6 of a level table line are the root flag, the mpi flag
# and the program name; both flags are passed on behind an "x" so an empty
# field still gives an argument.
#
goto_level()
{
  newlevel=$1

  # Going to level 0 or 1 while /tmp/level.admin says level 2 or higher:
  # set rcumode to 0 (power save) and stop TBB recording first.
  if [ ${newlevel} -le 1 ]; then
    curlevel=-1
    if [ -e /tmp/level.admin ]; then
      curlevel=$(cat /tmp/level.admin)
    fi
    if [ ${curlevel} -ge 2 ]; then
      if /sbin/pidof RSPDriver 1>/dev/null 2>&1; then
        # status 1: "0.0" is not in the version output
        { rspctl --version | grep "0.0"; } >& /dev/null
        status=$?
        if [ $status == 1 ]; then
          echo "set rcumode to 0 (power save)"
          rspctl --rcumode=0 1>/dev/null 2>&1
          # Wait for setting to take effect before killing RSPDriver
          sleep 2
        else
          echo "Beware: NOT going to rcumode 0 as images are still being initialized"
        fi
      fi
      if [ -e $SBINDIR/stopTBB.sh ]; then
        echo "Stopping TBB recording mode"
        $SBINDIR/stopTBB.sh
      fi
    fi
  fi

  # first power down to the new level; tac gives the lines of a level in
  # reverse order
  for (( l=6 ; l>newlevel ; l-- ))
  do
    tac $LEVELTABLE | cut -d"#" -f1 | awk 'NF>0' \
      | grep "^${l}:" | grep ":d:" \
      | while read line
        do
          # stdin is closed for the program: it must not eat the lines
          # that 'while read line' still has to process
          (
            asroot=$(echo $line | cut -d":" -f4)
            withmpi=$(echo $line | cut -d":" -f5)
            program=$(echo $line | cut -d":" -f6)
            stop_prog $program x$asroot x$withmpi
          ) <&-
        done
  done

  # then power up to the new level
  for (( l=1 ; l<=newlevel ; l++ ))
  do
    # Start programs for level $l
    cut -d"#" -f1 $LEVELTABLE | awk 'NF>0' \
      | grep "^${l}:" | grep ":u:" \
      | while read line
        do
          # stdin is closed for the same reason as above
          (
            asroot=$(echo $line | cut -d":" -f4)
            withmpi=$(echo $line | cut -d":" -f5)
            program=$(echo $line | cut -d":" -f6)
            start_prog $program x$asroot x$withmpi
          ) <&-
        done
  done
}


# show_level [s|S]
#
# Report the current level and end the script.
# The level comes from /tmp/level.admin, is raised to the level found
# running by status_prog and is made negative when programs are missing.
#   no argument: print the level line and the full status list, exit with
#                the level as status
#   s:           print the level line only
#   S:           print the number only
# Without /tmp/level.admin the level is -1.
show_level()
{
        if [ ! -e /tmp/level.admin ]; then
          level=-1
          if [ "$1" != "S" ]; then
             echo "Currently set level unknown"
          fi
        else
          level=$(cat /tmp/level.admin)
          status_prog >& /dev/null
          if [ $highest_level_running -gt $level ]; then
            # something of a higher level is running: scan again for that level
            level=$highest_level_running
            status_prog >& /dev/null
          fi
          if [ -n "$missing" ]; then
            let level=0-$level
          fi

          case "$1" in
            S)
              echo $level
              exit 0
              ;;
            s)
              echo -n "Currently set level is "
              echo $level
              exit 0
              ;;
            *)
              echo -n "Currently set level is "
              echo $level
              ;;
          esac
        fi

        # argument -s/-S only returns level, no list
        if [ -z "$1" ]; then
            status_prog
        fi
        exit $level
}

# print_level [p|P]
#
# Print the level that was last stored in /tmp/level.admin and end the
# script.
#   p: print "Last set level is <level>"
#   P: print the number only
# Exit status is 0 after printing for p or P. Without /tmp/level.admin the
# level is -1, which is also used as exit status; only p (not P) prints a
# message in that case.
print_level()
{
        if [ -e /tmp/level.admin ]; then
          level=`cat /tmp/level.admin`
	  if [ "$1" != "P" ]; then
	     echo -n "Last set level is "
	  fi
	  echo $level
          if [ "$1" = "P" -o "$1" = "p" ]; then
            exit
          fi
        else
          level=-1
          # -P promises a number only, so it must stay silent here
          if [ "$1" != "P" ]; then
             echo "Last set level unknown"
          fi
        fi
        exit $level
}

# show_lofar_version [v|V]
#
# Print the contents of $LOFARROOT/Version.txt and end the script.
#   v: print it behind "Current LOFAR version is "
#   V: print the version only
# Without Version.txt the version is "-1", which is then used as exit
# status; only v prints a message in that case.
show_lofar_version()
{
        if [ ! -e $LOFARROOT/Version.txt ]; then
          version="-1"
          if [ "$1" = "v" ]; then
            echo "Current LOFAR version unknown"
          fi
          exit $version
        fi

        version=$(cat $LOFARROOT/Version.txt)
        if [ "$1" = "v" ]; then
          echo -n "Current LOFAR version is "
        fi
        echo $version

        # a file that literally holds -1 counts as unknown for the status
        if [ "$version" = "-1" ]; then
          exit $version
        fi
        exit 0
}

# handle_args args...
#
# Parse the command line of swlevel. Options that only report something
# (-u/-U, -s/-S, -p/-P, -v/-V, -h) and the options -q/-r end the script from
# within this function (the functions called for them end in exit).
# For a normal run it leaves the requested level in $level: the argument of
# -l, else the first remaining argument, else -1. Users other than lofarsys
# are limited to level 3.
# Also sets $image and $imageForced for -i.
handle_args()
{

  # Handle arguments
  # More than one argument while the first is not an option: the first is
  # taken as the level further down, so warn that the rest has no effect.
  if [ ${#} -gt 1 ]; then
    if [[ $1 != \-* ]]; then
      echo "Warning: all arguments except level $1 will be ignored"
    fi
  fi
  while getopts  "hUuSsPpVvi:l:q:r:" flag
  do
    case "$flag" in
    [uU]) 	 
      # list the owner of every process in the status list
      show_users=1 	 
      show_level 	 
      ;;
    [sS])
      show_level $flag
      ;;
    [pP])
      print_level $flag
      ;;
    [vV])
      show_lofar_version $flag
      ;;
    i)
      imageForced=1
      image=$OPTARG
      # This is needed to be able to retrieve the requested swlevel
      # when it is not provided with option -l
      # (drop what getopts has consumed so far and restart its counter, so
      # that $1 is the first argument behind "-i x")
      shift $((OPTIND-1)); OPTIND=1
      ;;
    q)
      # stop a single program; the flag arguments of stop_prog stay empty
      procesname=$OPTARG
      stop_prog $procesname
      exit
      ;;
    r)
      # start a single program; the flag arguments of start_prog stay empty
      procesname=$OPTARG
      start_prog $procesname
      exit
      ;;
    l)
      level=$OPTARG
      ;;
    h)
      SyntaxError
      ;;
    *) 
      # getopts has already reported the invalid option
      exit 
      ;;
    esac
  done
  # No level from -l: take the first argument, or -1 when there is none
  if [ -z $level ]; then
    if [ "$1" != "" ]; then
      level=$1
    else
      level=-1
    fi
  fi

  if [ "$user" != "lofarsys" -a $level -gt 3 ]; then
      echo "Will only start up to level 3 as this appears to be local use"
      level=3
  fi

  return
}


#
# MAIN
#
# Without arguments only the current level and status are shown. Otherwise
# the arguments are parsed, the caller is checked, the software is brought
# to the requested level, the resulting status is shown and the level is
# stored in /tmp/level.admin. The exit status of a level change is the
# level.
#


# Find out if we are running on a PVSS system
# Note: on PVSS systems LoggingClient must be ignored, 
# On non-PVSS system LoggingProcessor.

logProgToSkip=LoggingProcessor
if [ -f ${BINDIR}/PVSS00pmon ]; then
	logProgToSkip=LoggingClient
fi


# All users can ask for current level
# (show_level ends the script)
show_users=0
if [ -z $1 ]; then 
  show_level 
fi

# name of the calling user (text between the first parentheses of the id
# output) and the first group listed by groups
user=`id | cut -d'(' -f2 | cut -d')' -f1`
group=`groups | awk '{print $1}'`
imageForced=0

# sets $level; reporting options and -q/-r end the script in here
handle_args $*

# All other options that act on the station status are for lofarsys only
# Don't allow root to run swlevel because all logfile get root access.
if [ "$LOFARROOT" == "/opt/lofar" -a "$user" != "lofarsys" -a "$group" != "local" ]; then 
  echo "swlevel must be run by user lofarsys or group local members!"
  exit
fi

# first power down to this level
# anything but 0..6 shows the usage text, which ends the script
case $level in
	0|1|2|3|4|5|6)  
			;;
	*) SyntaxError
esac
echo Going to level $level
# goto_level runs with $BINDIR as working directory
cwd=`pwd`
cd ${BINDIR}
goto_level $level
cd ${cwd}
status_prog
if [ $highest_level_running -gt $level ]; then 
  echo "Could not go to level $level. Level is $highest_level_running"
fi
# save for later
echo $level > /tmp/level.admin
# keep a record of this call in the swlevel log
date=`date +%Y-%m-%d\ %H:%M:%S`
echo [${date}]:$0 $* >> ${LOGDIR}/swlevel.log
exit $level