diff --git a/CDB/LOFAR_ConfigDb.json b/CDB/LOFAR_ConfigDb.json index c7fc852c35cb9fcff76b24ab0e10664e17489156..b28c57984889d07dd0544702d6712861bdb72903 100644 --- a/CDB/LOFAR_ConfigDb.json +++ b/CDB/LOFAR_ConfigDb.json @@ -21,6 +21,13 @@ } } }, + "boot": { + "LTS": { + "Boot": { + "LTS/Boot/1": {} + } + } + }, "RECV": { "LTS": { "RECV": { diff --git a/README.md b/README.md index 192b3edb7713088120b672065296575c255adfa6..3f9bb0b2ba94b26e76a850d713a2dd048e218b9e 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,49 @@ # Tango Station Control Station Control software related to Tango devices. + +# Installation + +## Prerequisites + +After checking out this repo, be sure to also check out the submodules: + +``` +git submodule init +git submodule update +``` + +You will also need: + +* docker +* docker-compose +* make +* bash + +## Bootstrap + +The bootstrap procedure is needed only once. First we build all docker containers, and load the initial configuration. This may take a while: + +``` +cd docker-compose +make bootstrap +``` + +If you lack access to LOFAR station hardware, configure the devices to use their simulators instead: + +``` +for sim in ../CDB/*-sim-config.json; do + ../sbin/update_ConfigDb.sh "${sim}" +done +``` + +Now we can start all containers, and make sure everything is up: + +``` +make start +make status +``` + +If not, you can inspect why with `docker logs <container>`. The containers will automatically be restarted on reboot or failure. Stop them explicitly to bring them down (`make stop <container>`). + +Most notably, you will have web interfaces available at http://localhost:8888 (Jupyter Notebook), and http://localhost:3000 (Grafana). 
diff --git a/devices/clients/opcua_client.py b/devices/clients/opcua_client.py index 7d915cbd00aee72da2a13b7bbb7365306457cf4d..11e4cac79dca4faf7acc04c97f4b4790490b4a3f 100644 --- a/devices/clients/opcua_client.py +++ b/devices/clients/opcua_client.py @@ -179,6 +179,10 @@ class OPCUAConnection(CommClient): return prot_attr.read_function, prot_attr.write_function + def call_method(self, method_path, *args): + raise NotImplementedError + + class ProtocolAttribute: """ This class provides a small wrapper for the OPC ua read/write functions in order to better organise the code diff --git a/devices/devices/boot.py b/devices/devices/boot.py new file mode 100644 index 0000000000000000000000000000000000000000..fb8a6947c6eb5775f96cb3660c5a123bba370ca9 --- /dev/null +++ b/devices/devices/boot.py @@ -0,0 +1,297 @@ +# -*- coding: utf-8 -*- +# +# This file is part of the RECV project +# +# +# +# Distributed under the terms of the APACHE license. +# See LICENSE.txt for more info. + +""" Boot Device Server for LOFAR2.0 + +Boots the rest of the station software. + +""" + +# TODO(Corne): Remove sys.path.append hack once packaging is in place! 
+import os, sys +currentdir = os.path.dirname(os.path.realpath(__file__)) +parentdir = os.path.dirname(currentdir) +sys.path.append(parentdir) + +# PyTango imports +from tango import DebugIt +from tango.server import run, command +from tango.server import device_property, attribute +from tango import AttrWriteType, DeviceProxy, DevState +# Additional import +import numpy + +from device_decorators import * + +from clients.attribute_wrapper import attribute_wrapper +from devices.hardware_device import hardware_device +from common.lofar_logging import device_logging_to_python, log_exceptions +from common.lofar_git import get_version + +import logging +logger = logging.getLogger() + +from threading import Thread + +__all__ = ["Boot", "main"] + + +class InitialisationException(Exception): + pass + +class DevicesInitialiser(Thread): + """ + Initialise devices on this station. + + Devices which are unreachable are assumed to be brought down explicitly, + and are ignored (unless ignore_unavailable_devices == False). + + Initialisation happens in a separate thread. It is started by calling + the start() method, and progress can be followed by inspecting the + members progress (0-100), status (string), and is_running() (bool). + """ + def __init__(self, device_names, ignore_unavailable_devices=True, proxy_timeout=10.0): + self.ignore_unavailable_devices = ignore_unavailable_devices + + # Since Python3.7+, the insertion order equals the iteration order, which is what we depend on + # to process the devices in the same order as in device_names. 
+ self.devices = {name: DeviceProxy(name) for name in device_names} + + # set the timeout for all proxies + for device in self.devices.values(): + device.set_timeout_millis(int(proxy_timeout * 1000)) + + # setup initial state + self.progress = 0 + self.set_status("Initialisation not started yet") + + super().__init__() + + def run(self): + self.set_status("Starting initialisation") + + try: + self.initialise_devices() + + self.set_status("Initialisation completed") + except Exception as e: + logger.exception("Failed to initialise station") + + # add the exception to the status + self.set_status(f"{self.status} [{e.__class__.__name__}: {str(e)}]") + + # we keep the status stuck at the last thing it tried + + def is_running(self): + return self.is_alive() + + + def stop(self): + if not self.is_alive(): + return + + # Just wait for the current initialisation to finish. It's a finite process. + self.join() + + def set_status(self, status): + self.status = status + + logger.info(status) + + def initialise_devices(self): + """ + Initialise or re-initialise all devices on the station. + + :return:None + """ + + # reset initialisation parameters + self.progress = 0 + + # restart devices in order + for num_restarted_devices, device in enumerate(self.devices.keys(), 1): + if self.is_available(device) or not self.ignore_unavailable_devices: + self.start_device(device) + + self.progress = 100.0 * num_restarted_devices / len(self.devices) + + # make sure we always finish at 100% in case of success + self.progress = 100 + + def is_available(self, device_name: str): + """ Return whether the device 'device_name' is actually available on this server. """ + + proxy = self.devices[device_name] + try: + proxy.state() + except Exception as e: + return False + + return True + + def stop_device(self, device_name: str): + """ Stop device 'device_name'. 
""" + + proxy = self.devices[device_name] + + if proxy.state() == DevState.OFF: + # already off + return + + self.set_status(f"[stopping {device_name}] Stopping device.") + + proxy.Off() + if proxy.state() != DevState.OFF: + raise InitialisationException(f"Could not turn off device {device_name}. Please look at its logs.") + + self.set_status(f"[stopping {device_name}] Stopped device.") + + def start_device(self, device_name: str): + """ Run the startup sequence for device 'device_name'. """ + + proxy = self.devices[device_name] + + # go to a well-defined state, which may be needed if the user calls + # this function explicitly. + self.stop_device(device_name) + + # setup connections to hardware + self.set_status(f"[restarting {device_name}] Initialising device.") + proxy.Initialise() + if proxy.state() != DevState.STANDBY: + raise InitialisationException(f"Could not initialise device {device_name}. Please look at its logs.") + + # configure the device + self.set_status(f"[restarting {device_name}] Setting defaults.") + proxy.set_defaults() + + self.set_status(f"[restarting {device_name}] Initialising hardware.") + proxy.initialise_hardware() + + # mark as ready for service + self.set_status(f"[restarting {device_name}] Turning on device.") + proxy.On() + if proxy.state() != DevState.ON: + raise InitialisationException(f"Could not turn on device {device_name}. 
Please look at its logs.") + + self.set_status(f"[restarting {device_name}] Succesfully started.") + +@device_logging_to_python() +class Boot(hardware_device): + # ----------------- + # Device Properties + # ----------------- + + DeviceProxy_Time_Out = device_property( + dtype='DevDouble', + mandatory=False, + default_value=10.0, + ) + + # Which devices to initialise, and in which order + Device_Names = device_property( + dtype='DevVarStringArray', + mandatory=False, + default_value=["LTS/Docker/1", # Docker controls the device containers, so it goes before anything else + "LTS/RECV/1", # RCUs are input for SDP, so initialise them first + "LTS/UNB2/1", # Uniboards host SDP, so initialise them first + "LTS/SDP/1", # SDP controls the mask for SST/XST/BST, so initialise it first + "LTS/SST/1", + "LTS/XST/1", + ], + ) + + # By default, we assume any device is not available + # because its docker container was not started, which + # is an explicit and thus intentional action. + # We ignore such devices when initialising the station. + Ignore_Unavailable_Devices = device_property( + dtype='DevBoolean', + mandatory=False, + default_value=True, + ) + + # ---------- + # Attributes + # ---------- + initialising_station_R = attribute(dtype=numpy.bool_, access=AttrWriteType.READ, fget=lambda self: self.initialiser.is_running()) + initialisation_progress_R = attribute(dtype=int, access=AttrWriteType.READ, fget=lambda self: int(self.initialiser.progress)) + initialisation_status_R = attribute(dtype=str, access=AttrWriteType.READ, fget=lambda self: self.initialiser.status) + + @log_exceptions() + def delete_device(self): + """Hook to delete resources allocated in init_device. + + This method allows for any memory or other resources allocated in the + init_device method to be released. This method is called by the device + destructor and by the device Init command (a Tango built-in). 
+ """ + self.debug_stream("Shutting down...") + + self.Off() + self.debug_stream("Shut down. Good bye.") + + # -------- + # overloaded functions + # -------- + @log_exceptions() + def configure_for_off(self): + """ user code here. is called when the state is set to OFF """ + try: + self.initialiser.stop() + except Exception as e: + self.warn_stream("Exception while stopping the device initialiser in configure_for_off function: {}. Exception ignored".format(e)) + + @log_exceptions() + def configure_for_initialise(self): + # create an initialiser object so we can query it even before starting the (first) initialisation + self.initialiser = DevicesInitialiser(self.Device_Names, self.Ignore_Unavailable_Devices, self.DeviceProxy_Time_Out) + + @command() + @DebugIt() + @only_in_states([DevState.ON]) + @fault_on_error() + @log_exceptions() + def initialise_station(self): + """ + Initialise or re-initialise all devices on the station. + + This command will take a while to execute, so should be called asynchronously. 
+ + :return:None + """ + + if self.initialiser.is_running(): + # already initialising + return + + # join any previous attempt, if any + try: + self.initialiser.join() + except RuntimeError: + pass + + # start new initialisation attempt + self.initialiser = DevicesInitialiser(self.Device_Names, self.Ignore_Unavailable_Devices, self.DeviceProxy_Time_Out) + self.initialiser.start() + +# ---------- +# Run server +# ---------- +def main(args=None, **kwargs): + """Main function of the RECV module.""" + + from common.lofar_logging import configure_logger + configure_logger() + + return run((Boot,), args=args, **kwargs) + + +if __name__ == '__main__': + main() diff --git a/devices/devices/hardware_device.py b/devices/devices/hardware_device.py index a25b863ebc8255fa05c02a5f420f23f309ebf0fb..84d8e4c2b9c6156c994715416bebf38f979903b6 100644 --- a/devices/devices/hardware_device.py +++ b/devices/devices/hardware_device.py @@ -11,7 +11,7 @@ """ -from abc import ABCMeta, abstractmethod +from abc import abstractmethod # PyTango imports from tango.server import Device, command, DeviceMeta, attribute @@ -23,11 +23,14 @@ from common.lofar_logging import log_exceptions from common.lofar_git import get_version from devices.abstract_device import AbstractDeviceMetas from devices.device_decorators import only_in_states, fault_on_error +import time +import math import logging __all__ = ["hardware_device"] +import logging logger = logging.getLogger() @@ -244,3 +247,36 @@ class hardware_device(Device, metaclass=AbstractDeviceMetas): # log which attribute we're addressing raise Exception(f"Cannot assign default to attribute {name}") from e + @only_in_states([DevState.STANDBY, DevState.INIT, DevState.ON]) + @fault_on_error() + @command() + def initialise_hardware(self): + """ Initialise the hardware after configuring it. """ + + # This is just the command version of _initialise_hardware(). 
+ self._initialise_hardware() + + def _initialise_hardware(self): + """ Override this method to initialise any hardware after configuring it. """ + pass + + def wait_attribute(self, attr_name, value, timeout=10, pollperiod=0.2): + """ Wait until the given attribute obtains the given value. + + Raises an Exception if it has not after the timeout. + + timeout: time until an Exception is raised, in seconds. + pollperiod: how often to check the attribute, in seconds. + """ + + nr_polls = math.ceil(timeout / pollperiod) + + # Re-read the attribute on every poll, otherwise a change is never observed + for _ in range(nr_polls): + if getattr(self, attr_name) == value: + return + + time.sleep(pollperiod) + + raise Exception(f"{attr_name} != {value} after {timeout} seconds still.") + diff --git a/devices/devices/recv.py b/devices/devices/recv.py index a078f601c7d1962f4a11367e7ca9745ec590d5f2..40efb14354471021e0a3d03feab865141422bbc7 100644 --- a/devices/devices/recv.py +++ b/devices/devices/recv.py @@ -61,6 +61,7 @@ class RECV(opcua_device): # Attributes # ---------- Ant_mask_RW = attribute_wrapper(comms_annotation=["2:PCC", "2:Ant_mask_RW"], datatype=numpy.bool_, dims=(3, 32), access=AttrWriteType.READ_WRITE) + Ant_status_R = attribute(dtype=str, max_dim_x=3, max_dim_y=32) CLK_Enable_PWR_R = attribute_wrapper(comms_annotation=["2:PCC", "2:CLK_Enable_PWR_R"], datatype=numpy.bool_) CLK_I2C_STATUS_R = attribute_wrapper(comms_annotation=["2:PCC", "2:CLK_I2C_STATUS_R"], datatype=numpy.int64) CLK_PLL_error_R = attribute_wrapper(comms_annotation=["2:PCC", "2:CLK_PLL_error_R"], datatype=numpy.bool_) @@ -89,6 +90,7 @@ class RECV(opcua_device): RCU_mask_RW = attribute_wrapper(comms_annotation=["2:PCC", "2:RCU_mask_RW"], datatype=numpy.bool_, dims=(32,), access=AttrWriteType.READ_WRITE) RCU_monitor_rate_RW = attribute_wrapper(comms_annotation=["2:PCC", "2:RCU_monitor_rate_RW"], datatype=numpy.int64, access=AttrWriteType.READ_WRITE) RCU_Pwr_dig_R = 
attribute_wrapper(comms_annotation=["2:PCC", "2:RCU_Pwr_dig_R"], datatype=numpy.bool_, dims=(32,)) + 
RCU_status_R = attribute(dtype=str, max_dim_x=32) RCU_temperature_R = attribute_wrapper(comms_annotation=["2:PCC", "2:RCU_temperature_R"], datatype=numpy.float64, dims=(32,)) RCU_translator_busy_R = attribute_wrapper(comms_annotation=["2:PCC", "2:RCU_translator_busy_R"], datatype=numpy.bool_) RCU_version_R = attribute_wrapper(comms_annotation=["2:PCC", "2:RCU_version_R"], datatype=numpy.str, dims=(32,)) @@ -96,19 +98,6 @@ class RECV(opcua_device): # -------- # overloaded functions # -------- - @log_exceptions() - def configure_for_initialise(self): - """ user code here. is called when the state is set to INIT """ - - # Init the dict that contains function to OPC-UA function mappings. - self.function_mapping = {} - self.function_mapping["RCU_on"] = {} - self.function_mapping["RCU_off"] = {} - self.function_mapping["CLK_on"] = {} - self.function_mapping["CLK_off"] = {} - - super().configure_for_initialise() - # -------- # Commands @@ -116,80 +105,161 @@ class RECV(opcua_device): @command() @DebugIt() @only_when_on() - @fault_on_error() def RCU_off(self): """ :return:None """ - self.function_mapping["RCU_off"]() + self.opcua_connection.call_method(["2:PCC","2:RCU_off"]) @command() @DebugIt() @only_when_on() - @fault_on_error() def RCU_on(self): """ :return:None """ - self.function_mapping["RCU_on"]() + self.opcua_connection.call_method(["2:PCC","2:RCU_on"]) @command() @DebugIt() @only_when_on() - @fault_on_error() def ADC_on(self): """ :return:None """ - self.function_mapping["ADC_on"]() + self.opcua_connection.call_method(["2:PCC","2:ADC_on"]) @command() @DebugIt() @only_when_on() - @fault_on_error() def RCU_update(self): """ :return:None """ - self.function_mapping["RCU_update"]() + self.opcua_connection.call_method(["2:PCC","2:RCU_update"]) @command() @DebugIt() @only_when_on() - @fault_on_error() def CLK_off(self): """ :return:None """ - self.function_mapping["CLK_off"]() + self.opcua_connection.call_method(["2:PCC","2:CLK_off"]) @command() @DebugIt() 
@only_when_on() - @fault_on_error() def CLK_on(self): """ :return:None """ - self.function_mapping["CLK_on"]() + self.opcua_connection.call_method(["2:PCC","2:CLK_on"]) @command() @DebugIt() @only_when_on() - @fault_on_error() def CLK_PLL_setup(self): """ :return:None """ - self.function_mapping["CLK_PLL_setup"]() + self.opcua_connection.call_method(["2:PCC","2:CLK_PLL_setup"]) + + def _initialise_hardware(self): + """ Initialise the RCU hardware. """ + + # method calls don't work yet, so don't use them to allow the boot + # device to initialise us without errors + logger.error("OPC-UA methods not supported yet, not initialising RCU hardware!") + return + + # Cycle clock + self.CLK_off() + self.wait_attribute("CLK_translator_busy_R", False, 10) + self.CLK_on() + self.wait_attribute("CLK_translator_busy_R", False, 10) + + if not self.CLK_PLL_locked_R: + if self.CLK_I2C_STATUS_R > 0: + raise Exception("CLK I2C is not working. Maybe power cycle subrack to restart CLK board and translator?") + else: + raise Exception("CLK signal is not locked. The subrack probably do not receive clock input or the CLK PCB is broken?") + + # Cycle RCUs + self.RCU_off() + self.wait_attribute("RCU_translator_busy_R", False, 5) + self.RCU_on() + self.wait_attribute("RCU_translator_busy_R", False, 5) + + def read_RCU_status_R(self): + """ Returns a set of strings denoting the status of each RCU. + + An empty string means no problems were detected. A non-empty + string means the RCU seems unusable. + + This function can be used as input to modify the RCU_mask_RW. 
""" + + rcu_mask = self.RCU_mask_RW + i2c_errors = self.RCU_I2C_STATUS_R + + nr_rcus = len(rcu_mask) + rcu_status = [""] * nr_rcus + + # construct status of each RCU + for rcu in range(nr_rcus): + status = [] + + if i2c_errors[rcu] != 0: + status.append("[I2C error]") + + rcu_status[rcu] = " ".join(status) + + return rcu_status + + def read_Ant_status_R(self): + """ Returns a set of strings denoting the status of each antenna. 
+ + An empty string means no problems were detected. A non-empty + string means the antenna seems unusable. + + This function can be used as input to modify the Ant_mask_RW. """ + + ant_mask = self.Ant_mask_RW + rcu_mask = self.RCU_mask_RW + adc_lock = self.RCU_ADC_lock_R + i2c_errors = self.RCU_I2C_STATUS_R + + nr_rcus = len(ant_mask) + nr_ants_per_rcu = len(ant_mask[0]) + + # Collect status, join them into a single string per antenna later + ant_status = [[""] * nr_ants_per_rcu for _ in range(nr_rcus)] + + + for rcu in range(nr_rcus): + for ant in range(nr_ants_per_rcu): + status = [] + + if i2c_errors[rcu] != 0: + status.append("[I2C error]") + + if not rcu_mask[rcu]: + status.append("[RCU masked out]") + + if not adc_lock[rcu][ant]: + status.append("[ADC lock error]") + + ant_status[rcu][ant] = " ".join(status) + return ant_status + # ---------- # Run server diff --git a/devices/toolkit/lts_cold_start.py b/devices/toolkit/lts_cold_start.py deleted file mode 100644 index 47d3243e2064dc39fba8127e33da842acba19416..0000000000000000000000000000000000000000 --- a/devices/toolkit/lts_cold_start.py +++ /dev/null @@ -1,228 +0,0 @@ -#! /usr/bin/env python3 -import logging -from time import sleep - -# TODO(Corne): Remove sys.path.append hack once packaging is in place! -import os, sys -currentdir = os.path.dirname(os.path.realpath(__file__)) -parentdir = os.path.dirname(currentdir) -sys.path.append(parentdir) - -from toolkit.startup import startup -from toolkit.lofar2_config import configure_logging - - -def start_device(device: str): - ''' - Start a Tango device with the help of the startup function. - The device will not be forced to got through - OFF/INIT/STANDBY/ON but it is assumed that the device is in OFF - state. If the device is not in OFF state, then an exception - will be raised. 
- ''' - dev = startup(device = device, force_restart = False) - state = device.state() - if state is not tango._tango.DevState.ON: - raise Exception("Device \"{}\" is unexpectedly in \"{}\" state but it is expected to be in \"{}\" state. Please check the reason for the unexpected device state. Aborting the start-up procedure.".format(device, state, tango._tango.DevState.ON)) - return device - - -def lts_cold_start(): - ''' - What is this? - This is the LTS (LOFAR Test - and I forgot what S stands for) cold start - procedure cast into source code. The procedure can be found there: - https://support.astron.nl/confluence/display/L2M/LTS+startup+procedure - - Paulus wrote already a script that - illegally ;) - makes direct use of the - OPC-UA servers to accomplish the same thing that we are doing here. - Paulus' script can be found there: - https://git.astron.nl/lofar2.0/pypcc/-/blob/master/scripts/Startup.py - Thanks, Paulus! You made it very easy for me to cobble together this - script. - - For obvious reasons is our script much better though. :) - First, it is bigger. And bigger is always better. - Then it is better documented but that does not count in the HW world. - But it also raises exceptions with error messages that make an attempt to - help the user reading them and shuts down the respective Tango device(s) if - something goes south. - And that is where we try to do it really right: there is no reason to be - excessively verbatim when things work like they are expected to work. But - tell the user when something goes wrong, give an indication of what could - have gone wrong and where to look for the problem. - - Again, Paulus' script contains already very good indications where problems - might lie and made my job very easy. - - No parameters, parameters are for wimps. :) - ''' - # Define the LOFAR2.0 specific log format - configure_logging() - - # Get a reference to the RECV device, do not - # force a restart of the already running Tango - # device. 
- recv = startup("LTS/RECV/1") - - # Getting CLK, RCU & RCU ADCs into proper shape for use by real people. - # - # The start-up needs to happen in this sequence due to HW dependencies - # that can introduce issues which are then becoming very complicated to - # handle in SW. Therefore to keep it as simple as possible, let's stick - # to the rule recommended by Paulus: - # 1 CLK - # 2 RCU - # 3 RCU ADCs - # - # - # First take the CLK board through the motions. - # 1.1 Switch off CLK - # 1.2 Wait for CLK_translator_busy_R == True, throw an exception in timeout - # 1.3 Switch on CLK - # 1.4 Wait for CLK_translator_busy_R == True, throw an exception in timeout - # 1.5 Check if CLK_PLL_locked_R == True - # 1.6 Done - # - # - # Steps 1.1 & 1.2 - recv.CLK_off() - # 2021-04-30, Thomas - # This should be refactored into a function. - timeout = 10.0 - while recv.CLK_translator_busy_R is True: - logging.debug("Waiting on \"CLK_translator_busy_R\" to become \"True\"...") - timeout = timeout - 1.0 - if timeout < 1.0: - # Switching the RECV clock off should never take longer than - # 10 seconds. Here we ran into a timeout. - # Clean up and raise an exception. - recv.off() - raise Exception("After calling \"CLK_off\" a timeout occured while waiting for \"CLK_translator_busy_R\" to become \"True\". Please investigate the reason why the RECV translator never set \"CLK_translator_busy_R\" to \"True\". Aborting start-up procedure.") - sleep(1.0) - - # Steps 1.3 & 1.4 - recv.CLK_on() - # Per Paulus this should never take longer than 2 seconds. - # 2021-04-30, Thomas - # This should be refactored into a function. - timeout = 2.0 - while recv.CLK_translator_busy_R is True: - logging.debug("After calling \"CLK_on()\" Waiting on \"CLK_translator_busy_R\" to become \"True\"...") - timeout = timeout - 1.0 - if timeout < 1.0: - # Switching theRECV clock on should never take longer than - # a couple of seconds. Here we ran into a timeout. - # Clean up and raise an exception. 
- recv.off() - raise Exception("After calling \"CLK_on\" a timeout occured while waiting for \"CLK_translator_busy_R\" to become \"True\". Please investigate the reason why the RECV translator never set \"CLK_translator_busy_R\" to \"True\". Aborting start-up procedure.") - sleep(1.0) - - # 1.5 Check if CLK_PLL_locked_R == True - # 2021-04-30, Thomas - # This should be refactored into a function. - clk_locked = recv.CLK_PLL_locked_R - if clk_locked is True: - logging.info("CLK signal is locked.") - else: - # CLK signal is not locked - clk_i2c_status = recv.CLK_I2C_STATUS_R - exception_text = "CLK I2C is not working. Please investigate! Maybe power cycle subrack to restart CLK board and translator. Aborting start-up procedure." - if i2c_status <= 0: - exception_text = "CLK signal is not locked. Please investigate! The subrack probably do not receive clock input or the CLK PCB is broken. Aborting start-up procedure." - recv.off() - raise Exception(exception_text) - # Step 1.6 - # Done. - - # 2 RCUs - # If we reach this point in the start-up procedure, then the CLK board setup - # is done. We can proceed with the RCUs. - # - # Now take the RCUs through the motions. - # 2.1 Set RCU mask to all available RCUs - # 2.2 Switch off all RCUs - # 2.3 Wait for RCU_translator_busy_R = True, throw an exception in timeout - # 2.4 Switch on RCUs - # 2.5 Wait for RCU_translator_busy_R = True, throw an exception in timeout - # 2.6 Done - # - # - # Step 2.1 - # We have only 8 RCUs in LTS. - recv.RCU_mask_RW = [True, ] * 8 - # Steps 2.2 & 2.3 - recv.RCU_off() - # 2021-04-30, Thomas - # This should be refactored into a function. - timeout = 10.0 - while recv.RCU_translator_busy_R is True: - logging.debug("Waiting on \"RCU_translator_busy_R\" to become \"True\"...") - timeout = timeout - 1.0 - if timeout < 1.0: - # Switching the RCUs off should never take longer than - # 10 seconds. Here we ran into a timeout. - # Clean up and raise an exception. 
- recv.off() - raise Exception("After calling \"RCU_off\" a timeout occured while waiting for \"RCU_translator_busy_R\" to become \"True\". Please investigate the reason why the RECV translator never set \"RCU_translator_busy_R\" to \"True\". Aborting start-up procedure.") - sleep(1.0) - - # Steps 2.4 & 2.5 - # We leave the RCU mask as it is because it got already set for the - # RCU_off() call. - recv.RCU_on() - # Per Paulus this should never take longer than 5 seconds. - # 2021-04-30, Thomas - # This should be refactored into a function. - timeout = 5.0 - while recv.RCU_translator_busy_R is True: - logging.debug("After calling \"RCU_on()\" Waiting on \"RCU_translator_busy_R\" to become \"True\"...") - timeout = timeout - 1.0 - if timeout < 1.0: - # Switching the RCUs on should never take longer than - # a couple of seconds. Here we ran into a timeout. - # Clean up and raise an exception. - recv.off() - raise Exception("After calling \"RCU_on\" a timeout occured while waiting for \"RCU_translator_busy_R\" to become \"True\". Please investigate the reason why the RECV translator never set \"RCU_translator_busy_R\" to \"True\". Aborting start-up procedure.") - sleep(1.0) - # Step 2.6 - # Done. - - # 3 ADCs - # If we get here, we only got to check if the ADCs are locked, too. - # 3.1 Check RCUs' I2C status - # 3.2 Check RCU_ADC_lock_R == [True, ] for RCUs that have a good I2C status - # 3.3 Done - # - # - # Steps 3.1 & 3.2 - rcu_mask = recv.RCU_mask_RW - adc_locked = numpy.array(recv.RCU_ADC_lock_R) - for rcu, i2c_status in enumerate(recv.RCU_I2C_STATUS_R): - if i2c_status == 0: - rcu_mask[rcu] = True - logging.info("RCU #{} is available.".format(rcu)) - for adc, adc_is_locked in enumerate(adc_locked[rcu]): - if adc_is_locked < 1: - logging.warning("RCU#{}, ADC#{} is unlocked. Please investigate! Will continue with normal operation.".format(rcu, adc)) - else: - # The RCU's I2C bus is not working. - rcu_mask[rcu] = False - logging.error("RCU #{}'s I2C is not working. 
Please investigate! Disabling RCU #{} to avoid damage.".format(rcu, rcu)) - recv.RCU_mask_RW = rcu_mask - # Step 3.3 - # Done - - # Start-up APSCTL, i.e. Uniboard2s. - aps = startup("APSCTL/SDP/1") - logging.warning("Cannot start-up APSCTL because it requires manual actions.") - - # Start up SDP, i.e. configure the firmware in the Unibards - sdp = startup("LTS/SDP/1") - logging.warning("Cannot start-up SDP because it requires manual actions.") - - logging.info("LTS has been successfully started and configured.") - - -if __name__ == '__main__': - lts_cold_start() diff --git a/devices/toolkit/startup.py b/devices/toolkit/startup.py deleted file mode 100644 index 66a8d2c496fc7e86d0d13086336e900fc1a1bfaf..0000000000000000000000000000000000000000 --- a/devices/toolkit/startup.py +++ /dev/null @@ -1,49 +0,0 @@ -#! /usr/bin/env python3 -import tango -import logging - -logger = logging.getLogger() - -def startup(device: str, force_restart: bool) -> tango.DeviceProxy: - ''' - Start a LOFAR Tango device: - recv = startup(device = 'LTS/RECV/1', force_restart = False) - ''' - proxy = tango.DeviceProxy(device) - state = proxy.state() - - # go to OFF, but only if force_restart is True - if force_restart is True: - logger.warning(f"Forcing device {device} restart.") - proxy.off() - state = proxy.state() - if state is not tango._tango.DevState.OFF: - logger.error(f"Device {device} cannot perform off although restart has been enforced, state = {state}. Please investigate.") - return proxy - - if state is not tango._tango.DevState.OFF: - logger.error(f"Device {device} is not in OFF state, cannot start it. state = {state}") - return proxy - - # Initialise device - logger.info(f"Device {device} is in OFF, performing initialisation.") - proxy.initialise() - state = proxy.state() - if state is not tango._tango.DevState.STANDBY: - logger.error(f"Device {device} cannot perform initialise, state = {state}. 
Please investigate.") - return proxy - - # Set default values - logger.info(f"Device {device} is in STANDBY, setting default values.") - proxy.set_defaults() - - # Turn on device - logger.info(f"Device {device} is in STANDBY, performing on.") - proxy.on() - state = proxy.state() - if state is not tango._tango.DevState.ON: - logger.error(f"Device {device} cannot perform on, state = {state}. Please investigate.") - else: - logger.info(f"Device {device} has successfully reached ON state.") - - return proxy diff --git a/docker-compose/Makefile b/docker-compose/Makefile index 8e660436fb7ab61d86f704a00c5a386fcc25c401..81e5e4a85223a042b1325343fb01cc941ef3311c 100644 --- a/docker-compose/Makefile +++ b/docker-compose/Makefile @@ -154,6 +154,12 @@ ifneq ($(NETWORK_MODE),host) endif $(DOCKER_COMPOSE_ARGS) docker-compose -f tango.yml -f networks.yml up -d +bootstrap: pull build # first start, initialise from scratch + $(MAKE) start elk-configure-host # configure host kernel for elk container + $(MAKE) start dsconfig # boot up containers to load configurations + sleep 5 # wait for dsconfig container to come up + ../sbin/update_ConfigDb.sh ../CDB/LOFAR_ConfigDb.json # load default configuration + start: up ## start a service (usage: make start <servicename>) if [ $(UNAME_S) = Linux ]; then touch ~/.Xauthority; chmod a+r ~/.Xauthority; fi $(DOCKER_COMPOSE_ARGS) docker-compose $(COMPOSE_FILE_ARGS) start $(SERVICE) diff --git a/docker-compose/archiver.yml b/docker-compose/archiver.yml index 8a357d371e89377a1bfa2ce89e341ba708526fef..84dded354d22c97eeccd51ea97d8ff41b909f01e 100644 --- a/docker-compose/archiver.yml +++ b/docker-compose/archiver.yml @@ -73,5 +73,5 @@ services: - ..:/opt/lofar/tango:rw - ${HOME}:/hosthome - ../docker/tango/tango-archiver:/tango-archiver - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/device-boot.yml b/docker-compose/device-boot.yml new file mode 100644 index 
0000000000000000000000000000000000000000..58a9aa7df81eab368464f4ca69ddab54129b7ace --- /dev/null +++ b/docker-compose/device-boot.yml @@ -0,0 +1,41 @@ +# +# Docker compose file that launches a LOFAR2.0 station's +# Boot device, which boots the rest of the station +# software. +# +# Defines: +# - device-boot: LOFAR2.0 station Boot device +# +# Requires: +# - lofar-device-base.yml +# +version: '2' + +services: + device-boot: + image: device-boot + # build explicitly, as docker-compose does not understand a local image + # being shared among services. + build: + context: lofar-device-base + args: + SOURCE_IMAGE: ${DOCKER_REGISTRY_HOST}/${DOCKER_REGISTRY_USER}-tango-itango:${TANGO_ITANGO_VERSION} + container_name: ${CONTAINER_NAME_PREFIX}device-boot + networks: + - control + ports: + - "5708:5708" # unique port for this DS + volumes: + - ..:/opt/lofar/tango:rw + environment: + - TANGO_HOST=${TANGO_HOST} + entrypoint: + - /usr/local/bin/wait-for-it.sh + - ${TANGO_HOST} + - --timeout=30 + - --strict + - -- + # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA + # can't know about our Docker port forwarding + - python3 -u /opt/lofar/tango/devices/devices/boot.py LTS -v -ORBendPoint giop:tcp:0:5708 -ORBendPointPublish giop:tcp:${HOSTNAME}:5708 + restart: unless-stopped diff --git a/docker-compose/device-docker.yml b/docker-compose/device-docker.yml index 5386ead921b386741e62febeab399f3007a79281..d9e1e1e35233177ab271db395773538ed8c74ffa 100644 --- a/docker-compose/device-docker.yml +++ b/docker-compose/device-docker.yml @@ -41,4 +41,4 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - python3 -u /opt/lofar/tango/devices/devices/docker_device.py LTS -v -ORBendPoint giop:tcp:0:5705 -ORBendPointPublish giop:tcp:${HOSTNAME}:5705 - restart: 
on-failure + restart: unless-stopped diff --git a/docker-compose/device-observation_control.yml b/docker-compose/device-observation_control.yml index 011fe0a94112df557670a218518b6492520f4480..827a558a10167d29f3e0bd3402f3f84debcd3c23 100644 --- a/docker-compose/device-observation_control.yml +++ b/docker-compose/device-observation_control.yml @@ -38,4 +38,4 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - python3 -u /opt/lofar/tango/devices/devices/observation_control.py LTS -v -ORBendPoint giop:tcp:0:5703 -ORBendPointPublish giop:tcp:${HOSTNAME}:5703 - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/device-recv.yml b/docker-compose/device-recv.yml index fdf8a535b002d629b29ee48b49b9ee91d8e925d7..f3bc3eea12b51b44cacbeb790d0666ced24ae169 100644 --- a/docker-compose/device-recv.yml +++ b/docker-compose/device-recv.yml @@ -39,4 +39,4 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - python3 -u /opt/lofar/tango/devices/devices/recv.py LTS -v -ORBendPoint giop:tcp:0:5707 -ORBendPointPublish giop:tcp:${HOSTNAME}:5707 - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/device-sdp.yml b/docker-compose/device-sdp.yml index cdd8d137d6f249ef91e500dd4b9bb32734b23c90..8fefa3f355eda485ea757f0859924e317b9245ee 100644 --- a/docker-compose/device-sdp.yml +++ b/docker-compose/device-sdp.yml @@ -39,4 +39,4 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - python3 -u /opt/lofar/tango/devices/devices/sdp/sdp.py LTS -v -ORBendPoint giop:tcp:0:5701 -ORBendPointPublish giop:tcp:${HOSTNAME}:5701 - restart: on-failure + restart: unless-stopped diff --git 
a/docker-compose/device-sst.yml b/docker-compose/device-sst.yml index a9547f53830a564eeba6c9123c753c0062d1da30..7d922a61badf6575d15c6f0a0489a6fac3683367 100644 --- a/docker-compose/device-sst.yml +++ b/docker-compose/device-sst.yml @@ -42,4 +42,4 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - python3 -u /opt/lofar/tango/devices/devices/sdp/sst.py LTS -v -ORBendPoint giop:tcp:0:5702 -ORBendPointPublish giop:tcp:${HOSTNAME}:5702 - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/device-unb2.yml b/docker-compose/device-unb2.yml index 67c443121cf02bc9c1652978b1dd67a5ebf3a80b..b1d7b945c6c82c3eb6a48632a107051f9ad1abe8 100644 --- a/docker-compose/device-unb2.yml +++ b/docker-compose/device-unb2.yml @@ -39,4 +39,4 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - python3 -u /opt/lofar/tango/devices/devices/unb2.py LTS -v -ORBendPoint giop:tcp:0:5704 -ORBendPointPublish giop:tcp:${HOSTNAME}:5704 - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/device-xst.yml b/docker-compose/device-xst.yml index 1f75009dc6042b83aff706e34a811c1023f532b0..c634e5d83fc7b28f2b8438ae59dffb7157a03f54 100644 --- a/docker-compose/device-xst.yml +++ b/docker-compose/device-xst.yml @@ -42,4 +42,4 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - python3 -u /opt/lofar/tango/devices/devices/sdp/xst.py LTS -v -ORBendPoint giop:tcp:0:5706 -ORBendPointPublish giop:tcp:${HOSTNAME}:5706 - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/jupyter/ipython-profiles/stationcontrol-jupyter/startup/01-devices.py 
b/docker-compose/jupyter/ipython-profiles/stationcontrol-jupyter/startup/01-devices.py index df75d5962a1327041995aa04c41d6d1e1c2ae914..f6174de984585a0d6a8ebd64be104a9f31e1f7cd 100644 --- a/docker-compose/jupyter/ipython-profiles/stationcontrol-jupyter/startup/01-devices.py +++ b/docker-compose/jupyter/ipython-profiles/stationcontrol-jupyter/startup/01-devices.py @@ -4,6 +4,8 @@ sdp = DeviceProxy("LTS/SDP/1") sst = DeviceProxy("LTS/SST/1") xst = DeviceProxy("LTS/XST/1") unb2 = DeviceProxy("LTS/UNB2/1") +boot = DeviceProxy("LTS/Boot/1") +docker = DeviceProxy("LTS/Docker/1") # Put them in a list in case one wants to iterate -devices = [recv, sdp, sst, xst, unb2] +devices = [recv, sdp, sst, xst, unb2, boot, docker] diff --git a/docker-compose/recv-sim.yml b/docker-compose/recv-sim.yml index 7b1f704fa8854f12d411c7088b7caf0a74f328f0..edd4bc7e4efb589b55e9f7306da431ba91660b56 100644 --- a/docker-compose/recv-sim.yml +++ b/docker-compose/recv-sim.yml @@ -17,4 +17,4 @@ services: - ${HOME}:/hosthome ports: - "4843:4843" - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/sdptr-sim.yml b/docker-compose/sdptr-sim.yml index 70c1edf63acdf84df8a7d294aa17ea9489c9b9a7..c61cf8cfa84a6c65647c7a8bae4d0569aa60dad6 100644 --- a/docker-compose/sdptr-sim.yml +++ b/docker-compose/sdptr-sim.yml @@ -15,4 +15,4 @@ services: - control ports: - "4840:4840" - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/tangotest.yml b/docker-compose/tangotest.yml index 3a44fc61b73b18a78d9e5bbd6a6fef6ac2d648fd..357c91df487b51379db221f7cb984bc05018f5e3 100644 --- a/docker-compose/tangotest.yml +++ b/docker-compose/tangotest.yml @@ -25,5 +25,5 @@ services: - -- - /usr/local/bin/TangoTest - test - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/unb2-sim.yml b/docker-compose/unb2-sim.yml index e031e20f54ad6addec1fdbabf972661d6f4c8f9a..95468917e21eb5a2d5c5789a01dbd049de6eb091 100644 --- a/docker-compose/unb2-sim.yml +++ 
b/docker-compose/unb2-sim.yml @@ -17,4 +17,4 @@ services: - ${HOME}:/hosthome ports: - "4844:4844" - restart: on-failure + restart: unless-stopped diff --git a/jupyter-notebooks/Home.ipynb b/jupyter-notebooks/Home.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..1b6001f9e5a87a0d626e310cc8038ede2d5f589f --- /dev/null +++ b/jupyter-notebooks/Home.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e051e48d", + "metadata": {}, + "source": [ + "# Welcome to your LOFAR2.0 station!\n", + "\n", + "The following interfaces are available to you, on the same host as this notebook, but on different ports:\n", + "\n", + "|Interface |Subsystem |Port|Credentials |\n", + "|----------|----------|----|--------------|\n", + "|Scripting |Jupyter |8888| |\n", + "|Monitoring|Grafana |3000|admin/admin |\n", + "|Logs |Kibana |5601| |\n", + "|ReST |tango-rest|8080|tango-cs/tango|\n", + "\n", + "Below is code to manage the station at a high level. For more detailed status information, look in Grafana." 
+ ] + }, + { + "cell_type": "markdown", + "id": "32ae8bcf", + "metadata": {}, + "source": [ + "## (Re)boot station\n", + "The code below is used to:\n", + "* Reboot all station software\n", + "* Reset the hardware configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38037a71", + "metadata": {}, + "outputs": [], + "source": [ + "# Restart boot device itself\n", + "boot.off()\n", + "assert boot.state() == DevState.OFF, boot.state()\n", + "\n", + "boot.initialise()\n", + "assert boot.state() == DevState.STANDBY, boot.state()\n", + "\n", + "boot.on()\n", + "assert boot.state() == DevState.ON, boot.state()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21aba361", + "metadata": {}, + "outputs": [], + "source": [ + "# Request to reinitialise the station.\n", + "#\n", + "# WARNING: This will reset settings across the station!\n", + "boot.initialise_station()\n", + "assert boot.state() != DevState.FAULT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c00b465a", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "while boot.initialising_station_R:\n", + " print(f\"Still initialising station. {boot.initialisation_progress_R}% complete. State: {boot.initialisation_status_R}\")\n", + " time.sleep(1)\n", + "\n", + "if boot.initialisation_progress_R == 100:\n", + " print(\"Done initialising station.\")\n", + "else:\n", + " print(f\"Failed to initialise station: {boot.initialisation_status_R}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b444b751", + "metadata": {}, + "source": [ + "## Inspect Docker status\n", + "Docker containers that are not running will not provide any functionality, and are ignored when the station is rebooted." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b09f9da", + "metadata": {}, + "outputs": [], + "source": [ + "container_status = {attr_name: getattr(docker, attr_name)\n", + " for attr_name in docker.get_attribute_list()\n", + " if attr_name.endswith(\"_R\")\n", + " and attr_name != 'version_R'}\n", + "\n", + "not_running_containers = [container for container, running in container_status.items() if running is False]\n", + "\n", + "if not not_running_containers:\n", + " print(\"All docker containers are running\")\n", + "else:\n", + " print(f\"Docker containers that are NOT running: {not_running_containers}\")" + ] + }, + { + "cell_type": "markdown", + "id": "55f3981d", + "metadata": {}, + "source": [ + "## Inspect Device status\n", + "Check whether all software devices are indeed up and running." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "637e6e22", + "metadata": {}, + "outputs": [], + "source": [ + "for d in devices:\n", + " try:\n", + " print(f\"Device {d.dev_name()} is in state {d.state()}\")\n", + " except ConnectionFailed as e:\n", + " print(f\"Device {d.dev_name()} is in state DOWN: {e.args[0].desc}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23008885", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "StationControl", + "language": "python", + "name": "stationcontrol" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}