diff --git a/CDB/LOFAR_ConfigDb.json b/CDB/LOFAR_ConfigDb.json index c7fc852c35cb9fcff76b24ab0e10664e17489156..b28c57984889d07dd0544702d6712861bdb72903 100644 --- a/CDB/LOFAR_ConfigDb.json +++ b/CDB/LOFAR_ConfigDb.json @@ -21,6 +21,13 @@ } } }, + "boot": { + "LTS": { + "Boot": { + "LTS/Boot/1": {} + } + } + }, "RECV": { "LTS": { "RECV": { diff --git a/README.md b/README.md index 192b3edb7713088120b672065296575c255adfa6..3f9bb0b2ba94b26e76a850d713a2dd048e218b9e 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,49 @@ # Tango Station Control Station Control software related to Tango devices. + +# Installation + +## Prerequisites + +After checking out this repo, be sure to also check out the submodules: + +``` +git submodule init +git submodule update +``` + +You will also need: + +* docker +* docker-compose +* make +* bash + +## Bootstrap + +The bootstrap procedure is needed only once. First we build all docker containers, and load the initial configuration. This may take a while: + +``` +cd docker-compose +make bootstrap +``` + +If you lack access to LOFAR station hardware, configure the devices to use their simulators instead: + +``` +for sim in ../CDB/*-sim-config.json; do + ../sbin/update_ConfigDb.sh ${sim} +done +``` + +Now we can start all containers, and make sure everything is up: + +``` +make start +make status +``` + +If not, you can inspect why with `docker logs <container>`. The containers will automatically be restarted on reboot or failure. Stop them explicitly to bring them down (`make stop <container>`). + +Most notably, you will have web interfaces available at http://localhost:8888 (Jupyter Notebook), and http://localhost:3000 (Grafana).
diff --git a/devices/clients/opcua_client.py b/devices/clients/opcua_client.py index 7d915cbd00aee72da2a13b7bbb7365306457cf4d..11e4cac79dca4faf7acc04c97f4b4790490b4a3f 100644 --- a/devices/clients/opcua_client.py +++ b/devices/clients/opcua_client.py @@ -179,6 +179,10 @@ class OPCUAConnection(CommClient): return prot_attr.read_function, prot_attr.write_function + def call_method(self, method_path, *args): + raise NotImplementedError + + class ProtocolAttribute: """ This class provides a small wrapper for the OPC ua read/write functions in order to better organise the code diff --git a/devices/devices/boot.py b/devices/devices/boot.py new file mode 100644 index 0000000000000000000000000000000000000000..fb8a6947c6eb5775f96cb3660c5a123bba370ca9 --- /dev/null +++ b/devices/devices/boot.py @@ -0,0 +1,297 @@ +# -*- coding: utf-8 -*- +# +# This file is part of the RECV project +# +# +# +# Distributed under the terms of the APACHE license. +# See LICENSE.txt for more info. + +""" Boot Device Server for LOFAR2.0 + +Boots the rest of the station software. + +""" + +# TODO(Corne): Remove sys.path.append hack once packaging is in place! 
+import os, sys +currentdir = os.path.dirname(os.path.realpath(__file__)) +parentdir = os.path.dirname(currentdir) +sys.path.append(parentdir) + +# PyTango imports +from tango import DebugIt +from tango.server import run, command +from tango.server import device_property, attribute +from tango import AttrWriteType, DeviceProxy, DevState +# Additional import +import numpy + +from device_decorators import * + +from clients.attribute_wrapper import attribute_wrapper +from devices.hardware_device import hardware_device +from common.lofar_logging import device_logging_to_python, log_exceptions +from common.lofar_git import get_version + +import logging +logger = logging.getLogger() + +from threading import Thread + +__all__ = ["Boot", "main"] + + +class InitialisationException(Exception): + pass + +class DevicesInitialiser(Thread): + """ + Initialise devices on this station. + + Devices which are unreachable are assumed to be brought down explicitly, + and are ignored (unless ignore_unavailable_devices == False). + + Initialisation happens in a separate thread. It is started by calling + the start() method, and progress can be followed by inspecting the + members progress (0-100), status (string), and is_running() (bool). + """ + def __init__(self, device_names, ignore_unavailable_devices=True, proxy_timeout=10.0): + self.ignore_unavailable_devices = ignore_unavailable_devices + + # Since Python3.7+, the insertion order equals the iteration order, which is what we depend on + # to process the devices in the same order as in device_names. 
+ self.devices = {name: DeviceProxy(name) for name in device_names} + + # set the timeout for all proxies + for device in self.devices.values(): + device.set_timeout_millis(int(proxy_timeout * 1000)) + + # setup initial state + self.progress = 0 + self.set_status("Initialisation not started yet") + + super().__init__() + + def run(self): + self.set_status("Starting initialisation") + + try: + self.initialise_devices() + + self.set_status("Initialisation completed") + except Exception as e: + logger.exception("Failed to initialise station") + + # add the exception to the status + self.set_status(f"{self.status} [{e.__class__.__name__}: {str(e)}]") + + # we keep the status stuck at the last thing it tried + + def is_running(self): + return self.is_alive() + + + def stop(self): + if not self.is_alive(): + return + + # Just wait for the current initialisation to finish. It's a finite process. + self.join() + + def set_status(self, status): + self.status = status + + logger.info(status) + + def initialise_devices(self): + """ + Initialise or re-initialise all devices on the station. + + :return:None + """ + + # reset initialisation parameters + self.progress = 0 + + # restart devices in order + for num_restarted_devices, device in enumerate(self.devices.keys(), 1): + if self.is_available(device) or not self.ignore_unavailable_devices: + self.start_device(device) + + self.progress = 100.0 * num_restarted_devices / len(self.devices) + + # make sure we always finish at 100% in case of success + self.progress = 100 + + def is_available(self, device_name: str): + """ Return whether the device 'device_name' is actually available on this server. """ + + proxy = self.devices[device_name] + try: + proxy.state() + except Exception as e: + return False + + return True + + def stop_device(self, device_name: str): + """ Stop device 'device_name'. 
""" + + proxy = self.devices[device_name] + + if proxy.state() == DevState.OFF: + # already off + return + + self.set_status(f"[stopping {device_name}] Stopping device.") + + proxy.Off() + if proxy.state() != DevState.OFF: + raise InitialisationException(f"Could not turn off device {device_name}. Please look at its logs.") + + self.set_status(f"[stopping {device_name}] Stopped device.") + + def start_device(self, device_name: str): + """ Run the startup sequence for device 'device_name'. """ + + proxy = self.devices[device_name] + + # go to a well-defined state, which may be needed if the user calls + # this function explicitly. + self.stop_device(device_name) + + # setup connections to hardware + self.set_status(f"[restarting {device_name}] Initialising device.") + proxy.Initialise() + if proxy.state() != DevState.STANDBY: + raise InitialisationException(f"Could not initialise device {device_name}. Please look at its logs.") + + # configure the device + self.set_status(f"[restarting {device_name}] Setting defaults.") + proxy.set_defaults() + + self.set_status(f"[restarting {device_name}] Initialising hardware.") + proxy.initialise_hardware() + + # mark as ready for service + self.set_status(f"[restarting {device_name}] Turning on device.") + proxy.On() + if proxy.state() != DevState.ON: + raise InitialisationException(f"Could not turn on device {device_name}. 
Please look at its logs.") + + self.set_status(f"[restarting {device_name}] Succesfully started.") + +@device_logging_to_python() +class Boot(hardware_device): + # ----------------- + # Device Properties + # ----------------- + + DeviceProxy_Time_Out = device_property( + dtype='DevDouble', + mandatory=False, + default_value=10.0, + ) + + # Which devices to initialise, and in which order + Device_Names = device_property( + dtype='DevVarStringArray', + mandatory=False, + default_value=["LTS/Docker/1", # Docker controls the device containers, so it goes before anything else + "LTS/RECV/1", # RCUs are input for SDP, so initialise them first + "LTS/UNB2/1", # Uniboards host SDP, so initialise them first + "LTS/SDP/1", # SDP controls the mask for SST/XST/BST, so initialise it first + "LTS/SST/1", + "LTS/XST/1", + ], + ) + + # By default, we assume any device is not available + # because its docker container was not started, which + # is an explicit and thus intentional action. + # We ignore such devices when initialising the station. + Ignore_Unavailable_Devices = device_property( + dtype='DevBoolean', + mandatory=False, + default_value=True, + ) + + # ---------- + # Attributes + # ---------- + initialising_station_R = attribute(dtype=numpy.bool_, access=AttrWriteType.READ, fget=lambda self: self.initialiser.is_running()) + initialisation_progress_R = attribute(dtype=numpy.int64, access=AttrWriteType.READ, fget=lambda self: numpy.int64(self.initialiser.progress)) + initialisation_status_R = attribute(dtype=str, access=AttrWriteType.READ, fget=lambda self: self.initialiser.status) + + @log_exceptions() + def delete_device(self): + """Hook to delete resources allocated in init_device. + + This method allows for any memory or other resources allocated in the + init_device method to be released. This method is called by the device + destructor and by the device Init command (a Tango built-in).
+ """ + self.debug_stream("Shutting down...") + + self.Off() + self.debug_stream("Shut down. Good bye.") + + # -------- + # overloaded functions + # -------- + @log_exceptions() + def configure_for_off(self): + """ user code here. is called when the state is set to OFF """ + try: + self.initialiser.stop() + except Exception as e: + self.warn_stream("Exception while stopping OPC ua connection in configure_for_off function: {}. Exception ignored".format(e)) + + @log_exceptions() + def configure_for_initialise(self): + # create an initialiser object so we can query it even before starting the (first) initialisation + self.initialiser = DevicesInitialiser(self.Device_Names, self.Ignore_Unavailable_Devices, self.DeviceProxy_Time_Out) + + @command() + @DebugIt() + @only_in_states([DevState.ON]) + @fault_on_error() + @log_exceptions() + def initialise_station(self): + """ + Initialise or re-initialise all devices on the station. + + This command will take a while to execute, so should be called asynchronously. 
+ + :return:None + """ + + if self.initialiser.is_running(): + # already initialising + return + + # join any previous attempt, if any + try: + self.initialiser.join() + except RuntimeError: + pass + + # start new initialisation attempt + self.initialiser = DevicesInitialiser(self.Device_Names, self.Ignore_Unavailable_Devices, self.DeviceProxy_Time_Out) + self.initialiser.start() + +# ---------- +# Run server +# ---------- +def main(args=None, **kwargs): + """Main function of the Boot module.""" + + from common.lofar_logging import configure_logger + configure_logger() + + return run((Boot,), args=args, **kwargs) + + +if __name__ == '__main__': + main() diff --git a/devices/devices/hardware_device.py b/devices/devices/hardware_device.py index a25b863ebc8255fa05c02a5f420f23f309ebf0fb..84d8e4c2b9c6156c994715416bebf38f979903b6 100644 --- a/devices/devices/hardware_device.py +++ b/devices/devices/hardware_device.py @@ -11,7 +11,7 @@ """ -from abc import ABCMeta, abstractmethod +from abc import abstractmethod # PyTango imports from tango.server import Device, command, DeviceMeta, attribute @@ -23,11 +23,14 @@ from common.lofar_logging import log_exceptions from common.lofar_git import get_version from devices.abstract_device import AbstractDeviceMetas from devices.device_decorators import only_in_states, fault_on_error +import time +import math import logging __all__ = ["hardware_device"] +import logging logger = logging.getLogger() @@ -244,3 +247,36 @@ class hardware_device(Device, metaclass=AbstractDeviceMetas): # log which attribute we're addressing raise Exception(f"Cannot assign default to attribute {name}") from e + @only_in_states([DevState.STANDBY, DevState.INIT, DevState.ON]) + @fault_on_error() + @command() + def initialise_hardware(self): + """ Initialise the hardware after configuring it. """ + + # This is just the command version of _initialise_hardware().
+ self._initialise_hardware() + + def _initialise_hardware(self): + """ Override this method to initialise any hardware after configuring it. """ + pass + + def wait_attribute(self, attr_name, value, timeout=10, pollperiod=0.2): + """ Wait until the given attribute obtains the given value. + + Raises an Exception if it has not after the timeout. + + timeout: time until an Exception is raised, in seconds. + pollperiod: how often to check the attribute, in seconds. + """ + + # Poll every pollperiod seconds. The attribute is read anew on each + # iteration, since its value is expected to change while we wait. + for _ in range(math.ceil(timeout/pollperiod)): + if getattr(self, attr_name) == value: + # the attribute obtained the requested value + return + + time.sleep(pollperiod) + + raise Exception(f"{attr_name} != {value} after {timeout} seconds still.") + diff --git a/devices/devices/recv.py b/devices/devices/recv.py index a078f601c7d1962f4a11367e7ca9745ec590d5f2..40efb14354471021e0a3d03feab865141422bbc7 100644 --- a/devices/devices/recv.py +++ b/devices/devices/recv.py @@ -61,6 +61,7 @@ class RECV(opcua_device): # Attributes # ---------- Ant_mask_RW = attribute_wrapper(comms_annotation=["2:PCC", "2:Ant_mask_RW"], datatype=numpy.bool_, dims=(3, 32), access=AttrWriteType.READ_WRITE) + Ant_status_R = attribute(dtype=str, max_dim_x=3, max_dim_y=32) CLK_Enable_PWR_R = attribute_wrapper(comms_annotation=["2:PCC", "2:CLK_Enable_PWR_R"], datatype=numpy.bool_) CLK_I2C_STATUS_R = attribute_wrapper(comms_annotation=["2:PCC", "2:CLK_I2C_STATUS_R"], datatype=numpy.int64) CLK_PLL_error_R = attribute_wrapper(comms_annotation=["2:PCC", "2:CLK_PLL_error_R"], datatype=numpy.bool_) @@ -89,6 +90,7 @@ class RECV(opcua_device): RCU_mask_RW = attribute_wrapper(comms_annotation=["2:PCC", "2:RCU_mask_RW"], datatype=numpy.bool_, dims=(32,), access=AttrWriteType.READ_WRITE) RCU_monitor_rate_RW = attribute_wrapper(comms_annotation=["2:PCC", "2:RCU_monitor_rate_RW"], datatype=numpy.int64, access=AttrWriteType.READ_WRITE) RCU_Pwr_dig_R = attribute_wrapper(comms_annotation=["2:PCC", "2:RCU_Pwr_dig_R"], datatype=numpy.bool_, dims=(32,)) +
RCU_status_R = attribute(dtype=str, max_dim_x=32) RCU_temperature_R = attribute_wrapper(comms_annotation=["2:PCC", "2:RCU_temperature_R"], datatype=numpy.float64, dims=(32,)) RCU_translator_busy_R = attribute_wrapper(comms_annotation=["2:PCC", "2:RCU_translator_busy_R"], datatype=numpy.bool_) RCU_version_R = attribute_wrapper(comms_annotation=["2:PCC", "2:RCU_version_R"], datatype=numpy.str, dims=(32,)) @@ -96,19 +98,6 @@ class RECV(opcua_device): # -------- # overloaded functions # -------- - @log_exceptions() - def configure_for_initialise(self): - """ user code here. is called when the state is set to INIT """ - - # Init the dict that contains function to OPC-UA function mappings. - self.function_mapping = {} - self.function_mapping["RCU_on"] = {} - self.function_mapping["RCU_off"] = {} - self.function_mapping["CLK_on"] = {} - self.function_mapping["CLK_off"] = {} - - super().configure_for_initialise() - # -------- # Commands @@ -116,80 +105,161 @@ class RECV(opcua_device): @command() @DebugIt() @only_when_on() - @fault_on_error() def RCU_off(self): """ :return:None """ - self.function_mapping["RCU_off"]() + self.opcua_connection.call_method(["2:PCC","2:RCU_off"]) @command() @DebugIt() @only_when_on() - @fault_on_error() def RCU_on(self): """ :return:None """ - self.function_mapping["RCU_on"]() + self.opcua_connection.call_method(["2:PCC","2:RCU_on"]) @command() @DebugIt() @only_when_on() - @fault_on_error() def ADC_on(self): """ :return:None """ - self.function_mapping["ADC_on"]() + self.opcua_connection.call_method(["2:PCC","2:ADC_on"]) @command() @DebugIt() @only_when_on() - @fault_on_error() def RCU_update(self): """ :return:None """ - self.function_mapping["RCU_update"]() + self.opcua_connection.call_method(["2:PCC","2:RCU_update"]) @command() @DebugIt() @only_when_on() - @fault_on_error() def CLK_off(self): """ :return:None """ - self.function_mapping["CLK_off"]() + self.opcua_connection.call_method(["2:PCC","2:CLK_off"]) @command() @DebugIt()
@only_when_on() - @fault_on_error() def CLK_on(self): """ :return:None """ - self.function_mapping["CLK_on"]() + self.opcua_connection.call_method(["2:PCC","2:CLK_on"]) @command() @DebugIt() @only_when_on() - @fault_on_error() def CLK_PLL_setup(self): """ :return:None """ - self.function_mapping["CLK_PLL_setup"]() + self.opcua_connection.call_method(["2:PCC","2:CLK_PLL_setup"]) + + def _initialise_hardware(self): + """ Initialise the RCU hardware. """ + + # method calls don't work yet, so don't use them to allow the boot + # device to initialise us without errors + logger.error("OPC-UA methods not supported yet, not initialising RCU hardware!") + return + + # Cycle clock + self.CLK_off() + self.wait_attribute("CLK_translator_busy_R", False, 10) + self.CLK_on() + self.wait_attribute("CLK_translator_busy_R", False, 10) + + if not self.CLK_PLL_locked_R: + if self.CLK_I2C_STATUS_R > 0: + raise Exception("CLK I2C is not working. Maybe power cycle subrack to restart CLK board and translator?") + else: + raise Exception("CLK signal is not locked. The subrack probably do not receive clock input or the CLK PCB is broken?") + + # Cycle RCUs + self.RCU_off() + self.wait_attribute("RCU_translator_busy_R", False, 5) + self.RCU_on() + self.wait_attribute("RCU_translator_busy_R", False, 5) + + def read_RCU_status_R(self): + """ Returns a set of strings denoting the status of each RCU. + + An empty string means no problems were detected. A non-empty + string means the RCU seems unusable. + + This function can be used as input to modify the RCU_mask_RW. """ + + rcu_mask = self.RCU_mask_RW + i2c_status = self.RCU_I2C_STATUS_R + + nr_rcus = len(rcu_mask) + rcu_status = [""] * nr_rcus + + # construct status of each RCU + for rcu in range(nr_rcus): + status = [] + + if i2c_status[rcu] != 0: + status.append("[I2C error]") + + rcu_status[rcu] = " ".join(status) + + return rcu_status + + def read_Ant_status_R(self): + """ Returns a set of strings denoting the status of each antenna.
+ + An empty string means no problems were detected. A non-empty + string means the antenna seems unusable. + + This function can be used as input to modify the Ant_mask_RW. """ + + ant_mask = self.Ant_mask_RW + rcu_mask = self.RCU_mask_RW + adc_lock = self.RCU_ADC_lock_R + i2c_status = self.RCU_I2C_STATUS_R + + nr_rcus = len(ant_mask) + nr_ants_per_rcu = len(ant_mask[0]) + + # Collect status, join them into a single string per antenna later + ant_status = [[""] * nr_ants_per_rcu for _ in range(nr_rcus)] + + + for rcu in range(nr_rcus): + for ant in range(nr_ants_per_rcu): + status = [] + + if i2c_status[rcu] != 0: + status.append("[I2C error]") + + if not rcu_mask[rcu]: + status.append("[RCU masked out]") + + if not adc_lock[rcu][ant]: + status.append("[ADC lock error]") + + ant_status[rcu][ant] = " ".join(status) + return ant_status + # ---------- # Run server diff --git a/devices/devices/unb2.py b/devices/devices/unb2.py index e2f781a24e5e59c52591f0826e36000a38687aa1..4b071950bb68c52a41758a5eedba32605c0214cf 100644 --- a/devices/devices/unb2.py +++ b/devices/devices/unb2.py @@ -38,6 +38,12 @@ class UNB2(opcua_device): # Device Properties # ----------------- + UNB2_mask_RW_default = device_property( + dtype='DevVarBooleanArray', + mandatory=False, + default_value=[True] * 2 + ) + # ---------- # Attributes # ---------- diff --git a/devices/toolkit/lts_cold_start.py b/devices/toolkit/lts_cold_start.py deleted file mode 100644 index 47d3243e2064dc39fba8127e33da842acba19416..0000000000000000000000000000000000000000 --- a/devices/toolkit/lts_cold_start.py +++ /dev/null @@ -1,228 +0,0 @@ -#! /usr/bin/env python3 -import logging -from time import sleep - -# TODO(Corne): Remove sys.path.append hack once packaging is in place!
-import os, sys -currentdir = os.path.dirname(os.path.realpath(__file__)) -parentdir = os.path.dirname(currentdir) -sys.path.append(parentdir) - -from toolkit.startup import startup -from toolkit.lofar2_config import configure_logging - - -def start_device(device: str): - ''' - Start a Tango device with the help of the startup function. - The device will not be forced to got through - OFF/INIT/STANDBY/ON but it is assumed that the device is in OFF - state. If the device is not in OFF state, then an exception - will be raised. - ''' - dev = startup(device = device, force_restart = False) - state = device.state() - if state is not tango._tango.DevState.ON: - raise Exception("Device \"{}\" is unexpectedly in \"{}\" state but it is expected to be in \"{}\" state. Please check the reason for the unexpected device state. Aborting the start-up procedure.".format(device, state, tango._tango.DevState.ON)) - return device - - -def lts_cold_start(): - ''' - What is this? - This is the LTS (LOFAR Test - and I forgot what S stands for) cold start - procedure cast into source code. The procedure can be found there: - https://support.astron.nl/confluence/display/L2M/LTS+startup+procedure - - Paulus wrote already a script that - illegally ;) - makes direct use of the - OPC-UA servers to accomplish the same thing that we are doing here. - Paulus' script can be found there: - https://git.astron.nl/lofar2.0/pypcc/-/blob/master/scripts/Startup.py - Thanks, Paulus! You made it very easy for me to cobble together this - script. - - For obvious reasons is our script much better though. :) - First, it is bigger. And bigger is always better. - Then it is better documented but that does not count in the HW world. - But it also raises exceptions with error messages that make an attempt to - help the user reading them and shuts down the respective Tango device(s) if - something goes south. 
- And that is where we try to do it really right: there is no reason to be - excessively verbatim when things work like they are expected to work. But - tell the user when something goes wrong, give an indication of what could - have gone wrong and where to look for the problem. - - Again, Paulus' script contains already very good indications where problems - might lie and made my job very easy. - - No parameters, parameters are for wimps. :) - ''' - # Define the LOFAR2.0 specific log format - configure_logging() - - # Get a reference to the RECV device, do not - # force a restart of the already running Tango - # device. - recv = startup("LTS/RECV/1") - - # Getting CLK, RCU & RCU ADCs into proper shape for use by real people. - # - # The start-up needs to happen in this sequence due to HW dependencies - # that can introduce issues which are then becoming very complicated to - # handle in SW. Therefore to keep it as simple as possible, let's stick - # to the rule recommended by Paulus: - # 1 CLK - # 2 RCU - # 3 RCU ADCs - # - # - # First take the CLK board through the motions. - # 1.1 Switch off CLK - # 1.2 Wait for CLK_translator_busy_R == True, throw an exception in timeout - # 1.3 Switch on CLK - # 1.4 Wait for CLK_translator_busy_R == True, throw an exception in timeout - # 1.5 Check if CLK_PLL_locked_R == True - # 1.6 Done - # - # - # Steps 1.1 & 1.2 - recv.CLK_off() - # 2021-04-30, Thomas - # This should be refactored into a function. - timeout = 10.0 - while recv.CLK_translator_busy_R is True: - logging.debug("Waiting on \"CLK_translator_busy_R\" to become \"True\"...") - timeout = timeout - 1.0 - if timeout < 1.0: - # Switching the RECV clock off should never take longer than - # 10 seconds. Here we ran into a timeout. - # Clean up and raise an exception. - recv.off() - raise Exception("After calling \"CLK_off\" a timeout occured while waiting for \"CLK_translator_busy_R\" to become \"True\". 
Please investigate the reason why the RECV translator never set \"CLK_translator_busy_R\" to \"True\". Aborting start-up procedure.") - sleep(1.0) - - # Steps 1.3 & 1.4 - recv.CLK_on() - # Per Paulus this should never take longer than 2 seconds. - # 2021-04-30, Thomas - # This should be refactored into a function. - timeout = 2.0 - while recv.CLK_translator_busy_R is True: - logging.debug("After calling \"CLK_on()\" Waiting on \"CLK_translator_busy_R\" to become \"True\"...") - timeout = timeout - 1.0 - if timeout < 1.0: - # Switching theRECV clock on should never take longer than - # a couple of seconds. Here we ran into a timeout. - # Clean up and raise an exception. - recv.off() - raise Exception("After calling \"CLK_on\" a timeout occured while waiting for \"CLK_translator_busy_R\" to become \"True\". Please investigate the reason why the RECV translator never set \"CLK_translator_busy_R\" to \"True\". Aborting start-up procedure.") - sleep(1.0) - - # 1.5 Check if CLK_PLL_locked_R == True - # 2021-04-30, Thomas - # This should be refactored into a function. - clk_locked = recv.CLK_PLL_locked_R - if clk_locked is True: - logging.info("CLK signal is locked.") - else: - # CLK signal is not locked - clk_i2c_status = recv.CLK_I2C_STATUS_R - exception_text = "CLK I2C is not working. Please investigate! Maybe power cycle subrack to restart CLK board and translator. Aborting start-up procedure." - if i2c_status <= 0: - exception_text = "CLK signal is not locked. Please investigate! The subrack probably do not receive clock input or the CLK PCB is broken. Aborting start-up procedure." - recv.off() - raise Exception(exception_text) - # Step 1.6 - # Done. - - # 2 RCUs - # If we reach this point in the start-up procedure, then the CLK board setup - # is done. We can proceed with the RCUs. - # - # Now take the RCUs through the motions. 
- # 2.1 Set RCU mask to all available RCUs - # 2.2 Switch off all RCUs - # 2.3 Wait for RCU_translator_busy_R = True, throw an exception in timeout - # 2.4 Switch on RCUs - # 2.5 Wait for RCU_translator_busy_R = True, throw an exception in timeout - # 2.6 Done - # - # - # Step 2.1 - # We have only 8 RCUs in LTS. - recv.RCU_mask_RW = [True, ] * 8 - # Steps 2.2 & 2.3 - recv.RCU_off() - # 2021-04-30, Thomas - # This should be refactored into a function. - timeout = 10.0 - while recv.RCU_translator_busy_R is True: - logging.debug("Waiting on \"RCU_translator_busy_R\" to become \"True\"...") - timeout = timeout - 1.0 - if timeout < 1.0: - # Switching the RCUs off should never take longer than - # 10 seconds. Here we ran into a timeout. - # Clean up and raise an exception. - recv.off() - raise Exception("After calling \"RCU_off\" a timeout occured while waiting for \"RCU_translator_busy_R\" to become \"True\". Please investigate the reason why the RECV translator never set \"RCU_translator_busy_R\" to \"True\". Aborting start-up procedure.") - sleep(1.0) - - # Steps 2.4 & 2.5 - # We leave the RCU mask as it is because it got already set for the - # RCU_off() call. - recv.RCU_on() - # Per Paulus this should never take longer than 5 seconds. - # 2021-04-30, Thomas - # This should be refactored into a function. - timeout = 5.0 - while recv.RCU_translator_busy_R is True: - logging.debug("After calling \"RCU_on()\" Waiting on \"RCU_translator_busy_R\" to become \"True\"...") - timeout = timeout - 1.0 - if timeout < 1.0: - # Switching the RCUs on should never take longer than - # a couple of seconds. Here we ran into a timeout. - # Clean up and raise an exception. - recv.off() - raise Exception("After calling \"RCU_on\" a timeout occured while waiting for \"RCU_translator_busy_R\" to become \"True\". Please investigate the reason why the RECV translator never set \"RCU_translator_busy_R\" to \"True\". Aborting start-up procedure.") - sleep(1.0) - # Step 2.6 - # Done. 
- - # 3 ADCs - # If we get here, we only got to check if the ADCs are locked, too. - # 3.1 Check RCUs' I2C status - # 3.2 Check RCU_ADC_lock_R == [True, ] for RCUs that have a good I2C status - # 3.3 Done - # - # - # Steps 3.1 & 3.2 - rcu_mask = recv.RCU_mask_RW - adc_locked = numpy.array(recv.RCU_ADC_lock_R) - for rcu, i2c_status in enumerate(recv.RCU_I2C_STATUS_R): - if i2c_status == 0: - rcu_mask[rcu] = True - logging.info("RCU #{} is available.".format(rcu)) - for adc, adc_is_locked in enumerate(adc_locked[rcu]): - if adc_is_locked < 1: - logging.warning("RCU#{}, ADC#{} is unlocked. Please investigate! Will continue with normal operation.".format(rcu, adc)) - else: - # The RCU's I2C bus is not working. - rcu_mask[rcu] = False - logging.error("RCU #{}'s I2C is not working. Please investigate! Disabling RCU #{} to avoid damage.".format(rcu, rcu)) - recv.RCU_mask_RW = rcu_mask - # Step 3.3 - # Done - - # Start-up APSCTL, i.e. Uniboard2s. - aps = startup("APSCTL/SDP/1") - logging.warning("Cannot start-up APSCTL because it requires manual actions.") - - # Start up SDP, i.e. configure the firmware in the Unibards - sdp = startup("LTS/SDP/1") - logging.warning("Cannot start-up SDP because it requires manual actions.") - - logging.info("LTS has been successfully started and configured.") - - -if __name__ == '__main__': - lts_cold_start() diff --git a/devices/toolkit/startup.py b/devices/toolkit/startup.py deleted file mode 100644 index 66a8d2c496fc7e86d0d13086336e900fc1a1bfaf..0000000000000000000000000000000000000000 --- a/devices/toolkit/startup.py +++ /dev/null @@ -1,49 +0,0 @@ -#! 
/usr/bin/env python3 -import tango -import logging - -logger = logging.getLogger() - -def startup(device: str, force_restart: bool) -> tango.DeviceProxy: - ''' - Start a LOFAR Tango device: - recv = startup(device = 'LTS/RECV/1', force_restart = False) - ''' - proxy = tango.DeviceProxy(device) - state = proxy.state() - - # go to OFF, but only if force_restart is True - if force_restart is True: - logger.warning(f"Forcing device {device} restart.") - proxy.off() - state = proxy.state() - if state is not tango._tango.DevState.OFF: - logger.error(f"Device {device} cannot perform off although restart has been enforced, state = {state}. Please investigate.") - return proxy - - if state is not tango._tango.DevState.OFF: - logger.error(f"Device {device} is not in OFF state, cannot start it. state = {state}") - return proxy - - # Initialise device - logger.info(f"Device {device} is in OFF, performing initialisation.") - proxy.initialise() - state = proxy.state() - if state is not tango._tango.DevState.STANDBY: - logger.error(f"Device {device} cannot perform initialise, state = {state}. Please investigate.") - return proxy - - # Set default values - logger.info(f"Device {device} is in STANDBY, setting default values.") - proxy.set_defaults() - - # Turn on device - logger.info(f"Device {device} is in STANDBY, performing on.") - proxy.on() - state = proxy.state() - if state is not tango._tango.DevState.ON: - logger.error(f"Device {device} cannot perform on, state = {state}. 
Please investigate.") - else: - logger.info(f"Device {device} has successfully reached ON state.") - - return proxy diff --git a/docker-compose/Makefile b/docker-compose/Makefile index 8e660436fb7ab61d86f704a00c5a386fcc25c401..81e5e4a85223a042b1325343fb01cc941ef3311c 100644 --- a/docker-compose/Makefile +++ b/docker-compose/Makefile @@ -154,6 +154,12 @@ ifneq ($(NETWORK_MODE),host) endif $(DOCKER_COMPOSE_ARGS) docker-compose -f tango.yml -f networks.yml up -d +bootstrap: pull build # first start, initialise from scratch + $(MAKE) start elk-configure-host # configure host kernel for elk container + $(MAKE) start dsconfig # boot up containers to load configurations + sleep 5 # wait for dsconfig container to come up + ../sbin/update_ConfigDb.sh ../CDB/LOFAR_ConfigDb.json # load default configuration + start: up ## start a service (usage: make start <servicename>) if [ $(UNAME_S) = Linux ]; then touch ~/.Xauthority; chmod a+r ~/.Xauthority; fi $(DOCKER_COMPOSE_ARGS) docker-compose $(COMPOSE_FILE_ARGS) start $(SERVICE) diff --git a/docker-compose/archiver.yml b/docker-compose/archiver.yml index 8a357d371e89377a1bfa2ce89e341ba708526fef..84dded354d22c97eeccd51ea97d8ff41b909f01e 100644 --- a/docker-compose/archiver.yml +++ b/docker-compose/archiver.yml @@ -73,5 +73,5 @@ services: - ..:/opt/lofar/tango:rw - ${HOME}:/hosthome - ../docker/tango/tango-archiver:/tango-archiver - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/device-boot.yml b/docker-compose/device-boot.yml new file mode 100644 index 0000000000000000000000000000000000000000..58a9aa7df81eab368464f4ca69ddab54129b7ace --- /dev/null +++ b/docker-compose/device-boot.yml @@ -0,0 +1,41 @@ +# +# Docker compose file that launches a LOFAR2.0 station's +# Boot device. It boots the rest of the station software +# by initialising the other devices in order.
+# +# Defines: +# - device-observation_control: LOFAR2.0 station ObvservationControl +# +# Requires: +# - lofar-device-base.yml +# +version: '2' + +services: + device-boot: + image: device-boot + # build explicitly, as docker-compose does not understand a local image + # being shared among services. + build: + context: lofar-device-base + args: + SOURCE_IMAGE: ${DOCKER_REGISTRY_HOST}/${DOCKER_REGISTRY_USER}-tango-itango:${TANGO_ITANGO_VERSION} + container_name: ${CONTAINER_NAME_PREFIX}device-boot + networks: + - control + ports: + - "5708:5708" # unique port for this DS + volumes: + - ..:/opt/lofar/tango:rw + environment: + - TANGO_HOST=${TANGO_HOST} + entrypoint: + - /usr/local/bin/wait-for-it.sh + - ${TANGO_HOST} + - --timeout=30 + - --strict + - -- + # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA + # can't know about our Docker port forwarding + - python3 -u /opt/lofar/tango/devices/devices/boot.py LTS -v -ORBendPoint giop:tcp:0:5708 -ORBendPointPublish giop:tcp:${HOSTNAME}:5708 + restart: unless-stopped diff --git a/docker-compose/device-docker.yml b/docker-compose/device-docker.yml index 5386ead921b386741e62febeab399f3007a79281..d9e1e1e35233177ab271db395773538ed8c74ffa 100644 --- a/docker-compose/device-docker.yml +++ b/docker-compose/device-docker.yml @@ -41,4 +41,4 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - python3 -u /opt/lofar/tango/devices/devices/docker_device.py LTS -v -ORBendPoint giop:tcp:0:5705 -ORBendPointPublish giop:tcp:${HOSTNAME}:5705 - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/device-observation_control.yml b/docker-compose/device-observation_control.yml index 011fe0a94112df557670a218518b6492520f4480..827a558a10167d29f3e0bd3402f3f84debcd3c23 100644 --- a/docker-compose/device-observation_control.yml +++ 
b/docker-compose/device-observation_control.yml @@ -38,4 +38,4 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - python3 -u /opt/lofar/tango/devices/devices/observation_control.py LTS -v -ORBendPoint giop:tcp:0:5703 -ORBendPointPublish giop:tcp:${HOSTNAME}:5703 - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/device-recv.yml b/docker-compose/device-recv.yml index fdf8a535b002d629b29ee48b49b9ee91d8e925d7..f3bc3eea12b51b44cacbeb790d0666ced24ae169 100644 --- a/docker-compose/device-recv.yml +++ b/docker-compose/device-recv.yml @@ -39,4 +39,4 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - python3 -u /opt/lofar/tango/devices/devices/recv.py LTS -v -ORBendPoint giop:tcp:0:5707 -ORBendPointPublish giop:tcp:${HOSTNAME}:5707 - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/device-sdp.yml b/docker-compose/device-sdp.yml index cdd8d137d6f249ef91e500dd4b9bb32734b23c90..8fefa3f355eda485ea757f0859924e317b9245ee 100644 --- a/docker-compose/device-sdp.yml +++ b/docker-compose/device-sdp.yml @@ -39,4 +39,4 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - python3 -u /opt/lofar/tango/devices/devices/sdp/sdp.py LTS -v -ORBendPoint giop:tcp:0:5701 -ORBendPointPublish giop:tcp:${HOSTNAME}:5701 - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/device-sst.yml b/docker-compose/device-sst.yml index a9547f53830a564eeba6c9123c753c0062d1da30..7d922a61badf6575d15c6f0a0489a6fac3683367 100644 --- a/docker-compose/device-sst.yml +++ b/docker-compose/device-sst.yml @@ -42,4 +42,4 @@ services: # configure CORBA to _listen_ on 0:port, but tell others 
we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - python3 -u /opt/lofar/tango/devices/devices/sdp/sst.py LTS -v -ORBendPoint giop:tcp:0:5702 -ORBendPointPublish giop:tcp:${HOSTNAME}:5702 - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/device-unb2.yml b/docker-compose/device-unb2.yml index 67c443121cf02bc9c1652978b1dd67a5ebf3a80b..b1d7b945c6c82c3eb6a48632a107051f9ad1abe8 100644 --- a/docker-compose/device-unb2.yml +++ b/docker-compose/device-unb2.yml @@ -39,4 +39,4 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - python3 -u /opt/lofar/tango/devices/devices/unb2.py LTS -v -ORBendPoint giop:tcp:0:5704 -ORBendPointPublish giop:tcp:${HOSTNAME}:5704 - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/device-xst.yml b/docker-compose/device-xst.yml index 1f75009dc6042b83aff706e34a811c1023f532b0..c634e5d83fc7b28f2b8438ae59dffb7157a03f54 100644 --- a/docker-compose/device-xst.yml +++ b/docker-compose/device-xst.yml @@ -42,4 +42,4 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - python3 -u /opt/lofar/tango/devices/devices/sdp/xst.py LTS -v -ORBendPoint giop:tcp:0:5706 -ORBendPointPublish giop:tcp:${HOSTNAME}:5706 - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/jupyter/Dockerfile b/docker-compose/jupyter/Dockerfile index b69ddfa7e5b6d6eaeab11b25f99258d0f0743daa..8be3e9f3900b01e80893d38aedcb4f6397aa8fd0 100644 --- a/docker-compose/jupyter/Dockerfile +++ b/docker-compose/jupyter/Dockerfile @@ -10,6 +10,9 @@ ENV HOME=/home/user RUN sudo mkdir -p ${HOME} RUN sudo chown ${CONTAINER_EXECUTION_UID} -R ${HOME} +# ipython 7.28 is broken in combination with Jupyter, it causes connection errors with notebooks 
+RUN sudo pip3 install ipython==7.27.0 + RUN sudo pip3 install jupyter RUN sudo pip3 install ipykernel RUN sudo pip3 install jupyter_bokeh @@ -46,7 +49,7 @@ COPY jupyter-notebook /usr/local/bin/jupyter-notebook RUN sudo pip3 install PyMySQL[rsa] sqlalchemy # Packages to interface with testing hardware directly -RUN sudo pip3 install pyvisa pyvisa-py +RUN sudo pip3 install pyvisa pyvisa-py opcua # Add Tini. Tini operates as a process subreaper for jupyter. This prevents kernel crashes. ENV TINI_VERSION v0.6.0 diff --git a/docker-compose/jupyter/ipython-profiles/stationcontrol-jupyter/startup/01-devices.py b/docker-compose/jupyter/ipython-profiles/stationcontrol-jupyter/startup/01-devices.py index df75d5962a1327041995aa04c41d6d1e1c2ae914..f6174de984585a0d6a8ebd64be104a9f31e1f7cd 100644 --- a/docker-compose/jupyter/ipython-profiles/stationcontrol-jupyter/startup/01-devices.py +++ b/docker-compose/jupyter/ipython-profiles/stationcontrol-jupyter/startup/01-devices.py @@ -4,6 +4,8 @@ sdp = DeviceProxy("LTS/SDP/1") sst = DeviceProxy("LTS/SST/1") xst = DeviceProxy("LTS/XST/1") unb2 = DeviceProxy("LTS/UNB2/1") +boot = DeviceProxy("LTS/Boot/1") +docker = DeviceProxy("LTS/Docker/1") # Put them in a list in case one wants to iterate -devices = [recv, sdp, sst, xst, unb2] +devices = [recv, sdp, sst, xst, unb2, boot, docker] diff --git a/docker-compose/recv-sim.yml b/docker-compose/recv-sim.yml index 7b1f704fa8854f12d411c7088b7caf0a74f328f0..edd4bc7e4efb589b55e9f7306da431ba91660b56 100644 --- a/docker-compose/recv-sim.yml +++ b/docker-compose/recv-sim.yml @@ -17,4 +17,4 @@ services: - ${HOME}:/hosthome ports: - "4843:4843" - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/sdptr-sim.yml b/docker-compose/sdptr-sim.yml index 70c1edf63acdf84df8a7d294aa17ea9489c9b9a7..c61cf8cfa84a6c65647c7a8bae4d0569aa60dad6 100644 --- a/docker-compose/sdptr-sim.yml +++ b/docker-compose/sdptr-sim.yml @@ -15,4 +15,4 @@ services: - control ports: - "4840:4840" - restart: 
on-failure + restart: unless-stopped diff --git a/docker-compose/tangotest.yml b/docker-compose/tangotest.yml index 3a44fc61b73b18a78d9e5bbd6a6fef6ac2d648fd..357c91df487b51379db221f7cb984bc05018f5e3 100644 --- a/docker-compose/tangotest.yml +++ b/docker-compose/tangotest.yml @@ -25,5 +25,5 @@ services: - -- - /usr/local/bin/TangoTest - test - restart: on-failure + restart: unless-stopped diff --git a/docker-compose/unb2-sim.yml b/docker-compose/unb2-sim.yml index e031e20f54ad6addec1fdbabf972661d6f4c8f9a..95468917e21eb5a2d5c5789a01dbd049de6eb091 100644 --- a/docker-compose/unb2-sim.yml +++ b/docker-compose/unb2-sim.yml @@ -17,4 +17,4 @@ services: - ${HOME}:/hosthome ports: - "4844:4844" - restart: on-failure + restart: unless-stopped diff --git a/docs/source/configure_station.rst b/docs/source/configure_station.rst new file mode 100644 index 0000000000000000000000000000000000000000..412795ff05d649ab57d255f566178a50614091bb --- /dev/null +++ b/docs/source/configure_station.rst @@ -0,0 +1,70 @@ +Enter your LOFAR2.0 Hardware Configuration +=========================================== + +The software will need to be told various aspects of your station configuration, for example, the hostnames of the station hardware to control. The following settings are installation specific, and are stored as *properties* in the :ref:`tangodb`. The format used here is ``device.property``: + +Mandatory settings +------------------- + +Without these settings, you will not obtain the associated functionality: + +:RECV.OPC_Server_Name: Hostname of RECVTR. + + :type: ``string`` + +:UNB2.OPC_Server_Name: Hostname of UNB2TR. + + :type: ``string`` + +:SDP.OPC_Server_Name: Hostname of SDPTR. + + :type: ``string`` + +:SST.OPC_Server_Name: Hostname of SDPTR. + + :type: ``string`` + +:SST.FPGA_sst_offload_hdr_eth_destination_mac_RW_default: MAC address of the network interface on the host running this software stack, on which the SSTs are to be received. 
This network interface must be capable of receiving Jumbo (MTU=9000) frames. + + :type: ``string[N_fpgas]`` + +:SST.FPGA_sst_offload_hdr_ip_destination_address_RW_default: IP address of the network interface on the host running this software stack, on which the SSTs are to be received. + + :type: ``string[N_fpgas]`` + +:XST.OPC_Server_Name: Hostname of SDPTR. + + :type: ``string`` + +:XST.FPGA_xst_offload_hdr_eth_destination_mac_RW_default: MAC address of the network interface on the host running this software stack, on which the XSTs are to be received. This network interface must be capable of receiving Jumbo (MTU=9000) frames. + + :type: ``string[N_fpgas]`` + +:XST.FPGA_xst_offload_hdr_ip_destination_address_RW_default: IP address of the network interface on the host running this software stack, on which the XSTs are to be received. + + :type: ``string[N_fpgas]`` + +Optional settings +------------------- + +These settings make life nicer, but are not strictly necessary to get your software up and running: + +:RECV.Ant_mask_RW_default: Which antennas are installed. + + :type: ``bool[N_RCUs][N_antennas_per_RCU]`` + +:SDP.RCU_mask_RW_default: Which RCUs are installed. + + :type: ``bool[N_RCUs]`` + +:UNB2.UNB2_mask_RW_default: Which Uniboard2s are installed in SDP. + + :type: ``bool[N_unb]`` + +:SDP.TR_fpga_mask_RW_default: Which FPGAs are installed in SDP. + + :type: ``bool[N_fpgas]`` + +:SDP.FPGA_sdp_info_station_id_RW_default: Numeric identifier for this station. + + :type: ``uint32[N_fpgas]`` diff --git a/docs/source/developer.rst b/docs/source/developer.rst new file mode 100644 index 0000000000000000000000000000000000000000..517dfa324298e9451bfa5f9b25eef9726476686e --- /dev/null +++ b/docs/source/developer.rst @@ -0,0 +1,61 @@ +Developer information +========================= + +This chapter describes key areas useful for developers. + +Docker compose +------------------------- + +The docker setup is managed using ``make`` in the ``docker-compose`` directory. 
Key commands are: + +- ``make status`` to check which containers are running, +- ``make build <container>`` to rebuild the image for the container, +- ``make build-nocache <container>`` to rebuild the image for the container from scratch, +- ``make restart <container>`` to restart a specific container, for example to effectuate a code change. +- ``make clean`` to remove all images and containers, and the ``tangodb`` volume. To do a deeper clean, we need to remove all volumes and rebuild all containers from scratch:: + + make clean + docker volume prune + make build-nocache + +Since the *Python code is taken from the host when the container starts*, restarting is enough to use the code you have in your local git repo. Rebuilding is unnecessary. + +Docker networking +------------------------- + +The Docker containers started use a *virtual network* to communicate among each other. This means that: + +- Containers address each other by a host name equal to the container name (f.e. ``elk`` for the elk stack, and ``databaseds`` for the TANGO_HOST), +- ``localhost`` cannot be used within the containers to access ports of other containers. +- ``host.docker.internal`` resolves to the actual host running the containers, +- All ports used by external parties need to be exposed explicitly in the docker-compose files. The container must open the same port as is thus exposed, or the port will not be reachable. + +The networks are defined in ``docker-compose/networks.yml``: + +.. literalinclude:: ../../docker-compose/networks.yml + +The ``$NETWORK_MODE`` defaults to ``tangonet`` in the ``docker-compose/Makefile``. + +.. _corba: + +CORBA +```````````````````` + +Tango devices use CORBA, which requires all servers to be able to reach each other directly. Each CORBA device opens a port and advertises its address to the CORBA broker. The broker then forwards this address to any interested clients.
A device within a docker container cannot know under which name it can be reached, however, and any port opened needs to be exposed explicitly in the docker-compose file for the device. To solve all this, we *assign a unique port to each device*, and explicitly tell CORBA to use that port, and what the hostname is under which others can reach it. Each device thus has these lines in its compose file:: + + ports: + - "5701:5701" # unique port for this DS + entrypoint: + # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA + # can't know about our Docker port forwarding + - python3 -u /opt/lofar/tango/devices/devices/sdp/sdp.py LTS -v -ORBendPoint giop:tcp:0:5701 -ORBendPointPublish giop:tcp:${HOSTNAME}:5701 + +Specifying the wrong ``$HOSTNAME`` or port can make your device unreachable, even if it is running. Note that ``$HOSTNAME`` is advertised as is, that is, it is resolved to an IP address by any client that wants to connect. This means the ``$HOSTNAME`` needs to be correct for both the other containers, and external clients. + +The ``docker-compose/Makefile`` tries to set a good default for ``$HOSTNAME``, but you can override it by exporting the environment variable yourself (and run ``make restart <container>`` to effectuate the change).
+ +For more information, see: + +- https://huihoo.org/ace_tao/ACE-5.2+TAO-1.2/TAO/docs/ORBEndpoint.html +- http://omniorb.sourceforge.net/omni42/omniNames.html +- https://sourceforge.net/p/omniorb/svn/HEAD/tree/trunk/omniORB/src/lib/omniORB/orbcore/tcp/tcpEndpoint.cc diff --git a/docs/source/devices/configure.rst b/docs/source/devices/configure.rst new file mode 100644 index 0000000000000000000000000000000000000000..aa96966d2ee9d383c60e6a1651d0064bb8b914d2 --- /dev/null +++ b/docs/source/devices/configure.rst @@ -0,0 +1,63 @@ +Device Configuration +========================= + +The devices receive their configuration from two sources: + +- The TangoDB database, for static *properties*, +- Externally, from the user, or a control system, that set *control attributes* (see :doc:`devices` for what to set, and :ref:`attributes` for how to set them). + +.. _tangodb: + +TangoDB +------------------------- + +The TangoDB database is a persistent store for the properties of each device. The properties encode static settings, such as the hardware addresses, and default values for control attributes. + +Each device queries the TangoDB for the value of its properties during the ``initialise()`` call. Default values for control attributes can then be applied by explicitly calling ``set_defaults()``. The ``boot`` device also calls ``set_defaults()`` when initialising the station. The rationale being that the defaults can be applied at boot, but shouldn't be applied automatically during operations, as not to disturb running hardware. + +Device interaction +```````````````````````````` + +The properties of a device can be queried from the device directly:: + + # get a list of all the properties + property_names = device.get_property_list("*") + + # fetch the values of the given properties. returns a {property: value} dict. 
+ property_dict = device.get_property(property_names) + +Properties can also be changed:: + + changeset = { "property": "new value" } + + device.put_property(changeset) + +Note that new values for properties will only be picked up by the device during ``initialise()``, so you will have to turn the device off and on. + +Command-line interaction +`````````````````````````` + +The content of the TangoDB can be dumped from the command line using:: + + sbin/dump_ConfigDb.sh > tangodb-dump.json + +and changes can be applied using:: + + sbin/update_ConfigDb.sh changeset.json + +.. note:: The ``dsconfig`` docker container needs to be running for these commands to work. + +Jive +`````````````````````````` + +The TangoDB can also be interactively queried and modified using Jive. Jive is an X11 application provided by the ``jive`` image as part of the software stack of the station. It must however be started on-demand, with a correctly configured ``$DISPLAY``:: + + cd docker-compose + make start jive + +If Jive does not appear, check ``docker logs jive`` to see what went wrong. + +For information on how to use Jive, see https://tango-controls.readthedocs.io/en/latest/tools-and-extensions/built-in/jive/. + +.. note:: If you need an X11 server on Windows, see :ref:`x11_on_windows`. + diff --git a/docs/source/devices/devices.rst b/docs/source/devices/devices.rst new file mode 100644 index 0000000000000000000000000000000000000000..1c6090bef3066def70a032b191688d8d0444cb03 --- /dev/null +++ b/docs/source/devices/devices.rst @@ -0,0 +1,179 @@ +Devices +============ + +.. _boot: + +Boot +--------- + +The ``boot == DeviceProxy("LTS/Boot/1")`` device is responsible for (re)starting and initialising the other devices. Devices which are not reachable, for example because their docker container is explicitly stopped, are skipped during initialisation.
This device provides the following commands: + +:initialise_station(): Stop and start the other devices in the correct order, set their default values, and command them to initialise their hardware. This procedure runs asynchronously, causing this command to return immediately. Initialisation is aborted if an error is encountered. + + :returns: ``None`` + +The initialisation process can subsequently be followed through monitoring the following attributes: + +:initialising_R: Whether the initialisation procedure is still ongoing. + + :type: ``bool`` + +:initialisation_progress_R: Percentage completeness of the initialisation procedure. Each successfully configured device increments progress. + + :type: ``int`` + +:initialisation_status_R: A description of what the device is currently trying to do. If an error occurs, this will hint towards the cause. + + :type: ``str`` + +A useful pattern is thus to call ``initialise_station()``, wait for ``initialising_R == False``, and then check whether the initialisation was successful, that is, whether ``initialisation_progress_R == 100``. If a device fails to initialise, most likely the :doc:`../interfaces/logs` will need to be consulted. + +.. _docker: + +Docker +--------- + +The ``docker == DeviceProxy("LTS/Docker/1")`` device controls the docker containers. It allows starting and stopping them, and querying whether they are running. Each container is represented by two attributes: + +:<container>_R: Returns whether the container is running. + + :type: ``bool`` + +:<container>_RW: Set to ``True`` to start the container, and to ``False`` to stop it. + + :type: ``bool`` + +.. warning:: Do *not* stop the ``tango`` container, as doing so cripples the Tango infrastructure, leaving the station inoperable. It is also not wise to stop the ``device_docker`` container, as doing so would render this device unreachable. + + +RECV +---------- + +The ``recv == DeviceProxy("LTS/RECV/1")`` device controls the RCUs, the LBA antennas, and HBA tiles.
Central to its operation are the masks (see also :ref:`attribute-masks`): + +:RCU_mask_RW: Controls which RCUs will actually be configured when attributes referring to RCUs are written. + + :type: ``bool[N_RCUs]`` + +:Ant_mask_RW: Controls which antennas will actually be configured when attributes referring to antennas are written. + + :type: ``bool[N_RCUs][N_antennas_per_RCU]`` + +Typically, ``N_RCUs == 32``, and ``N_antennas_per_RCU == 3``. + +SDP +----------- + +The ``sdp == DeviceProxy("LTS/SDP/1")`` device controls the digital signal processing in SDP, performed by the firmware on the FPGAs on the Uniboards. Central to its operation is the mask (see also :ref:`attribute-masks`): + +:TR_fpga_mask_RW: Controls which FPGAs will actually be configured when attributes referring to FPGAs are written. + + :type: ``bool[N_fpgas]`` + +Typically, ``N_fpgas == 16``. + +SST and XST +----------- + +The ``sst == DeviceProxy("LTS/SST/1")`` and ``xst == DeviceProxy("LTS/XST/1")`` devices manage the SSTs (subband statistics) and XSTs (crosslet statistics), respectively. The statistics are emitted piece-wise through UDP packets by the FPGAs on the Uniboards in SDP. By default, each device configures the statistics to be streamed to itself (the device), from where the user can obtain them. + +The statistics are exposed in two ways, as: + +- *Attributes*, representing the most recently received values, +- *TCP stream*, to allow the capture and recording of the statistics over any period of time. + +SST Statistics attributes +````````````````````````` + +The SSTs represent the amplitude of the signal in each subband, for each antenna, as an integer value. They are exposed through the following attributes: + +:sst_R: Amplitude of each subband, from each antenna. + + :type: ``uint64[N_ant][N_subbands]`` + +:sst_timestamp_R: Timestamp of the data, per antenna. + + :type: ``uint64[N_ant]`` + +:integration_interval_R: Timespan over which the SSTs were integrated, per antenna.
+ + :type: ``float32[N_ant]`` + +:subbands_calibrated_R: Whether the subband data was calibrated using the subband weights. + + :type: ``bool[N_ant]`` + +Typically, ``N_ant == 192``, and ``N_subbands == 512``. + +XST Statistics attributes +````````````````````````` + +The XSTs represent the cross-correlations between each pair of antennas, as complex values. The phases and amplitudes of the XSTs represent the phase and amplitude difference between the antennas, respectively. They are exposed as a matrix ``xst[a][b]``, of which only the triangle ``a<=b`` is filled, as the cross-correlation between antenna pairs ``(b,a)`` is equal to the complex conjugate of the cross-correlation of ``(a,b)``. The other triangle contains incidental values, but will be mostly 0. + +Complex values cannot be represented in Tango attributes. Instead, the XST matrix is exposed in both its Cartesian and polar forms: + +:xst_power_R, xst_phase_R: Amplitude and phase of the crosslet statistics. + + :type: ``float32[N_ant][N_ant]`` + +:xst_real_R, xst_imag_R: Real and imaginary parts of the crosslet statistics. + + :type: ``float32[N_ant][N_ant]`` + +:xst_timestamp_R: Timestamp of each block. + + :type: ``int64[N_blocks]`` + +:integration_interval_R: Timespan over which the XSTs were integrated, for each block. + + :type: ``float32[N_blocks]`` + +Typically, ``N_ant == 192``, and ``N_blocks == 136``. + +The metadata refers to the *blocks*, which are emitted by the FPGAs to represent the XSTs between 12 x 12 consecutive antennas.
The following code converts block numbers to the indices of the first antenna pair in a block:: + + from common.baselines import baseline_from_index + + def first_antenna_pair(block_nr: int) -> int: + coarse_a, coarse_b = baseline_from_index(block_nr) + return (coarse_a * 12, coarse_b * 12) + +Conversely, to calculate the block index for an antenna pair ``(a,b)``, use:: + + from common.baselines import baseline_index + + def block_nr(a: int, b: int) -> int: + return baseline_index(a // 12, b // 12) + +TCP stream +`````````` + +The TCP stream interface allows a user to subscribe to the statistics packet streams, combined into a single TCP stream. The statistics will be streamed until the user disconnects, or the device is turned off. Any number of subscribers is supported, as bandwidth allows. Simply connect to the following port: + ++----------+----------------+ +| Device | TCP end point | ++==========+================+ +| SST | localhost:5101 | ++----------+----------------+ +| XST | localhost:5102 | ++----------+----------------+ + +The easiest way to capture this stream is to use our ``statistics_writer``, which will capture the statistics and store them in HDF5 file(s). The writer: + +- computes packet boundaries, +- processes the data of each packet, and stores their values into the matrix relevant for the mode, +- stores a matrix per timestamp, +- stores packet header information per timestamp, as HDF5 attributes, +- writes to a new file at a configurable interval. + +To run the writer:: + + cd devices/statistics_writer + python3 statistics_writer.py --mode SST --host localhost + +The correct port will automatically be chosen, depending on the given mode. See also ``statistics_writer.py -h`` for more information. + +The writer can also parse a statistics stream stored in a file. This allows the stream to be captured and processed independently. 
Capturing the stream can for example be done using ``netcat``:: + + nc localhost 5101 > SST-packets.bin + diff --git a/docs/source/devices/using.rst b/docs/source/devices/using.rst new file mode 100644 index 0000000000000000000000000000000000000000..8c2a58ca814fdea541e8e5dbcbe5b9ae189b5e84 --- /dev/null +++ b/docs/source/devices/using.rst @@ -0,0 +1,143 @@ +Using Devices +============= + +The station exposes *devices*, each of which is a remote software object that manages part of the station. Each device has the following properties: + +- It has a *state*, +- Many devices manage and represent hardware in the station, +- It exposes *read-only attributes*, that expose values from within the device or from the hardware it represents, +- It exposes *read-write attributes*, that allow controlling the functionality of the device, or the hardware it represents, +- It exposes *properties*, which are fixed configuration parameters (such as port numbers and timeouts), +- It exposes *commands*, that request the execution of a procedure in the device or in the hardware it manages. + +The devices are accessed remotely using ``DeviceProxy`` objects. See :doc:`../interfaces/control` on how to do this. + +States +------------ + +The state of a device is then queried with ``device.state()``. Each device can be in one of the following states: + +- ``DevState.OFF``: The device is not operating, +- ``DevState.INIT``: The device is being initialised, +- ``DevState.STANDBY``: The device is initialised and ready to be configured further, +- ``DevState.ON``: The device is operational. +- ``DevState.FAULT``: The device is malfunctioning. Functionality cannot be counted on. +- The ``device.state()`` function can throw an error, if the device cannot be reached at all, for example because its docker container is not running. See the :ref:`docker` device on how to start it. + +Each device provides the following commands to change the state: + +:off(): Turn the device ``OFF`` from any state.
+ +:initialise(): Initialise the device from the ``OFF`` state, to bring it to the ``STANDBY`` state. + +:on(): Mark the device as operational, from the ``STANDBY`` state, bringing it to ``ON``. + +The following procedure is a good way to bring a device to ``ON`` from any state:: + + def force_start(device): + if device.state() == DevState.FAULT: + device.off() + if device.state() == DevState.OFF: + device.initialise() + if device.state() == DevState.STANDBY: + device.on() + + return device.state() + +.. hint:: If a command gives you a timeout, the command will still be running until it finishes. You just won't know when it does or its result. In order to increase the timeout, use ``device.set_timeout_millis(timeout * 1000)``. + +FAULT +`````````` + +If a device enters the ``FAULT`` state, it means an error occurred that is fundamental to the operation of the software device. For example, the connection +to the hardware was lost. + +Interaction with the device in the ``FAULT`` state is undefined, and attributes cannot be read or written. The device needs to be reinitialised, which +typically involves the following sequence of commands:: + + # turn the device off completely first. + device.off() + + # setup any connections and threads + device.initialise() + + # turn on the device + device.on() + +Of course, the device could go into ``FAULT`` again, even during the ``initialise()`` command, for example because the hardware it manages is unreachable. To debug the fault condition, check the :doc:`../interfaces/logs` of the device in question. + +Initialise hardware +```````````````````` + +Most devices provide the following commands, in order to configure the hardware with base settings: + +:set_defaults(): Upload default attribute settings from the TangoDB to the hardware. + +:initialise_hardware(): For devices that control hardware, this command runs the hardware initialisation procedure. 
+ +Typically, ``set_defaults()`` and ``initialise_hardware()`` are called in that order in the ``STANDBY`` state. The :ref:`boot` device runs these commands as part of its station initialisation sequence. + +.. _attributes: + +Attributes +------------ + +The device can be operated in ``ON`` state, where it exposes *attributes* and *commands*. The attributes can be accessed as python properties, for example:: + + recv = DeviceProxy("LTS/RECV/1") + + # turn on all LED0s + recv.RCU_LED0_RW = [True] * 32 + + # retrieve the status of all LED0s + print(recv.RCU_LED0_R) + +The attributes with an: + +- ``_R`` suffix are monitoring points, reflecting the state of the hardware, and are thus read-only. +- ``_RW`` suffix are control points, reflecting the desired state of the hardware. They are read-write, where writing requests the hardware to set the specified value. Reading them returns the last requested value. + +Meta data +````````````` + +A description of the attribute can be retrieved using:: + + print(recv.get_attribute_config("RCU_LED0_R").description) + +.. _attribute-masks: + +Attribute masks +--------------------- + +Several devices employ *attribute masks* in order to toggle which elements in their hardware array are actually to be controlled. This construct is necessary as most control points consist of arrays of values that cover all hardware elements. These array control points are always fully sent: it is not possible to update only a single element without uploading the rest. Without a mask, it is impossible to control a subset of the hardware. + +The masks only affect *writing* to attributes. Reading attributes (monitoring points) always results in data for all elements in the array. + +For example, the ``RCU_mask_RW`` array is the RCU mask in the ``recv`` device.
It behaves as follows, when we interact with the ``RCU_LED0_R(W)`` attributes:: + + recv = DeviceProxy("LTS/RECV/1") + + # set mask to control all RCUs + recv.RCU_mask_RW = [True] * 32 + + # request to turn off LED0 for all RCUs + recv.RCU_LED0_RW = [False] * 32 + + # <--- all LED0s are now off + # recv.RCU_LED0_R should show this, + # if you have the RCU hardware installed. + + # set mask to only control RCU 3 + mask = [False] * 32 + mask[3] = True + recv.RCU_mask_RW = mask + + # request to turn on LED0, for all RCUs + # due to the mask, only LED0 on RCU 3 + # will be set. + recv.RCU_LED0_RW = [True] * 32 + + # <--- only LED0 on RCU3 is now on + # recv.RCU_LED0_R should show this, + # if you have the RCU hardware installed. + diff --git a/docs/source/faq.rst b/docs/source/faq.rst new file mode 100644 index 0000000000000000000000000000000000000000..367492e002e5d0d4bf20442c6e5e596ef78b852f --- /dev/null +++ b/docs/source/faq.rst @@ -0,0 +1,145 @@ +FAQ +=================================== + +Connecting to devices +-------------------------------------------------------------------------------------------------------------- + +My device is unreachable, but the device logs say it's running fine? +`````````````````````````````````````````````````````````````````````````````````````````````````````````````` + +The ``$HOSTNAME`` may have been incorrectly guessed by ``docker-compose/Makefile``, or you accidently set it to an incorrect value. See :ref:`corba`. + +I get "API_CorbaException: TRANSIENT CORBA system exception: TRANSIENT_NoUsableProfile" when trying to connect to a device? +```````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````` + +The ``$HOSTNAME`` may have been incorrectly guessed by ``docker-compose/Makefile``, or you accidently set it to an incorrect value. See :ref:`corba`. 
+ +Docker +-------------------------------------------------------------------------------------------------------------- + +How do I prevent my containers from starting when I boot my computer? +```````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````````` + +You have to explicitly stop a container to prevent it from restarting. Use:: + + cd docker-compose + make stop <container> + +or plain ``make stop`` to stop all of them. + +Windows +-------------------------------------------------------------------------------------------------------------- + +How do I develop from Windows? +`````````````````````````````````````````````````````````````````````````````````````````````````````````````` + +Our setup is Linux-based, so the easiest way to develop is by using WSL2, which lets you run a Linux distro under Windows. You'll need to: + +- Install WSL2. See f.e. https://www.omgubuntu.co.uk/how-to-install-wsl2-on-windows-10 +- Install `Docker Desktop <https://hub.docker.com/editions/community/docker-ce-desktop-windows/>`_ +- Enable the WSL2 backend in Docker Desktop +- We also recommend to install `Windows Terminal <https://www.microsoft.com/en-us/p/windows-terminal/9n0dx20hk701>`_ + +.. _x11_on_windows: + +How do I run X11 applications on Windows? +`````````````````````````````````````````````````````````````````````````````````````````````````````````````` +If you need an X11 server on Windows: + +- Install `VcXsrv <https://sourceforge.net/projects/vcxsrv/>`_ +- Disable access control during its startup, +- Use ``export DISPLAY=host.docker.internal:0`` in WSL. + +You should now be able to run X11 applications from WSL and Docker. Try running ``xterm`` or ``xeyes`` to test. + + +SSTs/XSTs +-------------------------------------------------------------------------------------------------------------- + +Some SSTs/XSTs packets do arrive, but not all, and/or the matrices remain zero? 
+`````````````````````````````````````````````````````````````````````````````````````````````````````````````` + +So ``sst.nof_packets_received`` / ``xst.nof_packets_received`` is increasing, telling you packets are arriving. But they're apparently dropped or contain zeroes. First, check the following settings: + +- ``sdp.TR_fpga_mask_RW[x] == True``, to make sure we're actually configuring the FPGAs, +- ``sdp.FPGA_wg_enable_RW[x] == False``, or the Waveform Generator might be replacing our antenna data with zeroes, +- ``sdp.FPGA_processing_enabled_R[x] == True``, to verify that the FPGAs are processing, or the values and timestamps will be zero, +- For XSTs, ``xst.FPGA_xst_processing_enabled_R[x] == True``, to verify that the FPGAs are computing XSTs, or the values will be zero. + +Furthermore, the ``sst`` and ``xst`` devices expose several packet counters to indicate where incoming packets were dropped before or during processing: + +- ``nof_invalid_packets_R`` increases if packets arrive with an invalid header, or of the wrong statistic for this device, +- ``nof_packets_dropped_R`` increases if packets could not be processed because the processing queue is full, so the CPU cannot keep up with the flow, +- ``nof_payload_errors_R`` increases if the packet was marked by the FPGA to have an invalid payload, which causes the device to discard the packet. + +I am not receiving any XSTs and/or SSTs packets from SDP! +`````````````````````````````````````````````````````````````````````````````````````````````````````````````` + +Are you sure? If ``sst.nof_packets_received`` / ``xst.nof_packets_received`` is actually increasing, the packets are arriving, but are not parsable by the SST/XST device. If so, see the previous question. + +Many settings need to be correct for the statistics emitted by the SDP FPGAs to reach our devices correctly. 
Here is a brief overview: + +- ``sdp.TR_fpga_mask_RW[x] == True``, to make sure we're actually configuring the FPGAs, +- ``sdp.FPGA_communication_error_R[x] == False``, to verify the FPGAs can be reached by SDP, +- SSTs: + + - ``sst.FPGA_sst_offload_enable_RW[x] == True``, to verify that the FPGAs are actually emitting the SSTs, + - ``sst.FPGA_sst_offload_hdr_eth_destination_mac_R[x] == <MAC of your machine's mtu=9000 interface>``, or the FPGAs will not send it to your machine. Use f.e. ``ip addr`` on the host to find the MAC address of your interface, and verify that its MTU is 9000, + - ``sst.FPGA_sst_offload_hdr_ip_destination_address_R[x] == <IP of your machine's mtu=9000 interface>``, or the packets will be dropped by the network or the kernel of your machine, + - ``sst.FPGA_sst_offload_hdr_udp_destination_port_R[x] == 5001``, or the packets will not be sent to a port that the SST device listens on. + +- XSTs: + + - ``xst.FPGA_xst_offload_enable_RW[x] == True``, to verify that the FPGAs are actually emitting the XSTs, + - ``xst.FPGA_xst_offload_hdr_eth_destination_mac_R[x] == <MAC of your machine's mtu=9000 interface>``, or the FPGAs will not send it to your machine. Use f.e. ``ip addr`` on the host to find the MAC address of your interface, and verify that its MTU is 9000, + - ``xst.FPGA_xst_offload_hdr_ip_destination_address_R[x] == <IP of your machine's mtu=9000 interface>``, or the packets will be dropped by the network or the kernel of your machine, + - ``xst.FPGA_xst_offload_hdr_udp_destination_port_R[x] == 5002``, or the packets will not be sent to a port that the XST device listens on. + +If this fails, see the next question. + +I am still not receiving XSTs and/or SSTs, even though the settings appear correct! +`````````````````````````````````````````````````````````````````````````````````````````````````````````````` + +Let's see where the packets get stuck. 
Let us assume your MTU=9000 network interface is called ``em2`` (see ``ip addr`` to check): + +- Check whether the data arrives on ``em2``. Run ``tcpdump -i em2 udp -nn -vvv -c 10`` to capture the first 10 packets. Verify: + + - The destination MAC must match that of ``em2``, + - The destination IP must match that of ``em2``, + - The destination port is correct (5001 for SST, 5002 for XST), + - The source IP falls within the netmask of ``em2`` (unless ``net.ipv4.conf.em2.rp_filter=0`` is configured), + - TTL >= 2, + +- If you see no data at all, the network will have swallowed it. Try to use a direct network connection, or a hub (which broadcasts all packets, unlike a switch), to see what is being emitted by the FPGAs. +- Check whether the data reaches user space on the host: + + - Turn off the ``sst`` or ``xst`` device. This will not stop the FPGAs from sending. + - Run ``nc -u -l -p 5001 -vv`` (or port 5002 for XSTs). You should see raw packets being printed. + - If not, the Linux kernel is swallowing the packets, even before it can be sent to our docker container. + +- Check whether the data reaches kernel space in the container: + + - Enter the docker device by running ``docker exec -it device-sst bash``. + - Run ``sudo bash`` to become root, + - Run ``apt-get install -y tcpdump`` to install tcpdump, + - Check whether packets arrive using ``tcpdump -i eth0 udp -c 10 -nn``, + - If not, Linux is not routing the packets to the docker container. + +- Check whether the data reaches user space in the container: + + - Turn off the ``sst`` or ``xst`` device. This will not stop the FPGAs from sending. + - Enter the docker device by running ``docker exec -it device-sst bash``. + - Run ``sudo bash`` to become root, + - Run ``apt-get install -y netcat`` to install netcat, + - Check whether packets arrive using ``nc -u -l -p 5001 -vv`` (or port 5002 for XSTs), + - If not, Linux is not routing the packets to the docker container correctly. 
+ +- If still no error was found, you've likely hit a bug in our software. + +Other containers +-------------------------------------------------------------------------------------------------------------- + +The ELK container won't start, saying "max virtual memory areas vm.max_map_count [65530] is too low"? +`````````````````````````````````````````````````````````````````````````````````````````````````````````````` + +The ELK stack needs the ``vm.max_map_count`` sysctl kernel parameter to be at least 262144 to run. See :ref:`elk-kernel-settings`. diff --git a/docs/source/index.rst b/docs/source/index.rst index 5e6c6564940391ea5171403a833a2f83ed015adc..524d21369c9e0ded662f12a365d479ce3dc39abc 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -6,10 +6,24 @@ Welcome to LOFAR2.0 Station Control's documentation! ==================================================== +LOFAR2.0 Station Control is a software stack aimed to monitor, control, and manage a LOFAR2.0 station. In order to do so, it whips up a series of Docker containers, and combines the power of `Tango Controls <https://www.tango-controls.org/>`_, `PyTango <https://pytango.readthedocs.io/en/stable/>`_, `Docker <https://www.docker.com/>`_, `Grafana <https://grafana.com/>`_, `ELK <https://www.elastic.co/what-is/elk-stack>`_, `Jupyter Notebook <https://jupyter.org/>`_, and many others to provide a rich and powerful experience in using the station. + +Full monitoring and control access to the LOFAR2.0 station hardware is provided, by marshalling their rich `OPC-UA <https://opcfoundation.org/about/opc-technologies/opc-ua/>`_ interfaces. Higher-level logic makes it possible to easily configure and obtain the LOFAR station data products (beamlets, XSTs, SSTs, BSTs) from your local machine using Python, or through one of our provided web interfaces. + +Even without having access to any LOFAR2.0 hardware, you can install the full stack on your laptop, and experiment with the software interfaces. + .. 
toctree:: :maxdepth: 2 :caption: Contents: + installation + interfaces/overview + devices/using + devices/devices + devices/configure + configure_station + developer + faq Indices and tables diff --git a/docs/source/installation.rst b/docs/source/installation.rst new file mode 100644 index 0000000000000000000000000000000000000000..cb0122ae95cc01de7f55e333345a6ec4d41bc369 --- /dev/null +++ b/docs/source/installation.rst @@ -0,0 +1,89 @@ +Installation +================== + +You will need the following dependencies installed: + +- docker +- docker-compose +- git +- make + +You start with checking out the source code, f.e. the master branch, as well as the git submodules we use:: + + git clone https://git.astron.nl/lofar2.0/tango.git + cd tango + git submodule init + git submodule update + +Next, we bootstrap the system. This will build our docker images, start key ones, and load the base configuration. This may take a while:: + + cd docker-compose + make bootstrap + +If you lack access to LOFAR station hardware, load additional configurations to use the simulators instead:: + + for sim in ../CDB/*-sim-config.json; do + ../sbin/update_ConfigDb.sh ${sim} + done + +If you do have access to LOFAR station hardware, you will have to :doc:`configure_station`. + +Now we are ready to start the other containers:: + + make start + +and make sure they are all up and running:: + + make status + +You should see the following state: + +- Containers ``astor``, ``hdbpp-viewer``, ``jive``, ``log-viewer`` and ``pogo`` will have State ``Exit 1``. These are containers that are interactive X11 tools, and not needed for now, +- Other containers have either State ``Up`` or ``Exit 0``. + +If not, you can inspect why with ``docker logs <container>``. Note that the containers will automatically be restarted on failure, and also if you reboot. Stop them explicitly to bring them down (``make stop <container>``). 
+ +Post-boot Initialisation +--------------------------- + +After bootstrapping, and after a reboot, the software and hardware of the station need to be explicitly initialised. Note that the docker containers do restart automatically at system boot. + +The following commands start all the software devices to control the station hardware, and initialise the hardware with the configured default settings. Go to http://localhost:8888, start a new *Station Control* notebook, and initiate the software boot sequence:: + + # reset our boot device + boot.off() + assert boot.state() == DevState.OFF + boot.initialise() + assert boot.state() == DevState.STANDBY + boot.on() + assert boot.state() == DevState.ON + + # start and initialise the other devices + boot.initialise_station() + + # wait for the devices to be initialised + import time + + while boot.initialising_station_R: + print(f"Still initialising station. {boot.initialisation_progress_R}% complete. State: {boot.initialisation_status_R}") + time.sleep(1) + + # print conclusion + if boot.initialisation_progress_R == 100: + print("Done initialising station.") + else: + print(f"Failed to initialise station: {boot.initialisation_status_R}") + +See :ref:`boot` for more information on the ``boot`` device. + +.. _elk-kernel-settings: + +ELK +```` + +The ELK stack requires some kernel settings to be tuned, before it will start. Although ``make bootstrap`` configures the kernel, these settings will not stick after a reboot. You will need to run either:: + + make start elk-configure-host + make restart elk + +after reboot, or configure your system to set ``sysctl -w vm.max_map_count=262144`` (or higher) as root during boot. 
diff --git a/docs/source/interfaces/control.rst b/docs/source/interfaces/control.rst new file mode 100644 index 0000000000000000000000000000000000000000..3c514f11d7a3e5a4bbc1c7339bac3bed0820d70f --- /dev/null +++ b/docs/source/interfaces/control.rst @@ -0,0 +1,84 @@ +Monitoring & Control +======================== + +The main API to control the station is through the `Tango Controls <https://tango-controls.readthedocs.io/en/latest/>`_ API we expose on port 10000, which is most easily accessed using a `PyTango <https://pytango.readthedocs.io/en/stable/client_api/index.html>`_ client. The Jupyter Notebook installation we provide is such a client. + +.. _jupyter: + +Jupyter Notebooks +------------------------ + +The station offers Jupyter notebooks on http://localhost:8888, which allow one to interact with the station, for example to set control points, access monitoring points, or to graph their values. + +The notebooks provide some predefined variables, so you don't have to look them up: + +.. literalinclude:: ../../../docker-compose/jupyter/ipython-profiles/stationcontrol-jupyter/startup/01-devices.py + +Note: the Jupyter notebooks use enhancements from the ``itango`` suite, which provide tab completions, but also the ``Device`` alias for ``DeviceProxy`` as used in the Python examples in the next section. + +For example, you can start a new *Station Control* notebook (File->New Notebook->StationControl), and access these devices: + +.. image:: jupyter_basic_example.png + +.. _pytango-section: + +PyTango +------------------------ + +To access a station from scratch using Python, we need to install some dependencies:: + + pip3 install pytango + +Then, if we know what devices are available on the station, we can access them directly:: + + import tango + import os + + # Tango needs to know where our Tango API is running. + os.environ["TANGO_HOST"] = "localhost:10000" + + # Construct a remote reference to a specific device. 
+ # One can also use "tango://localhost:10000/LTS/Boot/1" if TANGO_HOST is not set + boot_device = tango.DeviceProxy("LTS/Boot/1") + + # Print the device's state. + print(boot_device.state()) + +To obtain a list of all devices, we need to access the database:: + + import tango + + # Tango needs to know where our Tango API is running. + import os + os.environ["TANGO_HOST"] = "localhost:10000" + + # Connect to the database. + db = tango.Database() + + # Retrieve the available devices, excluding any Tango-internal ones. + # This returns for example: ['LTS/Boot/1', 'LTS/Docker/1', ...] + devices = list(db.get_device_exported("LTS/*")) + + # Connect to any of them. + any_device = tango.DeviceProxy(devices[0]) + + # Print the device's state. + print(any_device.state()) + +.. _rest-api: + +ReST API +------------------------ + +We also provide a ReST API to allow the station to be controlled without needing to use the Tango API. The root access point is http://localhost:8080/tango/rest/v10/hosts/databaseds;port=10000/ (credentials: tango-cs/tango). This API allows for: + +- getting and setting attribute values, +- calling commands, +- retrieving the device state, +- and more. + +For example, retrieving http://localhost:8080/tango/rest/v10/hosts/databaseds;port=10000/devices/LTS/SDP/1/state returns the following JSON document:: + + {"state":"ON","status":"The device is in ON state."} + +For a full description of this API, see https://tango-rest-api.readthedocs.io/en/latest/. 
diff --git a/docs/source/interfaces/elk_last_hour.png b/docs/source/interfaces/elk_last_hour.png new file mode 100644 index 0000000000000000000000000000000000000000..d6f2a73c9ba754a5a6d5aeece1382906040acb15 Binary files /dev/null and b/docs/source/interfaces/elk_last_hour.png differ diff --git a/docs/source/interfaces/elk_log_fields.png b/docs/source/interfaces/elk_log_fields.png new file mode 100644 index 0000000000000000000000000000000000000000..c5774931f23933be6033e396220b2459409b1def Binary files /dev/null and b/docs/source/interfaces/elk_log_fields.png differ diff --git a/docs/source/interfaces/grafana_dashboard_1.png b/docs/source/interfaces/grafana_dashboard_1.png new file mode 100644 index 0000000000000000000000000000000000000000..448a9bd993b264cf35e98229f12829256f775029 Binary files /dev/null and b/docs/source/interfaces/grafana_dashboard_1.png differ diff --git a/docs/source/interfaces/grafana_dashboard_2.png b/docs/source/interfaces/grafana_dashboard_2.png new file mode 100644 index 0000000000000000000000000000000000000000..d7c34991d97cd22a209d1f02502afa1f439acf4e Binary files /dev/null and b/docs/source/interfaces/grafana_dashboard_2.png differ diff --git a/docs/source/interfaces/jupyter_basic_example.png b/docs/source/interfaces/jupyter_basic_example.png new file mode 100644 index 0000000000000000000000000000000000000000..c7e35204cc72b63e8ea2d81c2bdad337d3ce72a1 Binary files /dev/null and b/docs/source/interfaces/jupyter_basic_example.png differ diff --git a/docs/source/interfaces/logs.rst b/docs/source/interfaces/logs.rst new file mode 100644 index 0000000000000000000000000000000000000000..2b5c605ec5e47cf8b98b09dba47f6e6954f468ba --- /dev/null +++ b/docs/source/interfaces/logs.rst @@ -0,0 +1,44 @@ +Logs +================== + +The devices, and the docker containers in general, produce logging output. The easiest way to access the logs of a specific container is to ask docker directly. 
For example, to access and follow the most recent logs of the ``device-sdp`` container, execute on the host:: + + docker logs -n 100 -f device-sdp + +This is mostly useful for interactive use. + +.. _elk: + +ELK +------------------ + +To monitor the logs remotely, or to browse older logs, use the *ELK stack* that is included on the station, and served on http://localhost:5601. ELK, or ElasticSearch + Logstash + Kibana, is a popular log collection and querying system. Currently, the following logs are collected in our ELK installation: + +- Logs of all devices, +- Logs of the Jupyter notebook server. + +If you browse to the ELK stack (actually, it is Kibana providing the GUI), your go-to is the *Discover* view at http://localhost:5601/app/discover. There, you can construct (and save, load) a dashboard that provides a custom view of the logs, based on the *index pattern* ``logstash-*``. There is a lot to take in, and there are excellent Kibana tutorials on the web. + +To get going, use for example `this dashboard <http://localhost:5601/app/discover#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-60m,to:now))&_a=(columns:!(extra.tango_device,level,message),filters:!(),index:'1e8ca200-1be0-11ec-a85f-b97e4206c18b',interval:auto,query:(language:kuery,query:''),sort:!())>`_, which shows the logs of the last hour, with some useful columns added to the default timestamp and message columns. Expand the time range if no logs appear, to look further back. You should see something like: + +.. image:: elk_last_hour.png + +ELK allows you to filter, edit the columns, and a lot more. We enrich the log entries with several extra fields, for example the device that generated it, and stack traces if available. Click on the ``>`` before a log entry and the information expands, showing for example: + +.. 
image:: elk_log_fields.png + +Furthermore, statistics from the ELK stack, such as the number of ERROR log messages, are made available as a data source in :doc:`monitoring`. + +LogViewer +------------------ + +For each device, Tango collects the logs as well. These can be viewed with the LogViewer X11 application. Make sure ``$DISPLAY`` is set, and run:: + + cd docker-compose + make start logviewer + +If LogViewer does not appear, check ``docker logs logviewer`` to see what went wrong. + +For information on how to use the LogViewer, see https://tango-controls.readthedocs.io/en/latest/tools-and-extensions/built-in/logviewer/logviewer.html. + +.. note:: If you need an X11 server on Windows, see :ref:`x11_on_windows`. diff --git a/docs/source/interfaces/monitoring.rst b/docs/source/interfaces/monitoring.rst new file mode 100644 index 0000000000000000000000000000000000000000..7d8a85fdf5bd7c103119a89a8dbae127040a5240 --- /dev/null +++ b/docs/source/interfaces/monitoring.rst @@ -0,0 +1,51 @@ +Monitoring GUIs +======================== + +Each device exposes a list of monitoring points as attributes with the ``_R`` suffix. These can be accessed interactively from a control console (such as Jupyter), but that will not scale. + +Grafana +------------------------ + +We offer `Grafana <https://grafana.com/>`_ dashboards on http://localhost:3000 that provide a quick overview of the station's status, including temperatures and settings. Several dashboards are included. An example: + +.. image:: grafana_dashboard_1.png +.. image:: grafana_dashboard_2.png + +NOTE: These dashboards are highly subject to change. The above examples provide an impression of a possible overview of the station state. + +You are encouraged to inspect each panel (graph) to see the underlying database query and settings. Use the small arrow in the panel's title to get a drop-down menu of options, and select *inspect*. See the Grafana documentation for further information. 
+ +The Grafana dashboards are configured with the following data sources: + +- :ref:`prometheus-section`, the time-series database that caches the latest values of all monitoring points (see next section), +- *Archiver DB*, the database that provides a long-term cache of attributes, +- :ref:`tangodb`, providing access to device properties (fixed settings), +- :ref:`elk`, the log output of the devices. + +.. _prometheus-section: + +Prometheus +------------------------- + +`Prometheus <https://prometheus.io/docs/introduction/overview/>`_ is a low-level monitoring system that allows us to periodically retrieve the values of all the attributes of all our devices, and cache them to be used in Grafana: + +- Every several seconds, Prometheus scrapes our `TANGO-Grafana Exporter <https://git.astron.nl/lofar2.0/ska-tango-grafana-exporter>`_ (our fork of https://gitlab.com/ska-telescope/TANGO-grafana.git), collecting all values of all the device attributes (except the large ones, for performance reasons). +- Prometheus can be queried directly on http://localhost:9090, +- The TANGO-Grafana Exporter can be queried directly on http://localhost:8000, +- The query language is `PromQL <https://prometheus.io/docs/prometheus/latest/querying/basics/>`_, which is also used in Grafana to query Prometheus, + +Prometheus stores attributes in the following format:: + + device_attribute{device="lts/recv/1", + dim_x="32", dim_y="0", + instance="tango-prometheus-exporter:8000", + job="tango", + label="RCU_temperature_R", + name="RCU_temperature_R", + type="float", + x="00", y="0"} + +The above describes a single data point and its labels. The primary identifying labels are ``device`` and ``name``. Each point furthermore has a value (integer) and a timestamp. The following transformations take place: + +- For 1D and 2D attributes, each array element is its own monitoring point, with ``x`` and ``y`` labels describing the indices. 
The labels ``dim_x`` and ``dim_y`` describe the array dimensionality, +- Attributes with string values get a ``str_value`` label describing their value. diff --git a/docs/source/interfaces/overview.rst b/docs/source/interfaces/overview.rst new file mode 100644 index 0000000000000000000000000000000000000000..a00ab5710ad863b4f10d1bb0ee93ab3f547826d5 --- /dev/null +++ b/docs/source/interfaces/overview.rst @@ -0,0 +1,41 @@ +Interfaces +====================== + +The station provides the following interfaces accessible through your browser (assuming you run on `localhost`): + ++---------------------+---------+----------------------+-------------------+ +|Interface |Subsystem|URL |Default credentials| ++=====================+=========+======================+===================+ +| :ref:`jupyter` |Jupyter |http://localhost:8888 | | ++---------------------+---------+----------------------+-------------------+ +| :doc:`monitoring` |Grafana |http://localhost:3000 |admin/admin | ++---------------------+---------+----------------------+-------------------+ +| :doc:`logs` |Kibana |http://localhost:5601 | | ++---------------------+---------+----------------------+-------------------+ + +Furthermore, there are some low-level interfaces: + ++---------------------------+------------------+-----------------------+-------------------+ +|Interface |Subsystem |URL |Default credentials| ++===========================+==================+=======================+===================+ +| :ref:`pytango-section` |Tango |tango://localhost:10000| | ++---------------------------+------------------+-----------------------+-------------------+ +| :ref:`prometheus-section` |Prometheus |http://localhost:9090 | | ++---------------------------+------------------+-----------------------+-------------------+ +| TANGO-Grafana Exporter |Python HTTPServer |http://localhost:8000 | | ++---------------------------+------------------+-----------------------+-------------------+ +| 
:ref:`rest-api` |tango-rest 
|http://localhost:8080 |tango-cs/tango | ++---------------------------+------------------+-----------------------+-------------------+ +| :ref:`tangodb` |MariaDB |http://localhost:3306 |tango/tango | ++---------------------------+------------------+-----------------------+-------------------+ +|Archive Database |MariaDB |http://localhost:3307 |tango/tango | ++---------------------------+------------------+-----------------------+-------------------+ +|Log Database |ElasticSearch |http://localhost:9200 | | ++---------------------------+------------------+-----------------------+-------------------+ + +.. toctree:: + :hidden: + + control + monitoring + logs diff --git a/jupyter-notebooks/Home.ipynb b/jupyter-notebooks/Home.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..1b6001f9e5a87a0d626e310cc8038ede2d5f589f --- /dev/null +++ b/jupyter-notebooks/Home.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e051e48d", + "metadata": {}, + "source": [ + "# Welcome to your LOFAR2.0 station!\n", + "\n", + "The following interfaces are available to you, on the same host as this notebook, but on different ports:\n", + "\n", + "|Interface |Subsystem |Port|Credentials |\n", + "|----------|----------|----|--------------|\n", + "|Scripting |Jupyter |8888| |\n", + "|Monitoring|Grafana |3000|admin/admin |\n", + "|Logs |Kibana |5601| |\n", + "|ReST |tango-rest|8080|tango-cs/tango|\n", + "\n", + "Below are codes to manage the station at high level. For more detailed status information, look in Grafana." 
+ ] + }, + { + "cell_type": "markdown", + "id": "32ae8bcf", + "metadata": {}, + "source": [ + "## (Re)boot station\n", + "The code below is used to:\n", + "* Reboot all station software\n", + "* Reset the hardware configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38037a71", + "metadata": {}, + "outputs": [], + "source": [ + "# Restart boot device itself\n", + "boot.off()\n", + "assert boot.state() == DevState.OFF, boot.state()\n", + "\n", + "boot.initialise()\n", + "assert boot.state() == DevState.STANDBY, boot.state()\n", + "\n", + "boot.on()\n", + "assert boot.state() == DevState.ON, boot.state()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21aba361", + "metadata": {}, + "outputs": [], + "source": [ + "# Request to reinitialise the station.\n", + "#\n", + "# WARNING: This will reset settings across the station!\n", + "boot.initialise_station()\n", + "assert boot.state() != DevState.FAULT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c00b465a", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "while boot.initialising_station_R:\n", + " print(f\"Still initialising station. {boot.initialisation_progress_R}% complete. State: {boot.initialisation_status_R}\")\n", + " time.sleep(1)\n", + "\n", + "if boot.initialisation_progress_R == 100:\n", + " print(\"Done initialising station.\")\n", + "else:\n", + " print(f\"Failed to initialise station: {boot.initialisation_status_R}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b444b751", + "metadata": {}, + "source": [ + "## Inspect Docker status\n", + "Docker containers that are not running will not provide any functionality, and are ignored when the station is rebooted." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b09f9da", + "metadata": {}, + "outputs": [], + "source": [ + "container_status = {attr_name: getattr(docker, attr_name)\n", + " for attr_name in docker.get_attribute_list()\n", + " if attr_name.endswith(\"_R\")\n", + " and attr_name != 'version_R'}\n", + "\n", + "not_running_containers = [container for container, running in container_status.items() if running is False]\n", + "\n", + "if not not_running_containers:\n", + " print(\"All docker containers are running\")\n", + "else:\n", + " print(f\"Docker containers that are NOT running: {not_running_containers}\")" + ] + }, + { + "cell_type": "markdown", + "id": "55f3981d", + "metadata": {}, + "source": [ + "## Inspect Device status\n", + "Check whether all software devices are indeed up and running." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "637e6e22", + "metadata": {}, + "outputs": [], + "source": [ + "for d in devices:\n", + " try:\n", + " print(f\"Device {d.dev_name()} is in state {d.state()}\")\n", + " except ConnectionFailed as e:\n", + " print(f\"Device {d.dev_name()} is in state DOWN: {e.args[0].desc}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23008885", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "StationControl", + "language": "python", + "name": "stationcontrol" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}