From 9485ad566550e87bab2542475c2a3cc27379f848 Mon Sep 17 00:00:00 2001 From: Jan David Mol <mol@astron.nl> Date: Wed, 3 Nov 2021 07:25:50 +0100 Subject: [PATCH] L2SS-470: Add resume boot function, shortened attribute names, expose which devices were and were not initialised --- docker-compose/grafana/dashboards/home.json | 6 +- docs/source/devices/boot.rst | 22 +++- sbin/run_integration_test.sh | 4 +- .../tangostationcontrol/devices/boot.py | 105 ++++++++++++++---- .../devices/test_device_boot.py | 77 +++++++++++++ 5 files changed, 183 insertions(+), 31 deletions(-) create mode 100644 tangostationcontrol/tangostationcontrol/integration_test/devices/test_device_boot.py diff --git a/docker-compose/grafana/dashboards/home.json b/docker-compose/grafana/dashboards/home.json index 4ef59179f..98250c378 100644 --- a/docker-compose/grafana/dashboards/home.json +++ b/docker-compose/grafana/dashboards/home.json @@ -96,7 +96,7 @@ "targets": [ { "exemplar": true, - "expr": "device_attribute{device=\"stat/boot/1\",name=\"initialisation_progress_R\"}", + "expr": "device_attribute{device=\"stat/boot/1\",name=\"progress_R\"}", "interval": "", "legendFormat": "", "refId": "A" @@ -425,7 +425,7 @@ "targets": [ { "exemplar": true, - "expr": "device_attribute{device=\"stat/boot/1\",name=\"initialisation_status_R\"}", + "expr": "device_attribute{device=\"stat/boot/1\",name=\"status_R\"}", "instant": true, "interval": "", "legendFormat": "", @@ -449,7 +449,7 @@ "Time": true, "Value": true, "device": true, - "device_attribute{device=\"stat/boot/1\", dim_x=\"1\", dim_y=\"0\", instance=\"tango-prometheus-exporter:8000\", job=\"tango\", label=\"initialisation_status_R\", name=\"initialisation_status_R\", str_value=\"Initialisation completed\", type=\"string\", x=\"0\", y=\"0\"}": true, + "device_attribute{device=\"stat/boot/1\", dim_x=\"1\", dim_y=\"0\", instance=\"tango-prometheus-exporter:8000\", job=\"tango\", label=\"status_R\", name=\"status_R\", str_value=\"Initialisation completed\", 
type=\"string\", x=\"0\", y=\"0\"}": true, "dim_x": true, "dim_y": true, "instance": true, diff --git a/docs/source/devices/boot.rst b/docs/source/devices/boot.rst index 45af638d9..84f2f5a0a 100644 --- a/docs/source/devices/boot.rst +++ b/docs/source/devices/boot.rst @@ -5,22 +5,34 @@ Boot The ``boot == DeviceProxy("STAT/Boot/1")`` device is responsible for (re)starting and initialising the other devices. Devices which are not reachable, for example because their docker container is explicitly stopped, are skipped during initialisation. This device provides the following commands: -:initialise_station(): Stop and start the other devices in the correct order, set their default values, and command them to initialise their hardware. This procedure runs asynchronously, causing this command to return immediately. Initialisation is aborted if an error is encountered. +:boot(): Stop and start the other devices in the correct order, set their default values, and command them to initialise their hardware. This procedure runs asynchronously, causing this command to return immediately. Initialisation is aborted if an error is encountered. + + :returns: ``None`` + +:resume(): Resume an earlier boot attempt: start initialising devices from the first one that failed to initialise, instead of from scratch. :returns: ``None`` The initialisation process can subsequently be followed through monitoring the following attributes: -:initialising_R: Whether the initialisation procedure is still ongoing. +:booting_R: Whether the initialisation procedure is still ongoing. :type: ``bool`` -:initialisation_progress_R: Percentage completeness of the initialisation procedure. Each succesfully configured device increments progress. +:progress_R: Percentage completeness of the initialisation procedure. Each successfully configured device increments progress. :type: ``int`` -:initialisation_status_R: A description of what the device is currently trying to do. 
If an error occurs, this will hint towards the cause. +:status_R: A description of what the device is currently trying to do. If an error occurs, this will hint towards the cause. :type: ``str`` -A useful pattern is thus to call ``initialise_station()``, wait for ``initialising_R == False``, and then check whether the initalisation was succesful, if ``initialisation_progress_R == 100``. If a device fails to initialise, most likely the :doc:`../interfaces/logs` will need to be consulted. +:initialised_devices_R: Which devices were initialised successfully. + + :type: ``str[]`` + +:uninitialised_devices_R: Which devices have not yet been initialised, or failed to initialise. + + :type: ``str[]`` + +A useful pattern is thus to call ``boot()``, wait for ``booting_R == False``, and then check whether the initialisation was successful, which is the case if ``progress_R == 100``. If a device fails to initialise, most likely the :doc:`../interfaces/logs` will need to be consulted. diff --git a/sbin/run_integration_test.sh b/sbin/run_integration_test.sh index 4e988f7fc..ee317bf4d 100755 --- a/sbin/run_integration_test.sh +++ b/sbin/run_integration_test.sh @@ -15,7 +15,7 @@ cd "$LOFAR20_DIR/docker-compose" || exit 1 make build # Start and stop sequence -make stop device-sdp device-recv device-sst device-unb2 device-xst sdptr-sim recv-sim unb2-sim apsct-sim apspu-sim +make stop device-boot device-sdp device-recv device-sst device-unb2 device-xst sdptr-sim recv-sim unb2-sim apsct-sim apspu-sim make start databaseds dsconfig elk # Give dsconfig and databaseds time to start @@ -32,7 +32,7 @@ make start sdptr-sim recv-sim unb2-sim apsct-sim apspu-sim # Give the simulators time to start sleep 5 -make start device-sdp device-recv device-sst device-unb2 device-xst +make start device-boot device-sdp device-recv device-sst device-unb2 device-xst # Give devices time to restart # TODO(Corne Lukken): Use a nicer more reliable mechanism diff --git a/tangostationcontrol/tangostationcontrol/devices/boot.py 
b/tangostationcontrol/tangostationcontrol/devices/boot.py index 5b0acd2a5..217f45c68 100644 --- a/tangostationcontrol/tangostationcontrol/devices/boot.py +++ b/tangostationcontrol/tangostationcontrol/devices/boot.py @@ -38,7 +38,7 @@ __all__ = ["Boot", "main"] class InitialisationException(Exception): pass -class DevicesInitialiser(Thread): +class DevicesInitialiser(object): """ Initialise devices on this station. @@ -56,30 +56,42 @@ class DevicesInitialiser(Thread): self.proxy_timeout = proxy_timeout # setup initial state + self.thread = None self.progress = 0 + self.devices = [] + self.device_initialised = {name: False for name in device_names} self.set_status("Initialisation not started yet") - super().__init__() + def _get_device_proxies(self): + """ Obtain the Device Proxies to all the devices we are to initialise. """ + + # Since Python3.7+, the insertion order equals the iteration order, which is what we depend on + # to process the devices in the same order as in device_names. + self.set_status("Obtaining DeviceProxies") + devices = {} + for name in self.device_names: + self.set_status(f"Obtaining a DeviceProxy to {name}") + devices[name] = DeviceProxy(name) + + # set the timeout for all proxies + self.set_status("Configuring DeviceProxies") + for device in devices.values(): + device.set_timeout_millis(int(self.proxy_timeout * 1000)) + + return devices def run(self): self.set_status("Starting initialisation") try: - # Since Python3.7+, the insertion order equals the iteration order, which is what we depend on - # to process the devices in the same order as in device_names. 
- self.devices = {} - for name in self.device_names: - self.set_status(f"Obtaining a DeviceProxy to {name}") - self.devices[name] = DeviceProxy(name) - - # set the timeout for all proxies - self.set_status("Configuring DeviceProxies") - for device in self.devices.values(): - device.set_timeout_millis(int(self.proxy_timeout * 1000)) + # get the device proxies if we didn't already + self.devices = self.devices or self._get_device_proxies() + # initialise the devices self.set_status("Initialisation started") self.initialise_devices() + # if we get here without throwing an exception, we're done self.set_status("Initialisation completed") except Exception as e: logger.exception("Failed to initialise station") @@ -90,15 +102,28 @@ class DevicesInitialiser(Thread): # we keep the status stuck at the last thing it tried def is_running(self): - return self.is_alive() + return self.thread is not None and self.thread.is_alive() + + def start(self): + if self.is_running(): + # still busy, don't start + return + + if self.thread: + # done, but thread still exist. reap it first + self.stop() + self.thread = Thread(target=self.run) + self.thread.start() def stop(self): - if not self.is_alive(): + if not self.is_running(): return # Just wait for the current initialisation to finish. It's a finite process. - self.join() + self.thread.join() + + self.thread = None def set_status(self, status): self.status = status @@ -109,6 +134,9 @@ class DevicesInitialiser(Thread): """ Initialise or re-initialise all devices on the station. + If a device fails to initialise, the process is stopped. Calling + this function again will resume initialisation from the failed device. 
+ :return:None """ @@ -117,9 +145,16 @@ class DevicesInitialiser(Thread): # restart devices in order for num_restarted_devices, device in enumerate(self.devices.keys(), 1): + # allow resuming by skipping already initialised devices + if self.device_initialised[device]: + continue + if self.is_available(device) or not self.ignore_unavailable_devices: self.start_device(device) + # mark device as initialised + self.device_initialised[device] = True + self.progress = 100.0 * num_restarted_devices / len(self.devices) # make sure we always finish at 100% in case of success @@ -223,9 +258,11 @@ class Boot(lofar_device): # ---------- # Attributes # ---------- - initialising_station_R = attribute(dtype=numpy.bool_, access=AttrWriteType.READ, fget=lambda self: self.initialiser.is_running()) - initialisation_progress_R = attribute(dtype=numpy.int, access=AttrWriteType.READ, fget=lambda self: numpy.int(self.initialiser.progress)) - initialisation_status_R = attribute(dtype=str, access=AttrWriteType.READ, fget=lambda self: self.initialiser.status) + booting_R = attribute(dtype=numpy.bool_, access=AttrWriteType.READ, fget=lambda self: self.initialiser.is_running(), doc="Whether booting is in progress.") + progress_R = attribute(dtype=numpy.int, access=AttrWriteType.READ, fget=lambda self: numpy.int(self.initialiser.progress), doc="Percentage of devices that was initialised") + status_R = attribute(dtype=str, access=AttrWriteType.READ, fget=lambda self: self.initialiser.status, doc="Description of current boot activity") + initialised_devices_R = attribute(dtype=(str,), max_dim_x=128, access=AttrWriteType.READ, fget=lambda self: [name for name,initialised in self.initialiser.device_initialised.items() if initialised], doc="Which devices were initialised succesfully") + uninitialised_devices_R = attribute(dtype=(str,), max_dim_x=128, access=AttrWriteType.READ, fget=lambda self: [name for name,initialised in self.initialiser.device_initialised.items() if not initialised], 
doc="Which devices have not been initialised or failed to initialise") @log_exceptions() def delete_device(self): @@ -261,12 +298,15 @@ class Boot(lofar_device): @only_in_states([DevState.ON]) @fault_on_error() @log_exceptions() - def initialise_station(self): + def boot(self): """ Initialise or re-initialise all devices on the station. This command will take a while to execute, so should be called asynchronously. + To continue a previously started attempt from the device that failed + to initialise, call resume() instead of this command. + :return:None """ @@ -274,9 +314,10 @@ class Boot(lofar_device): # already initialising return + # join any previous attempt, if any try: - self.initialiser.join() + self.initialiser.stop() except RuntimeError: pass @@ -284,6 +325,28 @@ class Boot(lofar_device): self.initialiser = DevicesInitialiser(self.Device_Names, self.Ignore_Unavailable_Devices, self.DeviceProxy_Time_Out) self.initialiser.start() + @command() + @DebugIt() + @only_in_states([DevState.ON]) + @fault_on_error() + @log_exceptions() + def resume(self): + """ + Resume booting. A previously started boot() attempt is resumed from + the first device that failed to initialise. + + This command will take a while to execute, so should be called asynchronously. + + :return:None + """ + + if self.initialiser.is_running(): + # already initialising + return + + # just start it again + self.initialiser.start() + # ---------- # Run server # ---------- diff --git a/tangostationcontrol/tangostationcontrol/integration_test/devices/test_device_boot.py b/tangostationcontrol/tangostationcontrol/integration_test/devices/test_device_boot.py new file mode 100644 index 000000000..2341e1aa5 --- /dev/null +++ b/tangostationcontrol/tangostationcontrol/integration_test/devices/test_device_boot.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- +# +# This file is part of the LOFAR 2.0 Station Software +# +# +# +# Distributed under the terms of the APACHE license. +# See LICENSE.txt for more info. 
+ +import time + +from tango._tango import DevState + +from tangostationcontrol.integration_test.device_proxy import TestDeviceProxy +from tangostationcontrol.integration_test import base + + +class TestDeviceBoot(base.IntegrationTestCase): + + def setUp(self): + """Intentionally recreate the device object in each test""" + super(TestDeviceBoot, self).setUp() + + def tearDown(self): + """Turn device Off in teardown to prevent blocking tests""" + d = TestDeviceProxy("STAT/Boot/1") + + try: + d.Off() + except Exception as e: + """Failing to turn Off devices should not raise errors here""" + print(f"Failed to turn device off in teardown {e}") + + def test_device_proxy_unb2(self): + """Test if we can successfully create a DeviceProxy and fetch state""" + + d = TestDeviceProxy("STAT/Boot/1") + + self.assertEqual(DevState.OFF, d.state()) + + def test_device_unb2_initialize(self): + """Test if we can transition to standby""" + + d = TestDeviceProxy("STAT/Boot/1") + + d.initialise() + + self.assertEqual(DevState.STANDBY, d.state()) + + def test_device_unb2_on(self): + """Test if we can transition to on""" + + d = TestDeviceProxy("STAT/Boot/1") + + d.initialise() + + d.on() + + self.assertEqual(DevState.ON, d.state()) + + def test_device_unb2_initialise_station(self): + """Test if we can initialise the station""" + + d = TestDeviceProxy("STAT/Boot/1") + + d.initialise() + d.on() + + d.boot() + + # wait for a few seconds for the station to initialise + deadline = time.monotonic() + 10 + while d.booting_R and time.monotonic() < deadline: + time.sleep(1) + + # check whether initialisation succeeded + self.assertEqual(100, d.progress_R, msg=f"Initialisation of station failed. Status: {d.status_R}") -- GitLab