diff --git a/CDB/LOFAR_ConfigDb.json b/CDB/LOFAR_ConfigDb.json index 508f4d3e3f2c96eb04cda4b2fea757a16d061124..2ba1ebe751f6397710f82e8d246c74fe9eaae8dd 100644 --- a/CDB/LOFAR_ConfigDb.json +++ b/CDB/LOFAR_ConfigDb.json @@ -91,6 +91,9 @@ "APSPU, APSPU_TEMP_error_R", "UNB2, UNB2_TEMP_error_R", "RECV, RECV_TEMP_error_R" + ], + "Shutdown_Device_List":[ + "STAT/SDP/1", "STAT/UNB2/1", "STAT/RECV/1", "STAT/APSCT/1", "STAT/APSPU/1" ] } } diff --git a/CDB/stations/simulators_ConfigDb.json b/CDB/stations/simulators_ConfigDb.json index c2f70a5999d78628421abeea7012f5cc2079f2ac..7cd92391917029be134fdc9fd4846b5540153663 100644 --- a/CDB/stations/simulators_ConfigDb.json +++ b/CDB/stations/simulators_ConfigDb.json @@ -123,6 +123,9 @@ "properties": { "Alarm_Error_List": [ "RECV, HBAT_LED_on_RW" + ], + "Shutdown_Device_List":[ + "STAT/SDP/1", "STAT/UNB2/1", "STAT/RECV/1", "STAT/APSCT/1", "STAT/APSPU/1" ] } } diff --git a/docker-compose/tango-prometheus-exporter/code/tango-prometheus-client.py b/docker-compose/tango-prometheus-exporter/code/tango-prometheus-client.py index f00be30ab836b2735b59c0b9c177ee450698a102..256f78bf155b7a3a17bd70e426478dd56bfc8182 100644 --- a/docker-compose/tango-prometheus-exporter/code/tango-prometheus-client.py +++ b/docker-compose/tango-prometheus-exporter/code/tango-prometheus-client.py @@ -175,7 +175,7 @@ class CustomCollector(object): # obtain extended info about all attributes attr_infos = {attr_info.name: attr_info for attr_info in dev.attribute_list_query()} - if dev.state() not in [DevState.STANDBY, DevState.ON, DevState.ALARM]: + if dev.state() not in [DevState.STANDBY, DevState.ON, DevState.ALARM, DevState.DISABLE]: logger.error(f"Error processing device {device_name}: it is in state {dev.state()}") # at least log state & status diff --git a/tangostationcontrol/docs/source/devices/using.rst b/tangostationcontrol/docs/source/devices/using.rst index 96881178bfa260acd182397ff4fd80a0794e3763..053d73bda33703c187f6bd9de9d7d84194647254 100644 --- 
a/tangostationcontrol/docs/source/devices/using.rst +++ b/tangostationcontrol/docs/source/devices/using.rst @@ -22,7 +22,8 @@ The state of a device is then queried with ``device.state()``. Each device can b - ``DevState.STANDBY``: The device is initialised and ready to be configured further, - ``DevState.ON``: The device is operational, - ``DevState.ALARM``: The device is operational, but one or more attributes are in alarm, -- ``DevState.FAULT``: The device is malfunctioning. Functionality cannot be counted on. +- ``DevState.FAULT``: The device is malfunctioning. Functionality cannot be counted on, +- ``DevState.DISABLE``: The device is not operating because its hardware has been shut down. - The ``device.state()`` function can throw an error, if the device cannot be reached at all. For example, because it's docker container is not running. See the :ref:`docker` device on how to start it. @@ -49,6 +50,10 @@ The state of a device is then queried with ``device.state()``. Each device can b alarm -> fault [label = "device", color="green"]; fault -> init [label = "user", color="red"]; fault -> off [label = "user", color="red"]; + standby -> disable [label = "user", color="green"]; + on -> disable [label = "user", color="green"]; + alarm -> disable [label = "user", color="green"]; + disable -> off [label= "user", color="red"]; } @@ -58,6 +63,8 @@ Each device provides the following commands to change the state: :warm_boot(): Turn on the device, but do not change the hardware. Moves from ``OFF`` to ``ON``. +:disable_hardware(): Shut down the hardware related to the device. Moves from ``STANDBY``, ``ON`` or ``ALARM`` to ``DISABLE`` + :off(): Turn the device ``OFF`` from any state. 
The following procedure is a good way to bring a device to ``ON`` from any state:: diff --git a/tangostationcontrol/tangostationcontrol/common/states.py b/tangostationcontrol/tangostationcontrol/common/states.py index 27bc5c481a9b7c33e347d68acf8f17b45f2f2315..cc458005621a3116b7a05839e81727d3dc796e70 100644 --- a/tangostationcontrol/tangostationcontrol/common/states.py +++ b/tangostationcontrol/tangostationcontrol/common/states.py @@ -6,7 +6,7 @@ OPERATIONAL_STATES = [DevState.ON, DevState.ALARM] # States in which Initialise() has happened, and the hardware # can thus be configured or otherwise interacted with. -INITIALISED_STATES = OPERATIONAL_STATES + [DevState.STANDBY] +INITIALISED_STATES = OPERATIONAL_STATES + [DevState.STANDBY, DevState.DISABLE] # States in which most commands are allowed DEFAULT_COMMAND_STATES = INITIALISED_STATES diff --git a/tangostationcontrol/tangostationcontrol/devices/apsct.py b/tangostationcontrol/tangostationcontrol/devices/apsct.py index 18a7f6fd7e49aeffdd1091e7e6a65766835be74a..60563f1138c125c79274b1c2f5342b7df68c02e8 100644 --- a/tangostationcontrol/tangostationcontrol/devices/apsct.py +++ b/tangostationcontrol/tangostationcontrol/devices/apsct.py @@ -133,6 +133,13 @@ class APSCT(opcua_device): else: raise Exception("200MHz signal is not locked. The subrack probably do not receive clock input or the CLK PCB is broken?") + def _disable_hardware(self): + """ Disable the APSCT hardware. 
""" + + # Turn off the APSCT + self.APSCT_off() + self.wait_attribute("APSCTTR_translator_busy_R", False, self.APSCT_On_Off_timeout) + # -------- # Commands # -------- diff --git a/tangostationcontrol/tangostationcontrol/devices/apspu.py b/tangostationcontrol/tangostationcontrol/devices/apspu.py index 7f40116222956f6066d2e00324ba18bfdaa43352..08c43f5a4362b22bd15ab0065e243f741add98ae 100644 --- a/tangostationcontrol/tangostationcontrol/devices/apspu.py +++ b/tangostationcontrol/tangostationcontrol/devices/apspu.py @@ -98,6 +98,9 @@ class APSPU(opcua_device): # overloaded functions # -------- + def _disable_hardware(self): + """ Disable the APSPU hardware. """ + super()._disable_hardware() # -------- # Commands diff --git a/tangostationcontrol/tangostationcontrol/devices/lofar_device.py b/tangostationcontrol/tangostationcontrol/devices/lofar_device.py index 3f127261973bda750d246a35bd514a868592d6ec..665caca7a4d0439e0a2195d9939210229b71cdb0 100644 --- a/tangostationcontrol/tangostationcontrol/devices/lofar_device.py +++ b/tangostationcontrol/tangostationcontrol/devices/lofar_device.py @@ -45,18 +45,22 @@ class lofar_device(Device, metaclass=DeviceMeta): ON = Device is fully configured, functional, controls the hardware, and is possibly actively running, ALARM = Device is operating but one of its attributes is out of range, FAULT = Device detected an unrecoverable error, and is thus malfunctional, + DISABLE = Device has shut down all its dependant hardware OFF = Device is turned off, drops connection to the hardware, The following state transitions are implemented: - boot -> OFF: Triggered by tango. Device will be instantiated, - OFF -> INIT: Triggered by device. Device will initialise (connect to hardware, other devices), - INIT -> STANDBY: Triggered by device. Device is initialised, and is ready for additional configuration by the user, - STANDBY -> ON: Triggered by user. Device reports to be functional, - ON -> ALARM: Triggered by tango. 
Device has attribute(s) with value(s) exceeding their alarm treshold, - * -> FAULT: Triggered by device. Device has degraded to malfunctional, for example because the connection to the hardware is lost, - * -> FAULT: Triggered by user. Emulate a forced malfunction for integration testing purposes, - * -> OFF: Triggered by user. Device is turned off. Triggered by the Off() command, - FAULT -> INIT: Triggered by user. Device is reinitialised to recover from an error, + boot -> OFF: Triggered by tango. Device will be instantiated, + OFF -> INIT: Triggered by device. Device will initialise (connect to hardware, other devices), + INIT -> STANDBY: Triggered by device. Device is initialised, and is ready for additional configuration by the user, + STANDBY -> ON: Triggered by user. Device reports to be functional, + STANDBY -> DISABLE: Triggered by user. Device has shut down its hardware. Triggered by the disable_hardware() command, + ON -> DISABLE: Triggered by user. Device has shut down its hardware. Triggered by the disable_hardware() command, + ALARM -> DISABLE: Triggered by user. Device has shut down its hardware. Triggered by the disable_hardware() command, + ON -> ALARM: Triggered by tango. Device has attribute(s) with value(s) exceeding their alarm threshold, + * -> FAULT: Triggered by device. Device has degraded to malfunctional, for example because the connection to the hardware is lost, + * -> FAULT: Triggered by user. Emulate a forced malfunction for integration testing purposes, + * -> OFF: Triggered by user. Device is turned off. Triggered by the Off() command, + FAULT -> INIT: Triggered by user. Device is reinitialised to recover from an error, The user triggers their transitions by the commands reflecting the target state (Initialise(), On(), Fault()). """ @@ -349,6 +353,24 @@ class lofar_device(Device, metaclass=DeviceMeta): # This is just the command version of _initialise_hardware(). 
self._initialise_hardware() + @only_in_states(INITIALISED_STATES) + @fault_on_error() + @command() + @DebugIt() + def disable_hardware(self): + """ Disable the hardware related to the device. """ + + if self.get_state() == DevState.DISABLE: + # Already disabled. + logger.warning("Requested to go to DISABLE state, but am already in DISABLE state.") + return + + self._disable_hardware() + + # Set state to DISABLE + self.set_state(DevState.DISABLE) + self.set_status("Device is in the DISABLE state.") + @only_in_states(DEFAULT_COMMAND_STATES) @command(dtype_out = DevDouble) def max_archiving_load(self): @@ -398,6 +420,10 @@ class lofar_device(Device, metaclass=DeviceMeta): """ Override this method to initialise any hardware after configuring it. """ pass + def _disable_hardware(self): + """ Override this method to disable any hardware related to the device. """ + pass + def read_attribute(self, attr_name): """ Read the value of a certain attribute (directly from the hardware). """ diff --git a/tangostationcontrol/tangostationcontrol/devices/recv.py b/tangostationcontrol/tangostationcontrol/devices/recv.py index ff2e4166fca4ba038895de25d268792274b90702..037b8adb69fa2515f5a3e2dbaa4ed00c8a2ab43b 100644 --- a/tangostationcontrol/tangostationcontrol/devices/recv.py +++ b/tangostationcontrol/tangostationcontrol/devices/recv.py @@ -235,6 +235,28 @@ class RECV(opcua_device): # by a fixed amount, the average of all steps. Doing so should result # in positive delays regardless of the pointing direction. self.HBAT_bf_delay_offset = numpy.mean(self.HBAT_bf_delay_step_delays) + + def _initialise_hardware(self): + """ Initialise the RCU hardware. """ + + # Cycle RCUs + self.RCU_off() + self.wait_attribute("RECVTR_translator_busy_R", False, self.RCU_On_Off_timeout) + self.RCU_on() + self.wait_attribute("RECVTR_translator_busy_R", False, self.RCU_On_Off_timeout) + + def _disable_hardware(self): + """ Disable the RECV hardware. 
""" + + # Save actual mask values + RCU_mask = self.proxy.RCU_mask_RW + # Set the mask to all Trues + self.RCU_mask_RW = [True] * 32 + # Turn off the RCUs + self.RCU_off() + self.wait_attribute("RECVTR_translator_busy_R", False, self.RCU_On_Off_timeout) + # Restore the mask + self.RCU_mask_RW = RCU_mask # -------- # internal functions @@ -319,15 +341,6 @@ class RECV(opcua_device): """ self.opcua_connection.call_method(["RCU_DTH_on"]) - def _initialise_hardware(self): - """ Initialise the RCU hardware. """ - - # Cycle RCUs - self.RCU_off() - self.wait_attribute("RECVTR_translator_busy_R", False, self.RCU_On_Off_timeout) - self.RCU_on() - self.wait_attribute("RECVTR_translator_busy_R", False, self.RCU_On_Off_timeout) - # ---------- # Run server # ---------- diff --git a/tangostationcontrol/tangostationcontrol/devices/sdp/sdp.py b/tangostationcontrol/tangostationcontrol/devices/sdp/sdp.py index b0038a0f1161c6c926f88815ebc27d0eaba53ef3..2a7ca8991f168f6bf5ba673207da6dc9dbb09817 100644 --- a/tangostationcontrol/tangostationcontrol/devices/sdp/sdp.py +++ b/tangostationcontrol/tangostationcontrol/devices/sdp/sdp.py @@ -283,6 +283,17 @@ class SDP(opcua_device): # Wait for the firmware to be loaded (ignoring masked out elements) self.wait_attribute("FPGA_boot_image_R", lambda attr: ((attr == 1) | ~wait_for).all(), 60) + + def _disable_hardware(self): + """ Disable the SDP hardware. 
""" + # Save actual mask values + TR_fpga_mask = self.proxy.TR_fpga_mask_RW + # Set the mask to all Trues + self.TR_fpga_mask_RW = [True] * 16 + # Boot the boot image firmware + self.FPGA_boot_image_RW = [0] * self.N_pn + # Restore the mask + self.TR_fpga_mask_RW = TR_fpga_mask # -------- # Commands diff --git a/tangostationcontrol/tangostationcontrol/devices/temperature_manager.py b/tangostationcontrol/tangostationcontrol/devices/temperature_manager.py index 912753f82c655a65d785af891fbf03222db86c3b..777343c80492bd03de0f84c21d5f1a1c29e09b90 100644 --- a/tangostationcontrol/tangostationcontrol/devices/temperature_manager.py +++ b/tangostationcontrol/tangostationcontrol/devices/temperature_manager.py @@ -12,8 +12,8 @@ from tangostationcontrol.common.entrypoint import entry from tangostationcontrol.devices.lofar_device import lofar_device from tangostationcontrol.common.lofar_logging import device_logging_to_python, log_exceptions - -from tango import Util, DeviceProxy, AttributeInfoEx, AttrDataFormat, EventType +from tango.server import command +from tango import Util, DeviceProxy, AttributeInfoEx, AttrDataFormat, EventType, DevSource, DebugIt from tango.server import attribute, device_property import numpy as np @@ -60,6 +60,12 @@ class TemperatureManager(lofar_device): default_value=[] ) + Shutdown_Device_List = device_property( + dtype=[str], + mandatory=False, + default_value=["STAT/SDP/1", "STAT/UNB2/1", "STAT/RECV/1", "STAT/APSCT/1", "STAT/APSPU/1"] + ) + # ---------- # Attributes # ---------- @@ -87,6 +93,7 @@ class TemperatureManager(lofar_device): # get the proxy to the device proxy = DeviceProxy(f"{ds_inst}/{proxy_name}/{instance_number}") + proxy.set_source(DevSource.DEV) # make sure the attribute is polled, otherwise we wont receive events if not proxy.is_attribute_polled(f"{attribute_name}"): @@ -142,19 +149,24 @@ class TemperatureManager(lofar_device): logger.warning(f"Detected a temperature alarm for {event.device}: {event.attr_value.name} := 
{event.attr_value.value}") self.auto_shutdown_hardware() + # -------- + # Commands + # -------- + @command() + @DebugIt() def auto_shutdown_hardware(self): """ This function automatically shuts down all hardware devices whenever a temperature alarm is detected - In the future there should be a strategy for turning off devices """ - DeviceProxy("STAT/SDP/1").off() - DeviceProxy("STAT/UNB2/1").off() - DeviceProxy("STAT/RECV/1").off() - DeviceProxy("STAT/APSCT/1").off() - DeviceProxy("STAT/APSPU/1").off() - DeviceProxy("STAT/PSOC/1").off() - logger.warning(f"Temperature alarm triggered auto shutdown of all hardware devices") + for dev_name in self.Shutdown_Device_List: + try: + proxy = DeviceProxy(dev_name) + proxy.disable_hardware() + except Exception as e: + logger.warning(f"Automatic hardware shutdown of device {dev_name} has failed: {e}") # format the exception itself: e.args[0] raises IndexError for argument-less exceptions and would abort the shutdown of the remaining devices + # TODO(Stefano): Add "STAT/PSOC/1" to the shutdown list and develop its behaviour + logger.warning("Temperature alarm triggered auto shutdown of all hardware devices") # ---------- # Run server diff --git a/tangostationcontrol/tangostationcontrol/devices/unb2.py b/tangostationcontrol/tangostationcontrol/devices/unb2.py index 3168621529a30a3c9b9690f2a07883ae04a965db..8ec767588ae0e9fe333411b763d47c765da20649 100644 --- a/tangostationcontrol/tangostationcontrol/devices/unb2.py +++ b/tangostationcontrol/tangostationcontrol/devices/unb2.py @@ -198,6 +198,28 @@ class UNB2(opcua_device): # overloaded functions # -------- + def _initialise_hardware(self): + """ Initialise the UNB2 hardware. """ + + # Cycle UNB2s + self.UNB2_off() + self.wait_attribute("UNB2TR_translator_busy_R", False, self.UNB2_On_Off_timeout) + self.UNB2_on() + self.wait_attribute("UNB2TR_translator_busy_R", False, self.UNB2_On_Off_timeout) + + def _disable_hardware(self): + """ Disable the UNB2 hardware. 
""" + + # Save actual mask values + UNB2_mask = self.proxy.UNB2_mask_RW + # Set the mask to all Trues + self.UNB2_mask_RW = [True] * 2 + # Turn off the uniboards + self.UNB2_off() + self.wait_attribute("UNB2TR_translator_busy_R", False, self.UNB2_On_Off_timeout) + # Restore the mask + self.UNB2_mask_RW = UNB2_mask + # -------- # Commands # -------- @@ -222,15 +244,6 @@ class UNB2(opcua_device): """ self.opcua_connection.call_method(["UNB2_on"]) - def _initialise_hardware(self): - """ Initialise the UNB2 hardware. """ - - # Cycle UNB2s - self.UNB2_off() - self.wait_attribute("UNB2TR_translator_busy_R", False, self.UNB2_On_Off_timeout) - self.UNB2_on() - self.wait_attribute("UNB2TR_translator_busy_R", False, self.UNB2_On_Off_timeout) - # ---------- # Run server # ---------- diff --git a/tangostationcontrol/tangostationcontrol/integration_test/default/devices/test_device_temperature_manager.py b/tangostationcontrol/tangostationcontrol/integration_test/default/devices/test_device_temperature_manager.py index 4ba379bb19e6e25c0406cec3301d77137719c317..3e32310e130e219b75f4c9525ae87a5143f92eee 100644 --- a/tangostationcontrol/tangostationcontrol/integration_test/default/devices/test_device_temperature_manager.py +++ b/tangostationcontrol/tangostationcontrol/integration_test/default/devices/test_device_temperature_manager.py @@ -9,6 +9,7 @@ from .base import AbstractTestBases from tangostationcontrol.integration_test.device_proxy import TestDeviceProxy from tango._tango import DevState +from tango import DeviceProxy import time @@ -20,9 +21,9 @@ class TestDeviceTemperatureManager(AbstractTestBases.TestDeviceBase): def setUp(self): """Intentionally recreate the device object in each test""" self.recv_proxy = self.setup_recv_proxy() + self.sdp_proxy = self.setup_sdp_proxy() super().setUp("STAT/TemperatureManager/1") - def tearDown(self): self.recv_proxy.stop_poll_attribute("HBAT_LED_on_RW") @@ -36,11 +37,36 @@ class 
TestDeviceTemperatureManager(AbstractTestBases.TestDeviceBase): self.assertTrue(recv_proxy.is_attribute_polled(f"HBAT_LED_on_RW")) return recv_proxy + def setup_sdp_proxy(self): + # setup SDP, on which this device depends + sdp_proxy = TestDeviceProxy("STAT/SDP/1") + sdp_proxy.off() + sdp_proxy.warm_boot() + sdp_proxy.set_defaults() + return sdp_proxy + def test_alarm(self): + # Exclude other devices which raise a TimeoutError, since they wait for the attribute *_translator_busy_R to become False + # (set instead to True in this test environment) + self.proxy.put_property({"Shutdown_Device_List": ["STAT/SDP/1"]}) + devices = [DeviceProxy("STAT/SDP/1")] + self.proxy.off() self.proxy.initialise() self.proxy.on() + self.setup_recv_proxy() + self.setup_sdp_proxy() + + # make sure none of the devices are in the OFF or FAULT state. Any other state is fine + for dev in devices: + if dev.state() == DevState.OFF: + dev.warm_boot() + elif dev.state() == DevState.FAULT: + dev.off() + dev.warm_boot() + self.assertEqual(self.proxy.get_property('Shutdown_Device_List')['Shutdown_Device_List'][0], "STAT/SDP/1") + # Here we trigger our own change event by just using an RW attribute self.recv_proxy.HBAT_LED_on_RW = [[False] * 32] * 96 time.sleep(2) @@ -52,32 +78,7 @@ class TestDeviceTemperatureManager(AbstractTestBases.TestDeviceBase): # the TEMP_MANAGER_is_alarming_R should now be True, since it should have detected the temperature alarm. self.assertTrue(self.proxy.is_alarming_R) - - def test_shutdown(self): - self.proxy.off() - self.proxy.initialise() - self.proxy.on() - - devices = [TestDeviceProxy("STAT/SDP/1"), TestDeviceProxy("STAT/UNB2/1"), self.recv_proxy, - TestDeviceProxy("STAT/APSCT/1"), TestDeviceProxy("STAT/APSPU/1"), TestDeviceProxy("STAT/PSOC/1")] - - # make sure none of the devices are in the OFF state. 
Any other state is fine - for dev in devices: - if dev.state() == DevState.OFF: - dev.initialise() - - # toggle the attribute to make sure we get a change event to True - self.recv_proxy.HBAT_LED_on_RW = [[False] * 32] * 96 - self.recv_proxy.HBAT_LED_on_RW = [[True] * 32] * 96 - - # sleeping here to make sure we've dealt with the above events - time.sleep(2) - - # make sure all the devices are in the OFF state + + # make sure all the hardware devices are in the DISABLE state for dev in devices: - self.assertEqual(DevState.OFF, dev.state()) - - - - - + self.assertEqual(DevState.DISABLE, dev.state()) diff --git a/tangostationcontrol/tangostationcontrol/test/devices/test_lofar_device.py b/tangostationcontrol/tangostationcontrol/test/devices/test_lofar_device.py index d97cf7b9ebc6625cf9568fb2db24cab4edd51bc7..38d599f9f780cccd2d506a588ae8a2596f5f5ff8 100644 --- a/tangostationcontrol/tangostationcontrol/test/devices/test_lofar_device.py +++ b/tangostationcontrol/tangostationcontrol/test/devices/test_lofar_device.py @@ -9,6 +9,7 @@ from tango.test_context import DeviceTestContext from tango.server import attribute +from tango import DevState, DevFailed from tangostationcontrol.devices import lofar_device @@ -45,4 +46,23 @@ class TestLofarDevice(device_base.DeviceTestCase): proxy.initialise() self.assertEqual(42.0, proxy.read_attribute_A) self.assertListEqual([42.0, 43.0], proxy.read_attribute_B_array.tolist()) + + def test_disable_state(self): + with DeviceTestContext(lofar_device.lofar_device, process=True, timeout=10) as proxy: + proxy.initialise() + self.assertEqual(DevState.STANDBY, proxy.state()) + proxy.on() + self.assertEqual(DevState.ON, proxy.state()) + proxy.disable_hardware() + self.assertEqual(DevState.DISABLE, proxy.state()) + + def test_disable_state_transitions(self): + with DeviceTestContext(lofar_device.lofar_device, process=True, timeout=10) as proxy: + proxy.off() + with self.assertRaises(DevFailed): + proxy.disable_hardware() + proxy.warm_boot() + 
proxy.Fault() + with self.assertRaises(DevFailed): + proxy.disable_hardware()