Merge branch 'prevent-cleared-metrics-for-too-long' into 'master'

Avoid race condition between polling attributes and serving metrics. See merge request !898

Merge branch 'prevent-cleared-metrics-for-too-long' into 'master'
ea6b51b1 · Jan David Mol · ff41bd86 · f51ebb57 · ea6b51b1 · ea6b51b1
Commit ea6b51b1 authored 1 year ago by Jan David Mol
--- a/README.md
+++ b/README.md
@@ -166,6 +166,9 @@ Next change the version in the following places:
 # Release Notes
+* 0.32.5 Fixed race condition that exposed cleared metrics.
+         Corrected computation of xst.hardware_powered_fraction_R
+         Fixed logging of device names
 * 0.32.4 Fixed polling period (from 2500s to 2.5s).
 * 0.32.3 Fixed disappeared metrics from LOFARDevice, OPCUADevice, StationManager.
 * 0.32.2 Change hardware_powered_R to hardware_powered_fraction_R to report partial power.

--- a/tangostationcontrol/VERSION
+++ b/tangostationcontrol/VERSION
-0.32.4
+0.32.5
--- a/tangostationcontrol/tangostationcontrol/devices/base_device_classes/lofar_device.py
+++ b/tangostationcontrol/tangostationcontrol/devices/base_device_classes/lofar_device.py
@@ -120,16 +120,19 @@ class AttributePoller:
    async def _poll(self):
        first_exception = None
+        # NB: The metrics are exposed asynchronously to Prometheus.
+        # We must make sure not to leave a metric cleared when we're
+        # reading it. If a metric is cleared, it will be considered
+        # by Prometheus not to exist, leading to gaps in the metric
+        # even if it functions correctly.
        for attr_name, attr_data in self._poll_list.items():
            try:
+                # try to read the attribute
                value = await self._read_attribute(attr_name)
-                # check if value could be read
-                if value is None:
-                    continue
                # update metric, if any
-                if attr_data["metric"]:
+                if attr_data["metric"] and value is not None:
                    attr_data["metric"].set_value(value)
            except (
                asyncio.exceptions.TimeoutError,
@@ -142,19 +145,27 @@ class AttributePoller:
                logger.debug(
                    f"Failed to poll attribute {attr_name}: {exception_to_str(e)}"
                )
+                # clear metric, if any
+                if attr_data["metric"]:
+                    attr_data["metric"].clear()
            except Exception as e:
                logger.exception(f"Failed to poll attribute {attr_name}")
                first_exception = first_exception or e
+                # clear metric, if any
+                if attr_data["metric"]:
+                    attr_data["metric"].clear()
        if first_exception:
            raise first_exception
    async def poll(self):
-        # invalidate all metrics, in case reading fails
+        if not self.device.is_attribute_access_allowed(AttReqType.READ_REQ):
+            # invalidate all metrics
            self.clear_all()
-        if not self.device.is_attribute_access_allowed(AttReqType.READ_REQ):
            # TODO(JDM): Poll attributes based on their individual is_allowed states
            return
@@ -264,7 +275,7 @@ class LOFARDevice(Device):
    FIRST_DEFAULT_SETTINGS = []
    def __str__(self):
-        return self.__class__.__name__
+        return self.get_name()
    @classmethod
    def attr_list(cls):

--- a/tangostationcontrol/tangostationcontrol/devices/sdp/xst.py
+++ b/tangostationcontrol/tangostationcontrol/devices/sdp/xst.py
@@ -728,21 +728,12 @@ class XST(Statistics):
        processing_enabled = self.read_attribute("FPGA_xst_processing_enable_R")
        offload_enabled = self.read_attribute("FPGA_xst_offload_enable_R")
-        expected_processing_enabled = numpy.array(
+        mask = self.control.read_parent_attribute("TR_fpga_mask_R")
-            self.FPGA_xst_processing_enable_RW_default, dtype=bool
-        )
-        expected_offload_enabled = numpy.array(
-            self.FPGA_xst_offload_enable_RW_default, dtype=bool
-        )
-        mask = expected_processing_enabled | expected_offload_enabled
        try:
            # "powered" means processing and offload is as expected for the FPGAs required
            return numpy.count_nonzero(
-                (processing_enabled == expected_processing_enabled)
+                processing_enabled & offload_enabled & mask
-                & (offload_enabled == expected_offload_enabled)
-                & mask
            ) / numpy.count_nonzero(mask)
        except ZeroDivisionError:
            return 1.0