diff --git a/README.md b/README.md index af8b42b1d34895bfd07c5cb8bcb2e24096b0ab20..c38893b41971bf25d55d7c2e1035d86eb135d6bb 100644 --- a/README.md +++ b/README.md @@ -166,6 +166,9 @@ Next change the version in the following places: # Release Notes +* 0.32.5 Fixed race condition that exposed cleared metrics. + Corrected computation of xst.hardware_powered_fraction_R + Fixed logging of device names * 0.32.4 Fixed polling period (from 2500s to 2.5s). * 0.32.3 Fixed disappeared metrics from LOFARDevice, OPCUADevice, StationManager. * 0.32.2 Change hardware_powered_R to hardware_powered_fraction_R to report partial power. diff --git a/tangostationcontrol/VERSION b/tangostationcontrol/VERSION index eb7713622c212eb3365e1b66b11d24d72fd84b09..366b834ce0a224e9b0cd20de7739fa7638ebebb9 100644 --- a/tangostationcontrol/VERSION +++ b/tangostationcontrol/VERSION @@ -1 +1 @@ -0.32.4 +0.32.5 diff --git a/tangostationcontrol/tangostationcontrol/devices/base_device_classes/lofar_device.py b/tangostationcontrol/tangostationcontrol/devices/base_device_classes/lofar_device.py index 5f315d2fafead19b22c0a57e50024ab3e54f604d..dd9c5373262bb17b2f5825b9771e7a8a59e4b629 100644 --- a/tangostationcontrol/tangostationcontrol/devices/base_device_classes/lofar_device.py +++ b/tangostationcontrol/tangostationcontrol/devices/base_device_classes/lofar_device.py @@ -120,16 +120,19 @@ class AttributePoller: async def _poll(self): first_exception = None + # NB: The metrics are exposed asynchronously to Prometheus. + # We must make sure not to leave a metric cleared when we're + # reading it. If a metric is cleared, it will be considered + # by Prometheus not to exist, leading to gaps in the metric + # even if it functions correctly. 
+ for attr_name, attr_data in self._poll_list.items(): try: + # try to read the attribute value = await self._read_attribute(attr_name) - # check if value could be read - if value is None: - continue - # update metric, if any - if attr_data["metric"]: + if attr_data["metric"] and value is not None: attr_data["metric"].set_value(value) except ( asyncio.exceptions.TimeoutError, @@ -142,19 +145,27 @@ class AttributePoller: logger.debug( f"Failed to poll attribute {attr_name}: {exception_to_str(e)}" ) + + # clear metric, if any + if attr_data["metric"]: + attr_data["metric"].clear() except Exception as e: logger.exception(f"Failed to poll attribute {attr_name}") first_exception = first_exception or e + # clear metric, if any + if attr_data["metric"]: + attr_data["metric"].clear() + if first_exception: raise first_exception async def poll(self): - # invalidate all metrics, in case reading fails - self.clear_all() - if not self.device.is_attribute_access_allowed(AttReqType.READ_REQ): + # invalidate all metrics + self.clear_all() + # TODO(JDM): Poll attributes based on their individual is_allowed states return @@ -264,7 +275,7 @@ class LOFARDevice(Device): FIRST_DEFAULT_SETTINGS = [] def __str__(self): - return self.__class__.__name__ + return self.get_name() @classmethod def attr_list(cls): diff --git a/tangostationcontrol/tangostationcontrol/devices/sdp/xst.py b/tangostationcontrol/tangostationcontrol/devices/sdp/xst.py index 49f6407fac17134de03ca180501b21b860cd5bbf..c304dcfe0d56afcaed574c8aa6a728bed0711861 100644 --- a/tangostationcontrol/tangostationcontrol/devices/sdp/xst.py +++ b/tangostationcontrol/tangostationcontrol/devices/sdp/xst.py @@ -728,21 +728,12 @@ class XST(Statistics): processing_enabled = self.read_attribute("FPGA_xst_processing_enable_R") offload_enabled = self.read_attribute("FPGA_xst_offload_enable_R") - expected_processing_enabled = numpy.array( - self.FPGA_xst_processing_enable_RW_default, dtype=bool - ) - expected_offload_enabled = numpy.array( - 
self.FPGA_xst_offload_enable_RW_default, dtype=bool - ) - - mask = expected_processing_enabled | expected_offload_enabled + mask = self.control.read_parent_attribute("TR_fpga_mask_R") try: # "powered" means processing and offload is as expected for the FPGAs required return numpy.count_nonzero( - (processing_enabled == expected_processing_enabled) - & (offload_enabled == expected_offload_enabled) - & mask + processing_enabled & offload_enabled & mask ) / numpy.count_nonzero(mask) except ZeroDivisionError: return 1.0