Skip to content
Snippets Groups Projects
Commit ea6b51b1 authored by Jan David Mol's avatar Jan David Mol
Browse files

Merge branch 'prevent-cleared-metrics-for-too-long' into 'master'

Avoid race condition between polling attributes and serving metrics

See merge request !898
parents ff41bd86 f51ebb57
No related branches found
No related tags found
Loading
...@@ -166,6 +166,9 @@ Next change the version in the following places: ...@@ -166,6 +166,9 @@ Next change the version in the following places:
# Release Notes # Release Notes
* 0.32.5 Fixed race condition that exposed cleared metrics.
Corrected computation of xst.hardware_powered_fraction_R
Fixed logging of device names
* 0.32.4 Fixed polling period (from 2500s to 2.5s). * 0.32.4 Fixed polling period (from 2500s to 2.5s).
* 0.32.3 Fixed disappeared metrics from LOFARDevice, OPCUADevice, StationManager. * 0.32.3 Fixed disappeared metrics from LOFARDevice, OPCUADevice, StationManager.
* 0.32.2 Change hardware_powered_R to hardware_powered_fraction_R to report partial power. * 0.32.2 Change hardware_powered_R to hardware_powered_fraction_R to report partial power.
......
0.32.4 0.32.5
...@@ -120,16 +120,19 @@ class AttributePoller: ...@@ -120,16 +120,19 @@ class AttributePoller:
async def _poll(self): async def _poll(self):
first_exception = None first_exception = None
# NB: The metrics are exposed asynchronously to Prometheus.
# We must make sure not to leave a metric cleared when we're
# reading it. If a metric is cleared, it will be considered
# by Prometheus not to exist, leading to gaps in the metric
# even if it functions correctly.
for attr_name, attr_data in self._poll_list.items(): for attr_name, attr_data in self._poll_list.items():
try: try:
# try to read the attribute
value = await self._read_attribute(attr_name) value = await self._read_attribute(attr_name)
# check if value could be read
if value is None:
continue
# update metric, if any # update metric, if any
if attr_data["metric"]: if attr_data["metric"] and value is not None:
attr_data["metric"].set_value(value) attr_data["metric"].set_value(value)
except ( except (
asyncio.exceptions.TimeoutError, asyncio.exceptions.TimeoutError,
...@@ -142,19 +145,27 @@ class AttributePoller: ...@@ -142,19 +145,27 @@ class AttributePoller:
logger.debug( logger.debug(
f"Failed to poll attribute {attr_name}: {exception_to_str(e)}" f"Failed to poll attribute {attr_name}: {exception_to_str(e)}"
) )
# clear metric, if any
if attr_data["metric"]:
attr_data["metric"].clear()
except Exception as e: except Exception as e:
logger.exception(f"Failed to poll attribute {attr_name}") logger.exception(f"Failed to poll attribute {attr_name}")
first_exception = first_exception or e first_exception = first_exception or e
# clear metric, if any
if attr_data["metric"]:
attr_data["metric"].clear()
if first_exception: if first_exception:
raise first_exception raise first_exception
async def poll(self): async def poll(self):
# invalidate all metrics, in case reading fails if not self.device.is_attribute_access_allowed(AttReqType.READ_REQ):
# invalidate all metrics
self.clear_all() self.clear_all()
if not self.device.is_attribute_access_allowed(AttReqType.READ_REQ):
# TODO(JDM): Poll attributes based on their individual is_allowed states # TODO(JDM): Poll attributes based on their individual is_allowed states
return return
...@@ -264,7 +275,7 @@ class LOFARDevice(Device): ...@@ -264,7 +275,7 @@ class LOFARDevice(Device):
FIRST_DEFAULT_SETTINGS = [] FIRST_DEFAULT_SETTINGS = []
def __str__(self): def __str__(self):
return self.__class__.__name__ return self.get_name()
@classmethod @classmethod
def attr_list(cls): def attr_list(cls):
......
...@@ -728,21 +728,12 @@ class XST(Statistics): ...@@ -728,21 +728,12 @@ class XST(Statistics):
processing_enabled = self.read_attribute("FPGA_xst_processing_enable_R") processing_enabled = self.read_attribute("FPGA_xst_processing_enable_R")
offload_enabled = self.read_attribute("FPGA_xst_offload_enable_R") offload_enabled = self.read_attribute("FPGA_xst_offload_enable_R")
expected_processing_enabled = numpy.array( mask = self.control.read_parent_attribute("TR_fpga_mask_R")
self.FPGA_xst_processing_enable_RW_default, dtype=bool
)
expected_offload_enabled = numpy.array(
self.FPGA_xst_offload_enable_RW_default, dtype=bool
)
mask = expected_processing_enabled | expected_offload_enabled
try: try:
# "powered" means processing and offload are as expected for the FPGAs required
return numpy.count_nonzero( return numpy.count_nonzero(
(processing_enabled == expected_processing_enabled) processing_enabled & offload_enabled & mask
& (offload_enabled == expected_offload_enabled)
& mask
) / numpy.count_nonzero(mask) ) / numpy.count_nonzero(mask)
except ZeroDivisionError: except ZeroDivisionError:
return 1.0 return 1.0
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment