diff --git a/docker-compose/alerta-server/Dockerfile b/docker-compose/alerta-server/Dockerfile
index 80431da39da9ddb7ff0c28997660163234eb6d57..04f9bce1233a38a09cea6814b2ce8ac54f30fd84 100644
--- a/docker-compose/alerta-server/Dockerfile
+++ b/docker-compose/alerta-server/Dockerfile
@@ -9,6 +9,9 @@ RUN bash -c 'source /venv/bin/activate; pip install /tmp/grafana-plugin'
 COPY lofar-plugin /tmp/lofar-plugin
 RUN bash -c 'source /venv/bin/activate; pip install /tmp/lofar-plugin'
 
+COPY lofar-routing-plugin /tmp/lofar-routing-plugin
+RUN bash -c 'source /venv/bin/activate; pip install /tmp/lofar-routing-plugin'
+
 COPY alertad.conf /app/alertad.conf
 COPY alerta.conf /app/alerta.conf
 COPY config.json /web/config.json
diff --git a/docker-compose/alerta-server/lofar-routing-plugin/routing.py b/docker-compose/alerta-server/lofar-routing-plugin/routing.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcd9f9e159c5f44bf12cacf17fb926b5db7bdb5a
--- /dev/null
+++ b/docker-compose/alerta-server/lofar-routing-plugin/routing.py
@@ -0,0 +1,82 @@
+import logging
+
+from alerta.app import alarm_model
+from alerta.models.enums import ChangeType
+
+LOG = logging.getLogger('alerta.plugins.routing')
+
+# For a description of this interface,
+# see https://docs.alerta.io/gettingstarted/tutorial-3-plugins.html?highlight=rules#step-3-route-alerts-to-plugins
+def rules(alert, plugins, config):
+    """ Select the plugins that should handle the given alert.
+
+        alert:   the alert to route,
+        plugins: the available plugins, as a dict of name -> plugin,
+        config:  handed on unchanged to the rules of the selected stage.
+
+        Returns a tuple of the plugins to use and a dict (always empty here). """
+
+    if alert.previous_severity is None:
+        # The alert still has to be parsed, and enriched, before it is
+        # merged into existing alerts.
+        return rules_prereceive(alert, plugins, config)
+
+    # The alert has been processed. Check to which plugins we
+    # want to send it.
+    return rules_postreceive(alert, plugins, config)
+
+def rules_prereceive(alert, plugins, config):
+    """ Rules to determine which processing filters to use.
+
+        Returns all plugins, paired with an empty dict. """
+
+    # no filtering
+    return (plugins.values(), {})
+
+def _is_new_problem(alert) -> bool:
+    """ Return whether the state change denotes a newly identified issue
+        on a system that (as far as the operator knew) was fine before.
+
+        Returns True when detecting NORM -> UNACK transitions, and False
+        on any duplicates of this transition.
+
+        Note that RTNUN -> UNACK is thus not triggered on. """
+
+    if alert.status != 'UNACK':
+        # Only report problems (not ACKing, SHELVing, etc)
+        return False
+
+    if alert.last_receive_time != alert.update_time:
+        # Ignore anything that didn't update the alert,
+        # to avoid triggering on alerts that repeat
+        # the current situation
+        return False
+
+    # Only report if the previous status was NORM, to avoid
+    # triggering on (e.g.) RTNUN -> UNACK transitions.
+    for h in alert.history: # is sorted new -> old
+        if h.status != alert.status:
+            # the newest entry with a different status is the
+            # status we came from
+            return h.status == "NORM"
+
+    # ... or if there was no previous status (a brand new alert)
+    return True
+
+def rules_postreceive(alert, plugins, config):
+    """ Rules to determine which emission methods to use.
+
+        Returns the plugins to send the alert to, paired with an empty
+        dict. The 'slack' plugin is only used for newly identified problems. """
+
+    # decide whether to notify the user on slack
+    send_to_slack = _is_new_problem(alert)
+
+    LOG.debug(f"Sending alert {alert.event} with status {alert.status} and severity {alert.previous_severity} => {alert.severity} to slack? {send_to_slack}")
+
+    # filter the plugin list based on these decisions: every plugin
+    # is used, except 'slack' when there is nothing new to report
+    use_plugins = [plugin for name, plugin in plugins.items()
+                   if send_to_slack or name != 'slack']
+
+    return (use_plugins, {})
diff --git a/docker-compose/alerta-server/lofar-routing-plugin/setup.py b/docker-compose/alerta-server/lofar-routing-plugin/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..038881e14b12d0f0c0ca941fd629a53ac730df75
--- /dev/null
+++ b/docker-compose/alerta-server/lofar-routing-plugin/setup.py
@@ -0,0 +1,24 @@
+
+from setuptools import setup, find_packages
+
+version = '1.0.0'
+
+setup(
+    name="alerta-routing",
+    version=version,
+    description='Alerta plugin to configure LOFAR custom alert routing',
+    url='https://git.astron.nl/lofar2.0/tango',
+    license='Apache License 2.0',
+    author='Jan David Mol',
+    author_email='mol@astron.nl',
+    packages=find_packages(),
+    py_modules=['routing'],
+    include_package_data=True,
+    zip_safe=True,
+    entry_points={
+        'alerta.routing': [
+            'rules = routing:rules'
+        ]
+    },
+    python_requires='>=3.6'
+)
diff --git a/tangostationcontrol/docs/source/alerting.rst b/tangostationcontrol/docs/source/alerting.rst
index 032bcd379f68d3fa719dc8956334a910bf6227ee..88cc07db4afde1abfff7f2ef7c2a0cf9d2668895 100644
--- a/tangostationcontrol/docs/source/alerting.rst
+++ b/tangostationcontrol/docs/source/alerting.rst
@@ -103,19 +103,32 @@ The following enhancements are useful to configure for the alerts:
 - You'll want to alert on a query, followed by a ``Reduce`` step with Function ``Last`` and Mode ``Drop Non-numeric Value``. This triggers the alert on the latest value(s), but keeps the individual array elements separated,
 - In ``Add details``, the ``Dashboard UID`` and ``Panel ID`` annotations are useful to configure to where you want the user to go, as Grafana will generate hyperlinks from them. To obtain a dashboard uid, go to ``Dashboards -> Browse`` and check out its URL.
For the panel id, view a panel and check the URL, - In ``Add details``, the ``Summary`` annotation will be used as the alert description, -- In ``Custom labels``, add ``severity = major`` to raise the severity of the alert (default: warning). See also the `supported values <https://docs.alerta.io/webui/configuration.html#severity-colors>`_. +- In ``Custom labels``, add ``severity = High`` to raise the severity of the alert (default: Low). See also the `supported values <https://github.com/alerta/alerta/blob/master/alerta/models/alarms/isa_18_2.py#L14>`_. Alerta dashboard `````````````````` -The Alerta dashboard (http://localhost:8081) provides an overview of received alerts, which stay in the list until the alert condition disappears, and the alert is explicitly acknowledged or deleted: +The Alerta dashboard (http://localhost:8081) provides an overview of received alerts, according to the ISA 18.2 Alarm Model. It distinguishes the following states: -- *Acknowledging* an alert silences it for a day, -- *Shelving* an alert silences it for 2 hours, and removes it from more overviews, +- ``NORM``: the situation is nominal (any past alarm condition has been acknowledged), +- ``UNACK``: an alarm condition is active, which has not been acknowledged by an operator, +- ``RTNUN``: an alarm condition came and went, but has not been acknowledged by an operator, +- ``ACKED``: an alarm condition is active, and has been acknowledged by an operator. + +Furthermore, the following rarer states are known: + +- ``SHLVD``: the alert is put aside, regardless of its condition, +- ``DSUPR``: the alert is intentionally suppressed, +- ``OOSRV``: the alert concerns something out of service, and thus should be ignored. 
+ +Alerts stay in the displayed list until the alert condition disappears, *and* the alert is explicitly acknowledged, shelved, or deleted: + +- *Acknowledging* an alert silences it for a day, unless its severity rises, +- *Shelving* an alert silences it for a week, regardless of what happens, - *Watching* an alert means receiving browser notifications on changes, - *Deleting* an alert removes it until Grafana sends it again (default: 10 minutes). -See ``docker-compose/alerta-web/alertad.conf`` for these settings. +See ``docker-compose/alerta-server/alertad.conf`` for these settings. Several installed plugins enhance the received events: @@ -135,9 +148,9 @@ Our Alerta setup is configured to send alerts to Slack. To set this up, you need .. hint:: To obtain the ``OAuth Token`` later on, go to https://api.slack.com/apps, click on your App, and look under ``Install App``. -Now, edit ``docker-compose/alerta-web/alerta-secrets.json``: +Now, edit ``docker-compose/alerta-server/alerta-secrets.json``: -.. literalinclude:: ../../../docker-compose/alerta-web/alerta-secrets.json +.. literalinclude:: ../../../docker-compose/alerta-server/alerta-secrets.json The ``SLACK_TOKEN`` is the ``OAuth Token``, and the ``SLACK_CHANNEL`` is the channel in which to post the alerts.