diff --git a/.gitmodules b/.gitmodules index 1c9e69fc593c305a941f8d35e16f2efb531cefb5..f1248450adb0a12584a247b8119bc9653e6498f0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,7 @@ [submodule "tangostationcontrol/tangostationcontrol/toolkit/libhdbpp-python"] path = tangostationcontrol/tangostationcontrol/toolkit/libhdbpp-python url = https://gitlab.com/tango-controls/hdbpp/libhdbpp-python.git +[submodule "docker-compose/alerta-web"] + path = docker-compose/alerta-web + url = https://github.com/jjdmol/alerta-webui + branch = add-isa-18-2-states diff --git a/docker-compose/alerta-web/Dockerfile b/docker-compose/alerta-server/Dockerfile similarity index 82% rename from docker-compose/alerta-web/Dockerfile rename to docker-compose/alerta-server/Dockerfile index 80431da39da9ddb7ff0c28997660163234eb6d57..04f9bce1233a38a09cea6814b2ce8ac54f30fd84 100644 --- a/docker-compose/alerta-web/Dockerfile +++ b/docker-compose/alerta-server/Dockerfile @@ -9,6 +9,9 @@ RUN bash -c 'source /venv/bin/activate; pip install /tmp/grafana-plugin' COPY lofar-plugin /tmp/lofar-plugin RUN bash -c 'source /venv/bin/activate; pip install /tmp/lofar-plugin' +COPY lofar-routing-plugin /tmp/lofar-routing-plugin +RUN bash -c 'source /venv/bin/activate; pip install /tmp/lofar-routing-plugin' + COPY alertad.conf /app/alertad.conf COPY alerta.conf /app/alerta.conf COPY config.json /web/config.json diff --git a/docker-compose/alerta-web/README.md b/docker-compose/alerta-server/README.md similarity index 100% rename from docker-compose/alerta-web/README.md rename to docker-compose/alerta-server/README.md diff --git a/docker-compose/alerta-web/alerta-secrets.json b/docker-compose/alerta-server/alerta-secrets.json similarity index 100% rename from docker-compose/alerta-web/alerta-secrets.json rename to docker-compose/alerta-server/alerta-secrets.json diff --git a/docker-compose/alerta-web/alerta.conf b/docker-compose/alerta-server/alerta.conf similarity index 100% rename from 
docker-compose/alerta-web/alerta.conf rename to docker-compose/alerta-server/alerta.conf diff --git a/docker-compose/alerta-web/alertad.conf b/docker-compose/alerta-server/alertad.conf similarity index 75% rename from docker-compose/alerta-web/alertad.conf rename to docker-compose/alerta-server/alertad.conf index dc7b6c2e295ae4230a9373ed26f148d6aad59cd0..b0088c6c2bf8f26fd9cec59a3e12680dcbb1029e 100644 --- a/docker-compose/alerta-web/alertad.conf +++ b/docker-compose/alerta-server/alertad.conf @@ -1,15 +1,22 @@ +import os + DEBUG = True SECRET = "T=&7xvF2S&x7w_JAcq$h1x5ocfA)8H2i" # Allow non-admin views CUSTOMER_VIEWS = True +# Use more advanced ANSI/ISA 18.2 alarm model, +# which does not auto-close alarms and thus +# allows for tracking alarms that came and went. +ALARM_MODEL = "ISA_18_2" + # Never timeout alerts ALERT_TIMEOUT = 0 # Auto unack after a day ACK_TIMEOUT = 24 * 3600 -# Auto unshelve after 2 hours -SHELVE_TIMEOUT = 2 * 3600 +# Auto unshelve after a week +SHELVE_TIMEOUT = 7 * 24 * 3600 # Use custom date formats DATE_FORMAT_MEDIUM_DATE = "dd DD/MM HH:mm" @@ -17,10 +24,31 @@ DATE_FORMAT_LONG_DATE = "yyyy-MM-DD HH:mm:ss.sss" # Default overview settings COLUMNS = ['severity', 'status', 'createTime', 'lastReceiveTime', 'resource', 'grafanaDashboardHtml', 'grafanaPanelHtml', 'event', 'text'] -DEFAULT_FILTER = {'status': ['open']} +DEFAULT_FILTER = {'status': ['UNACK', 'RTNUN']} SORT_LIST_BY = "createTime" AUTO_REFRESH_INTERVAL = 5000 # ms +COLOR_MAP = { + 'severity': { + 'Critical': 'red', + 'High': 'orange', + 'Medium': '#FFF380', # corn yellow + 'Low': 'dodgerblue', + 'Advisory': 'lightblue', + 'OK': '#00CC00', # lime green + 'Unknown': 'silver' + }, + 'text': 'black' +} + +# Allow alerta-web to refer to alerta-server for the client +CORS_ORIGINS = [ + 'http://localhost:8081', + 'http://localhost:8082', + os.environ.get("BASE_URL", ""), + os.environ.get("DASHBOARD_URL", ""), +] + # ------------------------------------ # Plugin configuration # ------------------------------------ @@ -28,7 
class EnhanceLOFAR(PluginBase):
    """
    Plugin for enhancing alerts with LOFAR-specific information
    """

    @staticmethod
    def _fix_severity(alert):
        """
        Force conversion of severity to ISA 18.2 model, to allow Alerta to parse the alert.

        For example, the 'prometheus' webhook by default uses the 'warning' severity,
        but also users might specify a non-existing severity level.

        A severity that is already a key of ``isa_18_2.SEVERITY_MAP`` is left
        untouched. Anything else is stored in the ``unparsableSeverity``
        attribute and then replaced by, in order of preference:

        1. the ISA 18.2 severity with the same name in a different
           capitalisation (f.e. "high" -> "High"),
        2. the translation of a well-known alias (f.e. "warning"),
        3. ``isa_18_2.MEDIUM`` as a last resort.
        """

        if alert.severity in isa_18_2.SEVERITY_MAP:
            # Already a valid ISA 18.2 severity
            return

        # Save original severity
        alert.attributes['unparsableSeverity'] = alert.severity

        # str() guards against non-string severities, which have no .lower()
        wanted = str(alert.severity).lower()

        # Accept the ISA 18.2 names regardless of capitalisation, instead of
        # silently degrading f.e. "high" or "CRITICAL" to the fallback value.
        for name in isa_18_2.SEVERITY_MAP:
            if str(name).lower() == wanted:
                alert.severity = name
                return

        # Aliases for severity names that are not part of the ISA 18.2 model
        translation = {
            "normal":   isa_18_2.OK,
            "ok":       isa_18_2.OK,
            "cleared":  isa_18_2.OK,
            "warning":  isa_18_2.LOW,
            "minor":    isa_18_2.MEDIUM,
            "major":    isa_18_2.HIGH,
            "critical": isa_18_2.CRITICAL,
        }

        alert.severity = translation.get(wanted, isa_18_2.MEDIUM)

    def pre_receive(self, alert, **kwargs):
        """
        Normalise the severity of the alert and copy the LOFAR-specific
        ``key=value`` tags into the alert.

        The ``device`` and ``name`` tags are stored as the ``lofarDevice``
        and ``lofarAttribute`` attributes, and the ``station`` tag replaces
        the resource of the alert. Tags without a ``=`` are ignored.

        Returns the (modified) alert.
        """

        self._fix_severity(alert)

        # Parse LOFAR-specific fields
        for tag in alert.tags:
            key, separator, value = tag.partition("=")

            if not separator:
                # not a key=value tag
                continue

            if key == "device":
                alert.attributes['lofarDevice'] = value
            elif key == "name":
                alert.attributes['lofarAttribute'] = value
            elif key == "station":
                alert.resource = value

        return alert

    def post_receive(self, alert, **kwargs):
        """ Nothing is done after an alert has been received. """
        return

    def status_change(self, alert, status, text, **kwargs):
        """ Nothing is done when the status of an alert changes. """
        return

    def take_action(self, alert, action, text, **kwargs):
        """ This plugin does not implement any actions. """
        raise NotImplementedError
# For a description of the routing interface implemented here, see
# https://docs.alerta.io/gettingstarted/tutorial-3-plugins.html?highlight=rules#step-3-route-alerts-to-plugins
def rules(alert, plugins, config):
    """ Select the plugins through which the given alert is routed.

    alert:   the alert to route.
    plugins: mapping of plugin name -> plugin.
    config:  passed on to the pre-/post-receive rules.

    Returns a tuple (list of plugins to use, {}). """

    if alert.previous_severity is None:
        # The alert still has to be parsed, and enriched, before it is
        # merged into existing alerts.
        return rules_prereceive(alert, plugins, config)

    # The alert has been processed. Check to which plugins we
    # want to send it.
    return rules_postreceive(alert, plugins, config)


def rules_prereceive(alert, plugins, config):
    """ Rules to determine which processing filters to use.

    Returns a tuple (list of all plugins, {}). """

    # no filtering. Return a list (not a live dict view) to be consistent
    # with rules_postreceive and to be safe against later changes of `plugins`.
    return (list(plugins.values()), {})


def _is_new_problem(alert) -> bool:
    """ Return whether the state change denotes a newly identified issue
        on a system that (as far as the operator knew) was fine before.

        Returns True when detecting NORM -> UNACK transitions, and False
        on any duplicates of this transition.

        Note that RTNUN -> UNACK is thus not triggered on. """

    if alert.status != 'UNACK':
        # Only report problems (not ACKing, SHELVing, etc)
        return False

    if alert.last_receive_time != alert.update_time:
        # Ignore anything that didn't update the alert,
        # to avoid triggering on alerts that repeat
        # the current situation
        return False

    # Only report if the previous status was NORM, to avoid
    # triggering on (f.e.) RTNUN -> UNACK transitions.
    # The history is expected to be sorted new -> old, so the first entry
    # with a different status is the status the alert had before.
    for entry in alert.history:
        if entry.status != alert.status:
            return entry.status == "NORM"

    # ... or if there was no previous status (a brand new alert)
    return True


def rules_postreceive(alert, plugins, config):
    """ Rules to determine which emission methods to use.

    The 'slack' plugin is only used for newly identified problems,
    all other plugins are always used.

    Returns a tuple (list of plugins to use, {}). """

    # decide whether to notify the user on slack
    send_to_slack = _is_new_problem(alert)

    # The logger is looked up by name, which yields the same logger object on
    # every call. The arguments are only formatted if debug logging is enabled.
    logging.getLogger('alerta.plugins.routing').debug(
        "Sending alert %s with status %s and severity %s => %s to slack? %s",
        alert.event, alert.status, alert.previous_severity, alert.severity,
        send_to_slack)

    # filter the plugin list based on these decisions
    use_plugins = [
        plugin
        for name, plugin in plugins.items()
        if send_to_slack or name != 'slack'
    ]

    return (use_plugins, {})
**kwargs): - # Parse LOFAR-specific fields - for tag in alert.tags: - try: - key, value = tag.split("=", 1) - except ValueError: - continue - - if key == "device": - alert.attributes['lofarDevice'] = value - - if key == "name": - alert.attributes['lofarAttribute'] = value - - if key == "station": - alert.resource = value - - return alert - - def post_receive(self, alert, **kwargs): - return - - def status_change(self, alert, status, text, **kwargs): - return - - def take_action(self, alert, action, text, **kwargs): - raise NotImplementedError diff --git a/docker-compose/alerta-web/rules.json b/docker-compose/alerta-web/rules.json deleted file mode 100644 index ca8df8cf7b01a4bd014387e045a2492d35292300..0000000000000000000000000000000000000000 --- a/docker-compose/alerta-web/rules.json +++ /dev/null @@ -1 +0,0 @@ -{"test":[{"name":"test2","interval":"10s","rules":[{"expr":"","for":"20s","labels":{"severity":"major"},"annotations":{"__dashboardUid__":"nC8N_kO7k","__panelId__":"9","summary":"My test alert"},"grafana_alert":{"id":3,"orgId":1,"title":"FPGA processing error 2","condition":"B","data":[{"refId":"A","queryType":"","relativeTimeRange":{"from":600,"to":0},"datasourceUid":"ZqArtG97z","model":{"exemplar":false,"expr":"device_attribute{device=\"stat/sdp/1\",name=\"FPGA_error_R\"}","format":"time_series","group":[],"hide":false,"interval":"","intervalMs":1000,"legendFormat":"","maxDataPoints":43200,"metricColumn":"name","rawQuery":true,"rawSql":"SELECT\n data_time AS \"time\",\n x::text,\n device,\n name,\n case when value then 1 else 0 end AS value\nFROM lofar_array_boolean\nWHERE\n $__timeFilter(data_time) AND\n name = 'fpga_error_r'\nORDER BY 
1,2","refId":"A","select":[[{"params":["x"],"type":"column"}],[{"params":["value"],"type":"column"}]],"table":"lofar_array_boolean","timeColumn":"data_time","timeColumnType":"timestamptz","where":[{"name":"$__timeFilter","params":[],"type":"macro"},{"datatype":"text","name":"","params":["name","=","'fpga_error_r'"],"type":"expression"}]}},{"refId":"B","queryType":"","relativeTimeRange":{"from":0,"to":0},"datasourceUid":"-100","model":{"conditions":[{"evaluator":{"params":[0,0],"type":"gt"},"operator":{"type":"and"},"query":{"params":[]},"reducer":{"params":[],"type":"avg"},"type":"query"}],"datasource":{"type":"__expr__","uid":"__expr__"},"expression":"A","hide":false,"intervalMs":1000,"maxDataPoints":43200,"reducer":"last","refId":"B","settings":{"mode":"dropNN"},"type":"reduce"}}],"updated":"2022-04-04T14:18:48Z","intervalSeconds":10,"version":1,"uid":"waXdSCynk","namespace_uid":"9DkbdYy7z","namespace_id":6,"rule_group":"test2","no_data_state":"OK","exec_err_state":"Error"}}]},{"name":"test","interval":"10s","rules":[{"expr":"","for":"20s","labels":{"severity":"major"},"annotations":{"__dashboardUid__":"nC8N_kO7k","__panelId__":"9","summary":"My test alert"},"grafana_alert":{"id":2,"orgId":1,"title":"FPGA processing error","condition":"B","data":[{"refId":"A","queryType":"","relativeTimeRange":{"from":600,"to":0},"datasourceUid":"ZqArtG97z","model":{"exemplar":false,"expr":"device_attribute{device=\"stat/sdp/1\",name=\"FPGA_error_R\"}","format":"time_series","group":[],"hide":false,"interval":"","intervalMs":1000,"legendFormat":"","maxDataPoints":43200,"metricColumn":"name","rawQuery":true,"rawSql":"SELECT\n data_time AS \"time\",\n x::text,\n device,\n name,\n case when value then 1 else 0 end AS value\nFROM lofar_array_boolean\nWHERE\n $__timeFilter(data_time) AND\n name = 'fpga_error_r'\nORDER BY 
1,2","refId":"A","select":[[{"params":["x"],"type":"column"}],[{"params":["value"],"type":"column"}]],"table":"lofar_array_boolean","timeColumn":"data_time","timeColumnType":"timestamptz","where":[{"name":"$__timeFilter","params":[],"type":"macro"},{"datatype":"text","name":"","params":["name","=","'fpga_error_r'"],"type":"expression"}]}},{"refId":"B","queryType":"","relativeTimeRange":{"from":0,"to":0},"datasourceUid":"-100","model":{"conditions":[{"evaluator":{"params":[0,0],"type":"gt"},"operator":{"type":"and"},"query":{"params":[]},"reducer":{"params":[],"type":"avg"},"type":"query"}],"datasource":{"type":"__expr__","uid":"__expr__"},"expression":"A","hide":false,"intervalMs":1000,"maxDataPoints":43200,"reducer":"last","refId":"B","settings":{"mode":"dropNN"},"type":"reduce"}}],"updated":"2022-04-04T14:16:22Z","intervalSeconds":10,"version":1,"uid":"MIt4Ijs7k","namespace_uid":"9DkbdYy7z","namespace_id":6,"rule_group":"test","no_data_state":"OK","exec_err_state":"Error"}}]}]} \ No newline at end of file diff --git a/docker-compose/alerta.yml b/docker-compose/alerta.yml index 2ae3be42c17e450007914facd2a686c7cce1d63e..f828f1413d034e93b8c855876d647439696c69f3 100644 --- a/docker-compose/alerta.yml +++ b/docker-compose/alerta.yml @@ -5,7 +5,7 @@ volumes: secrets: alerta-secrets: - file: alerta-web/alerta-secrets.json + file: alerta-server/alerta-secrets.json services: alerta-web: @@ -14,7 +14,21 @@ services: networks: - control ports: - - "8081:8080" + - 8081:80 + depends_on: + - alerta-server + command: > + sh -c 'echo {\"endpoint\": \"http://\${HOSTNAME}:8082/api\"} > /usr/share/nginx/html/config.json && + nginx -g "daemon off;"' + restart: always + + alerta-server: + build: alerta-server + container_name: alerta-server + networks: + - control + ports: + - 8082:8080 # NOTE: This exposes an API and a web UI. 
Ignore the web UI as we replaced it with alerta-web depends_on: - alerta-db secrets: diff --git a/docker-compose/grafana/alerting.json b/docker-compose/grafana/alerting.json index d5193964ae1127c0f76cc60a05dfc8f0dd4e1bf4..bc5c76e7f8870efa52e60e21bf621ae0f1cd8418 100644 --- a/docker-compose/grafana/alerting.json +++ b/docker-compose/grafana/alerting.json @@ -15,7 +15,7 @@ "type": "webhook", "disableResolveMessage": false, "settings": { - "url": "http://alerta-web:8080/api/webhooks/prometheus?api-key=demo-key" + "url": "http://alerta-server:8080/api/webhooks/prometheus?api-key=demo-key" }, "secureFields": {} } diff --git a/docker-compose/grafana/datasources/alertaui.yaml b/docker-compose/grafana/datasources/alertaui.yaml index 8fa7ddcfe36d5b1fcaf04a79a7defe166c26bcf8..7a3b62425a71ddf39642fa5f0fd515f7032170f7 100644 --- a/docker-compose/grafana/datasources/alertaui.yaml +++ b/docker-compose/grafana/datasources/alertaui.yaml @@ -12,7 +12,7 @@ datasources: # <string> custom UID which can be used to reference this datasource in other parts of the configuration, if not specified will be generated automatically uid: alertaui # <string> url - url: http://alerta-web:8080/api + url: http://alerta-server:8080/api # <string> Deprecated, use secureJsonData.password password: # <string> database user, if used diff --git a/tangostationcontrol/docs/source/alerting.rst b/tangostationcontrol/docs/source/alerting.rst index 032bcd379f68d3fa719dc8956334a910bf6227ee..88cc07db4afde1abfff7f2ef7c2a0cf9d2668895 100644 --- a/tangostationcontrol/docs/source/alerting.rst +++ b/tangostationcontrol/docs/source/alerting.rst @@ -103,19 +103,32 @@ The following enhancements are useful to configure for the alerts: - You'll want to alert on a query, followed by a ``Reduce`` step with Function ``Last`` and Mode ``Drop Non-numeric Value``. 
This triggers the alert on the latest value(s), but keeps the individual array elements separated, - In ``Add details``, the ``Dashboard UID`` and ``Panel ID`` annotations are useful to configure to where you want the user to go, as Grafana will generate hyperlinks from them. To obtain a dashboard uid, go to ``Dashboards -> Browse`` and check out its URL. For the panel id, view a panel and check the URL, - In ``Add details``, the ``Summary`` annotation will be used as the alert description, -- In ``Custom labels``, add ``severity = major`` to raise the severity of the alert (default: warning). See also the `supported values <https://docs.alerta.io/webui/configuration.html#severity-colors>`_. +- In ``Custom labels``, add ``severity = High`` to raise the severity of the alert (default: Low). See also the `supported values <https://github.com/alerta/alerta/blob/master/alerta/models/alarms/isa_18_2.py#L14>`_. Alerta dashboard `````````````````` -The Alerta dashboard (http://localhost:8081) provides an overview of received alerts, which stay in the list until the alert condition disappears, and the alert is explicitly acknowledged or deleted: +The Alerta dashboard (http://localhost:8081) provides an overview of received alerts, according to the ISA 18.2 Alarm Model. It distinguishes the following states: -- *Acknowledging* an alert silences it for a day, -- *Shelving* an alert silences it for 2 hours, and removes it from more overviews, +- ``NORM``: the situation is nominal (any past alarm condition has been acknowledged), +- ``UNACK``: an alarm condition is active, which has not been acknowledged by an operator, +- ``RTNUN``: an alarm condition came and went, but has not been acknowledged by an operator, +- ``ACKED``: an alarm condition is active, and has been acknowledged by an operator. 
+ +Furthermore, the following rarer states are known: + +- ``SHLVD``: the alert is put aside, regardless of its condition, +- ``DSUPR``: the alert is intentionally suppressed, +- ``OOSRV``: the alert concerns something out of service, and thus should be ignored. + +Any alerts stay in the displayed list until the alert condition disappears, *and* the alert is explicitly acknowledged, shelved, or deleted: + +- *Acknowledging* an alert silences it for a day, unless its severity rises, +- *Shelving* an alert silences it for a week, regardless of what happens, - *Watching* an alert means receiving browser notifications on changes, - *Deleting* an alert removes it until Grafana sends it again (default: 10 minutes). -See ``docker-compose/alerta-web/alertad.conf`` for these settings. +See ``docker-compose/alerta-server/alertad.conf`` for these settings. Several installed plugins enhance the received events: @@ -135,9 +148,9 @@ Our Alerta setup is configured to send alerts to Slack. To set this up, you need .. hint:: To obtain the ``OAuth Token`` later on, go to https://api.slack.com/apps, click on your App, and look under ``Install App``. -Now, edit ``docker-compose/alerta-web/alerta-secrets.json``: +Now, edit ``docker-compose/alerta-server/alerta-secrets.json``: -.. literalinclude:: ../../../docker-compose/alerta-web/alerta-secrets.json +.. literalinclude:: ../../../docker-compose/alerta-server/alerta-secrets.json The ``SLACK_TOKEN`` is the ``OAuth Token``, and the ``SLACK_CHANNEL`` is the channel in which to post the alerts.