Skip to content
Snippets Groups Projects
Commit b0f38754 authored by Jan David Mol's avatar Jan David Mol
Browse files

Merge branch 'L2SS-766-alerta-isa182' into 'master'

L2SS-766: First stab at moving to the ISA 18.2 alert model

Closes L2SS-766

See merge request !313
parents 0a25f842 4dad2efa
Branches
Tags
1 merge request!313L2SS-766: First stab at moving to the ISA 18.2 alert model
Showing
with 242 additions and 15 deletions
[submodule "tangostationcontrol/tangostationcontrol/toolkit/libhdbpp-python"] [submodule "tangostationcontrol/tangostationcontrol/toolkit/libhdbpp-python"]
path = tangostationcontrol/tangostationcontrol/toolkit/libhdbpp-python path = tangostationcontrol/tangostationcontrol/toolkit/libhdbpp-python
url = https://gitlab.com/tango-controls/hdbpp/libhdbpp-python.git url = https://gitlab.com/tango-controls/hdbpp/libhdbpp-python.git
[submodule "docker-compose/alerta-web"]
path = docker-compose/alerta-web
url = https://github.com/jjdmol/alerta-webui
branch = add-isa-18-2-states
...@@ -9,6 +9,9 @@ RUN bash -c 'source /venv/bin/activate; pip install /tmp/grafana-plugin' ...@@ -9,6 +9,9 @@ RUN bash -c 'source /venv/bin/activate; pip install /tmp/grafana-plugin'
COPY lofar-plugin /tmp/lofar-plugin COPY lofar-plugin /tmp/lofar-plugin
RUN bash -c 'source /venv/bin/activate; pip install /tmp/lofar-plugin' RUN bash -c 'source /venv/bin/activate; pip install /tmp/lofar-plugin'
COPY lofar-routing-plugin /tmp/lofar-routing-plugin
RUN bash -c 'source /venv/bin/activate; pip install /tmp/lofar-routing-plugin'
COPY alertad.conf /app/alertad.conf COPY alertad.conf /app/alertad.conf
COPY alerta.conf /app/alerta.conf COPY alerta.conf /app/alerta.conf
COPY config.json /web/config.json COPY config.json /web/config.json
import os
DEBUG = True DEBUG = True
SECRET = "T=&7xvF2S&x7w_JAcq$h1x5ocfA)8H2i" SECRET = "T=&7xvF2S&x7w_JAcq$h1x5ocfA)8H2i"
# Allow non-admin views # Allow non-admin views
CUSTOMER_VIEWS = True CUSTOMER_VIEWS = True
# Use more advanced ANSI/ISA 18.2 alarm model,
# which does not auto-close alarms and thus
# allows for tracking alarms that came and went.
ALARM_MODEL = "ISA_18_2"
# Never timeout alerts # Never timeout alerts
ALERT_TIMEOUT = 0 ALERT_TIMEOUT = 0
# Auto unack after a day # Auto unack after a day
ACK_TIMEOUT = 24 * 3600 ACK_TIMEOUT = 24 * 3600
# Auto unshelve after 2 hours # Auto unshelve after 2 hours
SHELVE_TIMEOUT = 2 * 3600 SHELVE_TIMEOUT = 7 * 24 * 3600
# Use custom date formats # Use custom date formats
DATE_FORMAT_MEDIUM_DATE = "dd DD/MM HH:mm" DATE_FORMAT_MEDIUM_DATE = "dd DD/MM HH:mm"
...@@ -17,10 +24,31 @@ DATE_FORMAT_LONG_DATE = "yyyy-MM-DD HH:mm:ss.sss" ...@@ -17,10 +24,31 @@ DATE_FORMAT_LONG_DATE = "yyyy-MM-DD HH:mm:ss.sss"
# Default overview settings # Default overview settings
COLUMNS = ['severity', 'status', 'createTime', 'lastReceiveTime', 'resource', 'grafanaDashboardHtml', 'grafanaPanelHtml', 'event', 'text'] COLUMNS = ['severity', 'status', 'createTime', 'lastReceiveTime', 'resource', 'grafanaDashboardHtml', 'grafanaPanelHtml', 'event', 'text']
DEFAULT_FILTER = {'status': ['open']} DEFAULT_FILTER = {'status': ['UNACK', 'RTNUN']}
SORT_LIST_BY = "createTime" SORT_LIST_BY = "createTime"
AUTO_REFRESH_INTERVAL = 5000 # ms AUTO_REFRESH_INTERVAL = 5000 # ms
# Colours used by the web UI, keyed per ISA 18.2 severity level.
# NOTE(review): names must match the severities emitted by the alarm model
# (see ALARM_MODEL = "ISA_18_2" above) — verify against alerta's isa_18_2 module.
COLOR_MAP = {
    'severity': {
        'Critical': 'red',
        'High': 'orange',
        'Medium': '#FFF380',  # corn yellow
        'Low': 'dodgerblue',
        'Advisory': 'lightblue',
        'OK': '#00CC00',  # lime green
        'Unknown': 'silver'
    },
    # Foreground colour drawn on top of the severity colours.
    'text': 'black'
}
# Allow alerta-web to refer to alerta-server for the client.
# Unset BASE_URL/DASHBOARD_URL environment variables would otherwise add
# "" as an allowed CORS origin, so filter out empty entries.
CORS_ORIGINS = [
    origin
    for origin in (
        'http://localhost:8081',
        'http://localhost:8082',
        os.environ.get("BASE_URL", ""),
        os.environ.get("DASHBOARD_URL", ""),
    )
    if origin
]
# ------------------------------------ # ------------------------------------
# Plugin configuration # Plugin configuration
# ------------------------------------ # ------------------------------------
...@@ -28,7 +56,7 @@ AUTO_REFRESH_INTERVAL = 5000 # ms ...@@ -28,7 +56,7 @@ AUTO_REFRESH_INTERVAL = 5000 # ms
PLUGINS = ['reject', 'blackout', 'acked_by', 'enhance', 'grafana', 'lofar', 'slack'] PLUGINS = ['reject', 'blackout', 'acked_by', 'enhance', 'grafana', 'lofar', 'slack']
# Slack plugin settings, see https://github.com/alerta/alerta-contrib/tree/master/plugins/slack # Slack plugin settings, see https://github.com/alerta/alerta-contrib/tree/master/plugins/slack
import os, json import json
with open("/run/secrets/alerta-secrets") as secrets_file: with open("/run/secrets/alerta-secrets") as secrets_file:
secrets = json.load(secrets_file) secrets = json.load(secrets_file)
......
...@@ -3,6 +3,7 @@ import json ...@@ -3,6 +3,7 @@ import json
import logging import logging
from alerta.plugins import PluginBase from alerta.plugins import PluginBase
import alerta.models.alarms.isa_18_2 as isa_18_2
LOG = logging.getLogger() LOG = logging.getLogger()
...@@ -12,7 +13,34 @@ class EnhanceLOFAR(PluginBase): ...@@ -12,7 +13,34 @@ class EnhanceLOFAR(PluginBase):
Plugin for enhancing alerts with LOFAR-specific information Plugin for enhancing alerts with LOFAR-specific information
""" """
@staticmethod
def _fix_severity(alert):
    """
    Force conversion of severity to the ISA 18.2 model, to allow Alerta to
    parse the alert.

    For example, the 'prometheus' webhook by default uses the 'warning'
    severity, but also users might specify a non-existing severity level.

    The original severity is preserved in the 'unparsableSeverity' attribute.
    """
    if alert.severity in isa_18_2.SEVERITY_MAP:
        # Already a valid ISA 18.2 severity -- nothing to do.
        return

    # Save original severity for diagnostics.
    alert.attributes['unparsableSeverity'] = alert.severity

    # Common non-ISA severity names mapped to their ISA 18.2 equivalents.
    translation = {
        "normal": isa_18_2.OK,
        "ok": isa_18_2.OK,
        "cleared": isa_18_2.OK,
        "warning": isa_18_2.LOW,
        "minor": isa_18_2.MEDIUM,
        "major": isa_18_2.HIGH,
        "critical": isa_18_2.CRITICAL,
    }

    # A missing severity (None) would crash .lower(); treat it as unknown,
    # which falls through to the MEDIUM default below.
    severity_name = (alert.severity or "").lower()
    alert.severity = translation.get(severity_name, isa_18_2.MEDIUM)
def pre_receive(self, alert, **kwargs): def pre_receive(self, alert, **kwargs):
self._fix_severity(alert)
# Parse LOFAR-specific fields # Parse LOFAR-specific fields
for tag in alert.tags: for tag in alert.tags:
try: try:
......
import logging
from alerta.app import alarm_model
from alerta.models.enums import ChangeType
LOG = logging.getLogger('alerta.plugins.routing')
# For a description of this interface,
# see https://docs.alerta.io/gettingstarted/tutorial-3-plugins.html?highlight=rules#step-3-route-alerts-to-plugins
def rules(alert, plugins, config):
    """ Top-level Alerta routing hook: dispatch to the pre- or
    post-processing rule set.

    An alert without a previous_severity still has to be parsed and
    enriched before it is merged into existing alerts; once processed,
    we instead decide which plugins receive it. """
    unprocessed = alert.previous_severity is None
    handler = rules_prereceive if unprocessed else rules_postreceive
    return handler(alert, plugins, config)
def rules_prereceive(alert, plugins, config):
    """ Rules that select which processing filters run on an incoming alert. """
    # No filtering at this stage: every configured plugin processes the alert.
    selected = plugins.values()
    return (selected, {})
def _is_new_problem(alert) -> bool:
""" Return whether the state change denotes a newly identified issue
on a system that (as far as the operator knew) was fine before.
Returns True when detecting NORM -> UNACK transitions, and False
on any duplicates of this transition.
Note that RTNUN -> UNACK is thus not triggered on. """
if alert.status != 'UNACK':
# Only report problems (not ACKing, SHELVing, etc)
return False
elif alert.last_receive_time != alert.update_time:
# Ignore anything that didn't update the alert,
# to avoid triggering on alerts that repeat
# the current situation
return False
else:
# Only report if the previous status was NORM, to avoid
# triggering on (f.e.) RTNUN -> UNACK transitions.
for h in alert.history: # is sorted new -> old
if h.status == alert.status:
# ignore any update that didn't change the status
continue
return h.status == "NORM"
# ... or if there was no previous status (a brand new alert)
return True
def rules_postreceive(alert, plugins, config):
    """ Rules that select which emission methods receive a processed alert. """
    # Only notify operators on slack about newly appearing problems.
    notify_slack = _is_new_problem(alert)

    LOG.debug(f"Sending alert {alert.event} with status {alert.status} and severity {alert.previous_severity} => {alert.severity} to slack? {notify_slack}")

    # Filter the plugin list based on these decisions: drop 'slack'
    # unless we decided to notify.
    selected = [
        plugin
        for name, plugin in plugins.items()
        if name != 'slack' or notify_slack
    ]

    return (selected, {})
from setuptools import setup, find_packages

# Version of the alerta-routing plugin package; bump on release.
version = '1.0.0'

setup(
    name="alerta-routing",
    version=version,
    description='Alerta plugin to configure LOFAR custom alert routing',
    url='https://git.astron.nl/lofar2.0/tango',
    license='Apache License 2.0',
    author='Jan David Mol',
    author_email='mol@astron.nl',
    packages=find_packages(),
    # The plugin is a single top-level module, not a package.
    py_modules=['routing'],
    include_package_data=True,
    zip_safe=True,
    # Register routing.rules as Alerta's custom routing hook; Alerta
    # discovers it through the 'alerta.routing' entry-point group.
    entry_points={
        'alerta.routing': [
            'rules = routing:rules'
        ]
    },
    python_requires='>=3.5'
)
Subproject commit 9ee69dfbd0e33604169604b5a5cc506d560cb60b
{"test":[{"name":"test2","interval":"10s","rules":[{"expr":"","for":"20s","labels":{"severity":"major"},"annotations":{"__dashboardUid__":"nC8N_kO7k","__panelId__":"9","summary":"My test alert"},"grafana_alert":{"id":3,"orgId":1,"title":"FPGA processing error 2","condition":"B","data":[{"refId":"A","queryType":"","relativeTimeRange":{"from":600,"to":0},"datasourceUid":"ZqArtG97z","model":{"exemplar":false,"expr":"device_attribute{device=\"stat/sdp/1\",name=\"FPGA_error_R\"}","format":"time_series","group":[],"hide":false,"interval":"","intervalMs":1000,"legendFormat":"","maxDataPoints":43200,"metricColumn":"name","rawQuery":true,"rawSql":"SELECT\n data_time AS \"time\",\n x::text,\n device,\n name,\n case when value then 1 else 0 end AS value\nFROM lofar_array_boolean\nWHERE\n $__timeFilter(data_time) AND\n name = 'fpga_error_r'\nORDER BY 1,2","refId":"A","select":[[{"params":["x"],"type":"column"}],[{"params":["value"],"type":"column"}]],"table":"lofar_array_boolean","timeColumn":"data_time","timeColumnType":"timestamptz","where":[{"name":"$__timeFilter","params":[],"type":"macro"},{"datatype":"text","name":"","params":["name","=","'fpga_error_r'"],"type":"expression"}]}},{"refId":"B","queryType":"","relativeTimeRange":{"from":0,"to":0},"datasourceUid":"-100","model":{"conditions":[{"evaluator":{"params":[0,0],"type":"gt"},"operator":{"type":"and"},"query":{"params":[]},"reducer":{"params":[],"type":"avg"},"type":"query"}],"datasource":{"type":"__expr__","uid":"__expr__"},"expression":"A","hide":false,"intervalMs":1000,"maxDataPoints":43200,"reducer":"last","refId":"B","settings":{"mode":"dropNN"},"type":"reduce"}}],"updated":"2022-04-04T14:18:48Z","intervalSeconds":10,"version":1,"uid":"waXdSCynk","namespace_uid":"9DkbdYy7z","namespace_id":6,"rule_group":"test2","no_data_state":"OK","exec_err_state":"Error"}}]},{"name":"test","interval":"10s","rules":[{"expr":"","for":"20s","labels":{"severity":"major"},"annotations":{"__dashboardUid__":"nC8N_kO7k","__panelId__":"
9","summary":"My test alert"},"grafana_alert":{"id":2,"orgId":1,"title":"FPGA processing error","condition":"B","data":[{"refId":"A","queryType":"","relativeTimeRange":{"from":600,"to":0},"datasourceUid":"ZqArtG97z","model":{"exemplar":false,"expr":"device_attribute{device=\"stat/sdp/1\",name=\"FPGA_error_R\"}","format":"time_series","group":[],"hide":false,"interval":"","intervalMs":1000,"legendFormat":"","maxDataPoints":43200,"metricColumn":"name","rawQuery":true,"rawSql":"SELECT\n data_time AS \"time\",\n x::text,\n device,\n name,\n case when value then 1 else 0 end AS value\nFROM lofar_array_boolean\nWHERE\n $__timeFilter(data_time) AND\n name = 'fpga_error_r'\nORDER BY 1,2","refId":"A","select":[[{"params":["x"],"type":"column"}],[{"params":["value"],"type":"column"}]],"table":"lofar_array_boolean","timeColumn":"data_time","timeColumnType":"timestamptz","where":[{"name":"$__timeFilter","params":[],"type":"macro"},{"datatype":"text","name":"","params":["name","=","'fpga_error_r'"],"type":"expression"}]}},{"refId":"B","queryType":"","relativeTimeRange":{"from":0,"to":0},"datasourceUid":"-100","model":{"conditions":[{"evaluator":{"params":[0,0],"type":"gt"},"operator":{"type":"and"},"query":{"params":[]},"reducer":{"params":[],"type":"avg"},"type":"query"}],"datasource":{"type":"__expr__","uid":"__expr__"},"expression":"A","hide":false,"intervalMs":1000,"maxDataPoints":43200,"reducer":"last","refId":"B","settings":{"mode":"dropNN"},"type":"reduce"}}],"updated":"2022-04-04T14:16:22Z","intervalSeconds":10,"version":1,"uid":"MIt4Ijs7k","namespace_uid":"9DkbdYy7z","namespace_id":6,"rule_group":"test","no_data_state":"OK","exec_err_state":"Error"}}]}]}
\ No newline at end of file
...@@ -5,7 +5,7 @@ volumes: ...@@ -5,7 +5,7 @@ volumes:
secrets: secrets:
alerta-secrets: alerta-secrets:
file: alerta-web/alerta-secrets.json file: alerta-server/alerta-secrets.json
services: services:
alerta-web: alerta-web:
...@@ -14,7 +14,21 @@ services: ...@@ -14,7 +14,21 @@ services:
networks: networks:
- control - control
ports: ports:
- "8081:8080" - 8081:80
depends_on:
- alerta-server
command: >
sh -c 'echo {\"endpoint\": \"http://\${HOSTNAME}:8082/api\"} > /usr/share/nginx/html/config.json &&
nginx -g "daemon off;"'
restart: always
alerta-server:
build: alerta-server
container_name: alerta-server
networks:
- control
ports:
- 8082:8080 # NOTE: This exposes an API and a web UI. Ignore the web UI as we replaced it with alerta-web
depends_on: depends_on:
- alerta-db - alerta-db
secrets: secrets:
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
"type": "webhook", "type": "webhook",
"disableResolveMessage": false, "disableResolveMessage": false,
"settings": { "settings": {
"url": "http://alerta-web:8080/api/webhooks/prometheus?api-key=demo-key" "url": "http://alerta-server:8080/api/webhooks/prometheus?api-key=demo-key"
}, },
"secureFields": {} "secureFields": {}
} }
......
...@@ -12,7 +12,7 @@ datasources: ...@@ -12,7 +12,7 @@ datasources:
# <string> custom UID which can be used to reference this datasource in other parts of the configuration, if not specified will be generated automatically # <string> custom UID which can be used to reference this datasource in other parts of the configuration, if not specified will be generated automatically
uid: alertaui uid: alertaui
# <string> url # <string> url
url: http://alerta-web:8080/api url: http://alerta-server:8080/api
# <string> Deprecated, use secureJsonData.password # <string> Deprecated, use secureJsonData.password
password: password:
# <string> database user, if used # <string> database user, if used
......
...@@ -103,19 +103,32 @@ The following enhancements are useful to configure for the alerts: ...@@ -103,19 +103,32 @@ The following enhancements are useful to configure for the alerts:
- You'll want to alert on a query, followed by a ``Reduce`` step with Function ``Last`` and Mode ``Drop Non-numeric Value``. This triggers the alert on the latest value(s), but keeps the individual array elements separated, - You'll want to alert on a query, followed by a ``Reduce`` step with Function ``Last`` and Mode ``Drop Non-numeric Value``. This triggers the alert on the latest value(s), but keeps the individual array elements separated,
- In ``Add details``, the ``Dashboard UID`` and ``Panel ID`` annotations are useful to configure to where you want the user to go, as Grafana will generate hyperlinks from them. To obtain a dashboard uid, go to ``Dashboards -> Browse`` and check out its URL. For the panel id, view a panel and check the URL, - In ``Add details``, the ``Dashboard UID`` and ``Panel ID`` annotations are useful to configure to where you want the user to go, as Grafana will generate hyperlinks from them. To obtain a dashboard uid, go to ``Dashboards -> Browse`` and check out its URL. For the panel id, view a panel and check the URL,
- In ``Add details``, the ``Summary`` annotation will be used as the alert description, - In ``Add details``, the ``Summary`` annotation will be used as the alert description,
- In ``Custom labels``, add ``severity = major`` to raise the severity of the alert (default: warning). See also the `supported values <https://docs.alerta.io/webui/configuration.html#severity-colors>`_. - In ``Custom labels``, add ``severity = High`` to raise the severity of the alert (default: Low). See also the `supported values <https://github.com/alerta/alerta/blob/master/alerta/models/alarms/isa_18_2.py#L14>`_.
Alerta dashboard Alerta dashboard
`````````````````` ``````````````````
The Alerta dashboard (http://localhost:8081) provides an overview of received alerts, which stay in the list until the alert condition disappears, and the alert is explicitly acknowledged or deleted: The Alerta dashboard (http://localhost:8081) provides an overview of received alerts, according to the ISA 18.2 Alarm Model. It distinguishes the following states:
- *Acknowledging* an alert silences it for a day, - ``NORM``: the situation is nominal (any past alarm condition has been acknowledged),
- *Shelving* an alert silences it for 2 hours, and removes it from more overviews, - ``UNACK``: an alarm condition is active, which has not been acknowledged by an operator,
- ``RTNUN``: an alarm condition came and went, but has not been acknowledged by an operator,
- ``ACKED``: an alarm condition is active, and has been acknowledged by an operator.
Furthermore, the following rarer states are known:
- ``SHLVD``: the alert is put aside, regardless of its condition,
- ``DSUPR``: the alert is intentionally suppressed,
- ``OOSRV``: the alert concerns something out of service, and thus should be ignored.
Any alerts stay in the displayed list until the alert condition disappears, *and* the alert is explicitly acknowledged, shelved, or deleted:
- *Acknowledging* an alert silences it for a day, unless its severity rises,
- *Shelving* an alert silences it for a week, regardless of what happens,
- *Watching* an alert means receiving browser notifications on changes, - *Watching* an alert means receiving browser notifications on changes,
- *Deleting* an alert removes it until Grafana sends it again (default: 10 minutes). - *Deleting* an alert removes it until Grafana sends it again (default: 10 minutes).
See ``docker-compose/alerta-web/alertad.conf`` for these settings. See ``docker-compose/alerta-server/alertad.conf`` for these settings.
Several installed plugins enhance the received events: Several installed plugins enhance the received events:
...@@ -135,9 +148,9 @@ Our Alerta setup is configured to send alerts to Slack. To set this up, you need ...@@ -135,9 +148,9 @@ Our Alerta setup is configured to send alerts to Slack. To set this up, you need
.. hint:: To obtain the ``OAuth Token`` later on, go to https://api.slack.com/apps, click on your App, and look under ``Install App``. .. hint:: To obtain the ``OAuth Token`` later on, go to https://api.slack.com/apps, click on your App, and look under ``Install App``.
Now, edit ``docker-compose/alerta-web/alerta-secrets.json``: Now, edit ``docker-compose/alerta-server/alerta-secrets.json``:
.. literalinclude:: ../../../docker-compose/alerta-web/alerta-secrets.json .. literalinclude:: ../../../docker-compose/alerta-server/alerta-secrets.json
The ``SLACK_TOKEN`` is the ``OAuth Token``, and the ``SLACK_CHANNEL`` is the channel in which to post the alerts. The ``SLACK_TOKEN`` is the ``OAuth Token``, and the ``SLACK_CHANNEL`` is the channel in which to post the alerts.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment