diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..485dee64bcfb48793379b200a1afd14e85a8aaf4 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea diff --git a/Dockerfile b/Dockerfile index c62e9cb8260b3159f543ee714fc8a3e9588edf2d..85c567fd969a0828cc2daafddf1cffb2cbadef77 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,6 +5,8 @@ USER root RUN apk --no-cache add curl RUN apk --no-cache add jq +RUN wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64.tar.gz -O - |\ + tar xz && mv yq_linux_amd64 /usr/bin/yq && chmod +x /usr/bin/yq USER grafana @@ -21,15 +23,16 @@ RUN grafana cli plugins install grafana-oncall-app RUN wget https://algenty.github.io/flowcharting-repository/archives/agenty-flowcharting-panel-1.0.0b-SNAPSHOT.zip -O /tmp/agenty-flowcharting-panel.zip RUN cd /var/lib/grafana/plugins/ && unzip /tmp/agenty-flowcharting-panel.zip && mv grafana-flowcharting agenty-flowcharting-panel -COPY grafana.ini /etc/grafana/ -COPY alerting.json /opt/grafana-import/ -COPY rules.json /opt/grafana-import/ -COPY import-rules.sh /opt/grafana-import/ +COPY grafana.ini /etc/grafana/grafana.ini +COPY imports/populate-tokens.sh /opt/grafana-import/populate-tokens.sh # Add default configuration through provisioning (see https://grafana.com/docs/grafana/latest/administration/provisioning) +# https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +COPY alerting /etc/grafana/provisioning/alerting/ COPY datasources /etc/grafana/provisioning/datasources/ -COPY dashboards /var/lib/grafana/dashboards/ -COPY panels /var/lib/grafana/panels/ +COPY station-dashboards.yaml /etc/grafana/provisioning/dashboards/ +COPY dashboards /var/lib/grafana/dashboards/station/station/ +COPY panels /var/lib/grafana/panels/panels/ COPY run-wrapper.sh /run-wrapper.sh ENTRYPOINT ["/run-wrapper.sh"] diff --git a/README.md b/README.md index 
b66b780f146abf242577e70dffd4034cd9f8f640..d6872777ecbb9fea28ad888801fda6f30a077acb 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,19 @@ These dashboards show the state of a single station. They are tailored to be installed: -* Locally on a station, through https://git.astron.nl/lofar2.0/tango/-/tree/master/docker-compose/grafana +* Locally on a station, through https://git.astron.nl/lofar2.0/tango/-/tree/master/docker/grafana * Centrally to monitor a group of stations, through https://git.astron.nl/lofar2.0/operations-central-management/-/tree/main/grafana-central/dashboards +## Environment variables + +This container reads the following environment variables at startup for configuration: + +1. `SLACK_TOKEN`: Authentication token for Slack alarms + ## Datasources -The Grafana installation in this repo exposes and uses the following data sources, as configured in the `dashboards/` directory: +The Grafana installation in this repo exposes and uses the following data sources, +as configured in the `dashboards/` directory: * Prometheus, at `http://prometheus:9090`, serving metrics, * Loki, at `http://loki:3100`, serving logs, @@ -25,6 +32,6 @@ To cover both these use cases, the designer must consider: To deploy changes, they must be: -1. Committed to this repository, +1. Committed to this repository, 2. The submodules in the tango and operations-central-management repositories must link to the new commit, 3. Those repositories need to be redeployed on the stations and centrally, respectively. 
diff --git a/alerting.json b/alerting.json deleted file mode 100644 index bc5c76e7f8870efa52e60e21bf621ae0f1cd8418..0000000000000000000000000000000000000000 --- a/alerting.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "template_files": {}, - "alertmanager_config": { - "route": { - "receiver": "Alerta", - "repeat_interval": "10m" - }, - "templates": null, - "receivers": [ - { - "name": "Alerta", - "grafana_managed_receiver_configs": [ - { - "name": "Alerta", - "type": "webhook", - "disableResolveMessage": false, - "settings": { - "url": "http://alerta-server:8080/api/webhooks/prometheus?api-key=demo-key" - }, - "secureFields": {} - } - ] - } - ] - } -} diff --git a/alerting/alert-rules.yaml b/alerting/alert-rules.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2039f97feea0202987d46cfcfbed922400c7046 --- /dev/null +++ b/alerting/alert-rules.yaml @@ -0,0 +1,1667 @@ +apiVersion: 1 +groups: + - orgId: 1 + name: Evaluate every minute + folder: station + interval: 1m + rules: + - uid: edloik9y36m0we + title: FPGA Temperatures hba0 + condition: C + data: + - refId: B + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + expr: ds_fpga_temp{host="localhost",family="SDPFirmware", member="HBA0"} != 0 + instant: false + interval: "" + intervalMs: 15000 + legendFormat: '{{x}}' + maxDataPoints: 43200 + range: true + refId: B + - refId: A + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: A + type: reduce + - refId: C + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + 
type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 6 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "6" + description: The FPGA temperature in hba0 is too high + summary: FPGA temperature in hba0 has been above alarm conditions for the last 5 minutes. + labels: {} + isPaused: false + - uid: fdloko30iiv40a + title: FPGA Temperatures hba1 + condition: C + data: + - refId: B + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + expr: ds_fpga_temp{host="localhost",family="SDPFirmware", member="HBA1"} != 0 + instant: false + interval: "" + intervalMs: 15000 + legendFormat: '{{x}}' + maxDataPoints: 43200 + range: true + refId: B + - refId: A + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: A + type: reduce + - refId: C + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 6 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: 
c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "6" + summary: FPGA temperature in hba1 has been above alarm conditions for the last 5 minutes. + labels: {} + isPaused: false + - uid: adlokt8acedj4c + title: FPGA Temperatures lba + condition: C + data: + - refId: B + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + expr: ds_fpga_temp{host="localhost",family="SDPFirmware", member="LBA"} != 0 + instant: false + interval: "" + intervalMs: 15000 + legendFormat: '{{x}}' + maxDataPoints: 43200 + range: true + refId: B + - refId: A + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: A + type: reduce + - refId: C + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 6 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "6" + summary: FPGA temperature in lba has been above alarm conditions for the last 5 minutes. 
+ labels: {} + isPaused: false + - uid: fdlon8p6bvri8e + title: Uniboard2 FPGA POL Core Temperatures H0 + condition: H + data: + - refId: A + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + exemplar: true + expr: 'ds_unb2_fpga_pol_core_temp{host="localhost",family="UNB2",member="H0"} ' + instant: false + interval: "" + intervalMs: 15000 + legendFormat: Core board {{x}} node {{y}} + maxDataPoints: 43200 + range: true + refId: A + - refId: G + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - G + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: G + settings: + mode: dropNN + type: reduce + - refId: H + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - H + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: G + intervalMs: 1000 + maxDataPoints: 43200 + refId: H + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 4 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "4" + description: FPGA POL Core temperature for H0 has been above alarm conditions for 5 minutes. + summary: FPGA POL Core temperature for H0 has been above alarm conditions for 5 minutes. 
+ labels: {} + isPaused: false + - uid: edlonaz69vy80d + title: Uniboard2 FPGA POL Core Temperatures L0 + condition: H + data: + - refId: A + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + exemplar: true + expr: 'ds_unb2_fpga_pol_core_temp{host="localhost",family="UNB2",member="L0"} ' + instant: false + interval: "" + intervalMs: 15000 + legendFormat: Core board {{x}} node {{y}} + maxDataPoints: 43200 + range: true + refId: A + - refId: G + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - G + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: G + settings: + mode: dropNN + type: reduce + - refId: H + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - H + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: G + intervalMs: 1000 + maxDataPoints: 43200 + refId: H + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 4 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "4" + description: FPGA POL Core temperature for L0 has been above alarm conditions for 5 minutes. + summary: FPGA POL Core temperature for L0 has been above alarm conditions for 5 minutes. 
+ labels: {} + isPaused: false + - uid: edlondwb8j474a + title: Uniboard2 FPGA POL Core Temperatures L1 + condition: H + data: + - refId: A + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + exemplar: true + expr: 'ds_unb2_fpga_pol_core_temp{host="localhost",family="UNB2",member="L1"} ' + instant: false + interval: "" + intervalMs: 15000 + legendFormat: Core board {{x}} node {{y}} + maxDataPoints: 43200 + range: true + refId: A + - refId: G + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - G + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: G + settings: + mode: dropNN + type: reduce + - refId: H + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - H + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: G + intervalMs: 1000 + maxDataPoints: 43200 + refId: H + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 4 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "4" + description: FPGA POL Core temperature for L1 has been above alarm conditions for 5 minutes. + summary: FPGA POL Core temperature for L1 has been above alarm conditions for 5 minutes. 
+ labels: {} + isPaused: false + - uid: adlonqgt6crgga + title: Uniboard2 FPGA POL ERAM Temperatures H0 + condition: H + data: + - refId: B + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + exemplar: true + expr: ds_unb2_fpga_pol_eram_temp{host="localhost",family="UNB2",member="H0"} + interval: "" + intervalMs: 15000 + legendFormat: ERAM board {{x}} node {{y}} + maxDataPoints: 43200 + range: true + refId: B + - refId: G + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - G + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: G + settings: + mode: dropNN + type: reduce + - refId: H + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - H + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: G + intervalMs: 1000 + maxDataPoints: 43200 + refId: H + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 4 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "4" + description: FPGA POL ERAM temperature for H0 has been above alarm conditions for 5 minutes. + summary: FPGA POL ERAM temperature for H0 has been above alarm conditions for 5 minutes. 
+ labels: {} + isPaused: false + - uid: bdlonzk67asxsb + title: Uniboard2 FPGA POL ERAM Temperatures L0 + condition: H + data: + - refId: B + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + exemplar: true + expr: ds_unb2_fpga_pol_eram_temp{host="localhost",family="UNB2",member="L0"} + instant: false + interval: "" + intervalMs: 15000 + legendFormat: ERAM board {{x}} node {{y}} + maxDataPoints: 43200 + range: true + refId: B + - refId: G + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - G + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: G + settings: + mode: dropNN + type: reduce + - refId: H + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - H + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: G + intervalMs: 1000 + maxDataPoints: 43200 + refId: H + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 4 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "4" + description: FPGA POL ERAM temperature for L0 has been above alarm conditions for 5 minutes. + summary: FPGA POL ERAM temperature for L0 has been above alarm conditions for 5 minutes. 
+ labels: {} + isPaused: false + - uid: fdloo6q2sdxq8d + title: Uniboard2 FPGA POL ERAM Temperatures L1 + condition: H + data: + - refId: B + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + exemplar: true + expr: ds_unb2_fpga_pol_eram_temp{host="localhost",family="UNB2",member="L1"} + instant: false + interval: "" + intervalMs: 15000 + legendFormat: ERAM board {{x}} node {{y}} + maxDataPoints: 43200 + range: true + refId: B + - refId: G + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - G + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: G + settings: + mode: dropNN + type: reduce + - refId: H + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - H + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: G + intervalMs: 1000 + maxDataPoints: 43200 + refId: H + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 4 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "4" + description: FPGA POL ERAM temperature for L1 has been above alarm conditions for 5 minutes. + summary: FPGA POL ERAM temperature for L1 has been above alarm conditions for 5 minutes. 
+ labels: {} + isPaused: false + - uid: edlooih1cfpq8d + title: Uniboard2 FPGA POL rxgxb Temperatures H0 + condition: H + data: + - refId: C + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + exemplar: true + expr: ds_unb2_fpga_pol_rxgxb_temp{host="localhost",family="UNB2",member="H0"} + instant: false + interval: "" + intervalMs: 15000 + legendFormat: TrRx board {{x}} node {{y}} + maxDataPoints: 43200 + range: true + refId: C + - refId: G + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - G + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: C + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: G + settings: + mode: dropNN + type: reduce + - refId: H + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - H + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: G + intervalMs: 1000 + maxDataPoints: 43200 + refId: H + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 4 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "4" + description: FPGA POL rxgxb temperature for H0 has been above alarm conditions for 5 minutes. + summary: FPGA POL rxgxb temperature for H0 has been above alarm conditions for 5 minutes. 
+ labels: {} + isPaused: false + - uid: fdloojx11up6oe + title: Uniboard2 FPGA POL rxgxb Temperatures L0 + condition: H + data: + - refId: C + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + exemplar: true + expr: ds_unb2_fpga_pol_rxgxb_temp{host="localhost",family="UNB2",member="L0"} + instant: false + interval: "" + intervalMs: 15000 + legendFormat: TrRx board {{x}} node {{y}} + maxDataPoints: 43200 + range: true + refId: C + - refId: G + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - G + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: C + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: G + settings: + mode: dropNN + type: reduce + - refId: H + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - H + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: G + intervalMs: 1000 + maxDataPoints: 43200 + refId: H + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 4 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "4" + description: FPGA POL rxgxb temperature for L0 has been above alarm conditions for 5 minutes. + summary: FPGA POL rxgxb temperature for L0 has been above alarm conditions for 5 minutes. 
+ labels: {} + isPaused: false + - uid: cdlop06k1064gf + title: Uniboard2 FPGA POL rxgxb Temperatures L1 + condition: H + data: + - refId: C + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + exemplar: true + expr: ds_unb2_fpga_pol_rxgxb_temp{host="localhost",family="UNB2",member="L1"} + instant: false + interval: "" + intervalMs: 15000 + legendFormat: TrRx board {{x}} node {{y}} + maxDataPoints: 43200 + range: true + refId: C + - refId: G + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - G + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: C + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: G + settings: + mode: dropNN + type: reduce + - refId: H + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - H + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: G + intervalMs: 1000 + maxDataPoints: 43200 + refId: H + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 4 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "4" + description: FPGA POL rxgxb temperature for L1 has been above alarm conditions for 5 minutes. + summary: FPGA POL rxgxb temperature for L1 has been above alarm conditions for 5 minutes. 
+ labels: {} + isPaused: false + - uid: bdlop7ejj1o8wa + title: Uniboard2 FPGA POL pgm Temperatures H0 + condition: H + data: + - refId: E + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + exemplar: true + expr: ds_unb2_fpga_pol_pgm_temp{host="localhost",family="UNB2",member="H0"} + interval: "" + intervalMs: 15000 + legendFormat: IO board {{x}} node {{y}} + maxDataPoints: 43200 + range: true + refId: E + - refId: G + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - G + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: E + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: G + settings: + mode: dropNN + type: reduce + - refId: H + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - H + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: G + intervalMs: 1000 + maxDataPoints: 43200 + refId: H + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 4 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "4" + description: FPGA POL pgm temperature for H0 has been above alarm conditions for 5 minutes. + summary: FPGA POL pgm temperature for H0 has been above alarm conditions for 5 minutes. 
+ labels: {} + isPaused: false + - uid: edlopfjxrp43kb + title: Uniboard2 FPGA POL pgm Temperatures L0 + condition: H + data: + - refId: E + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + exemplar: true + expr: ds_unb2_fpga_pol_pgm_temp{host="localhost",family="UNB2",member="L0"} + instant: false + interval: "" + intervalMs: 15000 + legendFormat: IO board {{x}} node {{y}} + maxDataPoints: 43200 + range: true + refId: E + - refId: G + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - G + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: E + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: G + settings: + mode: dropNN + type: reduce + - refId: H + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - H + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: G + intervalMs: 1000 + maxDataPoints: 43200 + refId: H + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 4 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "4" + description: FPGA POL pgm temperature for L0 has been above alarm conditions for 5 minutes. + summary: FPGA POL pgm temperature for L0 has been above alarm conditions for 5 minutes. 
+ labels: {} + isPaused: false + - uid: edlopgzduetq8d + title: Uniboard2 FPGA POL pgm Temperatures L1 + condition: H + data: + - refId: E + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + exemplar: true + expr: ds_unb2_fpga_pol_pgm_temp{host="localhost",family="UNB2",member="L1"} + instant: false + interval: "" + intervalMs: 15000 + legendFormat: IO board {{x}} node {{y}} + maxDataPoints: 43200 + range: true + refId: E + - refId: G + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - G + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: E + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: G + settings: + mode: dropNN + type: reduce + - refId: H + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - H + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: G + intervalMs: 1000 + maxDataPoints: 43200 + refId: H + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 4 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "4" + description: FPGA POL pgm temperature for L1 has been above alarm conditions for 5 minutes. + summary: FPGA POL pgm temperature for L1 has been above alarm conditions for 5 minutes. 
+ labels: {} + isPaused: false + - uid: bdloppfezhxq8e + title: Uniboard2 FPGA POL hgxb Temperatures H0 + condition: H + data: + - refId: F + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + exemplar: true + expr: 'ds_unb2_fpga_pol_hgxb_temp{host="localhost",family="UNB2",member="H0"} ' + interval: "" + intervalMs: 15000 + legendFormat: HGXB board {{x}} node {{y}} + maxDataPoints: 43200 + range: true + refId: F + - refId: G + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - G + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: F + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: G + settings: + mode: dropNN + type: reduce + - refId: H + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - H + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: G + intervalMs: 1000 + maxDataPoints: 43200 + refId: H + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 4 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "4" + description: FPGA POL hgxb temperature for H0 has been above alarm conditions for 5 minutes. + summary: FPGA POL hgxb temperature for H0 has been above alarm conditions for 5 minutes. 
+ labels: {} + isPaused: false + - uid: bdlopvscihgjkd + title: Uniboard2 FPGA POL hgxb Temperatures L0 + condition: H + data: + - refId: F + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + exemplar: true + expr: 'ds_unb2_fpga_pol_hgxb_temp{host="localhost",family="UNB2",member="L0"} ' + instant: false + interval: "" + intervalMs: 15000 + legendFormat: HGXB board {{x}} node {{y}} + maxDataPoints: 43200 + range: true + refId: F + - refId: G + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - G + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: F + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: G + settings: + mode: dropNN + type: reduce + - refId: H + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - H + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: G + intervalMs: 1000 + maxDataPoints: 43200 + refId: H + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 4 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "4" + description: FPGA POL hgxb temperature for L0 has been above alarm conditions for 5 minutes. + summary: FPGA POL hgxb temperature for L0 has been above alarm conditions for 5 minutes. 
+ labels: {} + isPaused: false + - uid: fdlopwu07dkhsb + title: Uniboard2 FPGA POL hxgb Temperatures L1 + condition: H + data: + - refId: F + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: prometheus + model: + datasource: + type: prometheus + uid: prometheus + editorMode: code + exemplar: true + expr: 'ds_unb2_fpga_pol_hgxb_temp{host="localhost",family="UNB2",member="L1"} ' + instant: false + interval: "" + intervalMs: 15000 + legendFormat: HGXB board {{x}} node {{y}} + maxDataPoints: 43200 + range: true + refId: F + - refId: G + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - G + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: F + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: G + settings: + mode: dropNN + type: reduce + - refId: H + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - H + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: G + intervalMs: 1000 + maxDataPoints: 43200 + refId: H + type: threshold + dashboardUid: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + panelId: 4 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + __dashboardUid__: c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35 + __panelId__: "4" + description: FPGA POL HGXB temperature for L1 has been above alarm conditions for 5 minutes. + summary: FPGA POL HGXB temperature for L1 has been above alarm conditions for 5 minutes.
+ labels: {} + isPaused: false diff --git a/alerting/cp.yaml b/alerting/cp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb7e35ff11fb9efcfbb803602c8a94d2693aaa55 --- /dev/null +++ b/alerting/cp.yaml @@ -0,0 +1,12 @@ +apiVersion: 1 +contactPoints: + - orgId: 1 + name: Corne Alarms + receivers: + - uid: slackalarms + type: slack + settings: + recipient: C04411Y8EAU # Corne slack user id + # Dummy token replaced by scripts + token: "xoxb-noop" + disableResolveMessage: true diff --git a/alerting/policies.yaml b/alerting/policies.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de84c888cd86e5f15534a36fb613b0bfdb473132 --- /dev/null +++ b/alerting/policies.yaml @@ -0,0 +1,8 @@ +apiVersion: 1 +policies: + - orgId: 1 + receiver: Corne Alarms + group_by: + - grafana_folder + group_interval: 1d + repeat_interval: 2d diff --git a/dashboards/sensors.json b/dashboards/sensors.json index 3e98d652865961b7477090507c04d5a7165a5070..040bd7f82e22ecd8d516c7501e2ff403f88387cb 100644 --- a/dashboards/sensors.json +++ b/dashboards/sensors.json @@ -3,7 +3,10 @@ "list": [ { "builtIn": 1, - "datasource": "-- Grafana --", + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", @@ -20,8 +23,8 @@ }, "editable": true, "fiscalYearStartMonth": 0, - "gnetId": null, "graphTooltip": 0, + "id": 67, "links": [ { "asDropdown": true, @@ -41,7 +44,10 @@ "liveNow": true, "panels": [ { - "datasource": null, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "gridPos": { "h": 1, "w": 24, @@ -49,11 +55,23 @@ "y": 0 }, "id": 16, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "refId": "A" + } + ], "title": "Temperatures", "type": "row" }, { - "datasource": "Prometheus", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "description": "", "fieldConfig": { "defaults": { @@ -61,6 +79,9 @@ "mode": 
"palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -72,6 +93,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -116,32 +138,39 @@ "options": { "legend": { "calcs": [], - "displayMode": "hidden", - "placement": "bottom" + "displayMode": "list", + "placement": "bottom", + "showLegend": false }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "pluginVersion": "8.1.2", "targets": [ { - "exemplar": true, - "expr": "device_attribute{host=\"$station\",device=\"stat/sdpfirmware/$antennafield\",name=\"FPGA_temp_R\"} != 0", - "format": "time_series", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "ds_fpga_temp{host=\"$station\",family=\"SDPFirmware\"} != 0", "hide": false, "instant": false, - "interval": "", "legendFormat": "{{x}}", - "refId": "A" + "range": true, + "refId": "B" } ], "title": "FPGA Temperatures", - "transformations": [], "type": "timeseries" }, { - "datasource": "Prometheus", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "description": "", "fieldConfig": { "defaults": { @@ -150,6 +179,9 @@ "seriesBy": "max" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -161,6 +193,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -205,68 +238,108 @@ "options": { "legend": { "calcs": [], - "displayMode": "hidden", - "placement": "bottom" + "displayMode": "list", + "placement": "bottom", + "showLegend": false }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"editorMode": "code", "exemplar": true, - "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_FPGA_POL_CORE_TEMP_R\"} ", + "expr": "ds_unb2_fpga_pol_core_temp{host=\"$station\",family=\"UNB2\",member=\"$subrack\"} ", "interval": "", "legendFormat": "Core board {{x}} node {{y}}", + "range": true, "refId": "A" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", "exemplar": true, - "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_FPGA_POL_ERAM_TEMP_R\"} ", + "expr": "ds_unb2_fpga_pol_eram_temp{host=\"$station\",family=\"UNB2\",member=\"$subrack\"}", "hide": false, "interval": "", "legendFormat": "ERAM board {{x}} node {{y}}", + "range": true, "refId": "B" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", "exemplar": true, - "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_FPGA_POL_RXGXB_TEMP_R\"} ", + "expr": "ds_unb2_fpga_pol_rxgxb_temp{host=\"$station\",family=\"UNB2\",member=\"$subrack\"}", "hide": false, "interval": "", "legendFormat": "TrRx board {{x}} node {{y}}", + "range": true, "refId": "C" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", "exemplar": true, - "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_FPGA_POL_TXGB_TEMP_R\"} ", + "expr": "ds_unb2_fpga_pol_txgb_temp{host=\"$station\",family=\"UNB2\",member=\"$subrack\"}", "hide": false, "interval": "", "legendFormat": "TrHx board {{x}} node {{y}}", + "range": true, "refId": "D" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", "exemplar": true, - "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_FPGA_POL_PGM_TEMP_R\"} ", + "expr": "ds_unb2_fpga_pol_pgm_temp{host=\"$station\",family=\"UNB2\",member=\"$subrack\"}", "hide": false, "interval": 
"", "legendFormat": "IO board {{x}} node {{y}}", + "range": true, "refId": "E" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", "exemplar": true, - "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_FPGA_POL_HGXB_TEMP_R\"} ", + "expr": "ds_unb2_fpga_pol_hgxb_temp{host=\"$station\",family=\"UNB2\",member=\"$subrack\"} ", "hide": false, "interval": "", "legendFormat": "HGXB board {{x}} node {{y}}", + "range": true, "refId": "F" } ], "title": "Uniboard2 FPGA POL Temperatures", - "transformations": [], "type": "timeseries" }, { - "datasource": "Prometheus", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "description": "", "fieldConfig": { "defaults": { @@ -275,6 +348,9 @@ "seriesBy": "max" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -286,6 +362,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -330,15 +407,21 @@ "options": { "legend": { "calcs": [], - "displayMode": "hidden", - "placement": "bottom" + "displayMode": "list", + "placement": "bottom", + "showLegend": false }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_FPGA_QSFP_CAGE_TEMP_R\"}", "interval": "", @@ -350,7 +433,10 @@ "type": "timeseries" }, { - "datasource": "Prometheus", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "description": "", "fieldConfig": { "defaults": { @@ -359,6 +445,9 @@ "seriesBy": "max" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -370,6 
+459,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -414,15 +504,21 @@ "options": { "legend": { "calcs": [], - "displayMode": "hidden", - "placement": "bottom" + "displayMode": "list", + "placement": "bottom", + "showLegend": false }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_FPGA_DDR4_SLOT_TEMP_R\"}", "interval": "", @@ -434,7 +530,10 @@ "type": "timeseries" }, { - "datasource": "Prometheus", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "description": "", "fieldConfig": { "defaults": { @@ -442,6 +541,9 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -453,6 +555,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -497,16 +600,22 @@ "options": { "legend": { "calcs": [], - "displayMode": "hidden", - "placement": "bottom" + "displayMode": "list", + "placement": "bottom", + "showLegend": false }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "pluginVersion": "8.1.2", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"$recv\",name=\"RCU_TEMP_R\"}", "format": "time_series", @@ -518,11 +627,13 @@ } ], "title": "RCU Temperatures", - "transformations": [], "type": "timeseries" }, { - "datasource": "Prometheus", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "description": "Temperatures reported by APSCT and APSPU", "fieldConfig": { "defaults": { @@ -531,6 +642,9 @@ "seriesBy": "max" }, 
"custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -542,6 +656,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -586,15 +701,21 @@ "options": { "legend": { "calcs": [], - "displayMode": "hidden", - "placement": "bottom" + "displayMode": "list", + "placement": "bottom", + "showLegend": false }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/apsct/$subrack\",name=~\"APSCT_TEMP_R\"}", "interval": "", @@ -602,6 +723,10 @@ "refId": "A" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/apspu/$subrack\",name=~\"APSPU_.*_TEMP_R\"}", "hide": false, @@ -614,7 +739,10 @@ "type": "timeseries" }, { - "datasource": "Prometheus", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "description": "Temperature sensors of the power supply on each board", "fieldConfig": { "defaults": { @@ -623,6 +751,9 @@ "seriesBy": "max" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -634,6 +765,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -671,22 +803,28 @@ "gridPos": { "h": 8, "w": 5, - "x": 15, + "x": 10, "y": 9 }, "id": 8, "options": { "legend": { "calcs": [], - "displayMode": "hidden", - "placement": "bottom" + "displayMode": "list", + "placement": "bottom", + "showLegend": false }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "targets": [ { + "datasource": { + "type": "prometheus", + 
"uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_POL_QSFP_N01_TEMP_R\"} ", "interval": "", @@ -694,6 +832,10 @@ "refId": "A" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_POL_QSFP_N23_TEMP_R\"} ", "hide": false, @@ -702,6 +844,10 @@ "refId": "B" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_POL_SWITCH_1V2_TEMP_R\"} ", "hide": false, @@ -710,6 +856,10 @@ "refId": "C" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_POL_SWITCH_PHY_TEMP_R\"} ", "hide": false, @@ -718,6 +868,10 @@ "refId": "D" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_POL_CLOCK_TEMP_R\"} ", "hide": false, @@ -726,6 +880,10 @@ "refId": "E" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_DC_DC_48V_12V_TEMP_R\"} ", "hide": false, @@ -739,7 +897,10 @@ }, { "collapsed": true, - "datasource": null, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "gridPos": { "h": 1, "w": 24, @@ -748,11 +909,23 @@ }, "id": 18, "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "refId": "A" + } + ], "title": "Voltages", "type": "row" }, { - "datasource": "Prometheus", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "description": "Voltage sensors of the power supplies of the APS", "fieldConfig": { "defaults": 
{ @@ -761,6 +934,9 @@ "seriesBy": "max" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 10, @@ -774,6 +950,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -818,15 +995,21 @@ "options": { "legend": { "calcs": [], - "displayMode": "hidden", - "placement": "bottom" + "displayMode": "list", + "placement": "bottom", + "showLegend": false }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/apspu/$subrack\",name=~\"APSPU_.*_VOUT_R\"}", "interval": "", @@ -838,7 +1021,10 @@ "type": "timeseries" }, { - "datasource": "Prometheus", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "description": "Voltage sensors of each node on each board", "fieldConfig": { "defaults": { @@ -846,6 +1032,9 @@ "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 2, @@ -859,6 +1048,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -903,15 +1093,21 @@ "options": { "legend": { "calcs": [], - "displayMode": "hidden", - "placement": "bottom" + "displayMode": "list", + "placement": "bottom", + "showLegend": false }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_FPGA_POL_CORE_VOUT_R\"}", "interval": "", @@ -919,6 +1115,10 @@ "refId": "A" }, { + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_FPGA_POL_ERAM_VOUT_R\"}", "hide": false, @@ -927,6 +1127,10 @@ "refId": "B" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_FPGA_POL_RXGXB_VOUT_R\"}", "hide": false, @@ -935,6 +1139,10 @@ "refId": "C" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_FPGA_POL_TXGB_VOUT_R\"}", "hide": false, @@ -943,6 +1151,10 @@ "refId": "D" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_FPGA_POL_PGM_VOUT_R\"}", "hide": false, @@ -955,7 +1167,10 @@ "type": "timeseries" }, { - "datasource": "Prometheus", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "description": "Voltage sensors of the power supply on each board", "fieldConfig": { "defaults": { @@ -964,6 +1179,9 @@ "seriesBy": "max" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMin": 0, @@ -976,6 +1194,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1020,15 +1239,21 @@ "options": { "legend": { "calcs": [], - "displayMode": "hidden", - "placement": "bottom" + "displayMode": "list", + "placement": "bottom", + "showLegend": false }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_POL_QSFP_N01_VOUT_R\"}", "interval": 
"", @@ -1036,6 +1261,10 @@ "refId": "A" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_POL_QSFP_N23_VOUT_R\"}", "hide": false, @@ -1044,6 +1273,10 @@ "refId": "B" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_POL_SWITCH_1V2_VOUT_R\"}", "hide": false, @@ -1052,6 +1285,10 @@ "refId": "C" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_POL_SWITCH_PHY_VOUT_R\"}", "hide": false, @@ -1060,6 +1297,10 @@ "refId": "D" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_POL_CLOCK_VOUT_R\"}", "hide": false, @@ -1068,6 +1309,10 @@ "refId": "E" }, { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/unb2/$subrack\",name=\"UNB2_DC_DC_48V_12V_VOUT_R\"}", "hide": false, @@ -1081,7 +1326,10 @@ }, { "collapsed": true, - "datasource": null, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "gridPos": { "h": 1, "w": 24, @@ -1090,11 +1338,23 @@ }, "id": 20, "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "refId": "A" + } + ], "title": "Clock stability", "type": "row" }, { - "datasource": "Prometheus", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "description": "Measured difference between PTP and PPS", "fieldConfig": { "defaults": { @@ -1102,6 +1362,9 @@ "mode": "thresholds" }, "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", 
"axisPlacement": "auto", "barAlignment": 0, @@ -1113,6 +1376,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1135,8 +1399,7 @@ "mode": "absolute", "steps": [ { - "color": "red", - "value": null + "color": "red" }, { "color": "green", @@ -1162,16 +1425,22 @@ "options": { "legend": { "calcs": [], - "displayMode": "hidden", - "placement": "bottom" + "displayMode": "list", + "placement": "bottom", + "showLegend": false }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "pluginVersion": "8.1.2", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "device_attribute{host=\"$station\",device=\"stat/sdpfirmware/$antennafield\",name=\"TR_tod_pps_delta_R\"}", "format": "time_series", @@ -1183,12 +1452,11 @@ } ], "title": "FPGA Clock offset", - "transformations": [], "type": "timeseries" } ], - "schemaVersion": 31, - "style": "dark", + "refresh": "", + "schemaVersion": 39, "tags": [ "station" ], @@ -1197,14 +1465,14 @@ { "current": { "selected": false, - "text": "test-lcu2", - "value": "test-lcu2" + "text": "localhost", + "value": "localhost" }, "datasource": { "type": "prometheus", "uid": "prometheus" }, - "definition": "scrape_series_added\n", + "definition": "label_values(ds_state,host)", "description": "", "hide": 0, "includeAll": false, @@ -1213,108 +1481,73 @@ "name": "station", "options": [], "query": { - "query": "scrape_series_added\n", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "/.*host=\"(?<text>[^\"]*)\".*/", - "skipUrlSync": false, - "sort": 1, - "type": "query" - }, - { - "current": {}, - "definition": "label_values(device_attribute{device=\"stat/stationmanager/1\", host=\"$station\", name=\"station_name_R\"},str_value)", - "description": "Station Name retrieved from device-stationmanager", - "hide": 0, - "includeAll": false, - "label": "Station Name", - "multi": false, - 
"name": "station_name", - "options": [], - "query": { - "query": "label_values(device_attribute{device=\"stat/stationmanager/1\", host=\"$station\", name=\"station_name_R\"},str_value)", + "qryType": 1, + "query": "label_values(ds_state,host)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, "regex": "", "skipUrlSync": false, - "sort": 0, + "sort": 1, "type": "query" }, { "current": { - "selected": true, - "text": "h0", - "value": "h0" + "selected": false, + "text": "LBA", + "value": "LBA" }, "datasource": { "type": "prometheus", "uid": "prometheus" }, - "definition": "device_attribute{host=\"$station\", device=~\"stat/apspu/.*\", name=\"State\"}", + "definition": "label_values(ds_state{family=~\"AFL|AFH\", host=\"$station\"},member)", "description": "", "hide": 0, "includeAll": false, - "label": "Subrack", + "label": "Antennafield", "multi": false, - "name": "subrack", + "name": "antennafield", "options": [], "query": { - "query": "device_attribute{host=\"$station\", device=~\"stat/apspu/.*\", name=\"State\"}", - "refId": "StandardVariableQuery" + "qryType": 1, + "query": "label_values(ds_state{family=~\"AFL|AFH\", host=\"$station\"},member)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, - "regex": "/.*device=\"stat/apspu/(?<text>[^\"]*)\".*/", + "regex": "", "skipUrlSync": false, "sort": 1, "type": "query" }, { - "current": {}, + "current": { + "selected": false, + "text": "H0", + "value": "H0" + }, "datasource": { "type": "prometheus", "uid": "prometheus" }, - "definition": "device_attribute{host=\"$station\", device=~\"stat/af[hl]/.*\", name=\"State\"}", + "definition": "label_values(ds_state{family=\"APS\", host=\"$station\"},member)", "description": "", "hide": 0, "includeAll": false, - "label": "Antennafield", + "label": "Subrack", "multi": false, - "name": "antennafield", + "name": "subrack", "options": [], "query": { - "query": "device_attribute{host=\"$station\", device=~\"stat/af[hl]/.*\", 
name=\"State\"}", - "refId": "StandardVariableQuery" + "qryType": 1, + "query": "label_values(ds_state{family=\"APS\", host=\"$station\"},member)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, - "regex": "/.*device=\"stat/af[hl]/(?<text>[^\"]*)\".*/", + "regex": "", "skipUrlSync": false, "sort": 1, "type": "query" - }, - { - "current": { - "selected": false, - "text": "stat/recv/h0", - "value": "stat/recv/h0" - }, - "definition": "device_attribute{host=\"$station\", device=~\"stat/(recv|rcu2l|rcu2h|recvh|recvl)/$subrack\", name=\"State\"}", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "recv", - "options": [], - "query": { - "query": "device_attribute{host=\"$station\", device=~\"stat/(recv|rcu2l|rcu2h|recvh|recvl)/$subrack\", name=\"State\"}", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "/.*device=\"(?<text>[^\"]*)\".*/", - "skipUrlSync": false, - "sort": 0, - "type": "query" } ] }, @@ -1325,6 +1558,7 @@ "timepicker": {}, "timezone": "", "title": "Sensors", - "uid": "KMRmQzd7z", - "version": 1 -} + "uid": "c54bff42-4ff7-4e1d-8bf0-1f1c9cc0ea35", + "version": 11, + "weekStart": "" +} \ No newline at end of file diff --git a/datasources/grafanaapi.yaml b/datasources/grafanaapi.yaml deleted file mode 100644 index a2310cdf2f4432c09581b1f60bbf9ec16a573606..0000000000000000000000000000000000000000 --- a/datasources/grafanaapi.yaml +++ /dev/null @@ -1,36 +0,0 @@ -apiVersion: 1 - -datasources: - # <string, required> name of the datasource. Required - - name: Grafana API - # <string, required> datasource type. Required - type: yesoreyeram-infinity-datasource - # <string, required> access mode. proxy or direct (Server or Browser in the UI). Required - access: proxy - # <int> org id. 
will default to orgId 1 if not specified - orgId: 1 - # <string> custom UID which can be used to reference this datasource in other parts of the configuration, if not specified will be generated automatically - uid: grafanaapi - # <string> url - url: http://localhost:3000/api - # <string> Deprecated, use secureJsonData.password - password: - # <string> database user, if used - user: postgres - # <string> database name, if used - database: hdb - # <bool> enable/disable basic auth - basicAuth: false - # <string> basic auth username - basicAuthUser: - # <string> Deprecated, use secureJsonData.basicAuthPassword - basicAuthPassword: - # <bool> enable/disable with credentials headers - withCredentials: - # <bool> mark as default datasource. Max one per org - isDefault: false - # <map> fields that will be converted to json and stored in jsonData - version: 1 - # <bool> allow users to edit datasources from the UI. - editable: false - diff --git a/datasources/loki.yaml b/datasources/loki.yaml index f9108f15f3791de72fb8c80cc24ae156e0bfea73..d98b2971fe70b60aaa274f7c5602cd8076fda70f 100644 --- a/datasources/loki.yaml +++ b/datasources/loki.yaml @@ -13,21 +13,6 @@ datasources: uid: loki # <string> url url: http://loki:3100 - # <string> Deprecated, use secureJsonData.password - password: - # <string> database user, if used - user: - # <string> database name, if used - database: - # <bool> enable/disable basic auth - basicAuth: false - # <string> basic auth username - basicAuthUser: - # <string> Deprecated, use secureJsonData.basicAuthPassword - basicAuthPassword: - # <bool> enable/disable with credentials headers - withCredentials: - # <bool> mark as default datasource. Max one per org isDefault: false # <map> fields that will be converted to json and stored in jsonData jsonData: @@ -37,9 +22,3 @@ datasources: logMessageField: maxConcurrentShardRequests: 5 timeField: "@timestamp" - # <string> json object of data that will be encrypted. 
- secureJsonData: - version: 1 - # <bool> allow users to edit datasources from the UI. - editable: false - diff --git a/datasources/prometheus.yaml b/datasources/prometheus.yaml index efea8bd474db6eb6c9865c1731be5e4a46c42fcc..eeea0044d73713575ffcec2e8b234210d58f5902 100644 --- a/datasources/prometheus.yaml +++ b/datasources/prometheus.yaml @@ -13,27 +13,4 @@ datasources: uid: prometheus # <string> url url: http://prometheus:9090 - # <string> Deprecated, use secureJsonData.password - password: - # <string> database user, if used - user: - # <string> database name, if used - database: - # <bool> enable/disable basic auth - basicAuth: false - # <string> basic auth username - basicAuthUser: - # <string> Deprecated, use secureJsonData.basicAuthPassword - basicAuthPassword: - # <bool> enable/disable with credentials headers - withCredentials: - # <bool> mark as default datasource. Max one per org isDefault: true - # <map> fields that will be converted to json and stored in jsonData - jsonData: - httpMethod: POST - # <string> json object of data that will be encrypted. - secureJsonData: - version: 1 - # <bool> allow users to edit datasources from the UI. - editable: false diff --git a/import-rules.sh b/import-rules.sh deleted file mode 100755 index b3d5871bd532387a5386b8fdfee0331c0daeddc2..0000000000000000000000000000000000000000 --- a/import-rules.sh +++ /dev/null @@ -1,24 +0,0 @@ -#! /bin/bash - -until curl -s -X GET "http://localhost:3000" -o /dev/null -do - echo "Wait until grafana is ready..." 
- sleep 5 -done - -# Create API key -API_RESPONSE=`curl -s -X POST -H "Content-Type: application/json" -d '{"name":"apikeycurl", "role": "Admin"}' http://admin:admin@localhost:3000/api/auth/keys` -API_KEY=`echo "$API_RESPONSE" | jq -j '.key'` -API_KEY_ID=`echo "$API_RESPONSE" | jq -j '.id'` - -# Import alerts -curl -X POST http://localhost:3000/api/alertmanager/grafana/config/api/v1/alerts -H 'Content-Type: application/json' -H 'Accept: application/json' -H "Authorization: Bearer $API_KEY" -d '@alerting.json' -curl -X POST http://localhost:3000/api/folders -H 'Content-Type: application/json' -H 'Accept: application/json' -H "Authorization: Bearer $API_KEY" -d '{"title": "station"}' - -# Import station rules -jq -c '.station[]' rules.json | while read rule; do - echo $rule | curl -X POST http://localhost:3000/api/ruler/grafana/api/v1/rules/station -H 'Content-Type: application/json' -H 'Accept: application/json' -H "Authorization: Bearer $API_KEY" -d '@-' -done - -# Cleanup api key -curl -s -X DELETE "http://localhost:3000/api/auth/keys/$API_KEY_ID" -H 'Content-Type: application/json' -H 'Accept: application/json' -H "Authorization: Bearer $API_KEY" \ No newline at end of file diff --git a/imports/README.md b/imports/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fa913d1319a86a0542e21390fe03ec1a477aff87 --- /dev/null +++ b/imports/README.md @@ -0,0 +1,4 @@ +# Imports + +These files are copied manually by the Dockerfile. They are intended for alerting but are not +compatible with alert provisioning (which disables the admin API). \ No newline at end of file diff --git a/imports/populate-tokens.sh b/imports/populate-tokens.sh new file mode 100755 index 0000000000000000000000000000000000000000..0a350a11c1e93443e840cdb12333f6b15a8499cc --- /dev/null +++ b/imports/populate-tokens.sh @@ -0,0 +1,10 @@ +#!
/bin/bash +# Populate tokens in the provisioned alerting configuration from the environment + +ALERT_DIR="/etc/grafana/provisioning/alerting" + +if [[ -n "${SLACK_TOKEN}" ]]; then + yq -i '(.contactPoints[].receivers[] | select(has("type")) | select(.type == "slack")) .settings.token = ("${SLACK_TOKEN}" | envsubst)' "${ALERT_DIR}/cp.yaml" || { echo "Failed to write SLACK_TOKEN into ${ALERT_DIR}/cp.yaml" >&2; exit 1; } +else + echo "SLACK_TOKEN not set alarm notifications for slack will not work!" >&2 +fi diff --git a/rules.json b/rules.json deleted file mode 100644 index b870d4cbc2b8e5708213fd4e4f3d1c5501d4467d..0000000000000000000000000000000000000000 --- a/rules.json +++ /dev/null @@ -1 +0,0 @@ -{"station":[{"name":"FPGA processing error","interval":"10s","rules":[{"expr":"","for":"20s","labels":{"severity":"major"},"annotations":{"__dashboardUid__":"nC8N_kO7k","__panelId__":"9","summary":"One or more FPGAs are unusable."},"grafana_alert":{"id":1,"orgId":1,"title":"FPGA processing error","condition":"B","data":[{"refId":"A","queryType":"","relativeTimeRange":{"from":600,"to":0},"datasourceUid":"timescaledb","model":{"format":"time_series","group":[],"hide":false,"intervalMs":1000,"maxDataPoints":43200,"metricColumn":"none","rawQuery":true,"rawSql":"SELECT\n $__timeGroup(data_time, $__interval),\n x::text,\n device,\n name,\n value\nFROM lofar_array_boolean\nWHERE\n $__timeFilter(data_time) AND\n name = 'fpga_error_r'\nORDER BY
1,2","refId":"A","select":[[{"params":["value_r"],"type":"column"}]],"table":"att_scalar_devdouble","timeColumn":"data_time","timeColumnType":"timestamp","where":[{"name":"$__timeFilter","params":[],"type":"macro"}]}},{"refId":"B","queryType":"","relativeTimeRange":{"from":0,"to":0},"datasourceUid":"-100","model":{"conditions":[{"evaluator":{"params":[0],"type":"gt"},"operator":{"type":"and"},"query":{"params":["A"]},"reducer":{"params":[],"type":"last"},"type":"query"}],"datasource":{"type":"__expr__","uid":"-100"},"expression":"A","hide":false,"intervalMs":1000,"maxDataPoints":43200,"reducer":"last","refId":"B","settings":{"mode":"dropNN"},"type":"reduce"}}],"updated":"2022-04-04T18:01:53Z","intervalSeconds":10,"version":3,"uid":"kujybCynk","namespace_uid":"R_jsbCynz","namespace_id":6,"rule_group":"FPGA processing error","no_data_state":"NoData","exec_err_state":"Alerting"}}]}]} \ No newline at end of file diff --git a/run-wrapper.sh b/run-wrapper.sh index 3eeb0b0dd65fc806bb22280a063aa031f6c48231..65bc03f5f2b057644a36230018915c352b3df8d7 100755 --- a/run-wrapper.sh +++ b/run-wrapper.sh @@ -1,4 +1,6 @@ #! /bin/bash -/opt/grafana-import/import-rules.sh & +echo "Starting grafana, with provisioned alarms" +#/opt/grafana-import/import-rules.sh & # disabled due to incompatibility with alert provisioning +/opt/grafana-import/populate-tokens.sh -/run.sh +exec /run.sh # exec replaces this wrapper shell with /run.sh, so signals sent to the container reach it directly diff --git a/station-dashboards.yaml b/station-dashboards.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a79059a2b1a46f786a961724ff1d09f5d95c8843 --- /dev/null +++ b/station-dashboards.yaml @@ -0,0 +1,45 @@ +apiVersion: 1 + +providers: + # <string> a unique provider name. Required + - name: 'StationControl (panels)' + # <int> Org id. Defaults to 1 + orgId: 1 + # <string> name of the dashboard folder. + folder: '' + # <string> folder UID. Will be automatically generated if not specified + folderUid: '' + # <string> provider type.
Defaults to 'file' + type: file + # <bool> disable dashboard deletion + disableDeletion: true + # <int> how often Grafana will scan for changed dashboards + updateIntervalSeconds: 60 + # <bool> allow updating provisioned dashboards from the UI + allowUiUpdates: true + options: + # <string, required> path to dashboard files on disk. Required when using the 'file' type + path: /var/lib/grafana/panels + # <bool> use folder names from filesystem to create folders in Grafana + foldersFromFilesStructure: true + # <string> a unique provider name. Required + - name: 'StationControl (dashboards)' + # <int> Org id. Defaults to 1 + orgId: 1 + # <string> name of the dashboard folder. + folder: '' + # <string> folder UID. Will be automatically generated if not specified + folderUid: '' + # <string> provider type. Defaults to 'file' + type: file + # <bool> disable dashboard deletion + disableDeletion: true + # <int> how often Grafana will scan for changed dashboards + updateIntervalSeconds: 60 + # <bool> allow updating provisioned dashboards from the UI + allowUiUpdates: true + options: + # <string, required> path to dashboard files on disk. Required when using the 'file' type + path: /var/lib/grafana/dashboards/station + # <bool> use folder names from filesystem to create folders in Grafana + foldersFromFilesStructure: true