Skip to content
Snippets Groups Projects
Commit de47c6d6 authored by Jan David Mol's avatar Jan David Mol
Browse files

L2SS-359: Add grafana, prometheus, and a tango->prometheus bridge.

parent 8c8a685f
No related branches found
No related tags found
1 merge request!106Resolve L2SS-359 "Add grafana"
Showing
with 722 additions and 0 deletions
...@@ -23,3 +23,4 @@ services: ...@@ -23,3 +23,4 @@ services:
- grafana-configs:/etc/grafana - grafana-configs:/etc/grafana
ports: ports:
- "3000:3000" - "3000:3000"
restart: unless-stopped
FROM grafana/grafana FROM grafana/grafana
# To populate the Grafana configuration:
#
# Datasources (thanks to https://rmoff.net/2017/08/08/simple-export/import-of-data-sources-in-grafana/):
#
# Import:
#
# for i in data_sources/*; do \
# curl -X "POST" "http://localhost:3000/api/datasources" \
# -H "Content-Type: application/json" \
# --user admin:admin \
# --data-binary @$i
# done
#
# Export:
#
# mkdir -p data_sources && curl -s "http://localhost:3000/api/datasources" -u admin:admin|jq -c -M '.[]'|split -l 1 - data_sources/
#
# Dashboards:
#
# Import: http://localhost:3000/dashboard/import
# Export: "share" icon next to dashboard title -> "Export"
#
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": 1,
"links": [],
"panels": [
{
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"0": {
"color": "green",
"index": 1,
"text": "ON"
},
"1": {
"color": "purple",
"index": 3,
"text": "OFF"
},
"7": {
"color": "yellow",
"index": 2,
"text": "STANDBY"
},
"8": {
"color": "red",
"index": 0,
"text": "FAULT"
}
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "string"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 4,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "value_and_name"
},
"pluginVersion": "8.1.2",
"targets": [
{
"exemplar": true,
"expr": "device_attribute{label=\"State\"}",
"instant": true,
"interval": "",
"legendFormat": "{{device}}",
"refId": "A"
}
],
"title": "Device States",
"type": "stat"
},
{
"datasource": "Prometheus",
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "continuous-GrYlRd"
},
"custom": {
"fillOpacity": 70,
"lineWidth": 1
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 5,
"x": 0,
"y": 8
},
"id": 2,
"options": {
"colWidth": 0.9,
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"rowHeight": 0.9,
"showValue": "never",
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "8.1.2",
"targets": [
{
"exemplar": true,
"expr": "100*device_attribute{device=\"lts/sdp/1\",name=\"TR_fpga_communication_error_R\"}",
"format": "time_series",
"hide": false,
"interval": "",
"legendFormat": "{{x}}",
"refId": "A"
}
],
"title": "FPGA communication errors",
"transformations": [],
"type": "status-history"
},
{
"datasource": "Prometheus",
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "continuous-GrYlRd"
},
"custom": {
"fillOpacity": 70,
"lineWidth": 1
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 5,
"x": 5,
"y": 8
},
"id": 6,
"options": {
"colWidth": 0.9,
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"rowHeight": 0.9,
"showValue": "never",
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "8.1.2",
"targets": [
{
"exemplar": true,
"expr": "100-100*device_attribute{device=\"lts/sdp/1\",name=\"FPGA_processing_enable_R\"}",
"format": "time_series",
"hide": false,
"interval": "",
"legendFormat": "{{x}}",
"refId": "A"
}
],
"title": "FPGA processing enabled",
"transformations": [],
"type": "status-history"
},
{
"datasource": "Prometheus",
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 5,
"x": 10,
"y": 8
},
"id": 5,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "8.1.2",
"targets": [
{
"exemplar": true,
"expr": "device_attribute{device=\"lts/sdp/1\",name=\"FPGA_temp_R\"}",
"format": "time_series",
"hide": false,
"instant": false,
"interval": "",
"legendFormat": "{{x}}",
"refId": "A"
}
],
"title": "FPGA temperatures",
"transformations": [],
"type": "timeseries"
}
],
"schemaVersion": 30,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "LOFAR2.0 Station",
"uid": "jCzG0a47z",
"version": 3
}
{"id":4,"uid":"6W2nM-Vnz","orgId":1,"name":"Prometheus","type":"prometheus","typeName":"Prometheus","typeLogoUrl":"public/app/plugins/datasource/prometheus/img/prometheus_logo.svg","access":"proxy","url":"prometheus:9090","password":"","user":"","database":"","basicAuth":false,"isDefault":false,"jsonData":{"httpMethod":"POST"},"readOnly":false}
{"id":2,"uid":"d5_heb47k","orgId":1,"name":"TangoDB","type":"mysql","typeName":"MySQL","typeLogoUrl":"public/app/plugins/datasource/mysql/img/mysql_logo.svg","access":"proxy","url":"tangodb","password":"","user":"tango","database":"tango","basicAuth":false,"isDefault":true,"jsonData":{"timezone":""},"readOnly":false}
#
# Docker compose file that launches Prometheus
#
# Defines:
# - prometheus: Prometheus
#
version: '2'
services:
prometheus:
image: prometheus
build:
context: prometheus
container_name: ${CONTAINER_NAME_PREFIX}prometheus
networks:
- control
ports:
- "9090:9090"
restart: unless-stopped
FROM prom/prometheus
COPY prometheus.yml /etc/prometheus/prometheus.yml
global:
evaluation_interval: 5s
scrape_interval: 5s
scrape_timeout: 5s
scrape_configs:
- job_name: tango
static_configs:
- targets:
- "tango-prometheus-exporter:8000"
#
# Docker compose file that launches the Tango -> Prometheus adapter
#
version: '2'
services:
tango-prometheus-exporter:
build:
context: tango-prometheus-exporter
container_name: ${CONTAINER_NAME_PREFIX}tango-prometheus-exporter
networks:
- control
environment:
- TANGO_HOST=${TANGO_HOST}
ports:
- "8000:8000"
restart: unless-stopped
# Image for the Tango -> Prometheus exporter, built on top of the PyTango image.
FROM tangocs/tango-pytango

# Package installation needs root; switch back to the "tango" user afterwards.
USER root
# Remove the apt package lists in the same layer so they do not end up in the image.
RUN apt-get update \
    && apt-get install -y curl \
    && rm -rf /var/lib/apt/lists/*
USER tango

# COPY is enough for a plain local directory; ADD's URL/archive handling is not needed.
COPY code /code
RUN pip install -r /code/pip-requirements.txt

WORKDIR /code
ENV PYTHONPATH=/code/
CMD ["python", "/code/collector.py"]
\ No newline at end of file
Copyright 2020 INAF Matteo Di Carlo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
NAME:=tango-exporter
VERSION:=1.0.2
TAG:=$(VERSION)
include ../make/Makefile.mk
\ No newline at end of file
import time
import argparse
from prometheus_client.core import GaugeMetricFamily, REGISTRY, CounterMetricFamily
from prometheus_client import start_http_server
from tango import Database, DeviceProxy, CmdArgType as ArgType, AttrDataFormat
class CustomCollector(object):
    """Prometheus collector that exports TANGO device attribute values.

    Each call to ``collect`` lists the servers known to the TANGO database,
    reads the attributes of their devices and emits them as samples of the
    ``device_attribute`` gauge, followed by several bookkeeping gauges
    (error counts and attribute counts).

    Attributes:
        replicas: Number of exporter replicas that share the server list.
        replica_id: Zero-based index of this replica.
        db: ``tango.Database`` connection, or ``None`` when it could not be
            opened yet (``collect`` retries the connection).
    """

    def __init__(self):
        self.replicas = 1
        self.replica_id = 0
        try:
            self.db = Database()
        except Exception:
            # Best effort: keep the collector constructible even when the
            # database is unreachable; collect() reconnects on demand.
            self.db = None

    @staticmethod
    def _replica_range(total, replicas, replica_id):
        """Return the ``(start, stop)`` server indices handled by one replica.

        Integer arithmetic makes the ranges of consecutive replicas adjoin
        exactly, so every index in ``range(total)`` belongs to exactly one
        replica and the last replica ends at ``total``. Both bounds are
        clamped to ``total`` so an out-of-range ``replica_id`` yields an
        empty range instead of indexing past the list.
        """
        start = min(total, total * replica_id // replicas)
        stop = min(total, total * (replica_id + 1) // replicas)
        return start, stop

    @staticmethod
    def _type_name(data_type):
        """Map a TANGO data type to the value of the ``type`` metric label.

        Returns ``None`` for data types that are not exported.
        """
        if data_type in (ArgType.DevShort, ArgType.DevLong,
                         ArgType.DevUShort, ArgType.DevULong,
                         ArgType.DevLong64, ArgType.DevULong64,
                         ArgType.DevInt, ArgType.DevFloat,
                         ArgType.DevDouble):
            return 'float'
        if data_type == ArgType.DevBoolean:
            return 'bool'
        if data_type == ArgType.DevString:
            return 'string'
        if data_type == ArgType.DevEnum:
            return 'enum'
        if data_type == ArgType.DevState:
            return 'state'
        return None

    @staticmethod
    def _sample_value(type_name, raw):
        """Convert a raw attribute value into the number stored in the sample."""
        if type_name == 'float':
            return float(raw)
        if type_name == 'string':
            # Strings carry their content in the 'str_value' label; the
            # sample value itself is the constant 1.
            return 1
        return int(raw)

    def _add_sample(self, metric, dev, attr_info, attr_value,
                    str_value, type_name, x, y, raw):
        """Append one sample, with the full label set, to ``metric``."""
        labels = [dev.dev_name(), attr_info.name, attr_info.label,
                  str_value, type_name,
                  str(attr_value.dim_x), str(attr_value.dim_y), x, y]
        metric.add_metric(labels, self._sample_value(type_name, raw))

    def add_to_metric(self, dev, attr_info, metric):
        """Read a scalar attribute and add it to ``metric``.

        Args:
            dev: Device proxy the attribute is read from.
            attr_info: Attribute description (name, label, data type).
            metric: Metric family that receives the sample.

        Returns:
            1 when a sample was added, 0 when the data type is not exported
            (in which case the attribute is not read at all).
        """
        type_name = self._type_name(attr_info.data_type)
        if type_name is None:
            return 0

        enum_labels = None
        if type_name == 'enum':
            enum_labels = dev.get_attribute_config(attr_info.name).enum_labels
        attr_value = dev.read_attribute(attr_info.name)

        if type_name == 'string':
            str_value = str(attr_value.value)
        elif type_name == 'enum':
            # Scalar enums expose their textual label in 'str_value'.
            str_value = str(enum_labels[attr_value.value])
        else:
            str_value = ''
        self._add_sample(metric, dev, attr_info, attr_value,
                         str_value, type_name, '0', '0', attr_value.value)
        return 1

    def add_to_metric_spectrum(self, dev, attr_info, metric):
        """Read a spectrum (1-D) attribute and add one sample per element.

        Args:
            dev: Device proxy the attribute is read from.
            attr_info: Attribute description (name, label, data type).
            metric: Metric family that receives the samples.

        Returns:
            1 when the attribute was exported, 0 when its data type is not
            exported (mirrors ``add_to_metric`` so the caller can count the
            attribute as not managed).
        """
        type_name = self._type_name(attr_info.data_type)
        if type_name is None:
            return 0

        attr_value = dev.read_attribute(attr_info.name)
        for x in range(int(attr_value.dim_x)):
            item = attr_value.value[x]
            str_value = '' if type_name in ('float', 'bool') else str(item)
            self._add_sample(metric, dev, attr_info, attr_value,
                             str_value, type_name, str(x), '0', item)
        return 1

    def add_to_metric_image(self, dev, attr_info, metric):
        """Read an image (2-D) attribute and add one sample per element.

        Args:
            dev: Device proxy the attribute is read from.
            attr_info: Attribute description (name, label, data type).
            metric: Metric family that receives the samples.

        Returns:
            1 when the attribute was exported, 0 when its data type is not
            exported.
        """
        type_name = self._type_name(attr_info.data_type)
        if type_name is None:
            return 0

        attr_value = dev.read_attribute(attr_info.name)
        for y in range(int(attr_value.dim_y)):
            for x in range(int(attr_value.dim_x)):
                item = attr_value.value[y][x]
                str_value = '' if type_name in ('float', 'bool') else str(item)
                self._add_sample(metric, dev, attr_info, attr_value,
                                 str_value, type_name, str(x), str(y), item)
        return 1

    def _get_server_list(self):
        """Return the server list from the database, reconnecting once.

        Returns ``None`` when the list cannot be obtained even after a
        fresh ``Database()`` connection attempt.
        """
        try:
            return self.db.get_server_list()
        except Exception:
            # Also reached when self.db is None; fall through and reconnect.
            pass
        try:
            self.db = Database()
            return self.db.get_server_list()
        except Exception:
            return None

    def collect(self):
        """Yield the metric families for one scrape.

        Yields:
            ``device_attribute`` with one sample per exported attribute
            element, then the gauges ``error_count``, ``error_attr_count``,
            ``attribute_count``, ``attribute_read_count``,
            ``spectrum_attribute_count``, ``image_attribute_count`` and
            ``not_managed_attribute_count``. When the server list cannot be
            obtained, only ``system_errors_count`` is yielded.
        """
        attribute_metrics = GaugeMetricFamily(
            "device_attribute", 'Device attribute value',
            labels=['device', 'name', 'label', 'str_value', 'type',
                    'dim_x', 'dim_y', 'x', 'y'])
        total_count = 0
        read_count = 0
        error_count = 0
        error_attr_count = 0
        spectrum_count = 0
        image_count = 0
        not_managed_attribute_count = 0

        server_list = self._get_server_list()
        if server_list is None:
            sys_errors = GaugeMetricFamily("system_errors_count", 'Total number of system errors acccessing the TANGO system')
            sys_errors.add_metric([], 1)
            # This is a generator: the metric has to be yielded, a plain
            # "return sys_errors" would silently discard it.
            yield sys_errors
            return

        first, last = self._replica_range(
            len(server_list), self.replicas, self.replica_id)
        for i in range(first, last):
            # https://pytango.readthedocs.io/en/stable/database.html#tango.Database.get_device_class_list
            class_list = self.db.get_device_class_list(server_list[i])
            # Entries are visited in steps of two; per the documentation
            # linked above the list alternates device name and class name.
            for j in range(0, len(class_list), 2):
                try:
                    if "dserver" in class_list[j]:
                        continue
                    dev = DeviceProxy(class_list[j])
                    dev.set_timeout_millis(10)
                    attr_list = dev.attribute_list_query()
                    for attr_info in attr_list:
                        try:
                            total_count += 1
                            if attr_info.data_format == AttrDataFormat.SCALAR:
                                res = self.add_to_metric(dev, attr_info, attribute_metrics)
                                if res > 0:
                                    read_count += res
                                else:
                                    not_managed_attribute_count += 1
                            elif attr_info.data_format == AttrDataFormat.SPECTRUM:
                                res = self.add_to_metric_spectrum(dev, attr_info, attribute_metrics)
                                if res <= 0:
                                    not_managed_attribute_count += 1
                                else:
                                    spectrum_count += 1
                                    read_count += 1
                            elif attr_info.data_format == AttrDataFormat.IMAGE:
                                # Image attributes are counted but not
                                # exported; add_to_metric_image is not called.
                                not_managed_attribute_count += 1
                                image_count += 1
                            else:
                                # AttrDataFormat.FMT_UNKNOWN
                                not_managed_attribute_count += 1
                        except Exception:
                            # A failing attribute must not abort the device.
                            error_attr_count += 1
                except Exception:
                    # A failing device must not abort the whole scrape.
                    error_count += 1

        yield attribute_metrics

        summary = (
            ("error_count", 'Total number of errors reading the devices', error_count),
            ("error_attr_count", 'Total number of errors reading the device attributes', error_attr_count),
            ("attribute_count", 'Total number of attributes', total_count),
            ("attribute_read_count", 'Total number of read attributes', read_count),
            ("spectrum_attribute_count", 'Total number of spectrum attributes', spectrum_count),
            ("image_attribute_count", 'Total number of image attributes', image_count),
            ("not_managed_attribute_count", 'Total number of not managed attributes', not_managed_attribute_count),
        )
        for metric_name, documentation, value in summary:
            gauge = GaugeMetricFamily(metric_name, documentation)
            gauge.add_metric([], value)
            yield gauge
if __name__ == '__main__':
    # Script entry point: parse the optional replica settings, expose the
    # collector over HTTP on port 8000 and keep the process alive.
    cli = argparse.ArgumentParser()
    cli.add_argument('-i', '--replica_id', help='Replica ID identification')
    cli.add_argument('-r', '--replicas', help='Replica count')
    options = cli.parse_args()

    collector = CustomCollector()
    # The replica settings are applied only when both options were supplied.
    if options.replica_id and options.replicas:
        collector.replica_id = int(options.replica_id)
        collector.replicas = int(options.replicas)

    start_http_server(8000)
    REGISTRY.register(collector)

    # Sleep in the main thread so the process does not exit.
    while True:
        time.sleep(1)
prometheus_client
\ No newline at end of file
curl $(kubectl get svc -n tango-grafana -o jsonpath='{.items[?(@.metadata.name=="tango-exporter-service-0")].spec.clusterIP}')/metrics
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment