From 509fdeb07556641a984701f6f10ffa37e8a0d866 Mon Sep 17 00:00:00 2001
From: Jan David Mol <mol@astron.nl>
Date: Tue, 16 May 2023 14:09:55 +0200
Subject: [PATCH] L2SS-1323: Ignore status code 255 when determining to restart a device, so we only restart on recoverable failures.

---
 bin/fix-tango-exit-status.sh                  | 58 ++++++++++++++++++++++
 bin/start-ds.sh                               |  2 +-
 docker-compose/device-antennafield.yml        |  2 +-
 docker-compose/device-beamlet.yml             |  2 +-
 docker-compose/device-boot.yml                |  2 +-
 docker-compose/device-bst.yml                 |  2 +-
 docker-compose/device-calibration.yml         |  2 +-
 docker-compose/device-digitalbeam.yml         |  2 +-
 docker-compose/device-docker.yml              |  2 +-
 docker-compose/device-observation-control.yml |  2 +-
 docker-compose/device-observation.yml         |  2 +-
 docker-compose/device-pcon.yml                |  2 +-
 docker-compose/device-psoc.yml                |  2 +-
 docker-compose/device-rcu2h.yml               |  2 +-
 docker-compose/device-rcu2l.yml               |  2 +-
 docker-compose/device-sdp.yml                 |  2 +-
 docker-compose/device-sst.yml                 |  2 +-
 docker-compose/device-station-manager.yml     |  2 +-
 docker-compose/device-temperature-manager.yml |  2 +-
 docker-compose/device-tilebeam.yml            |  2 +-
 docker-compose/device-unb2.yml                |  2 +-
 docker-compose/device-xst.yml                 |  2 +-
 22 files changed, 79 insertions(+), 21 deletions(-)
 create mode 100755 bin/fix-tango-exit-status.sh

diff --git a/bin/fix-tango-exit-status.sh b/bin/fix-tango-exit-status.sh
new file mode 100755
index 000000000..9b775d5f3
--- /dev/null
+++ b/bin/fix-tango-exit-status.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# Copyright (C) 2022 ASTRON (Netherlands Institute for Radio Astronomy)
+# SPDX-License-Identifier: Apache-2.0
+#
+# This script wraps Tango Controls device servers such that they return
+# exit code 0 when they should not be restarted. This allows them to
+# be run effectively under the "restart: on-failure" strategy of running
+# Docker containers.
+#
+# Exit code 0 will be returned under the following conditions:
+#   - Process completed successfully (Tango returns exit code 0).
+#   - Process failed but an immediate restart would be useless
+#     (Tango returns exit code 255, "exit(-1)" in its code base).
+#
+# Failures for which an immediate restart of the process would be
+# useless include:
+#   - Device server is not found in the Tango Database
+#   - Mandatory properties of devices are missing in the Tango Database
+#   - Tango Database cannot be reached
+#
+# In all other cases, the non-zero exit code is propagated to Docker,
+# resulting in a restart of the container. This includes for example
+# Segmentation Faults.
+#
+# If the process is not wrapped with this script, the above conditions
+# would result in a flurry of non-stop restarts of the container,
+# thus stressing the system needlessly and spamming the logs.
+
+set -e
+
+# run command in background
+"$@" &
+PID=$!
+
+# propagate signals sent to us to the command. "|| true" keeps "set -e"
+# from aborting us if the command has already exited.
+trap 'kill -INT "$PID" 2>/dev/null || true' INT
+trap 'kill -TERM "$PID" 2>/dev/null || true' TERM
+
+# wait for the command to finish.
+#   - "|| RESULT=$?" records a non-zero status instead of letting "set -e"
+#     abort this script before the status can be inspected.
+#   - a trapped signal makes "wait" return early (status > 128) while the
+#     command may still be shutting down, so wait again while it is alive.
+while true; do
+  RESULT=0
+  wait "$PID" || RESULT=$?
+  kill -0 "$PID" 2>/dev/null || break
+done
+
+if [[ $RESULT -eq 255 ]]; then
+  # ignore exit status 255, which indicates a fundamental
+  # error that is useless to retry execution for.
+  RESULT=0
+fi
+
+# our exit code is the command's
+exit "$RESULT"
diff --git a/bin/start-ds.sh b/bin/start-ds.sh
index a046c4878..e05e25bee 100755
--- a/bin/start-ds.sh
+++ b/bin/start-ds.sh
@@ -50,4 +50,4 @@ cd "$CWD" || exit 1
 
 # Replace this script's process with the actual command, allowing any signals
 # send to the bash PID to be sent to the command directly.
-exec /usr/local/bin/wait-for-it.sh "$TANGO_HOST" --timeout=30 --strict -- "$@" +exec /opt/lofar/tango/bin/fix-tango-exit-status.sh /usr/local/bin/wait-for-it.sh "$TANGO_HOST" --timeout=30 --strict -- "$@" diff --git a/docker-compose/device-antennafield.yml b/docker-compose/device-antennafield.yml index 527230243..b1f4dc6c0 100644 --- a/docker-compose/device-antennafield.yml +++ b/docker-compose/device-antennafield.yml @@ -51,6 +51,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-antennafield AntennaField STAT -v -ORBendPoint giop:tcp:0:5715 -ORBendPointPublish giop:tcp:${HOSTNAME}:5715 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git a/docker-compose/device-beamlet.yml b/docker-compose/device-beamlet.yml index 1cf7dc5a2..def12d9ab 100644 --- a/docker-compose/device-beamlet.yml +++ b/docker-compose/device-beamlet.yml @@ -50,6 +50,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-beamlet Beamlet STAT -v -ORBendPoint giop:tcp:0:5712 -ORBendPointPublish giop:tcp:${HOSTNAME}:5712 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git a/docker-compose/device-boot.yml b/docker-compose/device-boot.yml index 847226796..5c4a58689 100644 --- a/docker-compose/device-boot.yml +++ b/docker-compose/device-boot.yml @@ -49,6 +49,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-boot Boot STAT -v -ORBendPoint giop:tcp:0:5708 -ORBendPointPublish giop:tcp:${HOSTNAME}:5708 - restart: unless-stopped + restart: on-failure 
stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git a/docker-compose/device-bst.yml b/docker-compose/device-bst.yml index 69d71ff97..c6cd78149 100644 --- a/docker-compose/device-bst.yml +++ b/docker-compose/device-bst.yml @@ -55,6 +55,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-bst BST STAT -v -ORBendPoint giop:tcp:0:5717 -ORBendPointPublish giop:tcp:${HOSTNAME}:5717 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git a/docker-compose/device-calibration.yml b/docker-compose/device-calibration.yml index a024ab8a2..3ff6353c5 100644 --- a/docker-compose/device-calibration.yml +++ b/docker-compose/device-calibration.yml @@ -52,7 +52,7 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-calibration Calibration STAT -v -ORBendPoint giop:tcp:0:5724 -ORBendPointPublish giop:tcp:${HOSTNAME}:5724 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s depends_on: diff --git a/docker-compose/device-digitalbeam.yml b/docker-compose/device-digitalbeam.yml index 2ad5c7d24..766c4317d 100644 --- a/docker-compose/device-digitalbeam.yml +++ b/docker-compose/device-digitalbeam.yml @@ -50,6 +50,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-digitalbeam DigitalBeam STAT -v -ORBendPoint giop:tcp:0:5713 -ORBendPointPublish giop:tcp:${HOSTNAME}:5713 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git 
a/docker-compose/device-docker.yml b/docker-compose/device-docker.yml index c31130996..bf51663ae 100644 --- a/docker-compose/device-docker.yml +++ b/docker-compose/device-docker.yml @@ -52,6 +52,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-docker Docker STAT -v -ORBendPoint giop:tcp:0:5705 -ORBendPointPublish giop:tcp:${HOSTNAME}:5705 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git a/docker-compose/device-observation-control.yml b/docker-compose/device-observation-control.yml index c4a3e4bf0..f5fcaa1a6 100644 --- a/docker-compose/device-observation-control.yml +++ b/docker-compose/device-observation-control.yml @@ -49,7 +49,7 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-observationcontrol ObservationControl STAT -v -ORBendPoint giop:tcp:0:5703 -ORBendPointPublish giop:tcp:${HOSTNAME}:5703 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s depends_on: diff --git a/docker-compose/device-observation.yml b/docker-compose/device-observation.yml index 3d4b24168..9fa593a96 100644 --- a/docker-compose/device-observation.yml +++ b/docker-compose/device-observation.yml @@ -48,6 +48,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-observation Observation STAT -v -ORBendPoint giop:tcp:0:5718 -ORBendPointPublish giop:tcp:${HOSTNAME}:5718 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git a/docker-compose/device-pcon.yml 
b/docker-compose/device-pcon.yml index 5b9007691..093c22c10 100644 --- a/docker-compose/device-pcon.yml +++ b/docker-compose/device-pcon.yml @@ -45,6 +45,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-pcon pcon STAT -v -ORBendPoint giop:tcp:device-pcon:5720 -ORBendPointPublish giop:tcp:${HOSTNAME}:5720 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git a/docker-compose/device-psoc.yml b/docker-compose/device-psoc.yml index ea3ef27d9..3610a7e34 100644 --- a/docker-compose/device-psoc.yml +++ b/docker-compose/device-psoc.yml @@ -45,6 +45,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-psoc PSOC STAT -v -ORBendPoint giop:tcp:device-psoc:5719 -ORBendPointPublish giop:tcp:${HOSTNAME}:5719 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git a/docker-compose/device-rcu2h.yml b/docker-compose/device-rcu2h.yml index 48594945a..8bfcdf2f9 100644 --- a/docker-compose/device-rcu2h.yml +++ b/docker-compose/device-rcu2h.yml @@ -50,6 +50,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-rcu2h RCU2H STAT -v -ORBendPoint giop:tcp:device-rcu2h:5725 -ORBendPointPublish giop:tcp:${HOSTNAME}:5725 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git a/docker-compose/device-rcu2l.yml b/docker-compose/device-rcu2l.yml index a499d8b7c..8cb03dd03 100644 --- a/docker-compose/device-rcu2l.yml +++ b/docker-compose/device-rcu2l.yml @@ -50,6 +50,6 
@@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-rcu2l RCU2L STAT -v -ORBendPoint giop:tcp:device-rcu2l:5726 -ORBendPointPublish giop:tcp:${HOSTNAME}:5726 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git a/docker-compose/device-sdp.yml b/docker-compose/device-sdp.yml index f9e45c368..e47185925 100644 --- a/docker-compose/device-sdp.yml +++ b/docker-compose/device-sdp.yml @@ -50,6 +50,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-sdp SDP STAT -v -ORBendPoint giop:tcp:device-sdp:5701 -ORBendPointPublish giop:tcp:${HOSTNAME}:5701 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git a/docker-compose/device-sst.yml b/docker-compose/device-sst.yml index 1e430f5bf..2139b3de6 100644 --- a/docker-compose/device-sst.yml +++ b/docker-compose/device-sst.yml @@ -55,6 +55,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-sst SST STAT -v -ORBendPoint giop:tcp:0:5702 -ORBendPointPublish giop:tcp:${HOSTNAME}:5702 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git a/docker-compose/device-station-manager.yml b/docker-compose/device-station-manager.yml index 24c124256..fb27bec2d 100644 --- a/docker-compose/device-station-manager.yml +++ b/docker-compose/device-station-manager.yml @@ -45,6 +45,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know 
about our Docker port forwarding - l2ss-station-manager StationManager STAT -v -ORBendPoint giop:tcp:device-station-manager:5723 -ORBendPointPublish giop:tcp:${HOSTNAME}:5723 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git a/docker-compose/device-temperature-manager.yml b/docker-compose/device-temperature-manager.yml index 9bb1a4589..a62613803 100644 --- a/docker-compose/device-temperature-manager.yml +++ b/docker-compose/device-temperature-manager.yml @@ -45,6 +45,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-temperaturemanager TemperatureManager STAT -v -ORBendPoint giop:tcp:0:5716 -ORBendPointPublish giop:tcp:${HOSTNAME}:5716 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git a/docker-compose/device-tilebeam.yml b/docker-compose/device-tilebeam.yml index 3bd65a9ca..5f36c7b30 100644 --- a/docker-compose/device-tilebeam.yml +++ b/docker-compose/device-tilebeam.yml @@ -46,6 +46,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-tilebeam TileBeam STAT -v -ORBendPoint giop:tcp:0:5711 -ORBendPointPublish giop:tcp:${HOSTNAME}:5711 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git a/docker-compose/device-unb2.yml b/docker-compose/device-unb2.yml index ed5a43098..fdfa1938e 100644 --- a/docker-compose/device-unb2.yml +++ b/docker-compose/device-unb2.yml @@ -50,6 +50,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - 
l2ss-unb2 UNB2 STAT -v -ORBendPoint giop:tcp:device-unb2:5704 -ORBendPointPublish giop:tcp:${HOSTNAME}:5704 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s diff --git a/docker-compose/device-xst.yml b/docker-compose/device-xst.yml index e064d4eca..bf199067d 100644 --- a/docker-compose/device-xst.yml +++ b/docker-compose/device-xst.yml @@ -55,6 +55,6 @@ services: # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA # can't know about our Docker port forwarding - l2ss-xst XST STAT -v -ORBendPoint giop:tcp:0:5706 -ORBendPointPublish giop:tcp:${HOSTNAME}:5706 - restart: unless-stopped + restart: on-failure stop_signal: SIGINT # request a graceful shutdown of Tango stop_grace_period: 2s -- GitLab