From 509fdeb07556641a984701f6f10ffa37e8a0d866 Mon Sep 17 00:00:00 2001
From: Jan David Mol <mol@astron.nl>
Date: Tue, 16 May 2023 14:09:55 +0200
Subject: [PATCH] L2SS-1323: Ignore status code 255 when determining to restart
 a device, so we only restart on recoverable failures.

---
 bin/fix-tango-exit-status.sh                  | 50 +++++++++++++++++++
 bin/start-ds.sh                               |  2 +-
 docker-compose/device-antennafield.yml        |  2 +-
 docker-compose/device-beamlet.yml             |  2 +-
 docker-compose/device-boot.yml                |  2 +-
 docker-compose/device-bst.yml                 |  2 +-
 docker-compose/device-calibration.yml         |  2 +-
 docker-compose/device-digitalbeam.yml         |  2 +-
 docker-compose/device-docker.yml              |  2 +-
 docker-compose/device-observation-control.yml |  2 +-
 docker-compose/device-observation.yml         |  2 +-
 docker-compose/device-pcon.yml                |  2 +-
 docker-compose/device-psoc.yml                |  2 +-
 docker-compose/device-rcu2h.yml               |  2 +-
 docker-compose/device-rcu2l.yml               |  2 +-
 docker-compose/device-sdp.yml                 |  2 +-
 docker-compose/device-sst.yml                 |  2 +-
 docker-compose/device-station-manager.yml     |  2 +-
 docker-compose/device-temperature-manager.yml |  2 +-
 docker-compose/device-tilebeam.yml            |  2 +-
 docker-compose/device-unb2.yml                |  2 +-
 docker-compose/device-xst.yml                 |  2 +-
 22 files changed, 71 insertions(+), 21 deletions(-)
 create mode 100755 bin/fix-tango-exit-status.sh

diff --git a/bin/fix-tango-exit-status.sh b/bin/fix-tango-exit-status.sh
new file mode 100755
index 000000000..9b775d5f3
--- /dev/null
+++ b/bin/fix-tango-exit-status.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Copyright (C) 2022 ASTRON (Netherlands Institute for Radio Astronomy)
+# SPDX-License-Identifier: Apache-2.0
+#
+# This script wraps Tango Controls device servers such that they return
+# exit code 0 when they should not be restarted. This allows them to
+# be run effectively under the "restart: on-failure" strategy of running
+# Docker containers.
+#
+# Exit code 0 will be returned under the following conditions:
+#  - Process completed successfully (Tango returns exit code 0).
+#  - Process failed but an immediate restart would be useless
+#    (Tango returns exit code 255, "exit(-1)" in its code base).
+#
+# Failures for which an immediate restart of the process would be
+# useless include:
+#   - Device server is not found in the Tango Database
+#   - Mandatory properties of devices are missing in the Tango Database
+#   - Tango Database cannot be reached
+#
+# In all other cases, the non-zero exit code is propagated to Docker,
+# resulting in a restart of the container. This includes for example
+# Segmentation Faults.
+#
+# If the process is not wrapped with this script, the above conditions
+# would result in a flurry of non-stop restarts of the container,
+# thus stressing the system needlessly and spamming the logs.
+
+set -e
+
+# run command in background, so we stay free to trap and forward signals
+"$@" &
+PID=$!
+
+# propagate signals sent to us to the command ("|| true": it may be gone already)
+trap 'kill -INT "$PID" 2>/dev/null || true' INT
+trap 'kill -TERM "$PID" 2>/dev/null || true' TERM
+
+# Wait for the command to finish. "|| RESULT=$?" prevents "set -e" from aborting
+# on a non-zero status before it is inspected. A trapped signal makes "wait"
+# return early (status > 128), so keep waiting while the command is still alive.
+RESULT=0
+wait "$PID" || RESULT=$?
+while kill -0 "$PID" 2>/dev/null; do RESULT=0; wait "$PID" || RESULT=$?; done
+
+# ignore exit status 255, which indicates a fundamental
+# error that is useless to retry execution for.
+if [[ $RESULT -eq 255 ]]; then RESULT=0; fi
+# our exit code is the command's, except that 255 was mapped to 0 above
+exit "$RESULT"
diff --git a/bin/start-ds.sh b/bin/start-ds.sh
index a046c4878..e05e25bee 100755
--- a/bin/start-ds.sh
+++ b/bin/start-ds.sh
@@ -50,4 +50,4 @@ cd "$CWD" || exit 1
 
 # Replace this script's process with the actual command, allowing any signals
 # send to the bash PID to be sent to the command directly.
-exec /usr/local/bin/wait-for-it.sh "$TANGO_HOST" --timeout=30 --strict -- "$@"
+exec /opt/lofar/tango/bin/fix-tango-exit-status.sh /usr/local/bin/wait-for-it.sh "$TANGO_HOST" --timeout=30 --strict -- "$@"
diff --git a/docker-compose/device-antennafield.yml b/docker-compose/device-antennafield.yml
index 527230243..b1f4dc6c0 100644
--- a/docker-compose/device-antennafield.yml
+++ b/docker-compose/device-antennafield.yml
@@ -51,6 +51,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-antennafield AntennaField STAT -v -ORBendPoint giop:tcp:0:5715 -ORBendPointPublish giop:tcp:${HOSTNAME}:5715
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-beamlet.yml b/docker-compose/device-beamlet.yml
index 1cf7dc5a2..def12d9ab 100644
--- a/docker-compose/device-beamlet.yml
+++ b/docker-compose/device-beamlet.yml
@@ -50,6 +50,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-beamlet Beamlet STAT -v -ORBendPoint giop:tcp:0:5712 -ORBendPointPublish giop:tcp:${HOSTNAME}:5712
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-boot.yml b/docker-compose/device-boot.yml
index 847226796..5c4a58689 100644
--- a/docker-compose/device-boot.yml
+++ b/docker-compose/device-boot.yml
@@ -49,6 +49,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-boot Boot STAT -v -ORBendPoint giop:tcp:0:5708 -ORBendPointPublish giop:tcp:${HOSTNAME}:5708
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-bst.yml b/docker-compose/device-bst.yml
index 69d71ff97..c6cd78149 100644
--- a/docker-compose/device-bst.yml
+++ b/docker-compose/device-bst.yml
@@ -55,6 +55,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-bst BST STAT -v -ORBendPoint giop:tcp:0:5717 -ORBendPointPublish giop:tcp:${HOSTNAME}:5717
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-calibration.yml b/docker-compose/device-calibration.yml
index a024ab8a2..3ff6353c5 100644
--- a/docker-compose/device-calibration.yml
+++ b/docker-compose/device-calibration.yml
@@ -52,7 +52,7 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-calibration Calibration STAT -v -ORBendPoint giop:tcp:0:5724 -ORBendPointPublish giop:tcp:${HOSTNAME}:5724
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
     depends_on:
diff --git a/docker-compose/device-digitalbeam.yml b/docker-compose/device-digitalbeam.yml
index 2ad5c7d24..766c4317d 100644
--- a/docker-compose/device-digitalbeam.yml
+++ b/docker-compose/device-digitalbeam.yml
@@ -50,6 +50,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-digitalbeam DigitalBeam STAT -v -ORBendPoint giop:tcp:0:5713 -ORBendPointPublish giop:tcp:${HOSTNAME}:5713
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-docker.yml b/docker-compose/device-docker.yml
index c31130996..bf51663ae 100644
--- a/docker-compose/device-docker.yml
+++ b/docker-compose/device-docker.yml
@@ -52,6 +52,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-docker Docker STAT -v -ORBendPoint giop:tcp:0:5705 -ORBendPointPublish giop:tcp:${HOSTNAME}:5705
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-observation-control.yml b/docker-compose/device-observation-control.yml
index c4a3e4bf0..f5fcaa1a6 100644
--- a/docker-compose/device-observation-control.yml
+++ b/docker-compose/device-observation-control.yml
@@ -49,7 +49,7 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-observationcontrol ObservationControl STAT -v -ORBendPoint giop:tcp:0:5703 -ORBendPointPublish giop:tcp:${HOSTNAME}:5703
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
     depends_on:
diff --git a/docker-compose/device-observation.yml b/docker-compose/device-observation.yml
index 3d4b24168..9fa593a96 100644
--- a/docker-compose/device-observation.yml
+++ b/docker-compose/device-observation.yml
@@ -48,6 +48,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-observation Observation STAT -v -ORBendPoint giop:tcp:0:5718 -ORBendPointPublish giop:tcp:${HOSTNAME}:5718
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-pcon.yml b/docker-compose/device-pcon.yml
index 5b9007691..093c22c10 100644
--- a/docker-compose/device-pcon.yml
+++ b/docker-compose/device-pcon.yml
@@ -45,6 +45,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-pcon pcon STAT -v -ORBendPoint giop:tcp:device-pcon:5720 -ORBendPointPublish giop:tcp:${HOSTNAME}:5720
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-psoc.yml b/docker-compose/device-psoc.yml
index ea3ef27d9..3610a7e34 100644
--- a/docker-compose/device-psoc.yml
+++ b/docker-compose/device-psoc.yml
@@ -45,6 +45,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-psoc PSOC STAT -v -ORBendPoint giop:tcp:device-psoc:5719 -ORBendPointPublish giop:tcp:${HOSTNAME}:5719
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-rcu2h.yml b/docker-compose/device-rcu2h.yml
index 48594945a..8bfcdf2f9 100644
--- a/docker-compose/device-rcu2h.yml
+++ b/docker-compose/device-rcu2h.yml
@@ -50,6 +50,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-rcu2h RCU2H STAT -v -ORBendPoint giop:tcp:device-rcu2h:5725 -ORBendPointPublish giop:tcp:${HOSTNAME}:5725
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-rcu2l.yml b/docker-compose/device-rcu2l.yml
index a499d8b7c..8cb03dd03 100644
--- a/docker-compose/device-rcu2l.yml
+++ b/docker-compose/device-rcu2l.yml
@@ -50,6 +50,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-rcu2l RCU2L STAT -v -ORBendPoint giop:tcp:device-rcu2l:5726 -ORBendPointPublish giop:tcp:${HOSTNAME}:5726
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-sdp.yml b/docker-compose/device-sdp.yml
index f9e45c368..e47185925 100644
--- a/docker-compose/device-sdp.yml
+++ b/docker-compose/device-sdp.yml
@@ -50,6 +50,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-sdp SDP STAT -v -ORBendPoint giop:tcp:device-sdp:5701 -ORBendPointPublish giop:tcp:${HOSTNAME}:5701
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-sst.yml b/docker-compose/device-sst.yml
index 1e430f5bf..2139b3de6 100644
--- a/docker-compose/device-sst.yml
+++ b/docker-compose/device-sst.yml
@@ -55,6 +55,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-sst SST STAT -v -ORBendPoint giop:tcp:0:5702 -ORBendPointPublish giop:tcp:${HOSTNAME}:5702
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-station-manager.yml b/docker-compose/device-station-manager.yml
index 24c124256..fb27bec2d 100644
--- a/docker-compose/device-station-manager.yml
+++ b/docker-compose/device-station-manager.yml
@@ -45,6 +45,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-station-manager StationManager STAT -v -ORBendPoint giop:tcp:device-station-manager:5723 -ORBendPointPublish giop:tcp:${HOSTNAME}:5723
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-temperature-manager.yml b/docker-compose/device-temperature-manager.yml
index 9bb1a4589..a62613803 100644
--- a/docker-compose/device-temperature-manager.yml
+++ b/docker-compose/device-temperature-manager.yml
@@ -45,6 +45,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-temperaturemanager TemperatureManager STAT -v -ORBendPoint giop:tcp:0:5716 -ORBendPointPublish giop:tcp:${HOSTNAME}:5716
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-tilebeam.yml b/docker-compose/device-tilebeam.yml
index 3bd65a9ca..5f36c7b30 100644
--- a/docker-compose/device-tilebeam.yml
+++ b/docker-compose/device-tilebeam.yml
@@ -46,6 +46,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-tilebeam TileBeam STAT -v -ORBendPoint giop:tcp:0:5711 -ORBendPointPublish giop:tcp:${HOSTNAME}:5711
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-unb2.yml b/docker-compose/device-unb2.yml
index ed5a43098..fdfa1938e 100644
--- a/docker-compose/device-unb2.yml
+++ b/docker-compose/device-unb2.yml
@@ -50,6 +50,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-unb2 UNB2 STAT -v -ORBendPoint giop:tcp:device-unb2:5704 -ORBendPointPublish giop:tcp:${HOSTNAME}:5704
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
diff --git a/docker-compose/device-xst.yml b/docker-compose/device-xst.yml
index e064d4eca..bf199067d 100644
--- a/docker-compose/device-xst.yml
+++ b/docker-compose/device-xst.yml
@@ -55,6 +55,6 @@ services:
       # configure CORBA to _listen_ on 0:port, but tell others we're _reachable_ through ${HOSTNAME}:port, since CORBA
       # can't know about our Docker port forwarding
       - l2ss-xst XST STAT -v -ORBendPoint giop:tcp:0:5706 -ORBendPointPublish giop:tcp:${HOSTNAME}:5706
-    restart: unless-stopped
+    restart: on-failure
     stop_signal: SIGINT # request a graceful shutdown of Tango
     stop_grace_period: 2s
-- 
GitLab