From da3a4aabb42d809b1bcea344b5d754a68ffd47cd Mon Sep 17 00:00:00 2001
From: Jan David Mol <mol@astron.nl>
Date: Fri, 4 Dec 2020 09:54:57 +0100
Subject: [PATCH] Only reconnect once if connection fails -- we only want to
 catch exotic errors, not cover up hardware/infra failure

---
 RCUSCC/RCUSCC/RCUSCC.py | 58 +++++++++++++++++++++++++++--------------
 1 file changed, 39 insertions(+), 19 deletions(-)

diff --git a/RCUSCC/RCUSCC/RCUSCC.py b/RCUSCC/RCUSCC/RCUSCC.py
index 73cf614ca..7655e4e7a 100644
--- a/RCUSCC/RCUSCC/RCUSCC.py
+++ b/RCUSCC/RCUSCC/RCUSCC.py
@@ -74,38 +74,34 @@ class OPCUAConnection(Thread):
       Run a connector function in the background, until it succeeds.
     """
 
-    def __init__(self, client, init_func, fault_func, streams, try_interval=2):
+    def __init__(self, client, on_func, fault_func, streams, try_interval=2):
         super().__init__(daemon=True)
 
         self.client = client
-        self.init_func = init_func
+        self.on_func = on_func
         self.fault_func = fault_func
         self.try_interval = try_interval
         self.streams = streams
         self.stopping = False
-
-        self.start()
+        self.connected = False
 
     def _servername(self):
         return self.client.server_url.geturl()
 
-    def try_connect(self):
+    def connect(self):
         try:
             self.streams.debug_stream("Connecting to server %s", self._servername())
             self.client.connect()
+            self.connected = True
             self.streams.debug_stream("Connected to server. Initialising.")
-
-            self.init_func()
-
             return True
         except socket.error as e:
             self.streams.error_stream("Could not connect to server %s: %s", self._servername(), e)
-
-            # signal that we're disconnected
-            self.fault_func()
             return False
 
-    def try_disconnect(self):
+    def disconnect(self):
+        self.connected = False # always force a reconnect, regardless of a successful disconnect
+
         try:
             self.client.disconnect()
         except Exception as e:
@@ -114,8 +110,14 @@ class OPCUAConnection(Thread):
     def run(self):
         while not self.stopping:
             # keep trying to connect
-            while not self.stopping and not self.try_connect(): 
-               time.sleep(self.try_interval)
+            if not self.connected:
+                if self.connect():
+                    self.on_func()
+                else:
+                    # Reconnecting is attempted only once, to ride out exotic network issues. If the infra or hardware
+                    # is down, this device cannot recover by itself and must be reinitialised after that is fixed.
+                    self.fault_func()
+                    return
 
             # keep checking if the connection is still alive
             try:
@@ -126,7 +128,7 @@ class OPCUAConnection(Thread):
                 self.streams.error_stream("Lost connection to server %s: %s", self._servername(), e)
 
                 # technically, we may not have dropped the connection, but encounter a different error. so explicitly disconnect.
-                self.try_disconnect()
+                self.disconnect()
 
                 # signal that we're disconnected
                 self.fault_func()
@@ -139,7 +141,7 @@ class OPCUAConnection(Thread):
         self.stopping = True
         self.join()
 
-        self.try_disconnect()
+        self.disconnect()
 
 class RCUSCC(Device):
     """
@@ -262,7 +264,7 @@ class RCUSCC(Device):
             return DummyNode()
 
 
-    def _init_opcua(self):
+    def _map_attributes(self):
         try:
             self.name_space_index = self.client.get_namespace_index("http://lofar.eu")
         except Exception as e:
@@ -367,7 +369,21 @@ class RCUSCC(Device):
         self.client = opcua.Client("opc.tcp://{}:{}/".format(self.OPC_Server_Name, self.OPC_Server_Port), self.OPC_Time_Out) # timeout in seconds
 
         # Connect to OPC-UA -- will set ON state on success
-        self.opcua_connection = OPCUAConnection(self.client, self._init_opcua, self.Fault, self)
+        self.opcua_connection = OPCUAConnection(self.client, self.On, self.Fault, self)
+
+        if not self.opcua_connection.connect():
+            # hardware or infra is down -- needs fixing first
+            self.Fault()
+            return
+
+        # Retrieve and map server attributes
+        self._map_attributes()
+
+        # Start keep-alive
+        self.opcua_connection.start()
+
+        # Everything went ok -- go online
+        self.On()
 
         # PROTECTED REGION END #    //  RCUSCC.init_device
 
@@ -568,11 +584,15 @@ class RCUSCC(Device):
 
         :return:None
         """
+        # Turn off
         self.set_state(DevState.OFF)
 
-        # stop reconnecting before disconnect
+        # Stop keep-alive
         self.opcua_connection.stop()
 
+        # Turn off again: the keep-alive thread may have called On()/Fault() while reconnecting, before it stopped
+        self.set_state(DevState.OFF)
+
         # PROTECTED REGION END #    //  RCUSCC.Off
 
     @command(
-- 
GitLab