From fb57dbef9e7397beda311cb1000dbc25b2967e22 Mon Sep 17 00:00:00 2001
From: Ruud Overeem <overeem@astron.nl>
Date: Fri, 24 Feb 2012 09:10:54 +0000
Subject: [PATCH] Task #3048: Fixed bug that multiple stations are dropped when
 one station is unreachable. Also improved reporting about the status of the
 childcontrollers.

---
 MAC/APL/APLCommon/src/ChildControl.cc         |  7 +--
 .../MainCU/src/MACScheduler/MACScheduler.cc   |  2 +-
 .../ObservationControl/ObservationControl.cc  | 55 +++++++++++++++----
 .../ObservationControl.conf                   | 12 +++-
 .../ObservationControl/ObservationControl.h   | 16 ++++--
 5 files changed, 67 insertions(+), 25 deletions(-)

diff --git a/MAC/APL/APLCommon/src/ChildControl.cc b/MAC/APL/APLCommon/src/ChildControl.cc
index ca39b69466d..f13c9eef585 100644
--- a/MAC/APL/APLCommon/src/ChildControl.cc
+++ b/MAC/APL/APLCommon/src/ChildControl.cc
@@ -1024,13 +1024,12 @@ void ChildControl::_doGarbageCollection()
 		// 1: port == 0: inform main task about removal after retry interval expired
 		// 2: port == -1: remove from list
 		// This is necc. because main task may poll childcontrol for results.
-		if (!iter->port) {
-			restartTimer = true;
-			LOG_DEBUG_STR(time(0)<<"-"<<iter->requestTime<<">="<<itsStartupRetryInterval<<"*"<<itsMaxStartupRetries<<"?");
+		if (!iter->port && (iter->requestedState != CTState::CONNECTED)) {
 			if ((time(0)-iter->requestTime) >= int32(MAC_SCP_TIMEOUT+(itsStartupRetryInterval*itsMaxStartupRetries))) {
 				LOG_DEBUG_STR ("Controller " << iter->cntlrName << " is still unreachable, informing main task");
 				_setEstablishedState(iter->cntlrName, CTState::QUITED, time(0), CT_RESULT_LOST_CONNECTION);
 				iter->port = (GCFPortInterface*) -1;
+				restartTimer = true;
 			}
 
 			iter++;
@@ -1178,7 +1177,6 @@ GCFEvent::TResult	ChildControl::operational(GCFEvent&			event,
 					_setEstablishedState(controller->cntlrName, CTState::ANYSTATE, 
 													time(0), CT_RESULT_LOST_CONNECTION);
 					controller->port = 0;
-
 #if 0
 					// Try to restart the controller over 5 seconds
 					// Add it to the action list.
@@ -1186,7 +1184,6 @@ GCFEvent::TResult	ChildControl::operational(GCFEvent&			event,
 					itsListener->cancelTimer(itsActionTimer);
 					itsActionTimer = itsListener->setTimer(1.0);
 					itsActionList.push_back(*controller);
-
 #endif
 				}
 
diff --git a/MAC/APL/MainCU/src/MACScheduler/MACScheduler.cc b/MAC/APL/MainCU/src/MACScheduler/MACScheduler.cc
index ba4c8750204..3dca4ec2b58 100644
--- a/MAC/APL/MainCU/src/MACScheduler/MACScheduler.cc
+++ b/MAC/APL/MainCU/src/MACScheduler/MACScheduler.cc
@@ -767,7 +767,7 @@ void MACScheduler::_updateFinishedList()
 	GCFPValueArray	finishedArr;
 	int32	freeSpace = MAX_CONCURRENT_OBSERVATIONS - itsNrPlanned - itsNrActive;
 	int32	idx       = finishedDBlist.size() - 1;
-	int32	limit     = idx - (MIN2(MIN2(finishedDBlist.size(), itsMaxFinished), freeSpace) - 1);
+	int32	limit     = idx - (MIN2(MIN2(finishedDBlist.size(), itsMaxFinished), (uint32)freeSpace) - 1);
 	while (idx >= limit)  {
 		// construct name and timings info for observation
 		string		obsName(observationName(finishedDBlist[idx].treeID()));
diff --git a/MAC/APL/MainCU/src/ObservationControl/ObservationControl.cc b/MAC/APL/MainCU/src/ObservationControl/ObservationControl.cc
index 21ac4d867b3..e5d867a0b84 100644
--- a/MAC/APL/MainCU/src/ObservationControl/ObservationControl.cc
+++ b/MAC/APL/MainCU/src/ObservationControl/ObservationControl.cc
@@ -70,6 +70,7 @@ ObservationControl::ObservationControl(const string&	cntlrName) :
 	itsFullReport		(false),
 	itsChangeReport		(false),
 	itsState			(CTState::NOSTATE),
+	itsLastReportedState(CTState::NOSTATE),
 	itsNrControllers  	(0),
 	itsBusyControllers  (0),
 	itsQuitReason		(CT_RESULT_NO_ERROR),
@@ -97,14 +98,18 @@ ObservationControl::ObservationControl(const string&	cntlrName) :
 	itsClaimPeriod   = globalParameterSet()->getTime  ("Observation.claimPeriod");
 	itsPreparePeriod = globalParameterSet()->getTime  ("Observation.preparePeriod");
 
+	// Values from my conf file
+	itsLateLimit     = globalParameterSet()->getTime   ("ObservationControl.lateLimit", 15);
+	itsFailedLimit   = globalParameterSet()->getTime   ("ObservationControl.failedLimit", 30);
+	itsHeartBeatItv	 = globalParameterSet()->getTime   ("ObservationControl.heartbeatInterval", 10);
+	string reportType = globalParameterSet()->getString("ObservationControl.reportType", "Full");
+	if 		(reportType == "Full")		itsFullReport = true;
+	else if (reportType == "Changes")	itsChangeReport = true;
+
 	// My own parameters
 	itsTreePrefix   	 = globalParameterSet()->getString("prefix");
 	itsTreeID			 = globalParameterSet()->getUint32("_treeID");	// !!!
 	itsObsDPname		 = globalParameterSet()->getString("_DPname");
-	itsHeartBeatItv 	 = globalParameterSet()->getTime("heartbeatInterval", 10);
-	string reportType    = globalParameterSet()->getString("reportType", "Full");
-	if 		(reportType == "Full")		itsFullReport = true;
-	else if (reportType == "Changes")	itsChangeReport = true;
 
 	// The time I have to wait for the forced quit depends on the integration time of OLAP
 	string	OLAPpos = globalParameterSet()->locateModule("OLAP");
@@ -747,12 +752,13 @@ void  ObservationControl::doHeartBeatTask()
 #if 1
 	// NOTE: [15122010] Sending respons when first child reached required state.
 	// NOTE: [15122010] WHEN nrChilds = 1 EACH TIME WE COME HERE A REPLY IS SENT!!!!!
-	if (itsBusyControllers == nrChilds-1) {	// first reply received?
+	if ((itsBusyControllers == nrChilds-1) && (itsLastReportedState != itsState)) {	// first reply received?
 		CTState		cts;					// report that state is reached.
 		LOG_INFO_STR("First controller reached required state " << cts.name(cts.stateAck(itsState)) << 
 					 ", informing SAS although it is too early!");
 		sendControlResult(*itsParentPort, cts.signal(cts.stateAck(itsState)), getName(), CT_RESULT_NO_ERROR);
 		setState(cts.stateAck(itsState));
+		itsLastReportedState = itsState;
 	}
 #endif
 
@@ -795,20 +801,24 @@ void ObservationControl::_updateChildInfo(const string& name, CTState::CTstateNr
 {
 	// make sure that quited (and by ChildControl already removed) controllers are updatd also.
 	if (!name.empty() && state != CTState::NOSTATE && itsChildInfo.find(name) != itsChildInfo.end()) {
-		itsChildInfo[name].state = state;
+		itsChildInfo[name].currentState = state;
 		CTState	CTS; 
 		LOG_DEBUG_STR("_updateChildInfo: FORCING " << name << " to " << CTS.name(state));
 		return;
 	}
 
+	// get latest status info
 	vector<ChildControl::StateInfo> childs = itsChildControl->getChildInfo(name, 0, CNTLRTYPE_NO_TYPE);
 	int	nrChilds = childs.size();
 	for (int i = 0; i < nrChilds; i++) {
 		if (itsChildInfo.find(childs[i].name) == itsChildInfo.end()) {	// not in map already?
-			itsChildInfo[childs[i].name] = ChildProc(childs[i].cntlrType, childs[i].currentState);
+			itsChildInfo[childs[i].name] = 
+				ChildProc(childs[i].cntlrType, childs[i].currentState, childs[i].requestedState, childs[i].requestTime);
 		}
 		else {
-			itsChildInfo[childs[i].name].state = (state != CTState::NOSTATE) ? state : childs[i].currentState;
+			itsChildInfo[childs[i].name].currentState   = (state != CTState::NOSTATE) ? state : childs[i].currentState;
+			itsChildInfo[childs[i].name].requestedState = childs[i].requestedState;
+			itsChildInfo[childs[i].name].requestTime    = childs[i].requestTime;
 		}
 	}
 }
@@ -825,15 +835,36 @@ void ObservationControl::_showChildInfo()
 	CTState		CTS;
 	map<string, ChildProc>::iterator	iter = itsChildInfo.begin();
 	map<string, ChildProc>::iterator	end  = itsChildInfo.end();
+	time_t		now(time(0));
 	while (iter != end) {
-		if (itsFullReport || (itsChangeReport && iter->second.state != iter->second.reportedState)) {
-			LOG_INFO(formatString("%-35.35s: %-10.10s", iter->first.c_str(), CTS.name(iter->second.state).c_str()));
-			iter->second.reportedState = iter->second.state;
+		ChildProc*	cp = &(iter->second);
+		LOG_DEBUG_STR(iter->first<<":cur="<<cp->currentState<<",req="<<cp->requestedState<<",rep="
+														<<cp->reportedState<<",late="<<now-cp->requestTime);
+		//    always      OR     child not in requested state       OR     current state not yet reported
+		if (itsFullReport || cp->currentState != cp->requestedState || cp->reportedState != cp->currentState) {
+			string	stateName = CTS.name(cp->currentState);
+			
+			// Selection of loglineType is based on 'late', while starting up correct late for scp timeout
+			// to prevent fault 'not responding' messages.
+			int32	late      = now - cp->requestTime - (cp->requestedState <= CTState::CLAIMED ? MAC_SCP_TIMEOUT : 0);
+			if (late > itsLateLimit && late < itsFailedLimit && cp->currentState < cp->requestedState) {
+				LOG_WARN(formatString("%-35.35s: IS LATE, state= %-10.10s", iter->first.c_str(), stateName.c_str()));
+			}
+			else if (late > itsFailedLimit && cp->reportedState != cp->requestedState && cp->currentState < cp->requestedState) {
+				LOG_FATAL(formatString("%-35.35s: IS NOT RESPONDING, state= %-10.10s", iter->first.c_str(), stateName.c_str()));
+				cp->reportedState = cp->currentState;
+			} 
+			else {
+				LOG_INFO(formatString("%-35.35s: %-10.10s", iter->first.c_str(), stateName.c_str()));
+				if (cp->currentState == cp->requestedState) {
+					cp->reportedState = cp->currentState;
+				}
+			}
 		}
 		iter++;
 	}
 }
-	
+
 //
 // _connectedHandler(port)
 //
diff --git a/MAC/APL/MainCU/src/ObservationControl/ObservationControl.conf b/MAC/APL/MainCU/src/ObservationControl/ObservationControl.conf
index b3b993358bb..69f735d982f 100644
--- a/MAC/APL/MainCU/src/ObservationControl/ObservationControl.conf
+++ b/MAC/APL/MainCU/src/ObservationControl/ObservationControl.conf
@@ -1,13 +1,19 @@
 #
 # Manage amount of logging
 #
-# Full: report state of each childController every heartbeat
+# Full: report state of each childController every heartbeat [default]
 # Changes : report state-changes of the childControllers only
 #
-#reportType = Changes
+#ObservationControl.reportType = Changes
 
 #
 # Intervaltime how often the ObservationController should report the current state
 # and checks its quality measures.
 #
-heartbeatInterval = 10s
+ObservationControl.heartbeatInterval = 10s
+
+#
+# Timeout values to control when a state-change is late or is treated as a failure.
+#
+ObservationControl.lateLimit   = 15s
+ObservationControl.FailedLimit = 30s
diff --git a/MAC/APL/MainCU/src/ObservationControl/ObservationControl.h b/MAC/APL/MainCU/src/ObservationControl/ObservationControl.h
index d22fbdcdf73..2acf4d7417e 100644
--- a/MAC/APL/MainCU/src/ObservationControl/ObservationControl.h
+++ b/MAC/APL/MainCU/src/ObservationControl/ObservationControl.h
@@ -127,17 +127,23 @@ private:
 	// State administration. Note: administration is done by ChildControl, to simplify reports
 	// about the states we keep a copy of it.
 	typedef struct ChildProc {
-		ChildProc() : type(APLCommon::CNTLRTYPE_NO_TYPE), state(CTState::NOSTATE) {};
-		ChildProc(int aType, CTState::CTstateNr aState) : type(aType), state(aState) {};
+		ChildProc() : type(APLCommon::CNTLRTYPE_NO_TYPE), requestTime(0), requestedState(CTState::NOSTATE), 
+					  currentState(CTState::NOSTATE), reportedState(CTState::NOSTATE) {};
+		ChildProc(int aType, CTState::CTstateNr curState, CTState::CTstateNr reqState, time_t reqTime) : 
+					type(aType), requestTime(reqTime), requestedState(reqState), 
+					currentState(curState), reportedState(CTState::NOSTATE) {};
 		uint16				type;
-		CTState::CTstateNr	state;
-		CTState::CTstateNr	reportedState;
+		time_t				requestTime;		// time last statechange was requested
+		CTState::CTstateNr	requestedState;	
+		CTState::CTstateNr	currentState;
+		CTState::CTstateNr	reportedState;		// state eported in logfiles.
 	} ChildProc;
 	map<string, ChildProc>	itsChildInfo;
 	bool					itsFullReport;		// report every child every heartbeat
 	bool					itsChangeReport;	// report only changed states
 
 	CTState::CTstateNr		itsState;
+	CTState::CTstateNr		itsLastReportedState;	// to SAS
 	uint32					itsNrStations;
 	uint32					itsNrOnlineCtrls;
 	uint32					itsNrOfflineCtrls;
@@ -162,6 +168,8 @@ private:
 	uint32					itsForcedQuitDelay;
 	uint32					itsClaimPeriod;
 	uint32					itsPreparePeriod;
+	int32					itsLateLimit;		// after how many seconds a requested state should have been reached.
+	int32					itsFailedLimit;		// after how many seconds a late state change is treated as failure.
 	ptime					itsStartTime;
 	ptime					itsStopTime;
 };
-- 
GitLab