From fb57dbef9e7397beda311cb1000dbc25b2967e22 Mon Sep 17 00:00:00 2001 From: Ruud Overeem <overeem@astron.nl> Date: Fri, 24 Feb 2012 09:10:54 +0000 Subject: [PATCH] Task #3048: Fixed bug that multiple stations are dropped when one station is unreachable. Also improved reporting about the status of the childcontrollers. --- MAC/APL/APLCommon/src/ChildControl.cc | 7 +-- .../MainCU/src/MACScheduler/MACScheduler.cc | 2 +- .../ObservationControl/ObservationControl.cc | 55 +++++++++++++++---- .../ObservationControl.conf | 12 +++- .../ObservationControl/ObservationControl.h | 16 ++++-- 5 files changed, 67 insertions(+), 25 deletions(-) diff --git a/MAC/APL/APLCommon/src/ChildControl.cc b/MAC/APL/APLCommon/src/ChildControl.cc index ca39b69466d..f13c9eef585 100644 --- a/MAC/APL/APLCommon/src/ChildControl.cc +++ b/MAC/APL/APLCommon/src/ChildControl.cc @@ -1024,13 +1024,12 @@ void ChildControl::_doGarbageCollection() // 1: port == 0: inform main task about removal after retry interval expired // 2: port == -1: remove from list // This is necc. because main task may poll childcontrol for results. - if (!iter->port) { - restartTimer = true; - LOG_DEBUG_STR(time(0)<<"-"<<iter->requestTime<<">="<<itsStartupRetryInterval<<"*"<<itsMaxStartupRetries<<"?"); + if (!iter->port && (iter->requestedState != CTState::CONNECTED)) { if ((time(0)-iter->requestTime) >= int32(MAC_SCP_TIMEOUT+(itsStartupRetryInterval*itsMaxStartupRetries))) { LOG_DEBUG_STR ("Controller " << iter->cntlrName << " is still unreachable, informing main task"); _setEstablishedState(iter->cntlrName, CTState::QUITED, time(0), CT_RESULT_LOST_CONNECTION); iter->port = (GCFPortInterface*) -1; + restartTimer = true; } iter++; @@ -1178,7 +1177,6 @@ GCFEvent::TResult ChildControl::operational(GCFEvent& event, _setEstablishedState(controller->cntlrName, CTState::ANYSTATE, time(0), CT_RESULT_LOST_CONNECTION); controller->port = 0; - #if 0 // Try to restart the controller over 5 seconds // Add it to the action list. @@ -1186,7 +1184,6 @@ GCFEvent::TResult ChildControl::operational(GCFEvent& event, itsListener->cancelTimer(itsActionTimer); itsActionTimer = itsListener->setTimer(1.0); itsActionList.push_back(*controller); - #endif } diff --git a/MAC/APL/MainCU/src/MACScheduler/MACScheduler.cc b/MAC/APL/MainCU/src/MACScheduler/MACScheduler.cc index ba4c8750204..3dca4ec2b58 100644 --- a/MAC/APL/MainCU/src/MACScheduler/MACScheduler.cc +++ b/MAC/APL/MainCU/src/MACScheduler/MACScheduler.cc @@ -767,7 +767,7 @@ void MACScheduler::_updateFinishedList() GCFPValueArray finishedArr; int32 freeSpace = MAX_CONCURRENT_OBSERVATIONS - itsNrPlanned - itsNrActive; int32 idx = finishedDBlist.size() - 1; - int32 limit = idx - (MIN2(MIN2(finishedDBlist.size(), itsMaxFinished), freeSpace) - 1); + int32 limit = idx - (MIN2(MIN2(finishedDBlist.size(), itsMaxFinished), (uint32)freeSpace) - 1); while (idx >= limit) { // construct name and timings info for observation string obsName(observationName(finishedDBlist[idx].treeID())); diff --git a/MAC/APL/MainCU/src/ObservationControl/ObservationControl.cc b/MAC/APL/MainCU/src/ObservationControl/ObservationControl.cc index 21ac4d867b3..e5d867a0b84 100644 --- a/MAC/APL/MainCU/src/ObservationControl/ObservationControl.cc +++ b/MAC/APL/MainCU/src/ObservationControl/ObservationControl.cc @@ -70,6 +70,7 @@ ObservationControl::ObservationControl(const string& cntlrName) : itsFullReport (false), itsChangeReport (false), itsState (CTState::NOSTATE), + itsLastReportedState(CTState::NOSTATE), itsNrControllers (0), itsBusyControllers (0), itsQuitReason (CT_RESULT_NO_ERROR), @@ -97,14 +98,18 @@ ObservationControl::ObservationControl(const string& cntlrName) : itsClaimPeriod = globalParameterSet()->getTime ("Observation.claimPeriod"); itsPreparePeriod = globalParameterSet()->getTime ("Observation.preparePeriod"); + // Values from my conf file + itsLateLimit = globalParameterSet()->getTime ("ObservationControl.lateLimit", 15); + itsFailedLimit = globalParameterSet()->getTime ("ObservationControl.failedLimit", 30); + itsHeartBeatItv = globalParameterSet()->getTime ("ObservationControl.heartbeatInterval", 10); + string reportType = globalParameterSet()->getString("ObservationControl.reportType", "Full"); + if (reportType == "Full") itsFullReport = true; + else if (reportType == "Changes") itsChangeReport = true; + // My own parameters itsTreePrefix = globalParameterSet()->getString("prefix"); itsTreeID = globalParameterSet()->getUint32("_treeID"); // !!! itsObsDPname = globalParameterSet()->getString("_DPname"); - itsHeartBeatItv = globalParameterSet()->getTime("heartbeatInterval", 10); - string reportType = globalParameterSet()->getString("reportType", "Full"); - if (reportType == "Full") itsFullReport = true; - else if (reportType == "Changes") itsChangeReport = true; // The time I have to wait for the forced quit depends on the integration time of OLAP string OLAPpos = globalParameterSet()->locateModule("OLAP"); @@ -747,12 +752,13 @@ void ObservationControl::doHeartBeatTask() #if 1 // NOTE: [15122010] Sending respons when first child reached required state. // NOTE: [15122010] WHEN nrChilds = 1 EACH TIME WE COME HERE A REPLY IS SENT!!!!! - if (itsBusyControllers == nrChilds-1) { // first reply received? + if ((itsBusyControllers == nrChilds-1) && (itsLastReportedState != itsState)) { // first reply received? CTState cts; // report that state is reached. LOG_INFO_STR("First controller reached required state " << cts.name(cts.stateAck(itsState)) << ", informing SAS although it is too early!"); sendControlResult(*itsParentPort, cts.signal(cts.stateAck(itsState)), getName(), CT_RESULT_NO_ERROR); setState(cts.stateAck(itsState)); + itsLastReportedState = itsState; } #endif @@ -795,20 +801,24 @@ void ObservationControl::_updateChildInfo(const string& name, CTState::CTstateNr { // make sure that quited (and by ChildControl already removed) controllers are updatd also. if (!name.empty() && state != CTState::NOSTATE && itsChildInfo.find(name) != itsChildInfo.end()) { - itsChildInfo[name].state = state; + itsChildInfo[name].currentState = state; CTState CTS; LOG_DEBUG_STR("_updateChildInfo: FORCING " << name << " to " << CTS.name(state)); return; } + // get latest status info vector<ChildControl::StateInfo> childs = itsChildControl->getChildInfo(name, 0, CNTLRTYPE_NO_TYPE); int nrChilds = childs.size(); for (int i = 0; i < nrChilds; i++) { if (itsChildInfo.find(childs[i].name) == itsChildInfo.end()) { // not in map already? - itsChildInfo[childs[i].name] = ChildProc(childs[i].cntlrType, childs[i].currentState); + itsChildInfo[childs[i].name] = + ChildProc(childs[i].cntlrType, childs[i].currentState, childs[i].requestedState, childs[i].requestTime); } else { - itsChildInfo[childs[i].name].state = (state != CTState::NOSTATE) ? state : childs[i].currentState; + itsChildInfo[childs[i].name].currentState = (state != CTState::NOSTATE) ? state : childs[i].currentState; + itsChildInfo[childs[i].name].requestedState = childs[i].requestedState; + itsChildInfo[childs[i].name].requestTime = childs[i].requestTime; } } } @@ -825,15 +835,36 @@ void ObservationControl::_showChildInfo() CTState CTS; map<string, ChildProc>::iterator iter = itsChildInfo.begin(); map<string, ChildProc>::iterator end = itsChildInfo.end(); + time_t now(time(0)); while (iter != end) { - if (itsFullReport || (itsChangeReport && iter->second.state != iter->second.reportedState)) { - LOG_INFO(formatString("%-35.35s: %-10.10s", iter->first.c_str(), CTS.name(iter->second.state).c_str())); - iter->second.reportedState = iter->second.state; + ChildProc* cp = &(iter->second); + LOG_DEBUG_STR(iter->first<<":cur="<<cp->currentState<<",req="<<cp->requestedState<<",rep=" + <<cp->reportedState<<",late="<<now-cp->requestTime); + // always OR child not in requested state OR current state not yet reported + if (itsFullReport || cp->currentState != cp->requestedState || cp->reportedState != cp->currentState) { + string stateName = CTS.name(cp->currentState); + + // Selection of loglineType is based on 'late', while starting up correct late for scp timeout + // to prevent fault 'not responding' messages. + int32 late = now - cp->requestTime - (cp->requestedState <= CTState::CLAIMED ? MAC_SCP_TIMEOUT : 0); + if (late > itsLateLimit && late < itsFailedLimit && cp->currentState < cp->requestedState) { + LOG_WARN(formatString("%-35.35s: IS LATE, state= %-10.10s", iter->first.c_str(), stateName.c_str())); + } + else if (late > itsFailedLimit && cp->reportedState != cp->requestedState && cp->currentState < cp->requestedState) { + LOG_FATAL(formatString("%-35.35s: IS NOT RESPONDING, state= %-10.10s", iter->first.c_str(), stateName.c_str())); + cp->reportedState = cp->currentState; + } + else { + LOG_INFO(formatString("%-35.35s: %-10.10s", iter->first.c_str(), stateName.c_str())); + if (cp->currentState == cp->requestedState) { + cp->reportedState = cp->currentState; + } + } } iter++; } } - + // // _connectedHandler(port) // diff --git a/MAC/APL/MainCU/src/ObservationControl/ObservationControl.conf b/MAC/APL/MainCU/src/ObservationControl/ObservationControl.conf index b3b993358bb..69f735d982f 100644 --- a/MAC/APL/MainCU/src/ObservationControl/ObservationControl.conf +++ b/MAC/APL/MainCU/src/ObservationControl/ObservationControl.conf @@ -1,13 +1,19 @@ # # Manage amount of logging # -# Full: report state of each childController every heartbeat +# Full: report state of each childController every heartbeat [default] # Changes : report state-changes of the childControllers only # -#reportType = Changes +#ObservationControl.reportType = Changes # # Intervaltime how often the ObservationController should report the current state # and checks its quality measures. # -heartbeatInterval = 10s +ObservationControl.heartbeatInterval = 10s + +# +# Timeout values to control when a state-change is late or is treated as a failure. +# +ObservationControl.lateLimit = 15s +ObservationControl.FailedLimit = 30s diff --git a/MAC/APL/MainCU/src/ObservationControl/ObservationControl.h b/MAC/APL/MainCU/src/ObservationControl/ObservationControl.h index d22fbdcdf73..2acf4d7417e 100644 --- a/MAC/APL/MainCU/src/ObservationControl/ObservationControl.h +++ b/MAC/APL/MainCU/src/ObservationControl/ObservationControl.h @@ -127,17 +127,23 @@ private: // State administration. Note: administration is done by ChildControl, to simplify reports // about the states we keep a copy of it. typedef struct ChildProc { - ChildProc() : type(APLCommon::CNTLRTYPE_NO_TYPE), state(CTState::NOSTATE) {}; - ChildProc(int aType, CTState::CTstateNr aState) : type(aType), state(aState) {}; + ChildProc() : type(APLCommon::CNTLRTYPE_NO_TYPE), requestTime(0), requestedState(CTState::NOSTATE), + currentState(CTState::NOSTATE), reportedState(CTState::NOSTATE) {}; + ChildProc(int aType, CTState::CTstateNr curState, CTState::CTstateNr reqState, time_t reqTime) : + type(aType), requestTime(reqTime), requestedState(reqState), + currentState(curState), reportedState(CTState::NOSTATE) {}; uint16 type; - CTState::CTstateNr state; - CTState::CTstateNr reportedState; + time_t requestTime; // time last statechange was requested + CTState::CTstateNr requestedState; + CTState::CTstateNr currentState; + CTState::CTstateNr reportedState; // state eported in logfiles. } ChildProc; map<string, ChildProc> itsChildInfo; bool itsFullReport; // report every child every heartbeat bool itsChangeReport; // report only changed states CTState::CTstateNr itsState; + CTState::CTstateNr itsLastReportedState; // to SAS uint32 itsNrStations; uint32 itsNrOnlineCtrls; uint32 itsNrOfflineCtrls; @@ -162,6 +168,8 @@ private: uint32 itsForcedQuitDelay; uint32 itsClaimPeriod; uint32 itsPreparePeriod; + int32 itsLateLimit; // after how many seconds a requested state should have been reached. + int32 itsFailedLimit; // after how many seconds a late state change is treated as failure. ptime itsStartTime; ptime itsStopTime; }; -- GitLab