From 9e49fce628d49673b94fe7e7eb8d22aa4b1e0e8c Mon Sep 17 00:00:00 2001 From: Jorrit Schaap <schaap@astron.nl> Date: Wed, 6 Jul 2016 14:49:12 +0000 Subject: [PATCH] Task #9607: added logging in MACScheduler::_updatePlannedList() to identify if the missed observations are due to a potential race-condiction in itsOTDBconnection->getTreeGroup and the timeBeforeStart check --- .../MainCU/src/MACScheduler/MACScheduler.cc | 72 ++++++++++--------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/MAC/APL/MainCU/src/MACScheduler/MACScheduler.cc b/MAC/APL/MainCU/src/MACScheduler/MACScheduler.cc index 268a664f815..2b1be5279aa 100644 --- a/MAC/APL/MainCU/src/MACScheduler/MACScheduler.cc +++ b/MAC/APL/MainCU/src/MACScheduler/MACScheduler.cc @@ -65,7 +65,7 @@ namespace LOFAR { // static (this) pointer used for signal handling static MACScheduler* pMacScheduler = 0; - + // // MACScheduler() // @@ -108,7 +108,7 @@ MACScheduler::MACScheduler() : } } - ASSERTSTR(itsMaxPlanned + itsMaxFinished < MAX_CONCURRENT_OBSERVATIONS, + ASSERTSTR(itsMaxPlanned + itsMaxFinished < MAX_CONCURRENT_OBSERVATIONS, "maxPlannedList + maxFinishedList should be less than " << MAX_CONCURRENT_OBSERVATIONS); // Read the schedule periods for starting observations. @@ -192,12 +192,12 @@ void MACScheduler::_databaseEventHandler(GCFEvent& event) itsQueuePeriod = newVal; } #endif - } + } break; default: break; - } + } } @@ -211,7 +211,7 @@ GCFEvent::TResult MACScheduler::initial_state(GCFEvent& event, GCFPortInterface& LOG_DEBUG_STR ("initial_state:" << eventName(event)); GCFEvent::TResult status = GCFEvent::HANDLED; - + switch (event.signal) { case F_INIT: break; @@ -235,7 +235,7 @@ GCFEvent::TResult MACScheduler::initial_state(GCFEvent& event, GCFPortInterface& itsTimerPort->setTimer(0.0); } break; - + case F_TIMER: { // must be timer that PropSet is enabled. // update PVSS. LOG_TRACE_FLOW ("Updateing state to PVSS"); @@ -249,7 +249,7 @@ GCFEvent::TResult MACScheduler::initial_state(GCFEvent& event, GCFPortInterface& itsPropertySet->setValue(PN_MS_PLANNED_OBSERVATIONS, GCFPVDynArr(LPT_STRING, emptyArr)); itsPropertySet->setValue(PN_MS_FINISHED_OBSERVATIONS, GCFPVDynArr(LPT_STRING, emptyArr)); - + // Try to connect to the SAS database. ParameterSet* pParamSet = globalParameterSet(); string username = pParamSet->getString("OTDBusername"); @@ -261,7 +261,7 @@ GCFEvent::TResult MACScheduler::initial_state(GCFEvent& event, GCFPortInterface& itsOTDBconnection= new OTDBconnection(username, password, DBname, hostname); ASSERTSTR (itsOTDBconnection, "Memory allocation error (OTDB)"); ASSERTSTR (itsOTDBconnection->connect(), - "Unable to connect to database " << DBname << " on " << hostname << + "Unable to connect to database " << DBname << " on " << hostname << " using " << username << "," << password); LOG_INFO ("Connected to the OTDB"); itsPropertySet->setValue(PN_MS_OTDB_CONNECTED, GCFPVBool(true)); @@ -285,12 +285,12 @@ GCFEvent::TResult MACScheduler::initial_state(GCFEvent& event, GCFPortInterface& case F_DISCONNECTED: break; - + default: LOG_DEBUG ("MACScheduler::initial, default"); status = GCFEvent::NOT_HANDLED; break; - } + } return (status); } @@ -320,15 +320,15 @@ GCFEvent::TResult MACScheduler::recover_state(GCFEvent& event, GCFPortInterface& // TODO: do recovery TRAN(MACScheduler::active_state); - + break; } - + default: LOG_DEBUG("recover_state, default"); status = GCFEvent::NOT_HANDLED; break; - } + } return (status); } @@ -349,7 +349,7 @@ GCFEvent::TResult MACScheduler::active_state(GCFEvent& event, GCFPortInterface& break; case F_ENTRY: { - // install my own signal handler. GCFTask also installs a handler so we have + // install my own signal handler. GCFTask also installs a handler so we have // to install our handler later than the GCFTask handler. pMacScheduler = this; signal (SIGINT, MACScheduler::sigintHandler); // ctrl-c @@ -367,12 +367,12 @@ GCFEvent::TResult MACScheduler::active_state(GCFEvent& event, GCFPortInterface& case F_ACCEPT_REQ: break; - case F_CONNECTED: + case F_CONNECTED: // Should be from the (lost) connection with the SD _connectedHandler(port); break; - case F_DISCONNECTED: + case F_DISCONNECTED: // Can be from StartDaemon or ObsController. // StartDaemon: trouble! Try to reconnect asap. // ObsController: ok when obs is finished, BIG TROUBLE when not! @@ -432,7 +432,7 @@ GCFEvent::TResult MACScheduler::active_state(GCFEvent& event, GCFPortInterface& // -------------------- EVENTS FROM CHILDCONTROL -------------------- // - // That must be events from the ObservationControllers that are currently + // That must be events from the ObservationControllers that are currently // started or running. // case CONTROL_STARTED: { @@ -562,16 +562,16 @@ GCFEvent::TResult MACScheduler::finishing_state(GCFEvent& event, GCFPortInterfac itsTimerPort->setTimer(1L); break; } - + case F_TIMER: GCFScheduler::instance()->stop(); break; - + default: LOG_DEBUG("finishing_state, default"); status = GCFEvent::NOT_HANDLED; break; - } + } return (status); } @@ -624,25 +624,27 @@ void MACScheduler::_doOTDBcheck() // void MACScheduler::_updatePlannedList() { - LOG_DEBUG("_updatePlannedList()"); + LOG_INFO("_updatePlannedList()"); // get time info time_t now = time(0); ptime currentTime = from_time_t(now); ASSERTSTR (currentTime != not_a_date_time, "Can't determine systemtime, bailing out"); + LOG_INFO_STR("calling getTreeGroup"); // get new list (list is ordered on starttime) of planned observations - vector<OTDBtree> plannedDBlist = itsOTDBconnection->getTreeGroup(1, itsPlannedPeriod, itsExclPLcluster); + vector<OTDBtree> plannedDBlist = itsOTDBconnection->getTreeGroup(1, itsPlannedPeriod, itsExclPLcluster); + LOG_INFO_STR("called getTreeGroup, nr of items in plannedDBlist: " << plannedDBlist.size()); if (!plannedDBlist.empty()) { - LOG_DEBUG(formatString("OTDBCheck:First planned observation (%d) is at %s (active over %d seconds)", - plannedDBlist[0].treeID(), to_simple_string(plannedDBlist[0].starttime).c_str(), + LOG_DEBUG(formatString("OTDBCheck:First planned observation (%d) is at %s (active over %d seconds)", + plannedDBlist[0].treeID(), to_simple_string(plannedDBlist[0].starttime).c_str(), time_duration(plannedDBlist[0].starttime - currentTime).total_seconds())); } // NOTE: do not exit routine on emptylist: we need to write an empty list to clear the DB // make a copy of the current prepared observations (= observations shown in the navigator in the 'future' - // list). By eliminating the observations that are in the current SAS list we end up (at the end of this function) + // list). By eliminating the observations that are in the current SAS list we end up (at the end of this function) // with a list of observations that were in the SASlist the last time but now not anymore. Normally those observations // will appear in the active-list and will be removed there from the prepared list but WHEN AN OPERATOR CHANGES // THE STATUS MANUALLY into something different (e.g. ON-HOLD) the observation stays in the preparedlist. @@ -672,14 +674,14 @@ void MACScheduler::_updatePlannedList() // must we claim this observation at the claimMgr? OLiter prepIter = itsPreparedObs.find(obsID); - if ((prepIter == itsPreparedObs.end()) || (prepIter->second.prepReady == false) || + if ((prepIter == itsPreparedObs.end()) || (prepIter->second.prepReady == false) || (prepIter->second.modTime != modTime)) { // create a ParameterFile for this Observation TreeMaintenance tm(itsOTDBconnection); OTDBnode topNode = tm.getTopNode(obsID); string filename(observationParset(obsID)); if (!tm.exportTree(obsID, topNode.nodeID(), filename)) { - LOG_ERROR_STR ("Cannot create ParameterSet '" << filename << + LOG_ERROR_STR ("Cannot create ParameterSet '" << filename << "' for new observation. Observation CANNOT BE STARTED!"); } else { @@ -694,10 +696,14 @@ void MACScheduler::_updatePlannedList() // otherwise thing will go wrong in the Navigator plannedArr.push_back(new GCFPVString(obsName)); } - + // should this observation (have) be(en) started? int timeBeforeStart = time_duration(plannedDBlist[idx].starttime - currentTime).total_seconds(); -// LOG_DEBUG_STR(obsName << " starts over " << timeBeforeStart << " seconds"); + LOG_INFO(formatString("%s starts at %s which is in %d seconds", + obsName.c_str(), + to_simple_string(plannedDBlist[idx].starttime).c_str(), + timeBeforeStart)); + if (timeBeforeStart > 0 && timeBeforeStart <= (int)itsQueuePeriod) { if (itsPreparedObs[obsID].prepReady == false) { LOG_INFO_STR("Observation " << obsID << " must be started but is not claimed yet."); @@ -710,22 +716,22 @@ void MACScheduler::_updatePlannedList() string cntlrName(controllerName(CNTLRTYPE_OBSERVATIONCTRL, 0, obsID)); if (itsControllerMap.find(cntlrName) == itsControllerMap.end()) { LOG_INFO_STR("Requesting start of " << cntlrName); - itsChildControl->startChild(CNTLRTYPE_OBSERVATIONCTRL, - obsID, + itsChildControl->startChild(CNTLRTYPE_OBSERVATIONCTRL, + obsID, 0, // instanceNr myHostname(true)); // Note: controller is now in state NO_STATE/CONNECTED (C/R) // add controller to our 'monitor' administration itsControllerMap[cntlrName] = obsID; - LOG_DEBUG_STR("itsControllerMap[" << cntlrName << "]=" << obsID); + LOG_INFO_STR("itsControllerMap[" << cntlrName << "]=" << obsID); if (!itsPreparedObs[obsID].parsetDistributed) { _setParsetOnMsgBus(observationParset(obsID)); itsPreparedObs[obsID].parsetDistributed = true; } } else { - LOG_DEBUG_STR("Observation " << obsID << " is already (being) started"); + LOG_INFO_STR("Observation " << obsID << " is already (being) started"); } } } -- GitLab