From 6fdcff71bc3e5935e567b265390e9e4eb2ccb057 Mon Sep 17 00:00:00 2001 From: Ruud Overeem <overeem@astron.nl> Date: Thu, 28 Apr 2011 12:21:33 +0000 Subject: [PATCH] Bug 1000: + Update to PythonControl to allow clients that can not communicate. + Added 'result' field to CONTROL_CONNECT message to pass failures. + Fixed stopping garbagecontroller timer in ChildControl + Fixed emergency stop timer in ParentControl + ObservationControl reports more detailed how an observation ends. --- .../include/APL/APLCommon/ControllerDefines.h | 1 + MAC/APL/APLCommon/src/ChildControl.cc | 23 +++--- .../APLCommon/src/Controller_Protocol.prot | 4 + MAC/APL/APLCommon/src/ParentControl.cc | 17 ++-- .../CEPCU/src/PythonControl/PythonControl.cc | 77 +++++++++++++++---- .../CEPCU/src/PythonControl/PythonControl.h | 2 + .../MainCU/src/MACScheduler/MACScheduler.cc | 4 +- .../ObservationControl/ObservationControl.cc | 45 ++++++++--- .../src/ClockControl/ClockControl.cc | 2 +- MAC/Deployment/data/OTDB/PythonControl.comp | 1 + 10 files changed, 133 insertions(+), 43 deletions(-) diff --git a/MAC/APL/APLCommon/include/APL/APLCommon/ControllerDefines.h b/MAC/APL/APLCommon/include/APL/APLCommon/ControllerDefines.h index 0ee00ff74ef..1641cb3a2f1 100644 --- a/MAC/APL/APLCommon/include/APL/APLCommon/ControllerDefines.h +++ b/MAC/APL/APLCommon/include/APL/APLCommon/ControllerDefines.h @@ -51,6 +51,7 @@ enum CT_RESULT_OUT_OF_SYNC, CT_RESULT_OBS_CONFLICT, CT_RESULT_MANUAL_ABORT, + CT_RESULT_MANUAL_REMOVED, CT_RESULT_NO_PARSET, CT_RESULT_UNKNOWN_OBSERVATION, CT_RESULT_UNSPECIFIED diff --git a/MAC/APL/APLCommon/src/ChildControl.cc b/MAC/APL/APLCommon/src/ChildControl.cc index 3ace53bfd35..23dcdbd904b 100644 --- a/MAC/APL/APLCommon/src/ChildControl.cc +++ b/MAC/APL/APLCommon/src/ChildControl.cc @@ -966,23 +966,19 @@ void ChildControl::_doGarbageCollection() CIiter iter = itsCntlrList->begin(); const_CIiter end = itsCntlrList->end(); + bool restartTimer(false); while (iter != end) { // Note: Removing a controller is done in two stages. // 1: port == 0: inform main task about removal after retry interval expired // 2: port == -1: remove from list // This is necc. because main task may poll childcontrol for results. if (!iter->port) { + restartTimer = true; LOG_DEBUG_STR(time(0)<<"-"<<iter->requestTime<<">="<<itsStartupRetryInterval<<"*"<<itsMaxStartupRetries<<"?"); if ((uint32(time(0)-iter->requestTime)) >= itsStartupRetryInterval*itsMaxStartupRetries) { LOG_DEBUG_STR ("Controller " << iter->cntlrName << " is still unreachable, informing main task"); _setEstablishedState(iter->cntlrName, CTState::QUITED, time(0), CT_RESULT_LOST_CONNECTION); iter->port = (GCFPortInterface*) -1; - // start timer for second stage. - if (itsGarbageTimer) { - itsTimerPort.cancelTimer(itsGarbageTimer); - } - itsGarbageTimer = itsTimerPort.setTimer(1.0 * itsGarbageInterval); - LOG_DEBUG_STR("GarbageTimer = " << itsGarbageTimer); } iter++; } else if (iter->port == (GCFPortInterface*)-1) { @@ -995,6 +991,15 @@ void ChildControl::_doGarbageCollection() iter++; } } + + // restart the garbage timer when more work needs to be done in the future. + if (restartTimer) { + if (itsGarbageTimer) { + itsTimerPort.cancelTimer(itsGarbageTimer); + } + itsGarbageTimer = itsTimerPort.setTimer(1.0 * itsGarbageInterval); + LOG_DEBUG_STR("GarbageTimer = " << itsGarbageTimer); + } } // -------------------- STATE MACHINES FOR GCFTASK -------------------- @@ -1206,14 +1211,12 @@ GCFEvent::TResult ChildControl::operational(GCFEvent& event, CIiter controller = findController(msg.cntlrName); if (controller == itsCntlrList->end()) { // not found? - LOG_WARN_STR ("CONNECT event received from unknown controller: " << - msg.cntlrName); + LOG_WARN_STR ("CONNECT event received from unknown controller: " << msg.cntlrName); answer.result = CT_RESULT_UNSPECIFIED; } else { LOG_DEBUG_STR("CONNECT event received from " << msg.cntlrName); - // we don't have a result field in the Connect message, return NO_ERROR - _setEstablishedState(msg.cntlrName, CTState::CONNECTED, time(0), CT_RESULT_NO_ERROR); + _setEstablishedState(msg.cntlrName, CTState::CONNECTED, time(0), msg.result); // first direct contact with controller, remember port controller->port = &port; answer.result = CT_RESULT_NO_ERROR; diff --git a/MAC/APL/APLCommon/src/Controller_Protocol.prot b/MAC/APL/APLCommon/src/Controller_Protocol.prot index f0cfc25e471..293a1ee0105 100644 --- a/MAC/APL/APLCommon/src/Controller_Protocol.prot +++ b/MAC/APL/APLCommon/src/Controller_Protocol.prot @@ -104,6 +104,10 @@ event = { name = "cntlrName"; type = "string"; }; + param = { + name = "result"; + type = "uint16"; + }; }; // diff --git a/MAC/APL/APLCommon/src/ParentControl.cc b/MAC/APL/APLCommon/src/ParentControl.cc index f5a5d89b3d1..73a444a9be8 100644 --- a/MAC/APL/APLCommon/src/ParentControl.cc +++ b/MAC/APL/APLCommon/src/ParentControl.cc @@ -429,6 +429,7 @@ void ParentControl::_doRequestedAction(PIiter parent) // construct and send message CONTROLConnectEvent hello; hello.cntlrName = parent->name; + hello.result = CT_RESULT_NO_ERROR; parent->port->send(hello); // (re)set the parameters of this connection @@ -693,7 +694,7 @@ GCFEvent::TResult ParentControl::operational(GCFEvent& event, LOG_TRACE_VAR_STR("timerID:" << timerEvent.id); PIiter parent = findParentOnTimerID(timerEvent.id, &timerType); if (!isParent(parent)) { - LOG_DEBUG ("timerevent is not of a known parent, ignore"); + LOG_WARN ("Timerevent is not of a known parent, ignore"); break; } @@ -730,10 +731,14 @@ GCFEvent::TResult ParentControl::operational(GCFEvent& event, } } else { - LOG_WARN_STR ("Could not reconnect to parent " << - parent->name << ", deleting entry"); -// concrete_release(parent); - itsParentList.erase(parent); + LOG_WARN_STR ("Could not reconnect to parent " << parent->name << ", deleting entry"); + if (parent->stopTimer) { + itsTimerPort.cancelTimer(parent->stopTimer); + parent->stopTimer = itsTimerPort.setTimer(0.0); + } + else { + itsParentList.erase(parent); + } } } break; @@ -784,6 +789,7 @@ GCFEvent::TResult ParentControl::operational(GCFEvent& event, LOG_DEBUG_STR("Sending CONNECT(" << parent.name << ") event to maintask"); CONTROLConnectEvent request; request.cntlrName = parent.name; + request.result = CT_RESULT_NO_ERROR; itsMainTaskPort->sendBack(request); } break; @@ -800,6 +806,7 @@ GCFEvent::TResult ParentControl::operational(GCFEvent& event, } CONTROLConnectEvent outMsg; outMsg.cntlrName = inMsg.cntlrName; + outMsg.result = inMsg.result; parent->port->send(outMsg); parent->currentState = CTState::CONNECT; LOG_DEBUG_STR("Forwarding CONTROL_CONNECT to parent " << inMsg.cntlrName); diff --git a/MAC/APL/CEPCU/src/PythonControl/PythonControl.cc b/MAC/APL/CEPCU/src/PythonControl/PythonControl.cc index de981ae3191..2d9ceec5dbc 100644 --- a/MAC/APL/CEPCU/src/PythonControl/PythonControl.cc +++ b/MAC/APL/CEPCU/src/PythonControl/PythonControl.cc @@ -29,17 +29,16 @@ //#include <Common/lofar_string.h> #include <Common/ParameterSet.h> #include <Common/Exceptions.h> -#include <ApplCommon/StationInfo.h> -#include <GCF/PVSS/GCF_PVTypes.h> #include <Common/SystemUtil.h> +#include <ApplCommon/StationInfo.h> #include <MACIO/MACServiceInfo.h> #include <GCF/TM/GCF_Protocols.h> +#include <GCF/PVSS/GCF_PVTypes.h> +#include <GCF/RTDB/DP_Protocol.ph> #include <APL/APLCommon/APL_Defines.h> #include <APL/APLCommon/APLUtilities.h> #include <APL/APLCommon/Controller_Protocol.ph> -#include <APL/APLCommon/APLUtilities.h> #include <APL/APLCommon/CTState.h> -#include <GCF/RTDB/DP_Protocol.ph> #include "PythonControl.h" #include "PVSSDatapointDefs.h" @@ -264,9 +263,10 @@ GCFEvent::TResult PythonControl::initial_state(GCFEvent& event, // request from parent task to start up the child side. ParameterSet* thePS = globalParameterSet(); // shortcut to global PS. - string myPrefix(thePS->locateModule("PythonControl")+"PythonControl."); - string pythonProg(thePS->getString(myPrefix+"pythonProgram", "@pythonProgram@")); - string pythonHost(thePS->getString(myPrefix+"pythonHost", "@pythonHost@")); + string myPrefix (thePS->locateModule("PythonControl")+"PythonControl."); + string pythonProg (thePS->getString(myPrefix+"pythonProgram", "@pythonProgram@")); + string pythonHost (thePS->getString(myPrefix+"pythonHost", "@pythonHost@")); + itsChildCanCommunicate = thePS->getBool (myPrefix+"canCommunicate", true); // START PYTHON bool startOK = _startPython(pythonProg, getObservationNr(getName()), realHostname(pythonHost), itsListener->makeServiceName()); if (!startOK) { @@ -278,8 +278,18 @@ GCFEvent::TResult PythonControl::initial_state(GCFEvent& event, TRAN(PythonControl::finishing_state); } else { - LOG_DEBUG ("Started Python environment, going to waitForConnection state"); - TRAN(PythonControl::waitForConnection_state); + if (itsChildCanCommunicate) { + LOG_DEBUG ("Started Python environment, going to waitForConnection state"); + TRAN(PythonControl::waitForConnection_state); + } + else { + LOG_WARN ("Started Python environment, CHILD CANNOT COMMUNICATE, FAKING RESPONSES!!!"); + CONTROLConnectedEvent answer; + answer.cntlrName = itsMyName; + answer.result = CT_RESULT_NO_ERROR; + itsParentPort->send(answer); + TRAN(PythonControl::operational_state); + } } } break; @@ -426,42 +436,79 @@ GCFEvent::TResult PythonControl::operational_state(GCFEvent& event, GCFPortInter case CONTROL_CLAIM: { CONTROLClaimEvent msg(event); LOG_DEBUG_STR("Received CLAIM(" << msg.cntlrName << ")"); - itsPythonPort->send(msg); + if (itsChildCanCommunicate) { + itsPythonPort->send(msg); + } + else { + LOG_WARN("Sending FAKE Claim response"); + sendControlResult(*itsParentPort, event.signal, itsMyName, CT_RESULT_NO_ERROR); + } break; } case CONTROL_PREPARE: { CONTROLPrepareEvent msg(event); LOG_DEBUG_STR("Received PREPARE(" << msg.cntlrName << ")"); - itsPythonPort->send(msg); + if (itsChildCanCommunicate) { + itsPythonPort->send(msg); + } + else { + LOG_WARN("Sending FAKE Prepare response"); + sendControlResult(*itsParentPort, event.signal, itsMyName, CT_RESULT_NO_ERROR); + } break; } case CONTROL_RESUME: { CONTROLResumeEvent msg(event); LOG_DEBUG_STR("Received RESUME(" << msg.cntlrName << ")"); - itsPythonPort->send(msg); + if (itsChildCanCommunicate) { + itsPythonPort->send(msg); + } + else { + LOG_WARN("Sending FAKE Resume response"); + sendControlResult(*itsParentPort, event.signal, itsMyName, CT_RESULT_NO_ERROR); + } break; } case CONTROL_SUSPEND: { CONTROLSuspendEvent msg(event); LOG_DEBUG_STR("Received SUSPEND(" << msg.cntlrName << ")"); - itsPythonPort->send(msg); + if (itsChildCanCommunicate) { + itsPythonPort->send(msg); + } + else { + LOG_WARN("Sending FAKE Suspend response"); + sendControlResult(*itsParentPort, event.signal, itsMyName, CT_RESULT_NO_ERROR); + } break; } case CONTROL_RELEASE: { CONTROLReleaseEvent msg(event); LOG_DEBUG_STR("Received RELEASE(" << msg.cntlrName << ")"); - itsPythonPort->send(msg); + if (itsChildCanCommunicate) { + itsPythonPort->send(msg); + } + else { + LOG_WARN("Sending FAKE Release response"); + sendControlResult(*itsParentPort, event.signal, itsMyName, CT_RESULT_NO_ERROR); + } break; } case CONTROL_QUIT: { CONTROLQuitEvent msg(event); LOG_DEBUG_STR("Received QUIT(" << msg.cntlrName << ")"); - itsPythonPort->send(msg); + if (itsChildCanCommunicate) { + itsPythonPort->send(msg); + } + else { + LOG_WARN("Sending FAKE Quit response and quiting application"); + sendControlResult(*itsParentPort, event.signal, itsMyName, CT_RESULT_NO_ERROR); + TRAN(PythonControl::finishing_state); + } break; } diff --git a/MAC/APL/CEPCU/src/PythonControl/PythonControl.h b/MAC/APL/CEPCU/src/PythonControl/PythonControl.h index a2848dfa15f..4ce3fff2988 100644 --- a/MAC/APL/CEPCU/src/PythonControl/PythonControl.h +++ b/MAC/APL/CEPCU/src/PythonControl/PythonControl.h @@ -103,6 +103,8 @@ private: GCFTCPPort* itsPythonPort; string itsPythonName; + // Until the python pipeline can communicate we can 'fake' the communication by setting the variable to FALSE + bool itsChildCanCommunicate; CTState::CTstateNr itsState; diff --git a/MAC/APL/MainCU/src/MACScheduler/MACScheduler.cc b/MAC/APL/MainCU/src/MACScheduler/MACScheduler.cc index 108e4513ca5..5a3cac41cc4 100644 --- a/MAC/APL/MainCU/src/MACScheduler/MACScheduler.cc +++ b/MAC/APL/MainCU/src/MACScheduler/MACScheduler.cc @@ -459,6 +459,7 @@ GCFEvent::TResult MACScheduler::active_state(GCFEvent& event, GCFPortInterface& } OTDB::TreeMaintenance tm(itsOTDBconnection); TreeStateConv tsc(itsOTDBconnection); + // CT_RESULT_: MANUAL_REMOVED, MANUAL_ABORT, LOST_CONNECTION, NO_ERROR if (quitedEvent.result == CT_RESULT_NO_ERROR) { tm.setTreeState(theObs->second, tsc.get("finished")); } @@ -467,8 +468,7 @@ GCFEvent::TResult MACScheduler::active_state(GCFEvent& event, GCFPortInterface& } // update our administration - LOG_DEBUG_STR("Removing observation " << quitedEvent.cntlrName << - " from activeList"); + LOG_DEBUG_STR("Removing observation " << quitedEvent.cntlrName << " from activeList"); // _removeActiveObservation(quitedEvent.cntlrName); break; } diff --git a/MAC/APL/MainCU/src/ObservationControl/ObservationControl.cc b/MAC/APL/MainCU/src/ObservationControl/ObservationControl.cc index 7f3808f4431..89baaf53549 100644 --- a/MAC/APL/MainCU/src/ObservationControl/ObservationControl.cc +++ b/MAC/APL/MainCU/src/ObservationControl/ObservationControl.cc @@ -182,9 +182,8 @@ void ObservationControl::finish() void ObservationControl::abortObservation() { LOG_WARN("Received manual interrupt to ABORT the observation"); - if (itsState < CTState::RESUME) { - itsQuitReason = CT_RESULT_MANUAL_ABORT; - } + itsQuitReason = (itsState < CTState::RESUME) ? CT_RESULT_MANUAL_REMOVED : CT_RESULT_MANUAL_ABORT; + itsTimerPort->cancelTimer(itsStopTimer); // cancel old timer itsStopTimer = itsTimerPort->setTimer(0.0); // expire immediately // will result in F_TIMER in ::active_state @@ -212,8 +211,32 @@ void ObservationControl::setState(CTState::CTstateNr newState) LOG_WARN_STR("Could not update runstate in PVSS of observation " << itsTreeID); } } - setObjectState(formatString("ObservationControl: %s: %s", getName().c_str(), cts.name(newState).c_str()), - itsObsDPname, ((newState>CTState::CONNECT) ? RTDB_OBJ_STATE_OPERATIONAL : RTDB_OBJ_STATE_OFF)); + + string message(cts.name(newState)); + int reportState(RTDB_OBJ_STATE_OFF); + if (newState == CTState::QUITED && itsQuitReason != CT_RESULT_NO_ERROR) { + switch (itsQuitReason) { + case CT_RESULT_MANUAL_REMOVED: + message = "Aborted by operator before observation started"; + break; + case CT_RESULT_MANUAL_ABORT: + message = "Aborted by operator during the observation"; + reportState = RTDB_OBJ_STATE_SUSPICIOUS; + break; + case CT_RESULT_LOST_CONNECTION: + message = "Lost connection(s)"; + reportState = RTDB_OBJ_STATE_BROKEN; + break; + default: + message = "Unknown reason"; + reportState = RTDB_OBJ_STATE_BROKEN; + } + } + else if (newState > CTState::CONNECT) { + reportState = RTDB_OBJ_STATE_OPERATIONAL; + } + string reporterID (formatString("ObservationControl: %s: %s", getName().c_str(), message.c_str())); + setObjectState(reporterID, itsObsDPname, reportState); } if (itsParentControl) { // allow calling this function before parentControl is online @@ -240,9 +263,9 @@ void ObservationControl::registerResultMessage(const string& cntlrName, int resu << cts.name(cts.stateAck(itsState)) << " message."); itsBusyControllers--; // [15122010] see note in doHeartBeatTask! } -// if (state == CTState::QUITED) { -// ... -// } + if (state == CTState::QUITED && result != CT_RESULT_NO_ERROR) { + itsQuitReason = result; + } return; } @@ -426,7 +449,7 @@ GCFEvent::TResult ObservationControl::active_state(GCFEvent& event, GCFPortInter } else if (timerEvent.id == itsStopTimer) { setState(CTState::QUIT); - itsChildResult = CT_RESULT_NO_ERROR; + itsChildResult = itsQuitReason; itsChildsInError = 0; itsStopTimer = 0; LOG_DEBUG("Requesting all childs to quit"); @@ -552,6 +575,7 @@ GCFEvent::TResult ObservationControl::finishing_state(GCFEvent& event, // tell Parent task we like to go down. itsParentControl->nowInState(getName(), CTState::QUIT); + setState(CTState::QUITED); // inform MACScheduler we are going down CONTROLQuitedEvent msg; @@ -560,7 +584,8 @@ GCFEvent::TResult ObservationControl::finishing_state(GCFEvent& event, itsParentPort->send(msg); // update PVSS - itsPropertySet->setValue(string(PN_FSM_CURRENT_ACTION),GCFPVString("Finished")); + itsPropertySet->setValue(string(PN_FSM_CURRENT_ACTION), + GCFPVString((itsQuitReason == CT_RESULT_NO_ERROR) ? "Finished" : "Aborted")); itsPropertySet->setValue(string(PN_FSM_ERROR),GCFPVString("")); itsTimerPort->setTimer(1L); // give PVSS task some time to update the DB. diff --git a/MAC/APL/StationCU/src/ClockControl/ClockControl.cc b/MAC/APL/StationCU/src/ClockControl/ClockControl.cc index 577ecaf4225..248b860a697 100644 --- a/MAC/APL/StationCU/src/ClockControl/ClockControl.cc +++ b/MAC/APL/StationCU/src/ClockControl/ClockControl.cc @@ -985,7 +985,7 @@ GCFEvent::TResult ClockControl::defaultMessageHandling(GCFEvent& event, CONTROLConnectEvent msg(event); CONTROLConnectedEvent answer; answer.cntlrName = msg.cntlrName; - answer.result = true; + answer.result = true; // !!!! ??? TODO itsParentPort->send(answer); } break; diff --git a/MAC/Deployment/data/OTDB/PythonControl.comp b/MAC/Deployment/data/OTDB/PythonControl.comp index 87291744f6d..d26614e4e89 100644 --- a/MAC/Deployment/data/OTDB/PythonControl.comp +++ b/MAC/Deployment/data/OTDB/PythonControl.comp @@ -14,6 +14,7 @@ par pythonHost I text - 100 0 'CCU001' - "Machine the Pythonscript par runtimeDirectory I text - 100 0 '/home/pipeline/runtime/${MSNUMBER}/run' - "Where to start the programs" par resultDirectory I text - 100 0 'lexar001:/data/${MSNUMBER}/output' - "Where to store the results" par workingDirectory I text - 100 0 '/data/scratch/${MSNUMBER}/work' - "Where to store temporarely files" +par canCommunicate I bool - 10 0 'true' - "Temp flag to tell MAC if the current PythonController can respond to CONTROL_xxx messages" #uses ApplCtrl 4.0.0 development 1 "The ACC controller" #uses Flagger 4.0.0 development 1 "Flagger" -- GitLab