
Commit 9b967cc

knopers8 authored and teo committed
OCTRL-949 [core] Improve reaction to controlled nodes becoming unreachable
Includes:
- fixed copy-pasted log message: "received executor failed" -> "received agent failed"
- added an operator log in case of connection issues to a Mesos slave
- allowed re-registering the agent and executor IDs for a Task once they come back (they are removed when an Agent/Executor failure is received); effectively, this allows an environment to be torn down correctly, fixing at least some of the leftover task issues (OCTRL-611)
- added documentation about configuring the node-down timeout
1 parent 4451613 commit 9b967cc

File tree: 4 files changed (+24, -3 lines)


core/environment/manager.go

Lines changed: 2 additions & 2 deletions
@@ -116,13 +116,13 @@ func NewEnvManager(tm *task.Manager, incomingEventCh chan event.Event) *Manager
 				WithField("partition", envId.String()).
 				WithField("agentId", typedEvent.GetId().Value).
 				WithError(err).
-				Error("cannot find environment for incoming executor failed event")
+				Error("cannot find environment for incoming agent failed event")
 		}
 		log.WithPrefix("scheduler").
 			WithField("partition", envId.String()).
 			WithField("agentId", typedEvent.GetId().Value).
 			WithField("envState", env.CurrentState()).
-			Debug("received executor failed event")
+			Debug("received agent failed event")
 	}

 case *event.TasksReleasedEvent:

core/task/manager.go

Lines changed: 7 additions & 0 deletions
@@ -1047,6 +1047,13 @@ func (m *Manager) updateTaskStatus(status *mesos.TaskStatus) {
 		if taskPtr.GetParent() != nil {
 			taskPtr.GetParent().UpdateStatus(ACTIVE)
 		}
+		if status.GetAgentID() != nil {
+			taskPtr.agentId = status.GetAgentID().GetValue()
+		}
+		if status.GetExecutorID() != nil {
+			taskPtr.executorId = status.GetExecutorID().GetValue()
+		}
+
 	case mesos.TASK_DROPPED, mesos.TASK_LOST, mesos.TASK_KILLED, mesos.TASK_FAILED, mesos.TASK_ERROR, mesos.TASK_FINISHED:

 		taskPtr.status = INACTIVE
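To make the effect of these new lines concrete, here is a minimal, self-contained Go sketch of the same idea: cached agent and executor IDs are refreshed from every status update that carries them, so a later teardown can still address the right agent and executor even after the IDs were cleared by an earlier failure event. The `Task` and `TaskStatus` types below are simplified stand-ins for illustration, not the actual O² Control or mesos-go types.

```go
package main

import "fmt"

// Simplified stand-ins for mesos.TaskStatus and task.Task, for illustration only.
type TaskStatus struct {
	AgentID    *string
	ExecutorID *string
}

type Task struct {
	agentId    string
	executorId string
}

// refreshIDs mirrors the logic added to updateTaskStatus above:
// only overwrite the cached IDs when the status update actually carries them.
func (t *Task) refreshIDs(status TaskStatus) {
	if status.AgentID != nil {
		t.agentId = *status.AgentID
	}
	if status.ExecutorID != nil {
		t.executorId = *status.ExecutorID
	}
}

func main() {
	t := &Task{} // IDs were cleared after an agent/executor failure event
	agent, executor := "agent-1", "executor-1"

	// A TASK_RUNNING update arrives once the node is reachable again.
	t.refreshIDs(TaskStatus{AgentID: &agent, ExecutorID: &executor})
	fmt.Println(t.agentId, t.executorId) // agent-1 executor-1
}
```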

core/task/scheduler.go

Lines changed: 3 additions & 0 deletions
@@ -246,6 +246,9 @@ func (state *schedulerState) failure(_ context.Context, e *scheduler.Event) error
 			WithFields(fields).
 			WithField("level", infologger.IL_Support).
 			Error("agent failed")
+		log.WithField("level", infologger.IL_Ops).
+			WithField("detector", detector).
+			Errorf("possible connectivity issues with host '%s'", host)
 		state.taskman.internalEventCh <- event.NewAgentFailedEvent(aid)
 	}
 	return nil

docs/handbook/appconfiguration.md

Lines changed: 12 additions & 1 deletion
@@ -1 +1,12 @@
-# Component Configuration
+# Component Configuration
+
+## Connectivity to controlled nodes
+
+ECS relies on Mesos to know the state of the controlled nodes.
+Thus, losing the connection to a Mesos slave can be treated as a node being down or unresponsive.
+If a Mesos slave is lost, the tasks belonging to it are set to ERROR state and treated as INACTIVE.
+Then, the environment is transitioned to ERROR.
+
+The Mesos slave health check can be configured with the `MESOS_MAX_AGENT_PING_TIMEOUTS` (`--max_agent_ping_timeouts`) and `MESOS_AGENT_PING_TIMEOUT` (`--agent_ping_timeout`) Mesos parameters.
+Effectively, the product of the two parameters is the time needed to consider a slave/agent as lost.
+Please refer to the Mesos documentation for more details.
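As a rough worked example (values chosen purely for illustration, not taken from this repository or any particular deployment): with `MESOS_AGENT_PING_TIMEOUT=15secs` and `MESOS_MAX_AGENT_PING_TIMEOUTS=5`, an unreachable slave is declared lost after roughly 15 s × 5 = 75 s, at which point its tasks become INACTIVE and the environment goes to ERROR. The small Go snippet below just performs that arithmetic:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Example values only, mirroring the Mesos flags --agent_ping_timeout
	// and --max_agent_ping_timeouts (or their MESOS_* environment variables).
	agentPingTimeout := 15 * time.Second
	maxAgentPingTimeouts := 5

	// The product of the two is roughly how long a slave can stay
	// unreachable before the master declares it lost.
	nodeDownTimeout := agentPingTimeout * time.Duration(maxAgentPingTimeouts)
	fmt.Println(nodeDownTimeout) // 1m15s
}
```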
