
Commit 9b967cc

knopers8 authored and teo committed
OCTRL-949 [core] Improve reaction to controlled nodes becoming unreachable
Includes:
- fixed copy-pasted log message: "received executor failed" -> "received agent failed"
- added an operator log in case of connection issues to a Mesos slave
- allowed re-registering the agent and executor IDs for a Task once they come back (they are removed when an Agent/Executor failure is received); effectively, this allows an environment to be torn down correctly, fixing at least some of the leftover task issues (OCTRL-611)
- added documentation about configuring the node-down timeout
1 parent 4451613 commit 9b967cc

File tree: 4 files changed (+24, -3 lines)


core/environment/manager.go

Lines changed: 2 additions & 2 deletions
@@ -116,13 +116,13 @@ func NewEnvManager(tm *task.Manager, incomingEventCh chan event.Event) *Manager
 				WithField("partition", envId.String()).
 				WithField("agentId", typedEvent.GetId().Value).
 				WithError(err).
-				Error("cannot find environment for incoming executor failed event")
+				Error("cannot find environment for incoming agent failed event")
 		}
 		log.WithPrefix("scheduler").
 			WithField("partition", envId.String()).
 			WithField("agentId", typedEvent.GetId().Value).
 			WithField("envState", env.CurrentState()).
-			Debug("received executor failed event")
+			Debug("received agent failed event")
 	}

 case *event.TasksReleasedEvent:

core/task/manager.go

Lines changed: 7 additions & 0 deletions
@@ -1047,6 +1047,13 @@ func (m *Manager) updateTaskStatus(status *mesos.TaskStatus) {
 		if taskPtr.GetParent() != nil {
 			taskPtr.GetParent().UpdateStatus(ACTIVE)
 		}
+		if status.GetAgentID() != nil {
+			taskPtr.agentId = status.GetAgentID().GetValue()
+		}
+		if status.GetExecutorID() != nil {
+			taskPtr.executorId = status.GetExecutorID().GetValue()
+		}
+
 	case mesos.TASK_DROPPED, mesos.TASK_LOST, mesos.TASK_KILLED, mesos.TASK_FAILED, mesos.TASK_ERROR, mesos.TASK_FINISHED:

 		taskPtr.status = INACTIVE
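To make the effect of these new lines concrete, here is a minimal, self-contained Go sketch of the same idea: cached agent and executor IDs are refreshed from every status update that carries them, so a later teardown can still address the right agent and executor even after the IDs were cleared by an earlier failure event. The `Task` and `TaskStatus` types below are simplified stand-ins for illustration, not the actual O² Control or mesos-go types.

```go
package main

import "fmt"

// Simplified stand-ins for mesos.TaskStatus and task.Task, for illustration only.
type TaskStatus struct {
	AgentID    *string
	ExecutorID *string
}

type Task struct {
	agentId    string
	executorId string
}

// refreshIDs mirrors the logic added to updateTaskStatus above:
// only overwrite the cached IDs when the status update actually carries them.
func (t *Task) refreshIDs(status TaskStatus) {
	if status.AgentID != nil {
		t.agentId = *status.AgentID
	}
	if status.ExecutorID != nil {
		t.executorId = *status.ExecutorID
	}
}

func main() {
	t := &Task{} // IDs were cleared after an agent/executor failure event
	agent, executor := "agent-1", "executor-1"

	// A TASK_RUNNING update arrives once the node is reachable again.
	t.refreshIDs(TaskStatus{AgentID: &agent, ExecutorID: &executor})
	fmt.Println(t.agentId, t.executorId) // agent-1 executor-1
}
```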

core/task/scheduler.go

Lines changed: 3 additions & 0 deletions
@@ -246,6 +246,9 @@ func (state *schedulerState) failure(_ context.Context, e *scheduler.Event) error
 			WithFields(fields).
 			WithField("level", infologger.IL_Support).
 			Error("agent failed")
+		log.WithField("level", infologger.IL_Ops).
+			WithField("detector", detector).
+			Errorf("possible connectivity issues with host '%s'", host)
 		state.taskman.internalEventCh <- event.NewAgentFailedEvent(aid)
 	}
 	return nil

docs/handbook/appconfiguration.md

Lines changed: 12 additions & 1 deletion
@@ -1 +1,12 @@
-# Component Configuration
+# Component Configuration
+
+## Connectivity to controlled nodes
+
+ECS relies on Mesos to know the state of the controlled nodes.
+Thus, losing the connection to a Mesos slave can be treated as a node being down or unresponsive.
+If a Mesos slave is lost, the tasks belonging to it are set to ERROR state and treated as INACTIVE.
+Then, the environment is transitioned to ERROR.
+
+The Mesos slave health check can be configured with the `MESOS_MAX_AGENT_PING_TIMEOUTS` (`--max_agent_ping_timeouts`) and `MESOS_AGENT_PING_TIMEOUT` (`--agent_ping_timeout`) Mesos parameters.
+Effectively, the product of the two parameters is the time needed to consider a slave/agent as lost.
+Please refer to the Mesos documentation for more details.
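As a rough worked example (values chosen purely for illustration, not taken from this repository or any particular deployment): with `MESOS_AGENT_PING_TIMEOUT=15secs` and `MESOS_MAX_AGENT_PING_TIMEOUTS=5`, an unreachable slave is declared lost after roughly 15 s × 5 = 75 s, at which point its tasks become INACTIVE and the environment goes to ERROR. The small Go snippet below just performs that arithmetic:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Example values only, mirroring the Mesos flags --agent_ping_timeout
	// and --max_agent_ping_timeouts (or their MESOS_* environment variables).
	agentPingTimeout := 15 * time.Second
	maxAgentPingTimeouts := 5

	// The product of the two is roughly how long a slave can stay
	// unreachable before the master declares it lost.
	nodeDownTimeout := agentPingTimeout * time.Duration(maxAgentPingTimeouts)
	fmt.Println(nodeDownTimeout) // 1m15s
}
```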
