From fa3ed845560bf9b9560ef6130ad2fec788f36deb Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Thu, 20 Dec 2018 17:51:42 -0800 Subject: [PATCH 01/29] first commit --- .../GroupCommunicationTests.cs | 4 +- .../NetworkService/NetworkServiceTests.cs | 2 +- .../StreamingNetworkServiceTests.cs | 2 +- .../Elastic/Comm/Enum/TaskMessageType.cs | 35 ++ .../Elastic/Comm/IElasticDriverMessage.cs | 44 ++ .../Elastic/Comm/ITaskMessageResponse.cs | 37 ++ .../Elastic/Comm/Impl/CheckpointMessage.cs | 52 ++ .../Elastic/Comm/Impl/DataMessage.cs | 86 +++ .../Comm/Impl/DataMessageWithTopology.cs | 94 ++++ .../Elastic/Comm/Impl/DriverMessagePayload.cs | 72 +++ .../Comm/Impl/ElasticDriverMessageImpl.cs | 104 ++++ .../Comm/Impl/FailureMessagePayload.cs | 43 ++ .../Comm/Impl/GroupCommunicationMessage.cs | 58 ++ .../Comm/Impl/TopologyMessagePayload.cs | 135 +++++ .../Elastic/Comm/Impl/TopologyUpdate.cs | 196 +++++++ .../Elastic/Comm/Impl/UpdateMessagePayload.cs | 43 ++ .../ElasticServiceConfigurationOptions.cs | 124 +++++ .../Elastic/Config/OperatorParameters.cs | 103 ++++ .../Elastic/Driver/IElasticTaskSetService.cs | 59 +- .../Driver/IElasticTaskSetSubscription.cs | 100 +++- .../Elastic/Driver/ITaskSetManager.cs | 78 +-- .../Driver/Impl/DefaultTaskSetService.cs | 350 ++++++++++++ .../Driver/Impl/DefaultTaskSetSubscription.cs | 471 ++++++++++++++++ .../Elastic/Failures/Enum/CheckpointLevel.cs | 34 ++ .../Enum/DefaultFailureStateEvents.cs | 38 ++ .../Failures/Enum/DefaultFailureStates.cs | 39 ++ .../Elastic/Failures/ICheckpointState.cs | 65 +++ .../Elastic/Failures/ICheckpointableState.cs | 57 ++ .../Failures/IDefaultFailureEventResponse.cs | 53 ++ .../Elastic/Failures/IFailureEvent.cs | 27 +- .../Elastic/Failures/IFailureResponse.cs | 31 +- .../Elastic/Failures/IFailureState.cs | 1 - .../Elastic/Failures/IFailureStateMachine.cs | 39 +- .../Elastic/Failures/IReconfigure.cs | 40 ++ .../Elastic/Failures/IReschedule.cs | 40 ++ .../Elastic/Failures/IStop.cs | 29 + 
.../Elastic/Failures/ITimeout.cs | 36 ++ .../Impl/CheckpointableImmutableObject.cs | 102 ++++ .../Failures/Impl/DefaultCheckpointState.cs | 82 +++ .../Failures/Impl/DefaultFailureState.cs | 65 +++ .../Impl/DefaultFailureStateMachine.cs | 267 +++++++++ .../Elastic/Failures/Impl/FailEvent.cs | 68 +++ .../Failures/Impl/OperatorException.cs | 119 ++++ .../Elastic/Failures/Impl/ReconfigureEvent.cs | 80 +++ .../Elastic/Failures/Impl/RescheduleEvent.cs | 96 ++++ .../Elastic/Failures/Impl/StopEvent.cs | 65 +++ .../Elastic/Operators/Constants.cs | 36 ++ .../Elastic/Operators/IElasticBroadcast.cs | 29 + .../Operators/Logical/IElasticBroadcast.cs | 29 + .../Logical/Impl/DefaultBroadcast.cs | 73 +++ .../Operators/Logical/Impl/DefaultEmpty.cs | 97 ++++ .../Operators/Logical/Impl/DefaultOneToN.cs | 194 +++++++ .../ElastiOperatorWithDefaultDispatcher.cs | 253 +++++++++ .../Operators/Logical/Impl/ElasticOperator.cs | 525 ++++++++++++++++++ .../Physical/Enum/PositionTracker.cs | 43 ++ .../Operators/Physical/IElasticIterator.cs | 53 ++ .../Operators/Physical/IElasticOperator.cs | 73 +++ .../Physical/IElasticTypedOperator.cs | 30 + .../Elastic/Operators/Physical/IReceiver.cs | 35 ++ .../Elastic/Operators/Physical/ISender.cs | 35 ++ .../Physical/Impl/DefaultBroadcast.cs | 75 +++ .../Operators/Physical/Impl/DefaultOneToN.cs | 211 +++++++ .../Elastic/Task/IWaitForTaskRegistration.cs | 36 ++ .../Topology/Logical/Enum/DataNodeState.cs | 31 ++ .../Topology/Logical/Enum/TopologyType.cs | 35 ++ .../Elastic/Topology/Logical/ITopology.cs | 120 ++++ .../Elastic/Topology/Logical/Impl/DataNode.cs | 104 ++++ .../Topology/Logical/Impl/EmptyTopology.cs | 175 ++++++ .../Topology/Logical/Impl/FlatTopology.cs | 434 +++++++++++++++ .../Physical/ICheckpointingTopology.cs | 54 ++ .../Physical/Impl/DefaultBroadcastTopology.cs | 129 +++++ .../Impl/DriverAwareOperatorTopology.cs | 71 +++ .../Topology/Physical/Impl/OneToNTopology.cs | 314 +++++++++++ .../Physical/Impl/OperatorTopology.cs | 72 +++ 
.../Impl/OperatorTopologyWithCommunication.cs | 273 +++++++++ .../Org.Apache.REEF.Network/Elastic/Utils.cs | 142 +++++ .../Task/Impl/GroupCommNetworkObserver.cs | 2 +- .../Group/Task/Impl/NodeMessageObserver.cs | 9 +- .../Group/Task/Impl/TaskMessageObserver.cs | 2 +- .../NetworkService/Codec/NsMessageCodec.cs | 5 +- .../NetworkService/Codec/NsMessageProto.cs | 9 +- .../Codec/NsMessageStreamingCodec.cs | 52 +- .../NetworkService/NsMessage.cs | 17 +- .../Org.Apache.REEF.Network.csproj | 3 +- .../ByteUtilities.cs | 8 + 85 files changed, 7471 insertions(+), 182 deletions(-) create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Enum/TaskMessageType.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/IElasticDriverMessage.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessage.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessage.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopology.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DriverMessagePayload.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/FailureMessagePayload.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/GroupCommunicationMessage.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyUpdate.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Config/ElasticServiceConfigurationOptions.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Config/OperatorParameters.cs create mode 100644 
lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultTaskSetService.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultTaskSetSubscription.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/CheckpointLevel.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/DefaultFailureStateEvents.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/DefaultFailureStates.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointState.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointableState.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IDefaultFailureEventResponse.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IReconfigure.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IReschedule.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IStop.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ITimeout.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/CheckpointableImmutableObject.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultCheckpointState.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultFailureState.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultFailureStateMachine.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/FailEvent.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/OperatorException.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/ReconfigureEvent.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/RescheduleEvent.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/StopEvent.cs create mode 100644 
lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Constants.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Operators/IElasticBroadcast.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/IElasticBroadcast.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultBroadcast.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultEmpty.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultOneToN.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/ElastiOperatorWithDefaultDispatcher.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/ElasticOperator.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Enum/PositionTracker.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticIterator.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticOperator.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticTypedOperator.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IReceiver.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/ISender.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultBroadcast.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultOneToN.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/IWaitForTaskRegistration.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Enum/DataNodeState.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Enum/TopologyType.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs create mode 100644 
lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/DataNode.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/ICheckpointingTopology.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DefaultBroadcastTopology.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DriverAwareOperatorTopology.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OneToNTopology.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopology.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopologyWithCommunication.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs diff --git a/lang/cs/Org.Apache.REEF.Network.Tests/GroupCommunication/GroupCommunicationTests.cs b/lang/cs/Org.Apache.REEF.Network.Tests/GroupCommunication/GroupCommunicationTests.cs index 2f5fe8ac97..83eea0f594 100644 --- a/lang/cs/Org.Apache.REEF.Network.Tests/GroupCommunication/GroupCommunicationTests.cs +++ b/lang/cs/Org.Apache.REEF.Network.Tests/GroupCommunication/GroupCommunicationTests.cs @@ -69,9 +69,9 @@ public void TestSender() new BlockingCollection(); var handler1 = - Observer.Create>(msg => messages1.Add(msg.Data.First())); + Observer.Create>(msg => messages1.Add(msg.Data)); var handler2 = - Observer.Create>(msg => messages2.Add(msg.Data.First())); + Observer.Create>(msg => messages2.Add(msg.Data)); var networkServiceInjector1 = BuildNetworkServiceInjector(endpoint, handler1); var networkServiceInjector2 = BuildNetworkServiceInjector(endpoint, handler2); diff --git a/lang/cs/Org.Apache.REEF.Network.Tests/NetworkService/NetworkServiceTests.cs 
b/lang/cs/Org.Apache.REEF.Network.Tests/NetworkService/NetworkServiceTests.cs index d62e9aff65..2efded01b9 100644 --- a/lang/cs/Org.Apache.REEF.Network.Tests/NetworkService/NetworkServiceTests.cs +++ b/lang/cs/Org.Apache.REEF.Network.Tests/NetworkService/NetworkServiceTests.cs @@ -172,7 +172,7 @@ public MessageHandler(BlockingCollection queue) public void OnNext(NsMessage value) { - _queue.Add(value.Data.First()); + _queue.Add(value.Data); } public void OnError(Exception error) diff --git a/lang/cs/Org.Apache.REEF.Network.Tests/NetworkService/StreamingNetworkServiceTests.cs b/lang/cs/Org.Apache.REEF.Network.Tests/NetworkService/StreamingNetworkServiceTests.cs index 691ecbf864..8f59104363 100644 --- a/lang/cs/Org.Apache.REEF.Network.Tests/NetworkService/StreamingNetworkServiceTests.cs +++ b/lang/cs/Org.Apache.REEF.Network.Tests/NetworkService/StreamingNetworkServiceTests.cs @@ -319,7 +319,7 @@ private MessageHandler() public void OnNext(NsMessage value) { - _queue.Add(value.Data.First()); + _queue.Add(value.Data); } public void OnError(Exception error) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Enum/TaskMessageType.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Enum/TaskMessageType.cs new file mode 100644 index 0000000000..eb7a4c05d4 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Enum/TaskMessageType.cs @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm.Enum
+{
+    [Unstable("0.16", "Types may change")]
+    internal enum TaskMessageType : ushort // Kinds of task messages; each value is stored as a 16-bit unsigned integer.
+    { // NOTE(review): member meanings are not documented in this patch - confirm against the code that sends them.
+        IterationNumber = 0,
+
+        JoinTopology = 1,
+
+        TopologyUpdateRequest = 2,
+
+        NextDataRequest = 3,
+
+        CompleteSubscription = 4
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/IElasticDriverMessage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/IElasticDriverMessage.cs
new file mode 100644
index 0000000000..857b360de8
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/IElasticDriverMessage.cs
@@ -0,0 +1,44 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm
+{
+    /// <summary>
+    /// Message sent by the driver to operators on running tasks.
+    /// This message contains instructions from the driver to the tasks' operators.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    public interface IElasticDriverMessage
+    {
+        /// <summary>
+        /// The destination task of the message.</summary>
+        string Destination { get; }
+
+        /// <summary>
+        /// Operator and event specific payload of the message.
+        /// </summary>
+        DriverMessagePayload Message { get; }
+
+        /// <summary>
+        /// Utility method to serialize the message for communication over the network.
+        /// </summary>
+        /// <returns>The serialized message</returns>
+        byte[] Serialize();
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs
new file mode 100644
index 0000000000..f501f689b0
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Driver.Task;
+using Org.Apache.REEF.Utilities.Attributes;
+using System.Collections.Generic;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm
+{
+    /// <summary>
+    /// Used to propagate task responses through operators and subscriptions.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    public interface ITaskMessageResponse
+    {
+        /// <summary>
+        /// Method triggered when a task to driver message is received.
+        /// </summary>
+        /// <param name="message">The task message for the operator</param>
+        /// <param name="returnMessages">A list of messages containing the instructions for the task</param>
+        void OnTaskMessage(ITaskMessage message, ref List<IElasticDriverMessage> returnMessages);
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessage.cs
new file mode 100644
index 0000000000..9191b6cb37
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessage.cs
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Network.Elastic.Failures;
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
+{
+    /// <summary>
+    /// Message used to communicate checkpoints between nodes in order to
+    /// recover execution.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal sealed class CheckpointMessage : GroupCommunicationMessage
+    {
+        /// <summary>
+        /// Constructor for a message containing a checkpoint.
+        /// </summary>
+        /// <param name="checkpoint">The checkpoint state; its subscription name and operator id are used for the message</param>
+        public CheckpointMessage(ICheckpointState checkpoint) : base(checkpoint.SubscriptionName, checkpoint.OperatorId)
+        {
+            Checkpoint = checkpoint;
+        }
+
+        /// <summary>
+        /// The checkpoint contained in the message.
+        /// </summary>
+        public ICheckpointState Checkpoint { get; internal set; }
+
+        /// <summary>
+        /// Clone the message. The new message references the same checkpoint instance.
+        /// </summary>
+        public override object Clone()
+        {
+            return new CheckpointMessage(Checkpoint);
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessage.cs
new file mode 100644
index 0000000000..30380f6456
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessage.cs
@@ -0,0 +1,86 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
+{
+    /// <summary>
+    /// Untyped data message sent by group communication operators. This class inherits from
+    /// GroupCommunicationMessage and is the type seen by the Network Service.
+    /// DataMessages are untyped and used to simplify message propagation through the
+    /// communication layers that are type-agnostic.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal abstract class DataMessage : GroupCommunicationMessage
+    {
+        /// <summary>
+        /// Constructor for an untyped data message.
+        /// </summary>
+        /// <param name="subscriptionName">The name of the subscription for the message</param>
+        /// <param name="operatorId">The operator sending the message</param>
+        /// <param name="iteration">The iteration in which the message is sent/valid</param>
+        public DataMessage(string subscriptionName, int operatorId, int iteration)
+            : base(subscriptionName, operatorId)
+        {
+            Iteration = iteration;
+        }
+
+        /// <summary>
+        /// The iteration number for the message.
+        /// </summary>
+        internal int Iteration { get; set; }
+
+        /// <summary>
+        /// Clone the message. Returns this same instance rather than a copy.
+        /// </summary>
+        public override object Clone()
+        {
+            // The assumption is that messages are immutable therefore there is no need to clone them
+            return this;
+        }
+    }
+
+    /// <summary>
+    /// A typed data message.
+    /// </summary>
+    /// <typeparam name="T">The type for the data message</typeparam>
+    [Unstable("0.16", "API may change")]
+    internal sealed class DataMessage<T> : DataMessage
+    {
+        /// <summary>
+        /// Constructor of a typed data message.
+        /// </summary>
+        /// <param name="subscriptionName">The name of the subscription for the message</param>
+        /// <param name="operatorId">The operator sending the message</param>
+        /// <param name="iteration">The iteration in which the message is sent/valid</param>
+        /// <param name="data">The data contained in the message</param>
+        public DataMessage(
+            string subscriptionName,
+            int operatorId,
+            int iteration, //// For the moment we consider iterations as ints. Maybe this would change in the future
+            T data) : base(subscriptionName, operatorId, iteration)
+        {
+            Data = data;
+        }
+
+        /// <summary>
+        /// The data contained in the message.
+        /// </summary>
+        internal T Data { get; set; }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopology.cs
new file mode 100644
index 0000000000..6f12133da7
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopology.cs
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+using System.Collections.Generic;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
+{
+    /// <summary>
+    /// In this message data and topology update information are sent together.
+    /// This message is untyped and used to simplify message propagation through the
+    /// communication layers that are type-agnostic.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal abstract class DataMessageWithTopology : DataMessage
+    {
+        /// <summary>
+        /// Constructor for the base untyped data message with topology.
+        /// </summary>
+        /// <param name="subscriptionName">The name of the subscription for the message</param>
+        /// <param name="operatorId">The operator sending the message</param>
+        /// <param name="iteration">The iteration in which the message is sent/valid</param>
+        public DataMessageWithTopology(string subscriptionName, int operatorId, int iteration)
+            : base(subscriptionName, operatorId, iteration)
+        {
+        }
+
+        /// <summary>
+        /// Some topology updates piggybacked to the main data message. Not set by this base constructor.
+        /// </summary>
+        internal List<TopologyUpdate> TopologyUpdates { get; set; } // NOTE(review): element type restored from stripped text - confirm
+    }
+
+    /// <summary>
+    /// Typed version for DataMessageWithTopology. This class is used at the communication entry-points.
+    /// </summary>
+    /// <typeparam name="T">The type of the data carried by the message</typeparam>
+    [Unstable("0.16", "API may change")]
+    internal class DataMessageWithTopology<T> : DataMessageWithTopology
+    {
+        /// <summary>
+        /// Main constructor for data messages with topology information.
+        /// </summary>
+        /// <param name="subscriptionName">The name of the subscription for the message</param>
+        /// <param name="operatorId">The operator sending the message</param>
+        /// <param name="iteration">The iteration in which the message is sent/valid</param>
+        /// <param name="data">The data contained in the message</param>
+        /// <param name="updates">The topology updates being transmitted with the data</param>
+        public DataMessageWithTopology(
+            string subscriptionName,
+            int operatorId,
+            int iteration,
+            T data,
+            List<TopologyUpdate> updates) : base(subscriptionName, operatorId, iteration)
+        {
+            Data = data;
+            TopologyUpdates = updates;
+        }
+
+        /// <summary>
+        /// Constructor for a data message with topology but without topology updates (an empty list is used).
+        /// </summary>
+        /// <param name="subscriptionName">The name of the subscription for the message</param>
+        /// <param name="operatorId">The operator sending the message</param>
+        /// <param name="iteration">The iteration in which the message is sent/valid</param>
+        /// <param name="data">The data contained in the message</param>
+        public DataMessageWithTopology(
+            string subscriptionName,
+            int operatorId,
+            int iteration,
+            T data) : this(subscriptionName, operatorId, iteration, data, new List<TopologyUpdate>())
+        {
+        }
+
+        /// <summary>
+        /// The data contained in the message.
+        /// </summary>
+        internal T Data { get; set; }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DriverMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DriverMessagePayload.cs
new file mode 100644
index 0000000000..0386a591eb
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DriverMessagePayload.cs
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Network.Elastic.Comm.Impl;
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm
+{
+    /// <summary>
+    /// Payload for messages going from the driver to tasks.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    public abstract class DriverMessagePayload : GroupCommunicationMessage
+    {
+        /// <summary>
+        /// Construct a payload for messages created at the driver and directed to tasks.
+        /// </summary>
+        /// <param name="subscriptionName">The name of the subscription</param>
+        /// <param name="operatorId">The id of the operator within the subscription</param>
+        /// <param name="iteration">The iteration number in which the message is sent</param>
+        public DriverMessagePayload(string subscriptionName, int operatorId, int iteration)
+            : base(subscriptionName, operatorId)
+        {
+            Iteration = iteration;
+        }
+
+        /// <summary>
+        /// The type of payload.
+        /// </summary>
+        internal DriverMessagePayloadType PayloadType { get; set; }
+
+        /// <summary>
+        /// The iteration number in which the message is sent.
+        /// </summary>
+        internal int Iteration { get; private set; }
+
+        /// <summary>
+        /// Utility method to serialize the payload for communication.
+        /// </summary>
+        /// <returns>The serialized payload</returns>
+        internal abstract byte[] Serialize();
+    }
+
+    /// <summary>
+    /// Possible types of driver message payloads.
+    /// </summary>
+    [Unstable("0.16", "Types may change")]
+    internal enum DriverMessagePayloadType : ushort
+    {
+        Ring = 1,
+
+        Resume = 2,
+
+        Update = 3, // This is a topology message update
+
+        Failure = 4 // This is a topology message sent for a failure
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs
new file mode 100644
index 0000000000..9aa2c8c059
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs
@@ -0,0 +1,104 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Tang.Exceptions;
+using Org.Apache.REEF.Utilities;
+using Org.Apache.REEF.Utilities.Attributes;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
+{
+    /// <summary>
+    /// Message sent by the driver to operators on running tasks.
+    /// This message contains instructions from the driver to the tasks' operators.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal sealed class ElasticDriverMessageImpl : IElasticDriverMessage
+    {
+        /// <summary>
+        /// Create a new driver message.
+        /// </summary>
+        /// <param name="destinationTaskId">The message destination task</param>
+        /// <param name="message">The message</param>
+        public ElasticDriverMessageImpl(
+            string destinationTaskId,
+            DriverMessagePayload message)
+        {
+            Destination = destinationTaskId;
+            Message = message;
+        }
+
+        /// <summary>
+        /// The destination task of the message.</summary>
+        public string Destination { get; private set; }
+
+        /// <summary>
+        /// Operator and event specific payload of the message.
+        /// </summary>
+        public DriverMessagePayload Message { get; private set; }
+
+        /// <summary>
+        /// Utility method to serialize the message for communication over the network.
+        /// </summary>
+        /// <returns>The serialized message</returns>
+        public byte[] Serialize()
+        {
+            List<byte[]> buffer = new List<byte[]>(); // Wire layout: [int32 length][destination][uint16 payload type][payload]
+
+            byte[] destinationBytes = ByteUtilities.StringToByteArrays(Destination);
+
+            buffer.Add(BitConverter.GetBytes(destinationBytes.Length));
+            buffer.Add(destinationBytes);
+            buffer.Add(BitConverter.GetBytes((ushort)Message.PayloadType)); // ushort, to mirror ToUInt16 in From
+            buffer.Add(Message.Serialize());
+
+            return buffer.SelectMany(i => i).ToArray();
+        }
+
+        /// <summary>
+        /// Creates a driver message out of the memory buffer.
+        /// </summary>
+        /// <param name="data">The buffer containing a serialized driver message</param>
+        /// <param name="offset">The offset where to start the deserialization process</param>
+        /// <returns>The deserialized driver message</returns>
+        public static ElasticDriverMessageImpl From(byte[] data, int offset = 0)
+        {
+            int destinationLength = BitConverter.ToInt32(data, offset);
+            offset += sizeof(int); // Advance relative to the caller-supplied offset instead of assuming it was 0
+            string destination = ByteUtilities.ByteArraysToString(data.Skip(offset).Take(destinationLength).ToArray());
+            offset += destinationLength;
+
+            DriverMessagePayloadType type = (DriverMessagePayloadType)BitConverter.ToUInt16(data, offset);
+            offset += sizeof(ushort);
+
+            DriverMessagePayload payload = null;
+            switch (type)
+            {
+                case DriverMessagePayloadType.Update: // Update and Failure are the topology message types of the enum
+                case DriverMessagePayloadType.Failure:
+                    payload = TopologyMessagePayload.From(data, offset); // NOTE(review): From is declared outside this view - confirm it handles both types
+                    break;
+                default:
+                    throw new IllegalStateException($"Message type {type} not recognized");
+            }
+
+            return new ElasticDriverMessageImpl(destination, payload);
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/FailureMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/FailureMessagePayload.cs
new file mode 100644
index 0000000000..37213fc1e4
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/FailureMessagePayload.cs
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+using System.Collections.Generic;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
+{
+    /// <summary>
+    /// Topology message payload sent by the driver to operators and tagged with
+    /// <see cref="DriverMessagePayloadType.Failure"/>.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal sealed class FailureMessagePayload : TopologyMessagePayload
+    {
+        /// <summary>
+        /// Create a driver message payload containing topology updates.
+        /// </summary>
+        /// <param name="updates">The topology updates</param>
+        /// <param name="subscriptionName">The subscription context for the message</param>
+        /// <param name="operatorId">The id of the operator receiving the topology update</param>
+        /// <param name="iteration">The iteration in which the update takes effect</param>
+        public FailureMessagePayload(List<TopologyUpdate> updates, string subscriptionName, int operatorId, int iteration)
+            : base(DriverMessagePayloadType.Failure, updates, subscriptionName, operatorId, iteration)
+        {
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/GroupCommunicationMessage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/GroupCommunicationMessage.cs
new file mode 100644
index 0000000000..36e16adc72
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/GroupCommunicationMessage.cs
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+using System;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
+{
+    /// <summary>
+    /// Base class of the messages exchanged by Group Communication operators.
+    /// Every message records the subscription and the operator it originates from.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    public abstract class GroupCommunicationMessage : ICloneable
+    {
+        /// <summary>
+        /// Initializes the subscription name and operator id of the message.
+        /// </summary>
+        /// <param name="subscriptionName">The name of the subscription</param>
+        /// <param name="operatorId">The id of the operator sending the message</param>
+        protected GroupCommunicationMessage(string subscriptionName, int operatorId)
+        {
+            OperatorId = operatorId;
+            SubscriptionName = subscriptionName;
+        }
+
+        /// <summary>
+        /// The name of the subscription the message belongs to.
+        /// </summary>
+        internal string SubscriptionName { get; }
+
+        /// <summary>
+        /// The id of the operator that sent the message.
+        /// </summary>
+        internal int OperatorId { get; }
+
+        /// <summary>
+        /// Clone the message.
+        /// </summary>
+        /// <returns>An object containing the shallow copy of the message.</returns>
+        public abstract object Clone();
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs
new file mode 100644
index 0000000000..aeb6b34efc
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Tang.Exceptions;
+using Org.Apache.REEF.Utilities;
+using Org.Apache.REEF.Utilities.Attributes;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
+{
+    /// <summary>
+    /// Messages sent by the driver to operators.
+    /// This message contains information for the destination node on the topology.
+    /// Wire layout written by <see cref="Serialize"/> and read back by <see cref="From"/>:
+    /// [int32 byte length of the updates][updates][int32 subscription length]
+    /// [subscription bytes][int32 operator id][int32 iteration].
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal class TopologyMessagePayload : DriverMessagePayload
+    {
+        /// <summary>
+        /// Create a driver message payload containing topology updates.
+        /// </summary>
+        /// <param name="type">The payload type the message is tagged with</param>
+        /// <param name="updates">The topology updates</param>
+        /// <param name="subscriptionName">The subscription context for the message</param>
+        /// <param name="operatorId">The id of the operator receiving the topology update</param>
+        /// <param name="iteration">The iteration in which the update takes effect</param>
+        public TopologyMessagePayload(DriverMessagePayloadType type, List<TopologyUpdate> updates, string subscriptionName, int operatorId, int iteration)
+            : base(subscriptionName, operatorId, iteration)
+        {
+            PayloadType = type;
+            TopologyUpdates = updates;
+        }
+
+        /// <summary>
+        /// Clone the message.
+        /// Every <see cref="TopologyUpdate"/> is re-created, but each copy still references
+        /// the children list of the update it was copied from.
+        /// </summary>
+        /// <returns>An object containing the shallow copy of the message.</returns>
+        public override object Clone()
+        {
+            var updatesClone = new List<TopologyUpdate>();
+
+            foreach (var update in TopologyUpdates)
+            {
+                // Add the new copy. The previous code created it but then added the
+                // original update, so the clone shared its TopologyUpdate instances.
+                var clone = new TopologyUpdate(update.Node, update.Children, update.Root);
+                updatesClone.Add(clone);
+            }
+
+            return TopologyMessageBuilder(PayloadType, updatesClone, SubscriptionName, OperatorId, Iteration);
+        }
+
+        /// <summary>
+        /// The updates for the topology.
+        /// </summary>
+        internal List<TopologyUpdate> TopologyUpdates { get; private set; }
+
+        /// <summary>
+        /// Creates a topology message payload out of memory buffer.
+        /// </summary>
+        /// <param name="type">The payload type of the serialized message</param>
+        /// <param name="data">The buffer containing a serialized message payload</param>
+        /// <param name="offset">The offset where to start the deserialization process</param>
+        /// <returns>A topology message payload</returns>
+        internal static DriverMessagePayload From(DriverMessagePayloadType type, byte[] data, int offset = 0)
+        {
+            int length = BitConverter.ToInt32(data, offset);
+            offset += sizeof(int);
+            List<TopologyUpdate> updates = TopologyUpdate.Deserialize(data, length, offset);
+            offset += length;
+
+            length = BitConverter.ToInt32(data, offset);
+            offset += sizeof(int);
+            string subscription = ByteUtilities.ByteArraysToString(data, offset, length);
+            offset += length;
+            int operatorId = BitConverter.ToInt32(data, offset);
+            offset += sizeof(int);
+            int iteration = BitConverter.ToInt32(data, offset);
+
+            return TopologyMessageBuilder(type, updates, subscription, operatorId, iteration);
+        }
+
+        /// <summary>
+        /// Utility method to serialize the payload for communication.
+        /// </summary>
+        /// <returns>The serialized payload</returns>
+        internal override byte[] Serialize()
+        {
+            byte[] subscriptionBytes = ByteUtilities.StringToByteArrays(SubscriptionName);
+            int offset = 0;
+            var totalLengthUpdates = TopologyUpdates.Sum(x => x.Size);
+
+            // One int per length prefix plus the operator id and the iteration.
+            // The previous size also reserved sizeof(bool) for a field that is never written.
+            byte[] buffer = new byte[sizeof(int) + totalLengthUpdates + sizeof(int) + subscriptionBytes.Length + sizeof(int) + sizeof(int)];
+
+            Buffer.BlockCopy(BitConverter.GetBytes(totalLengthUpdates), 0, buffer, offset, sizeof(int));
+            offset += sizeof(int);
+
+            TopologyUpdate.Serialize(buffer, ref offset, TopologyUpdates);
+
+            Buffer.BlockCopy(BitConverter.GetBytes(subscriptionBytes.Length), 0, buffer, offset, sizeof(int));
+            offset += sizeof(int);
+            Buffer.BlockCopy(subscriptionBytes, 0, buffer, offset, subscriptionBytes.Length);
+            offset += subscriptionBytes.Length;
+            Buffer.BlockCopy(BitConverter.GetBytes(OperatorId), 0, buffer, offset, sizeof(int));
+            offset += sizeof(int);
+            Buffer.BlockCopy(BitConverter.GetBytes(Iteration), 0, buffer, offset, sizeof(int));
+
+            return buffer;
+        }
+
+        /// <summary>
+        /// Instantiates the payload subclass matching the given payload type.
+        /// </summary>
+        /// <exception cref="IllegalStateException">If the type is neither Update nor Failure</exception>
+        private static DriverMessagePayload TopologyMessageBuilder(DriverMessagePayloadType type, List<TopologyUpdate> updates, string subscriptionName, int operatorId, int iteration)
+        {
+            switch (type)
+            {
+                case DriverMessagePayloadType.Update:
+                    return new UpdateMessagePayload(updates, subscriptionName, operatorId, iteration);
+                case DriverMessagePayloadType.Failure:
+                    return new FailureMessagePayload(updates, subscriptionName, operatorId, iteration);
+                default:
+                    throw new IllegalStateException($"Topology message type {type} not found.");
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyUpdate.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyUpdate.cs
new file mode 100644
index 0000000000..05b81bfa0c
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyUpdate.cs
@@ -0,0 +1,196 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more
contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities;
+using Org.Apache.REEF.Utilities.Attributes;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
+{
+    /// <summary>
+    /// Class defining the updates of the topology for a running task.
+    /// Serialized form of one update:
+    /// [int32 node length][node bytes][int32 number of children]
+    /// ([int32 child length][child bytes])*[int32 root length][root bytes].
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal sealed class TopologyUpdate
+    {
+        /// <summary>
+        /// Create an update for a node containing both the list of children and the root node.
+        /// </summary>
+        /// <param name="node">The node receiving the update</param>
+        /// <param name="children">The update to the children of the node</param>
+        /// <param name="root">The update for the root of the node</param>
+        public TopologyUpdate(string node, List<string> children, string root)
+        {
+            Node = node;
+            Children = children;
+            Root = root;
+        }
+
+        /// <summary>
+        /// Create an update for a node containing only the list of children.
+        /// </summary>
+        /// <param name="node">The node receiving the update</param>
+        /// <param name="children">The update to the children of the node</param>
+        public TopologyUpdate(string node, List<string> children) : this(node, children, string.Empty)
+        {
+        }
+
+        /// <summary>
+        /// Create an update for a node containing only the root node.
+        /// </summary>
+        /// <param name="node">The node receiving the update</param>
+        /// <param name="root">The update for the root of the node</param>
+        public TopologyUpdate(string node, string root) : this(node, new List<string>(), root)
+        {
+        }
+
+        /// <summary>
+        /// The node receiving the update.
+        /// </summary>
+        public string Node { get; private set; }
+
+        /// <summary>
+        /// The updates for the children.
+        /// </summary>
+        public List<string> Children { get; set; }
+
+        /// <summary>
+        /// The updates for the root.
+        /// </summary>
+        public string Root { get; private set; }
+
+        /// <summary>
+        /// The total memory size for the update (used for serialization).
+        /// Sizes are measured on the encoded bytes, i.e. exactly what
+        /// <see cref="Serialize"/> writes, not on the number of characters.
+        /// </summary>
+        public int Size
+        {
+            get
+            {
+                // 1 int for the size of node + the encoded node
+                // 1 int for the number of children
+                // 1 int for the length of each child + the encoded child
+                // 1 int for the size of root + the encoded root (0 bytes when null or empty)
+                var nodeSize = sizeof(int) + EncodedLength(Node);
+                var childrenSize = sizeof(int) + Children.Sum(x => sizeof(int) + EncodedLength(x));
+                var rootSize = sizeof(int) + EncodedLength(Root);
+
+                return nodeSize + childrenSize + rootSize;
+            }
+        }
+
+        /// <summary>
+        /// Serialize the update.
+        /// </summary>
+        /// <param name="buffer">The memory space where to copy the serialized update</param>
+        /// <param name="offset">Where to start writing in the buffer; advanced past the written bytes</param>
+        /// <param name="updates">The updates to serialize</param>
+        internal static void Serialize(byte[] buffer, ref int offset, List<TopologyUpdate> updates)
+        {
+            foreach (var value in updates)
+            {
+                WriteString(buffer, ref offset, value.Node);
+
+                WriteInt(buffer, ref offset, value.Children.Count);
+                foreach (var child in value.Children)
+                {
+                    WriteString(buffer, ref offset, child);
+                }
+
+                // A null root is written as a zero-length string.
+                WriteString(buffer, ref offset, value.Root);
+            }
+        }
+
+        /// <summary>
+        /// Deserialize the update.
+        /// </summary>
+        /// <param name="data">The memory space where to fetch the serialized updates</param>
+        /// <param name="totLength">The total memory size of the serialized updates</param>
+        /// <param name="start">Where to start reading in the buffer</param>
+        /// <returns>The deserialized updates</returns>
+        internal static List<TopologyUpdate> Deserialize(byte[] data, int totLength, int start)
+        {
+            var result = new List<TopologyUpdate>();
+            var offset = 0;
+
+            while (offset < totLength)
+            {
+                int length = BitConverter.ToInt32(data, start + offset);
+                offset += sizeof(int);
+                string node = ByteUtilities.ByteArraysToString(data, start + offset, length);
+                offset += length;
+
+                int num = BitConverter.ToInt32(data, start + offset);
+                offset += sizeof(int);
+                var children = new List<string>();
+                for (int i = 0; i < num; i++)
+                {
+                    length = BitConverter.ToInt32(data, start + offset);
+                    offset += sizeof(int);
+                    children.Add(ByteUtilities.ByteArraysToString(data, start + offset, length));
+                    offset += length;
+                }
+
+                length = BitConverter.ToInt32(data, start + offset);
+                offset += sizeof(int);
+                if (length > 0)
+                {
+                    string root = ByteUtilities.ByteArraysToString(data, start + offset, length);
+                    offset += length;
+                    result.Add(new TopologyUpdate(node, children, root));
+                }
+                else
+                {
+                    // No root in the stream: the two-argument constructor sets Root to string.Empty.
+                    result.Add(new TopologyUpdate(node, children));
+                }
+            }
+
+            return result;
+        }
+
+        /// <summary>
+        /// Number of bytes the given string occupies once encoded (0 for null or empty).
+        /// </summary>
+        private static int EncodedLength(string value)
+        {
+            return string.IsNullOrEmpty(value) ? 0 : ByteUtilities.StringToByteArrays(value).Length;
+        }
+
+        /// <summary>
+        /// Writes a 4-byte integer at the given offset and advances the offset.
+        /// </summary>
+        private static void WriteInt(byte[] buffer, ref int offset, int value)
+        {
+            Buffer.BlockCopy(BitConverter.GetBytes(value), 0, buffer, offset, sizeof(int));
+            offset += sizeof(int);
+        }
+
+        /// <summary>
+        /// Writes a string as [int32 byte length][encoded bytes] and advances the offset.
+        /// The prefix is the encoded byte length, so it always matches the bytes copied.
+        /// </summary>
+        private static void WriteString(byte[] buffer, ref int offset, string value)
+        {
+            byte[] bytes = string.IsNullOrEmpty(value) ? new byte[0] : ByteUtilities.StringToByteArrays(value);
+            WriteInt(buffer, ref offset, bytes.Length);
+            Buffer.BlockCopy(bytes, 0, buffer, offset, bytes.Length);
+            offset += bytes.Length;
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs
new file mode 100644
index 0000000000..ed371e3cf5
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+using System.Collections.Generic;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
+{
+    /// <summary>
+    /// Topology message payload sent by the driver to operators and tagged with
+    /// <see cref="DriverMessagePayloadType.Update"/>.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal sealed class UpdateMessagePayload : TopologyMessagePayload
+    {
+        /// <summary>
+        /// Create a driver message payload containing topology updates.
+        /// </summary>
+        /// <param name="updates">The topology updates</param>
+        /// <param name="subscriptionName">The subscription context for the message</param>
+        /// <param name="operatorId">The id of the operator receiving the topology update</param>
+        /// <param name="iteration">The iteration in which the update takes effect</param>
+        public UpdateMessagePayload(List<TopologyUpdate> updates, string subscriptionName, int operatorId, int iteration)
+            : base(DriverMessagePayloadType.Update, updates, subscriptionName, operatorId, iteration)
+        {
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Config/ElasticServiceConfigurationOptions.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/ElasticServiceConfigurationOptions.cs
new file mode 100644
index 0000000000..7c32cd563d
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/ElasticServiceConfigurationOptions.cs
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System.Collections.Generic;
+using Org.Apache.REEF.Tang.Annotations;
+
+namespace Org.Apache.REEF.Network.Elastic.Config
+{
+    /// <summary>
+    /// Class wrapping the configuration options for the elastic
+    /// group communication service. Every option is a nested class
+    /// annotated with [NamedParameter] and deriving from Name.
+    /// </summary>
+    // NOTE(review): the generic type argument of every "Name" base class below (and the
+    // dangling "Name>" on SerializedSubscriptionConfigs) appears to have been lost when
+    // this patch was extracted; the intended types cannot be read from here - confirm
+    // against the original commit before applying.
+    public sealed class ElasticServiceConfigurationOptions
+    {
+        [NamedParameter("Number of Evaluators")]
+        public sealed class NumEvaluators : Name
+        {
+        }
+
+        [NamedParameter("Number of Servers")]
+        public sealed class NumServers : Name
+        {
+        }
+
+        [NamedParameter("Number of Workers")]
+        public sealed class NumWorkers : Name
+        {
+        }
+
+        [NamedParameter(Documentation = "Number of retry when a failure occurs", DefaultValue = "1")]
+        public sealed class RetryAfterFailure : Name
+        {
+        }
+
+        [NamedParameter(Documentation = "Starting port for TcpPortProvider", DefaultValue = "8900")]
+        public sealed class StartingPort : Name
+        {
+        }
+
+        [NamedParameter(Documentation = "Port Range count for TcpPortProvider", DefaultValue = "1000")]
+        public sealed class PortRange : Name
+        {
+        }
+
+        [NamedParameter("Driver identifier")]
+        public sealed class DriverId : Name
+        {
+        }
+
+        [NamedParameter("Default Group name", defaultValue: "Subscription1")]
+        public sealed class DefaultSubscriptionName : Name
+        {
+        }
+
+        [NamedParameter("Number of tasks", defaultValue: "5")]
+        public sealed class NumberOfTasks : Name
+        {
+        }
+
+        [NamedParameter("Serialized subscriptions configuration")]
+        public sealed class SerializedSubscriptionConfigs : Name>
+        {
+        }
+
+        [NamedParameter("Timeout after which computation is consider inactive", defaultValue: "600000")]
+        public sealed class Timeout : Name
+        {
+        }
+
+        [NamedParameter("Number of retry to send a message", defaultValue: "50")]
+        public sealed class SendRetry : Name
+        {
+        }
+
+        [NamedParameter("Number of millisecond between each message retry", defaultValue: "1000")]
+        public sealed class RetryWaitTime : Name
+        {
+        }
+
+        [NamedParameter("Number of failures before a task abort the task set", defaultValue: "100")]
+        public sealed class NumTaskFailures : Name
+        {
+        }
+
+        [NamedParameter(Documentation = "Rack name used when a new evaluator is requested after a failure", DefaultValue = "WonderlandRack")]
+        public sealed class NewEvaluatorRackName : Name
+        {
+        }
+
+        [NamedParameter(Documentation = "Batch id used when a new evaluator is requested after a failure", DefaultValue = "IterateBroadcast")]
+        public sealed class NewEvaluatorBatchId : Name
+        {
+        }
+
+        [NamedParameter(Documentation = "Number of cores used when a new evaluator is requested after a failure", DefaultValue = "1")]
+        public sealed class NewEvaluatorNumCores : Name
+        {
+        }
+
+        [NamedParameter(Documentation = "Memory size used when a new evaluator is requested after a failure", DefaultValue = "512")]
+        public sealed class NewEvaluatorMemorySize : Name
+        {
+        }
+
+        [NamedParameter("Number of checkpoints to store per operator", defaultValue: "1")]
+        public sealed class NumCheckpoints : Name
+        {
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Config/OperatorParameters.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/OperatorParameters.cs
new file mode 100644
index 0000000000..010a921720
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/OperatorParameters.cs
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Tang.Annotations;
+using System.Collections.Generic;
+
+namespace Org.Apache.REEF.Network.Elastic.Config
+{
+    /// <summary>
+    /// Class wrapping the configuration option parameters for operators.
+    /// Every option is a nested class annotated with [NamedParameter]
+    /// and deriving from Name.
+    /// </summary>
+    // NOTE(review): the generic type argument of every "Name" base class below (and the
+    // dangling "Name>" on SerializedOperatorConfigs and TopologyChildTaskIds) appears to
+    // have been lost when this patch was extracted; the intended types cannot be read
+    // from here - confirm against the original commit before applying.
+    public sealed class OperatorParameters
+    {
+        [NamedParameter("Operator Name")]
+        public sealed class OperatorType : Name
+        {
+        }
+
+        [NamedParameter("Type of the message")]
+        public sealed class MessageType : Name
+        {
+        }
+
+        [NamedParameter("Operator Id")]
+        public sealed class OperatorId : Name
+        {
+        }
+
+        [NamedParameter("Name of the subscriptions")]
+        public sealed class SubscriptionName : Name
+        {
+        }
+
+        [NamedParameter("Serialized operator configuration")]
+        public sealed class SerializedOperatorConfigs : Name>
+        {
+        }
+
+        [NamedParameter("Request topology update", defaultValue: "false")]
+        public sealed class RequestTopologyUpdate : Name
+        {
+        }
+
+        [NamedParameter("Number of iterations")]
+        public sealed class NumIterations : Name
+        {
+        }
+
+        [NamedParameter("Number of element to scatter for each node", defaultValue: "0")]
+        public sealed class NumScatterElements : Name
+        {
+        }
+
+        [NamedParameter("Iteration number to begin with", defaultValue: "1")]
+        public sealed class StartIteration : Name
+        {
+        }
+
+        [NamedParameter("Master Id")]
+        public sealed class MasterId : Name
+        {
+        }
+
+        [NamedParameter("Checkpoint level", defaultValue: "0")]
+        public sealed class Checkpointing : Name
+        {
+        }
+
+        [NamedParameter("Whether the operator is the last to be executed in the subscription", defaultValue: "false")]
+        public sealed class IsLast : Name
+        {
+        }
+
+        [NamedParameter("Id of root task in operator topology", defaultValue: "-1")]
+        public sealed class TopologyRootTaskId : Name
+        {
+        }
+
+        [NamedParameter("Ids of child tasks in operator topology")]
+        public sealed class TopologyChildTaskIds : Name>
+        {
+        }
+
+        [NamedParameter("Whether topology updates can be piggybacked to data messages", defaultValue: "false")]
+        public sealed class PiggybackTopologyUpdates : Name
+        {
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetService.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetService.cs
index dfd87b358b..d768e7a4e9 100644
--- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetService.cs
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetService.cs
@@ -15,67 +15,89 @@ // specific language governing permissions and limitations // under the License. +using Org.Apache.REEF.Driver.Context; +using Org.Apache.REEF.Network.Elastic.Driver.Impl; using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Tang.Annotations; using Org.Apache.REEF.Tang.Interface; using Org.Apache.REEF.Utilities.Attributes; +using System.Collections.Generic; namespace Org.Apache.REEF.Network.Elastic.Driver { /// - /// Used to create Subscriptions for fault tolerant Task Sets. - /// Also manages configurations for Group Communication operators/services. + /// This is the entry point for enabling the Elastic Group Communication service.
+ /// The workflow is the following: + /// (1) Create a service instance; + /// (2) Use the service to create one or more subscriptions; + /// (3) Use the subscription to create a pipeline of operators representing the + /// communication pattern the tasks should implement; + /// (4) Create one or more managers to manage the scheduling of the tasks + /// (5) Register subscriptions to the manager to properly configure the task set. + /// + /// This interface is mainly used to create subscriptions. + /// Also manages configurations for Elastic Group Communication operators/services. /// [Unstable("0.16", "API may change")] + [DefaultImplementation(typeof(DefaultTaskSetService))] public interface IElasticTaskSetService : IFailureResponse { /// - /// Creates a Subscription with the default settings. + /// Creates a subscription with the default settings. /// The subscription lifecicle is managed by the service. /// - /// A new Task Set Subscription with default parameters + /// A new subscription with default parameters IElasticTaskSetSubscription DefaultTaskSetSubscription(); /// - /// Creates a new Task Set Subscription. + /// Creates a new subscription. /// The subscription lifecicle is managed by the service. /// /// The name of the subscription /// The number of tasks required by the subscription /// An optional failure machine governing the subscription - /// The new Task Set Subscrption + /// The new task Set subscription IElasticTaskSetSubscription NewTaskSetSubscription(string subscriptionName, int numTasks, IFailureStateMachine failureMachine = null); /// - /// Remove a Task Set Subscription from the service. + /// Remove a task Set subscription from the service. /// - /// The name of the subscription + /// The name of the subscription to be removed void RemoveTaskSetSubscription(string subscriptionName); + /// + /// Get the subscriptions names from the context. 
+ /// + /// An activeContext + /// The subscriptions represented in the context + string GetContextSubscriptions(IActiveContext activeContext); + /// /// Generate the service configuration object. - /// This method is used to properly configure the Context with the service. + /// This method is used to properly configure Contexts with the service. /// - /// The Service Configuration + /// The service Configuration IConfiguration GetServiceConfiguration(); /// /// At task submission time the following steps are executed: /// 1) Each subscription the task is registered to generates a task subscription /// 2) Internally each configuration generated by subscriptions contains a configuration entry for each - /// operator defining the subscription. Such operator configurations are serialized using + /// operator defining the subscription pipeline. Such operator configurations are serialized using /// {@link Org.Apache.REEF.Network.Elastic.Driver.IElasticTaskSetService#SerializeOperatorConfiguration} /// 3) Tasks subscriptions are serialized into a configuration - /// 4) The service Task configuration is added to the configuration object containing the serialized subscription confs - /// 5) the Task configuration is merged with the configuraiton object of 4) to generate the final task configuration + /// 4) The service task configuration is added to the configuration object containing the serialized subscription confs + /// 5) the task configuration is merged with the configuration object of 4) to generate the final task configuration /// /// /// - /// Creates a generic Task Configuration object for the tasks registering to the service. + /// Creates a generic task Configuration object for the tasks registering to the service. 
/// /// The configuration of the subscription the task will register to - /// The configuration for the Task with added service parameters + /// The configuration for the task with added service parameters IConfiguration GetTaskConfiguration(ICsConfigurationBuilder subscriptionsConf); + #region Serialization Helpers /// /// Appends a subscription configuration to a configuration builder object. /// @@ -87,9 +109,10 @@ public interface IElasticTaskSetService : IFailureResponse /// /// Append an operator configuration to a configuration builder object. /// - /// The configuration where the operator configuration will be appended to + /// The list where the operator configuration will be appended to /// The operator configuration at hand /// The configuration containing the serialized operator configuration - void SerializeOperatorConfiguration(ref ICsConfigurationBuilder confBuilder, IConfiguration operatorConf); + void SerializeOperatorConfiguration(ref IList serializedOperatorsConfs, IConfiguration operatorConfiguration); + #endregion } -} \ No newline at end of file +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetSubscription.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetSubscription.cs index fbc1c48bd9..6742bf4150 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetSubscription.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetSubscription.cs @@ -16,79 +16,129 @@ // under the License. using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Network.Elastic.Operators.Logical.Impl; using Org.Apache.REEF.Driver.Context; using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.IO.PartitionedData; using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Network.Elastic.Comm; namespace Org.Apache.REEF.Network.Elastic.Driver { /// - /// Used to group operators in logical units. 
- /// All operators in the same Subscription share similar semantics - /// and behaviour under failures. + /// Used to group elastic operators into logical units. + /// All operators in the same subscriptions share similar semantics and behavior + /// under failures. Subscriptions can only be created by a service. /// [Unstable("0.16", "API may change")] - public interface IElasticTaskSetSubscription : IFailureResponse + public interface IElasticTaskSetSubscription : IFailureResponse, ITaskMessageResponse { /// - /// The name of the Subscription. + /// The name of the subscriptions. /// string SubscriptionName { get; } /// - /// The Failure State of the target Subscription. + /// The operator at the beginning of the computation workflow. /// - IFailureState FailureStatus { get; } + ElasticOperator RootOperator { get; } /// - /// The Service managing the Subscription. + /// The failure state of the target subscriptions. + /// + IFailureState FailureState { get; } + + /// + /// The service managing the subscriptions. /// IElasticTaskSetService Service { get; } /// - /// Generates an id to uniquely identify operators in the Subscription. + /// Whether the subscriptions is completed or not. + /// + bool IsCompleted { get; set; } + + /// + /// Whether the subscriptions contains iterations or not. + /// + bool IsIterative { get; set; } + + /// + /// Generates an id to uniquely identify Operators in the subscriptions. /// /// A new unique id int GetNextOperatorId(); /// - /// Finalizes the Subscription. - /// After the Subscription has been finalized, no more operators may + /// Add a partitioned dataset to the subscription. + /// + /// The partitioned dataset + /// Whether the master node should get a partition + void AddDataset(IPartitionedInputDataSet inputDataSet, bool isMasterGettingInputData = false); + + /// + /// Add a set of datasets to the subscription. 
+ /// + /// The configuration for the datasets + /// Whether the master node should get a partition + void AddDataset(IConfiguration[] inputDataSet, bool isMasterGettingInputData = false); + + /// + /// Finalizes the subscriptions. + /// After the subscriptions has been finalized, no more operators can /// be added to the group. /// - /// The same finalized Subscription + /// The same finalized subscriptions IElasticTaskSetSubscription Build(); /// - /// Add a task to the Subscription. - /// The Subscription must have called Build() before adding tasks. + /// Add a task to the subscriptions. + /// The subscriptions must have been built before tasks can be added. /// /// The id of the task to add - /// True if the task is added to the Subscription + /// True if the task is correctly added to the subscriptions bool AddTask(string taskId); /// - /// Decides if the tasks added to the Subscription can be scheduled for execution - /// or not. Method used for implementing different policies for + /// Decides if the tasks added to the subscriptions can be scheduled for execution + /// or not. This method is used for implementing different policies for /// triggering the scheduling of tasks. /// - /// True if the added tasks can be scheduled for execution + /// True if the previously added tasks can be scheduled for execution bool ScheduleSubscription(); /// - /// Whether the input activeContext is the one of the master Task. + /// Whether the input activeContext is the one of the master tasks. /// - /// The active context for the task - /// True if the parameter is the master task's active context + /// The active context of the task + /// True if the input parameter is the master task's active context bool IsMasterTaskContext(IActiveContext activeContext); /// /// Creates the Configuration for the input task. - /// Must be called only after all tasks have been added to the Subscription. + /// Must be called only after all tasks have been added to the subscriptions. 
/// /// The configuration builder the configuration will be appended to - /// The task id of the task that belongs to this Subscription - /// The configuration for the Task with added Subscription informations - void GetTaskConfiguration(ref ICsConfigurationBuilder builder, int taskId); + /// The task id of the task that belongs to this subscription + /// The configuration for the Task with added subscription information + IConfiguration GetTaskConfiguration(ref ICsConfigurationBuilder builder, int taskId); + + /// + /// Given a task id, this method returns the configuration of the task's data partition + /// (if any). + /// + /// The task id of the task we want to retrieve the data partition. + /// The task is required to belong to this subscription + /// The configuration of the data partition (if any) of the task + Optional GetPartitionConf(string taskId); + + /// + /// Retrieve the log of the final statistics of the computation: this is the sum of all + /// the stats of all the Operators composing the subscription. This method can be called + /// only once the subscription is completed. + /// + /// The final statistics for the computation + string LogFinalStatistics(); } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/ITaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/ITaskSetManager.cs index 4d4e8063d8..419475be62 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/ITaskSetManager.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/ITaskSetManager.cs @@ -15,33 +15,32 @@ // specific language governing permissions and limitations // under the License.
-using System.Collections.Generic; using Org.Apache.REEF.Driver.Context; -using Org.Apache.REEF.Tang.Interface; using Org.Apache.REEF.Driver.Evaluator; using Org.Apache.REEF.Driver.Task; using Org.Apache.REEF.Network.Elastic.Failures; using Org.Apache.REEF.Utilities.Attributes; using System; +using System.Collections.Generic; namespace Org.Apache.REEF.Network.Elastic.Driver { /// /// Class defining how groups of tasks sharing similar scheduling semantics are managed. - /// TaskSets subscribe to Subscriptions in order to define tasks logic. - /// TaskSets schedule and manage group of tasks running in the cluster. + /// Task set managers subscribe to subscriptions in order to define tasks logic. + /// Task set managers schedule and manage group of tasks running in the cluster. /// [Unstable("0.16", "API may change")] public interface ITaskSetManager : IFailureResponse, IDisposable { /// /// An identifier for the set of Subscriptions the Task Manager is subscribed to. - /// The Task Set has to be built before retrieving its subscriptions id. + /// The task set has to be built before retrieving its subscriptions id. /// string SubscriptionsId { get; } /// - /// Subscribe the current Task Set to a new Subscription. + /// Subscribe the current task set manager to a new subscription. /// /// The subscription to subscribe to void AddTaskSetSubscription(IElasticTaskSetSubscription subscription); @@ -56,72 +55,89 @@ public interface ITaskSetManager : IFailureResponse, IDisposable /// Method used to generate unique context ids. /// /// The evaluator the context will run on - /// A new unique context id - int GetNextTaskContextId(IAllocatedEvaluator evaluator = null); + /// A new unique context id + /// True if an new context id is sucessufully created + bool TryGetNextTaskContextId(IAllocatedEvaluator evaluator, out string identifier); /// /// Method used to generate unique task ids. 
/// /// The context the task will run on /// A new task id - int GetNextTaskId(IActiveContext context = null); + string GetTaskId(IActiveContext context); /// - /// Finalizes the Task Set. - /// After the Task set has been finalized, no more Subscriptions can be added. + /// Finalizes the task set manager. + /// After the task set has been finalized, no more subscriptions can be added. /// - /// The same finalized Task Set + /// The same finalized task set manager ITaskSetManager Build(); /// - /// Retrieves all Subscriptions having the context passed as a parameter + /// Retrieves all subscriptions having the context passed as a parameter /// as master task context. /// /// The target context - /// A list of Subscriptions having the master task running on context + /// A list of subscriptions having the master task running on context IEnumerable IsMasterTaskContext(IActiveContext context); /// - /// Add a task to the Task Set. - /// The Task Set must have called Build() before adding tasks. + /// Method implementing how the task set manager should react when a new context is active. /// - /// The id of the task to add - /// The current configuration of the task - /// The context the task will run on - void AddTask(string taskId, IConfiguration taskConfig, IActiveContext context); + /// The new active context + void OnNewActiveContext(IActiveContext activeContext); /// - /// Actions to execute when a notification that a task is running is received. + /// Method implementing how the task set manager should react when a notification that a task is running is received. /// /// The running task void OnTaskRunning(IRunningTask task); /// - /// Actions to execute when a notification that a task is completed is received. + /// Method implementing how the task set manager should react when a notification that a task is completed is received. /// /// The completed task void OnTaskCompleted(ICompletedTask task); /// - /// Actions to execute when a task message is received. 
+ /// Method implementing how the task set manager should react when a task message is received. /// /// A message from a task void OnTaskMessage(ITaskMessage message); /// - /// This method contains the logic to trigger when the Task Set execution is completed + /// Whether the input task is managed by this task set manager. /// - bool Done(); + /// The task identifier + bool IsTaskManagedBy(string id); /// - /// Used to react of a failure event occurred on an evaluator. + /// Whether the input context is managed by this task set manager. /// - /// The failed evaluator - void OnEvaluatorFailure(IFailedEvaluator evaluator); + /// The context identifier + bool IsContextManagedBy(string id); + + /// + /// Whether the input evaluator is managed by this task set manager. + /// + /// The evaluator identifier + bool IsEvaluatorManagedBy(string id); /// - /// Contains the logic to trigger when the execution fails. + /// Whether this task set manager is done. /// - void OnFail(); + bool IsCompleted(); + + /// + /// Used to react on a task failure. + /// + /// The failed task + void OnTaskFailure(IFailedTask task); + + /// + /// Used to react to a failure event that occurred on an evaluator. + /// + /// The failed evaluator + void OnEvaluatorFailure(IFailedEvaluator evaluator); } -} \ No newline at end of file +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultTaskSetService.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultTaskSetService.cs new file mode 100644 index 0000000000..4f06c71fdc --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultTaskSetService.cs @@ -0,0 +1,350 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Net; +using Org.Apache.REEF.Common.Io; +using Org.Apache.REEF.Common.Services; +using Org.Apache.REEF.Driver.Task; +using Org.Apache.REEF.Network.Naming; +using Org.Apache.REEF.Network.NetworkService; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Formats; +using Org.Apache.REEF.Tang.Implementations.Tang; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Network.Elastic.Config; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Network.Elastic.Failures.Impl; +using Org.Apache.REEF.Driver.Context; +using Org.Apache.REEF.Network.Elastic.Comm.Impl; +using Org.Apache.REEF.Network.Elastic.Comm; +using Org.Apache.REEF.Wake.Time.Event; +using Org.Apache.REEF.Network.Elastic.Failures.Enum; +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Driver.Impl +{ + /// + /// Default implementation for the task service. + /// This is mainly used to create subscription. + /// Also manages configurations for Elastic Group Communication operators/services. 
+ /// + [Unstable("0.16", "API may change")] + public sealed class DefaultTaskSetService : IElasticTaskSetService, IDefaultFailureEventResponse + { + private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultTaskSetService)); + + private readonly string _driverId; + private readonly int _numEvaluators; + private readonly string _nameServerAddr; + private readonly int _nameServerPort; + private readonly INameServer _nameServer; + private readonly string _defaultSubscriptionName; + private readonly IFailureStateMachine _defaultFailureMachine; + + private readonly Dictionary _subscriptions; + private readonly AvroConfigurationSerializer _configSerializer; + + private readonly object _subsLock = new object(); + private readonly object _statusLock = new object(); + + private IFailureState _failureStatus; + + [Inject] + private DefaultTaskSetService( + [Parameter(typeof(ElasticServiceConfigurationOptions.DriverId))] string driverId, + [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultSubscriptionName))] string defaultSubscriptionName, + [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int numEvaluators, + AvroConfigurationSerializer configSerializer, + INameServer nameServer, + IFailureStateMachine defaultFailureStateMachine) + { + _driverId = driverId; + _numEvaluators = numEvaluators; + _defaultSubscriptionName = defaultSubscriptionName; + _defaultFailureMachine = defaultFailureStateMachine; + + _failureStatus = new DefaultFailureState(); + _configSerializer = configSerializer; + _subscriptions = new Dictionary(); + + _nameServer = nameServer; + IPEndPoint localEndpoint = nameServer.LocalEndpoint; + _nameServerAddr = localEndpoint.Address.ToString(); + _nameServerPort = localEndpoint.Port; + } + + /// + /// Returns a subscription with the default settings (default name and failure machine). 
+ /// + /// A subscription with default settings + public IElasticTaskSetSubscription DefaultTaskSetSubscription() + { + lock (_subsLock) + { + IElasticTaskSetSubscription defaultSubscription; + _subscriptions.TryGetValue(_defaultSubscriptionName, out defaultSubscription); + + if (defaultSubscription == null) + { + defaultSubscription = NewTaskSetSubscription(_defaultSubscriptionName, _numEvaluators, _defaultFailureMachine.Clone(_numEvaluators, (int)DefaultFailureStates.Fail)); + } + + return defaultSubscription; + } + } + + /// + /// Creates a new subscription and registers it with the service under its name. + /// The subscription lifecycle is managed by the service. + /// + /// The name of the subscription (cannot be null or empty, and must not be already registered) + /// The number of tasks required by the subscription (must be greater than 0) + /// An optional failure machine governing the subscription; when null a clone of the default failure machine is used + /// The new task set subscription + public IElasticTaskSetSubscription NewTaskSetSubscription( + string subscriptionName, + int numTasks, + IFailureStateMachine failureMachine = null) + { + if (string.IsNullOrEmpty(subscriptionName)) + { + throw new ArgumentNullException(nameof(subscriptionName), $"{nameof(subscriptionName)} cannot be null or empty."); + } + + if (numTasks <= 0) + { + throw new ArgumentException($"{nameof(numTasks)} is required to be greater than 0."); + } + + lock (_subsLock) + { + if (_subscriptions.ContainsKey(subscriptionName)) + { + throw new ArgumentException($"Subscription {subscriptionName} already registered with the service."); + } + + var subscription = new DefaultTaskSetSubscription( + subscriptionName, + numTasks, + this, + failureMachine ?? _defaultFailureMachine.Clone(numTasks, (int)DefaultFailureStates.Fail)); + _subscriptions[subscriptionName] = subscription; + + return subscription; + } + } + + /// + /// Remove a task Set subscription from the service.
+ /// + /// The name of the subscription to be removed + public void RemoveTaskSetSubscription(string subscriptionName) + { + lock (_subsLock) + { + if (!_subscriptions.ContainsKey(subscriptionName)) + { + throw new ArgumentException($"Subscription {subscriptionName} is not registered with the service."); + } + + _subscriptions.Remove(subscriptionName); + } + } + + /// + /// Get the subscriptions names from the context. + /// + /// An activeContext + /// The subscriptions representented in the context + public string GetContextSubscriptions(IActiveContext activeContext) + { + return Utils.GetContextSubscriptions(activeContext); + } + + /// + /// Generate the service configuration object. + /// This method is used to properly configure Contexts with the service. + /// + /// The service Configuration + public IConfiguration GetServiceConfiguration() + { + IConfiguration serviceConfig = ServiceConfiguration.ConfigurationModule + .Set(ServiceConfiguration.Services, + GenericType>.Class) + .Build(); + + return TangFactory.GetTang().NewConfigurationBuilder(serviceConfig) + .BindNamedParameter( + GenericType.Class, + _nameServerAddr) + .BindNamedParameter( + GenericType.Class, + _nameServerPort.ToString(CultureInfo.InvariantCulture)) + .BindImplementation(GenericType.Class, + GenericType.Class) + + .Build(); + } + + /// + /// Creates a generic task Configuration object for the tasks registering to the service. + /// + /// The configuration of the subscription the task will register to + /// The configuration for the task with added service parameters + public IConfiguration GetTaskConfiguration(ICsConfigurationBuilder subscriptionsConf) + { + return subscriptionsConf + .BindNamedParameter( + GenericType.Class, + _driverId) + .Build(); + } + + /// + /// Appends a subscription configuration to a configuration builder object. 
+ /// + /// The configuration where the subscription configuration will be appended to + /// The subscription configuration at hand + /// The configuration containing the serialized subscription configuration + public void SerializeSubscriptionConfiguration(ref ICsConfigurationBuilder confBuilder, IConfiguration subscriptionConfiguration) + { + confBuilder.BindSetEntry( + GenericType.Class, + _configSerializer.ToString(subscriptionConfiguration)); + } + + /// + /// Append an operator configuration to a configuration builder object. + /// + /// The list where the operator configuration will be appended to + /// The operator configuration at hand + /// The configuration containing the serialized operator configuration + public void SerializeOperatorConfiguration(ref IList serializedOperatorsConfs, IConfiguration operatorConfiguration) + { + serializedOperatorsConfs.Add(_configSerializer.ToString(operatorConfiguration)); + } + + #region Failure Response + /// + /// Used to react on a failure occurred on a task. + /// It gets a failed task as input and in response it produces zero or more failure events. + /// + /// The failed task + /// A list of events encoding the type of actions to be triggered so far + public void OnTaskFailure(IFailedTask value, ref List failureEvents) + { + var task = value.Id; + _nameServer.Unregister(task); + } + + /// + /// Used to react when a timeout event is triggered. + /// It gets a failed task as input and in response it produces zero or more failure events. + /// + /// The alarm triggering the timeput + /// A list of messages encoding how remote Tasks need to reach + /// /// The next timeouts to be scheduled + public void OnTimeout(Alarm alarm, ref List msgs, ref List nextTimeouts) + { + } + + /// + /// When a new failure state is reached, this method is used to dispatch + /// such event to the proper failure mitigation logic. 
+ /// It gets a failure event as input and produces zero or more failure response messages + /// for tasks (appended into the event). + /// + /// The failure event to react upon + public void EventDispatcher(ref IFailureEvent @event) + { + switch ((DefaultFailureStateEvents)@event.FailureEvent) + { + case DefaultFailureStateEvents.Reconfigure: + var rec = @event as IReconfigure; + OnReconfigure(ref rec); + break; + case DefaultFailureStateEvents.Reschedule: + var res = @event as IReschedule; + OnReschedule(ref res); + break; + case DefaultFailureStateEvents.Stop: + var stp = @event as IStop; + OnStop(ref stp); + break; + default: + OnFail(); + break; + } + } + + #endregion + + #region Default Failure event Response + /// + /// Mechanism to execute when a reconfigure event is triggered. + /// + /// + public void OnReconfigure(ref IReconfigure info) + { + lock (_statusLock) + { + _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReconfigure)); + } + } + + /// + /// Mechanism to execute when a reschedule event is triggered. + /// + /// + public void OnReschedule(ref IReschedule rescheduleEvent) + { + lock (_statusLock) + { + _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReschedule)); + } + } + + /// + /// Mechanism to execute when a stop event is triggered. + /// + /// + public void OnStop(ref IStop stopEvent) + { + lock (_statusLock) + { + _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.StopAndReschedule)); + } + } + + /// + /// Mechanism to execute when a fail event is triggered. 
+ /// + public void OnFail() + { + lock (_statusLock) + { + _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.Fail)); + } + } + #endregion + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultTaskSetSubscription.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultTaskSetSubscription.cs new file mode 100644 index 0000000000..bef5ccfe59 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultTaskSetSubscription.cs @@ -0,0 +1,471 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +using Org.Apache.REEF.Driver.Task; +using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Network.Elastic.Operators.Logical.Impl; +using System.Threading; +using Org.Apache.REEF.Driver.Context; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Network.Elastic.Failures.Impl; +using System.Collections.Generic; +using Org.Apache.REEF.Network.Elastic.Comm; +using System.Linq; +using Org.Apache.REEF.Wake.Time.Event; +using Org.Apache.REEF.IO.PartitionedData; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Network.Elastic.Failures.Enum; +using System; +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Driver.Impl +{ + /// + /// Used to group elastic operators into logical units. + /// All operators in the same subscriptions share similar semantics and behavior + /// under failures. Subscriptions can only be created by a service. + /// This class is used to create subscriptions able to manage default failure events. + /// + [Unstable("0.16", "API may change")] + public sealed class DefaultTaskSetSubscription : IElasticTaskSetSubscription, IDefaultFailureEventResponse + { + private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultTaskSetSubscription)); + + private bool _finalized; + private volatile bool _scheduled; + + private readonly int _numTasks; + private int _tasksAdded; + private HashSet _missingMasterTasks; + private HashSet _masterTasks; + private readonly IFailureStateMachine _defaultFailureMachine; + + private int _numOperators; + private Optional _datasetConfiguration; + private bool _isMasterGettingInputData; + + private readonly object _tasksLock = new object(); + private readonly object _statusLock = new object(); + + /// + /// Create a new subscription with the input settings. 
+ /// + /// The name of the subscription + /// The number of tasks managed by the subscription + /// The service managing the subscription + /// The failure machine for the subscription + internal DefaultTaskSetSubscription( + string subscriptionName, + int numTasks, + IElasticTaskSetService elasticService, + IFailureStateMachine failureMachine = null) + { + SubscriptionName = subscriptionName; + _finalized = false; + _scheduled = false; + _numTasks = numTasks; + _tasksAdded = 0; + _masterTasks = new HashSet(); + _datasetConfiguration = Optional.Empty(); + IsCompleted = false; + Service = elasticService; + _defaultFailureMachine = failureMachine ?? new DefaultFailureStateMachine(numTasks, DefaultFailureStates.Fail); + FailureState = _defaultFailureMachine.State; + RootOperator = new DefaultEmpty(this, _defaultFailureMachine.Clone()); + + IsIterative = false; + } + + /// + /// The name of the subscriptions. + /// + public string SubscriptionName { get; set; } + + /// + /// The operator at the beginning of the computation workflow. + /// + public ElasticOperator RootOperator { get; private set; } + + /// + /// The service managing the subscriptions. + /// + /// Whether the subscriptions contains iterations or not. + /// + public bool IsIterative { get; set; } + + /// + /// The failure state of the target subscriptions. + /// + public IFailureState FailureState { get; private set; } + + /// + /// Whether the subscriptions is completed or not. + /// + public bool IsCompleted { get; set; } + + /// + /// Generates an id to uniquely identify operators in the subscriptions. + /// + /// A new unique id + public int GetNextOperatorId() + { + return Interlocked.Increment(ref _numOperators); + } + + /// + /// Add a partitioned dataset to the subscription. 
+ /// + /// The partitioned dataset + /// Whether the master node should get a partition + public void AddDataset(IPartitionedInputDataSet inputDataSet, bool isMasterGettingInputData = false) + { + AddDataset(inputDataSet.Select(x => x.GetPartitionConfiguration()).ToArray(), isMasterGettingInputData); + } + + /// + /// Add a set of datasets to the subscription. + /// + /// The configuration for the datasets + /// Whether the master node should get a partition + public void AddDataset(IConfiguration[] inputDataSet, bool isMasterGettingInputData = false) + { + _isMasterGettingInputData = isMasterGettingInputData; + + _datasetConfiguration = Optional.Of(inputDataSet); + } + + /// + /// Finalizes the subscriptions. + /// After the subscriptions has been finalized, no more operators can + /// be added to the group. + /// + /// The same finalized subscriptions + public IElasticTaskSetSubscription Build() + { + if (_finalized == true) + { + throw new IllegalStateException("Subscription cannot be built more than once"); + } + + if (_datasetConfiguration.IsPresent()) + { + var adjust = _isMasterGettingInputData ? 0 : 1; + + if (_datasetConfiguration.Value.Length + adjust < _numTasks) + { + throw new IllegalStateException($"Dataset is smaller than the number of tasks: re-submit with {_datasetConfiguration.Value.Length + adjust} tasks"); + } + } + + RootOperator.GatherMasterIds(ref _masterTasks); + _missingMasterTasks = new HashSet(_masterTasks); + + _finalized = true; + + return this; + } + + /// + /// Add a task to the subscriptions. + /// The subscriptions must have been buit before tasks can be added. 
+ /// + /// The id of the task to add (cannot be null or empty) + /// True if the task is correctly added to the subscriptions + public bool AddTask(string taskId) + { + if (string.IsNullOrEmpty(taskId)) + { + throw new ArgumentException($"{nameof(taskId)} cannot be null or empty."); + } + + if (IsCompleted || (_scheduled && FailureState.FailureState == (int)DefaultFailureStates.Fail)) + { + LOGGER.Log(Level.Warning, "Taskset " + (IsCompleted ? "completed." : "failed.")); + return false; + } + + if (!_finalized) + { + throw new IllegalStateException("Subscription must be finalized before adding tasks."); + } + + lock (_tasksLock) + { + // We don't add a task if eventually we end up by not adding the master task + var tooManyTasks = _tasksAdded >= _numTasks; + var notAddingMaster = _tasksAdded + _missingMasterTasks.Count >= _numTasks && !_missingMasterTasks.Contains(taskId); + + if (!_scheduled && (tooManyTasks || notAddingMaster)) + { + if (tooManyTasks) + { + LOGGER.Log(Level.Warning, $"Already added {_tasksAdded} tasks when total tasks request is {_numTasks}"); + } + + if (notAddingMaster) + { + LOGGER.Log(Level.Warning, $"Already added {_tasksAdded} over {_numTasks} but missing master task(s)"); + } + + return false; + } + + if (!RootOperator.AddTask(taskId)) + { + return true; + } + + _tasksAdded++; + + _missingMasterTasks.Remove(taskId); + + _defaultFailureMachine.AddDataPoints(1, false); + } + + return true; + } + + /// + /// Decides if the tasks added to the subscriptions can be scheduled for execution + /// or not. This method is used for implementing different policies for + /// triggering the scheduling of tasks. + /// + /// True if the previously added tasks can be scheduled for execution + public bool ScheduleSubscription() + { + // Schedule if we reach the number of requested tasks or the subscription contains an iterative pipeline that is ready to be scheduled and the + // policy requested by the user allow early start with ramp up.
+ if (!_scheduled && (_numTasks == _tasksAdded || (IsIterative && _defaultFailureMachine.State.FailureState < (int)DefaultFailureStates.StopAndReschedule && RootOperator.CanBeScheduled()))) + { + _scheduled = true; + + RootOperator.BuildState(); + } + + return _scheduled; + } + + /// + /// Whether the input activeContext is the one of the master tasks. + /// + /// The active context of the task + /// True if the input parameter is the master task's active context + public bool IsMasterTaskContext(IActiveContext activeContext) + { + if (!_finalized) + { + throw new IllegalStateException("Driver must call Build() before checking IsMasterTaskContext."); + } + + int id = Utils.GetContextNum(activeContext); + return _masterTasks.Select(Utils.GetTaskNum).Any(x => x == id); + } + + /// + /// Creates the Configuration for the input task. + /// Must be called only after all tasks have been added to the subscriptions. + /// + /// The configuration builder the configuration will be appended to + /// The task id of the task that belongs to this subscriptions + /// The configuration for the Task with added subscriptions informations + public IConfiguration GetTaskConfiguration(ref ICsConfigurationBuilder builder, int taskId) + { + IList serializedOperatorsConfs = new List(); + builder = builder + .BindNamedParameter( + GenericType.Class, + SubscriptionName); + + RootOperator.GetTaskConfiguration(ref serializedOperatorsConfs, taskId); + + var subConf = builder + .BindList( + GenericType.Class, + serializedOperatorsConfs) + .Build(); + + return subConf; + } + + /// + /// Given a task id, this method returns the configuration of the task's data partition + /// (if any). + /// + /// The task id of the task we wanto to retrieve the data partition. 
+ /// The task is required to belong to this subscription + /// The configuration of the data partition (if any) of the task + public Optional GetPartitionConf(string taskId) + { + if (!_datasetConfiguration.IsPresent() || (_masterTasks.Contains(taskId) && !_isMasterGettingInputData)) + { + return Optional.Empty(); + } + + var index = Utils.GetTaskNum(taskId) - 1; + index = _masterTasks.Count == 0 || _isMasterGettingInputData ? index : index - 1; + + if (index < 0 || index >= _datasetConfiguration.Value.Length) + { + throw new IllegalStateException($"Asking for a not existing partition configuration {index}."); + } + + return Optional.Of(_datasetConfiguration.Value[index]); + } + + /// + /// Retrieve the log of the final statistics of the computation: this is the sum of all + /// the stats of all the Operators composing the subscription. This method can be called + /// only once the subscription is completed. + /// + /// The final statistics for the computation + public string LogFinalStatistics() + { + if (!IsCompleted) + { + throw new IllegalStateException($"Cannot log statistics before Subscription {SubscriptionName} is completed"); + } + + return RootOperator.LogFinalStatistics(); + } + + /// + /// Method triggered when a task to driver message is received. + /// + /// The task message for the operator + /// A list of messages containing the instructions for the task + public void OnTaskMessage(ITaskMessage message, ref List returnMessages) + { + // Messages have to be propagated down to the operators + RootOperator.OnTaskMessage(message, ref returnMessages); + } + + #region Failure Response + /// + /// Used to react when a timeout event is triggered. + /// It gets an alarm as input and in response it produces zero or more messages and follow-up timeouts.
+ /// + /// The alarm triggering the timeput + /// A list of messages encoding how remote Tasks need to reach + /// The next timeouts to be scheduled + public void OnTimeout(Alarm alarm, ref List msgs, ref List nextTimeouts) + { + RootOperator.OnTimeout(alarm, ref msgs, ref nextTimeouts); + } + + /// + /// Used to react on a failure occurred on a task. + /// It gets a failed task as input and in response it produces zero or more failure events. + /// + /// The failed task + /// A list of events encoding the type of actions to be triggered so far + public void OnTaskFailure(IFailedTask task, ref List failureEvents) + { + // Failures have to be propagated down to the operators + RootOperator.OnTaskFailure(task, ref failureEvents); + } + + /// + /// When a new failure state is reached, this method is used to dispatch + /// such event to the proper failure mitigation logic. + /// It gets a failure event as input and produces zero or more failure response messages + /// for tasks (appended into the event). + /// + /// The failure event to react upon + public void EventDispatcher(ref IFailureEvent @event) + { + switch ((DefaultFailureStateEvents)@event.FailureEvent) + { + case DefaultFailureStateEvents.Reconfigure: + var rec = @event as IReconfigure; + OnReconfigure(ref rec); + break; + case DefaultFailureStateEvents.Reschedule: + var res = @event as IReschedule; + OnReschedule(ref res); + break; + case DefaultFailureStateEvents.Stop: + var stp = @event as IStop; + OnStop(ref stp); + break; + default: + OnFail(); + break; + } + + RootOperator.EventDispatcher(ref @event); + } + + #endregion + + #region Default Failure Events Response + + /// + /// Mechanism to execute when a reconfigure event is triggered. 
+ /// + /// The reconfigure event (not inspected by this handler) + public void OnReconfigure(ref IReconfigure reconfigureEvent) + { + lock (_statusLock) + { + FailureState = FailureState.Merge(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReconfigure)); // Merge returns the merged state, so it must be stored back (as OnFail does) + } + } + + /// + /// Mechanism to execute when a reschedule event is triggered: merges ContinueAndReschedule into the failure state. + /// + /// The reschedule event (not inspected by this handler) + public void OnReschedule(ref IReschedule rescheduleEvent) + { + lock (_statusLock) + { + FailureState = FailureState.Merge(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReschedule)); // store the merged state back + } + } + + /// + /// Mechanism to execute when a stop event is triggered: merges StopAndReschedule into the failure state. + /// + /// The stop event (not inspected by this handler) + public void OnStop(ref IStop stopEvent) + { + lock (_statusLock) + { + FailureState = FailureState.Merge(new DefaultFailureState((int)DefaultFailureStates.StopAndReschedule)); // store the merged state back + } + } + + /// + /// Mechanism to execute when a fail event is triggered: merges Fail into the failure state. + /// + public void OnFail() + { + lock (_statusLock) + { + FailureState = FailureState.Merge(new DefaultFailureState((int)DefaultFailureStates.Fail)); + } + } + + #endregion + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/CheckpointLevel.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/CheckpointLevel.cs new file mode 100644 index 0000000000..885b084e21 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/CheckpointLevel.cs @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Failures.Enum +{ + /// + /// Definition of supported checkpointing policies. + /// + [Unstable("0.16", "Policies may change")] + public enum CheckpointLevel : int + { + None = 0, // No checkpointing + + EphemeralMaster = 10, // Checkpointing on the master task, not tolerant to task failures + + EphemeralAll = 11, // Checkpointing on all tasks, not tolerant to task failures + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/DefaultFailureStateEvents.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/DefaultFailureStateEvents.cs new file mode 100644 index 0000000000..83c120985c --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/DefaultFailureStateEvents.cs @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Failures.Enum +{ + /// + /// The list of default failure events triggered by default state chages. + /// + [Unstable("0.16", "The default evens may change")] + public enum DefaultFailureStateEvents : int + { + Continue = 1, + + Reconfigure = 2, + + Reschedule = 3, + + Stop = 4, + + Fail = 5 + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/DefaultFailureStates.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/DefaultFailureStates.cs new file mode 100644 index 0000000000..319e5bb907 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/DefaultFailureStates.cs @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Failures.Enum +{ + /// + /// The default failure states. 
+ /// + [Unstable("0.16", "The default states may change")] + public enum DefaultFailureStates : int + { + Continue = 0, // When a failure is detected, just ignore it and continue the computation + + ContinueAndReconfigure = 1, // When a failure is detected, continue the computation by properly reconfiguring the operators + + ContinueAndReschedule = 2, // When a failure is detected, continue the computation by reconfiguring the operators and try to reschedule the task + + StopAndReschedule = 3, // When a failure is detected, stop the computation and try to reschedule the task + + Fail = 4 // Fail: the highest (i.e., worst) failure state + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointState.cs new file mode 100644 index 0000000000..66c7d56442 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointState.cs @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +using Org.Apache.REEF.Network.Elastic.Comm.Impl; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Failures +{ + /// + /// Interface for a state that is checkpointed. + /// + [Unstable("0.16", "API may change")] + [DefaultImplementation(typeof(DefaultCheckpointState))] + public interface ICheckpointState + { + /// + /// The iteration number for this checkpoint. + /// + int Iteration { get; set; } + + /// + /// The operator id for this checkpoint. + /// + int OperatorId { get; set; } + + /// + /// The subscription name of the checkpoint. + /// + string SubscriptionName { get; set; } + + /// + /// The actual state of the checkpoint. + /// + object State { get; } + + /// + /// Create a new empty checkpoint from the settings of the current one. + /// + /// A checkpoint with no state but with properly set up fields + ICheckpointState Create(object state); + + /// + /// Utility method used to create message out of + /// the checkpoint. This is used when checkpoints need + /// to be sent among nodes to recover computation. + /// + /// A checkpoint ready to be communicated + GroupCommunicationMessage ToMessage(); + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointableState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointableState.cs new file mode 100644 index 0000000000..5f03b0ac95 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointableState.cs @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Failures.Enum; +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Failures +{ + /// + /// Interface for checkpointing some task state. + /// Clients can implement this interface and inject it into operators to save the current task state. + /// The workflow is as follows: + /// 1-Create a checkpointable state either through injection or for an iteration + /// 2-Make an object checkpointable using the MakeCheckpointable. At this point the state is not checkpointed. + /// 3-Create a checkpoint state. + /// + [Unstable("0.16", "API may change")] + public interface ICheckpointableState + { + /// + /// The current checkpoint level. + /// + CheckpointLevel Level { get; } + + /// + /// Make the given input state a checkpointable state. + /// + /// The state that needs to be make checkpointable + void MakeCheckpointable(object state); + + /// + /// Checkpoint the current state. + /// + /// A checkpoint state + ICheckpointState Checkpoint(); + + /// + /// Create a new empty checkpointable state from the current one. 
+ /// + /// An empty checkpointable state + ICheckpointableState Create(); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IDefaultFailureEventResponse.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IDefaultFailureEventResponse.cs new file mode 100644 index 0000000000..8af2ad3f62 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IDefaultFailureEventResponse.cs @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Failures +{ + /// + /// Default failures response interface. + /// The default events are Reconfigure, Reschedule, Stop and Fail. + /// Mechanisms implementing the default failure responses must extend this interface. + /// + [Unstable("0.16", "API may change")] + public interface IDefaultFailureEventResponse + { + /// + /// Mechanism to execute when a reconfigure event is triggered. + /// + /// + void OnReconfigure(ref IReconfigure reconfigureEvent); + + /// + /// Mechanism to execute when a reschedule event is triggered. + /// + /// + void OnReschedule(ref IReschedule rescheduleEvent); + + /// + /// Mechanism to execute when a stop event is triggered. 
+ /// + /// + void OnStop(ref IStop stopEvent); + + /// + /// Mechanism to execute when a fail event is triggered. + /// + void OnFail(); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureEvent.cs index 61cc8ba588..32a592c136 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureEvent.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureEvent.cs @@ -15,25 +15,40 @@ // specific language governing permissions and limitations // under the License. +using Org.Apache.REEF.Network.Elastic.Comm; using Org.Apache.REEF.Utilities.Attributes; -using System; +using System.Collections.Generic; namespace Org.Apache.REEF.Network.Elastic.Failures { /// - /// Interface wrapping an event rised by a transition to a new failure - /// state. The event speicifies which action have to be executed in response + /// Interface wrapping an event raised by a transition to a new failure + /// state. The event specifies which action have to be executed in response /// to the change in the failure state. /// [Unstable("0.16", "API may change")] public interface IFailureEvent { /// - /// The event / action rised by the transition to a new failure state. - /// It is assumed that the result encodes the magnituted of the action, + /// The event / action raised by the transition to the new failure state. + /// It is assumed that the result encodes the magnitude of the action, /// e.g., smaller number, less demanding action. /// - /// A value identifing the magnitued of the event int FailureEvent { get; } + + /// + /// The task id where the failure occurred. + /// + string TaskId { get; } + + /// + /// The operator id where the failure occurred. + /// + int OperatorId { get; } + + /// + /// The response messages generated to react to the failure event. 
+ /// + List FailureResponse { get; } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureResponse.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureResponse.cs index b31f118384..2cc49aec2c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureResponse.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureResponse.cs @@ -16,12 +16,15 @@ // under the License. using Org.Apache.REEF.Driver.Task; +using Org.Apache.REEF.Network.Elastic.Comm; using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Wake.Time.Event; +using System.Collections.Generic; namespace Org.Apache.REEF.Network.Elastic.Failures { /// - /// Entry point for classes expected to be aware and act over failres. + /// Entry point for classes expected to be aware and act over failures. /// Used to propagate failures through operators, subscriptions and the service. /// [Unstable("0.16", "API may change")] @@ -29,16 +32,28 @@ public interface IFailureResponse { /// /// Used to react on a failure occurred on a task. + /// It gets a failed task as input and in response it produces zero or more failure events. /// - /// The failed task - /// The failure state after the notification of the failed task - IFailureState OnTaskFailure(IFailedTask task); + /// The failed task + /// A list of events encoding the type of actions to be triggered so far + void OnTaskFailure(IFailedTask task, ref List failureEvents); /// - /// When a new failure state is rised, this method is used to dispatch + /// Used to react when a timeout event is triggered. + /// It gets a failed task as input and in response it produces zero or more failure events. 
+ /// + /// The alarm triggering the timeput + /// A list of messages encoding how remote Tasks need to reach + /// /// The next timeouts to be scheduled + void OnTimeout(Alarm alarm, ref List msgs, ref List nextTimeouts); + + /// + /// When a new failure state is reached, this method is used to dispatch /// such event to the proper failure mitigation logic. + /// It gets a failure event as input and produces zero or more failure response messages + /// for tasks (appended into the event). /// - /// Notification specifiying the updated failure state - void EventDispatcher(IFailureEvent @event); + /// The failure event to react upon + void EventDispatcher(ref IFailureEvent @event); } -} \ No newline at end of file +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureState.cs index 447dfafe66..4a9f8b8ee9 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureState.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureState.cs @@ -29,7 +29,6 @@ public interface IFailureState /// The current failure state. It is assumed that bigger values mean worst /// failure state. /// - /// A value identifing the failure state int FailureState { get; set; } /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs index 3126f63e24..c0100c0cad 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs @@ -15,21 +15,28 @@ // specific language governing permissions and limitations // under the License. 
-using System; +using Org.Apache.REEF.Network.Elastic.Failures.Impl; +using Org.Apache.REEF.Tang.Annotations; using Org.Apache.REEF.Utilities.Attributes; +using System; namespace Org.Apache.REEF.Network.Elastic.Failures { /// /// Where the decision is made on what to do when a failure happen. - /// A decision is made based on the ration between the initial data points + /// A decision is made based on the ratio between the initial data points /// and how many data points are lost. + /// Decisions are in form of failure states and threshold levels. + /// Failure machines should work as ladders, when some data is lost and the number + /// of available data points move below / above one of the threshold, the state of the + /// machine changes. /// [Unstable("0.16", "API may change")] + [DefaultImplementation(typeof(DefaultFailureStateMachine))] public interface IFailureStateMachine { /// - /// The Machine current failure state. + /// The machine current failure state. /// IFailureState State { get; } @@ -53,40 +60,36 @@ public interface IFailureStateMachine void SetThreashold(IFailureState level, float threshold); /// - /// A utility method for setting multiple threshould at once. + /// A utility method for setting multiple threshold at once. /// - /// Pairs of failure states with realted new threshold + /// Pairs of failure states with related new thresholds void SetThreasholds(Tuple[] weights); /// - /// Add new data point(s) to the Failure Machine. + /// Add new data point(s) to the failure machine. /// This method can be called either at initialization, or when /// new data points becomes available at runtime e.g., after a failure /// is resolved. 
/// /// How many data point to add + /// Whether the data point is new or restored from a previous failed points /// The failure state resulting from the addition of the data points - IFailureState AddDataPoints(int points); + IFailureState AddDataPoints(int points, bool isNew); /// - /// Remove data point(s) from the Failure Machine as a result of a runtime failure. + /// Remove data point(s) from the failure machine as a result of a runtime failure. /// /// How many data point to remove - /// The failure state resulting from the removal of the data points + /// A failure event resulting from the removal of the data points IFailureState RemoveDataPoints(int points); - /// - /// Finalizes the Failure Machine. - /// Once finalized, each newly added data point is considered as resolving a failure. - /// - /// The same finalized Failure Machine - IFailureStateMachine Build(); - /// /// Utility method used to clone the target failure machine. /// Only the thresholds are cloned, while the machine state is not. /// + /// How many data points are avaialble in the new state machine + /// The state from which the new machine should start /// A new failure machine with the same settings - IFailureStateMachine Clone(); + IFailureStateMachine Clone(int initalPoints = 0, int initalState = 0); } -} \ No newline at end of file +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IReconfigure.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IReconfigure.cs new file mode 100644 index 0000000000..264c38e6a3 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IReconfigure.cs @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Driver.Task; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Failures +{ + /// + /// Reconfigure the execution to work with fewer tasks. + /// + [Unstable("0.16", "API may change")] + public interface IReconfigure : IFailureEvent + { + /// + /// The failed task triggering the event. + /// + Optional FailedTask { get; } + + /// + /// The iteration in which the failure is rised. + /// + Optional Iteration { get; set; } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IReschedule.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IReschedule.cs new file mode 100644 index 0000000000..73618a5349 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IReschedule.cs @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System.Collections.Generic; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Failures +{ + /// + /// Reschedule task event. + /// + [Unstable("0.16", "API may change")] + public interface IReschedule : IReconfigure + { + /// + /// The configurations for the subscriptions of the task. + /// + Dictionary> RescheduleTaskConfigurations { get; } + + /// + /// Whether the task should be rescheduled as consequence of this event. + /// + bool Reschedule { get; } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IStop.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IStop.cs new file mode 100644 index 0000000000..b0ef38ebed --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IStop.cs @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Failures +{ + /// + /// Stop task event. + /// + [Unstable("0.16", "API may change")] + public interface IStop : IFailureEvent + { + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ITimeout.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ITimeout.cs new file mode 100644 index 0000000000..649edb6550 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ITimeout.cs @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Wake.Time.Event; + +namespace Org.Apache.REEF.Network.Elastic.Failures +{ + /// + /// Failure event due to a timeout. + /// + [Unstable("0.16", "API may change")] + public interface ITimeout + { + /// + /// Method used to schedule a timer event of the proper type. 
+ /// + /// How long to wait before the timer event is triggered + /// A timer event + Alarm GetAlarm(long timeout); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/CheckpointableImmutableObject.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/CheckpointableImmutableObject.cs new file mode 100644 index 0000000000..9531539bee --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/CheckpointableImmutableObject.cs @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Config; +using Org.Apache.REEF.Network.Elastic.Failures.Enum; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Utilities.Attributes; +using System; + +namespace Org.Apache.REEF.Network.Elastic.Failures +{ + /// + /// Checkpointable state wrapping an immutable object. + /// Since immutable when creating a checkpoint we don't need a copy. 
+ /// + /// + [Unstable("0.16", "API may change")] + public class CheckpointableImmutableObject : ICheckpointableState + { + protected ICheckpointState _checkpoint; + + [Inject] + private CheckpointableImmutableObject( + [Parameter(typeof(OperatorParameters.Checkpointing))] int level, + ICheckpointState checkpoint) : this() + { + Level = (CheckpointLevel)level; + _checkpoint = checkpoint; + } + + /// + /// Basic constructor returning a checkpointable object with default state and level 0; no checkpoint prototype is set. + /// + protected CheckpointableImmutableObject() + { + Level = 0; + State = default; + } + + /// + /// The current checkpoint level. + /// + public CheckpointLevel Level { get; internal set; } + + /// + /// The actual state to checkpoint. + /// + internal T State { get; set; } + + /// + /// Make the given input state a checkpointable state. + /// + /// The state that needs to be made checkpointable; it is cast to the wrapped type + public void MakeCheckpointable(object model) + { + State = (T)model; + } + + /// + /// Checkpoint the current state. Throws ArgumentException for any level other than EphemeralMaster and EphemeralAll. + /// + /// A checkpoint state wrapping the current state (no copy is made) + public virtual ICheckpointState Checkpoint() + { + switch (Level) + { + case CheckpointLevel.EphemeralMaster: + case CheckpointLevel.EphemeralAll: + return _checkpoint.Create(State); + default: + throw new ArgumentException($"Level {Level} not recognized."); + } + } + + /// + /// Create a new empty checkpointable state from the current one.
+ /// + /// The new state keeps the level and the checkpoint prototype of the current one, so that it can be checkpointed as well + /// An empty checkpointable state + public virtual ICheckpointableState Create() + { + return new CheckpointableImmutableObject() + { + Level = Level, _checkpoint = _checkpoint, // without the prototype, Checkpoint() on the new object would dereference null + }; + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultCheckpointState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultCheckpointState.cs new file mode 100644 index 0000000000..ae70923f90 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultCheckpointState.cs @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Comm.Impl; +using Org.Apache.REEF.Network.Elastic.Failures.Enum; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Failures +{ + /// + /// Class wrapping a state that has been checkpointed or is ready to.
+ /// + [Unstable("0.16", "API may change")] + public sealed class DefaultCheckpointState : ICheckpointState + { + [Inject] + private DefaultCheckpointState() + { + Iteration = -1; + OperatorId = -1; + SubscriptionName = string.Empty; + } + + /// + /// The iteration number for this checkpoint. + /// + public int Iteration { get; set; } + + /// + /// The operator id for this checkpoint. + /// + public int OperatorId { get; set; } + + /// + /// The subscription name of the checkpoint. + /// + public string SubscriptionName { get; set; } + + /// + /// The actual state of the checkpoint. + /// + public object State { get; private set; } + + /// + /// Create a new empty checkpoint from the settings of the current one. + /// + /// A checkpoint with no state but with properly set up fields + public ICheckpointState Create(object state) + { + return new DefaultCheckpointState() + { + State = state, + }; + } + + /// + /// Utility method used to create message out of + /// the checkpoint. This is used when checkpoints need + /// to be sent among nodes to recover computation. + /// + /// A checkpoint ready to be communicated + public GroupCommunicationMessage ToMessage() + { + return new CheckpointMessage(this); + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultFailureState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultFailureState.cs new file mode 100644 index 0000000000..a7b9443fcc --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultFailureState.cs @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Network.Elastic.Failures.Enum;
+using Org.Apache.REEF.Utilities.Attributes;
+using System;
+
+namespace Org.Apache.REEF.Network.Elastic.Failures.Impl
+{
+    /// <summary>
+    /// The default implementation for IFailureState.
+    /// The state is stored as an integer; the default constructor uses
+    /// the Continue value of the DefaultFailureStates enum.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    internal sealed class DefaultFailureState : IFailureState
+    {
+        /// <summary>
+        /// Create a default failure state for 0 (Continue).
+        /// </summary>
+        public DefaultFailureState() : this((int)DefaultFailureStates.Continue)
+        {
+        }
+
+        /// <summary>
+        /// Create a default failure state for the input state.
+        /// </summary>
+        /// <param name="state">The input state we want to create a failure state from</param>
+        public DefaultFailureState(int state)
+        {
+            FailureState = state;
+        }
+
+        /// <summary>
+        /// The current failure state. It is assumed that bigger values mean a worse
+        /// failure state.
+        /// </summary>
+        public int FailureState { get; set; }
+
+        /// <summary>
+        /// Merge the current failure state with the one passed as parameter.
+        /// The result carries the larger of the two state values.
+        /// </summary>
+        /// <param name="that">A new failure state</param>
+        /// <returns>The merge of the two failure states</returns>
+        public IFailureState Merge(IFailureState that)
+        {
+            int worst = Math.Max(FailureState, that.FailureState);
+
+            return new DefaultFailureState(worst);
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultFailureStateMachine.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultFailureStateMachine.cs
new file mode 100644
index 0000000000..a4641a4543
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultFailureStateMachine.cs
@@ -0,0 +1,267 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Network.Elastic.Failures.Enum;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Tang.Exceptions;
+using Org.Apache.REEF.Utilities.Attributes;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+///
+/// The default implementation of the failure state machine.
+/// This implementation has 5 states:
+/// - Continue the computation and ignore the failures
+/// - Continue and reconfigure the operators based on the received failures
+/// - Continue, reconfigure the operators while trying to reschedule failed tasks
+/// - Stop the computation and try to reschedule the tasks
+/// - Fail.
+///
+namespace Org.Apache.REEF.Network.Elastic.Failures.Impl
+{
+    [Unstable("0.16", "API may change")]
+    public class DefaultFailureStateMachine : IFailureStateMachine
+    {
+        private readonly object _statusLock;
+
+        // Next (worse) state for each state; Fail has no successor.
+        private readonly SortedDictionary<DefaultFailureStates, DefaultFailureStates> transitionMapUp = new SortedDictionary<DefaultFailureStates, DefaultFailureStates>()
+        {
+            { DefaultFailureStates.Continue, DefaultFailureStates.ContinueAndReconfigure },
+            { DefaultFailureStates.ContinueAndReconfigure, DefaultFailureStates.ContinueAndReschedule },
+            { DefaultFailureStates.ContinueAndReschedule, DefaultFailureStates.StopAndReschedule },
+            { DefaultFailureStates.StopAndReschedule, DefaultFailureStates.Fail }
+        };
+
+        // Previous (better) state for each state; Continue has no predecessor.
+        private readonly SortedDictionary<DefaultFailureStates, DefaultFailureStates> transitionMapDown = new SortedDictionary<DefaultFailureStates, DefaultFailureStates>()
+        {
+            { DefaultFailureStates.ContinueAndReconfigure, DefaultFailureStates.Continue },
+            { DefaultFailureStates.ContinueAndReschedule, DefaultFailureStates.ContinueAndReconfigure },
+            { DefaultFailureStates.StopAndReschedule, DefaultFailureStates.ContinueAndReschedule },
+            { DefaultFailureStates.Fail, DefaultFailureStates.StopAndReschedule }
+        };
+
+        // Failed / total data points ratio above which each state is entered.
+        private readonly IDictionary<DefaultFailureStates, float> transitionWeights = new Dictionary<DefaultFailureStates, float>()
+        {
+            { DefaultFailureStates.ContinueAndReconfigure, 0.0F },
+            { DefaultFailureStates.ContinueAndReschedule, 0.000001F },
+            { DefaultFailureStates.StopAndReschedule, 0.5F },
+            { DefaultFailureStates.Fail, 0.5F }
+        };
+
+        /// <summary>
+        /// Default failure state machine starting with 0 data points and in continue state.
+        /// </summary>
+        [Inject]
+        public DefaultFailureStateMachine() : this(0, DefaultFailureStates.Continue)
+        {
+        }
+
+        /// <summary>
+        /// Default failure state machine starting with a given amount of data points and a given initial state.
+        /// Both the total and the failed data point counters are initialized to <paramref name="initalPoints"/>.
+        /// </summary>
+        /// <param name="initalPoints">The number of initial data points for the machine, 0 by default</param>
+        /// <param name="initalState">The initial state, continue by default</param>
+        public DefaultFailureStateMachine(int initalPoints = 0, DefaultFailureStates initalState = DefaultFailureStates.Continue)
+        {
+            NumOfDataPoints = initalPoints;
+            NumOfFailedDataPoints = initalPoints;
+            State = new DefaultFailureState((int)initalState);
+
+            _statusLock = new object();
+        }
+
+        /// <summary>
+        /// The machine current failure state.
+        /// </summary>
+        public IFailureState State { get; private set; }
+
+        /// <summary>
+        /// The total number of data points the machine was initialized with.
+        /// </summary>
+        public int NumOfDataPoints { get; private set; }
+
+        /// <summary>
+        /// The current number of data points not reachable because of failures.
+        /// </summary>
+        public int NumOfFailedDataPoints { get; private set; }
+
+        /// <summary>
+        /// Add new data point(s) to the failure machine.
+        /// This method can be called either at initialization, or when
+        /// new data points become available at runtime e.g., after a failure
+        /// is resolved.
+        /// </summary>
+        /// <param name="points">How many data points to add</param>
+        /// <param name="isNew">Whether the data points are new or restored from previously failed points</param>
+        /// <returns>The failure state resulting from the addition of the data points</returns>
+        public IFailureState AddDataPoints(int points, bool isNew)
+        {
+            lock (_statusLock)
+            {
+                if (isNew)
+                {
+                    NumOfDataPoints += points;
+                }
+                else
+                {
+                    NumOfFailedDataPoints -= points;
+                }
+                if (State.FailureState > (int)DefaultFailureStates.Continue && State.FailureState <= (int)DefaultFailureStates.Fail)
+                {
+                    float currentRate = (float)NumOfFailedDataPoints / NumOfDataPoints;
+
+                    // Step down one state at a time while the failure rate is below the threshold of the current state.
+                    while (State.FailureState > (int)DefaultFailureStates.Continue && currentRate < transitionWeights[(DefaultFailureStates)State.FailureState])
+                    {
+                        State.FailureState = (int)transitionMapDown[(DefaultFailureStates)State.FailureState];
+                    }
+                }
+
+                return State;
+            }
+        }
+
+        /// <summary>
+        /// Remove data point(s) from the failure machine as a result of a runtime failure.
+        /// </summary>
+        /// <param name="points">How many data points to remove</param>
+        /// <returns>The failure state resulting from the removal of the data points</returns>
+        public IFailureState RemoveDataPoints(int points)
+        {
+            lock (_statusLock)
+            {
+                NumOfFailedDataPoints += points;
+
+                float currentRate = (float)NumOfFailedDataPoints / NumOfDataPoints;
+
+                // Step up one state at a time while the failure rate exceeds the threshold of the next state.
+                while (State.FailureState < (int)DefaultFailureStates.Fail &&
+                    currentRate > transitionWeights[transitionMapUp[(DefaultFailureStates)State.FailureState]])
+                {
+                    State.FailureState = (int)transitionMapUp[(DefaultFailureStates)State.FailureState];
+                }
+
+                return State;
+            }
+        }
+
+        /// <summary>
+        /// Method used to set or update the current threshold connected with
+        /// a target failure state. The assumption is that higher failure states
+        /// have higher thresholds.
+        /// </summary>
+        /// <param name="level">The failure state we want to change</param>
+        /// <param name="threshold">A [0, 1] value specifying when the failure level is reached</param>
+        /// <exception cref="ArgumentException">The level is not a DefaultFailureState or is the Continue state</exception>
+        /// <exception cref="IllegalStateException">The resulting thresholds are not ordered</exception>
+        public void SetThreashold(IFailureState level, float threshold)
+        {
+            if (!(level is DefaultFailureState))
+            {
+                throw new ArgumentException(level.GetType() + " is not DefaultFailureState");
+            }
+
+            if (level.FailureState == (int)DefaultFailureStates.Continue)
+            {
+                throw new ArgumentException("Cannot change the threshold for Continue state");
+            }
+
+            lock (_statusLock)
+            {
+                transitionWeights[(DefaultFailureStates)level.FailureState] = threshold;
+
+                CheckConsistency();
+            }
+        }
+
+        /// <summary>
+        /// A utility method for setting multiple thresholds at once.
+        /// Consistency is checked only after all thresholds have been applied.
+        /// </summary>
+        /// <param name="weights">Pairs of failure states with related new thresholds</param>
+        /// <exception cref="ArgumentException">A level is not a DefaultFailureState or is the Continue state</exception>
+        /// <exception cref="IllegalStateException">The resulting thresholds are not ordered</exception>
+        public void SetThreasholds(Tuple<IFailureState, float>[] weights)
+        {
+            if (!weights.All(weight => weight.Item1 is DefaultFailureState))
+            {
+                throw new ArgumentException("Input is not of type DefaultFailureState");
+            }
+
+            if (weights.Any(weight => weight.Item1.FailureState == (int)DefaultFailureStates.Continue))
+            {
+                throw new ArgumentException("Cannot change the threshold for Continue state");
+            }
+
+            lock (_statusLock)
+            {
+                foreach (Tuple<IFailureState, float> weight in weights)
+                {
+                    transitionWeights[(DefaultFailureStates)weight.Item1.FailureState] = weight.Item2;
+                }
+
+                CheckConsistency();
+            }
+        }
+
+        /// <summary>
+        /// Utility method used to clone the target failure machine.
+        /// Only the thresholds are cloned, while the machine state is not.
+        /// </summary>
+        /// <param name="initalPoints">How many data points are available in the new state machine</param>
+        /// <param name="initalState">The state from which the new machine should start</param>
+        /// <returns>A new failure machine with the same settings</returns>
+        public IFailureStateMachine Clone(int initalPoints = 0, int initalState = (int)DefaultFailureStates.Continue)
+        {
+            var newMachine = new DefaultFailureStateMachine(initalPoints, (DefaultFailureStates)initalState);
+            Tuple<IFailureState, float>[] thresholds;
+
+            lock (_statusLock)
+            {
+                thresholds = transitionWeights
+                    .Select(weight => Tuple.Create<IFailureState, float>(new DefaultFailureState((int)weight.Key), weight.Value))
+                    .ToArray();
+            }
+
+            // Copy all the thresholds in one batch: setting them one at a time would validate
+            // each intermediate mix of old defaults and new values, and could reject a valid configuration.
+            newMachine.SetThreasholds(thresholds);
+
+            return newMachine;
+        }
+
+        /// <summary>
+        /// Check that the thresholds are consistent, i.e., that they never decrease when moving
+        /// from ContinueAndReconfigure up to Fail (included).
+        /// </summary>
+        /// <exception cref="IllegalStateException">A state has a threshold lower than the previous state</exception>
+        private void CheckConsistency()
+        {
+            lock (_statusLock)
+            {
+                var state = DefaultFailureStates.ContinueAndReconfigure;
+                float prevWeight = transitionWeights[state];
+
+                // Walk the whole chain; Fail is the last state and has no successor in transitionMapUp.
+                while (state != DefaultFailureStates.Fail)
+                {
+                    state = transitionMapUp[state];
+                    float nextWeight = transitionWeights[state];
+
+                    if (nextWeight < prevWeight)
+                    {
+                        throw new IllegalStateException($"State {transitionMapDown[state]} weight is bigger than state {state}.");
+                    }
+
+                    prevWeight = nextWeight;
+                }
+            }
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/FailEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/FailEvent.cs
new file mode 100644
index 0000000000..313cbfa7c4
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/FailEvent.cs
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Network.Elastic.Comm;
+using Org.Apache.REEF.Network.Elastic.Failures.Enum;
+using Org.Apache.REEF.Utilities.Attributes;
+using System.Collections.Generic;
+
+namespace Org.Apache.REEF.Network.Elastic.Failures.Impl
+{
+    /// <summary>
+    /// Fail the current execution.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    public class FailEvent : IFailureEvent
+    {
+        /// <summary>
+        /// Constructor for the fail event.
+        /// </summary>
+        /// <param name="taskId">The identifier of the task triggering the failure</param>
+        public FailEvent(string taskId)
+        {
+            TaskId = taskId;
+            FailureResponse = new List<IElasticDriverMessage>();
+        }
+
+        /// <summary>
+        /// The event / action raised by the transition to the new failure state.
+        /// </summary>
+        public int FailureEvent
+        {
+            get { return (int)DefaultFailureStateEvents.Fail; }
+        }
+
+        /// <summary>
+        /// The identifier of the task triggering the event.
+        /// </summary>
+        public string TaskId { get; private set; }
+
+        /// <summary>
+        /// The operator id in which the failure is raised.
+        /// Always -1 for a fail event.
+        /// </summary>
+        public int OperatorId
+        {
+            get { return -1; }
+        }
+
+        /// <summary>
+        /// Messages implementing the response from the driver to the tasks
+        /// to reconfigure the computation. Empty at construction.
+        /// </summary>
+        public List<IElasticDriverMessage> FailureResponse { get; private set; }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/OperatorException.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/OperatorException.cs
new file mode 100644
index 0000000000..07c2f01315
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/OperatorException.cs
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+using System;
+using System.Runtime.Serialization;
+
+namespace Org.Apache.REEF.Network.Elastic.Failures.Impl
+{
+    /// <summary>
+    /// A serializable exception that represents a task operator error.
+    /// </summary>
+    [Serializable]
+    [Unstable("0.16", "API may change")]
+    public class OperatorException : Exception, ISerializable
+    {
+        public readonly int _id;
+        public readonly string _additionalInfo;
+
+        /// <summary>
+        /// Constructor. A serializable exception object that represents a task operator error.
+        /// All the operator related errors should be captured in this type of exception in order
+        /// to be properly handled by the elastic framework.
+        /// </summary>
+        /// <param name="message">The exception message</param>
+        /// <param name="id">The id of the operator where the exception is triggered</param>
+        public OperatorException(string message, int id)
+            : base(GetMessagePrefix(id) + message)
+        {
+            _id = id;
+        }
+
+        /// <summary>
+        /// Constructor. A serializable exception object that represents a task operator error and wraps an inner exception.
+        /// </summary>
+        /// <param name="message">The exception message</param>
+        /// <param name="id">The id of the operator where the exception is triggered</param>
+        /// <param name="innerException">Inner exception</param>
+        public OperatorException(string message, int id, Exception innerException)
+            : base(GetMessagePrefix(id) + message, innerException)
+        {
+            _id = id;
+        }
+
+        /// <summary>
+        /// Constructor. A serializable exception object that represents a task operator error and wraps an inner exception
+        /// plus some additional operator specific information.
+        /// </summary>
+        /// <param name="message">The exception message</param>
+        /// <param name="id">The id of the operator where the exception is triggered</param>
+        /// <param name="innerException">Inner exception</param>
+        /// <param name="info">Additional operator specific information on the failure</param>
+        public OperatorException(string message, int id, Exception innerException, string info)
+            : base(GetMessagePrefix(id) + message, innerException)
+        {
+            _id = id;
+            _additionalInfo = info;
+        }
+
+        /// <summary>
+        /// Constructor that generates an operator exception from a serialized buffer.
+        /// </summary>
+        /// <param name="info">The buffer containing the exception information</param>
+        /// <param name="context">The streaming context</param>
+        public OperatorException(SerializationInfo info, StreamingContext context)
+            : base(info, context)
+        {
+            _id = info.GetInt32("id");
+            _additionalInfo = info.GetString("info");
+        }
+
+        /// <summary>
+        /// The identifier of the operator throwing the exception.
+        /// </summary>
+        public int OperatorId
+        {
+            get { return _id; }
+        }
+
+        /// <summary>
+        /// Some additional info for the exception. Null when the exception
+        /// was built with a constructor that takes no additional info.
+        /// </summary>
+        public string AdditionalInfo
+        {
+            get { return _additionalInfo; }
+        }
+
+        /// <summary>
+        /// Serialize the exception. Overrides (rather than hides) the base implementation, so the
+        /// operator id and additional info are written also when the method is invoked through
+        /// an <see cref="Exception"/> reference.
+        /// </summary>
+        /// <param name="info">The buffer where to add the exception information</param>
+        /// <param name="context">The streaming context</param>
+        public override void GetObjectData(SerializationInfo info, StreamingContext context)
+        {
+            base.GetObjectData(info, context);
+            info.AddValue("id", _id, typeof(int));
+            info.AddValue("info", _additionalInfo, typeof(string));
+        }
+
+        private static string GetMessagePrefix(int id)
+        {
+            return "Operator " + id + " : ";
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/ReconfigureEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/ReconfigureEvent.cs
new file mode 100644
index 0000000000..0752377791
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/ReconfigureEvent.cs
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Driver.Task;
+using Org.Apache.REEF.Network.Elastic.Comm;
+using Org.Apache.REEF.Network.Elastic.Failures.Enum;
+using Org.Apache.REEF.Utilities;
+using Org.Apache.REEF.Utilities.Attributes;
+using System.Collections.Generic;
+
+namespace Org.Apache.REEF.Network.Elastic.Failures.Impl
+{
+    /// <summary>
+    /// Reconfigure the execution to work with fewer tasks.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    public class ReconfigureEvent : IReconfigure
+    {
+        /// <summary>
+        /// Constructor for a reconfigure event.
+        /// The task identifier is taken from the failed task; the iteration starts empty.
+        /// </summary>
+        /// <param name="failedTask">The failed task</param>
+        /// <param name="opertorId">The operator identifier in which the event was detected</param>
+        public ReconfigureEvent(IFailedTask failedTask, int opertorId)
+        {
+            FailedTask = Optional<IFailedTask>.Of(failedTask);
+            OperatorId = opertorId;
+            FailureResponse = new List<IElasticDriverMessage>();
+            Iteration = Optional<int>.Empty();
+            TaskId = failedTask.Id;
+        }
+
+        /// <summary>
+        /// The event / action raised by the transition to the new failure state.
+        /// </summary>
+        public int FailureEvent
+        {
+            get { return (int)DefaultFailureStateEvents.Reconfigure; }
+        }
+
+        /// <summary>
+        /// The failed task triggering the event.
+        /// </summary>
+        public Optional<IFailedTask> FailedTask { get; set; }
+
+        /// <summary>
+        /// The iteration in which the failure is raised.
+        /// </summary>
+        public Optional<int> Iteration { get; set; }
+
+        /// <summary>
+        /// The identifier of the task triggering the event.
+        /// </summary>
+        public string TaskId { get; private set; }
+
+        /// <summary>
+        /// The operator id in which the failure is raised.
+        /// </summary>
+        public int OperatorId { get; private set; }
+
+        /// <summary>
+        /// The response message generated to react to the failure event.
+        /// </summary>
+        public List<IElasticDriverMessage> FailureResponse { get; private set; }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/RescheduleEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/RescheduleEvent.cs
new file mode 100644
index 0000000000..c6c74698f3
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/RescheduleEvent.cs
@@ -0,0 +1,96 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using System.Collections.Generic;
+using Org.Apache.REEF.Driver.Context;
+using Org.Apache.REEF.Driver.Task;
+using Org.Apache.REEF.Utilities;
+using Org.Apache.REEF.Network.Elastic.Comm;
+using Org.Apache.REEF.Tang.Interface;
+using Org.Apache.REEF.Network.Elastic.Failures.Enum;
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Failures.Impl
+{
+    /// <summary>
+    /// Reconfigure the execution to work with fewer tasks and simultaneously try to
+    /// reschedule a new task.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    public class RescheduleEvent : IReschedule
+    {
+        /// <summary>
+        /// Constructor for the reschedule event.
+        /// The operator id is set to -1; failed task and iteration start empty.
+        /// </summary>
+        /// <param name="taskId">The identifier of the task triggering the failure event</param>
+        public RescheduleEvent(string taskId)
+        {
+            TaskId = taskId;
+            OperatorId = -1;
+            FailureResponse = new List<IElasticDriverMessage>();
+            // NOTE(review): the value type of this dictionary is reconstructed, presumably
+            // a list of configurations per subscription name — confirm against IReschedule.
+            RescheduleTaskConfigurations = new Dictionary<string, IList<IConfiguration>>();
+            // Initialized like Iteration, so that readers never observe a null Optional.
+            FailedTask = Optional<IFailedTask>.Empty();
+            Iteration = Optional<int>.Empty();
+        }
+
+        /// <summary>
+        /// The event / action raised by the transition to the new failure state.
+        /// </summary>
+        public int FailureEvent
+        {
+            get { return (int)DefaultFailureStateEvents.Reschedule; }
+        }
+
+        /// <summary>
+        /// The failed task triggering the event. Empty until set.
+        /// </summary>
+        public Optional<IFailedTask> FailedTask { get; set; }
+
+        /// <summary>
+        /// The identifier of the task triggering the event.
+        /// </summary>
+        public string TaskId { get; private set; }
+
+        /// <summary>
+        /// The operator id in which the failure is raised.
+        /// </summary>
+        public int OperatorId { get; private set; }
+
+        /// <summary>
+        /// The iteration in which the failure is raised. Empty until set.
+        /// </summary>
+        public Optional<int> Iteration { get; set; }
+
+        /// <summary>
+        /// The response message generated to react to the failure event.
+        /// </summary>
+        public List<IElasticDriverMessage> FailureResponse { get; private set; }
+
+        /// <summary>
+        /// The configurations for the subscriptions of the task.
+        /// </summary>
+        public Dictionary<string, IList<IConfiguration>> RescheduleTaskConfigurations { get; private set; }
+
+        /// <summary>
+        /// Whether the task should be rescheduled as consequence of this event,
+        /// i.e., whether at least one reschedule configuration is present.
+        /// </summary>
+        public bool Reschedule
+        {
+            get { return RescheduleTaskConfigurations.Count > 0; }
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/StopEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/StopEvent.cs
new file mode 100644
index 0000000000..53605cd5e4
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/StopEvent.cs
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Network.Elastic.Comm;
+using Org.Apache.REEF.Network.Elastic.Failures.Enum;
+using Org.Apache.REEF.Utilities.Attributes;
+using System.Collections.Generic;
+
+namespace Org.Apache.REEF.Network.Elastic.Failures.Impl
+{
+    /// <summary>
+    /// Stop the execution and try to add new tasks.
+    /// </summary>
+    [Unstable("0.16", "API may change")]
+    public class StopEvent : IStop
+    {
+        /// <summary>
+        /// Constructor for the stop event. The operator id is set to -1.
+        /// </summary>
+        /// <param name="taskId">The identifier of the task triggering the failure event</param>
+        public StopEvent(string taskId)
+        {
+            TaskId = taskId;
+            OperatorId = -1;
+            FailureResponse = new List<IElasticDriverMessage>();
+        }
+
+        /// <summary>
+        /// The event / action raised by the transition to the new failure state.
+        /// </summary>
+        public int FailureEvent
+        {
+            get { return (int)DefaultFailureStateEvents.Stop; }
+        }
+
+        /// <summary>
+        /// The identifier of the task triggering the event.
+        /// </summary>
+        public string TaskId { get; private set; }
+
+        /// <summary>
+        /// The operator id in which the failure is raised.
+        /// </summary>
+        public int OperatorId { get; private set; }
+
+        /// <summary>
+        /// The response message generated to react to the failure event.
+        /// </summary>
+        public List<IElasticDriverMessage> FailureResponse { get; private set; }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Constants.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Constants.cs
new file mode 100644
index 0000000000..4d833527f4
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Constants.cs
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Operators
+{
+    ///
+    /// Constants labeling the set of available operators.
+    ///
+    [Unstable("0.16", "Constants may change")]
+    public static class Constants
+    {
+        // One string label per operator kind.
+        public const string Broadcast = "broadcast";
+        public const string Reduce = "reduce";
+        public const string AggregationRing = "aggregation ring";
+        public const string Iterate = "iterate";
+        public const string Scatter = "scatter";
+        public const string Gather = "gather";
+        public const string Empty = "empty";
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/IElasticBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/IElasticBroadcast.cs
new file mode 100644
index 0000000000..6bc4a6da44
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/IElasticBroadcast.cs
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Operators.Physical
+{
+    ///
+    /// Group communication operator used to broadcast messages.
+    ///
+    // NOTE(review): the generic type parameter of this interface and of its base interfaces
+    // appears to have been lost in this copy of the patch — confirm against the original file.
+    [Unstable("0.16", "API may change")]
+    public interface IElasticBroadcast : IElasticTypedOperator, IReceiver, ISender
+    {
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/IElasticBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/IElasticBroadcast.cs
new file mode 100644
index 0000000000..6142ec1c1d
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/IElasticBroadcast.cs
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Utilities.Attributes;
+
+namespace Org.Apache.REEF.Network.Elastic.Operators.Logical
+{
+    ///
+    /// Elastic group communication operator used to broadcast messages.
+    ///
+    [Unstable("0.16", "API may change")]
+    public interface IElasticBroadcast
+    {
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultBroadcast.cs
new file mode 100644
index 0000000000..f85e1729e5
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultBroadcast.cs
@@ -0,0 +1,73 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Tang.Interface;
+using Org.Apache.REEF.Network.Elastic.Failures;
+using Org.Apache.REEF.Tang.Util;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Topology.Logical;
+using Org.Apache.REEF.Network.Elastic.Failures.Enum;
+using Org.Apache.REEF.Utilities.Attributes;
+using Org.Apache.REEF.Network.Elastic.Comm.Impl;
+
+namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Impl
+{
+    /// <summary>
+    /// Driver-side broadcast operator implementation.
+    /// </summary>
+    // NOTE(review): generic type arguments appear to have been lost in this copy of the patch
+    // (see the unbalanced "GenericType>.Class" below) — confirm against the original file.
+    [Unstable("0.16", "API may change")]
+    class DefaultBroadcast : DefaultOneToN, IElasticBroadcast
+    {
+        /// <summary>
+        /// Constructor for a driver-side broadcast operator.
+        /// All arguments are forwarded to the base class; the operator name is set to the broadcast label.
+        /// </summary>
+        /// <param name="senderId">The identifier of the sender task</param>
+        /// <param name="prev">The previous operator in the pipeline</param>
+        /// <param name="topology">The topology for the broadcast operation</param>
+        /// <param name="failureMachine">The failure machine managing the failures for the operator</param>
+        /// <param name="checkpointLevel">The checkpoint level</param>
+        /// <param name="configurations">Additional configurations for the operator</param>
+        public DefaultBroadcast(
+            int senderId,
+            ElasticOperator prev,
+            ITopology topology,
+            IFailureStateMachine failureMachine,
+            CheckpointLevel checkpointLevel,
+            params IConfiguration[] configurations) : base(
+                senderId,
+                prev,
+                topology,
+                failureMachine,
+                checkpointLevel,
+                configurations)
+        {
+            OperatorName = Constants.Broadcast;
+        }
+
+        /// <summary>
+        /// Binding from logical to physical operator.
+        /// </summary>
+        /// <param name="confBuilder">The configuration builder the binding will be added to</param>
+        protected override void PhysicalOperatorConfiguration(ref ICsConfigurationBuilder confBuilder)
+        {
+            confBuilder
+                .BindImplementation(GenericType>.Class, GenericType>.Class)
+                .BindImplementation(GenericType.Class, GenericType>.Class);
+            SetMessageType(typeof(Physical.Impl.DefaultBroadcast), ref confBuilder);
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultEmpty.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultEmpty.cs
new file mode 100644
index 0000000000..76c1102e54
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultEmpty.cs
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Driver.Task; +using Org.Apache.REEF.Network.Elastic.Driver; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Network.Elastic.Topology.Logical.Impl; +using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Utilities.Attributes; +using System.Collections.Generic; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Impl +{ + /// + /// Empty operator implementing the default failure logic. To be used only as the root of a pipeline. + /// + [Unstable("0.16", "API may change")] + class DefaultEmpty : ElasticOperatorWithDefaultDispatcher + { + /// + /// Basic constructor for the empty operator. + /// + /// The subscription the operator is part of + /// The failure machine governing the operator + public DefaultEmpty(IElasticTaskSetSubscription subscription, IFailureStateMachine failureMachine) : + base(subscription, null, new EmptyTopology(), failureMachine) + { + OperatorName = Constants.Empty; + MasterId = 1; + WithinIteration = false; + } + + public override void OnTaskFailure(IFailedTask task, ref List failureEvents) + { + if (_next != null) + { + _next.OnTaskFailure(task, ref failureEvents); + } + } + + /// + /// Logs the current operator state. + /// + protected override void LogOperatorState() + { + } + + /// + /// This method is operator specific and serializes the operator configuration into the input list.
+ /// + /// A list the serialized operator configuration will be appended to + /// The task id of the task that belongs to this operator + protected override void GetOperatorConfiguration(ref IList serializedOperatorsConfs, int taskId) + { + } + + /// + /// Binding from logical to physical operator. + /// + /// The configuration builder the binding will be added to + protected override void PhysicalOperatorConfiguration(ref ICsConfigurationBuilder confBuilder) + { + } + + /// + /// Utility method gathering the set of master task ids of the operators in the current pipeline. + /// + /// The id of the master tasks of the current and successive operators + internal override void GatherMasterIds(ref HashSet masterTasks) + { + if (!_operatorFinalized) + { + throw new IllegalStateException("Operator need to be build before finalizing the subscription"); + } + + if (_next != null) + { + _next.GatherMasterIds(ref masterTasks); + } + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultOneToN.cs new file mode 100644 index 0000000000..fdde6ebc19 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultOneToN.cs @@ -0,0 +1,194 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Network.Elastic.Topology.Logical; +using Org.Apache.REEF.Network.Elastic.Failures.Impl; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Network.Elastic.Comm; +using Org.Apache.REEF.Driver.Task; +using System.Collections.Generic; +using System; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Network.Elastic.Failures.Enum; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Network.Elastic.Comm.Enum; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Impl +{ + /// + /// Generic implementation of an operator having one node sending to N nodes + /// and with default failure behaviour. + /// + [Unstable("0.16", "API may change")] + internal abstract class DefaultOneToN : ElasticOperatorWithDefaultDispatcher + { + private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultOneToN<>)); + + private volatile bool _stop; + + /// + /// Constructor for an operator where one node sends to N nodes and with default + /// failure behavior. + /// + /// The identifier of the task sending the message + /// The previous node in the pipeline + /// The topology the message routing protocol will use + /// The failure machine for this operator + /// The checkpoint level for the operator + /// Additional operator specific configurations + public DefaultOneToN( + int senderId, + ElasticOperator prev, + ITopology topology, + IFailureStateMachine failureMachine, + CheckpointLevel checkpointLevel, + params IConfiguration[] configurations) : base( + null, + prev, + topology, + failureMachine, + checkpointLevel, + configurations) + { + MasterId = senderId; + WithinIteration = prev.WithinIteration; + + _stop = false; + } + + /// + /// Operator specific logic for reacting when a task message is received.
+ /// + /// Incoming message from a task + /// Zero or more reply messages for the task + /// True if the operator has reacted to the task message + protected override bool ReactOnTaskMessage(ITaskMessage message, ref List returnMessages) + { + var msgReceived = (TaskMessageType)BitConverter.ToUInt16(message.Message, 0); + + switch (msgReceived) + { + case TaskMessageType.JoinTopology: + { + var operatorId = BitConverter.ToInt16(message.Message, sizeof(ushort)); + + if (operatorId != _id) + { + return false; + } + + if (!Subscription.IsCompleted && _failureMachine.State.FailureState < (int)DefaultFailureStates.Fail) + { + var taskId = message.TaskId; + LOGGER.Log(Level.Info, $"{taskId} joins the topology for operator {_id}"); + + _topology.AddTask(taskId, _failureMachine); + } + + return true; + } + case TaskMessageType.TopologyUpdateRequest: + { + var operatorId = BitConverter.ToInt16(message.Message, sizeof(ushort)); + + if (operatorId != _id) + { + return false; + } + + LOGGER.Log(Level.Info, $"Received topology update request for {OperatorName} {_id} from {message.TaskId}"); + + if (!_stop) + { + _topology.TopologyUpdateResponse(message.TaskId, ref returnMessages, Optional.Of(_failureMachine)); + } + else + { + LOGGER.Log(Level.Info, $"Operator {OperatorName} is in stopped: Ignoring"); + } + + return true; + } + case TaskMessageType.CompleteSubscription: + { + Subscription.IsCompleted = true; + + return true; + } + + default: + return false; + } + } + + /// + /// Mechanism to execute when a reconfigure event is triggered.
+ /// + /// + public override void OnReconfigure(ref IReconfigure reconfigureEvent) + { + LOGGER.Log(Level.Info, $"Going to reconfigure the {OperatorName} operator"); + + if (_stop) + { + _stop = false; + } + + if (reconfigureEvent.FailedTask.IsPresent()) + { + if (reconfigureEvent.FailedTask.Value.AsError() is OperatorException) + { + var info = Optional.Of(((OperatorException)reconfigureEvent.FailedTask.Value.AsError()).AdditionalInfo); + var msg = _topology.Reconfigure(reconfigureEvent.FailedTask.Value.Id, info, reconfigureEvent.Iteration); + + reconfigureEvent.FailureResponse.AddRange(msg); + } + else + { + var msg = _topology.Reconfigure(reconfigureEvent.FailedTask.Value.Id, Optional.Empty(), reconfigureEvent.Iteration); + + reconfigureEvent.FailureResponse.AddRange(msg); + } + } + } + + /// + /// Mechanism to execute when a reschedule event is triggered. + /// + /// + public override void OnReschedule(ref IReschedule rescheduleEvent) + { + var reconfigureEvent = rescheduleEvent as IReconfigure; + + OnReconfigure(ref reconfigureEvent); + } + + /// + /// Mechanism to execute when a stop event is triggered. + /// + /// + public override void OnStop(ref IStop stopEvent) + { + if (!_stop) + { + _stop = true; + } + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/ElastiOperatorWithDefaultDispatcher.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/ElastiOperatorWithDefaultDispatcher.cs new file mode 100644 index 0000000000..62dc7ed546 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/ElastiOperatorWithDefaultDispatcher.cs @@ -0,0 +1,253 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Driver; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Network.Elastic.Failures.Impl; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Network.Elastic.Topology.Logical; +using System.Collections.Generic; +using Org.Apache.REEF.Driver.Task; +using Org.Apache.REEF.Utilities; +using System; +using Org.Apache.REEF.Network.Elastic.Comm; +using Org.Apache.REEF.Wake.Time.Event; +using Org.Apache.REEF.Network.Elastic.Failures.Enum; +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Impl +{ + /// + /// Abstract operator implementing the default failure logic. + /// This can be used as super class for default operators. + /// + [Unstable("0.16", "API may change")] + internal abstract class ElasticOperatorWithDefaultDispatcher : ElasticOperator, IDefaultFailureEventResponse + { + private static readonly Logger LOGGER = Logger.GetLogger(typeof(ElasticOperatorWithDefaultDispatcher)); + + /// + /// Base constructor for an abstract operator implementing the default failure logic.
+ /// + /// The subscription the operator is part of + /// The previous operator in the pipeline + /// The topology for the operator + /// The failure machine of the operator + /// The checkpoint level for the operator + /// Additional operator specific configurations + protected ElasticOperatorWithDefaultDispatcher( + IElasticTaskSetSubscription subscription, + ElasticOperator prev, ITopology topology, + IFailureStateMachine failureMachine, + CheckpointLevel checkpointLevel = CheckpointLevel.None, + params IConfiguration[] configurations) : + base(subscription, prev, topology, failureMachine, checkpointLevel, configurations) + { + } + + /// + /// Add the broadcast operator to the operator pipeline. + /// + /// The type of messages that the operator will send / receive + /// The id of the sender / root node of the broadcast + /// The topology of the operator + /// The failure state machine of the operator + /// The checkpoint policy for the operator + /// Additional configurations for the operator + /// The same operator pipeline with the added broadcast operator + public override ElasticOperator Broadcast(int senderId, ITopology topology, IFailureStateMachine failureMachine, CheckpointLevel checkpointLevel, params IConfiguration[] configurations) + { + _next = new DefaultBroadcast(senderId, this, topology, failureMachine, checkpointLevel, configurations); + return _next; + } + + /// + /// Used to react on a failure occurred on a task. + /// It gets a failed task as input and in response it produces zero or more failure events.
+ /// + /// The failed task + /// A list of events encoding the type of actions to be triggered so far + public override void OnTaskFailure(IFailedTask task, ref List failureEvents) + { + var failedOperatorId = _id; + + if (task.AsError() is OperatorException) + { + var opException = task.AsError() as OperatorException; + failedOperatorId = opException.OperatorId; + } + else + { + LOGGER.Log(Level.Info, $"Failure from {task.Id} cannot be properly managed: failing"); + failureEvents.Add(new FailEvent(task.Id)); + } + + if (WithinIteration || failedOperatorId <= _id) + { + int lostDataPoints = _topology.RemoveTask(task.Id); + var failureState = _failureMachine.RemoveDataPoints(lostDataPoints); + + switch ((DefaultFailureStates)failureState.FailureState) + { + case DefaultFailureStates.ContinueAndReconfigure: + failureEvents.Add(new ReconfigureEvent(task, _id)); + break; + case DefaultFailureStates.ContinueAndReschedule: + if (failedOperatorId == _id) + { + var @event = new RescheduleEvent(task.Id) + { + FailedTask = Optional.Of(task) + }; + failureEvents.Add(@event); + } + break; + case DefaultFailureStates.StopAndReschedule: + failureEvents.Add(new StopEvent(task.Id)); + break; + case DefaultFailureStates.Fail: + failureEvents.Add(new FailEvent(task.Id)); + break; + default: + LOGGER.Log(Level.Info, $"Failure from {task.Id} requires no action"); + break; + } + + LogOperatorState(); + } + + if (PropagateFailureDownstream() && _next != null) + { + _next.OnTaskFailure(task, ref failureEvents); + } + } + + /// + /// Used to react when a timeout event is triggered. + /// This default implementation only forwards the alarm to the next operator in the pipeline, if any.
+ /// + /// The alarm triggering the timeout + /// A list of messages encoding how remote Tasks need to react + /// The next timeouts to be scheduled + public override void OnTimeout(Alarm alarm, ref List msgs, ref List nextTimeouts) + { + if (_next != null) + { + _next.OnTimeout(alarm, ref msgs, ref nextTimeouts); + } + } + + /// + /// When a new failure state is reached, this method is used to dispatch + /// such event to the proper failure mitigation logic. + /// It gets a failure event as input and produces zero or more failure response messages + /// for tasks (appended into the event). + /// + /// The failure event to react upon + public override void EventDispatcher(ref IFailureEvent @event) + { + if (@event.OperatorId == _id || (@event.OperatorId < 0 && WithinIteration)) + { + switch ((DefaultFailureStateEvents)@event.FailureEvent) + { + case DefaultFailureStateEvents.Reconfigure: + var rec = @event as IReconfigure; + OnReconfigure(ref rec); + break; + case DefaultFailureStateEvents.Reschedule: + var res = @event as IReschedule; + OnReschedule(ref res); + break; + case DefaultFailureStateEvents.Stop: + var stp = @event as IStop; + OnStop(ref stp); + break; + default: + OnFail(); + break; + } + } + + if (_next != null && (@event.OperatorId == -1 || @event.OperatorId > _id)) + { + _next.EventDispatcher(ref @event); + } + } + + /// + /// Mechanism to execute when a reconfigure event is triggered. + /// + /// + public virtual void OnReconfigure(ref IReconfigure reconfigureEvent) + { + } + + /// + /// Mechanism to execute when a reschedule event is triggered. + /// + /// + public virtual void OnReschedule(ref IReschedule rescheduleEvent) + { + } + + /// + /// Mechanism to execute when a stop event is triggered. + /// + /// + public virtual void OnStop(ref IStop stopEvent) + { + } + + /// + /// Mechanism to execute when a fail event is triggered.
+ /// + public virtual void OnFail() + { + } + + /// + /// Returns whether a failure should be propagated to downstream operators or not. + /// + /// True if the failure has to be sent downstream + protected override bool PropagateFailureDownstream() + { + switch (_failureMachine.State.FailureState) + { + case (int)DefaultFailureStates.Continue: + case (int)DefaultFailureStates.ContinueAndReconfigure: + case (int)DefaultFailureStates.ContinueAndReschedule: + return true; + default: + return false; + } + } + + /// + /// Logs the current operator state. + /// + protected override void LogOperatorState() + { + string intro = $"State for Operator {OperatorName} in Subscription {Subscription.SubscriptionName}:\n"; + string topologyState = $"Topology:\n{_topology.LogTopologyState()}\n"; + string failureMachineState = $"Failure State: {(DefaultFailureStates)_failureMachine.State.FailureState}" + + $"\nFailure(s) Reported: {_failureMachine.NumOfFailedDataPoints}/{_failureMachine.NumOfDataPoints}"; + + LOGGER.Log(Level.Info, intro + topologyState + failureMachineState); + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/ElasticOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/ElasticOperator.cs new file mode 100644 index 0000000000..fb942f7b05 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/ElasticOperator.cs @@ -0,0 +1,525 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using Org.Apache.REEF.Driver.Task; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Network.Elastic.Driver; +using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Utilities.Logging; +using System.Globalization; +using Org.Apache.REEF.Tang.Implementations.Tang; +using Org.Apache.REEF.Tang.Util; +using System.Collections.Generic; +using Org.Apache.REEF.Tang.Implementations.Configuration; +using Org.Apache.REEF.Network.Elastic.Topology.Logical; +using Org.Apache.REEF.Network.Elastic.Topology.Logical.Impl; +using Org.Apache.REEF.Network.Elastic.Config; +using Org.Apache.REEF.Network.Elastic.Comm; +using Org.Apache.REEF.Wake.Time.Event; +using System.Linq; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Network.Elastic.Failures.Enum; +using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Impl +{ + /// + /// Basic implementation for logical operators. + /// Each operator is part of a subscription and is parametrized by a topology, a failure + /// state machine and a checkpoint policy. + /// Operators are composed into pipelines. + /// Once a pipeline is finalized, tasks can be added to the operator, which + /// will in turn add the tasks to the topology and the failure state machine. + /// When no more tasks are added, the operator state must be finalized in order to + /// schedule the pipeline for execution.
+ /// + [Unstable("0.16", "API may change")] + public abstract class ElasticOperator : IFailureResponse, ITaskMessageResponse + { + private static readonly Logger LOGGER = Logger.GetLogger(typeof(ElasticOperator)); + + // For the moment we consider only linear sequences (pipelines) of operators (no branching for e.g., joins) + protected ElasticOperator _next = null; + protected readonly ElasticOperator _prev; + + protected readonly IFailureStateMachine _failureMachine; + protected readonly CheckpointLevel _checkpointLevel; + protected readonly ITopology _topology; + protected readonly int _id; + protected readonly IConfiguration[] _configurations; + + protected bool _operatorFinalized; + protected volatile bool _operatorStateFinalized; + protected IElasticTaskSetSubscription _subscription; + + /// + /// Specification for generic elastic operators. + /// + /// The subscription this operator is part of + /// The previous operator in the pipeline + /// The topology of the operator + /// The behavior of the operator under failures + /// The checkpoint policy for the operator + /// Additional configuration parameters + public ElasticOperator( + IElasticTaskSetSubscription subscription, + ElasticOperator prev, + ITopology topology, + IFailureStateMachine failureMachine, + CheckpointLevel checkpointLevel = CheckpointLevel.None, + params IConfiguration[] configurations) + { + _subscription = subscription; + _prev = prev; + _id = Subscription.GetNextOperatorId(); + _topology = topology; + _failureMachine = failureMachine; + _checkpointLevel = checkpointLevel; + _configurations = configurations; + _operatorFinalized = false; + _operatorStateFinalized = false; + + _topology.OperatorId = _id; + _topology.SubscriptionName = Subscription.SubscriptionName; + } + + /// + /// The identifier of the master / coordinator node for this operator. + /// + public int MasterId { get; protected set; } + + /// + /// An operator type specific name.
+ /// + public string OperatorName { get; protected set; } + + /// + /// Whether the current operator is, or is preceded by, an iterator operator. + /// + public bool WithinIteration { get; protected set; } + + /// + /// The subscription this operator is part of. + /// + public IElasticTaskSetSubscription Subscription + { + get + { + if (_subscription == null) + { + if (_prev == null) + { + throw new IllegalStateException("The reference to the parent subscription is lost."); + } + + _subscription = _prev.Subscription; + + return _prev.Subscription; + } + + return _subscription; + } + } + + /// + /// Add an instance of the broadcast operator to the operator pipeline + /// with default failure machine and no checkpointing. + /// + /// The type of messages that the operator will send / receive + /// The topology of the operator + /// Additional configurations for the operator + /// The same operator pipeline with the added broadcast operator + public ElasticOperator Broadcast(TopologyType topology, params IConfiguration[] configurations) + { + return Broadcast(MasterId, GetTopology(topology), _failureMachine.Clone(), CheckpointLevel.None, configurations); + } + + /// + /// Add the broadcast operator to the operator pipeline + /// with default failure machine. + /// + /// The type of messages that the operator will send / receive + /// The topology of the operator + /// The checkpoint policy for the operator + /// Additional configurations for the operator + /// The same operator pipeline with the added broadcast operator + public ElasticOperator Broadcast(TopologyType topology, CheckpointLevel checkpointLevel, params IConfiguration[] configurations) + { + return Broadcast(MasterId, GetTopology(topology), _failureMachine.Clone(), checkpointLevel, configurations); + } + + /// + /// Method triggered when a task to driver message is received. + /// This method eventually propagates the task message through the pipeline.
+ /// + /// The task message for the operator + /// A list of messages containing the instructions for the task + public void OnTaskMessage(ITaskMessage message, ref List returnMessages) + { + var hasReacted = ReactOnTaskMessage(message, ref returnMessages); + + if (!hasReacted && _next != null) + { + _next.OnTaskMessage(message, ref returnMessages); + } + } + + /// + /// Add a task to the operator. + /// The operator must have called Build() before adding tasks. + /// + /// The id of the task to add + /// True if the task is new and is added to the operator + public virtual bool AddTask(string taskId) + { + if (!_operatorFinalized) + { + throw new IllegalStateException("Operator needs to be finalized before adding tasks."); + } + + if (_operatorStateFinalized) + { + throw new IllegalStateException("Task cannot be added to an operator with finalized state."); + } + + var newTask = _topology.AddTask(taskId, _failureMachine); + + if (_next != null) + { + // A task is new if it got added by at least one operator + return _next.AddTask(taskId) || newTask; + } + + return newTask; + } + + /// + /// Finalizes the operator. + /// + /// The same finalized operator + public virtual ElasticOperator Build() + { + if (_operatorFinalized) + { + throw new IllegalStateException("Operator cannot be built more than once."); + } + + if (_prev != null) + { + _prev.Build(); + } + + _operatorFinalized = true; + + return this; + } + + /// + /// Finalizes the operator state. After BuildState, no more tasks can be added + /// to the Operator.
+ /// + /// The same operator with the finalized state + public virtual ElasticOperator BuildState() + { + if (_operatorStateFinalized) + { + throw new IllegalStateException("Operator state cannot be built more than once."); + } + + if (!_operatorFinalized) + { + throw new IllegalStateException("Operator need to be build before finalizing its state."); + } + + if (_next != null) + { + _next.BuildState(); + } + + _topology.Build(); + + LogOperatorState(); + + _operatorStateFinalized = true; + + return this; + } + + /// + /// Whether this is the last iterator in the pipeline. + /// + /// True if this is the last iterator + public virtual bool CheckIfLastIterator() + { + if (_next == null) + { + return true; + } + + return _next.CheckIfLastIterator(); + } + + /// + /// Add the broadcast operator to the operator pipeline. + /// + /// The type of messages that the operator will send / receive + /// The id of the sender / root node of the broadcast + /// The topology of the operator + /// The failure state machine of the operator + /// The checkpoint policy for the operator + /// Additional configurations for the operator + /// The same operator pipeline with the added broadcast operator + public abstract ElasticOperator Broadcast(int senderId, ITopology topology, IFailureStateMachine failureMachine, CheckpointLevel checkpointLevel = CheckpointLevel.None, params IConfiguration[] configurations); + + /// + /// Used to react on a failure occurred on a task. + /// It gets a failed task as input and in response it produces zero or more failure events. + /// + /// The failed task + /// A list of events encoding the type of actions to be triggered so far + public abstract void OnTaskFailure(IFailedTask task, ref List failureEvents); + + /// + /// Used to react when a timeout event is triggered. + /// It gets the triggering alarm as input; messages and next timeouts are returned through the two lists.
+ /// + /// The alarm triggering the timeout + /// A list of messages encoding how remote Tasks need to react + /// The next timeouts to be scheduled + public abstract void OnTimeout(Alarm alarm, ref List msgs, ref List nextTimeouts); + + /// + /// When a new failure state is reached, this method is used to dispatch + /// such event to the proper failure mitigation logic. + /// It gets a failure event as input and produces zero or more failure response messages + /// for tasks (appended into the event). + /// + /// The failure event to react upon + public abstract void EventDispatcher(ref IFailureEvent @event); + + /// + /// Appends the operator configuration for the input task to the input configuration. + /// Must be called only after Build() and BuildState() have been called. + /// This method should be called from the root operator at beginning of the pipeline. + /// + /// The list the operator configuration we will be appending to + /// The id of the task that belongs to this operator + /// The configuration for the task with added operator information + internal void GetTaskConfiguration(ref IList serializedOperatorsConfs, int taskId) + { + if (_operatorFinalized && _operatorStateFinalized) + { + GetOperatorConfiguration(ref serializedOperatorsConfs, taskId); + + if (_next != null) + { + _next.GetTaskConfiguration(ref serializedOperatorsConfs, taskId); + } + } + else + { + throw new IllegalStateException("Operator needs to be finalized before getting tasks configuration."); + } + } + + /// + /// Whether this operator is ready to be scheduled by the task set manager. + /// + /// True if the operator is ready to be scheduled + internal bool CanBeScheduled() + { + bool canBeScheduled = _topology.CanBeScheduled(); + + if (canBeScheduled && _next != null) + { + return _next.CanBeScheduled(); + } + + return canBeScheduled; + } + + /// + /// Appends the message type to the configuration.
+ /// + /// The type of the messages the operator is configured to accept + /// The configuration builder the message type will be added to + protected void SetMessageType(Type operatorType, ref ICsConfigurationBuilder confBuilder) + { + if (operatorType.IsGenericType) + { + var genericTypes = operatorType.GenericTypeArguments; + var msgType = genericTypes[0]; + confBuilder.BindNamedParameter( + GenericType.Class, msgType.AssemblyQualifiedName); + } + else + { + throw new IllegalStateException("Expecting a generic type for the message."); + } + } + + /// + /// Action to trigger when the operator receives a notification that a new iteration is started. + /// + /// The new iteration number + protected void OnNewIteration(int iteration) + { + _topology.OnNewIteration(iteration); + + if (_next != null) + { + _next.OnNewIteration(iteration); + } + } + + /// + /// This method is operator specific and serializes the operator configuration into the input list. + /// + /// A list the serialized operator configuration will be appended to + /// The task id of the task that belongs to this operator + protected virtual void GetOperatorConfiguration(ref IList serializedOperatorsConfs, int taskId) + { + ICsConfigurationBuilder operatorBuilder = TangFactory.GetTang().NewConfigurationBuilder(); + + _topology.GetTaskConfiguration(ref operatorBuilder, taskId); + + PhysicalOperatorConfiguration(ref operatorBuilder); + + if (!Subscription.IsIterative && _next == null) + { + operatorBuilder.BindNamedParameter( + GenericType.Class, + true.ToString(CultureInfo.InvariantCulture)); + } + + IConfiguration operatorConf = operatorBuilder + .BindNamedParameter( + GenericType.Class, + _id.ToString(CultureInfo.InvariantCulture)) + .BindNamedParameter( + GenericType.Class, + ((int)_checkpointLevel).ToString(CultureInfo.InvariantCulture)) + .Build(); + + foreach (var conf in _configurations) + { + operatorConf = Configurations.Merge(operatorConf, conf); + } + + 
Subscription.Service.SerializeOperatorConfiguration(ref serializedOperatorsConfs, operatorConf); + } + + /// + /// Returns whether a failure should be propagated to downstream operators or not. + /// + /// True if the failure has to be sent downstream + protected virtual bool PropagateFailureDownstream() + { + return true; + } + + /// + /// Operator specific logic for reacting when a task message is received. + /// + /// Incoming message from a task + /// Zero or more reply messages for the task + /// True if the operator has reacted to the task message + protected virtual bool ReactOnTaskMessage(ITaskMessage message, ref List returnMessages) + { + return false; + } + + /// + /// Utility method gathering the set of master task ids of the operators in the current pipeline. + /// + /// The id of the master tasks of the current and successive operators + internal virtual void GatherMasterIds(ref HashSet masterTasks) + { + if (_operatorFinalized != true) + { + throw new IllegalStateException("Operator need to be build before gathering information."); + } + + masterTasks.Add(Utils.BuildTaskId(Subscription.SubscriptionName, MasterId)); + + if (_next != null) + { + _next.GatherMasterIds(ref masterTasks); + } + } + + /// + /// Logs the current operator state. + /// + protected virtual void LogOperatorState() + { + string intro = $"State for Operator {OperatorName} in Subscription {Subscription.SubscriptionName}:\n"; + string topologyState = $"Topology:\n{_topology.LogTopologyState()}"; + string failureMachineState = "Failure State: " + _failureMachine.State.FailureState + + "\nFailure(s) Reported: " + _failureMachine.NumOfFailedDataPoints; + + LOGGER.Log(Level.Info, intro + topologyState + failureMachineState); + } + + /// + /// Binding from logical to physical operator.
+ /// + /// The configuration builder the binding will be added to + protected abstract void PhysicalOperatorConfiguration(ref ICsConfigurationBuilder builder); + + /// + /// Log the final statistics of the operator. + /// This is called when the pipeline execution is completed. + /// + internal virtual string LogFinalStatistics() + { + var str = LogInternalStatistics(); + + if (_next != null) + { + str += _next.LogFinalStatistics(); + } + + return str; + } + + /// + /// Log the final internal statistics of the operator. + /// + protected virtual string LogInternalStatistics() + { + return _topology.LogFinalStatistics(); + } + + private ITopology GetTopology(TopologyType topologyType) + { + ITopology topology; + + switch (topologyType) + { + case TopologyType.Flat: + topology = new FlatTopology(MasterId); + break; + default: throw new ArgumentException($"Topology type {topologyType} not supported by {OperatorName}.", nameof(topologyType)); + } + + return topology; + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Enum/PositionTracker.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Enum/PositionTracker.cs new file mode 100644 index 0000000000..6f0ab159f9 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Enum/PositionTracker.cs @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Physical.Enum +{ + /// + /// Enum summarizing the positions in which the execution is within an operator. + /// This information is used in case of failure to properly reconfigure computation. + /// + [Unstable("0.16", "API may change")] + public enum PositionTracker : int + { + Nil = 0, + + InSend = 1, + + InReceive = 2, + + AfterReceiveBeforeSend = 3, + + AfterReceive = 4, + + AfterSendBeforeReceive = 5, + + AfterSend = 6 + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticIterator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticIterator.cs new file mode 100644 index 0000000000..a6764954d3 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticIterator.cs @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; +using System; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Physical +{ + /// + /// Group communication operator used for iterations. + /// + [Unstable("0.16", "API may change")] + public interface IElasticIterator : IElasticOperator + { + /// + /// Move to the next iteration. + /// + /// True if the next iteration exists + bool MoveNext(); + + /// + /// The current iteration. + /// + /// An object representing the current iteration + object Current { get; } + + /// + /// Synchronize the current iteration with the input one. + /// + /// The state in which the iterator will be moved + void SyncIteration(int iteration); + + /// + /// Register the action to trigger when a task is rescheduled. + /// + /// Some code to execute upon task rescheduling + void RegisterActionOnTaskRescheduled(Action action); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticOperator.cs new file mode 100644 index 0000000000..74759f95f9 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticOperator.cs @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Task; +using Org.Apache.REEF.Utilities.Attributes; +using System; +using System.Threading; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Physical +{ + /// + /// Base interface for task-side, physical, group communication operators. + /// + [Unstable("0.16", "API may change")] + public interface IElasticOperator : IWaitForTaskRegistration, IDisposable + { + /// + /// The operator name. + /// + string OperatorName { get; } + + /// + /// The operator identifier. + /// + int OperatorId { get; } + + /// + /// Operator specific information in case of failure. + /// + string FailureInfo { get; } + + /// + /// Set the reference to the iterator in the pipeline (if it exists). + /// + IElasticIterator IteratorReference { set; } + + /// + /// Cancellation source for stopping the execution of the operator. + /// + CancellationTokenSource CancellationSource { get; set; } + + /// + /// Wait until computation is globally completed for this operator + /// before disposing the object. + /// + void WaitCompletionBeforeDisposing(); + + /// + /// Reset the internal position tracker. This should be called + /// every time a new iteration starts in the workflow. + /// + void ResetPosition(); + + /// + /// Action to execute when a task is re-scheduled.
+ /// + Action OnTaskRescheduled { get; } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticTypedOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticTypedOperator.cs new file mode 100644 index 0000000000..d4e3b740eb --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticTypedOperator.cs @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Physical +{ + /// + /// Typed physical group communication operator. + /// + /// The type of data managed by the operator + [Unstable("0.16", "API may change")] + public interface IElasticTypedOperator : IElasticOperator + { + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IReceiver.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IReceiver.cs new file mode 100644 index 0000000000..e83c3c36ec --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IReceiver.cs @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Physical +{ + /// + /// Group Communication Operator receiving messages. + /// + /// The type of data being received. + [Unstable("0.16", "API may change")] + public interface IReceiver + { + /// + /// Receive a message from a sender task. + /// + /// The incoming message + T Receive(); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/ISender.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/ISender.cs new file mode 100644 index 0000000000..008bb46967 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/ISender.cs @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Physical +{ + /// + /// Group communication operator used to send messages to child tasks. + /// + /// The data type of the message + [Unstable("0.16", "API may change")] + public interface ISender + { + /// + /// Send the data to all child receivers. + /// + /// The data to send + void Send(T data); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultBroadcast.cs new file mode 100644 index 0000000000..76bb2b34cd --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultBroadcast.cs @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl; +using Org.Apache.REEF.Network.Elastic.Config; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Network.Elastic.Operators.Physical.Enum; +using Org.Apache.REEF.Network.Elastic.Failures; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Physical.Impl +{ + /// + /// Default implementation of a group communication operator used to broadcast messages. + /// + /// The type of message being sent. + [Unstable("0.16", "API may change")] + public sealed class DefaultBroadcast : DefaultOneToN, IElasticBroadcast + { + /// + /// Creates a new Broadcast operator. + /// + /// The operator identifier + /// The operator topology layer + [Inject] + private DefaultBroadcast( + [Parameter(typeof(OperatorParameters.OperatorId))] int id, + [Parameter(typeof(OperatorParameters.IsLast))] bool isLast, + ICheckpointableState checkpointableState, + DefaultBroadcastTopology topology) : base(id, isLast, checkpointableState, topology) + { + OperatorName = Constants.Broadcast; + } + + /// + /// Send the data to all child receivers. + /// Send is asynchronous but works in 3 phases: + /// 1-The task asks the driver for updates to the topology + /// 2-Updates are received and added to the local topology + /// --(Note that although the method is non-blocking, no message will be sent until + /// updates are received) + /// 3-Send the message. + /// + /// The data to send + public void Send(T data) + { + _topology.TopologyUpdateRequest(); + + _position = PositionTracker.InSend; + + int iteration = IteratorReference == null ?
0 : (int)IteratorReference.Current; + var message = _topology.AssembleDataMessage(iteration, new[] { data }); + + Checkpoint(message, message.Iteration); + + _topology.Send(message, CancellationSource); + + _position = PositionTracker.AfterSend; + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultOneToN.cs new file mode 100644 index 0000000000..8dffa6c37f --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultOneToN.cs @@ -0,0 +1,211 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +using System.Threading; +using System.Collections.Generic; +using Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl; +using Org.Apache.REEF.Network.Elastic.Failures; +using System; +using Org.Apache.REEF.Network.Elastic.Comm.Impl; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Network.Elastic.Failures.Enum; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Network.Elastic.Operators.Physical.Enum; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Physical.Impl +{ + /// + /// Generic implementation of a group communication operator where one node sends to N. + /// + /// The type of message being sent. + [Unstable("0.16", "API may change")] + public abstract class DefaultOneToN + { + private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultOneToN<>)); + + private readonly ICheckpointableState _checkpointableState; + internal readonly OneToNTopology _topology; + internal volatile PositionTracker _position; + + private readonly bool _isLast; + + /// + /// Creates a new one to N operator. + /// + /// The operator identifier + /// The checkpoint level for the operator + /// Whether this operator is the last in the pipeline + /// The operator topology layer + internal DefaultOneToN(int id, bool isLast, ICheckpointableState checkpointableState, OneToNTopology topology) + { + OperatorId = id; + _checkpointableState = checkpointableState; + _isLast = isLast; + _topology = topology; + _position = PositionTracker.Nil; + + OnTaskRescheduled = new Action(() => + { + _topology.JoinTopology(); + }); + } + + /// + /// The operator identifier. + /// + public int OperatorId { get; private set; } + + /// + /// The operator name. + /// + public string OperatorName { get; protected set; } + + /// + /// Operator-specific information that is sent to the driver in case of failure. + /// + public string FailureInfo + { + get + { + string iteration = IteratorReference == null ?
"-1" : IteratorReference.Current.ToString(); + string position = ((int)_position).ToString() + ":"; + string isSending = _topology.IsSending ? "1" : "0"; + return iteration + ":" + position + ":" + isSending; + } + } + + /// + /// Get a reference of the iterator in the pipeline (if it exists). + /// + public IElasticIterator IteratorReference { protected get; set; } + + /// + /// Cancellation source for stopping the execution of the operator. + /// + public CancellationTokenSource CancellationSource { get; set; } + + /// + /// Action to execute when a task is re-scheduled. + /// + public Action OnTaskRescheduled { get; private set; } + + /// + /// The set of messages checkpointed in memory. + /// + private List CheckpointedMessages { get; set; } + + /// + /// Receive a message from neighbor broadcasters, ignoring messages from iterations older than the current one; throws OperationCanceledException if cancelled before a valid message is accepted. + /// + /// The incoming data + public T Receive() + { + _position = PositionTracker.InReceive; + + var received = false; + DataMessageWithTopology message = null; + var isIterative = IteratorReference != null; + + while (!received && !CancellationSource.IsCancellationRequested) + { + message = _topology.Receive(CancellationSource) as DataMessageWithTopology; + + if (isIterative && message.Iteration < (int)IteratorReference.Current) + { + LOGGER.Log(Level.Warning, $"Received message for iteration {message.Iteration} but I am already in iteration {(int)IteratorReference.Current}: ignoring."); + } + else + { + received = true; + } + } + + if (!received || message == null) + { + throw new OperationCanceledException("Impossible to receive messages: operation cancelled."); + } + + if (isIterative) + { + IteratorReference.SyncIteration(message.Iteration); + } + + Checkpoint(message, message.Iteration); + + _position = PositionTracker.AfterReceive; + + return message.Data; + } + + /// + /// Reset the internal position tracker. This should be called + /// every time a new iteration starts in the workflow.
+ /// + public void ResetPosition() + { + _position = PositionTracker.Nil; + } + + /// + /// Initializes the communication group. + /// Computation blocks until all required tasks are registered in the group. + /// + /// + public void WaitForTaskRegistration(CancellationTokenSource cancellationSource) + { + LOGGER.Log(Level.Info, $"Waiting for task registration for {OperatorName} operator."); + _topology.WaitForTaskRegistration(cancellationSource); + } + + /// + /// Wait until computation is globally completed for this operator + /// before disposing the object. + /// + public void WaitCompletionBeforeDisposing() + { + _topology.WaitCompletionBeforeDisposing(CancellationSource); + } + + /// + /// Dispose the operator. + /// + public void Dispose() + { + if (_isLast) + { + _topology.SignalSubscriptionComplete(); + } + _topology.Dispose(); + } + + /// + /// Checkpoint the input data for the input iteration using the defined checkpoint level. + /// + /// The messages to checkpoint + /// The iteration of the checkpoint + internal void Checkpoint(GroupCommunicationMessage data, int iteration) + { + if (_checkpointableState.Level > CheckpointLevel.None) + { + var state = _checkpointableState.Create(iteration); + + state.MakeCheckpointable(data); + _topology.Checkpoint(state); + } + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IWaitForTaskRegistration.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IWaitForTaskRegistration.cs new file mode 100644 index 0000000000..f5d9f8d537 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IWaitForTaskRegistration.cs @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; +using System.Threading; + +namespace Org.Apache.REEF.Network.Elastic.Task +{ + /// + /// Extended by classes requiring the initialization of group communication. + /// + [Unstable("0.16", "API may change")] + public interface IWaitForTaskRegistration + { + /// + /// Initializes the communication group. + /// Computation blocks until all required tasks are registered in the group. + /// + /// The signal to cancel the operation + void WaitForTaskRegistration(CancellationTokenSource cancellationSource = null); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Enum/DataNodeState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Enum/DataNodeState.cs new file mode 100644 index 0000000000..46c74af970 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Enum/DataNodeState.cs @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum +{ + [Unstable("0.16", "Types may change")] + internal enum DataNodeState : int + { + Reachable = 1, + + Unreachable = 2, + + Lost = 3 + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Enum/TopologyType.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Enum/TopologyType.cs new file mode 100644 index 0000000000..3563a883a8 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Enum/TopologyType.cs @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum +{ + /// + /// Enum defining the supported type of (logical) topologies + /// in which networked nodes are organized. + /// + [Unstable("0.16", "Types may change")] + public enum TopologyType + { + Flat = 0, + + Tree = 1, + + Ring = 2 + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs new file mode 100644 index 0000000000..1e305b97e4 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs @@ -0,0 +1,120 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Comm; +using Org.Apache.REEF.Tang.Interface; +using System.Collections.Generic; +using Org.Apache.REEF.Network.Elastic.Failures; +using System; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Logical +{ + /// + /// Represents a topology graph for Elastic Group Communication Operators. 
+ /// + [Unstable("0.16", "API may change")] + public interface ITopology + { + /// + /// The identifier of the operator using the topology. + /// + int OperatorId { get; set; } + + /// + /// The subscription of the operator using the topology. + /// + string SubscriptionName { get; set; } + + /// + /// Adds a new task to the topology. + /// When called before Build() actually adds the task to the topology. + /// After Build(), it assumes that the task is added because recovered from a failure. + /// A failure machine is given as input so that the topology can update the number of available nodes. + /// + /// The id of the task to be added + /// The failure machine that manages failures for the operator. + /// True if it is the first time the topology sees this task + bool AddTask(string taskId, IFailureStateMachine failureMachine); + + /// + /// Removes a task from the topology. + /// + /// The id of the task to be removed + /// The number of data points lost because of the removed task + int RemoveTask(string taskId); + + /// + /// Whether the topology can be scheduled. + /// + /// True if the topology is ready to be scheduled + bool CanBeScheduled(); + + /// + /// Finalizes the topology. + /// After the topology has been finalized, any task added to the topology is + /// assumed as a task recovered from a failure. + /// + /// The same finalized topology + ITopology Build(); + + /// + /// Adds the topology configuration for the input task to the input builder. + /// Must be called only after all tasks have been added to the topology, i.e., after build. + /// + /// The configuration builder the configuration will be appended to + /// The task id of the task that belongs to this Topology + void GetTaskConfiguration(ref ICsConfigurationBuilder builder, int taskId); + + /// + /// Utility method for logging the topology state. + /// This will be called every time a topology object is built or modified + /// because of a failure.
+ /// + string LogTopologyState(); + + /// + /// This method is triggered when a node detects a change in the topology and asks the driver for an update. + /// + /// The identifier of the task asking for the update + /// A list of messages containing the topology update + /// An optional failure machine to log updates + void TopologyUpdateResponse(string taskId, ref List returnMessages, Optional failureStateMachine); + + /// + /// Action to trigger when the operator receives a notification that a new iteration is started. + /// + /// The new iteration number + void OnNewIteration(int iteration); + + /// + /// Reconfigure the topology in response to some event. + /// + /// The task id responsible for the topology change + /// Some additional topology-specific information + /// The optional iteration number in which the event occurred + /// One or more messages for reconfiguring the Tasks + IList Reconfigure(string taskId, Optional info, Optional iteration); + + /// + /// Log the final statistics of the operator. + /// This is called when the pipeline execution is completed. + /// + string LogFinalStatistics(); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/DataNode.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/DataNode.cs new file mode 100644 index 0000000000..29336dd057 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/DataNode.cs @@ -0,0 +1,104 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; +using Org.Apache.REEF.Utilities.Attributes; +using System.Collections.Generic; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Logical.Impl +{ + /// + /// Represents a node in the operator topology graph. + /// Nodes are logical representations in the Driver for tasks. + /// + [Unstable("0.16", "API may change")] + internal sealed class DataNode + { + private readonly int _taskId; + private readonly bool _isRoot; + private readonly List _children; + + private DataNode _parent; + private DataNodeState _state; + + /// + /// Construct a node using a given task id. + /// + /// The id for the node + /// Whether the node is the root/master of the topology or not + public DataNode( + int taskId, + bool isRoot) + { + _taskId = taskId; + _isRoot = isRoot; + _state = DataNodeState.Reachable; + + _children = new List(); + } + + /// + /// The current state for the node. + /// + public DataNodeState FailState + { + get { return _state; } + set { _state = value; } + } + + /// + /// The parent of the target node. + /// + public DataNode Parent + { + get { return _parent; } + set { _parent = value; } + } + + /// + /// Add a node to the list of children nodes of the current one. + /// + public void AddChild(DataNode child) + { + _children.Add(child); + } + + /// + /// The task id represented by the data node. + /// + public int TaskId + { + get { return _taskId; } + } + + /// + /// Return how many children the current node has. 
+ /// + public int NumberOfChildren + { + get { return _children.Count; } + } + + /// + /// Return the list of children fro the current node. + /// + public IList Children + { + get { return _children; } + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs new file mode 100644 index 0000000000..c4cbe71758 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs @@ -0,0 +1,175 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Exceptions; +using System.Collections.Generic; +using Org.Apache.REEF.Network.Elastic.Comm; +using System; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Logical.Impl +{ + /// + /// Topology with no structure. + /// Used as a placeholder when no topology is required. + /// + [Unstable("0.16", "API may change")] + class EmptyTopology : ITopology + { + private bool _finalized; + + /// + /// Constructor for the empty topology. 
+        ///
+        public EmptyTopology()
+        {
+            _finalized = false;
+            OperatorId = -1;
+        }
+
+        /// <summary>
+        /// The identifier of the operator using the topology (-1 until an operator is linked).
+        /// </summary>
+        public int OperatorId { get; set; }
+
+        /// <summary>
+        /// The subscription of the operator using the topology (null until a subscription is linked).
+        /// </summary>
+        public string SubscriptionName { get; set; }
+
+        /// <summary>
+        /// Adds a new task to the topology.
+        /// This method does nothing on the empty topology.
+        /// </summary>
+        /// <param name="taskId">The id of the task to be added</param>
+        /// <param name="failureMachine">The failure machine that manages the failures for the operator</param>
+        /// <returns>This method always returns false</returns>
+        public bool AddTask(string taskId, IFailureStateMachine failureMachine)
+        {
+            return false;
+        }
+
+        /// <summary>
+        /// Removes a task from the topology.
+        /// This method does nothing on the empty topology.
+        /// </summary>
+        /// <param name="taskId">The id of the task to be removed</param>
+        /// <returns>This method always returns 0</returns>
+        public int RemoveTask(string taskId)
+        {
+            return 0;
+        }
+
+        /// <summary>
+        /// Whether the topology can be scheduled.
+        /// </summary>
+        /// <returns>This method always returns true</returns>
+        public bool CanBeScheduled()
+        {
+            return true;
+        }
+
+        /// <summary>
+        /// Finalizes the topology. Throws IllegalStateException when called more than once, when OperatorId is not positive, or when SubscriptionName is null or empty (null is its value until a subscription is linked).
+        /// </summary>
+        /// <returns>The same finalized topology</returns>
+        public ITopology Build()
+        {
+            if (_finalized)
+            {
+                throw new IllegalStateException("Topology cannot be built more than once");
+            }
+
+            if (OperatorId <= 0)
+            {
+                throw new IllegalStateException("Topology cannot be built because not linked to any operator");
+            }
+
+            if (string.IsNullOrEmpty(SubscriptionName))
+            {
+                throw new IllegalStateException("Topology cannot be built because not linked to any subscription");
+            }
+
+            _finalized = true;
+
+            return this;
+        }
+
+        ///
+        /// Adds the topology configuration for the input task to the input builder.
+        /// This method does nothing.
+ /// + /// The configuration builder the configuration will be appended to + /// The task id of the task that belongs to this Topology + public void GetTaskConfiguration(ref ICsConfigurationBuilder confBuilder, int taskId) + { + } + + /// + /// Utility method for logging the topology state. + /// This will be called every time a topology object is built or modified + /// because of a failure. + /// + public string LogTopologyState() + { + return "empty"; + } + + /// + /// This method is triggered when a node detects a change in the topology and asks the driver for an update. + /// + /// The identifier of the task asking for the update + /// A list of message containing the topology update + /// An optional failure machine to log updates + public void TopologyUpdateResponse(string taskId, ref List returnMessages, Optional failureStateMachine) + { + } + + /// + /// Action to trigger when the operator recdeives a notification that a new iteration is started. + /// This method does nothing. + /// + /// The new iteration number + public void OnNewIteration(int iteration) + { + } + + /// + /// Reconfigure the topology in response to some event. + /// + /// The task id responsible for the topology change + /// Some additional topology-specific information + /// The optional iteration number in which the event occurred + /// An empty list of messages + public IList Reconfigure(string taskId, Optional info, Optional iteration) + { + return new List(); + } + + /// + /// Log the final statistics of the operator. + /// This is called when the pipeline execution is completed. 
+ /// + public string LogFinalStatistics() + { + return string.Empty; + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs new file mode 100644 index 0000000000..dbdec652a2 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs @@ -0,0 +1,434 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using Org.Apache.REEF.Tang.Interface; +using System.Collections.Generic; +using Org.Apache.REEF.Tang.Util; +using System.Globalization; +using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Utilities.Logging; +using System.Linq; +using Org.Apache.REEF.Network.Elastic.Comm; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Network.Elastic.Comm.Impl; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Logical.Impl +{ + /// + /// Topology class for N nodes organized as a shallow tree with 1 root (the master) and N-1 nodes + /// connected to it. 
+ /// + [Unstable("0.16", "API may change")] + public class FlatTopology : ITopology + { + private static readonly Logger LOGGER = Logger.GetLogger(typeof(FlatTopology)); + + private string _rootTaskId; + private int _rootId; + private string _taskSubscription; + private volatile int _iteration; + private bool _finalized; + private readonly bool _sorted; + + private readonly Dictionary _nodes; + private readonly HashSet _lostNodesToBeRemoved; + private HashSet _nodesWaitingToJoinTopologyNextIteration; + private HashSet _nodesWaitingToJoinTopology; + + private volatile int _availableDataPoints; + private int _totNumberofNodes; + + private readonly object _lock; + + /// + /// Constructor for flat topology. After construction the graph is empty + /// and tasks need to be added. + /// + /// The id of the task that will be set as root of the topology + /// Whether the leaf nodes need to be ordered or not + public FlatTopology(int rootId, bool sorted = false) + { + _rootTaskId = string.Empty; + _taskSubscription = string.Empty; + _rootId = rootId; + _finalized = false; + _sorted = sorted; + OperatorId = -1; + _iteration = 1; + _availableDataPoints = 0; + + _lock = new object(); + + _nodes = new Dictionary(); + _lostNodesToBeRemoved = new HashSet(); + _nodesWaitingToJoinTopologyNextIteration = new HashSet(); + _nodesWaitingToJoinTopology = new HashSet(); + } + + /// + /// The identifier of the operator using the topology. + /// + public int OperatorId { get; set; } + + /// + /// The subscription of the operator using the topology. + /// + public string SubscriptionName { get; set; } + + /// + /// Adds a new task to the topology. + /// When called before Build() actually adds the task to the topology. + /// After Build(), it assumes that the task is added because recovered from a failure. + /// A failure machine is given as input so that the topology can update the number of available nodes. 
+        ///
+        /// <param name="taskId">The id of the task to be added</param>
+        /// <param name="failureMachine">The failure machine that manages the failures for the operator</param>
+        /// <returns>True when the task is new and the topology is not finalized yet; false when the task is queued to join at the next iteration</returns>
+        public bool AddTask(string taskId, IFailureStateMachine failureMachine)
+        {
+            if (string.IsNullOrEmpty(taskId))
+            {
+                throw new ArgumentNullException(nameof(taskId));
+            }
+
+            if (failureMachine == null)
+            {
+                throw new ArgumentNullException(nameof(failureMachine));
+            }
+
+            var id = Utils.GetTaskNum(taskId);
+
+            lock (_lock)
+            {
+                if (_nodes.TryGetValue(id, out DataNode known))
+                {
+                    if (known.FailState != DataNodeState.Reachable)
+                    {
+                        // Known node that is currently not reachable (it probably failed): queue it to rejoin.
+                        _nodesWaitingToJoinTopologyNextIteration.Add(taskId);
+                        known.FailState = DataNodeState.Unreachable;
+                        return false;
+                    }
+
+                    throw new ArgumentException("Task already added to the topology");
+                }
+
+                DataNode node = new DataNode(id, false);
+                _nodes[id] = node;
+
+                if (_finalized)
+                {
+                    // New node added elastically after Build(): attach it to the root and queue it to join gracefully.
+                    _nodesWaitingToJoinTopologyNextIteration.Add(taskId);
+                    node.FailState = DataNodeState.Unreachable;
+                    _nodes[_rootId].AddChild(node);
+                    failureMachine.AddDataPoints(1, true);
+                    failureMachine.RemoveDataPoints(1);
+                    return false;
+                }
+
+                // The subscription name is required later by Build() to compute the root task id.
+                if (_taskSubscription == string.Empty)
+                {
+                    _taskSubscription = Utils.GetTaskSubscriptions(taskId);
+                }
+                // Counted under the lock, like the other writes to this counter (RemoveTask, TopologyUpdateResponse).
+                _availableDataPoints++;
+            }
+
+            failureMachine.AddDataPoints(1, true);
+            return true;
+        }
+
+        ///
+        /// Removes a task from the topology.
+        ///
+        /// <param name="taskId">The id of the task to be removed</param>
+        /// <returns>The number of data points lost because of the removed task: 1 if the node was reachable, 0 otherwise</returns>
+        public int RemoveTask(string taskId)
+        {
+            if (string.IsNullOrEmpty(taskId))
+            {
+                throw new ArgumentNullException(nameof(taskId));
+            }
+
+            if (taskId == _rootTaskId)
+            {
+                throw new NotImplementedException("Failure on master not supported yet");
+            }
+
+            var id = Utils.GetTaskNum(taskId);
+
+            lock (_lock)
+            {
+                if (!_nodes.TryGetValue(id, out DataNode node))
+                {
+                    throw new ArgumentException("Task is not part of this topology");
+                }
+
+                // Only a node that was reachable counts as a lost data point, so read the state before overwriting it.
+                var prevState = node.FailState;
+                node.FailState = DataNodeState.Lost;
+                _nodesWaitingToJoinTopologyNextIteration.Remove(taskId);
+                _nodesWaitingToJoinTopology.Remove(taskId);
+                _lostNodesToBeRemoved.Add(taskId);
+
+                if (prevState != DataNodeState.Reachable)
+                {
+                    return 0;
+                }
+
+                _availableDataPoints--;
+            }
+
+            return 1;
+        }
+
+        /// <summary>
+        /// Whether the topology can be scheduled.
+        /// </summary>
+        /// <returns>True if the root node has already been added to the topology</returns>
+        public bool CanBeScheduled()
+        {
+            return _nodes.ContainsKey(_rootId);
+        }
+
+        ///
+        /// Finalizes the topology.
+        /// After the topology has been finalized, any task added to the topology is
+        /// assumed as a task recovered from a failure.
+        ///
+        /// <returns>The same finalized topology; IllegalStateException is thrown if it was already built, the root node is missing, OperatorId is not positive, or SubscriptionName is null or empty</returns>
+        public ITopology Build()
+        {
+            if (_finalized)
+            {
+                throw new IllegalStateException("Topology cannot be built more than once");
+            }
+
+            if (!_nodes.ContainsKey(_rootId))
+            {
+                throw new IllegalStateException("Topology cannot be built because the root node is missing");
+            }
+
+            if (OperatorId <= 0)
+            {
+                throw new IllegalStateException("Topology cannot be built because not linked to any operator");
+            }
+
+            if (string.IsNullOrEmpty(SubscriptionName))
+            {
+                throw new IllegalStateException("Topology cannot be built because not linked to any subscription");
+            }
+
+            BuildTopology();
+
+            _rootTaskId = Utils.BuildTaskId(_taskSubscription, _rootId);
+            _finalized = true;
+
+            return this;
+        }
+
+        /// <summary>
+        /// Utility method for logging the topology state (root id, then one entry per child; "X" marks a child that is not reachable).
+        /// This will be called every time a topology object is built or modified
+        /// because of a failure.
+        /// </summary>
+        public string LogTopologyState()
+        {
+            var root = _nodes[_rootId];
+            string output = _rootId + "\n";
+            // foreach (instead of a hand-driven enumerator) guarantees the enumerator is disposed.
+            foreach (var child in root.Children)
+            {
+                var rep = "X";
+                if (child.FailState == DataNodeState.Reachable)
+                {
+                    rep = child.TaskId.ToString();
+                }
+
+                output += rep + " ";
+            }
+
+            return output;
+        }
+
+        ///
+        /// Adds the topology configuration for the input task to the input builder.
+        /// Must be called only after all tasks have been added to the topology, i.e., after build.
+ /// + /// The configuration builder the configuration will be appended to + /// The task id of the task that belongs to this Topology + public void GetTaskConfiguration(ref ICsConfigurationBuilder confBuilder, int taskId) + { + if (!_finalized) + { + throw new IllegalStateException("Cannot get task configuration from a not finalized topology."); + } + + if (taskId == _rootId) + { + var root = _nodes[_rootId]; + + foreach (var tId in root.Children) + { + confBuilder.BindSetEntry( + GenericType.Class, + tId.TaskId.ToString(CultureInfo.InvariantCulture)); + } + } + confBuilder.BindNamedParameter( + GenericType.Class, + _rootId.ToString(CultureInfo.InvariantCulture)); + } + + /// + /// This method is triggered when a node contacts the driver to synchronize the remote topology + /// with the driver's one. + /// + /// The identifier of the task asking for the update + /// A list of message containing the topology update + /// An optional failure machine to log updates + public void TopologyUpdateResponse(string taskId, ref List returnMessages, Optional failureStateMachine) + { + if (taskId != _rootTaskId) + { + throw new IllegalStateException("Only root tasks are supposed to request topology updates."); + } + + if (!failureStateMachine.IsPresent()) + { + throw new IllegalStateException("Cannot update topology without failure machine."); + } + + lock (_lock) + { + var list = _nodesWaitingToJoinTopology.ToList(); + var update = new TopologyUpdate(_rootTaskId, list); + var data = new UpdateMessagePayload(new List() { update }, SubscriptionName, OperatorId, _iteration); + var returnMessage = new ElasticDriverMessageImpl(_rootTaskId, data); + + returnMessages.Add(returnMessage); + + if (_nodesWaitingToJoinTopology.Count > 0) + { + LOGGER.Log(Level.Info, $"Tasks [{string.Join(",", _nodesWaitingToJoinTopology)}] are added to topology in iteration {_iteration}"); + + _availableDataPoints += _nodesWaitingToJoinTopology.Count; + 
failureStateMachine.Value.AddDataPoints(_nodesWaitingToJoinTopology.Count, false); + + foreach (var node in _nodesWaitingToJoinTopology) + { + var id = Utils.GetTaskNum(node); + _nodes[id].FailState = DataNodeState.Reachable; + } + + _nodesWaitingToJoinTopology.Clear(); + } + } + } + + /// + /// Action to trigger when the operator recdeives a notification that a new iteration is started. + /// + /// The new iteration number + public void OnNewIteration(int iteration) + { + LOGGER.Log(Level.Info, $"Flat Topology for Operator {OperatorId} in Iteration {iteration - 1} is closed with {_availableDataPoints} nodes"); + _iteration = iteration; + _totNumberofNodes += _availableDataPoints; + + lock (_lock) + { + _nodesWaitingToJoinTopology = _nodesWaitingToJoinTopologyNextIteration; + _nodesWaitingToJoinTopologyNextIteration = new HashSet(); + } + } + + /// + /// Reconfigure the topology in response to some event. + /// + /// The task id responsible for the topology change + /// Some additional topology-specific information + /// The optional iteration number in which the event occurred + /// One or more messages for reconfiguring the Tasks + public IList Reconfigure(string taskId, Optional info, Optional iteration) + { + if (taskId == _rootTaskId) + { + throw new NotImplementedException("Failure on master not supported yet."); + } + + List messages = new List(); + + lock (_lock) + { + int iter; + + if (info.IsPresent()) + { + iter = int.Parse(info.Value.Split(':')[0]); + } + else + { + iter = iteration.Value; + } + + var children = _lostNodesToBeRemoved.ToList(); + var update = new List() + { + new TopologyUpdate(_rootTaskId, children) + }; + var data = new FailureMessagePayload(update, SubscriptionName, OperatorId, -1); + var returnMessage = new ElasticDriverMessageImpl(_rootTaskId, data); + + LOGGER.Log(Level.Info, $"Task {taskId} is removed from topology"); + messages.Add(returnMessage); + _lostNodesToBeRemoved.Clear(); + } + + return messages; + } + + /// + /// Log the 
final statistics of the operator. + /// This is called when the pipeline execution is completed. + /// + public string LogFinalStatistics() + { + return $"\nAverage number of nodes in the topology of Operator {OperatorId}: {(float)_totNumberofNodes / (_iteration > 2 ? _iteration - 1 : 1)}"; + } + + private void BuildTopology() + { + IEnumerator iter = _sorted ? _nodes.OrderBy(kv => kv.Key).Select(kv => kv.Value).GetEnumerator() : _nodes.Values.GetEnumerator(); + var root = _nodes[_rootId]; + + while (iter.MoveNext()) + { + if (iter.Current.TaskId != _rootId) + { + root.AddChild(iter.Current); + } + } + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/ICheckpointingTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/ICheckpointingTopology.cs new file mode 100644 index 0000000000..01a5660b7f --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/ICheckpointingTopology.cs @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Utilities.Attributes; +using System; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Physical +{ + /// + /// Interface for topologies able to checkpoint state. + /// + [Unstable("0.16", "API may change")] + internal interface ICheckpointingTopology : IDisposable + { + /// + /// An internal (to the topology) checkpoint. This can be used to implement + /// ephemeral level checkpoints. + /// + // For the moment the assumption is that only one object is stored + ICheckpointState InternalCheckpoint { get; } + + /// + /// Checkpoint the input state for the given iteration. + /// + /// The state to checkpoint + /// The iteration in which the checkpoint is happening + void Checkpoint(ICheckpointableState state, int iteration); + + /// + /// Retrieve a previously saved checkpoint. + /// The iteration number specificy which cehckpoint to retrieve, where -1 + /// is used by default to indicate the latest available checkpoint. + /// + /// The retrieved checkpoint + /// The iteration number for the checkpoint to retrieve. + /// + bool GetCheckpoint(out ICheckpointState checkpoint, int iteration = -1); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DefaultBroadcastTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DefaultBroadcastTopology.cs new file mode 100644 index 0000000000..73a537fb76 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DefaultBroadcastTopology.cs @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Config; +using Org.Apache.REEF.Network.Elastic.Task.Impl; +using Org.Apache.REEF.Tang.Annotations; +using System.Collections.Generic; +using Org.Apache.REEF.Common.Tasks; +using System.Threading; +using Org.Apache.REEF.Network.Elastic.Comm.Impl; +using Org.Apache.REEF.Utilities.Logging; +using System.Linq; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Network.Elastic.Failures.Impl; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl +{ + /// + /// Topology class managing data communication for broadcast operators. 
+ /// + [Unstable("0.16", "API may change")] + internal sealed class DefaultBroadcastTopology : OneToNTopology + { + [Inject] + private DefaultBroadcastTopology( + [Parameter(typeof(OperatorParameters.SubscriptionName))] string subscriptionName, + [Parameter(typeof(OperatorParameters.TopologyRootTaskId))] int rootId, + [Parameter(typeof(OperatorParameters.TopologyChildTaskIds))] ISet children, + [Parameter(typeof(OperatorParameters.PiggybackTopologyUpdates))] bool piggyback, + [Parameter(typeof(OperatorParameters.OperatorId))] int operatorId, + [Parameter(typeof(TaskConfigurationOptions.Identifier))] string taskId, + [Parameter(typeof(GroupCommunicationConfigurationOptions.Retry))] int retry, + [Parameter(typeof(GroupCommunicationConfigurationOptions.Timeout))] int timeout, + [Parameter(typeof(GroupCommunicationConfigurationOptions.DisposeTimeout))] int disposeTimeout, + CommunicationService commLayer, + CheckpointService checkpointService) : base( + subscriptionName, + taskId, + Utils.BuildTaskId(subscriptionName, rootId), + operatorId, + children, + piggyback, + retry, + timeout, + disposeTimeout, + commLayer, + checkpointService) + { + } + + public override DataMessage AssembleDataMessage(int iteration, T[] data) + { + if (_piggybackTopologyUpdates) + { + return new DataMessageWithTopology(SubscriptionName, OperatorId, iteration, data[0]); + } + else + { + return new DataMessage(SubscriptionName, OperatorId, iteration, data[0]); + } + } + + /// + /// Send a previously queued data message. + /// + /// The source in case the task is cancelled + protected override void Send(CancellationTokenSource cancellationSource) + { + GroupCommunicationMessage message; + int retry = 0; + + // Check if we have a message to send + if (_sendQueue.TryPeek(out message)) + { + var dm = message as DataMessage; + + // Broadcast topology require the driver to send topology updates to the root node + // in order to have the most update topology at each boradcast round. 
+ while (!_topologyUpdateReceived.WaitOne(_timeout)) + { + // If we are here, we weren't able to receive a topology update on time. Retry. + if (cancellationSource.IsCancellationRequested) + { + LOGGER.Log(Level.Warning, "Received cancellation request: stop sending"); + return; + } + + retry++; + + if (retry > _retry) + { + throw new OperatorException($"Iteration {dm.Iteration}: Failed to send message to the next node in the ring after {_retry} try.", OperatorId); + } + + TopologyUpdateRequest(); + } + + // Get the actual message to send. Note that altough message sending is asynchronous, broadcast rounds should not overlap. + _sendQueue.TryDequeue(out message); + + if (TaskId == RootTaskId) + { + // Prepare the mutex to block for the next round of topology updates. + _topologyUpdateReceived.Reset(); + } + + // Deliver the message to the commonication layer. + foreach (var node in _children.Where(x => !_nodesToRemove.TryGetValue(x.Value, out byte val))) + { + _commService.Send(node.Value, message, cancellationSource); + } + } + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DriverAwareOperatorTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DriverAwareOperatorTopology.cs new file mode 100644 index 0000000000..9289a44181 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DriverAwareOperatorTopology.cs @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Comm; +using Org.Apache.REEF.Utilities.Attributes; +using System; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl +{ + /// + /// Abstract class for topologies able to receive messages from the driver. + /// + [Unstable("0.16", "API may change")] + internal abstract class DriverAwareOperatorTopology : OperatorTopology, IObserver + { + /// + /// Constructor. + /// + /// The identifier of the task the topology is running on + /// The identifier of the root note in the topology + /// The subscription name the topology is working on + /// The identifier of the operator for this topology + public DriverAwareOperatorTopology(string taskId, string rootTaskId, string subscriptionName, int operatorId) + : base(taskId, rootTaskId, subscriptionName, operatorId) + { + } + + /// + /// Basic handler for messages coming from the driver. 
+ /// + /// Message from the driver + public virtual void OnNext(DriverMessagePayload message) + { + switch (message.PayloadType) + { + case DriverMessagePayloadType.Ring: + case DriverMessagePayloadType.Resume: + case DriverMessagePayloadType.Update: + case DriverMessagePayloadType.Failure: + break; + default: + throw new ArgumentException($"Message type {message.PayloadType} not recognized."); + } + } + + #region Empty Handlers + + public void OnError(Exception error) + { + } + + public void OnCompleted() + { + } + #endregion + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OneToNTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OneToNTopology.cs new file mode 100644 index 0000000000..6da744f6fa --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OneToNTopology.cs @@ -0,0 +1,314 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
using Org.Apache.REEF.Network.Elastic.Task.Impl;
using System.Collections.Generic;
using System;
using Org.Apache.REEF.Network.Elastic.Comm;
using Org.Apache.REEF.Network.Elastic.Failures;
using Org.Apache.REEF.Tang.Exceptions;
using System.Threading;
using Org.Apache.REEF.Network.Elastic.Comm.Impl;
using Org.Apache.REEF.Utilities.Logging;
using Org.Apache.REEF.Network.NetworkService;
using System.Collections.Concurrent;
using System.Linq;
using Org.Apache.REEF.Network.Elastic.Failures.Enum;
using Org.Apache.REEF.Utilities.Attributes;

namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl
{
    /// <summary>
    /// Base class for topologies following a one to N communication pattern.
    /// </summary>
    [Unstable("0.16", "API may change")]
    internal abstract class OneToNTopology : OperatorTopologyWithCommunication, ICheckpointingTopology
    {
        protected static readonly Logger LOGGER = Logger.GetLogger(typeof(OneToNTopology));

        private readonly CheckpointService _checkpointService;

        // Task ids the driver reported in a Failure message. Entries are consumed
        // when a topology update is applied (see UpdateTopology and the Update handler).
        protected readonly ConcurrentDictionary<string, byte> _nodesToRemove;

        // Set by the Update handler; created signaled on every task except the root.
        protected readonly ManualResetEvent _topologyUpdateReceived;
        protected readonly bool _piggybackTopologyUpdates;

        /// <summary>
        /// Construct a one to N topology.
        /// </summary>
        /// <param name="taskId">The identifier of the task the topology is running on</param>
        /// <param name="rootTaskId">The identifier of the root node in the topology</param>
        /// <param name="subscriptionName">The subscription name the topology is working on</param>
        /// <param name="operatorId">The identifier of the operator for this topology</param>
        /// <param name="children">The list of nodes this task has to send messages to</param>
        /// <param name="piggyback">Whether to piggyback topology update messages to data message</param>
        /// <param name="retry">How many times the topology will retry to send a message</param>
        /// <param name="timeout">After how long the topology waits for an event</param>
        /// <param name="disposeTimeout">Maximum wait time for topology disposal</param>
        /// <param name="commService">Service responsible for communication</param>
        /// <param name="checkpointService">Service responsible for saving and retrieving checkpoints</param>
        public OneToNTopology(
            string taskId,
            string rootTaskId,
            string subscriptionName,
            int operatorId,
            ISet<int> children,
            bool piggyback,
            int retry,
            int timeout,
            int disposeTimeout,
            CommunicationService commService,
            CheckpointService checkpointService) : base(taskId, rootTaskId, subscriptionName, operatorId, commService, retry, timeout, disposeTimeout)
        {
            _checkpointService = checkpointService;
            _nodesToRemove = new ConcurrentDictionary<string, byte>();

            // Non-signaled on the root only; every other task starts signaled.
            _topologyUpdateReceived = new ManualResetEvent(RootTaskId != taskId);

            _commService.RegisterOperatorTopologyForTask(this);
            _commService.RegisterOperatorTopologyForDriver(this);

            _piggybackTopologyUpdates = piggyback;

            foreach (var child in children)
            {
                var childTaskId = Utils.BuildTaskId(SubscriptionName, child);

                _children.TryAdd(child, childTaskId);
            }
        }

        /// <summary>
        /// An internal (to the topology) checkpoint. This can be used to implement
        /// ephemeral level checkpoints.
        /// </summary>
        public ICheckpointState InternalCheckpoint { get; private set; }

        /// <summary>
        /// Whether the topology is still sending messages or not.
        /// </summary>
        public bool IsSending
        {
            get { return !_sendQueue.IsEmpty; }
        }

        /// <summary>
        /// Checkpoint the input state for the given iteration.
        /// </summary>
        /// <param name="state">The state to checkpoint</param>
        /// <param name="iteration">The iteration in which the checkpoint is happening</param>
        /// <exception cref="IllegalStateException">The checkpoint level of the state is not supported</exception>
        public void Checkpoint(ICheckpointableState state, int iteration)
        {
            switch (state.Level)
            {
                case CheckpointLevel.None:
                    break;
                case CheckpointLevel.EphemeralMaster:
                    // Only the root keeps the checkpoint at this level.
                    if (TaskId == RootTaskId)
                    {
                        SaveInternalCheckpoint(state, iteration);
                    }
                    break;
                case CheckpointLevel.EphemeralAll:
                    SaveInternalCheckpoint(state, iteration);
                    break;
                default:
                    throw new IllegalStateException($"Checkpoint level {state.Level} not supported.");
            }
        }

        /// <summary>
        /// Retrieve a previously saved checkpoint.
        /// The iteration number specifies which checkpoint to retrieve, where -1
        /// is used by default to indicate the latest available checkpoint.
        /// </summary>
        /// <param name="checkpoint">The retrieved checkpoint</param>
        /// <param name="iteration">The iteration number for the checkpoint to retrieve.</param>
        /// <returns>
        /// True when the internal checkpoint satisfies the request; otherwise the result
        /// reported by the checkpoint service.
        /// </returns>
        public bool GetCheckpoint(out ICheckpointState checkpoint, int iteration = -1)
        {
            // Read the property once so the null check and the iteration check
            // are made against the same object.
            var current = InternalCheckpoint;

            if (current != null && (iteration == -1 || current.Iteration == iteration))
            {
                checkpoint = current;
                return true;
            }

            return _checkpointService.GetCheckpoint(out checkpoint, TaskId, SubscriptionName, OperatorId, iteration, false);
        }

        /// <summary>
        /// Waiting logic before disposing topologies.
        /// On the root, polls every 100 ms until the communication service lookup
        /// fails for each child or cancellation is requested.
        /// </summary>
        /// <param name="cancellationSource">The signal to stop waiting</param>
        public void WaitCompletionBeforeDisposing(CancellationTokenSource cancellationSource)
        {
            if (TaskId == RootTaskId)
            {
                foreach (var node in _children.Values)
                {
                    while (_commService.Lookup(node) && !cancellationSource.IsCancellationRequested)
                    {
                        Thread.Sleep(100);
                    }
                }
            }
        }

        /// <summary>
        /// Build the data message carrying the given data for the given iteration.
        /// </summary>
        /// <param name="iteration">The iteration the data belongs to</param>
        /// <param name="data">The data to wrap into the message</param>
        /// <returns>The message to send</returns>
        public abstract DataMessage AssembleDataMessage<T>(int iteration, T[] data);

        /// <summary>
        /// Initializes the communication group.
        /// Computation blocks until all required tasks are registered in the group.
        /// </summary>
        /// <param name="cancellationSource">The signal to cancel the operation</param>
        /// <exception cref="IllegalStateException">Registration of the children failed</exception>
        public override void WaitForTaskRegistration(CancellationTokenSource cancellationSource)
        {
            try
            {
                _commService.WaitForTaskRegistration(_children.Values.ToList(), cancellationSource, _nodesToRemove);
            }
            catch (Exception e)
            {
                throw new IllegalStateException("Failed to find parent/children nodes in operator topology for node: " + TaskId, e);
            }

            _initialized = true;

            // Flush whatever was queued while registration was in progress.
            Send(cancellationSource);
        }

        /// <summary>
        /// Handler for incoming messages from other topology nodes.
        /// </summary>
        /// <param name="message">The message that needs to be delivered to the operator</param>
        public override void OnNext(NsMessage<GroupCommunicationMessage> message)
        {
            if (_messageQueue.IsAddingCompleted)
            {
                throw new IllegalStateException("Trying to add messages to a closed non-empty queue.");
            }

            _messageQueue.Add(message.Data);

            // Only messages that actually carry topology information can be used to
            // update the children; any other payload is simply forwarded.
            if (_piggybackTopologyUpdates && message.Data is DataMessageWithTopology topologyPayload)
            {
                var updates = topologyPayload.TopologyUpdates;

                UpdateTopology(ref updates);
                topologyPayload.TopologyUpdates = updates;
            }

            if (!_children.IsEmpty)
            {
                _sendQueue.Enqueue(message.Data);
            }

            if (_initialized)
            {
                Send(_cancellationSignal);
            }
        }

        /// <summary>
        /// Handler for messages coming from the driver.
        /// </summary>
        /// <param name="message">Message from the driver</param>
        /// <exception cref="ArgumentException">
        /// The payload type is not supported, or the payload does not carry topology updates
        /// where they are required.
        /// </exception>
        public override void OnNext(DriverMessagePayload message)
        {
            switch (message.PayloadType)
            {
                case DriverMessagePayloadType.Failure:
                    {
                        var failurePayload = RequireTopologyPayload(message);

                        foreach (var updates in failurePayload.TopologyUpdates)
                        {
                            foreach (var node in updates.Children)
                            {
                                _nodesToRemove.TryAdd(node, new byte());
                                _commService.RemoveConnection(node);
                            }
                        }
                        break;
                    }
                case DriverMessagePayloadType.Update:
                    {
                        if (!_sendQueue.IsEmpty)
                        {
                            if (_sendQueue.TryPeek(out GroupCommunicationMessage toSendmsg))
                            {
                                if (_piggybackTopologyUpdates)
                                {
                                    var updates = RequireTopologyPayload(message).TopologyUpdates;

                                    UpdateTopology(ref updates);

                                    if (toSendmsg is DataMessageWithTopology toSendmsgWithTop)
                                    {
                                        toSendmsgWithTop.TopologyUpdates = updates;
                                    }
                                    else
                                    {
                                        LOGGER.Log(Level.Warning, "The message waiting to be sent cannot carry topology updates: updates are not piggybacked.");
                                    }
                                }

                                // Keys is a snapshot, so removing while iterating is safe.
                                foreach (var taskId in _nodesToRemove.Keys)
                                {
                                    var id = Utils.GetTaskNum(taskId);
                                    _nodesToRemove.TryRemove(taskId, out byte val);
                                    _children.TryRemove(id, out string str);
                                }
                            }

                            // Unblock this broadcast round.
                            _topologyUpdateReceived.Set();
                        }
                        else
                        {
                            LOGGER.Log(Level.Warning, "Received a topology update message from driver but sending queue is empty: ignoring.");
                        }
                    }
                    break;
                default:
                    throw new ArgumentException($"Message type {message.PayloadType} not supported by one to N topologies.");
            }
        }

        /// <summary>
        /// Publishes a checkpoint of the state only once its iteration has been set,
        /// so readers never observe a checkpoint with a stale iteration.
        /// </summary>
        private void SaveInternalCheckpoint(ICheckpointableState state, int iteration)
        {
            var checkpoint = state.Checkpoint();
            checkpoint.Iteration = iteration;
            InternalCheckpoint = checkpoint;
        }

        /// <summary>
        /// Returns the payload as a topology payload, failing with a descriptive error
        /// instead of a null dereference when the driver sent something else.
        /// </summary>
        private static TopologyMessagePayload RequireTopologyPayload(DriverMessagePayload message)
        {
            var payload = message as TopologyMessagePayload;

            if (payload == null)
            {
                throw new ArgumentException($"Message of type {message.PayloadType} does not carry topology updates.");
            }

            return payload;
        }

        /// <summary>
        /// Applies the update addressed to this task (if any) to the children and
        /// removes it from the list, so that only updates for other nodes are forwarded.
        /// </summary>
        /// <param name="updates">The topology updates; the entry for this task is removed</param>
        private void UpdateTopology(ref List<TopologyUpdate> updates)
        {
            if (updates == null)
            {
                return;
            }

            TopologyUpdate toRemove = null;
            foreach (var update in updates)
            {
                if (update.Node == TaskId)
                {
                    toRemove = update;
                    foreach (var child in update.Children)
                    {
                        // A child already reported as failed is not added back.
                        if (!_nodesToRemove.TryRemove(child, out byte value))
                        {
                            var id = Utils.GetTaskNum(child);
                            _children.TryAdd(id, child);
                        }
                    }
                    break;
                }
            }

            if (toRemove != null)
            {
                updates.Remove(toRemove);
            }
        }
    }
}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopology.cs new file mode 100644 index 0000000000..3ff8ef265a --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopology.cs @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
using Org.Apache.REEF.Utilities.Attributes;

namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl
{
    /// <summary>
    /// Common root of the task-side topologies. Task-side topologies are not generic:
    /// each one is tied to the operator that uses it to communicate data.
    /// </summary>
    [Unstable("0.16", "API may change")]
    internal abstract class OperatorTopology
    {
        /// <summary>
        /// Creates an operator topology bound to a task, a subscription and an operator.
        /// </summary>
        /// <param name="taskId">The identifier of the task the topology is running on</param>
        /// <param name="rootTaskId">The identifier of the root node in the topology</param>
        /// <param name="subscriptionName">The subscription name the topology is working on</param>
        /// <param name="operatorId">The identifier of the operator for this topology</param>
        public OperatorTopology(string taskId, string rootTaskId, string subscriptionName, int operatorId)
        {
            TaskId = taskId;
            RootTaskId = rootTaskId;
            SubscriptionName = subscriptionName;
            OperatorId = operatorId;
        }

        /// <summary>
        /// Name of the subscription this topology runs in.
        /// </summary>
        public string SubscriptionName { get; private set; }

        /// <summary>
        /// Identifier of the operator this topology belongs to.
        /// </summary>
        public int OperatorId { get; private set; }

        /// <summary>
        /// Identifier of the task this topology runs on.
        /// </summary>
        protected string TaskId { get; private set; }

        /// <summary>
        /// Task identifier of the root node of the topology.
        /// </summary>
        protected string RootTaskId { get; set; }

        /// <summary>
        /// Hook executed before the topology is disposed.
        /// </summary>
        public virtual void WaitCompletionBeforeDisposing()
        {
            // Nothing to wait for at this level.
        }
    }
}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopologyWithCommunication.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopologyWithCommunication.cs new file mode 100644 index 0000000000..d9b5dc6633 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopologyWithCommunication.cs @@ -0,0 +1,273 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Task.Impl; +using System; +using System.Collections.Concurrent; +using System.Threading; +using System.Linq; +using Org.Apache.REEF.Network.NetworkService; +using Org.Apache.REEF.Network.Elastic.Task; +using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Network.Elastic.Comm.Impl; +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl +{ + /// + /// Base class for topologies where nodes are allowed to communicated between themselves + /// and to the driver.
/// </summary>
[Unstable("0.16", "API may change")]
internal abstract class OperatorTopologyWithCommunication :
    DriverAwareOperatorTopology,
    IWaitForTaskRegistration,
    IDisposable,
    IObserver<NsMessage<GroupCommunicationMessage>>
{
    // True once WaitForTaskRegistration has completed; until then outgoing
    // messages are only queued.
    protected bool _initialized;

    protected CommunicationService _commService;

    protected readonly int _disposeTimeout;
    protected readonly int _timeout;
    protected readonly int _retry;

    // Messages waiting to be sent to the children.
    protected ConcurrentQueue<GroupCommunicationMessage> _sendQueue;

    // Messages received and not yet consumed through Receive.
    protected BlockingCollection<GroupCommunicationMessage> _messageQueue;

    // Children of this node, keyed by task number, valued by task identifier.
    protected readonly ConcurrentDictionary<int, string> _children;
    protected readonly CancellationTokenSource _cancellationSignal;

    /// <summary>
    /// Constructor for a communicating topology.
    /// </summary>
    /// <param name="taskId">The identifier of the task the topology is running on</param>
    /// <param name="rootTaskId">The identifier of the root node in the topology</param>
    /// <param name="subscription">The subscription name the topology is working on</param>
    /// <param name="operatorId">The identifier of the operator for this topology</param>
    /// <param name="commService">Class responsible for communication</param>
    /// <param name="retry">How many times the topology will retry to send a message</param>
    /// <param name="timeout">After how long the topology waits for an event</param>
    /// <param name="disposeTimeout">Maximum wait time for topology disposal</param>
    public OperatorTopologyWithCommunication(
        string taskId,
        string rootTaskId,
        string subscription,
        int operatorId,
        CommunicationService commService,
        int retry,
        int timeout,
        int disposeTimeout) : base(taskId, rootTaskId, subscription, operatorId)
    {
        _initialized = false;
        _commService = commService;

        _children = new ConcurrentDictionary<int, string>();
        _messageQueue = new BlockingCollection<GroupCommunicationMessage>();
        _sendQueue = new ConcurrentQueue<GroupCommunicationMessage>();

        _cancellationSignal = new CancellationTokenSource();

        _retry = retry;
        _timeout = timeout;
        _disposeTimeout = disposeTimeout;
    }

    /// <summary>
    /// Communicate to the driver that the current subscription has completed its
    /// execution. Only the root task notifies the driver.
    /// </summary>
    public void SubscriptionComplete()
    {
        if (TaskId == RootTaskId)
        {
            _commService.SubscriptionComplete(TaskId);
        }
    }

    /// <summary>
    /// Request a topology status update to the driver.
    /// </summary>
    public void TopologyUpdateRequest()
    {
        _commService.TopologyUpdateRequest(TaskId, OperatorId);
    }

    /// <summary>
    /// Waiting logic before disposing topologies: waits, up to the dispose
    /// timeout, for the send queue to drain.
    /// </summary>
    public override void WaitCompletionBeforeDisposing()
    {
        var elapsedTime = 0;

        // IsEmpty is the recommended emptiness check for ConcurrentQueue;
        // Count has to walk the queue.
        while (!_sendQueue.IsEmpty && elapsedTime < _disposeTimeout)
        {
            // The topology is still trying to send messages, wait.
            Thread.Sleep(100);
            elapsedTime += 100;
        }
    }

    /// <summary>
    /// Signal that the current task is joining the topology.
    /// </summary>
    public virtual void JoinTopology()
    {
        _commService.JoinTopology(TaskId, OperatorId);
    }

    /// <summary>
    /// Initializes the communication group.
    /// Computation blocks until all required tasks are registered in the group.
    /// </summary>
    /// <param name="cancellationSource">The signal to cancel the operation</param>
    /// <exception cref="OperationCanceledException">Registration of the children failed</exception>
    public virtual void WaitForTaskRegistration(CancellationTokenSource cancellationSource)
    {
        try
        {
            _commService.WaitForTaskRegistration(_children.Values.ToList(), cancellationSource);
        }
        catch (Exception e)
        {
            throw new OperationCanceledException("Failed to find parent/children nodes in operator topology for node: " + TaskId, e);
        }

        _initialized = true;

        // Some message may have been received while we were setting up the topology. Send them.
        Send(cancellationSource);
    }

    /// <summary>
    /// Block and wait until a message is received.
    /// </summary>
    /// <param name="cancellationSource">The signal that the operation is cancelled</param>
    /// <returns>The received message</returns>
    /// <exception cref="OperationCanceledException">Cancellation was requested while waiting</exception>
    /// <exception cref="TimeoutException">No message arrived within the configured number of retries</exception>
    public virtual GroupCommunicationMessage Receive(CancellationTokenSource cancellationSource)
    {
        GroupCommunicationMessage message;
        int retry = 1;

        while (!_messageQueue.TryTake(out message, _timeout, cancellationSource.Token))
        {
            if (cancellationSource.IsCancellationRequested)
            {
                throw new OperationCanceledException("Received cancellation request: stop receiving.");
            }

            if (retry++ > _retry)
            {
                // A specific type instead of the bare System.Exception; callers
                // catching Exception keep working.
                throw new TimeoutException($"Failed to receive message after {_retry} try.");
            }

            // Nothing arrived within the timeout: ask for the data again before retrying.
            _commService.NextDataRequest(TaskId, -1);
        }

        return message;
    }

    /// <summary>
    /// Send the input message. This method is asynchronous: the message is queued
    /// and only sent right away when the topology is already initialized.
    /// </summary>
    /// <param name="message">The message to communicate</param>
    /// <param name="cancellationSource">The signal for cancelling the operation</param>
    public virtual void Send(GroupCommunicationMessage message, CancellationTokenSource cancellationSource)
    {
        _sendQueue.Enqueue(message);

        if (_initialized)
        {
            Send(cancellationSource);
        }
    }

    /// <summary>
    /// Handler for incoming messages from other topology nodes.
    /// </summary>
    /// <param name="message">The message that needs to be delivered to the operator</param>
    /// <exception cref="IllegalStateException">The receive queue was closed while still holding messages</exception>
    public virtual void OnNext(NsMessage<GroupCommunicationMessage> message)
    {
        if (_messageQueue.IsAddingCompleted)
        {
            if (_messageQueue.Count > 0)
            {
                throw new IllegalStateException("Trying to add messages to a closed non-empty queue.");
            }

            // The closed queue is empty: start over with a fresh one.
            _messageQueue = new BlockingCollection<GroupCommunicationMessage>();
        }

        _messageQueue.Add(message.Data);

        // Automatically forward the received message to the child nodes in the topology.
        if (!_children.IsEmpty)
        {
            _sendQueue.Enqueue(message.Data);
        }

        if (_initialized)
        {
            Send(_cancellationSignal);
        }
    }

    /// <summary>
    /// Dispose the topology.
    /// </summary>
    public virtual void Dispose()
    {
        _messageQueue.CompleteAdding();

        _cancellationSignal.Cancel();

        _commService.Dispose();
    }

    /// <summary>
    /// Logic to execute in case the observable sends an error event.
    /// </summary>
    /// <param name="error">The error thrown on the observable.</param>
    // NOTE(review): 'new' hides a base member that is not visible here — confirm hiding (not overriding) is intended.
    public new void OnError(Exception error)
    {
        _messageQueue.CompleteAdding();
    }

    /// <summary>
    /// Logic to execute in case the observable sends a complete event.
    /// </summary>
    public new void OnCompleted()
    {
        _messageQueue.CompleteAdding();
    }

    /// <summary>
    /// Send the previously queued data messages to every child.
    /// </summary>
    /// <param name="cancellationSource">The signal in case the task is cancelled</param>
    protected virtual void Send(CancellationTokenSource cancellationSource)
    {
        GroupCommunicationMessage message;

        // Check for cancellation BEFORE dequeuing: with the operands the other way
        // round a message is taken off the queue and then dropped on cancellation.
        while (!cancellationSource.IsCancellationRequested && _sendQueue.TryDequeue(out message))
        {
            foreach (var child in _children.Values)
            {
                _commService.Send(child, message, cancellationSource);
            }
        }
    }
}
}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs new file mode 100644 index 0000000000..a40761ece8 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Driver.Context; +using Org.Apache.REEF.Utilities.Attributes; +using System; +using System.Globalization; + +namespace Org.Apache.REEF.Network.Elastic +{ + /// + /// Utility class.
/// </summary>
[Unstable("0.16", "API may change")]
internal static class Utils
{
    private const string TaskPrefix = "Task";
    private const string ContextPrefix = "Context";
    private const char Separator = '-';

    /// <summary>
    /// Gets the context number associated with the active context id.
    /// </summary>
    /// <param name="activeContext">The active context to check</param>
    /// <returns>The context number associated with the active context id</returns>
    public static int GetContextNum(IActiveContext activeContext)
    {
        return int.Parse(GetValue(2, activeContext.Id), CultureInfo.InvariantCulture);
    }

    /// <summary>
    /// Gets the subscriptions associated with the active context id.
    /// </summary>
    /// <param name="activeContext">The active context to check</param>
    /// <returns>The subscription names associated with the active context id</returns>
    public static string GetContextSubscriptions(IActiveContext activeContext)
    {
        return GetValue(1, activeContext.Id);
    }

    /// <summary>
    /// Gets the subscriptions associated with the context id.
    /// </summary>
    /// <param name="id">The context id to check</param>
    /// <returns>The subscription names associated with the context id</returns>
    public static string GetContextSubscriptions(string id)
    {
        return GetValue(1, id);
    }

    /// <summary>
    /// Gets the subscriptions associated with the Task id.
    /// </summary>
    /// <param name="taskId">The task id to check</param>
    /// <returns>The subscription names associated with the task id</returns>
    public static string GetTaskSubscriptions(string taskId)
    {
        return GetValue(1, taskId);
    }

    /// <summary>
    /// Gets the task number associated with the Task id.
    /// </summary>
    /// <param name="taskId">The task id to check</param>
    /// <returns>The task number associated with the task id</returns>
    public static int GetTaskNum(string taskId)
    {
        return int.Parse(GetValue(2, taskId), CultureInfo.InvariantCulture);
    }

    /// <summary>
    /// Builds a context identifier out of a subscription(s) and a context number.
    /// </summary>
    /// <param name="subscriptionName">The subscriptions active in the context</param>
    /// <param name="contextNum">The context number</param>
    /// <returns>The context identifier</returns>
    public static string BuildContextId(string subscriptionName, int contextNum)
    {
        return BuildIdentifier(ContextPrefix, subscriptionName, contextNum);
    }

    /// <summary>
    /// Builds a task identifier out of a subscription(s) and an id.
    /// </summary>
    /// <param name="subscriptionName">The subscriptions active in the task</param>
    /// <param name="id">The task id</param>
    /// <returns>The task identifier</returns>
    public static string BuildTaskId(string subscriptionName, int id)
    {
        return BuildIdentifier(TaskPrefix, subscriptionName, id);
    }

    /// <summary>
    /// Gets the context associated with the task id.
    /// </summary>
    /// <param name="taskId">The task id to check</param>
    /// <returns>The context id associated with the task id</returns>
    public static string GetContextIdFromTaskId(string taskId)
    {
        var taskHead = TaskPrefix + Separator;

        // Swap only the leading type marker: a blanket Replace would also rewrite
        // a subscription name that happens to contain "Task".
        if (taskId.StartsWith(taskHead, StringComparison.Ordinal))
        {
            return ContextPrefix + Separator + taskId.Substring(taskHead.Length);
        }

        // Not in the shape produced by BuildTaskId: keep the historical behavior.
        return taskId.Replace(TaskPrefix, ContextPrefix);
    }

    /// <summary>
    /// Utility method returning an identifier by merging the input fields
    /// </summary>
    /// <param name="first">The first field</param>
    /// <param name="second">The second field</param>
    /// <param name="third">The third field</param>
    /// <returns>An id merging the three fields</returns>
    private static string BuildIdentifier(string first, string second, int third)
    {
        return string.Format(CultureInfo.InvariantCulture, "{0}-{1}-{2}", first, second, third);
    }

    /// <summary>
    /// Utility method returning a requested field out of an identifier.
    /// The first field ends at the first separator and the last field starts after
    /// the last separator, so the middle field may itself contain separators and
    /// every identifier produced by BuildIdentifier can be parsed back.
    /// </summary>
    /// <param name="field">The field of interest (0, 1 or 2)</param>
    /// <param name="identifier">The id to check</param>
    /// <returns>The field value extracted from the identifier</returns>
    /// <exception cref="ArgumentException">The field is out of range or the identifier has fewer than three fields</exception>
    private static string GetValue(int field, string identifier)
    {
        if (identifier == null)
        {
            throw new ArgumentNullException(nameof(identifier));
        }

        int firstSeparator = identifier.IndexOf(Separator);
        int lastSeparator = identifier.LastIndexOf(Separator);

        // Fewer than two separators means fewer than three fields.
        if (field < 0 || field > 2 || firstSeparator < 0 || firstSeparator == lastSeparator)
        {
            throw new ArgumentException("Invalid identifier");
        }

        switch (field)
        {
            case 0:
                return identifier.Substring(0, firstSeparator);
            case 1:
                return identifier.Substring(firstSeparator + 1, lastSeparator - firstSeparator - 1);
            default:
                return identifier.Substring(lastSeparator + 1);
        }
    }
}
}
diff --git a/lang/cs/Org.Apache.REEF.Network/Group/Task/Impl/GroupCommNetworkObserver.cs b/lang/cs/Org.Apache.REEF.Network/Group/Task/Impl/GroupCommNetworkObserver.cs index 8aca8028ae..88a8abc12d 100644 --- a/lang/cs/Org.Apache.REEF.Network/Group/Task/Impl/GroupCommNetworkObserver.cs +++ b/lang/cs/Org.Apache.REEF.Network/Group/Task/Impl/GroupCommNetworkObserver.cs @@ -80,7 +80,7 @@ public TaskMessageObserver RegisterAndGetForTask(string taskSourceId) public void OnNext(IRemoteMessage> remoteMessage) { var nsMessage = remoteMessage.Message; - var gcm = nsMessage.Data.First(); + var gcm = nsMessage.Data; var gcMessageTaskSource = gcm.Source; TaskMessageObserver observer; if
(!_taskMessageObservers.TryGetValue(gcMessageTaskSource, out observer)) diff --git a/lang/cs/Org.Apache.REEF.Network/Group/Task/Impl/NodeMessageObserver.cs b/lang/cs/Org.Apache.REEF.Network/Group/Task/Impl/NodeMessageObserver.cs index 00ca1d47da..76839e72af 100644 --- a/lang/cs/Org.Apache.REEF.Network/Group/Task/Impl/NodeMessageObserver.cs +++ b/lang/cs/Org.Apache.REEF.Network/Group/Task/Impl/NodeMessageObserver.cs @@ -39,13 +39,10 @@ internal NodeMessageObserver(NodeStruct nodeStruct) /// public void OnNext(NsMessage value) { - foreach (var data in value.Data) + var gcMessage = value.Data as GroupCommunicationMessage; + if (gcMessage != null && gcMessage.Data != null && gcMessage.Data.Length > 0) { - var gcMessage = data as GroupCommunicationMessage; - if (gcMessage != null && gcMessage.Data != null && gcMessage.Data.Length > 0) - { - _nodeStruct.AddData(gcMessage); - } + _nodeStruct.AddData(gcMessage); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Group/Task/Impl/TaskMessageObserver.cs b/lang/cs/Org.Apache.REEF.Network/Group/Task/Impl/TaskMessageObserver.cs index d8dd449c6e..a43021df34 100644 --- a/lang/cs/Org.Apache.REEF.Network/Group/Task/Impl/TaskMessageObserver.cs +++ b/lang/cs/Org.Apache.REEF.Network/Group/Task/Impl/TaskMessageObserver.cs @@ -120,7 +120,7 @@ private void Handle(NsMessage value, bool isRe return; } - var gcMessage = value.Data.First(); + var gcMessage = value.Data; IObserver> observer; if (!_observers.TryGetValue(NodeObserverIdentifier.FromMessage(gcMessage), out observer)) diff --git a/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageCodec.cs b/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageCodec.cs index ecb9b135d0..0fa83ef389 100644 --- a/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageCodec.cs +++ b/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageCodec.cs @@ -71,10 +71,9 @@ public NsMessage Decode(byte[] data) IIdentifier sourceId = _idFactory.Create(proto.SourceId); IIdentifier 
destId = _idFactory.Create(proto.DestId); - NsMessage message = new NsMessage(sourceId, destId); + var payload = _codec.Decode(proto.Data); - var messages = proto.Data.Select(byteArr => _codec.Decode(byteArr)); - message.Data.AddRange(messages); + NsMessage message = new NsMessage(sourceId, destId, payload); return message; } } diff --git a/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageProto.cs b/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageProto.cs index aad73d5ec2..20f79db0ef 100644 --- a/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageProto.cs +++ b/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageProto.cs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. -using System.Collections.Generic; using Org.Apache.REEF.Wake.Remote; using ProtoBuf; @@ -26,7 +25,6 @@ public class NsMessageProto { public NsMessageProto() { - Data = new List(); } [ProtoMember(1)] @@ -36,7 +34,7 @@ public NsMessageProto() public string DestId { get; set; } [ProtoMember(3)] - public List Data { get; set; } + public byte[] Data { get; private set; } public static NsMessageProto Create(NsMessage message, ICodec codec) { @@ -45,10 +43,7 @@ public static NsMessageProto Create(NsMessage message, ICodec codec) proto.SourceId = message.SourceId.ToString(); proto.DestId = message.DestId.ToString(); - foreach (T item in message.Data) - { - proto.Data.Add(codec.Encode(item)); - } + proto.Data = codec.Encode(message.Data); return proto; } diff --git a/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageStreamingCodec.cs b/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageStreamingCodec.cs index 76ccbba93f..965ad9a5fe 100644 --- a/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageStreamingCodec.cs +++ b/lang/cs/Org.Apache.REEF.Network/NetworkService/Codec/NsMessageStreamingCodec.cs @@ -58,19 +58,13 @@ public NsMessage Read(IDataReader reader) { int metadataSize = 
reader.ReadInt32(); byte[] metadata = new byte[metadataSize]; - reader.Read(ref metadata, 0, metadataSize); - var res = GenerateMetaDataDecoding(metadata); - Type messageType = res.Item3; - NsMessage message = res.Item1; + reader.Read(ref metadata, 0, metadataSize); + var res = GenerateMetaDataDecoding(metadata); + Type messageType = res.type; var codecReadFunc = _codecFunctionsCache.ReadFunction(messageType); - int messageCount = res.Item2; - - for (int i = 0; i < messageCount; i++) - { - message.Data.Add(codecReadFunc(reader)); - } + var message = new NsMessage(res.source, res.destination, codecReadFunc(reader)); return message; } @@ -87,13 +81,10 @@ public void Write(NsMessage obj, IDataWriter writer) byte[] totalEncoding = encodedInt.Concat(encodedMetadata).ToArray(); writer.Write(totalEncoding, 0, totalEncoding.Length); - Type messageType = obj.Data[0].GetType(); + Type messageType = obj.Data.GetType(); var codecWriteFunc = _codecFunctionsCache.WriteFunction(messageType); - - foreach (var data in obj.Data) - { - codecWriteFunc(data, writer); - } + + codecWriteFunc(obj.Data, writer); } /// @@ -108,15 +99,9 @@ public async Task> ReadAsync(IDataReader reader, CancellationToken byte[] metadata = new byte[metadataSize]; await reader.ReadAsync(metadata, 0, metadataSize, token); var res = GenerateMetaDataDecoding(metadata); - Type messageType = res.Item3; - NsMessage message = res.Item1; + Type messageType = res.type; var codecReadFunc = _codecFunctionsCache.ReadAsyncFunction(messageType); - int messageCount = res.Item2; - - for (int i = 0; i < messageCount; i++) - { - message.Data.Add(codecReadFunc(reader, token)); - } + var message = new NsMessage(res.source, res.destination, codecReadFunc(reader, token)); return message; } @@ -134,15 +119,12 @@ public async Task WriteAsync(NsMessage obj, IDataWriter writer, CancellationT byte[] totalEncoding = encodedInt.Concat(encodedMetadata).ToArray(); await writer.WriteAsync(totalEncoding, 0, totalEncoding.Length, token); - 
Type messageType = obj.Data[0].GetType(); + Type messageType = obj.Data.GetType(); var codecWriteFunc = _codecFunctionsCache.WriteAsyncFunction(messageType); - foreach (var data in obj.Data) - { - var asyncResult = codecWriteFunc.BeginInvoke(data, writer, token, null, null); - await codecWriteFunc.EndInvoke(asyncResult); - } + var asyncResult = codecWriteFunc.BeginInvoke(obj.Data, writer, token, null, null); + await codecWriteFunc.EndInvoke(asyncResult); } private static byte[] GenerateMetaDataEncoding(NsMessage obj) @@ -150,8 +132,7 @@ private static byte[] GenerateMetaDataEncoding(NsMessage obj) List metadataBytes = new List(); byte[] sourceBytes = StringToBytes(obj.SourceId.ToString()); byte[] dstBytes = StringToBytes(obj.DestId.ToString()); - byte[] messageTypeBytes = StringToBytes(obj.Data[0].GetType().AssemblyQualifiedName); - byte[] messageCount = BitConverter.GetBytes(obj.Data.Count); + byte[] messageTypeBytes = StringToBytes(obj.Data.GetType().AssemblyQualifiedName); metadataBytes.Add(BitConverter.GetBytes(sourceBytes.Length)); metadataBytes.Add(BitConverter.GetBytes(dstBytes.Length)); @@ -159,12 +140,11 @@ private static byte[] GenerateMetaDataEncoding(NsMessage obj) metadataBytes.Add(sourceBytes); metadataBytes.Add(dstBytes); metadataBytes.Add(messageTypeBytes); - metadataBytes.Add(messageCount); return metadataBytes.SelectMany(i => i).ToArray(); } - private Tuple, int, Type> GenerateMetaDataDecoding(byte[] obj) + private (IIdentifier source, IIdentifier destination, Type type) GenerateMetaDataDecoding(byte[] obj) { int srcCount = BitConverter.ToInt32(obj, 0); int dstCount = BitConverter.ToInt32(obj, sizeof(int)); @@ -177,10 +157,8 @@ private Tuple, int, Type> GenerateMetaDataDecoding(byte[] obj) offset += dstCount; Type msgType = Type.GetType(BytesToString(obj.Skip(offset).Take(msgTypeCount).ToArray())); offset += msgTypeCount; - int messageCount = BitConverter.ToInt32(obj, offset); - NsMessage msg = new NsMessage(_idFactory.Create(srcString), 
_idFactory.Create(dstString)); - return new Tuple, int, Type>(msg, messageCount, msgType); + return (source: _idFactory.Create(srcString), destination: _idFactory.Create(dstString), type: msgType); } private static byte[] StringToBytes(string str) diff --git a/lang/cs/Org.Apache.REEF.Network/NetworkService/NsMessage.cs b/lang/cs/Org.Apache.REEF.Network/NetworkService/NsMessage.cs index 839af414bf..63fe3df40e 100644 --- a/lang/cs/Org.Apache.REEF.Network/NetworkService/NsMessage.cs +++ b/lang/cs/Org.Apache.REEF.Network/NetworkService/NsMessage.cs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. -using System.Collections.Generic; using Org.Apache.REEF.Wake; namespace Org.Apache.REEF.Network.NetworkService @@ -26,18 +25,6 @@ namespace Org.Apache.REEF.Network.NetworkService /// The type of data being sent public class NsMessage { - /// - /// Create a new NsMessage with no data. - /// - /// The identifier of the sender - /// The identifier of the receiver - public NsMessage(IIdentifier sourceId, IIdentifier destId) - { - SourceId = sourceId; - DestId = destId; - Data = new List(); - } - /// /// Create a new NsMessage with data. /// @@ -48,7 +35,7 @@ public NsMessage(IIdentifier sourceId, IIdentifier destId, T message) { SourceId = sourceId; DestId = destId; - Data = new List { message }; + Data = message; } /// @@ -64,6 +51,6 @@ public NsMessage(IIdentifier sourceId, IIdentifier destId, T message) /// /// A list of data being sent in the message. /// - public List Data { get; private set; } + public T Data { get; private set; } } } diff --git a/lang/cs/Org.Apache.REEF.Network/Org.Apache.REEF.Network.csproj b/lang/cs/Org.Apache.REEF.Network/Org.Apache.REEF.Network.csproj index 2a387432ea..2f3f4e4e39 100644 --- a/lang/cs/Org.Apache.REEF.Network/Org.Apache.REEF.Network.csproj +++ b/lang/cs/Org.Apache.REEF.Network/Org.Apache.REEF.Network.csproj @@ -31,12 +31,13 @@ under the License. 
+ - + diff --git a/lang/cs/Org.Apache.REEF.Utilities/ByteUtilities.cs b/lang/cs/Org.Apache.REEF.Utilities/ByteUtilities.cs index 0e21453c97..4e3212f32e 100644 --- a/lang/cs/Org.Apache.REEF.Utilities/ByteUtilities.cs +++ b/lang/cs/Org.Apache.REEF.Utilities/ByteUtilities.cs @@ -49,6 +49,14 @@ public static string ByteArraysToString(byte[] b) return Encoding.UTF8.GetString(b); } + /// + /// Converts from a UTF-8 encoded byte array to a string. + /// + public static string ByteArraysToString(byte[] b, int start, int length) + { + return Encoding.UTF8.GetString(b, start, length); + } + /// /// Performs a deep copy of a byte array. /// From 6a68fef1f5c458df3233a325f0e2eb11c5094951 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Thu, 27 Dec 2018 17:54:41 -0800 Subject: [PATCH 02/29] update --- .../Elastic/Comm/Enum/TaskMessageType.cs | 2 +- .../Elastic/Comm/ITaskMessageResponse.cs | 2 +- .../Elastic/Comm/ITypedMessage.cs | 38 +++ .../Elastic/Comm/Impl/CheckpointMessage.cs | 6 +- .../Comm/Impl/CheckpointMessageRequest.cs | 55 ++++ .../CheckpointMessageRequestStreamingCodec.cs | 133 ++++++++++ .../Impl/CheckpointMessageStreamingCodec.cs | 155 +++++++++++ .../Elastic/Comm/Impl/DataMessage.cs | 20 +- .../Comm/Impl/DataMessageStreamingCodec.cs | 142 ++++++++++ .../Comm/Impl/DataMessageWithTopology.cs | 22 +- .../DataMessageWithTopologyStreamingCodec.cs | 160 ++++++++++++ .../Elastic/Comm/Impl/DriverMessagePayload.cs | 10 +- .../Comm/Impl/ElasticDriverMessageImpl.cs | 7 +- ...cs => ElasticGroupCommunicationMessage.cs} | 20 +- .../Comm/Impl/FailureMessagePayload.cs | 6 +- .../Comm/Impl/TopologyMessagePayload.cs | 28 +- .../Elastic/Comm/Impl/UpdateMessagePayload.cs | 6 +- .../ElasticServiceConfigurationOptions.cs | 19 +- .../GroupCommunicationConfigurationOptions.cs | 63 +++++ .../Elastic/Config/OperatorParameters.cs | 6 +- .../Config/StreamingCodecConfiguration.cs | 51 ++++ .../Elastic/Driver/IElasticContext.cs | 125 +++++++++ ...askSetSubscription.cs => IElasticStage.cs} 
| 56 ++-- ...etManager.cs => IElasticTaskSetManager.cs} | 34 ++- .../Elastic/Driver/IElasticTaskSetService.cs | 118 --------- ...SetService.cs => DefaultElasticContext.cs} | 216 ++++++++++------ ...Subscription.cs => DefaultElasticStage.cs} | 94 +++---- .../Elastic/Failures/ICheckpointState.cs | 6 +- .../Elastic/Failures/IFailureResponse.cs | 4 +- .../Elastic/Failures/IReschedule.cs | 2 +- .../Failures/Impl/DefaultCheckpointState.cs | 8 +- .../Elastic/Failures/Impl/RescheduleEvent.cs | 7 +- .../Logical/Impl/DefaultBroadcast.cs | 2 +- .../Operators/Logical/Impl/DefaultEmpty.cs | 8 +- .../Operators/Logical/Impl/DefaultOneToN.cs | 6 +- .../ElastiOperatorWithDefaultDispatcher.cs | 8 +- .../Operators/Logical/Impl/ElasticOperator.cs | 152 ++++++----- .../Physical/Impl/DefaultBroadcast.cs | 2 +- .../Operators/Physical/Impl/DefaultOneToN.cs | 29 ++- .../Elastic/Task/ICheckpointLayer.cs | 72 ++++++ .../Elastic/Task/Impl/CancellationSource.cs | 60 +++++ .../Task/Impl/CentralizedCheckpointService.cs | 243 ++++++++++++++++++ .../Elastic/Task/Impl/CheckpointIdentifier.cs | 88 +++++++ .../Elastic/Topology/Logical/ITopology.cs | 9 +- .../Topology/Logical/Impl/EmptyTopology.cs | 8 +- .../Topology/Logical/Impl/FlatTopology.cs | 22 +- .../Physical/Impl/DefaultBroadcastTopology.cs | 23 +- .../Impl/DriverAwareOperatorTopology.cs | 6 +- .../Topology/Physical/Impl/OneToNTopology.cs | 39 +-- .../Physical/Impl/OperatorTopology.cs | 13 +- .../Impl/OperatorTopologyWithCommunication.cs | 54 ++-- .../Org.Apache.REEF.Network/Elastic/Utils.cs | 34 +-- 52 files changed, 1938 insertions(+), 561 deletions(-) create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITypedMessage.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageRequest.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageRequestStreamingCodec.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageStreamingCodec.cs 
create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageStreamingCodec.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopologyStreamingCodec.cs rename lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/{GroupCommunicationMessage.cs => ElasticGroupCommunicationMessage.cs} (72%) create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Config/GroupCommunicationConfigurationOptions.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Config/StreamingCodecConfiguration.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticContext.cs rename lang/cs/Org.Apache.REEF.Network/Elastic/Driver/{IElasticTaskSetSubscription.cs => IElasticStage.cs} (74%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Driver/{ITaskSetManager.cs => IElasticTaskSetManager.cs} (81%) delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetService.cs rename lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/{DefaultTaskSetService.cs => DefaultElasticContext.cs} (57%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/{DefaultTaskSetSubscription.cs => DefaultElasticStage.cs} (85%) create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/ICheckpointLayer.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CancellationSource.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CentralizedCheckpointService.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CheckpointIdentifier.cs diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Enum/TaskMessageType.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Enum/TaskMessageType.cs index eb7a4c05d4..c506f66f9a 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Enum/TaskMessageType.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Enum/TaskMessageType.cs @@ -30,6 +30,6 @@ internal enum TaskMessageType : ushort NextDataRequest = 3, - CompleteSubscription = 
4 + CompleteStage = 4 } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs index f501f689b0..7dd8db1732 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs @@ -22,7 +22,7 @@ namespace Org.Apache.REEF.Network.Elastic.Comm { /// - /// Used to propagate task reponses through operators and subscriptions. + /// Used to propagate task responses through operators and stages. /// [Unstable("0.16", "API may change")] public interface ITaskMessageResponse diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITypedMessage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITypedMessage.cs new file mode 100644 index 0000000000..aaa374c40c --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITypedMessage.cs @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +namespace Org.Apache.REEF.Network.Elastic.Comm +{ + /// + /// Typed interface for data messages. + /// This is used to provide a unified interface over the + /// different types of data messages. 
+ /// + /// The type of the data contained in the message + internal interface ITypedDataMessage + { + /// + /// The data contained in the message. + /// + T Data { get; } + + /// + /// The iteration number for the message. + /// + int Iteration { get; } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessage.cs index 9191b6cb37..54e0566105 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessage.cs @@ -21,17 +21,17 @@ namespace Org.Apache.REEF.Network.Elastic.Comm.Impl { /// - /// Message used to communicated checkpoints between nodes in order to + /// Message used to communicate checkpoints between nodes in order to /// recover execution. /// [Unstable("0.16", "API may change")] - internal sealed class CheckpointMessage : GroupCommunicationMessage + internal sealed class CheckpointMessage : ElasticGroupCommunicationMessage { /// /// Constructor for a message containig a checkpoint. /// /// The checkpoint state - public CheckpointMessage(ICheckpointState checkpoint) : base(checkpoint.SubscriptionName, checkpoint.OperatorId) + public CheckpointMessage(ICheckpointState checkpoint) : base(checkpoint.StageName, checkpoint.OperatorId) { Checkpoint = checkpoint; } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageRequest.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageRequest.cs new file mode 100644 index 0000000000..44acb47d22 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageRequest.cs @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Comm.Impl +{ + /// + /// Message sent to checkpoint service to retrieve a remote checkpoint. + /// + [Unstable("0.16", "API may change")] + internal sealed class CheckpointMessageRequest : ElasticGroupCommunicationMessage + { + /// + /// Constructor. + /// + /// The stage name for the checkpoint to retrieve + /// The operator identifier + /// The iteration of the checkpoint of interest + public CheckpointMessageRequest( + string stageName, + int operatorId, + int iteration) : base(stageName, operatorId) + { + Iteration = iteration; + } + + /// + /// Iteration number for the checkpoint of interest. + /// + public int Iteration { get; set; } + + /// + /// Clone the message. 
+ /// + public override object Clone() + { + return new CheckpointMessageRequest(StageName, OperatorId, Iteration); + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageRequestStreamingCodec.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageRequestStreamingCodec.cs new file mode 100644 index 0000000000..d216ef935c --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageRequestStreamingCodec.cs @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using System.Threading; +using System.Threading.Tasks; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Wake.Remote; +using Org.Apache.REEF.Wake.StreamingCodec; +using Org.Apache.REEF.Utilities; + +namespace Org.Apache.REEF.Network.Elastic.Comm.Impl +{ + /// + /// Streaming codec for the checkpoint message request + /// + internal sealed class CheckpointMessageRequestStreamingCodec : IStreamingCodec + { + /// + /// Empty constructor to allow instantiation by reflection. + /// + [Inject] + private CheckpointMessageRequestStreamingCodec() + { + } + + /// + /// Read the class fields. 
+ /// + /// The reader from which to read + /// The checkpoint message request + public CheckpointMessageRequest Read(IDataReader reader) + { + int metadataSize = reader.ReadInt32() + sizeof(int) + sizeof(int); + byte[] metadata = new byte[metadataSize]; + reader.Read(ref metadata, 0, metadataSize); + var (stageName, operatorId, iteration) = GenerateMetaDataDecoding(metadata, metadataSize - sizeof(int) - sizeof(int)); + + return new CheckpointMessageRequest(stageName, operatorId, iteration); + } + + /// + /// Writes the class fields. + /// + /// The message to write + /// The writer to which to write + public void Write(CheckpointMessageRequest obj, IDataWriter writer) + { + byte[] encodedMetadata = GenerateMetaDataEncoding(obj); + + writer.Write(encodedMetadata, 0, encodedMetadata.Length); + } + + /// + /// Read the class fields. + /// + /// The reader from which to read + /// The cancellation token + /// The checkpoint message request + public async Task ReadAsync(IDataReader reader, + CancellationToken token) + { + int metadataSize = reader.ReadInt32() + sizeof(int) + sizeof(int); + byte[] metadata = new byte[metadataSize]; + await reader.ReadAsync(metadata, 0, metadataSize, token); + var (stageName, operatorId, iteration) = GenerateMetaDataDecoding(metadata, metadataSize - sizeof(int) - sizeof(int)); + + return new CheckpointMessageRequest(stageName, operatorId, iteration); + } + + /// + /// Writes the class fields. 
+ /// + /// The message to write + /// The writer to which to write + /// The cancellation token + public async System.Threading.Tasks.Task WriteAsync(CheckpointMessageRequest obj, IDataWriter writer, CancellationToken token) + { + byte[] encodedMetadata = GenerateMetaDataEncoding(obj); + + await writer.WriteAsync(encodedMetadata, 0, encodedMetadata.Length, token); + } + + private static byte[] GenerateMetaDataEncoding(CheckpointMessageRequest obj) + { + byte[] stageBytes = ByteUtilities.StringToByteArrays(obj.StageName); + var length = stageBytes.Length; + byte[] metadataBytes = new byte[sizeof(int) + length + sizeof(int) + sizeof(int)]; + int offset = 0; + + Buffer.BlockCopy(BitConverter.GetBytes(length), 0, metadataBytes, offset, sizeof(int)); + offset += sizeof(int); + + Buffer.BlockCopy(stageBytes, 0, metadataBytes, offset, length); + offset += length; + + Buffer.BlockCopy(BitConverter.GetBytes(obj.OperatorId), 0, metadataBytes, offset, sizeof(int)); + offset += sizeof(int); + + Buffer.BlockCopy(BitConverter.GetBytes(obj.Iteration), 0, metadataBytes, offset, sizeof(int)); + + return metadataBytes; + } + + private static (string stageName, int operatorId, int iteration) GenerateMetaDataDecoding(byte[] obj, int stageLength) + { + int offset = 0; + string stageName = ByteUtilities.ByteArraysToString(obj, offset, stageLength); + offset += stageLength; + + int operatorId = BitConverter.ToInt32(obj, offset); + offset += sizeof(int); + + int iteration = BitConverter.ToInt32(obj, offset); + + return (stageName, operatorId, iteration); + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageStreamingCodec.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageStreamingCodec.cs new file mode 100644 index 0000000000..24756e8aeb --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageStreamingCodec.cs @@ -0,0 +1,155 @@ +// Licensed to the Apache Software 
Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using System.Threading; +using System.Threading.Tasks; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Wake.Remote; +using Org.Apache.REEF.Wake.StreamingCodec; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Network.Elastic.Failures; + +namespace Org.Apache.REEF.Network.Elastic.Comm.Impl +{ + /// + /// Streaming Codec for the checkpoint message. + /// + internal sealed class CheckpointMessageStreamingCodec : IStreamingCodec + { + private readonly IStreamingCodec _codec; + private readonly ICheckpointState _checkpoint; + + /// + /// Empty constructor to allow instantiation by reflection + /// + [Inject] + private CheckpointMessageStreamingCodec(IStreamingCodec codec, ICheckpointState checkpoint) + { + _codec = codec; + _checkpoint = checkpoint; + } + + /// + /// Read the class fields. 
+ /// + /// The reader from which to read + /// The checkpoint message + public CheckpointMessage Read(IDataReader reader) + { + int metadataSize = reader.ReadInt32() + sizeof(int) + sizeof(int); + byte[] metadata = new byte[metadataSize]; + reader.Read(ref metadata, 0, metadataSize); + var (stageName, operatorId, iteration) = GenerateMetaDataDecoding(metadata, metadataSize - sizeof(int) - sizeof(int)); + var data = _codec.Read(reader); + var payload = _checkpoint.Create(data); + + payload.StageName = stageName; + payload.OperatorId = operatorId; + payload.Iteration = iteration; + + return new CheckpointMessage(payload); + } + + /// + /// Writes the class fields. + /// + /// The message to write + /// The writer to which to write + public void Write(CheckpointMessage obj, IDataWriter writer) + { + byte[] encodedMetadata = GenerateMetaDataEncoding(obj); + + writer.Write(encodedMetadata, 0, encodedMetadata.Length); + + _codec.Write((T)obj.Checkpoint.State, writer); + } + + /// + /// Read the class fields. + /// + /// The reader from which to read + /// The cancellation token + /// The checkpoint message + public async Task ReadAsync(IDataReader reader, + CancellationToken token) + { + int metadataSize = reader.ReadInt32() + sizeof(int) + sizeof(int); + byte[] metadata = new byte[metadataSize]; + await reader.ReadAsync(metadata, 0, metadataSize, token); + var (stageName, operatorId, iteration) = GenerateMetaDataDecoding(metadata, metadataSize - sizeof(int) - sizeof(int)); + var data = await _codec.ReadAsync(reader, token); + var payload = _checkpoint.Create(data); + + payload.StageName = stageName; + payload.OperatorId = operatorId; + payload.Iteration = iteration; + + return new CheckpointMessage(payload); + } + + /// + /// Writes the class fields. 
+ /// + /// The message to write + /// The writer to which to write + /// The cancellation token + public async System.Threading.Tasks.Task WriteAsync(CheckpointMessage obj, IDataWriter writer, CancellationToken token) + { + byte[] encodedMetadata = GenerateMetaDataEncoding(obj); + + await writer.WriteAsync(encodedMetadata, 0, encodedMetadata.Length, token); + + await _codec.WriteAsync((T)obj.Checkpoint.State, writer, token); + } + + private static byte[] GenerateMetaDataEncoding(CheckpointMessage obj) + { + byte[] stageBytes = ByteUtilities.StringToByteArrays(obj.StageName); + var length = stageBytes.Length; + byte[] metadataBytes = new byte[sizeof(int) + length + sizeof(int) + sizeof(int)]; + int offset = 0; + + Buffer.BlockCopy(BitConverter.GetBytes(length), 0, metadataBytes, offset, sizeof(int)); + offset += sizeof(int); + + Buffer.BlockCopy(stageBytes, 0, metadataBytes, offset, length); + offset += length; + + Buffer.BlockCopy(BitConverter.GetBytes(obj.OperatorId), 0, metadataBytes, offset, sizeof(int)); + offset += sizeof(int); + + Buffer.BlockCopy(BitConverter.GetBytes(obj.Checkpoint.Iteration), 0, metadataBytes, offset, sizeof(int)); + + return metadataBytes; + } + + private static (string stageName, int operatorId, int iteration) GenerateMetaDataDecoding(byte[] obj, int stageLength) + { + int offset = 0; + string stageName = ByteUtilities.ByteArraysToString(obj, offset, stageLength); + offset += stageLength; + + int operatorId = BitConverter.ToInt32(obj, offset); + offset += sizeof(int); + + int iteration = BitConverter.ToInt32(obj, offset); + + return (stageName, operatorId, iteration); + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessage.cs index 30380f6456..fcde882313 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessage.cs @@ -26,16 
+26,16 @@ namespace Org.Apache.REEF.Network.Elastic.Comm.Impl /// communication layers that are type-agnostic. /// [Unstable("0.16", "API may change")] - internal abstract class DataMessage : GroupCommunicationMessage + internal abstract class DataMessage : ElasticGroupCommunicationMessage { /// /// Constructor for an untyped data message. /// - /// The name of the subscription for the message + /// The name of the stage for the message /// The operator sending the message /// The iteration in which the message is sent/valid - public DataMessage(string subscriptionName, int operatorId, int iteration) - : base(subscriptionName, operatorId) + public DataMessage(string stageName, int operatorId, int iteration) + : base(stageName, operatorId) { Iteration = iteration; } @@ -43,7 +43,7 @@ public DataMessage(string subscriptionName, int operatorId, int iteration) /// /// The iteration number for the message. /// - internal int Iteration { get; set; } + public int Iteration { get; set; } /// /// Clone the message. @@ -60,20 +60,20 @@ override public object Clone() /// /// The type for the data message [Unstable("0.16", "API may change")] - internal sealed class DataMessage : DataMessage + internal sealed class DataMessage : DataMessage, ITypedDataMessage { /// /// Constructor of a typed data message. /// - /// The name of the subscription for the message + /// The name of the stage for the message /// The operator sending the message /// The iteration in which the message is sent/valid /// The data contained in the message public DataMessage( - string subscriptionName, + string stageName, int operatorId, int iteration, //// For the moment we consider iterations as ints. Maybe this would change in the future - T data) : base(subscriptionName, operatorId, iteration) + T data) : base(stageName, operatorId, iteration) { Data = data; } @@ -81,6 +81,6 @@ public DataMessage( /// /// The data contained in the message. 
/// - internal T Data { get; set; } + public T Data { get; set; } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageStreamingCodec.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageStreamingCodec.cs new file mode 100644 index 0000000000..dab643ac0d --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageStreamingCodec.cs @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using System.Threading; +using System.Threading.Tasks; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Wake.Remote; +using Org.Apache.REEF.Wake.StreamingCodec; +using Org.Apache.REEF.Utilities; + +namespace Org.Apache.REEF.Network.Elastic.Comm.Impl +{ + /// + /// Streaming codec for the data message. + /// + internal sealed class DataMessageStreamingCodec : IStreamingCodec> + { + private readonly IStreamingCodec _codec; + + /// + /// Empty constructor to allow instantiation by reflection + /// + [Inject] + private DataMessageStreamingCodec(IStreamingCodec codec) + { + _codec = codec; + } + + /// + /// Read the class fields. 
+ /// + /// The reader from which to read + /// The data message + public DataMessage Read(IDataReader reader) + { + int metadataSize = reader.ReadInt32() + sizeof(int) + sizeof(int); + byte[] metadata = new byte[metadataSize]; + reader.Read(ref metadata, 0, metadataSize); + var (stageName, operatorId, iteration) = GenerateMetaDataDecoding(metadata, metadataSize - sizeof(int) - sizeof(int)); + var data = _codec.Read(reader); + + return new DataMessage(stageName, operatorId, iteration, data); + } + + /// + /// Writes the class fields. + /// + /// The message to write + /// The writer to which to write + public void Write(DataMessage obj, IDataWriter writer) + { + byte[] encodedMetadata = GenerateMetaDataEncoding(obj); + + writer.Write(encodedMetadata, 0, encodedMetadata.Length); + + _codec.Write(obj.Data, writer); + } + + /// + /// Read the class fields. + /// + /// The reader from which to read + /// The cancellation token + /// The data message + public async Task> ReadAsync(IDataReader reader, + CancellationToken token) + { + int metadataSize = reader.ReadInt32() + sizeof(int) + sizeof(int); + byte[] metadata = new byte[metadataSize]; + await reader.ReadAsync(metadata, 0, metadataSize, token); + var (stageName, operatorId, iteration) = GenerateMetaDataDecoding(metadata, metadataSize - sizeof(int) - sizeof(int)); + var data = await _codec.ReadAsync(reader, token); + + return new DataMessage(stageName, operatorId, iteration, data); + } + + /// + /// Writes the class fields. 
+ /// + /// The message to write + /// The writer to which to write + /// The cancellation token + public async System.Threading.Tasks.Task WriteAsync(DataMessage obj, IDataWriter writer, CancellationToken token) + { + byte[] encodedMetadata = GenerateMetaDataEncoding(obj); + + await writer.WriteAsync(encodedMetadata, 0, encodedMetadata.Length, token); + + await _codec.WriteAsync(obj.Data, writer, token); + } + + private static byte[] GenerateMetaDataEncoding(DataMessage obj) + { + byte[] stageBytes = ByteUtilities.StringToByteArrays(obj.StageName); + var length = stageBytes.Length; + byte[] metadataBytes = new byte[sizeof(int) + length + sizeof(int) + sizeof(int)]; + int offset = 0; + + Buffer.BlockCopy(BitConverter.GetBytes(length), 0, metadataBytes, offset, sizeof(int)); + offset += sizeof(int); + + Buffer.BlockCopy(stageBytes, 0, metadataBytes, offset, length); + offset += length; + + Buffer.BlockCopy(BitConverter.GetBytes(obj.OperatorId), 0, metadataBytes, offset, sizeof(int)); + offset += sizeof(int); + + Buffer.BlockCopy(BitConverter.GetBytes(obj.Iteration), 0, metadataBytes, offset, sizeof(int)); + + return metadataBytes; + } + + private static (string stageName, int operatorId, int iteration) GenerateMetaDataDecoding(byte[] obj, int stageLength) + { + int offset = 0; + string stageName = ByteUtilities.ByteArraysToString(obj, offset, stageLength); + offset += stageLength; + + int operatorId = BitConverter.ToInt32(obj, offset); + offset += sizeof(int); + + int iteration = BitConverter.ToInt32(obj, offset); + + return (stageName, operatorId, iteration); + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopology.cs index 6f12133da7..95c13836a6 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopology.cs @@ 
-31,11 +31,11 @@ internal abstract class DataMessageWithTopology : DataMessage /// /// Constructor for the base untyped data message with topology. /// - /// The name of the subscription for the message + /// The name of the stage for the message /// The operator sending the message /// The iteration in which the message is sent/valid - public DataMessageWithTopology(string subscriptionName, int operatorId, int iteration) - : base(subscriptionName, operatorId, iteration) + public DataMessageWithTopology(string stageName, int operatorId, int iteration) + : base(stageName, operatorId, iteration) { } @@ -50,22 +50,22 @@ public DataMessageWithTopology(string subscriptionName, int operatorId, int iter /// /// [Unstable("0.16", "API may change")] - internal class DataMessageWithTopology : DataMessageWithTopology + internal class DataMessageWithTopology : DataMessageWithTopology, ITypedDataMessage { /// /// Main constructor for data messages with topology information. /// - /// The name of the subscription for the message + /// The name of the stage for the message /// The operator sending the message /// The iteration in which the message is sent/valid /// The data contained in the message /// The topology updates being transmitted with the data public DataMessageWithTopology( - string subscriptionName, + string stageName, int operatorId, int iteration, T data, - List updates) : base(subscriptionName, operatorId, iteration) + List updates) : base(stageName, operatorId, iteration) { Data = data; TopologyUpdates = updates; @@ -74,21 +74,21 @@ public DataMessageWithTopology( /// /// Constructor for a data message with topology but without topology updates. 
/// - /// The name of the subscription for the message + /// The name of the stage for the message /// The operator sending the message /// The iteration in which the message is sent/valid /// The data contained in the message public DataMessageWithTopology( - string subscriptionName, + string stageName, int operatorId, int iteration, - T data) : this(subscriptionName, operatorId, iteration, data, new List()) + T data) : this(stageName, operatorId, iteration, data, new List()) { } /// /// The data contained in the message. /// - internal T Data { get; set; } + public T Data { get; set; } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopologyStreamingCodec.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopologyStreamingCodec.cs new file mode 100644 index 0000000000..2c27e44462 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DataMessageWithTopologyStreamingCodec.cs @@ -0,0 +1,160 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading;
+using System.Threading.Tasks;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Wake.Remote;
+using Org.Apache.REEF.Wake.StreamingCodec;
+using Org.Apache.REEF.Utilities;
+
+namespace Org.Apache.REEF.Network.Elastic.Comm.Impl
+{
+    /// <summary>
+    /// Streaming codec for data messages that carry topology updates.
+    /// Wire layout (identical for the sync and async paths):
+    /// [int metadata length][metadata][payload written by the inner codec].
+    /// </summary>
+    /// <typeparam name="T">The type of the data carried by the message</typeparam>
+    internal sealed class DataMessageWithTopologyStreamingCodec<T> : IStreamingCodec<DataMessageWithTopology<T>>
+    {
+        private readonly IStreamingCodec<T> _codec;
+
+        /// <summary>
+        /// Injectable constructor.
+        /// </summary>
+        /// <param name="codec">The codec used to read and write the message data</param>
+        [Inject]
+        private DataMessageWithTopologyStreamingCodec(IStreamingCodec<T> codec)
+        {
+            _codec = codec;
+        }
+
+        /// <summary>
+        /// Read the class fields.
+        /// </summary>
+        /// <param name="reader">The reader from which to read</param>
+        /// <returns>The data message, including the decoded topology updates</returns>
+        public DataMessageWithTopology<T> Read(IDataReader reader)
+        {
+            // Same framing as ReadAsync: the metadata block is preceded by its total length.
+            int metadataSize = reader.ReadInt32();
+            byte[] metadata = new byte[metadataSize];
+            reader.Read(ref metadata, 0, metadataSize);
+            var (stageName, operatorId, iteration, updates) = MetaDataDecoding(metadata);
+            var data = _codec.Read(reader);
+
+            // Forward the decoded updates; dropping them would lose the topology information.
+            return new DataMessageWithTopology<T>(stageName, operatorId, iteration, data, updates);
+        }
+
+        /// <summary>
+        /// Writes the class fields.
+        /// </summary>
+        /// <param name="obj">The message to write</param>
+        /// <param name="writer">The writer to which to write</param>
+        public void Write(DataMessageWithTopology<T> obj, IDataWriter writer)
+        {
+            byte[] encodedMetadata = MetaDataEncoding(obj);
+
+            // Same framing as WriteAsync: length prefix first, then the metadata itself.
+            writer.Write(BitConverter.GetBytes(encodedMetadata.Length), 0, sizeof(int));
+            writer.Write(encodedMetadata, 0, encodedMetadata.Length);
+
+            _codec.Write(obj.Data, writer);
+        }
+
+        /// <summary>
+        /// Read the class fields.
+        /// </summary>
+        /// <param name="reader">The reader from which to read</param>
+        /// <param name="token">The cancellation token</param>
+        /// <returns>The data message, including the decoded topology updates</returns>
+        public async Task<DataMessageWithTopology<T>> ReadAsync(IDataReader reader,
+            CancellationToken token)
+        {
+            int metadataSize = await reader.ReadInt32Async(token);
+            byte[] metadata = new byte[metadataSize];
+            await reader.ReadAsync(metadata, 0, metadataSize, token);
+            var (stageName, operatorId, iteration, updates) = MetaDataDecoding(metadata);
+            var data = await _codec.ReadAsync(reader, token);
+
+            return new DataMessageWithTopology<T>(stageName, operatorId, iteration, data, updates);
+        }
+
+        /// <summary>
+        /// Writes the class fields.
+        /// </summary>
+        /// <param name="obj">The message to write</param>
+        /// <param name="writer">The writer to which to write</param>
+        /// <param name="token">The cancellation token</param>
+        public async System.Threading.Tasks.Task WriteAsync(DataMessageWithTopology<T> obj, IDataWriter writer, CancellationToken token)
+        {
+            byte[] encodedMetadata = MetaDataEncoding(obj);
+
+            await writer.WriteAsync(BitConverter.GetBytes(encodedMetadata.Length), 0, sizeof(int), token);
+            await writer.WriteAsync(encodedMetadata, 0, encodedMetadata.Length, token);
+
+            await _codec.WriteAsync(obj.Data, writer, token);
+        }
+
+        /// <summary>
+        /// Encodes the message metadata as:
+        /// [int stage name length][stage name][int operator id][int iteration]
+        /// [int total size of the updates][serialized topology updates].
+        /// </summary>
+        /// <param name="obj">The message whose metadata is encoded</param>
+        /// <returns>The buffer containing the encoded metadata</returns>
+        private static byte[] MetaDataEncoding(DataMessageWithTopology<T> obj)
+        {
+            byte[] stageBytes = ByteUtilities.StringToByteArrays(obj.StageName);
+            var totalLengthUpdates = obj.TopologyUpdates.Sum(x => x.Size);
+
+            // NOTE(review): only four ints are written below; the sizeof(bool) term looks like a
+            // leftover that yields one unused trailing byte. Kept as is so the frame size does not
+            // change - confirm against TopologyUpdate.Serialize before removing.
+            byte[] buffer = new byte[sizeof(int) + totalLengthUpdates + sizeof(int) + stageBytes.Length + sizeof(bool) + sizeof(int) + sizeof(int)];
+            int offset = 0;
+
+            Buffer.BlockCopy(BitConverter.GetBytes(stageBytes.Length), 0, buffer, offset, sizeof(int));
+            offset += sizeof(int);
+
+            Buffer.BlockCopy(stageBytes, 0, buffer, offset, stageBytes.Length);
+            offset += stageBytes.Length;
+
+            Buffer.BlockCopy(BitConverter.GetBytes(obj.OperatorId), 0, buffer, offset, sizeof(int));
+            offset += sizeof(int);
+
+            Buffer.BlockCopy(BitConverter.GetBytes(obj.Iteration), 0, buffer, offset, sizeof(int));
+            offset += sizeof(int);
+
+            Buffer.BlockCopy(BitConverter.GetBytes(totalLengthUpdates), 0, buffer, offset, sizeof(int));
+            offset += sizeof(int);
+
+            TopologyUpdate.Serialize(buffer, ref offset, obj.TopologyUpdates);
+
+            return buffer;
+        }
+
+        /// <summary>
+        /// Decodes a metadata buffer produced by MetaDataEncoding.
+        /// </summary>
+        /// <param name="obj">The buffer containing the encoded metadata (without the length prefix)</param>
+        /// <returns>The stage name, operator id, iteration and topology updates of the message</returns>
+        private static (string stageName, int operatorId, int iteration, List<TopologyUpdate> updates) MetaDataDecoding(byte[] obj)
+        {
+            int offset = 0;
+            int stageLength = BitConverter.ToInt32(obj, offset);
+            offset += sizeof(int);
+
+            string stageName = ByteUtilities.ByteArraysToString(obj, offset, stageLength);
+            offset += stageLength;
+
+            int operatorId = BitConverter.ToInt32(obj, offset);
+            offset += sizeof(int);
+
+            int iteration = BitConverter.ToInt32(obj, offset);
+            offset += sizeof(int);
+
+            int length = BitConverter.ToInt32(obj, offset);
+            offset += sizeof(int);
+
+            var updates = TopologyUpdate.Deserialize(obj, length, offset);
+
+            return (stageName, operatorId, iteration, updates);
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DriverMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DriverMessagePayload.cs index 0386a591eb..553876cd05 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DriverMessagePayload.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/DriverMessagePayload.cs @@ -24,16 +24,16 @@ namespace Org.Apache.REEF.Network.Elastic.Comm /// Payload for messages going from the driver to tasks. /// [Unstable("0.16", "API may change")] - public abstract class DriverMessagePayload : GroupCommunicationMessage + public abstract class DriverMessagePayload : ElasticGroupCommunicationMessage { /// /// Construct a payload for messages created at the driver and directed to tasks.
/// - /// The name of the subsription - /// The id of the operator within the subscription + /// The name of the stage + /// The id of the operator within the stage /// The iteration number in which the message is sent - public DriverMessagePayload(string subscriptionName, int operatorId, int iteration) - : base(subscriptionName, operatorId) + public DriverMessagePayload(string stageName, int operatorId, int iteration) + : base(stageName, operatorId) { Iteration = iteration; } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs index 9aa2c8c059..a5ca4e3d85 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs @@ -91,11 +91,12 @@ public static ElasticDriverMessageImpl From(byte[] data, int offset = 0) switch (type) { - case DriverMessagePayloadType.Topology: - payload = TopologyMessagePayload.From(data, offset); + case DriverMessagePayloadType.Update: + case DriverMessagePayloadType.Failure: + payload = TopologyMessagePayload.From(type, data, offset); break; default: - throw new IllegalStateException($"Message type {type} not recognized"); + throw new IllegalStateException("Message type not recognized"); } return new ElasticDriverMessageImpl(destination, payload); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/GroupCommunicationMessage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs similarity index 72% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/GroupCommunicationMessage.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs index 36e16adc72..e439e8569f 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/GroupCommunicationMessage.cs +++
b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs @@ -24,34 +24,32 @@ namespace Org.Apache.REEF.Network.Elastic.Comm.Impl /// Message sent by Group Communication operators. /// [Unstable("0.16", "API may change")] - public abstract class GroupCommunicationMessage : ICloneable + public abstract class ElasticGroupCommunicationMessage : ICloneable { /// - /// Create a new Group Communication Message. + /// Create a new elastic group communication message. /// - /// The name of the subscription + /// The name of the stage /// The id of the operator sending the message - protected GroupCommunicationMessage( - string subscriptionName, + protected ElasticGroupCommunicationMessage( + string stageName, int operatorId) { - SubscriptionName = subscriptionName; + StageName = stageName; OperatorId = operatorId; } /// /// Clone the message. /// - /// An object containing the shallow copy of the message. public abstract object Clone(); /// - /// Returns the Subscription. - /// - internal string SubscriptionName { get; private set; } + /// Returns the stage. + internal string StageName { get; private set; } /// - /// Returns the Operator id. + /// Returns the operator id. 
/// internal int OperatorId { get; private set; } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/FailureMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/FailureMessagePayload.cs index 37213fc1e4..da2bd35b6a 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/FailureMessagePayload.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/FailureMessagePayload.cs @@ -32,11 +32,11 @@ internal sealed class FailureMessagePayload : TopologyMessagePayload /// /// The topology updates /// Whether the updates are additions to the current topology state or nodes removal - /// The subscription context for the message + /// The stage context for the message /// The id of the operator receiving the topology update /// The iteration in which the update takes effect - public FailureMessagePayload(List updates, string subscriptionName, int operatorId, int iteration) - : base(DriverMessagePayloadType.Failure, updates, subscriptionName, operatorId, iteration) + public FailureMessagePayload(List updates, string stageName, int operatorId, int iteration) + : base(DriverMessagePayloadType.Failure, updates, stageName, operatorId, iteration) { } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs index aeb6b34efc..198ad3c366 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs @@ -36,11 +36,11 @@ internal class TopologyMessagePayload : DriverMessagePayload /// /// The topology updates /// Whether the updates are additions to the current topology state or nodes removal - /// The subscription context for the message + /// The stage context for the message /// The id of the operator receiving the topology update /// The iteration in which the update takes effect - public 
TopologyMessagePayload(DriverMessagePayloadType type, List updates, string subscriptionName, int operatorId, int iteration) - : base(subscriptionName, operatorId, iteration) + public TopologyMessagePayload(DriverMessagePayloadType type, List updates, string stageName, int operatorId, int iteration) + : base(stageName, operatorId, iteration) { PayloadType = type; TopologyUpdates = updates; @@ -60,7 +60,7 @@ public override object Clone() updatesClone.Add(update); } - return TopologyMessageBuilder(PayloadType, updatesClone, SubscriptionName, OperatorId, Iteration); + return TopologyMessageBuilder(PayloadType, updatesClone, StageName, OperatorId, Iteration); } /// @@ -83,13 +83,13 @@ internal static DriverMessagePayload From(DriverMessagePayloadType type, byte[] length = BitConverter.ToInt32(data, offset); offset += sizeof(int); - string subscription = ByteUtilities.ByteArraysToString(data, offset, length); + string stage = ByteUtilities.ByteArraysToString(data, offset, length); offset += length; int operatorId = BitConverter.ToInt32(data, offset); offset += sizeof(int); int iteration = BitConverter.ToInt32(data, offset); - return TopologyMessageBuilder(type, updates, subscription, operatorId, iteration); + return TopologyMessageBuilder(type, updates, stage, operatorId, iteration); } /// @@ -98,20 +98,20 @@ internal static DriverMessagePayload From(DriverMessagePayloadType type, byte[] /// The serialized payload internal override byte[] Serialize() { - byte[] subscriptionBytes = ByteUtilities.StringToByteArrays(SubscriptionName); + byte[] stageBytes = ByteUtilities.StringToByteArrays(StageName); int offset = 0; var totalLengthUpdates = TopologyUpdates.Sum(x => x.Size); - byte[] buffer = new byte[sizeof(int) + totalLengthUpdates + sizeof(int) + subscriptionBytes.Length + sizeof(bool) + sizeof(int) + sizeof(int)]; + byte[] buffer = new byte[sizeof(int) + totalLengthUpdates + sizeof(int) + stageBytes.Length + sizeof(bool) + sizeof(int) + sizeof(int)]; 
Buffer.BlockCopy(BitConverter.GetBytes(totalLengthUpdates), 0, buffer, offset, sizeof(int)); offset += sizeof(int); TopologyUpdate.Serialize(buffer, ref offset, TopologyUpdates); - Buffer.BlockCopy(BitConverter.GetBytes(subscriptionBytes.Length), 0, buffer, offset, sizeof(int)); + Buffer.BlockCopy(BitConverter.GetBytes(stageBytes.Length), 0, buffer, offset, sizeof(int)); offset += sizeof(int); - Buffer.BlockCopy(subscriptionBytes, 0, buffer, offset, subscriptionBytes.Length); - offset += subscriptionBytes.Length; + Buffer.BlockCopy(stageBytes, 0, buffer, offset, stageBytes.Length); + offset += stageBytes.Length; Buffer.BlockCopy(BitConverter.GetBytes(OperatorId), 0, buffer, offset, sizeof(int)); offset += sizeof(int); Buffer.BlockCopy(BitConverter.GetBytes(Iteration), 0, buffer, offset, sizeof(int)); @@ -119,14 +119,14 @@ internal override byte[] Serialize() return buffer; } - private static DriverMessagePayload TopologyMessageBuilder(DriverMessagePayloadType type, List updates, string subscriptionName, int operatorId, int iteration) + private static DriverMessagePayload TopologyMessageBuilder(DriverMessagePayloadType type, List updates, string stageName, int operatorId, int iteration) { switch (type) { case DriverMessagePayloadType.Update: - return new UpdateMessagePayload(updates, subscriptionName, operatorId, iteration); + return new UpdateMessagePayload(updates, stageName, operatorId, iteration); case DriverMessagePayloadType.Failure: - return new FailureMessagePayload(updates, subscriptionName, operatorId, iteration); + return new FailureMessagePayload(updates, stageName, operatorId, iteration); default: throw new IllegalStateException($"Topology message type {type} not found."); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs index ed371e3cf5..beac689901 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs +++ 
b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs @@ -32,11 +32,11 @@ internal sealed class UpdateMessagePayload : TopologyMessagePayload /// /// The topology updates /// Whether the updates are additions to the current topology state or nodes removal - /// The subscription context for the message + /// The stage context for the message /// The id of the operator receiving the topology update /// The iteration in which the update takes effect - public UpdateMessagePayload(List updates, string subscriptionName, int operatorId, int iteration) - : base(DriverMessagePayloadType.Update, updates, subscriptionName, operatorId, iteration) + public UpdateMessagePayload(List updates, string stageName, int operatorId, int iteration) + : base(DriverMessagePayloadType.Update, updates, stageName, operatorId, iteration) { } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Config/ElasticServiceConfigurationOptions.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/ElasticServiceConfigurationOptions.cs index 7c32cd563d..544c811525 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Config/ElasticServiceConfigurationOptions.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/ElasticServiceConfigurationOptions.cs @@ -61,8 +61,8 @@ public sealed class DriverId : Name { } - [NamedParameter("Default Group name", defaultValue: "Subscription1")] - public sealed class DefaultSubscriptionName : Name + [NamedParameter("Default Group name", defaultValue: "Stage1")] + public sealed class DefaultStageName : Name { } @@ -71,8 +71,8 @@ public sealed class NumberOfTasks : Name { } - [NamedParameter("Serialized subscriptions configuration")] - public sealed class SerializedSubscriptionConfigs : Name> + [NamedParameter("Serialized stages configuration")] + public sealed class SerializedStageConfigs : Name> { } @@ -96,12 +96,17 @@ public sealed class NumTaskFailures : Name { } - [NamedParameter(Documentation = "Rack name used when a new evaluator is requested 
after a failure", DefaultValue = "WonderlandRack")] + [NamedParameter("Number of failures before an evaluator abort the task set", defaultValue: "3")] + public sealed class NumEvaluatorFailures : Name + { + } + + [NamedParameter(Documentation = "Rack name used when a new evaluator is requested", DefaultValue = "WonderlandRack")] public sealed class NewEvaluatorRackName : Name { } - [NamedParameter(Documentation = "Batch id used when a new evaluator is requested after a failure", DefaultValue = "IterateBroadcast")] + [NamedParameter(Documentation = "Batch id used when a new evaluator is requested", DefaultValue = "IterateBroadcast")] public sealed class NewEvaluatorBatchId : Name { } @@ -111,7 +116,7 @@ public sealed class NewEvaluatorNumCores : Name { } - [NamedParameter(Documentation = "Memory size used when a new evaluator is requested after a failure", DefaultValue = "512")] + [NamedParameter(Documentation = "Memory size used when a new evaluator is requested", DefaultValue = "512")] public sealed class NewEvaluatorMemorySize : Name { } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Config/GroupCommunicationConfigurationOptions.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/GroupCommunicationConfigurationOptions.cs new file mode 100644 index 0000000000..4972d074e6 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/GroupCommunicationConfigurationOptions.cs @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Tang.Annotations; + +namespace Org.Apache.REEF.Network.Elastic.Config +{ + /// + ///Class wrapping the configuration option parameters for task-side group communication. + /// + public sealed class GroupCommunicationConfigurationOptions + { + [NamedParameter("Timeout for sending or receiving messages", defaultValue: "600000")] + public class Timeout : Name + { + } + + [NamedParameter("Number of retry to send a message", defaultValue: "15")] + public class Retry : Name + { + } + + [NamedParameter("Timeout for disposing operators when messages are still in queue", defaultValue: "10000")] + public class DisposeTimeout : Name + { + } + + /// + /// Each communication group needs to check and wait until all the other nodes in the group are registered to the NameServer. + /// Sleep time is set between each retry. + /// + [NamedParameter("sleep time (in milliseconds) to wait for nodes to be registered", defaultValue: "60000")] + internal sealed class SleepTimeWaitingForRegistration : Name + { + } + + /// + /// Each Communication group needs to check and wait until all the other nodes in the group are registered to the NameServer. + /// + /// + /// If a node is waiting for others that need to download data, the waiting time could be long. + /// As we can use cancellation token to cancel the waiting for registration, setting this number higher should be OK. 
+ /// + [NamedParameter("Retry times to wait for nodes to be registered", defaultValue: "30")] + internal sealed class RetryCountWaitingForRegistration : Name + { + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Config/OperatorParameters.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/OperatorParameters.cs index 010a921720..dcfe369a6a 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Config/OperatorParameters.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/OperatorParameters.cs @@ -40,8 +40,8 @@ public sealed class OperatorId : Name { } - [NamedParameter("Name of the subscriptions")] - public sealed class SubscriptionName : Name + [NamedParameter("Name of the stage")] + public sealed class StageName : Name { } @@ -80,7 +80,7 @@ public sealed class Checkpointing : Name { } - [NamedParameter("Whether the operator is the last to be executed in the subscription", defaultValue: "false")] + [NamedParameter("Whether the operator is the last to be executed in the stage", defaultValue: "false")] public sealed class IsLast : Name { } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Config/StreamingCodecConfiguration.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/StreamingCodecConfiguration.cs new file mode 100644 index 0000000000..8082231e86 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/StreamingCodecConfiguration.cs @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Comm.Impl; +using Org.Apache.REEF.Tang.Formats; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Wake.StreamingCodec; + +namespace Org.Apache.REEF.Network.Elastic.Config +{ + /// + /// Defines configuration for streaming codecs of messages. + /// + /// Generic type of message + public sealed class StreamingCodecConfiguration : ConfigurationModuleBuilder + { + /// + /// RequiredImpl for Codec. Client needs to set implementation for this parameter + /// + public static readonly RequiredImpl> Codec = new RequiredImpl>(); + + /// + /// Configuration Module for Codec + /// + public static ConfigurationModule Conf = new StreamingCodecConfiguration() + .BindImplementation(GenericType>.Class, Codec) + .BindImplementation(GenericType>>.Class, + GenericType>.Class) + .BindImplementation(GenericType>>.Class, + GenericType>.Class) + .BindImplementation(GenericType>.Class, + GenericType>.Class) + .BindImplementation(GenericType>.Class, + GenericType.Class) + .Build(); + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticContext.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticContext.cs new file mode 100644 index 0000000000..9201767969 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticContext.cs @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Driver.Impl; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Formats; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Utilities.Attributes; +using System; +using System.Collections.Generic; + +namespace Org.Apache.REEF.Network.Elastic.Driver +{ + /// + /// This is the entry point for enabling the Elastic Group Communication. + /// The workflow is the following: + /// (1) Create a context instance; + /// (2) Use the context to create one or more stages; + /// (3) Use the stage to create a pipeline of operators representing the + /// communication pattern the tasks should implement; + /// (4) Create one or more task set managers to manage the scheduling of the tasks; + /// (5) Register stage to the manager to properly configure the task set. + /// + /// This interface is mainly used to create elastic stages. + /// Also manages configurations for elastic group communication operators/stages. + /// + [Unstable("0.16", "API may change")] + [DefaultImplementation(typeof(DefaultElasticContext))] + public interface IElasticContext : IFailureResponse + { + /// + /// Creates a stage with the default settings. + /// The stage lifecycle is managed by the context.
+ /// + /// A new stage with default parameters + IElasticStage DefaultStage(); + + /// + /// Creates a new stage. + /// The stage lifecycle is managed by the context. + /// + /// The name of the stage + /// The number of tasks required by the stage + /// An optional failure machine governing the stage + /// The new stage + IElasticStage CreateNewStage(string stageName, int numTasks, IFailureStateMachine failureMachine = null); + + /// + /// Remove a stage from the context. + /// + /// The name of the stage to be removed + void RemoveElasticStage(string stageName); + + /// + /// Generate the base configuration module for tasks. + /// This method can be used to generate configurations for the task set manager. + /// + /// The id of the task the configuration is generated for + /// The module with the service properly set up for the task + ConfigurationModule GetTaskConfigurationModule(string taskId); + + /// + /// Start the elastic group communication context. + /// This will trigger requests for resources as specified by the parameters. + /// + void Start(); + + /// + /// Create a new task set manager. + /// + /// The configuration for the master task + /// The configuration for the slave task + /// A new task set manager + IElasticTaskSetManager CreateNewTaskSetManager(Func masterTaskConfiguration, Func slaveTaskConfiguration = null); + + /// + /// Create a new task set manager. + /// + /// The number of tasks the task set should manage + /// The configuration for the master task + /// The configuration for the slave task + /// A new task set manager + IElasticTaskSetManager CreateNewTaskSetManager(int numOfTasks, Func masterTaskConfiguration, Func slaveTaskConfiguration = null); + + /// + /// Generate the elastic service configuration object. + /// This method is used to properly configure task contexts with the elastic service.
+ /// + /// The elastic service configuration + IConfiguration GetElasticServiceConfiguration(); + + #region Serialization Helpers + /// + /// Append a stage configuration to a configuration builder object. + /// + /// The configuration where the stage configuration will be appended to + /// The stage configuration at hand + /// The configuration containing the serialized stage configuration + void SerializeStageConfiguration(ref ICsConfigurationBuilder confBuilder, IConfiguration stageConf); + + /// + /// Append an operator configuration to a configuration builder object. + /// + /// The list where the operator configuration will be appended to + /// The operator configuration at hand + /// The configuration containing the serialized operator configuration + void SerializeOperatorConfiguration(ref IList serializedOperatorsConfs, IConfiguration operatorConfiguration); + #endregion + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetSubscription.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs similarity index 74% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetSubscription.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs index 6742bf4150..1c4bfa3c17 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetSubscription.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs @@ -28,16 +28,16 @@ namespace Org.Apache.REEF.Network.Elastic.Driver { /// /// Used to group elastic operators into logical units. - /// All operators in the same subscriptions share similar semantics and behavior - /// under failures. Subscriptions can only be created by a service. + /// All operators in the same stage share similar semantics and behavior + /// under failures. Stages can only be created by a context.
/// [Unstable("0.16", "API may change")] - public interface IElasticTaskSetSubscription : IFailureResponse, ITaskMessageResponse + public interface IElasticStage : IFailureResponse, ITaskMessageResponse { /// - /// The name of the subscriptions. + /// The name of the stages. /// - string SubscriptionName { get; } + string StageName { get; } /// /// The operator at the beginning of the computation workflow. @@ -45,68 +45,68 @@ public interface IElasticTaskSetSubscription : IFailureResponse, ITaskMessageRes ElasticOperator RootOperator { get; } /// - /// The failure state of the target subscriptions. + /// The failure state of the target stages. /// IFailureState FailureState { get; } /// - /// The service managing the subscriptions. + /// The context where the stage is created. /// - IElasticTaskSetService Service { get; } + IElasticContext Context { get; } /// - /// Whether the subscriptions is completed or not. + /// Whether the stages is completed or not. /// bool IsCompleted { get; set; } /// - /// Whether the subscriptions contains iterations or not. + /// Whether the stages contains iterations or not. /// bool IsIterative { get; set; } /// - /// Generates an id to uniquely identify Operators in the subscriptions. + /// Generates an id to uniquely identify operators in the stages. /// /// A new unique id int GetNextOperatorId(); /// - /// Add a partitioned dataset to the subscription. + /// Add a partitioned dataset to the stage. /// /// The partitioned dataset /// Whether the master node should get a partition void AddDataset(IPartitionedInputDataSet inputDataSet, bool isMasterGettingInputData = false); /// - /// Add a set of datasets to the subscription. + /// Add a set of datasets to the stage. /// /// The configuration for the datasets /// Whether the master node should get a partition void AddDataset(IConfiguration[] inputDataSet, bool isMasterGettingInputData = false); /// - /// Finalizes the subscriptions. 
- /// After the subscriptions has been finalized, no more operators can + /// Finalizes the stages. + /// After the stages has been finalized, no more operators can /// be added to the group. /// - /// The same finalized subscriptions - IElasticTaskSetSubscription Build(); + /// The same finalized stages + IElasticStage Build(); /// - /// Add a task to the subscriptions. - /// The subscriptions must have been buit before tasks can be added. + /// Add a task to the stages. + /// The stages must have been buit before tasks can be added. /// /// The id of the task to add - /// True if the task is correctly added to the subscriptions + /// True if the task is correctly added to the stages bool AddTask(string taskId); /// - /// Decides if the tasks added to the subscriptions can be scheduled for execution + /// Decides if the tasks added to the stages can be scheduled for execution /// or not. This method is used for implementing different policies for /// triggering the scheduling of tasks. /// /// True if the previously added tasks can be scheduled for execution - bool ScheduleSubscription(); + bool ScheduleStage(); /// /// Whether the input activeContext is the one of the master tasks. @@ -117,11 +117,11 @@ public interface IElasticTaskSetSubscription : IFailureResponse, ITaskMessageRes /// /// Creates the Configuration for the input task. - /// Must be called only after all tasks have been added to the subscriptions. + /// Must be called only after all tasks have been added to the stages. 
/// /// The configuration builder the configuration will be appended to - /// The task id of the task that belongs to this subscriptions - /// The configuration for the Task with added subscriptions informations + /// The task id of the task that belongs to this stages + /// The configuration for the Task with added stages informations IConfiguration GetTaskConfiguration(ref ICsConfigurationBuilder builder, int taskId); /// @@ -129,14 +129,14 @@ public interface IElasticTaskSetSubscription : IFailureResponse, ITaskMessageRes /// (if any). /// /// The task id of the task we wanto to retrieve the data partition. - /// The task is required to belong to thq subscriptions + /// The task is required to belong to thq stages /// The configuration of the data partition (if any) of the task Optional GetPartitionConf(string taskId); /// /// Retrieve the log the final statistics of the computation: this is the sum of all - /// the stats of all the Operators compising the subscription. This method can be called - /// only once the subscriptions is completed. + /// the stats of all the Operators compising the stage. This method can be called + /// only once the stages is completed. 
/// /// The final statistics for the computation string LogFinalStatistics(); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/ITaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetManager.cs similarity index 81% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Driver/ITaskSetManager.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetManager.cs index 419475be62..db66cc9808 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/ITaskSetManager.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetManager.cs @@ -19,6 +19,7 @@ using Org.Apache.REEF.Driver.Evaluator; using Org.Apache.REEF.Driver.Task; using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Tang.Interface; using Org.Apache.REEF.Utilities.Attributes; using System; using System.Collections.Generic; @@ -27,23 +28,23 @@ namespace Org.Apache.REEF.Network.Elastic.Driver { /// /// Class defining how groups of tasks sharing similar scheduling semantics are managed. - /// Task set managers subscribe to subscriptions in order to define tasks logic. + /// Task set managers subscribe to stages in order to define tasks logic. /// Task set managers schedule and manage group of tasks running in the cluster. /// [Unstable("0.16", "API may change")] - public interface ITaskSetManager : IFailureResponse, IDisposable + public interface IElasticTaskSetManager : IFailureResponse, IDisposable { /// - /// An identifier for the set of Subscriptions the Task Manager is subscribed to. - /// The task set has to be built before retrieving its subscriptions id. + /// An identifier for the set of Stages the Task Manager is subscribed to. + /// The task set has to be built before retrieving its stages id. /// - string SubscriptionsId { get; } + string StagesId { get; } /// - /// Subscribe the current task set manager to a new subscription. + /// Subscribe the current task set manager to a new stage. 
/// - /// The subscription to subscribe to - void AddTaskSetSubscription(IElasticTaskSetSubscription subscription); + /// The stage to subscribe to + void AddStage(IElasticStage stage); /// /// Decides whether more contexts have to be added to this Task Manger or not. @@ -68,18 +69,25 @@ public interface ITaskSetManager : IFailureResponse, IDisposable /// /// Finalizes the task set manager. - /// After the task set has been finalized, no more subscriptions can be added. + /// After the task set has been finalized, no more stages can be added. /// /// The same finalized task set manager - ITaskSetManager Build(); + IElasticTaskSetManager Build(); /// - /// Retrieves all subscriptions having the context passed as a parameter + /// Retrieves all stages having the context passed as a parameter /// as master task context. /// /// The target context - /// A list of subscriptions having the master task running on context - IEnumerable IsMasterTaskContext(IActiveContext context); + /// A list of stages having the master task running on context + IEnumerable IsMasterTaskContext(IActiveContext context); + + /// + /// Get the configuration of the codecs used for data transmission. + /// The codecs are automatically generated from the operator pipeline. + /// + /// A configuration object with the codecs for data transmission + IConfiguration GetCodecConfiguration(); /// /// Method implementing how the task set manager should react when a new context is active. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetService.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetService.cs deleted file mode 100644 index d768e7a4e9..0000000000 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetService.cs +++ /dev/null @@ -1,118 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using Org.Apache.REEF.Driver.Context; -using Org.Apache.REEF.Network.Elastic.Driver.Impl; -using Org.Apache.REEF.Network.Elastic.Failures; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Utilities.Attributes; -using System.Collections.Generic; - -namespace Org.Apache.REEF.Network.Elastic.Driver -{ - /// - /// This is the entry point for enabling the Elastic Group Communication service. - /// The workflow is the following: - /// (1) Create a service instance; - /// (2) Use the service to create one or more subscriptions; - /// (3) Use the subscription to create a pipeline of operators representing the - /// communication pattern the tasks should implement; - /// (4) Create one or more managers to manage the scheduling of the tasks - /// (5) Register subscriptions to the manager to properly configure the task set. - /// - /// This interface is mainly used to create subscriptions. - /// Also manages configurations for Elastic Group Communication operators/services. - /// - [Unstable("0.16", "API may change")] - [DefaultImplementation(typeof(DefaultTaskSetService))] - public interface IElasticTaskSetService : IFailureResponse - { - /// - /// Creates a subscription with the default settings. 
- /// The subscription lifecicle is managed by the service. - /// - /// A new subscription with default parameters - IElasticTaskSetSubscription DefaultTaskSetSubscription(); - - /// - /// Creates a new subscription. - /// The subscription lifecicle is managed by the service. - /// - /// The name of the subscription - /// The number of tasks required by the subscription - /// An optional failure machine governing the subscription - /// The new task Set subscrption - IElasticTaskSetSubscription NewTaskSetSubscription(string subscriptionName, int numTasks, IFailureStateMachine failureMachine = null); - - /// - /// Remove a task Set subscription from the service. - /// - /// The name of the subscription to be removed - void RemoveTaskSetSubscription(string subscriptionName); - - /// - /// Get the subscriptions names from the context. - /// - /// An activeContext - /// The subscriptions representented in the context - string GetContextSubscriptions(IActiveContext activeContext); - - /// - /// Generate the service configuration object. - /// This method is used to properly configure Contexts with the service. - /// - /// The service Configuration - IConfiguration GetServiceConfiguration(); - - /// - /// At task submission time the following steps are executed: - /// 1) Each subscription the task is registered to generates a task subscription - /// 2) Internally each configuration generated by subscriptions contains a configuration entry for each - /// operator defining the subscription pipeline. 
Such operator configurations are serialized using - /// {@link Org.Apache.REEF.Network.Elastic.Driver.IElasticTaskSetService#SerializeOperatorConfiguration} - /// 3) Tasks subscriptions are serialized into a configuration - /// 4) The service task configuration is added to the configuration object containing the serialized subscription confs - /// 5) the task configuration is merged with the configuraiton object of 4) to generate the final task configuration - /// - /// - /// - /// Creates a generic task Configuration object for the tasks registering to the service. - /// - /// The configuration of the subscription the task will register to - /// The configuration for the task with added service parameters - IConfiguration GetTaskConfiguration(ICsConfigurationBuilder subscriptionsConf); - - #region Serialization Helpers - /// - /// Appends a subscription configuration to a configuration builder object. - /// - /// The configuration where the subscription configuration will be appended to - /// The subscription configuration at hand - /// The configuration containing the serialized subscription configuration - void SerializeSubscriptionConfiguration(ref ICsConfigurationBuilder confBuilder, IConfiguration subscriptionConf); - - /// - /// Append an operator configuration to a configuration builder object. 
- /// - /// The list where the operator configuration will be appended to - /// The operator configuration at hand - /// The configuration containing the serialized operator configuration - void SerializeOperatorConfiguration(ref IList serializedOperatorsConfs, IConfiguration operatorConfiguration); - #endregion - } -} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultTaskSetService.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultElasticContext.cs similarity index 57% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultTaskSetService.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultElasticContext.cs index 4f06c71fdc..5008122729 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultTaskSetService.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultElasticContext.cs @@ -33,34 +33,44 @@ using Org.Apache.REEF.Network.Elastic.Config; using Org.Apache.REEF.Network.Elastic.Failures; using Org.Apache.REEF.Network.Elastic.Failures.Impl; -using Org.Apache.REEF.Driver.Context; using Org.Apache.REEF.Network.Elastic.Comm.Impl; using Org.Apache.REEF.Network.Elastic.Comm; using Org.Apache.REEF.Wake.Time.Event; using Org.Apache.REEF.Network.Elastic.Failures.Enum; using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Wake.Remote.Parameters; +using Org.Apache.REEF.Common.Tasks; +using Org.Apache.REEF.Network.Elastic.Task.Impl; +using Org.Apache.REEF.Driver.Evaluator; namespace Org.Apache.REEF.Network.Elastic.Driver.Impl { /// - /// Default implementation for the task service. - /// This is mainly used to create subscription. - /// Also manages configurations for Elastic Group Communication operators/services. + /// Default implementation for the task context. + /// This is mainly used to create stage. + /// Also manages configurations for Elastic Group Communication operators/contexts. 
/// [Unstable("0.16", "API may change")] - public sealed class DefaultTaskSetService : IElasticTaskSetService, IDefaultFailureEventResponse + public sealed class DefaultElasticContext : IElasticContext, IDefaultFailureEventResponse { - private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultTaskSetService)); + private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultElasticContext)); + private readonly int _startingPort; + private readonly int _portRange; private readonly string _driverId; private readonly int _numEvaluators; private readonly string _nameServerAddr; private readonly int _nameServerPort; private readonly INameServer _nameServer; - private readonly string _defaultSubscriptionName; + private readonly string _defaultStageName; private readonly IFailureStateMachine _defaultFailureMachine; + private readonly IEvaluatorRequestor _evaluatorRequestor; + private readonly int _memory; + private readonly int _cores; + private readonly string _batchId; + private readonly string _rackName; - private readonly Dictionary _subscriptions; + private readonly Dictionary _stages; private readonly AvroConfigurationSerializer _configSerializer; private readonly object _subsLock = new object(); @@ -69,22 +79,36 @@ public sealed class DefaultTaskSetService : IElasticTaskSetService, IDefaultFail private IFailureState _failureStatus; [Inject] - private DefaultTaskSetService( + private DefaultElasticContext( + [Parameter(typeof(ElasticServiceConfigurationOptions.StartingPort))] int startingPort, + [Parameter(typeof(ElasticServiceConfigurationOptions.PortRange))] int portRange, [Parameter(typeof(ElasticServiceConfigurationOptions.DriverId))] string driverId, - [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultSubscriptionName))] string defaultSubscriptionName, + [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultStageName))] string defaultStageName, [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int 
numEvaluators, + [Parameter(typeof(ElasticServiceConfigurationOptions.NewEvaluatorMemorySize))] int memory, + [Parameter(typeof(ElasticServiceConfigurationOptions.NewEvaluatorNumCores))] int cores, + [Parameter(typeof(ElasticServiceConfigurationOptions.NewEvaluatorBatchId))] string batchId, + [Parameter(typeof(ElasticServiceConfigurationOptions.NewEvaluatorRackName))] string rackName, AvroConfigurationSerializer configSerializer, + IEvaluatorRequestor evaluatorRequestor, INameServer nameServer, IFailureStateMachine defaultFailureStateMachine) { + _startingPort = startingPort; + _portRange = portRange; _driverId = driverId; _numEvaluators = numEvaluators; - _defaultSubscriptionName = defaultSubscriptionName; + _defaultStageName = defaultStageName; _defaultFailureMachine = defaultFailureStateMachine; + _evaluatorRequestor = evaluatorRequestor; + _memory = memory; + _cores = cores; + _batchId = batchId; + _rackName = rackName; _failureStatus = new DefaultFailureState(); _configSerializer = configSerializer; - _subscriptions = new Dictionary(); + _stages = new Dictionary(); _nameServer = nameServer; IPEndPoint localEndpoint = nameServer.LocalEndpoint; @@ -93,41 +117,41 @@ private DefaultTaskSetService( } /// - /// Returns a subscription with the default settings (default name and failure machine). + /// Returns a stage with the default settings (default name and failure machine). 
/// - /// A subscription with default settings - public IElasticTaskSetSubscription DefaultTaskSetSubscription() + /// A stage with default settings + public IElasticStage DefaultStage() { lock (_subsLock) { - IElasticTaskSetSubscription defaultSubscription; - _subscriptions.TryGetValue(_defaultSubscriptionName, out defaultSubscription); + IElasticStage defaultStage; + _stages.TryGetValue(_defaultStageName, out defaultStage); - if (defaultSubscription == null) + if (defaultStage == null) { - NewTaskSetSubscription(_defaultSubscriptionName, _numEvaluators, _defaultFailureMachine.Clone(_numEvaluators, (int)DefaultFailureStates.Fail)); + CreateNewStage(_defaultStageName, _numEvaluators, _defaultFailureMachine.Clone(_numEvaluators, (int)DefaultFailureStates.Fail)); } - return _subscriptions[_defaultSubscriptionName]; + return _stages[_defaultStageName]; } } /// - /// Creates a new subscription. - /// The subscription lifecicle is managed by the service. + /// Creates a new stage. + /// The stage lifecicle is managed by the context. 
/// - /// The name of the subscription - /// The number of tasks required by the subscription - /// An optional failure machine governing the subscription + /// The name of the stage + /// The number of tasks required by the stage + /// An optional failure machine governing the stage /// The new task Set subscrption - public IElasticTaskSetSubscription NewTaskSetSubscription( - string subscriptionName, + public IElasticStage CreateNewStage( + string stageName, int numTasks, IFailureStateMachine failureMachine = null) { - if (string.IsNullOrEmpty(subscriptionName)) + if (string.IsNullOrEmpty(stageName)) { - throw new ArgumentNullException($"{nameof(subscriptionName)} cannot be null."); + throw new ArgumentNullException($"{nameof(stageName)} cannot be null."); } if (numTasks <= 0) @@ -137,62 +161,107 @@ public IElasticTaskSetSubscription NewTaskSetSubscription( lock (_subsLock) { - if (_subscriptions.ContainsKey(subscriptionName)) + if (_stages.ContainsKey(stageName)) { - throw new ArgumentException($"Subscription {subscriptionName} already registered with the service."); + throw new ArgumentException($"Stage {stageName} already registered with the context."); } - var subscription = new DefaultTaskSetSubscription( - subscriptionName, + var stage = new DefaultElasticStage( + stageName, numTasks, this, failureMachine ?? _defaultFailureMachine.Clone(numTasks, (int)DefaultFailureStates.Fail)); - _subscriptions[subscriptionName] = subscription; + _stages[stageName] = stage; - return subscription; + return stage; } } /// - /// Remove a task Set subscription from the service. + /// Remove a task Set stage from the context. 
/// - /// The name of the subscription to be removed - public void RemoveTaskSetSubscription(string subscriptionName) + /// The name of the stage to be removed + public void RemoveElasticStage(string stageName) { lock (_subsLock) { - if (!_subscriptions.ContainsKey(subscriptionName)) + if (!_stages.ContainsKey(stageName)) { - throw new ArgumentException($"Subscription {subscriptionName} is not registered with the service."); + throw new ArgumentException($"Stage {stageName} is not registered with the context."); } - _subscriptions.Remove(subscriptionName); + _stages.Remove(stageName); } } /// - /// Get the subscriptions names from the context. + /// Generate the base configuration module for tasks. + /// This method is method can be used to generate configurations for the task set menager. /// - /// An activeContext - /// The subscriptions representented in the context - public string GetContextSubscriptions(IActiveContext activeContext) + /// The id of the task the configuration is generate for + /// The module with the service properly set up for the task + public ConfigurationModule GetTaskConfigurationModule(string taskId) { - return Utils.GetContextSubscriptions(activeContext); + return TaskConfiguration.ConfigurationModule + .Set(TaskConfiguration.Identifier, taskId) + .Set(TaskConfiguration.OnMessage, GenericType.Class) + .Set(TaskConfiguration.OnClose, GenericType.Class); } /// - /// Generate the service configuration object. - /// This method is used to properly configure Contexts with the service. + /// Start the elastic group communicatio context. + /// This will trigger requests for resources as specified by the parameters. 
/// - /// The service Configuration - public IConfiguration GetServiceConfiguration() + public void Start() { - IConfiguration serviceConfig = ServiceConfiguration.ConfigurationModule + var request = _evaluatorRequestor.NewBuilder() + .SetNumber(_numEvaluators) + .SetMegabytes(_memory) + .SetCores(_cores) + .SetRackName(_rackName) + .SetEvaluatorBatchId(_batchId) + .Build(); + + _evaluatorRequestor.Submit(request); + } + + /// + /// Create a new task set manager. + /// + /// The configuration for the master task + /// The configuration for the slave task + /// A new task set manager + + public IElasticTaskSetManager CreateNewTaskSetManager(Func masterTaskConfiguration, Func slaveTaskConfiguration = null) + { + return CreateNewTaskSetManager(_numEvaluators, masterTaskConfiguration, slaveTaskConfiguration); + } + + /// + /// Create a new task set manager. + /// + /// The number of tasks the task set should manager + /// The configuration for the master task + /// The configuration for the slave task + /// A new task set manager + public IElasticTaskSetManager CreateNewTaskSetManager(int numOfTasks, Func masterTaskConfiguration, Func slaveTaskConfiguration = null) + { + return new DefaultElasticTaskSetManager(numOfTasks, _evaluatorRequestor, _driverId, masterTaskConfiguration, slaveTaskConfiguration); + } + + /// + /// Generate the elastic service configuration object. + /// This method is used to properly configure task contexts with the elastic service. 
+ /// + /// The ealstic service configuration + public IConfiguration GetElasticServiceConfiguration() + { + IConfiguration contextConfig = ServiceConfiguration.ConfigurationModule .Set(ServiceConfiguration.Services, - GenericType>.Class) + GenericType>.Class) .Build(); - return TangFactory.GetTang().NewConfigurationBuilder(serviceConfig) + return TangFactory.GetTang().NewConfigurationBuilder(contextConfig) .BindNamedParameter( GenericType.Class, _nameServerAddr) @@ -201,42 +270,31 @@ public IConfiguration GetServiceConfiguration() _nameServerPort.ToString(CultureInfo.InvariantCulture)) .BindImplementation(GenericType.Class, GenericType.Class) - - .Build(); - } - - /// - /// Creates a generic task Configuration object for the tasks registering to the service. - /// - /// The configuration of the subscription the task will register to - /// The configuration for the task with added service parameters - public IConfiguration GetTaskConfiguration(ICsConfigurationBuilder subscriptionsConf) - { - return subscriptionsConf - .BindNamedParameter( - GenericType.Class, - _driverId) + .BindNamedParameter(GenericType.Class, + _startingPort.ToString(CultureInfo.InvariantCulture)) + .BindNamedParameter(GenericType.Class, + _portRange.ToString(CultureInfo.InvariantCulture)) .Build(); } /// - /// Appends a subscription configuration to a configuration builder object. + /// Appends a stage configuration to a configuration builder object. 
/// - /// The configuration where the subscription configuration will be appended to - /// The subscription configuration at hand - /// The configuration containing the serialized subscription configuration - public void SerializeSubscriptionConfiguration(ref ICsConfigurationBuilder confBuilder, IConfiguration subscriptionConfiguration) + /// The configuration where the stage configuration will be appended to + /// The stage configuration at hand + /// The configuration containing the serialized stage configuration + public void SerializeStageConfiguration(ref ICsConfigurationBuilder confBuilder, IConfiguration stageConfiguration) { - confBuilder.BindSetEntry( - GenericType.Class, - _configSerializer.ToString(subscriptionConfiguration)); + confBuilder.BindSetEntry( + GenericType.Class, + _configSerializer.ToString(stageConfiguration)); } /// /// Append an operator configuration to a configuration builder object. /// /// The list where the operator configuration will be appended to - /// The operator configuration at hand + /// The operator configuration at hand /// The configuration containing the serialized operator configuration public void SerializeOperatorConfiguration(ref IList serializedOperatorsConfs, IConfiguration operatorConfiguration) { @@ -262,7 +320,7 @@ public void OnTaskFailure(IFailedTask value, ref List failureEven /// /// The alarm triggering the timeput /// A list of messages encoding how remote Tasks need to reach - /// /// The next timeouts to be scheduled + /// The next timeouts to be scheduled public void OnTimeout(Alarm alarm, ref List msgs, ref List nextTimeouts) { } @@ -303,7 +361,7 @@ public void EventDispatcher(ref IFailureEvent @event) /// Mechanism to execute when a reconfigure event is triggered. 
/// /// - public void OnReconfigure(ref IReconfigure info) + public void OnReconfigure(ref IReconfigure reconfigureEvent) { lock (_statusLock) { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultTaskSetSubscription.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultElasticStage.cs similarity index 85% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultTaskSetSubscription.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultElasticStage.cs index bef5ccfe59..affd770ab2 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultTaskSetSubscription.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultElasticStage.cs @@ -39,14 +39,14 @@ namespace Org.Apache.REEF.Network.Elastic.Driver.Impl { /// /// Used to group elastic operators into logical units. - /// All operators in the same subscriptions share similar semantics and behavior - /// under failures. Subscriptions can only be created by a service. - /// This class is used to create subscriptions able to manage default failure events. + /// All operators in the same stages share similar semantics and behavior + /// under failures. Stages can only be created by a service. + /// This class is used to create stages able to manage default failure events. 
/// [Unstable("0.16", "API may change")] - public sealed class DefaultTaskSetSubscription : IElasticTaskSetSubscription, IDefaultFailureEventResponse + public sealed class DefaultElasticStage : IElasticStage, IDefaultFailureEventResponse { - private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultTaskSetSubscription)); + private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultElasticStage)); private bool _finalized; private volatile bool _scheduled; @@ -65,19 +65,19 @@ public sealed class DefaultTaskSetSubscription : IElasticTaskSetSubscription, ID private readonly object _statusLock = new object(); /// - /// Create a new subscription with the input settings. + /// Create a new stage with the input settings. /// - /// The name of the subscription - /// The number of tasks managed by the subscription - /// The service managing the subscription - /// The failure machine for the subscription - internal DefaultTaskSetSubscription( - string subscriptionName, + /// The name of the stage + /// The number of tasks managed by the stage + /// The service managing the stage + /// The failure machine for the stage + internal DefaultElasticStage( + string stageName, int numTasks, - IElasticTaskSetService elasticService, + IElasticContext elasticService, IFailureStateMachine failureMachine = null) { - SubscriptionName = subscriptionName; + StageName = stageName; _finalized = false; _scheduled = false; _numTasks = numTasks; @@ -85,7 +85,7 @@ internal DefaultTaskSetSubscription( _masterTasks = new HashSet(); _datasetConfiguration = Optional.Empty(); IsCompleted = false; - Service = elasticService; + Context = elasticService; _defaultFailureMachine = failureMachine ?? new DefaultFailureStateMachine(numTasks, DefaultFailureStates.Fail); FailureState = _defaultFailureMachine.State; RootOperator = new DefaultEmpty(this, _defaultFailureMachine.Clone()); @@ -94,9 +94,9 @@ internal DefaultTaskSetSubscription( } /// - /// The name of the subscriptions. 
+ /// The name of the stages. /// - public string SubscriptionName { get; set; } + public string StageName { get; set; } /// /// The operator at the beginning of the computation workflow. @@ -104,27 +104,27 @@ internal DefaultTaskSetSubscription( public ElasticOperator RootOperator { get; private set; } /// - /// The service managing the subscriptions. + /// The service managing the stages. /// - /// Whether the subscriptions contains iterations or not. + /// Whether the stages contains iterations or not. /// public bool IsIterative { get; set; } /// - /// The failure state of the target subscriptions. + /// The failure state of the target stages. /// public IFailureState FailureState { get; private set; } /// - /// Whether the subscriptions is completed or not. + /// Whether the stages is completed or not. /// public bool IsCompleted { get; set; } /// - /// Generates an id to uniquely identify operators in the subscriptions. + /// Generates an id to uniquely identify operators in the stages. /// /// A new unique id public int GetNextOperatorId() @@ -133,7 +133,7 @@ public int GetNextOperatorId() } /// - /// Add a partitioned dataset to the subscription. + /// Add a partitioned dataset to the stage. /// /// The partitioned dataset /// Whether the master node should get a partition @@ -143,7 +143,7 @@ public void AddDataset(IPartitionedInputDataSet inputDataSet, bool isMasterGetti } /// - /// Add a set of datasets to the subscription. + /// Add a set of datasets to the stage. /// /// The configuration for the datasets /// Whether the master node should get a partition @@ -155,16 +155,16 @@ public void AddDataset(IConfiguration[] inputDataSet, bool isMasterGettingInputD } /// - /// Finalizes the subscriptions. - /// After the subscriptions has been finalized, no more operators can + /// Finalizes the stages. + /// After the stages has been finalized, no more operators can /// be added to the group. 
/// - /// The same finalized subscriptions - public IElasticTaskSetSubscription Build() + /// The same finalized stages + public IElasticStage Build() { if (_finalized == true) { - throw new IllegalStateException("Subscription cannot be built more than once"); + throw new IllegalStateException("Stage cannot be built more than once"); } if (_datasetConfiguration.IsPresent()) @@ -186,11 +186,11 @@ public IElasticTaskSetSubscription Build() } /// - /// Add a task to the subscriptions. - /// The subscriptions must have been buit before tasks can be added. + /// Add a task to the stages. + /// The stages must have been buit before tasks can be added. /// /// The id of the task to add - /// True if the task is correctly added to the subscriptions + /// True if the task is correctly added to the stages public bool AddTask(string taskId) { if (taskId == string.Empty) @@ -206,7 +206,7 @@ public bool AddTask(string taskId) if (!_finalized) { - throw new IllegalStateException("Subscription must be finalized before adding tasks."); + throw new IllegalStateException("Stage must be finalized before adding tasks."); } lock (_tasksLock) @@ -246,14 +246,14 @@ public bool AddTask(string taskId) } /// - /// Decides if the tasks added to the subscriptions can be scheduled for execution + /// Decides if the tasks added to the stages can be scheduled for execution /// or not. This method is used for implementing different policies for /// triggering the scheduling of tasks. /// /// True if the previously added tasks can be scheduled for execution - public bool ScheduleSubscription() + public bool ScheduleStage() { - // Schedule if we reach the number of requested tasks or the subscription contains an iterative pipeline that is ready to be scheduled and the + // Schedule if we reach the number of requested tasks or the stage contains an iterative pipeline that is ready to be scheduled and the // policy requested by the user allow early start with ramp up. 
if (!_scheduled && (_numTasks == _tasksAdded || (IsIterative && _defaultFailureMachine.State.FailureState < (int)DefaultFailureStates.StopAndReschedule && RootOperator.CanBeScheduled()))) { @@ -283,18 +283,18 @@ public bool IsMasterTaskContext(IActiveContext activeContext) /// /// Creates the Configuration for the input task. - /// Must be called only after all tasks have been added to the subscriptions. + /// Must be called only after all tasks have been added to the stages. /// /// The configuration builder the configuration will be appended to - /// The task id of the task that belongs to this subscriptions - /// The configuration for the Task with added subscriptions informations + /// The task id of the task that belongs to this stages + /// The configuration for the Task with added stages informations public IConfiguration GetTaskConfiguration(ref ICsConfigurationBuilder builder, int taskId) { IList serializedOperatorsConfs = new List(); builder = builder - .BindNamedParameter( - GenericType.Class, - SubscriptionName); + .BindNamedParameter( + GenericType.Class, + StageName); RootOperator.GetTaskConfiguration(ref serializedOperatorsConfs, taskId); @@ -312,7 +312,7 @@ public IConfiguration GetTaskConfiguration(ref ICsConfigurationBuilder builder, /// (if any). /// /// The task id of the task we wanto to retrieve the data partition. - /// The task is required to belong to thq subscriptions + /// The task is required to belong to thq stages /// The configuration of the data partition (if any) of the task public Optional GetPartitionConf(string taskId) { @@ -334,15 +334,15 @@ public Optional GetPartitionConf(string taskId) /// /// Retrieve the log the final statistics of the computation: this is the sum of all - /// the stats of all the Operators compising the subscription. This method can be called - /// only once the subscriptions is completed. + /// the stats of all the Operators compising the stage. 
This method can be called + /// only once the stages is completed. /// /// The final statistics for the computation public string LogFinalStatistics() { if (!IsCompleted) { - throw new IllegalStateException($"Cannot log statistics before Subscription {SubscriptionName} is completed"); + throw new IllegalStateException($"Cannot log statistics before Stage {StageName} is completed"); } return RootOperator.LogFinalStatistics(); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointState.cs index 66c7d56442..45a1cb8c7c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointState.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointState.cs @@ -39,9 +39,9 @@ public interface ICheckpointState int OperatorId { get; set; } /// - /// The subscription name of the checkpoint. + /// The stage name of the checkpoint. /// - string SubscriptionName { get; set; } + string StageName { get; set; } /// /// The actual state of the checkpoint. @@ -60,6 +60,6 @@ public interface ICheckpointState /// to be sent among nodes to recover computation. /// /// A checkpoint ready to be communicated - GroupCommunicationMessage ToMessage(); + ElasticGroupCommunicationMessage ToMessage(); } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureResponse.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureResponse.cs index 2cc49aec2c..286a582549 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureResponse.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureResponse.cs @@ -25,7 +25,7 @@ namespace Org.Apache.REEF.Network.Elastic.Failures { /// /// Entry point for classes expected to be aware and act over failures. - /// Used to propagate failures through operators, subscriptions and the service. + /// Used to propagate failures through operators, stages and the context. 
/// [Unstable("0.16", "API may change")] public interface IFailureResponse @@ -44,7 +44,7 @@ public interface IFailureResponse /// /// The alarm triggering the timeput /// A list of messages encoding how remote Tasks need to reach - /// /// The next timeouts to be scheduled + /// The next timeouts to be scheduled void OnTimeout(Alarm alarm, ref List msgs, ref List nextTimeouts); /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IReschedule.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IReschedule.cs index 73618a5349..dd9061a00c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IReschedule.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IReschedule.cs @@ -28,7 +28,7 @@ namespace Org.Apache.REEF.Network.Elastic.Failures public interface IReschedule : IReconfigure { /// - /// The configurations for the subscriptions of the task. + /// The configurations for the stages of the task. /// Dictionary> RescheduleTaskConfigurations { get; } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultCheckpointState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultCheckpointState.cs index ae70923f90..63a0042c9b 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultCheckpointState.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultCheckpointState.cs @@ -33,7 +33,7 @@ private DefaultCheckpointState() { Iteration = -1; OperatorId = -1; - SubscriptionName = string.Empty; + StageName = string.Empty; } /// @@ -47,9 +47,9 @@ private DefaultCheckpointState() public int OperatorId { get; set; } /// - /// The subscription name of the checkpoint. + /// The stage name of the checkpoint. /// - public string SubscriptionName { get; set; } + public string StageName { get; set; } /// /// The actual state of the checkpoint. @@ -74,7 +74,7 @@ public ICheckpointState Create(object state) /// to be sent among nodes to recover computation. 
/// /// A checkpoint ready to be communicated - public GroupCommunicationMessage ToMessage() + public ElasticGroupCommunicationMessage ToMessage() { return new CheckpointMessage(this); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/RescheduleEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/RescheduleEvent.cs index c6c74698f3..3e9b4f84f8 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/RescheduleEvent.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/RescheduleEvent.cs @@ -15,9 +15,7 @@ // specific language governing permissions and limitations // under the License. -using System; using System.Collections.Generic; -using Org.Apache.REEF.Driver.Context; using Org.Apache.REEF.Driver.Task; using Org.Apache.REEF.Utilities; using Org.Apache.REEF.Network.Elastic.Comm; @@ -76,12 +74,13 @@ public int FailureEvent public Optional Iteration { get; set; } /// - /// The response message generated to react to the failure event. + /// Messages implementing the response from the driver to the tasks + /// to reconfigure the compution. /// public List FailureResponse { get; private set; } /// - /// The configurations for the subscriptions of the task. + /// The configurations for the stages of the task. 
/// public Dictionary> RescheduleTaskConfigurations { get; private set; } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultBroadcast.cs index f85e1729e5..a6b46b5365 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultBroadcast.cs @@ -66,7 +66,7 @@ protected override void PhysicalOperatorConfiguration(ref ICsConfigurationBuilde { confBuilder .BindImplementation(GenericType>.Class, GenericType>.Class) - .BindImplementation(GenericType.Class, GenericType>.Class); + .BindImplementation(GenericType.Class, GenericType>.Class); SetMessageType(typeof(Physical.Impl.DefaultBroadcast), ref confBuilder); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultEmpty.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultEmpty.cs index 76c1102e54..122b3d9cad 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultEmpty.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultEmpty.cs @@ -35,10 +35,10 @@ class DefaultEmpty : ElasticOperatorWithDefaultDispatcher /// /// Basic constructor for the empty operator. 
/// - /// The subscription the operator is part of + /// The stage the operator is part of /// The failure machine goverining the opeartor - public DefaultEmpty(IElasticTaskSetSubscription subscription, IFailureStateMachine failureMachine) : - base(subscription, null, new EmptyTopology(), failureMachine) + public DefaultEmpty(IElasticStage stage, IFailureStateMachine failureMachine) : + base(stage, null, new EmptyTopology(), failureMachine) { OperatorName = Constants.Empty; MasterId = 1; @@ -85,7 +85,7 @@ internal override void GatherMasterIds(ref HashSet masterTasks) { if (!_operatorFinalized) { - throw new IllegalStateException("Operator need to be build before finalizing the subscription"); + throw new IllegalStateException("Operator need to be build before finalizing the stage"); } if (_next != null) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultOneToN.cs index fdde6ebc19..f8c622c879 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultOneToN.cs @@ -93,7 +93,7 @@ protected override bool ReactOnTaskMessage(ITaskMessage message, ref List /// Base constructor for an abstract operator implementing the default failure logic. 
/// - /// The subscription the operator is part of + /// The stage the operator is part of /// The previous operator in the pipelines /// The topology for the operator /// The failure machine of the operator /// The chckpoint level for the opearator /// Additonal opeartor specific configurations protected ElasticOperatorWithDefaultDispatcher( - IElasticTaskSetSubscription subscription, + IElasticStage stage, ElasticOperator prev, ITopology topology, IFailureStateMachine failureMachine, CheckpointLevel checkpointLevel = CheckpointLevel.None, params IConfiguration[] configurations) : - base(subscription, prev, topology, failureMachine, checkpointLevel, configurations) + base(stage, prev, topology, failureMachine, checkpointLevel, configurations) { } @@ -242,7 +242,7 @@ protected override bool PropagateFailureDownstream() /// protected override void LogOperatorState() { - string intro = $"State for Operator {OperatorName} in Subscription {Subscription.SubscriptionName}:\n"; + string intro = $"State for Operator {OperatorName} in Stage {Stage.StageName}:\n"; string topologyState = $"Topology:\n{_topology.LogTopologyState()}\n"; string failureMachineState = $"Failure State: {(DefaultFailureStates)_failureMachine.State.FailureState}" + $"\nFailure(s) Reported: {_failureMachine.NumOfFailedDataPoints}/{_failureMachine.NumOfDataPoints}"; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/ElasticOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/ElasticOperator.cs index fb942f7b05..16ef19d2d5 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/ElasticOperator.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/ElasticOperator.cs @@ -36,12 +36,13 @@ using Org.Apache.REEF.Utilities.Attributes; using Org.Apache.REEF.Network.Elastic.Failures.Enum; using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; +using Org.Apache.REEF.Wake.StreamingCodec.CommonStreamingCodecs; namespace 
Org.Apache.REEF.Network.Elastic.Operators.Logical.Impl { /// /// Basic implementation for logical operators. - /// Each operator is part of a subscription and is parametrized by a topology, a failure + /// Each operator is part of a stage and is parametrized by a topology, a failure /// state machine and a checkpoint policy. /// Operators are composed into pipelines. /// Once a pipeline is finalized, tasks can be added to the operator, which @@ -54,6 +55,31 @@ public abstract class ElasticOperator : IFailureResponse, ITaskMessageResponse { private static readonly Logger LOGGER = Logger.GetLogger(typeof(ElasticOperator)); + protected static readonly Dictionary CODECMAP = new Dictionary() + { + { + typeof(int), StreamingCodecConfiguration.Conf + .Set(StreamingCodecConfiguration.Codec, GenericType.Class) + .Build() + }, + { + typeof(int[]), StreamingCodecConfiguration.Conf + .Set(StreamingCodecConfiguration.Codec, GenericType.Class) + .Build() + }, + { + typeof(float), StreamingCodecConfiguration.Conf + .Set(StreamingCodecConfiguration.Codec, GenericType.Class) + .Build() + }, + { + typeof(float[]), StreamingCodecConfiguration.Conf + .Set(StreamingCodecConfiguration.Codec, GenericType.Class) + .Build() + } + + }; + // For the moment we consider only linear sequences (pipelines) of operators (no branching for e.g., joins) protected ElasticOperator _next = null; protected readonly ElasticOperator _prev; @@ -66,28 +92,28 @@ public abstract class ElasticOperator : IFailureResponse, ITaskMessageResponse protected bool _operatorFinalized; protected volatile bool _operatorStateFinalized; - protected IElasticTaskSetSubscription _subscription; + protected IElasticStage _stage; /// /// Specification for generic elastic operators. 
/// - /// The subscription this operator is part of + /// The stage this operator is part of /// The previous operator in the pipeline /// The topology of the operator /// The behavior of the operator under failures /// The checkpoint policy for the operator /// Additional configuration parameters public ElasticOperator( - IElasticTaskSetSubscription subscription, + IElasticStage stage, ElasticOperator prev, ITopology topology, IFailureStateMachine failureMachine, CheckpointLevel checkpointLevel = CheckpointLevel.None, params IConfiguration[] configurations) { - _subscription = subscription; + _stage = stage; _prev = prev; - _id = Subscription.GetNextOperatorId(); + _id = Stage.GetNextOperatorId(); _topology = topology; _failureMachine = failureMachine; _checkpointLevel = checkpointLevel; @@ -96,7 +122,7 @@ public ElasticOperator( _operatorStateFinalized = false; _topology.OperatorId = _id; - _topology.SubscriptionName = Subscription.SubscriptionName; + _topology.StageName = Stage.StageName; } /// @@ -115,25 +141,25 @@ public ElasticOperator( public bool WithinIteration { get; protected set; } /// - /// The subscription this operator is part of. + /// The stage this operator is part of. 
/// - public IElasticTaskSetSubscription Subscription + public IElasticStage Stage { get { - if (_subscription == null) + if (_stage == null) { if (_prev == null) { - throw new IllegalStateException("The reference to the parent subscription is lost."); + throw new IllegalStateException("The reference to the parent stage is lost."); } - _subscription = _prev.Subscription; + _stage = _prev.Stage; - return _prev.Subscription; + return _prev.Stage; } - return _subscription; + return _stage; } } @@ -261,6 +287,14 @@ public virtual ElasticOperator BuildState() return this; } + internal virtual void GetCodecConfiguration(ref IConfiguration confBuilder) + { + if (_next != null) + { + _next.GetCodecConfiguration(ref confBuilder); + } + } + /// /// Whether this is the last iterator in the pipeline. /// @@ -301,7 +335,7 @@ public virtual bool CheckIfLastIterator() /// /// The alarm triggering the timeput /// A list of messages encoding how remote Tasks need to reach - /// /// The next timeouts to be scheduled + /// The next timeouts to be scheduled public abstract void OnTimeout(Alarm alarm, ref List msgs, ref List nextTimeouts); /// @@ -354,6 +388,41 @@ internal bool CanBeScheduled() return canBeScheduled; } + /// + /// Utility method gathering the set of master task ids of the operators in the current pipeline. + /// + /// The id of the master tasks of the current and successive operators + internal virtual void GatherMasterIds(ref HashSet masterTasks) + { + if (_operatorFinalized != true) + { + throw new IllegalStateException("Operator need to be build before gathering information."); + } + + masterTasks.Add(Utils.BuildTaskId(Stage.StageName, MasterId)); + + if (_next != null) + { + _next.GatherMasterIds(ref masterTasks); + } + } + + /// + /// Log the final statistics of the operator. + /// This is called when the pipeline execution is completed. 
+ /// + internal virtual string LogFinalStatistics() + { + var str = LogInternalStatistics(); + + if (_next != null) + { + str += _next.LogFinalStatistics(); + } + + return str; + } + /// /// Appends the message type to the configuration. /// @@ -401,7 +470,7 @@ protected virtual void GetOperatorConfiguration(ref IList serializedOper PhysicalOperatorConfiguration(ref operatorBuilder); - if (!Subscription.IsIterative && _next == null) + if (!Stage.IsIterative && _next == null) { operatorBuilder.BindNamedParameter( GenericType.Class, @@ -422,7 +491,7 @@ protected virtual void GetOperatorConfiguration(ref IList serializedOper operatorConf = Configurations.Merge(operatorConf, conf); } - Subscription.Service.SerializeOperatorConfiguration(ref serializedOperatorsConfs, operatorConf); + Stage.Context.SerializeOperatorConfiguration(ref serializedOperatorsConfs, operatorConf); } /// @@ -445,31 +514,12 @@ protected virtual bool ReactOnTaskMessage(ITaskMessage message, ref List - /// Utility method gathering the set of master task ids of the operators in the current pipeline. - /// - /// The id of the master tasks of the current and successive operators - internal virtual void GatherMasterIds(ref HashSet masterTasks) - { - if (_operatorFinalized != true) - { - throw new IllegalStateException("Operator need to be build before gathering information."); - } - - masterTasks.Add(Utils.BuildTaskId(Subscription.SubscriptionName, MasterId)); - - if (_next != null) - { - _next.GatherMasterIds(ref masterTasks); - } - } - /// /// Logs the current operator state. 
/// protected virtual void LogOperatorState() { - string intro = $"State for Operator {OperatorName} in Subscription {Subscription.SubscriptionName}:\n"; + string intro = $"State for Operator {OperatorName} in Stage {Stage.StageName}:\n"; string topologyState = $"Topology:\n{_topology.LogTopologyState()}"; string failureMachineState = "Failure State: " + _failureMachine.State.FailureState + "\nFailure(s) Reported: " + _failureMachine.NumOfFailedDataPoints; @@ -477,28 +527,6 @@ protected virtual void LogOperatorState() LOGGER.Log(Level.Info, intro + topologyState + failureMachineState); } - /// - /// Binding from logical to physical operator. - /// - /// The configuration builder the binding will be added to - protected abstract void PhysicalOperatorConfiguration(ref ICsConfigurationBuilder builder); - - /// - /// Log the final statistics of the operator. - /// This is called when the pipeline execution is completed. - /// - internal virtual string LogFinalStatistics() - { - var str = LogInternalStatistics(); - - if (_next != null) - { - str += _next.LogFinalStatistics(); - } - - return str; - } - /// /// Log the final internal statistics of the operator. /// @@ -507,6 +535,12 @@ protected virtual string LogInternalStatistics() return _topology.LogFinalStatistics(); } + /// + /// Binding from logical to physical operator. 
+ /// + /// The configuration builder the binding will be added to + protected abstract void PhysicalOperatorConfiguration(ref ICsConfigurationBuilder builder); + private ITopology GetTopology(TopologyType topologyType) { ITopology topology; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultBroadcast.cs index 76bb2b34cd..69750cb665 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultBroadcast.cs @@ -63,7 +63,7 @@ public void Send(T data) _position = PositionTracker.InSend; int iteration = IteratorReference == null ? 0 : (int)IteratorReference.Current; - var message = _topology.AssembleDataMessage(iteration, new[] { data }); + var message = _topology.GetDataMessage(iteration, new[] { data }); Checkpoint(message, message.Iteration); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultOneToN.cs index 8dffa6c37f..94a0433fb9 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultOneToN.cs @@ -25,6 +25,7 @@ using Org.Apache.REEF.Network.Elastic.Failures.Enum; using Org.Apache.REEF.Utilities.Attributes; using Org.Apache.REEF.Network.Elastic.Operators.Physical.Enum; +using Org.Apache.REEF.Network.Elastic.Comm; namespace Org.Apache.REEF.Network.Elastic.Operators.Physical.Impl { @@ -106,7 +107,7 @@ public string FailureInfo /// /// The set of messages checkpointed in memory. /// - private List CheckpointedMessages { get; set; } + private List CheckpointedMessages { get; set; } /// /// Receive a message from neighbors broadcasters. 
@@ -117,16 +118,18 @@ public T Receive() _position = PositionTracker.InReceive; var received = false; - DataMessageWithTopology message = null; + DataMessage dataMessage = null; + ITypedDataMessage typedDataMessage = null; var isIterative = IteratorReference != null; while (!received && !CancellationSource.IsCancellationRequested) { - message = _topology.Receive(CancellationSource) as DataMessageWithTopology; + dataMessage = _topology.Receive(CancellationSource) as DataMessage; + typedDataMessage = dataMessage as ITypedDataMessage; - if (isIterative && message.Iteration < (int)IteratorReference.Current) + if (isIterative && typedDataMessage.Iteration < (int)IteratorReference.Current) { - LOGGER.Log(Level.Warning, $"Received message for iteration {message.Iteration} but I am already in iteration {(int)IteratorReference.Current}: ignoring."); + LOGGER.Log(Level.Warning, $"Received message for iteration {typedDataMessage.Iteration} but I am already in iteration {(int)IteratorReference.Current}: ignoring."); } else { @@ -134,21 +137,21 @@ public T Receive() } } - if (message == null) + if (typedDataMessage == null) { throw new OperationCanceledException("Impossible to receive messages: operation cancelled."); } if (isIterative) { - IteratorReference.SyncIteration(message.Iteration); + IteratorReference.SyncIteration(typedDataMessage.Iteration); } - Checkpoint(message, message.Iteration); + Checkpoint(dataMessage, dataMessage.Iteration); _position = PositionTracker.AfterReceive; - return message.Data; + return typedDataMessage.Data; } /// @@ -187,7 +190,7 @@ public void Dispose() { if (_isLast) { - _topology.SignalSubscriptionComplete(); + _topology.StageComplete(); } _topology.Dispose(); } @@ -197,14 +200,14 @@ public void Dispose() /// /// The messages to checkpoint /// The iteration of the checkpoint - internal void Checkpoint(GroupCommunicationMessage data, int iteration) + internal void Checkpoint(ElasticGroupCommunicationMessage data, int iteration) { if 
(_checkpointableState.Level > CheckpointLevel.None) { - var state = _checkpointableState.Create(iteration); + var state = _checkpointableState.Create(); state.MakeCheckpointable(data); - _topology.Checkpoint(state); + _topology.Checkpoint(state, iteration); } } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ICheckpointLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ICheckpointLayer.cs new file mode 100644 index 0000000000..1588a48753 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ICheckpointLayer.cs @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Network.Elastic.Task.Impl; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Utilities.Attributes; +using System; + +namespace Org.Apache.REEF.Network.Elastic.Task +{ + /// + /// Checkpointing service interface used to locally checkpoint some task state or retrieve previously checkpointed local / remote states. 
+ /// + [Unstable("0.16", "API may change")] + [DefaultImplementation(typeof(CentralizedCheckpointLayer))] + internal interface ICheckpointLayer : IDisposable + { + /// + /// The service for communicating with the other available nodes. + /// + CommunicationLayer CommunicationLayer { set; } + + /// + /// Register the current task id as well as notify the root task for the operator. + /// + /// The name of the stage + /// The operator identifier + /// The identifier of the current task + /// The identifier of the root task of the operator + void RegisterNode(string stageName, int operatorId, string taskId, string rootTaskId); + + /// + /// Checkpoint the input state. + /// + /// The state to checkpoint + void Checkpoint(ICheckpointState state); + + /// + /// Retrieve a checkpoint. + /// + /// The retrieve checkpoint if exists + /// The local task identifier + /// The name of the stage + /// The operator identifier + /// The iteration number of the checkpoint + /// Whether to request the checkpoint remotely if not found locally + /// True if the checkpoint is found, false otherwise + bool GetCheckpoint(out ICheckpointState checkpoint, string taskId, string stageName, int operatorId, int iteration = -1, bool requestToRemote = true); + + /// + /// Remove a checkpoint. + /// + /// The stage of the checkpoint to remove + /// The operator id of the checkpoint to remove + void RemoveCheckpoint(string stageName, int operatorId); + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CancellationSource.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CancellationSource.cs new file mode 100644 index 0000000000..76c3ad671e --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CancellationSource.cs @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Utilities.Attributes; +using System.Threading; + +namespace Org.Apache.REEF.Network.Elastic.Task.Impl +{ + /// + /// Generic cancellation source for task operations. + /// This class basically wraps and uses Tang + /// to inject the same source through the elastic communication services. + /// + [Unstable("0.16", "API may change")] + internal sealed class CancellationSource + { + [Inject] + public CancellationSource() + { + Source = new CancellationTokenSource(); + } + + /// + /// The wrapped cancellation source. + /// + public CancellationTokenSource Source { get; private set; } + + /// + /// Whether the operation is cancelled. + /// + /// + public bool IsCancelled + { + get { return Source.IsCancellationRequested; } + } + + /// + /// Cancel the currently running computation. 
+ /// + public void Cancel() + { + Source.Cancel(); + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CentralizedCheckpointService.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CentralizedCheckpointService.cs new file mode 100644 index 0000000000..b20226cbd4 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CentralizedCheckpointService.cs @@ -0,0 +1,243 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Comm.Impl; +using Org.Apache.REEF.Network.Elastic.Config; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Utilities.Logging; +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Linq; +using System.Threading; + +namespace Org.Apache.REEF.Network.Elastic.Task.Impl +{ + /// + /// Checkpointing service used to locally checkpoint some task state or retrieve previously checkpointed local / remote states. + /// This service allows to reach remote checkpoints stored in root node when operators support it. 
+ /// + [Unstable("0.16", "API may change")] + internal class CentralizedCheckpointLayer : ICheckpointLayer + { + private static readonly Logger LOGGER = Logger.GetLogger(typeof(CentralizedCheckpointLayer)); + + private readonly ConcurrentDictionary> _checkpoints; + private readonly ConcurrentDictionary _roots; + private readonly ConcurrentDictionary _checkpointsWaiting; + + private readonly int _limit; + private readonly int _timeout; + private readonly int _retry; + + private readonly CancellationSource _cancellationSource; + + [Inject] + private CentralizedCheckpointLayer( + [Parameter(typeof(ElasticServiceConfigurationOptions.NumCheckpoints))] int num, + [Parameter(typeof(GroupCommunicationConfigurationOptions.Timeout))] int timeout, + [Parameter(typeof(GroupCommunicationConfigurationOptions.Retry))] int retry, + CancellationSource cancellationSource, + CommunicationLayer commLayer) + { + _limit = num; + _timeout = timeout; + _retry = retry; + + _cancellationSource = cancellationSource; + + _checkpoints = new ConcurrentDictionary>(); + _roots = new ConcurrentDictionary(); + _checkpointsWaiting = new ConcurrentDictionary(); + } + + /// The service for communicating with the other available nodes. + /// + public CommunicationLayer CommunicationLayer { private get; set; } + + /// + /// Register the current task id as well as notify the root task for the operator. + /// + /// The name of the stage + /// The operator identifier + /// The identifier of the current task + /// The identifier of the root task of the operator + public void RegisterNode(string stageName, int operatorId, string taskId, string rootTaskId) + { + var id = new CheckpointIdentifier(stageName, operatorId); + if (!_roots.ContainsKey(id) && taskId != rootTaskId) + { + _roots.TryAdd(id, rootTaskId); + } + } + + /// + /// Checkpoint the input state. 
+ /// + /// The state to checkpoint + public void Checkpoint(ICheckpointState state) + { + if (state.StageName == null || state.StageName == string.Empty) + { + throw new ArgumentException(nameof(state.StageName), "Null or empty."); + } + + if (state.OperatorId < 0) + { + throw new ArgumentException(nameof(state.OperatorId), "Invalid."); + } + + SortedDictionary checkpoints; + var id = new CheckpointIdentifier(state.StageName, state.OperatorId); + ManualResetEvent waiting; + + if (!_checkpoints.TryGetValue(id, out checkpoints)) + { + checkpoints = new SortedDictionary(); + _checkpoints.TryAdd(id, checkpoints); + } + + checkpoints[state.Iteration] = state; + + if (_checkpointsWaiting.TryRemove(id, out waiting)) + { + waiting.Set(); + } + + CheckSize(checkpoints); + } + + /// + /// Retrieve a checkpoint. + /// + /// The retrieve checkpoint if exists + /// The local task identifier + /// The name of the stage + /// The operator identifier + /// The iteration number of the checkpoint + /// Whether to request the checkpoint remotely if not found locally + /// True if the checkpoint is found, false otherwise + public bool GetCheckpoint(out ICheckpointState checkpoint, string taskId, string stageName, int operatorId, int iteration = -1, bool requestToRemote = true) + { + SortedDictionary checkpoints; + var id = new CheckpointIdentifier(stageName, operatorId); + checkpoint = null; + + if (!_checkpoints.TryGetValue(id, out checkpoints)) + { + LOGGER.Log(Level.Warning, "Asking for a checkpoint not in the service."); + + if (!requestToRemote) + { + LOGGER.Log(Level.Warning, "Trying to recover from a non existing checkpoint."); + return false; + } + + string rootTaskId; + + if (!_roots.TryGetValue(id, out rootTaskId)) + { + LOGGER.Log(Level.Warning, "Trying to recover from a non existing checkpoint."); + return false; + } + + if (CommunicationLayer == null) + { + throw new IllegalStateException("Communication service not set up."); + } + + var received = new 
ManualResetEvent(false); + var retry = 0; + + do + { + LOGGER.Log(Level.Info, $"Retrieving the checkpoint from {rootTaskId}."); + var cpm = new CheckpointMessageRequest(stageName, operatorId, iteration); + + CommunicationLayer.Send(rootTaskId, cpm, _cancellationSource.Source); + + _checkpointsWaiting.TryAdd(id, received); + retry++; + } + while (!received.WaitOne(_timeout) && retry < _retry); + + if (!_checkpoints.TryGetValue(id, out checkpoints)) + { + LOGGER.Log(Level.Warning, "Checkpoint not retrieved."); + _checkpointsWaiting.TryRemove(id, out received); + return false; + } + } + + iteration = iteration < 0 ? checkpoints.Keys.Last() : iteration; + + if (!checkpoints.TryGetValue(iteration, out checkpoint)) + { + LOGGER.Log(Level.Warning, $"Checkpoint for iteration {iteration} not found."); + } + + return true; + } + + /// + /// Remove a checkpoint. + /// + /// The stage of the checkpoint to remove + /// The operator id of the checkpoint to remove + public void RemoveCheckpoint(string stageName, int operatorId) + { + if (stageName == null || stageName == string.Empty) + { + throw new ArgumentException(nameof(stageName), "Null or empty."); + } + + if (operatorId < 0) + { + throw new ArgumentException(nameof(operatorId), "Invalid."); + } + + var id = new CheckpointIdentifier(stageName, operatorId); + SortedDictionary checkpoints; + + _checkpoints.TryRemove(id, out checkpoints); + } + + /// + /// Dispose the service. 
+ /// + public void Dispose() + { + foreach (var waiting in _checkpointsWaiting.Values) + { + waiting.Set(); + waiting.Close(); + } + } + + private void CheckSize(SortedDictionary checkpoint) + { + if (checkpoint.Keys.Count > _limit) + { + var first = checkpoint.Keys.First(); + checkpoint.Remove(first); + } + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CheckpointIdentifier.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CheckpointIdentifier.cs new file mode 100644 index 0000000000..cfc2bf50e9 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CheckpointIdentifier.cs @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Task.Impl +{ + /// + /// An identifier for a given node in the group communication graph. + /// A node is uniquely identifiable by a combination of its Task ID, + /// , and . + /// + [Unstable("0.16", "API may change")] + internal sealed class CheckpointIdentifier + { + /// + /// Construct a new checkpoint identifier. 
+ /// + /// The stage name + /// The operator identifier + public CheckpointIdentifier(string stageName, int operatorId) + { + StageName = stageName; + OperatorId = operatorId; + } + + /// + /// The stage name of the node. + /// + public string StageName { get; private set; } + + /// + /// The operator id of the node. + /// + public int OperatorId { get; private set; } + + /// + /// Overrides . Simply compares equivalence of instance fields. + /// + public override bool Equals(object obj) + { + if (ReferenceEquals(null, obj)) + { + return false; + } + + if (ReferenceEquals(this, obj)) + { + return true; + } + + return obj is CheckpointIdentifier && Equals((CheckpointIdentifier)obj); + } + + /// + /// Overrides . Generates hashcode based on the instance fields. + /// + public override int GetHashCode() + { + int hash = 17; + hash = (hash * 31) + StageName.GetHashCode(); + return (hash * 31) + OperatorId.GetHashCode(); + } + + /// + /// Compare equality of instance fields. + /// + private bool Equals(CheckpointIdentifier other) + { + return StageName.Equals(other.StageName) && + OperatorId.Equals(other.OperatorId); + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs index 1e305b97e4..bcbd249b1a 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs @@ -37,9 +37,9 @@ public interface ITopology int OperatorId { get; set; } /// - /// The subscription of the operator using the topology. + /// The stage of the operator using the topology. /// - string SubscriptionName { get; set; } + string StageName { get; set; } /// /// Adds a new task to the topology. 
@@ -89,7 +89,8 @@ public interface ITopology string LogTopologyState(); /// - /// This method is triggered when a node detects a change in the topology and asks the driver for an update. + /// This method is triggered when a node contacts the driver to synchronize the remote topology + /// with the driver's one. /// /// The identifier of the task asking for the update /// A list of message containing the topology update @@ -108,7 +109,7 @@ public interface ITopology /// The task id responsible for the topology change /// Some additional topology-specific information /// The optional iteration number in which the event occurred - /// One or more messages for reconfiguring the Tasks + /// One or more messages for reconfiguring the tasks IList Reconfigure(string taskId, Optional info, Optional iteration); /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs index c4cbe71758..ae20ddfd25 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs @@ -50,9 +50,9 @@ public EmptyTopology() public int OperatorId { get; set; } /// - /// The subscription of the operator using the topology. + /// The stage of the operator using the topology. /// - public string SubscriptionName { get; set; } + public string StageName { get; set; } /// /// Adds a new task to the topology. 
@@ -102,9 +102,9 @@ public ITopology Build() throw new IllegalStateException("Topology cannot be built because not linked to any operator"); } - if (SubscriptionName == string.Empty) + if (StageName == string.Empty) { - throw new IllegalStateException("Topology cannot be built because not linked to any subscription"); + throw new IllegalStateException("Topology cannot be built because not linked to any stage"); } _finalized = true; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs index dbdec652a2..c345d8f6e0 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs @@ -43,7 +43,7 @@ public class FlatTopology : ITopology private string _rootTaskId; private int _rootId; - private string _taskSubscription; + private string _taskStage; private volatile int _iteration; private bool _finalized; private readonly bool _sorted; @@ -67,7 +67,7 @@ public class FlatTopology : ITopology public FlatTopology(int rootId, bool sorted = false) { _rootTaskId = string.Empty; - _taskSubscription = string.Empty; + _taskStage = string.Empty; _rootId = rootId; _finalized = false; _sorted = sorted; @@ -89,9 +89,9 @@ public FlatTopology(int rootId, bool sorted = false) public int OperatorId { get; set; } /// - /// The subscription of the operator using the topology. + /// The stage of the operator using the topology. /// - public string SubscriptionName { get; set; } + public string StageName { get; set; } /// /// Adds a new task to the topology. 
@@ -146,9 +146,9 @@ public bool AddTask(string taskId, IFailureStateMachine failureMachine) } // This is required later in order to build the topology - if (_taskSubscription == string.Empty) + if (_taskStage == string.Empty) { - _taskSubscription = Utils.GetTaskSubscriptions(taskId); + _taskStage = Utils.GetTaskStages(taskId); } } @@ -234,14 +234,14 @@ public ITopology Build() throw new IllegalStateException("Topology cannot be built because not linked to any operator"); } - if (SubscriptionName == string.Empty) + if (StageName == string.Empty) { - throw new IllegalStateException("Topology cannot be built because not linked to any subscription"); + throw new IllegalStateException("Topology cannot be built because not linked to any stage"); } BuildTopology(); - _rootTaskId = Utils.BuildTaskId(_taskSubscription, _rootId); + _rootTaskId = Utils.BuildTaskId(_taskStage, _rootId); _finalized = true; return this; @@ -323,7 +323,7 @@ public void TopologyUpdateResponse(string taskId, ref List() { update }, SubscriptionName, OperatorId, _iteration); + var data = new UpdateMessagePayload(new List() { update }, StageName, OperatorId, _iteration); var returnMessage = new ElasticDriverMessageImpl(_rootTaskId, data); returnMessages.Add(returnMessage); @@ -397,7 +397,7 @@ public IList Reconfigure(string taskId, Optional { new TopologyUpdate(_rootTaskId, children) }; - var data = new FailureMessagePayload(update, SubscriptionName, OperatorId, -1); + var data = new FailureMessagePayload(update, StageName, OperatorId, -1); var returnMessage = new ElasticDriverMessageImpl(_rootTaskId, data); LOGGER.Log(Level.Info, $"Task {taskId} is removed from topology"); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DefaultBroadcastTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DefaultBroadcastTopology.cs index 73a537fb76..fcdfcd520e 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DefaultBroadcastTopology.cs 
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DefaultBroadcastTopology.cs @@ -26,6 +26,7 @@ using System.Linq; using Org.Apache.REEF.Utilities.Attributes; using Org.Apache.REEF.Network.Elastic.Failures.Impl; +using Org.Apache.REEF.Network.Elastic.Task; namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl { @@ -37,7 +38,7 @@ internal sealed class DefaultBroadcastTopology : OneToNTopology { [Inject] private DefaultBroadcastTopology( - [Parameter(typeof(OperatorParameters.SubscriptionName))] string subscriptionName, + [Parameter(typeof(OperatorParameters.StageName))] string stageName, [Parameter(typeof(OperatorParameters.TopologyRootTaskId))] int rootId, [Parameter(typeof(OperatorParameters.TopologyChildTaskIds))] ISet children, [Parameter(typeof(OperatorParameters.PiggybackTopologyUpdates))] bool piggyback, @@ -46,11 +47,11 @@ private DefaultBroadcastTopology( [Parameter(typeof(GroupCommunicationConfigurationOptions.Retry))] int retry, [Parameter(typeof(GroupCommunicationConfigurationOptions.Timeout))] int timeout, [Parameter(typeof(GroupCommunicationConfigurationOptions.DisposeTimeout))] int disposeTimeout, - CommunicationService commLayer, - CheckpointService checkpointService) : base( - subscriptionName, + CommunicationLayer commLayer, + ICheckpointLayer checkpointLayer) : base( + stageName, taskId, - Utils.BuildTaskId(subscriptionName, rootId), + Utils.BuildTaskId(stageName, rootId), operatorId, children, piggyback, @@ -58,19 +59,19 @@ private DefaultBroadcastTopology( timeout, disposeTimeout, commLayer, - checkpointService) + checkpointLayer) { } - public override DataMessage AssembleDataMessage(int iteration, T[] data) + public override DataMessage GetDataMessage(int iteration, T[] data) { if (_piggybackTopologyUpdates) { - return new DataMessageWithTopology(SubscriptionName, OperatorId, iteration, data[0]); + return new DataMessageWithTopology(StageName, OperatorId, iteration, data[0]); } else { - return new 
DataMessage(SubscriptionName, OperatorId, iteration, data[0]); + return new DataMessage(StageName, OperatorId, iteration, data[0]); } } @@ -80,7 +81,7 @@ public override DataMessage AssembleDataMessage(int iteration, T[] data) /// The source in case the task is cancelled protected override void Send(CancellationTokenSource cancellationSource) { - GroupCommunicationMessage message; + ElasticGroupCommunicationMessage message; int retry = 0; // Check if we have a message to send @@ -121,7 +122,7 @@ protected override void Send(CancellationTokenSource cancellationSource) // Deliver the message to the commonication layer. foreach (var node in _children.Where(x => !_nodesToRemove.TryGetValue(x.Value, out byte val))) { - _commService.Send(node.Value, message, cancellationSource); + _commLayer.Send(node.Value, message, cancellationSource); } } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DriverAwareOperatorTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DriverAwareOperatorTopology.cs index 9289a44181..03747bf1ec 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DriverAwareOperatorTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DriverAwareOperatorTopology.cs @@ -30,12 +30,12 @@ internal abstract class DriverAwareOperatorTopology : OperatorTopology, IObserve /// /// Constructor. 
/// + /// The stage name the topology is working on /// The identifier of the task the topology is running on /// The identifier of the root note in the topology - /// The subscription name the topology is working on /// The identifier of the operator for this topology - public DriverAwareOperatorTopology(string taskId, string rootTaskId, string subscriptionName, int operatorId) - : base(taskId, rootTaskId, subscriptionName, operatorId) + public DriverAwareOperatorTopology(string stageName, string taskId, string rootTaskId, int operatorId) + : base(stageName, taskId, rootTaskId, operatorId) { } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OneToNTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OneToNTopology.cs index 6da744f6fa..c7b6bf0f79 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OneToNTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OneToNTopology.cs @@ -29,6 +29,7 @@ using System.Linq; using Org.Apache.REEF.Network.Elastic.Failures.Enum; using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Network.Elastic.Task; namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl { @@ -40,7 +41,7 @@ internal abstract class OneToNTopology : OperatorTopologyWithCommunication, IChe { protected static readonly Logger LOGGER = Logger.GetLogger(typeof(OneToNTopology)); - private readonly CheckpointService _checkpointService; + private readonly ICheckpointLayer _checkpointLayer; protected readonly ConcurrentDictionary _nodesToRemove; protected readonly ManualResetEvent _topologyUpdateReceived; @@ -49,42 +50,42 @@ internal abstract class OneToNTopology : OperatorTopologyWithCommunication, IChe /// /// Construct a one to N topology. 
/// + /// The stage name the topology is working on /// The identifier of the task the topology is running on /// The identifier of the root note in the topology - /// The subscription name the topology is working on /// The identifier of the operator for this topology /// The list of nodes this task has to send messages to /// Whether to piggyback topology update messages to data message /// How many times the topology will retry to send a message /// After how long the topology waits for an event /// Maximum wait time for topology disposal - /// Service responsible for communication - /// Service responsible for saving and retrieving checkpoints + /// Layer responsible for communication + /// Layer responsible for saving and retrieving checkpoints public OneToNTopology( + string stageName, string taskId, string rootTaskId, - string subscriptionName, int operatorId, ISet children, bool piggyback, int retry, int timeout, int disposeTimeout, - CommunicationService commService, - CheckpointService checkpointService) : base(taskId, rootTaskId, subscriptionName, operatorId, commService, retry, timeout, disposeTimeout) + CommunicationLayer commLayer, + ICheckpointLayer checkpointLayer) : base(stageName, taskId, rootTaskId, operatorId, commLayer, retry, timeout, disposeTimeout) { - _checkpointService = checkpointService; + _checkpointLayer = checkpointLayer; _nodesToRemove = new ConcurrentDictionary(); _topologyUpdateReceived = new ManualResetEvent(RootTaskId == taskId ? 
false : true); - _commService.RegisterOperatorTopologyForTask(this); - _commService.RegisterOperatorTopologyForDriver(this); + _commLayer.RegisterOperatorTopologyForTask(this); + _commLayer.RegisterOperatorTopologyForDriver(this); _piggybackTopologyUpdates = piggyback; foreach (var child in children) { - var childTaskId = Utils.BuildTaskId(SubscriptionName, child); + var childTaskId = Utils.BuildTaskId(StageName, child); _children.TryAdd(child, childTaskId); } @@ -112,6 +113,8 @@ public bool IsSending public void Checkpoint(ICheckpointableState state, int iteration) { + ICheckpointState checkpoint; + switch (state.Level) { case CheckpointLevel.None: @@ -148,7 +151,7 @@ public bool GetCheckpoint(out ICheckpointState checkpoint, int iteration = -1) return true; } - return _checkpointService.GetCheckpoint(out checkpoint, TaskId, SubscriptionName, OperatorId, iteration, false); + return _checkpointLayer.GetCheckpoint(out checkpoint, TaskId, StageName, OperatorId, iteration, false); } /// @@ -160,7 +163,7 @@ public void WaitCompletionBeforeDisposing(CancellationTokenSource cancellationSo { foreach (var node in _children.Values) { - while (_commService.Lookup(node) && !cancellationSource.IsCancellationRequested) + while (_commLayer.Lookup(node) && !cancellationSource.IsCancellationRequested) { Thread.Sleep(100); } @@ -168,7 +171,7 @@ public void WaitCompletionBeforeDisposing(CancellationTokenSource cancellationSo } } - public abstract DataMessage AssembleDataMessage(int iteration, T[] data); + public abstract DataMessage GetDataMessage(int iteration, T[] data); /// /// Initializes the communication group. 
@@ -179,7 +182,7 @@ public override void WaitForTaskRegistration(CancellationTokenSource cancellatio { try { - _commService.WaitForTaskRegistration(_children.Values.ToList(), cancellationSource, _nodesToRemove); + _commLayer.WaitForTaskRegistration(_children.Values.ToList(), cancellationSource, _nodesToRemove); } catch (Exception e) { @@ -195,7 +198,7 @@ public override void WaitForTaskRegistration(CancellationTokenSource cancellatio /// Handler for incoming messages from other topology nodes. /// /// The message that need to be devlivered to the operator - public override void OnNext(NsMessage message) + public override void OnNext(NsMessage message) { if (_messageQueue.IsAddingCompleted) { @@ -241,7 +244,7 @@ public override void OnNext(DriverMessagePayload message) foreach (var node in updates.Children) { _nodesToRemove.TryAdd(node, new byte()); - _commService.RemoveConnection(node); + _commLayer.RemoveConnection(node); } } break; @@ -250,7 +253,7 @@ public override void OnNext(DriverMessagePayload message) { if (_sendQueue.Count > 0) { - if (_sendQueue.TryPeek(out GroupCommunicationMessage toSendmsg)) + if (_sendQueue.TryPeek(out ElasticGroupCommunicationMessage toSendmsg)) { var rmsg = message as TopologyMessagePayload; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopology.cs index 3ff8ef265a..edbf97daee 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopology.cs @@ -29,23 +29,22 @@ internal abstract class OperatorTopology /// /// Constructor for an operator topology. 
/// + /// The stage name the topology is working on /// The identifier of the task the topology is running on /// The identifier of the root note in the topology - /// The subscription name the topology is working on /// The identifier of the operator for this topology - public OperatorTopology(string taskId, string rootTaskId, string subscriptionName, int operatorId) + public OperatorTopology(string stageName, string taskId, string rootTaskId, int operatorId) { + StageName = stageName; TaskId = taskId; - SubscriptionName = subscriptionName; - OperatorId = operatorId; - RootTaskId = rootTaskId; + OperatorId = operatorId; } /// - /// The subscription name context in which the topology is running. + /// The stage name context in which the topology is running. /// - public string SubscriptionName { get; private set; } + public string StageName { get; private set; } /// /// The identifier of the operator in which the topology is running. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopologyWithCommunication.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopologyWithCommunication.cs index d9b5dc6633..aacca911ca 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopologyWithCommunication.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopologyWithCommunication.cs @@ -37,48 +37,48 @@ internal abstract class OperatorTopologyWithCommunication : DriverAwareOperatorTopology, IWaitForTaskRegistration, IDisposable, - IObserver> + IObserver> { protected bool _initialized; - protected CommunicationService _commService; + protected CommunicationLayer _commLayer; protected readonly int _disposeTimeout; protected readonly int _timeout; protected readonly int _retry; - protected ConcurrentQueue _sendQueue; - protected BlockingCollection _messageQueue; + protected ConcurrentQueue _sendQueue; + protected BlockingCollection _messageQueue; protected 
readonly ConcurrentDictionary _children; protected readonly CancellationTokenSource _cancellationSignal; /// /// Constructor for a communicating topology. /// - //// The identifier of the task the topology is running on + /// The stage name the topology is working on + /// The identifier of the task the topology is running on /// The identifier of the root note in the topology - /// The subscription name the topology is working on /// The identifier of the operator for this topology /// How many times the topology will retry to send a message /// After how long the topology waits for an event /// Maximum wait time for topology disposal - /// Class responsible for communication + /// Class responsible for communication public OperatorTopologyWithCommunication( + string stageName, string taskId, string rootTaskId, - string subscription, int operatorId, - CommunicationService commService, + CommunicationLayer commLayer, int retry, int timeout, - int disposeTimeout) : base(taskId, rootTaskId, subscription, operatorId) + int disposeTimeout) : base(stageName, taskId, rootTaskId, operatorId) { _initialized = false; - _commService = commService; + _commLayer = commLayer; _children = new ConcurrentDictionary(); - _messageQueue = new BlockingCollection(); - _sendQueue = new ConcurrentQueue(); + _messageQueue = new BlockingCollection(); + _sendQueue = new ConcurrentQueue(); _cancellationSignal = new CancellationTokenSource(); @@ -91,11 +91,11 @@ public OperatorTopologyWithCommunication( /// Communicate to the driver that the current subscrition has completed its /// execution. 
/// - public void SubscriptionComplete() + public void StageComplete() { if (TaskId == RootTaskId) { - _commService.SubscriptionComplete(TaskId); + _commLayer.StageComplete(TaskId); } } @@ -104,7 +104,7 @@ public void SubscriptionComplete() /// public void TopologyUpdateRequest() { - _commService.TopologyUpdateRequest(TaskId, OperatorId); + _commLayer.TopologyUpdateRequest(TaskId, OperatorId); } /// @@ -126,7 +126,7 @@ public override void WaitCompletionBeforeDisposing() /// public virtual void JoinTopology() { - _commService.JoinTopology(TaskId, OperatorId); + _commLayer.JoinTopology(TaskId, OperatorId); } /// @@ -138,7 +138,7 @@ public virtual void WaitForTaskRegistration(CancellationTokenSource cancellation { try { - _commService.WaitForTaskRegistration(_children.Values.ToList(), cancellationSource); + _commLayer.WaitForTaskRegistration(_children.Values.ToList(), cancellationSource); } catch (Exception e) { @@ -156,9 +156,9 @@ public virtual void WaitForTaskRegistration(CancellationTokenSource cancellation /// /// The signal that the operation is cacelled /// - public virtual GroupCommunicationMessage Receive(CancellationTokenSource cancellationSource) + public virtual ElasticGroupCommunicationMessage Receive(CancellationTokenSource cancellationSource) { - GroupCommunicationMessage message; + ElasticGroupCommunicationMessage message; int retry = 1; while (!_messageQueue.TryTake(out message, _timeout, cancellationSource.Token)) @@ -173,7 +173,7 @@ public virtual GroupCommunicationMessage Receive(CancellationTokenSource cancell throw new Exception($"Failed to receive message after {_retry} try."); } - _commService.NextDataRequest(TaskId, -1); + _commLayer.NextDataRequest(TaskId, -1); } return message; @@ -184,7 +184,7 @@ public virtual GroupCommunicationMessage Receive(CancellationTokenSource cancell /// /// The message to communicate /// The signal for cancelling the operation - public virtual void Send(GroupCommunicationMessage message, CancellationTokenSource 
cancellationSource) + public virtual void Send(ElasticGroupCommunicationMessage message, CancellationTokenSource cancellationSource) { _sendQueue.Enqueue(message); @@ -198,7 +198,7 @@ public virtual void Send(GroupCommunicationMessage message, CancellationTokenSou /// Handler for incoming messages from other topology nodes. /// /// The message that need to be devlivered to the operator - public virtual void OnNext(NsMessage message) + public virtual void OnNext(NsMessage message) { if (_messageQueue.IsAddingCompleted) { @@ -207,7 +207,7 @@ public virtual void OnNext(NsMessage message) throw new IllegalStateException("Trying to add messages to a closed non-empty queue."); } - _messageQueue = new BlockingCollection(); + _messageQueue = new BlockingCollection(); } _messageQueue.Add(message.Data); @@ -233,7 +233,7 @@ public virtual void Dispose() _cancellationSignal.Cancel(); - _commService.Dispose(); + _commLayer.Dispose(); } /// @@ -260,12 +260,12 @@ public virtual void Dispose() /// The singal in case the task is cancelled protected virtual void Send(CancellationTokenSource cancellationSource) { - GroupCommunicationMessage message; + ElasticGroupCommunicationMessage message; while (_sendQueue.TryDequeue(out message) && !cancellationSource.IsCancellationRequested) { foreach (var child in _children.Values) { - _commService.Send(child, message, cancellationSource); + _commLayer.Send(child, message, cancellationSource); } } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs index a40761ece8..d49f204812 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs @@ -39,31 +39,31 @@ public static int GetContextNum(IActiveContext activeContext) } /// - /// Gets the subscriptions associated with the active context id. + /// Gets the stages associated with the active context id. 
/// /// The active context to check - /// The subscription names associated with the active context id - public static string GetContextSubscriptions(IActiveContext activeContext) + /// The stage names associated with the active context id + public static string GetContextStages(IActiveContext activeContext) { return GetValue(1, activeContext.Id); } /// - /// Gets the subscriptions associated with the context id. + /// Gets the stages associated with the context id. /// /// The context id to check - /// The subscription names associated with the context id - public static string GetContextSubscriptions(string id) + /// The stage names associated with the context id + public static string GetContextStages(string id) { return GetValue(1, id); } /// - /// Gets the subscriptions associated with the Task id. + /// Gets the stages associated with the Task id. /// /// The task id to check - /// The subscription names associated with the task id - public static string GetTaskSubscriptions(string taskId) + /// The stage names associated with the task id + public static string GetTaskStages(string taskId) { return GetValue(1, taskId); } @@ -79,25 +79,25 @@ public static int GetTaskNum(string taskId) } /// - /// Builds a context identifier out of a subscription(s) and a context number. + /// Builds a context identifier out of a stage(s) and a context number. /// - /// The subscriptions active in the context + /// The stages active in the context /// The context number /// The context identifier - public static string BuildContextId(string subscriptionName, int contextNum) + public static string BuildContextId(string stageName, int contextNum) { - return BuildIdentifier("Context", subscriptionName, contextNum); + return BuildIdentifier("Context", stageName, contextNum); } /// - /// Builds a task identifier out of a subscription(s) and an id. + /// Builds a task identifier out of a stage(s) and an id. 
/// - /// The subscriptions active in the task + /// The stages active in the task /// The task id /// The task identifier - public static string BuildTaskId(string subscriptionName, int id) + public static string BuildTaskId(string stageName, int id) { - return BuildIdentifier("Task", subscriptionName, id); + return BuildIdentifier("Task", stageName, id); } /// From 14896fa781c45056d9e663c43d703ca7f11d5ff9 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Fri, 28 Dec 2018 09:58:45 -0800 Subject: [PATCH 03/29] Removed checkpointing stuff --- .../Elastic/Comm/Impl/CheckpointMessage.cs | 52 ---- .../Comm/Impl/CheckpointMessageRequest.cs | 55 ---- .../CheckpointMessageRequestStreamingCodec.cs | 133 ---------- .../Impl/CheckpointMessageStreamingCodec.cs | 155 ----------- .../Config/StreamingCodecConfiguration.cs | 4 - .../Elastic/Failures/Enum/CheckpointLevel.cs | 4 - .../Elastic/Failures/ICheckpointState.cs | 65 ----- .../Elastic/Failures/ICheckpointableState.cs | 57 ---- .../Impl/CheckpointableImmutableObject.cs | 102 -------- .../Failures/Impl/DefaultCheckpointState.cs | 82 ------ .../Logical/Impl/DefaultBroadcast.cs | 3 +- .../Physical/Impl/DefaultBroadcast.cs | 5 +- .../Operators/Physical/Impl/DefaultOneToN.cs | 22 +- .../Elastic/Task/ICheckpointLayer.cs | 72 ------ .../Task/Impl/CentralizedCheckpointService.cs | 243 ------------------ .../Elastic/Task/Impl/CheckpointIdentifier.cs | 88 ------- .../Physical/ICheckpointingTopology.cs | 54 ---- .../Physical/Impl/DefaultBroadcastTopology.cs | 6 +- .../Topology/Physical/Impl/OneToNTopology.cs | 62 +---- 19 files changed, 7 insertions(+), 1257 deletions(-) delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessage.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageRequest.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageRequestStreamingCodec.cs delete mode 100644 
lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageStreamingCodec.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointState.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointableState.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/CheckpointableImmutableObject.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultCheckpointState.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/ICheckpointLayer.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CentralizedCheckpointService.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CheckpointIdentifier.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/ICheckpointingTopology.cs diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessage.cs deleted file mode 100644 index 54e0566105..0000000000 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessage.cs +++ /dev/null @@ -1,52 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -using Org.Apache.REEF.Network.Elastic.Failures; -using Org.Apache.REEF.Utilities.Attributes; - -namespace Org.Apache.REEF.Network.Elastic.Comm.Impl -{ - /// - /// Message used to communicate checkpoints between nodes in order to - /// recover execution. - /// - [Unstable("0.16", "API may change")] - internal sealed class CheckpointMessage : ElasticGroupCommunicationMessage - { - /// - /// Constructor for a message containig a checkpoint. - /// - /// The checkpoint state - public CheckpointMessage(ICheckpointState checkpoint) : base(checkpoint.StageName, checkpoint.OperatorId) - { - Checkpoint = checkpoint; - } - - /// - /// The checkpoint contained in the message. - /// - public ICheckpointState Checkpoint { get; internal set; } - - /// - /// Clone the message. - /// - public override object Clone() - { - return new CheckpointMessage(Checkpoint); - } - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageRequest.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageRequest.cs deleted file mode 100644 index 44acb47d22..0000000000 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageRequest.cs +++ /dev/null @@ -1,55 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -using Org.Apache.REEF.Utilities.Attributes; - -namespace Org.Apache.REEF.Network.Elastic.Comm.Impl -{ - /// - /// Message sent to checkpoint service to retrieve a remote checkpoint. - /// - [Unstable("0.16", "API may change")] - internal sealed class CheckpointMessageRequest : ElasticGroupCommunicationMessage - { - /// - /// Constructor. - /// - /// The stage name ffor the checkpoint to retrieve - /// The operator identifier - /// The iteration of the checkpoint of interest - public CheckpointMessageRequest( - string stageName, - int operatorId, - int iteration) : base(stageName, operatorId) - { - Iteration = iteration; - } - - /// - /// Iteration number for the checkpoint of interest. - /// - public int Iteration { get; set; } - - /// - /// Clone the message. - /// - public override object Clone() - { - return new CheckpointMessageRequest(StageName, OperatorId, Iteration); - } - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageRequestStreamingCodec.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageRequestStreamingCodec.cs deleted file mode 100644 index d216ef935c..0000000000 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageRequestStreamingCodec.cs +++ /dev/null @@ -1,133 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using System; -using System.Threading; -using System.Threading.Tasks; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Wake.Remote; -using Org.Apache.REEF.Wake.StreamingCodec; -using Org.Apache.REEF.Utilities; - -namespace Org.Apache.REEF.Network.Elastic.Comm.Impl -{ - /// - /// Streaming codec for the checkpoint message request - /// - internal sealed class CheckpointMessageRequestStreamingCodec : IStreamingCodec - { - /// - /// Empty constructor to allow instantiation by reflection. - /// - [Inject] - private CheckpointMessageRequestStreamingCodec() - { - } - - /// - /// Read the class fields. - /// - /// The reader from which to read - /// The checkpoint message request - public CheckpointMessageRequest Read(IDataReader reader) - { - int metadataSize = reader.ReadInt32() + sizeof(int) + sizeof(int); - byte[] metadata = new byte[metadataSize]; - reader.Read(ref metadata, 0, metadataSize); - var (stageName, operatorId, iteration) = GenerateMetaDataDecoding(metadata, metadataSize - sizeof(int) - sizeof(int)); - - return new CheckpointMessageRequest(stageName, operatorId, iteration); - } - - /// - /// Writes the class fields. - /// - /// The message to write - /// The writer to which to write - public void Write(CheckpointMessageRequest obj, IDataWriter writer) - { - byte[] encodedMetadata = GenerateMetaDataEncoding(obj); - - writer.Write(encodedMetadata, 0, encodedMetadata.Length); - } - - /// - /// Read the class fields. 
- /// - /// The reader from which to read - /// The cancellation token - /// The checkpoint message request - public async Task ReadAsync(IDataReader reader, - CancellationToken token) - { - int metadataSize = reader.ReadInt32() + sizeof(int) + sizeof(int); - byte[] metadata = new byte[metadataSize]; - await reader.ReadAsync(metadata, 0, metadataSize, token); - var (stageName, operatorId, iteration) = GenerateMetaDataDecoding(metadata, metadataSize - sizeof(int) - sizeof(int)); - - return new CheckpointMessageRequest(stageName, operatorId, iteration); - } - - /// - /// Writes the class fields. - /// - /// The message to write - /// The writer to which to write - /// The cancellation token - public async System.Threading.Tasks.Task WriteAsync(CheckpointMessageRequest obj, IDataWriter writer, CancellationToken token) - { - byte[] encodedMetadata = GenerateMetaDataEncoding(obj); - - await writer.WriteAsync(encodedMetadata, 0, encodedMetadata.Length, token); - } - - private static byte[] GenerateMetaDataEncoding(CheckpointMessageRequest obj) - { - byte[] stageBytes = ByteUtilities.StringToByteArrays(obj.StageName); - var length = stageBytes.Length; - byte[] metadataBytes = new byte[sizeof(int) + length + sizeof(int) + sizeof(int)]; - int offset = 0; - - Buffer.BlockCopy(BitConverter.GetBytes(length), 0, metadataBytes, offset, sizeof(int)); - offset += sizeof(int); - - Buffer.BlockCopy(stageBytes, 0, metadataBytes, offset, length); - offset += length; - - Buffer.BlockCopy(BitConverter.GetBytes(obj.OperatorId), 0, metadataBytes, offset, sizeof(int)); - offset += sizeof(int); - - Buffer.BlockCopy(BitConverter.GetBytes(obj.Iteration), 0, metadataBytes, offset, sizeof(int)); - - return metadataBytes; - } - - private static (string stageName, int operatorId, int iteration) GenerateMetaDataDecoding(byte[] obj, int stageLength) - { - int offset = 0; - string stageName = ByteUtilities.ByteArraysToString(obj, offset, stageLength); - offset += stageLength; - - int operatorId = 
BitConverter.ToInt32(obj, offset); - offset += sizeof(int); - - int iteration = BitConverter.ToInt32(obj, offset); - - return (stageName, operatorId, iteration); - } - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageStreamingCodec.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageStreamingCodec.cs deleted file mode 100644 index 24756e8aeb..0000000000 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/CheckpointMessageStreamingCodec.cs +++ /dev/null @@ -1,155 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using System; -using System.Threading; -using System.Threading.Tasks; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Wake.Remote; -using Org.Apache.REEF.Wake.StreamingCodec; -using Org.Apache.REEF.Utilities; -using Org.Apache.REEF.Network.Elastic.Failures; - -namespace Org.Apache.REEF.Network.Elastic.Comm.Impl -{ - /// - /// Streaming Codec for the checkpoint message. 
- /// - internal sealed class CheckpointMessageStreamingCodec : IStreamingCodec - { - private readonly IStreamingCodec _codec; - private readonly ICheckpointState _checkpoint; - - /// - /// Empty constructor to allow instantiation by reflection - /// - [Inject] - private CheckpointMessageStreamingCodec(IStreamingCodec codec, ICheckpointState checkpoint) - { - _codec = codec; - _checkpoint = checkpoint; - } - - /// - /// Read the class fields. - /// - /// The reader from which to read - /// The checkpoint message - public CheckpointMessage Read(IDataReader reader) - { - int metadataSize = reader.ReadInt32() + sizeof(int) + sizeof(int); - byte[] metadata = new byte[metadataSize]; - reader.Read(ref metadata, 0, metadataSize); - var (stageName, operatorId, iteration) = GenerateMetaDataDecoding(metadata, metadataSize - sizeof(int) - sizeof(int)); - var data = _codec.Read(reader); - var payload = _checkpoint.Create(data); - - payload.StageName = stageName; - payload.OperatorId = operatorId; - payload.Iteration = iteration; - - return new CheckpointMessage(payload); - } - - /// - /// Writes the class fields. - /// - /// The message to write - /// The writer to which to write - public void Write(CheckpointMessage obj, IDataWriter writer) - { - byte[] encodedMetadata = GenerateMetaDataEncoding(obj); - - writer.Write(encodedMetadata, 0, encodedMetadata.Length); - - _codec.Write((T)obj.Checkpoint.State, writer); - } - - /// - /// Read the class fields. 
- /// - /// The reader from which to read - /// The cancellation token - /// The checkpoint message - public async Task ReadAsync(IDataReader reader, - CancellationToken token) - { - int metadataSize = reader.ReadInt32() + sizeof(int) + sizeof(int); - byte[] metadata = new byte[metadataSize]; - await reader.ReadAsync(metadata, 0, metadataSize, token); - var (stageName, operatorId, iteration) = GenerateMetaDataDecoding(metadata, metadataSize - sizeof(int) - sizeof(int)); - var data = await _codec.ReadAsync(reader, token); - var payload = _checkpoint.Create(data); - - payload.StageName = stageName; - payload.OperatorId = operatorId; - payload.Iteration = iteration; - - return new CheckpointMessage(payload); - } - - /// - /// Writes the class fields. - /// - /// The message to write - /// The writer to which to write - /// The cancellation token - public async System.Threading.Tasks.Task WriteAsync(CheckpointMessage obj, IDataWriter writer, CancellationToken token) - { - byte[] encodedMetadata = GenerateMetaDataEncoding(obj); - - await writer.WriteAsync(encodedMetadata, 0, encodedMetadata.Length, token); - - await _codec.WriteAsync((T)obj.Checkpoint.State, writer, token); - } - - private static byte[] GenerateMetaDataEncoding(CheckpointMessage obj) - { - byte[] stageBytes = ByteUtilities.StringToByteArrays(obj.StageName); - var length = stageBytes.Length; - byte[] metadataBytes = new byte[sizeof(int) + length + sizeof(int) + sizeof(int)]; - int offset = 0; - - Buffer.BlockCopy(BitConverter.GetBytes(length), 0, metadataBytes, offset, sizeof(int)); - offset += sizeof(int); - - Buffer.BlockCopy(stageBytes, 0, metadataBytes, offset, length); - offset += length; - - Buffer.BlockCopy(BitConverter.GetBytes(obj.OperatorId), 0, metadataBytes, offset, sizeof(int)); - offset += sizeof(int); - - Buffer.BlockCopy(BitConverter.GetBytes(obj.Checkpoint.Iteration), 0, metadataBytes, offset, sizeof(int)); - - return metadataBytes; - } - - private static (string stageName, int 
operatorId, int iteration) GenerateMetaDataDecoding(byte[] obj, int stageLength) - { - int offset = 0; - string stageName = ByteUtilities.ByteArraysToString(obj, offset, stageLength); - offset += stageLength; - - int operatorId = BitConverter.ToInt32(obj, offset); - offset += sizeof(int); - - int iteration = BitConverter.ToInt32(obj, offset); - - return (stageName, operatorId, iteration); - } - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Config/StreamingCodecConfiguration.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/StreamingCodecConfiguration.cs index 8082231e86..0863307aca 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Config/StreamingCodecConfiguration.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/StreamingCodecConfiguration.cs @@ -42,10 +42,6 @@ public sealed class StreamingCodecConfiguration : ConfigurationModuleBuilder GenericType>.Class) .BindImplementation(GenericType>>.Class, GenericType>.Class) - .BindImplementation(GenericType>.Class, - GenericType>.Class) - .BindImplementation(GenericType>.Class, - GenericType.Class) .Build(); } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/CheckpointLevel.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/CheckpointLevel.cs index 885b084e21..ad32568308 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/CheckpointLevel.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/CheckpointLevel.cs @@ -26,9 +26,5 @@ namespace Org.Apache.REEF.Network.Elastic.Failures.Enum public enum CheckpointLevel : int { None = 0, // No checkpointing - - EphemeralMaster = 10, // Checkpointing on the master task, not tolerant to task failures - - EphemeralAll = 11, // Checkpointing on all tasks, not tolerant to task failures } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointState.cs deleted file mode 
100644 index 45a1cb8c7c..0000000000 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointState.cs +++ /dev/null @@ -1,65 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using Org.Apache.REEF.Network.Elastic.Comm.Impl; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Utilities.Attributes; - -namespace Org.Apache.REEF.Network.Elastic.Failures -{ - /// - /// Interface for a state that is checkpointed. - /// - [Unstable("0.16", "API may change")] - [DefaultImplementation(typeof(DefaultCheckpointState))] - public interface ICheckpointState - { - /// - /// The iteration number for this checkpoint. - /// - int Iteration { get; set; } - - /// - /// The operator id for this checkpoint. - /// - int OperatorId { get; set; } - - /// - /// The stage name of the checkpoint. - /// - string StageName { get; set; } - - /// - /// The actual state of the checkpoint. - /// - object State { get; } - - /// - /// Create a new empty checkpoint from the settings of the current one. - /// - /// A checkpoint with no state but with properly set up fields - ICheckpointState Create(object state); - - /// - /// Utility method used to create message out of - /// the checkpoint. 
This is used when checkpoints need - /// to be sent among nodes to recover computation. - /// - /// A checkpoint ready to be communicated - ElasticGroupCommunicationMessage ToMessage(); - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointableState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointableState.cs deleted file mode 100644 index 5f03b0ac95..0000000000 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/ICheckpointableState.cs +++ /dev/null @@ -1,57 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using Org.Apache.REEF.Network.Elastic.Failures.Enum; -using Org.Apache.REEF.Utilities.Attributes; - -namespace Org.Apache.REEF.Network.Elastic.Failures -{ - /// - /// Interface for checkpointing some task state. - /// Clients can implement this interface and inject it into operators to save the current task state. - /// The workflow is as follows: - /// 1-Create a checkpointable state either through injection or for an iteration - /// 2-Make an object checkpointable using the MakeCheckpointable. At this point the state is not checkpointed. - /// 3-Create a checkpoint state. 
- /// - [Unstable("0.16", "API may change")] - public interface ICheckpointableState - { - /// - /// The current checkpoint level. - /// - CheckpointLevel Level { get; } - - /// - /// Make the given input state a checkpointable state. - /// - /// The state that needs to be make checkpointable - void MakeCheckpointable(object state); - - /// - /// Checkpoint the current state. - /// - /// A checkpoint state - ICheckpointState Checkpoint(); - - /// - /// Create a new empty checkpointable state from the current one. - /// - /// An empty checkpointable state - ICheckpointableState Create(); - } -} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/CheckpointableImmutableObject.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/CheckpointableImmutableObject.cs deleted file mode 100644 index 9531539bee..0000000000 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/CheckpointableImmutableObject.cs +++ /dev/null @@ -1,102 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -using Org.Apache.REEF.Network.Elastic.Config; -using Org.Apache.REEF.Network.Elastic.Failures.Enum; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Utilities.Attributes; -using System; - -namespace Org.Apache.REEF.Network.Elastic.Failures -{ - /// - /// Checkpointable state wrapping an immutable object. - /// Since immutable when creating a checkpoint we don't need a copy. - /// - /// - [Unstable("0.16", "API may change")] - public class CheckpointableImmutableObject : ICheckpointableState - { - protected ICheckpointState _checkpoint; - - [Inject] - private CheckpointableImmutableObject( - [Parameter(typeof(OperatorParameters.Checkpointing))] int level, - ICheckpointState checkpoint) : this() - { - Level = (CheckpointLevel)level; - _checkpoint = checkpoint; - } - - /// - /// Basic constructor returning a checkponitable object with default state and iteration number = 0. - /// - protected CheckpointableImmutableObject() - { - Level = 0; - State = default; - } - - /// - /// The current checkpoint level. - /// - public CheckpointLevel Level { get; internal set; } - - /// - /// The actual state to checkpoint. - /// - internal T State { get; set; } - - /// - /// Make the given input state a checkpointable state. - /// - /// The state that needs to be make checkpointable - public void MakeCheckpointable(object model) - { - State = (T)model; - } - - /// - /// Checkpoint the current state. - /// - /// A checkpoint state - public virtual ICheckpointState Checkpoint() - { - switch (Level) - { - case CheckpointLevel.EphemeralMaster: - case CheckpointLevel.EphemeralAll: - return _checkpoint.Create(State); - default: - throw new ArgumentException($"Level {Level} not recognized."); - } - } - - /// - /// Create a new empty checkpointable state from the current one. 
- /// - /// The current iteration for which we need to create a new checkpointable state - /// An empty checkpointable state - public virtual ICheckpointableState Create() - { - return new CheckpointableImmutableObject() - { - Level = Level, - }; - } - } -} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultCheckpointState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultCheckpointState.cs deleted file mode 100644 index 63a0042c9b..0000000000 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultCheckpointState.cs +++ /dev/null @@ -1,82 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using Org.Apache.REEF.Network.Elastic.Comm.Impl; -using Org.Apache.REEF.Network.Elastic.Failures.Enum; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Utilities.Attributes; - -namespace Org.Apache.REEF.Network.Elastic.Failures -{ - /// - /// Class wrapping a state that has been checkpointed or is ready to. 
- /// - [Unstable("0.16", "API may change")] - public sealed class DefaultCheckpointState : ICheckpointState - { - [Inject] - private DefaultCheckpointState() - { - Iteration = -1; - OperatorId = -1; - StageName = string.Empty; - } - - /// - /// The iteration number for this checkpoint. - /// - public int Iteration { get; set; } - - /// - /// The operator id for this checkpoint. - /// - public int OperatorId { get; set; } - - /// - /// The stage name of the checkpoint. - /// - public string StageName { get; set; } - - /// - /// The actual state of the checkpoint. - /// - public object State { get; private set; } - - /// - /// Create a new empty checkpoint from the settings of the current one. - /// - /// A checkpoint with no state but with properly set up fields - public ICheckpointState Create(object state) - { - return new DefaultCheckpointState() - { - State = state, - }; - } - - /// - /// Utility method used to create message out of - /// the checkpoint. This is used when checkpoints need - /// to be sent among nodes to recover computation. 
- /// - /// A checkpoint ready to be communicated - public ElasticGroupCommunicationMessage ToMessage() - { - return new CheckpointMessage(this); - } - } -} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultBroadcast.cs index a6b46b5365..e11436117e 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/DefaultBroadcast.cs @@ -65,8 +65,7 @@ public DefaultBroadcast( protected override void PhysicalOperatorConfiguration(ref ICsConfigurationBuilder confBuilder) { confBuilder - .BindImplementation(GenericType>.Class, GenericType>.Class) - .BindImplementation(GenericType.Class, GenericType>.Class); + .BindImplementation(GenericType>.Class, GenericType>.Class); SetMessageType(typeof(Physical.Impl.DefaultBroadcast), ref confBuilder); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultBroadcast.cs index 69750cb665..79ecdca863 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultBroadcast.cs @@ -40,8 +40,7 @@ public sealed class DefaultBroadcast : DefaultOneToN, IElasticBroadcast private DefaultBroadcast( [Parameter(typeof(OperatorParameters.OperatorId))] int id, [Parameter(typeof(OperatorParameters.IsLast))] bool isLast, - ICheckpointableState checkpointableState, - DefaultBroadcastTopology topology) : base(id, isLast, checkpointableState, topology) + DefaultBroadcastTopology topology) : base(id, isLast, topology) { OperatorName = Constants.Broadcast; } @@ -65,8 +64,6 @@ public void Send(T data) int iteration = IteratorReference == null ? 
0 : (int)IteratorReference.Current; var message = _topology.GetDataMessage(iteration, new[] { data }); - Checkpoint(message, message.Iteration); - _topology.Send(message, CancellationSource); _position = PositionTracker.AfterSend; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultOneToN.cs index 94a0433fb9..4cf68480a2 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultOneToN.cs @@ -38,7 +38,6 @@ public abstract class DefaultOneToN { private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultOneToN<>)); - private readonly ICheckpointableState _checkpointableState; internal readonly OneToNTopology _topology; internal volatile PositionTracker _position; @@ -51,10 +50,9 @@ public abstract class DefaultOneToN /// The checkpoint level for the operator /// Whether this operator is the last in the pipeline /// The operator topology layer - internal DefaultOneToN(int id, bool isLast, ICheckpointableState checkpointableState, OneToNTopology topology) + internal DefaultOneToN(int id, bool isLast, OneToNTopology topology) { OperatorId = id; - _checkpointableState = checkpointableState; _isLast = isLast; _topology = topology; _position = PositionTracker.Nil; @@ -147,8 +145,6 @@ public T Receive() IteratorReference.SyncIteration(typedDataMessage.Iteration); } - Checkpoint(dataMessage, dataMessage.Iteration); - _position = PositionTracker.AfterReceive; return typedDataMessage.Data; @@ -194,21 +190,5 @@ public void Dispose() } _topology.Dispose(); } - - /// - /// Checkpoint the input data for the input iteration using the defined checkpoint level. 
- /// - /// The messages to checkpoint - /// The iteration of the checkpoint - internal void Checkpoint(ElasticGroupCommunicationMessage data, int iteration) - { - if (_checkpointableState.Level > CheckpointLevel.None) - { - var state = _checkpointableState.Create(); - - state.MakeCheckpointable(data); - _topology.Checkpoint(state, iteration); - } - } } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ICheckpointLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ICheckpointLayer.cs deleted file mode 100644 index 1588a48753..0000000000 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ICheckpointLayer.cs +++ /dev/null @@ -1,72 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using Org.Apache.REEF.Network.Elastic.Failures; -using Org.Apache.REEF.Network.Elastic.Task.Impl; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Utilities.Attributes; -using System; - -namespace Org.Apache.REEF.Network.Elastic.Task -{ - /// - /// Checkpointing service interface used to locally checkpoint some task state or retrieve previously checkpointed local / remote states. 
- /// - [Unstable("0.16", "API may change")] - [DefaultImplementation(typeof(CentralizedCheckpointLayer))] - internal interface ICheckpointLayer : IDisposable - { - /// - /// The service for communicating with the other available nodes. - /// - CommunicationLayer CommunicationLayer { set; } - - /// - /// Register the current task id as well as notify the root task for the operator. - /// - /// The name of the stage - /// The operator identifier - /// The identifier of the current task - /// The identifier of the root task of the operator - void RegisterNode(string stageName, int operatorId, string taskId, string rootTaskId); - - /// - /// Checkpoint the input state. - /// - /// The state to checkpoint - void Checkpoint(ICheckpointState state); - - /// - /// Retrieve a checkpoint. - /// - /// The retrieve checkpoint if exists - /// The local task identifier - /// The name of the stage - /// The operator identifier - /// The iteration number of the checkpoint - /// Whether to request the checkpoint remotely if not found locally - /// True if the checkpoint is found, false otherwise - bool GetCheckpoint(out ICheckpointState checkpoint, string taskId, string stageName, int operatorId, int iteration = -1, bool requestToRemote = true); - - /// - /// Remove a checkpoint. - /// - /// The stage of the checkpoint to remove - /// The operator id of the checkpoint to remove - void RemoveCheckpoint(string stageName, int operatorId); - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CentralizedCheckpointService.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CentralizedCheckpointService.cs deleted file mode 100644 index b20226cbd4..0000000000 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CentralizedCheckpointService.cs +++ /dev/null @@ -1,243 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using Org.Apache.REEF.Network.Elastic.Comm.Impl; -using Org.Apache.REEF.Network.Elastic.Config; -using Org.Apache.REEF.Network.Elastic.Failures; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Tang.Exceptions; -using Org.Apache.REEF.Utilities.Attributes; -using Org.Apache.REEF.Utilities.Logging; -using System; -using System.Collections.Concurrent; -using System.Collections.Generic; -using System.Linq; -using System.Threading; - -namespace Org.Apache.REEF.Network.Elastic.Task.Impl -{ - /// - /// Checkpointing service used to locally checkpoint some task state or retrieve previously checkpointed local / remote states. - /// This service allows to reach remote checkpoints stored in root node when operators support it. 
- /// - [Unstable("0.16", "API may change")] - internal class CentralizedCheckpointLayer : ICheckpointLayer - { - private static readonly Logger LOGGER = Logger.GetLogger(typeof(CentralizedCheckpointLayer)); - - private readonly ConcurrentDictionary> _checkpoints; - private readonly ConcurrentDictionary _roots; - private readonly ConcurrentDictionary _checkpointsWaiting; - - private readonly int _limit; - private readonly int _timeout; - private readonly int _retry; - - private readonly CancellationSource _cancellationSource; - - [Inject] - private CentralizedCheckpointLayer( - [Parameter(typeof(ElasticServiceConfigurationOptions.NumCheckpoints))] int num, - [Parameter(typeof(GroupCommunicationConfigurationOptions.Timeout))] int timeout, - [Parameter(typeof(GroupCommunicationConfigurationOptions.Retry))] int retry, - CancellationSource cancellationSource, - CommunicationLayer commLayer) - { - _limit = num; - _timeout = timeout; - _retry = retry; - - _cancellationSource = cancellationSource; - - _checkpoints = new ConcurrentDictionary>(); - _roots = new ConcurrentDictionary(); - _checkpointsWaiting = new ConcurrentDictionary(); - } - - /// The service for communicating with the other available nodes. - /// - public CommunicationLayer CommunicationLayer { private get; set; } - - /// - /// Register the current task id as well as notify the root task for the operator. - /// - /// The name of the stage - /// The operator identifier - /// The identifier of the current task - /// The identifier of the root task of the operator - public void RegisterNode(string stageName, int operatorId, string taskId, string rootTaskId) - { - var id = new CheckpointIdentifier(stageName, operatorId); - if (!_roots.ContainsKey(id) && taskId != rootTaskId) - { - _roots.TryAdd(id, rootTaskId); - } - } - - /// - /// Checkpoint the input state. 
- /// - /// The state to checkpoint - public void Checkpoint(ICheckpointState state) - { - if (state.StageName == null || state.StageName == string.Empty) - { - throw new ArgumentException(nameof(state.StageName), "Null or empty."); - } - - if (state.OperatorId < 0) - { - throw new ArgumentException(nameof(state.OperatorId), "Invalid."); - } - - SortedDictionary checkpoints; - var id = new CheckpointIdentifier(state.StageName, state.OperatorId); - ManualResetEvent waiting; - - if (!_checkpoints.TryGetValue(id, out checkpoints)) - { - checkpoints = new SortedDictionary(); - _checkpoints.TryAdd(id, checkpoints); - } - - checkpoints[state.Iteration] = state; - - if (_checkpointsWaiting.TryRemove(id, out waiting)) - { - waiting.Set(); - } - - CheckSize(checkpoints); - } - - /// - /// Retrieve a checkpoint. - /// - /// The retrieve checkpoint if exists - /// The local task identifier - /// The name of the stage - /// The operator identifier - /// The iteration number of the checkpoint - /// Whether to request the checkpoint remotely if not found locally - /// True if the checkpoint is found, false otherwise - public bool GetCheckpoint(out ICheckpointState checkpoint, string taskId, string stageName, int operatorId, int iteration = -1, bool requestToRemote = true) - { - SortedDictionary checkpoints; - var id = new CheckpointIdentifier(stageName, operatorId); - checkpoint = null; - - if (!_checkpoints.TryGetValue(id, out checkpoints)) - { - LOGGER.Log(Level.Warning, "Asking for a checkpoint not in the service."); - - if (!requestToRemote) - { - LOGGER.Log(Level.Warning, "Trying to recover from a non existing checkpoint."); - return false; - } - - string rootTaskId; - - if (!_roots.TryGetValue(id, out rootTaskId)) - { - LOGGER.Log(Level.Warning, "Trying to recover from a non existing checkpoint."); - return false; - } - - if (CommunicationLayer == null) - { - throw new IllegalStateException("Communication service not set up."); - } - - var received = new 
ManualResetEvent(false); - var retry = 0; - - do - { - LOGGER.Log(Level.Info, $"Retrieving the checkpoint from {rootTaskId}."); - var cpm = new CheckpointMessageRequest(stageName, operatorId, iteration); - - CommunicationLayer.Send(rootTaskId, cpm, _cancellationSource.Source); - - _checkpointsWaiting.TryAdd(id, received); - retry++; - } - while (!received.WaitOne(_timeout) && retry < _retry); - - if (!_checkpoints.TryGetValue(id, out checkpoints)) - { - LOGGER.Log(Level.Warning, "Checkpoint not retrieved."); - _checkpointsWaiting.TryRemove(id, out received); - return false; - } - } - - iteration = iteration < 0 ? checkpoints.Keys.Last() : iteration; - - if (!checkpoints.TryGetValue(iteration, out checkpoint)) - { - LOGGER.Log(Level.Warning, $"Checkpoint for iteration {iteration} not found."); - } - - return true; - } - - /// - /// Remove a checkpoint. - /// - /// The stage of the checkpoint to remove - /// The operator id of the checkpoint to remove - public void RemoveCheckpoint(string stageName, int operatorId) - { - if (stageName == null || stageName == string.Empty) - { - throw new ArgumentException(nameof(stageName), "Null or empty."); - } - - if (operatorId < 0) - { - throw new ArgumentException(nameof(operatorId), "Invalid."); - } - - var id = new CheckpointIdentifier(stageName, operatorId); - SortedDictionary checkpoints; - - _checkpoints.TryRemove(id, out checkpoints); - } - - /// - /// Dispose the service. 
- /// - public void Dispose() - { - foreach (var waiting in _checkpointsWaiting.Values) - { - waiting.Set(); - waiting.Close(); - } - } - - private void CheckSize(SortedDictionary checkpoint) - { - if (checkpoint.Keys.Count > _limit) - { - var first = checkpoint.Keys.First(); - checkpoint.Remove(first); - } - } - } -} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CheckpointIdentifier.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CheckpointIdentifier.cs deleted file mode 100644 index cfc2bf50e9..0000000000 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CheckpointIdentifier.cs +++ /dev/null @@ -1,88 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using Org.Apache.REEF.Utilities.Attributes; - -namespace Org.Apache.REEF.Network.Elastic.Task.Impl -{ - /// - /// An identifier for a given node in the group communication graph. - /// A node is uniquely identifiable by a combination of its Task ID, - /// , and . - /// - [Unstable("0.16", "API may change")] - internal sealed class CheckpointIdentifier - { - /// - /// Construct a new checkpoint identifier. 
- /// - /// The stage name - /// The operator identifier - public CheckpointIdentifier(string stageName, int operatorId) - { - StageName = stageName; - OperatorId = operatorId; - } - - /// - /// The stage name of the node. - /// - public string StageName { get; private set; } - - /// - /// The operator id of the node. - /// - public int OperatorId { get; private set; } - - /// - /// Overrides . Simply compares equivalence of instance fields. - /// - public override bool Equals(object obj) - { - if (ReferenceEquals(null, obj)) - { - return false; - } - - if (ReferenceEquals(this, obj)) - { - return true; - } - - return obj is CheckpointIdentifier && Equals((CheckpointIdentifier)obj); - } - - /// - /// Overrides . Generates hashcode based on the instance fields. - /// - public override int GetHashCode() - { - int hash = 17; - hash = (hash * 31) + StageName.GetHashCode(); - return (hash * 31) + OperatorId.GetHashCode(); - } - - /// - /// Compare equality of instance fields. - /// - private bool Equals(CheckpointIdentifier other) - { - return StageName.Equals(other.StageName) && - OperatorId.Equals(other.OperatorId); - } - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/ICheckpointingTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/ICheckpointingTopology.cs deleted file mode 100644 index 01a5660b7f..0000000000 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/ICheckpointingTopology.cs +++ /dev/null @@ -1,54 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using Org.Apache.REEF.Network.Elastic.Failures; -using Org.Apache.REEF.Utilities.Attributes; -using System; - -namespace Org.Apache.REEF.Network.Elastic.Topology.Physical -{ - /// - /// Interface for topologies able to checkpoint state. - /// - [Unstable("0.16", "API may change")] - internal interface ICheckpointingTopology : IDisposable - { - /// - /// An internal (to the topology) checkpoint. This can be used to implement - /// ephemeral level checkpoints. - /// - // For the moment the assumption is that only one object is stored - ICheckpointState InternalCheckpoint { get; } - - /// - /// Checkpoint the input state for the given iteration. - /// - /// The state to checkpoint - /// The iteration in which the checkpoint is happening - void Checkpoint(ICheckpointableState state, int iteration); - - /// - /// Retrieve a previously saved checkpoint. - /// The iteration number specificy which cehckpoint to retrieve, where -1 - /// is used by default to indicate the latest available checkpoint. - /// - /// The retrieved checkpoint - /// The iteration number for the checkpoint to retrieve. 
- /// - bool GetCheckpoint(out ICheckpointState checkpoint, int iteration = -1); - } -} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DefaultBroadcastTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DefaultBroadcastTopology.cs index fcdfcd520e..4856ae5075 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DefaultBroadcastTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DefaultBroadcastTopology.cs @@ -47,8 +47,7 @@ private DefaultBroadcastTopology( [Parameter(typeof(GroupCommunicationConfigurationOptions.Retry))] int retry, [Parameter(typeof(GroupCommunicationConfigurationOptions.Timeout))] int timeout, [Parameter(typeof(GroupCommunicationConfigurationOptions.DisposeTimeout))] int disposeTimeout, - CommunicationLayer commLayer, - ICheckpointLayer checkpointLayer) : base( + CommunicationLayer commLayer) : base( stageName, taskId, Utils.BuildTaskId(stageName, rootId), @@ -58,8 +57,7 @@ private DefaultBroadcastTopology( retry, timeout, disposeTimeout, - commLayer, - checkpointLayer) + commLayer) { } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OneToNTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OneToNTopology.cs index c7b6bf0f79..d6d13cdf1e 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OneToNTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OneToNTopology.cs @@ -37,11 +37,10 @@ namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl /// Base class for topologies following a one to N communication pattern. 
/// [Unstable("0.16", "API may change")] - internal abstract class OneToNTopology : OperatorTopologyWithCommunication, ICheckpointingTopology + internal abstract class OneToNTopology : OperatorTopologyWithCommunication { protected static readonly Logger LOGGER = Logger.GetLogger(typeof(OneToNTopology)); - private readonly ICheckpointLayer _checkpointLayer; protected readonly ConcurrentDictionary _nodesToRemove; protected readonly ManualResetEvent _topologyUpdateReceived; @@ -71,10 +70,8 @@ public OneToNTopology( int retry, int timeout, int disposeTimeout, - CommunicationLayer commLayer, - ICheckpointLayer checkpointLayer) : base(stageName, taskId, rootTaskId, operatorId, commLayer, retry, timeout, disposeTimeout) + CommunicationLayer commLayer) : base(stageName, taskId, rootTaskId, operatorId, commLayer, retry, timeout, disposeTimeout) { - _checkpointLayer = checkpointLayer; _nodesToRemove = new ConcurrentDictionary(); _topologyUpdateReceived = new ManualResetEvent(RootTaskId == taskId ? false : true); @@ -91,12 +88,6 @@ public OneToNTopology( } } - /// - /// An internal (to the topology) checkpoint. This can be used to implement - /// ephemeral level checkpoints. - /// - public ICheckpointState InternalCheckpoint { get; private set; } - /// /// Whether the topology is still sending messages or not. /// @@ -105,55 +96,6 @@ public bool IsSending get { return !_sendQueue.IsEmpty; } } - /// - /// Checkpoint the input state for the given iteration. 
- /// - /// The state to checkpoint - /// The iteration in which the checkpoint is happening - - public void Checkpoint(ICheckpointableState state, int iteration) - { - ICheckpointState checkpoint; - - switch (state.Level) - { - case CheckpointLevel.None: - break; - case CheckpointLevel.EphemeralMaster: - if (TaskId == RootTaskId) - { - InternalCheckpoint = state.Checkpoint(); - InternalCheckpoint.Iteration = iteration; - } - break; - case CheckpointLevel.EphemeralAll: - InternalCheckpoint = state.Checkpoint(); - InternalCheckpoint.Iteration = iteration; - break; - default: - throw new IllegalStateException($"Checkpoint level {state.Level} not supported."); - } - } - - /// - /// Retrieve a previously saved checkpoint. - /// The iteration number specificy which cehckpoint to retrieve, where -1 - /// is used by default to indicate the latest available checkpoint. - /// - /// The retrieved checkpoint - /// The iteration number for the checkpoint to retrieve. - /// - public bool GetCheckpoint(out ICheckpointState checkpoint, int iteration = -1) - { - if (InternalCheckpoint != null && (iteration == -1 || InternalCheckpoint.Iteration == iteration)) - { - checkpoint = InternalCheckpoint; - return true; - } - - return _checkpointLayer.GetCheckpoint(out checkpoint, TaskId, StageName, OperatorId, iteration, false); - } - /// /// Waiting logic before disposing topologies. /// From 0a57845e572597496f37468b893dd730d060ddfe Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Sun, 30 Dec 2018 16:39:59 -0800 Subject: [PATCH 04/29] some dir refactoring plus added some stuff. 
--- .../DefaultElasticContext.cs | 23 +- .../{Impl => Default}/DefaultElasticStage.cs | 26 +- .../Default/DefaultElasticTaskSetManager.cs | 1361 +++++++++++++++++ .../DefaultElasticTaskSetManagerParameters.cs | 107 ++ .../Elastic/Driver/IElasticContext.cs | 2 +- .../Elastic/Driver/IElasticStage.cs | 2 +- .../{Impl => Default}/DefaultFailureState.cs | 3 +- .../DefaultFailureStateEvents.cs | 2 +- .../DefaultFailureStateMachine.cs | 5 +- .../{Enum => Default}/DefaultFailureStates.cs | 2 +- .../Failures/{Impl => Default}/FailEvent.cs | 5 +- .../IDefaultFailureEventResponse.cs | 10 +- .../{Impl => Default}/ReconfigureEvent.cs | 25 +- .../{Impl => Default}/RescheduleEvent.cs | 41 +- .../Failures/{Impl => Default}/StopEvent.cs | 5 +- .../Elastic/Failures/FailuresClock.cs | 272 ++++ .../Elastic/Failures/IFailureResponse.cs | 3 +- .../Elastic/Failures/IFailureStateMachine.cs | 2 +- .../Elastic/Failures/IReconfigure.cs | 40 - .../Elastic/Failures/IReschedule.cs | 40 - .../Elastic/Failures/IStop.cs | 29 - .../Failures/{Impl => }/OperatorException.cs | 2 +- .../{Impl => Default}/DefaultBroadcast.cs | 9 +- .../Logical/{Impl => Default}/DefaultEmpty.cs | 2 +- .../{Impl => Default}/DefaultOneToN.cs | 12 +- .../ElastiOperatorWithDefaultDispatcher.cs | 24 +- .../Logical/{Impl => }/ElasticOperator.cs | 2 +- .../{Impl => Default}/DefaultBroadcast.cs | 5 +- .../{Impl => Default}/DefaultOneToN.cs | 4 +- .../{ => Physical}/IElasticBroadcast.cs | 0 .../Task/{Impl => }/CancellationSource.cs | 2 +- .../Task/ElasticDriverMessageHandler.cs | 73 + .../Elastic/Task/NodeObserverIdentifier.cs | 122 ++ .../DefaultBroadcastTopology.cs | 4 +- .../{Impl => Default}/OneToNTopology.cs | 6 +- ...eratorTopologyWithDefaultCommunication.cs} | 6 +- .../{Impl => }/DriverAwareOperatorTopology.cs | 4 +- .../Physical/{Impl => }/OperatorTopology.cs | 4 +- 38 files changed, 2037 insertions(+), 249 deletions(-) rename lang/cs/Org.Apache.REEF.Network/Elastic/Driver/{Impl => Default}/DefaultElasticContext.cs (96%) 
rename lang/cs/Org.Apache.REEF.Network/Elastic/Driver/{Impl => Default}/DefaultElasticStage.cs (95%) create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs rename lang/cs/Org.Apache.REEF.Network/Elastic/Failures/{Impl => Default}/DefaultFailureState.cs (95%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Failures/{Enum => Default}/DefaultFailureStateEvents.cs (95%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Failures/{Impl => Default}/DefaultFailureStateMachine.cs (98%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Failures/{Enum => Default}/DefaultFailureStates.cs (96%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Failures/{Impl => Default}/FailEvent.cs (93%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Failures/{ => Default}/IDefaultFailureEventResponse.cs (85%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Failures/{Impl => Default}/ReconfigureEvent.cs (82%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Failures/{Impl => Default}/RescheduleEvent.cs (61%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Failures/{Impl => Default}/StopEvent.cs (93%) create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IReconfigure.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IReschedule.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IStop.cs rename lang/cs/Org.Apache.REEF.Network/Elastic/Failures/{Impl => }/OperatorException.cs (98%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/{Impl => Default}/DefaultBroadcast.cs (89%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/{Impl => Default}/DefaultEmpty.cs (98%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/{Impl => Default}/DefaultOneToN.cs (94%) rename 
lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/{Impl => Default}/ElastiOperatorWithDefaultDispatcher.cs (93%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/{Impl => }/ElasticOperator.cs (99%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/{Impl => Default}/DefaultBroadcast.cs (94%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/{Impl => Default}/DefaultOneToN.cs (98%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Operators/{ => Physical}/IElasticBroadcast.cs (100%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Task/{Impl => }/CancellationSource.cs (97%) create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs rename lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/{Impl => Default}/DefaultBroadcastTopology.cs (97%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/{Impl => Default}/OneToNTopology.cs (98%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/{Impl/OperatorTopologyWithCommunication.cs => Default/OperatorTopologyWithDefaultCommunication.cs} (98%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/{Impl => }/DriverAwareOperatorTopology.cs (93%) rename lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/{Impl => }/OperatorTopology.cs (96%) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultElasticContext.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs similarity index 96% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultElasticContext.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs index 5008122729..91af722405 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultElasticContext.cs +++ 
b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs @@ -32,18 +32,17 @@ using Org.Apache.REEF.Utilities.Logging; using Org.Apache.REEF.Network.Elastic.Config; using Org.Apache.REEF.Network.Elastic.Failures; -using Org.Apache.REEF.Network.Elastic.Failures.Impl; using Org.Apache.REEF.Network.Elastic.Comm.Impl; using Org.Apache.REEF.Network.Elastic.Comm; using Org.Apache.REEF.Wake.Time.Event; -using Org.Apache.REEF.Network.Elastic.Failures.Enum; using Org.Apache.REEF.Utilities.Attributes; using Org.Apache.REEF.Wake.Remote.Parameters; using Org.Apache.REEF.Common.Tasks; using Org.Apache.REEF.Network.Elastic.Task.Impl; using Org.Apache.REEF.Driver.Evaluator; +using Org.Apache.REEF.Network.Elastic.Failures.Default; -namespace Org.Apache.REEF.Network.Elastic.Driver.Impl +namespace Org.Apache.REEF.Network.Elastic.Driver.Default { /// /// Default implementation for the task context. @@ -51,7 +50,7 @@ namespace Org.Apache.REEF.Network.Elastic.Driver.Impl /// Also manages configurations for Elastic Group Communication operators/contexts. /// [Unstable("0.16", "API may change")] - public sealed class DefaultElasticContext : IElasticContext, IDefaultFailureEventResponse + internal sealed class DefaultElasticContext : IElasticContext, IDefaultFailureEventResponse { private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultElasticContext)); @@ -204,12 +203,12 @@ public ConfigurationModule GetTaskConfigurationModule(string taskId) { return TaskConfiguration.ConfigurationModule .Set(TaskConfiguration.Identifier, taskId) - .Set(TaskConfiguration.OnMessage, GenericType.Class) + .Set(TaskConfiguration.OnMessage, GenericType.Class) .Set(TaskConfiguration.OnClose, GenericType.Class); } /// - /// Start the elastic group communicatio context. + /// Start the elastic group communication context. /// This will trigger requests for resources as specified by the parameters. 
/// public void Start() @@ -337,15 +336,15 @@ public void EventDispatcher(ref IFailureEvent @event) switch ((DefaultFailureStateEvents)@event.FailureEvent) { case DefaultFailureStateEvents.Reconfigure: - var rec = @event as IReconfigure; + var rec = @event as ReconfigureEvent; OnReconfigure(ref rec); break; case DefaultFailureStateEvents.Reschedule: - var res = @event as IReschedule; + var res = @event as RescheduleEvent; OnReschedule(ref res); break; case DefaultFailureStateEvents.Stop: - var stp = @event as IStop; + var stp = @event as StopEvent; OnStop(ref stp); break; default: @@ -361,7 +360,7 @@ public void EventDispatcher(ref IFailureEvent @event) /// Mechanism to execute when a reconfigure event is triggered. /// /// - public void OnReconfigure(ref IReconfigure reconfigureEvent) + public void OnReconfigure(ref ReconfigureEvent reconfigureEvent) { lock (_statusLock) { @@ -373,7 +372,7 @@ public void OnReconfigure(ref IReconfigure reconfigureEvent) /// Mechanism to execute when a reschedule event is triggered. /// /// - public void OnReschedule(ref IReschedule rescheduleEvent) + public void OnReschedule(ref RescheduleEvent rescheduleEvent) { lock (_statusLock) { @@ -385,7 +384,7 @@ public void OnReschedule(ref IReschedule rescheduleEvent) /// Mechanism to execute when a stop event is triggered. 
/// /// - public void OnStop(ref IStop stopEvent) + public void OnStop(ref StopEvent stopEvent) { lock (_statusLock) { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs similarity index 95% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultElasticStage.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs index affd770ab2..7c19c7e3fd 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Impl/DefaultElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs @@ -20,22 +20,22 @@ using Org.Apache.REEF.Tang.Interface; using Org.Apache.REEF.Tang.Util; using Org.Apache.REEF.Utilities.Logging; -using Org.Apache.REEF.Network.Elastic.Operators.Logical.Impl; using System.Threading; using Org.Apache.REEF.Driver.Context; using Org.Apache.REEF.Network.Elastic.Failures; -using Org.Apache.REEF.Network.Elastic.Failures.Impl; using System.Collections.Generic; using Org.Apache.REEF.Network.Elastic.Comm; using System.Linq; using Org.Apache.REEF.Wake.Time.Event; using Org.Apache.REEF.IO.PartitionedData; using Org.Apache.REEF.Utilities; -using Org.Apache.REEF.Network.Elastic.Failures.Enum; using System; using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Network.Elastic.Failures.Default; +using Org.Apache.REEF.Network.Elastic.Operators.Logical.Default; +using Org.Apache.REEF.Network.Elastic.Operators.Logical; -namespace Org.Apache.REEF.Network.Elastic.Driver.Impl +namespace Org.Apache.REEF.Network.Elastic.Driver.Default { /// /// Used to group elastic operators into logical units. @@ -44,7 +44,7 @@ namespace Org.Apache.REEF.Network.Elastic.Driver.Impl /// This class is used to create stages able to manage default failure events. 
/// [Unstable("0.16", "API may change")] - public sealed class DefaultElasticStage : IElasticStage, IDefaultFailureEventResponse + internal sealed class DefaultElasticStage : IElasticStage, IDefaultFailureEventResponse { private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultElasticStage)); @@ -360,12 +360,12 @@ public void OnTaskMessage(ITaskMessage message, ref List } #region Failure Response + /// /// Used to react when a timeout event is triggered. - /// It gets a failed task as input and in response it produces zero or more failure events. /// /// The alarm triggering the timeput - /// A list of messages encoding how remote Tasks need to reach + /// A list of messages encoding how remote tasks need to react /// The next timeouts to be scheduled public void OnTimeout(Alarm alarm, ref List msgs, ref List nextTimeouts) { @@ -396,15 +396,15 @@ public void EventDispatcher(ref IFailureEvent @event) switch ((DefaultFailureStateEvents)@event.FailureEvent) { case DefaultFailureStateEvents.Reconfigure: - var rec = @event as IReconfigure; + var rec = @event as ReconfigureEvent; OnReconfigure(ref rec); break; case DefaultFailureStateEvents.Reschedule: - var res = @event as IReschedule; + var res = @event as RescheduleEvent; OnReschedule(ref res); break; case DefaultFailureStateEvents.Stop: - var stp = @event as IStop; + var stp = @event as StopEvent; OnStop(ref stp); break; default: @@ -423,7 +423,7 @@ public void EventDispatcher(ref IFailureEvent @event) /// Mechanism to execute when a reconfigure event is triggered. /// /// - public void OnReconfigure(ref IReconfigure reconfigureEvent) + public void OnReconfigure(ref ReconfigureEvent reconfigureEvent) { lock (_statusLock) { @@ -435,7 +435,7 @@ public void OnReconfigure(ref IReconfigure reconfigureEvent) /// Mechanism to execute when a reschedule event is triggered. 
/// /// - public void OnReschedule(ref IReschedule rescheduleEvent) + public void OnReschedule(ref RescheduleEvent rescheduleEvent) { lock (_statusLock) { @@ -447,7 +447,7 @@ public void OnReschedule(ref IReschedule rescheduleEvent) /// Mechanism to execute when a stop event is triggered. /// /// - public void OnStop(ref IStop stopEvent) + public void OnStop(ref StopEvent stopEvent) { lock (_statusLock) { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs new file mode 100644 index 0000000000..656695ec92 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs @@ -0,0 +1,1361 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using Org.Apache.REEF.Driver.Context; +using Org.Apache.REEF.Tang.Implementations.Configuration; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Driver.Evaluator; +using Org.Apache.REEF.Tang.Implementations.Tang; +using Org.Apache.REEF.Driver.Task; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Network.Elastic.Comm; +using System.Collections.Concurrent; +using Org.Apache.REEF.Wake.Time.Event; +using Org.Apache.REEF.Network.Elastic.Config; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Network.Elastic.Failures.Default; + +namespace Org.Apache.REEF.Network.Elastic.Driver.Default +{ + /// + /// Class managing the scheduling of tasks and task-related events. + /// + internal sealed class DefaultElasticTaskSetManager : + IElasticTaskSetManager, + IDefaultFailureEventResponse, + IObserver + { + #region Private structs + + // Struct managing state for re-scheduling contexts after evaluator failures. + private struct ContextInfo + { + public ContextInfo(int id) + { + Id = id; + NumRetry = 1; + } + + /// + /// The context id. + /// + public int Id { get; private set; } + + /// + /// The number of times we tried to submit the context. + /// + public int NumRetry { get; set; } + } + + /// + /// Definition of the the different states in which a task can be. + /// + private enum TaskState + { + Init = 1, + + Queued = 2, + + Submitted = 3, + + Recovering = 4, + + Running = 5, + + Failed = 6, + + Completed = 7 + } + + #endregion + + #region Private classes + + /// + /// Wraps all the info required to proper manage a task life cycle. + /// + private sealed class TaskInfo : IDisposable + { + private volatile bool _isTaskDisposed; + private volatile bool _isActiveContextDisposed; + private volatile bool _isDisposed; + + /// + /// Constructor. 
+ /// + /// The inital configuration for the task + /// The active context for the task + /// The evalutor id + /// The task status + /// The stage the task belongs to + public TaskInfo(IConfiguration config, IActiveContext context, string evaluatorId, TaskState status, IList stages) + { + _isTaskDisposed = false; + _isActiveContextDisposed = false; + _isDisposed = false; + TaskConfiguration = config; + ActiveContext = context; + EvaluatorId = evaluatorId; + Stages = stages; + NumRetry = 1; + TaskStatus = status; + RescheduleConfigurations = new Dictionary>(); + Lock = new object(); + } + + /// + /// The task configuration. + /// + public IConfiguration TaskConfiguration { get; private set; } + + /// + /// The active context for the task. + /// + public IActiveContext ActiveContext { get; private set; } + + /// + /// Whether the active task context was previously diposed or not. + /// + public bool IsActiveContextDisposed + { + get { return _isActiveContextDisposed; } + } + + /// + /// The id of the evalutor of the task. + /// + public string EvaluatorId { get; private set; } + + /// + /// The stages the task will be exeucting. + /// + public IList Stages { get; private set; } + + /// + /// Configurations when the task will be rescheduled after a failure. + /// + public Dictionary> RescheduleConfigurations { get; set; } + + /// + /// Reference to the remote running task. + /// + public IRunningTask TaskRunner { get; private set; } + + /// + /// The current status of the task. + /// + public TaskState TaskStatus { get; private set; } + + /// + /// How many times the task have been scheduled. + /// + public int NumRetry { get; set; } + + /// + ///An object used as lock for the task info. + /// + public object Lock { get; private set; } + + /// + /// Save the reference to the remote running task. 
+ /// + /// The reference to the remote running task + public void SetTaskRunner(IRunningTask taskRunner) + { + TaskRunner = taskRunner; + _isTaskDisposed = false; + } + + /// + /// Change the status of the task. + /// + /// The new task state + public void SetTaskStatus(TaskState status) + { + TaskStatus = status; + } + + /// + /// Update the task runtime. + /// + /// The active context of the task + /// The id of the evaluator + public void UpdateRuntime(IActiveContext newActiveContext, string evaluatorId) + { + if (!_isActiveContextDisposed) + { + throw new IllegalStateException("Updating Task with not disposed active context"); + } + + ActiveContext = newActiveContext; + EvaluatorId = evaluatorId; + _isActiveContextDisposed = false; + } + + /// + /// Set the task runtime as diposed. + /// + public void DropRuntime() + { + _isActiveContextDisposed = true; + _isTaskDisposed = true; + } + + /// + /// Dipose the task. + /// + public void DisposeTask() + { + if (!_isTaskDisposed) + { + if (TaskRunner != null) + { + TaskRunner.Dispose(); + } + + _isTaskDisposed = true; + } + } + + /// + /// Dipose the active context of the task. + /// + public void DisposeActiveContext() + { + if (!_isActiveContextDisposed) + { + if (ActiveContext != null) + { + ActiveContext.Dispose(); + } + + _isActiveContextDisposed = true; + } + } + + /// + /// Dipose the task info. + /// + public void Dispose() + { + if (!_isDisposed) + { + DisposeTask(); + + DisposeActiveContext(); + + _isDisposed = true; + } + } + } + + /// + /// Utility class used to recognize particular task states. + /// + private static class TaskStateUtils + { + private static List recoverable = new List() { TaskState.Failed, TaskState.Queued }; + + private static List notRunnable = new List() { TaskState.Failed, TaskState.Completed }; + + /// + /// Whether a task is recoverable or not. 
+ /// + /// The current state of the task + /// True if the task is recoverable + public static bool IsRecoverable(TaskState state) + { + return recoverable.Contains(state); + } + + /// + /// Whether a task can be run or not. + /// + /// The current state of the task + /// True if the task can be run + public static bool IsRunnable(TaskState state) + { + return !notRunnable.Contains(state); + } + } + + /// + /// Represent an event triggered by some timeout registered by the task set. + /// + private sealed class TasksetAlarm : Alarm + { + /// + /// Constructor. + /// + /// The timestamp when the alarm should be triggered + /// The handler of the event triggered by the alarm + public TasksetAlarm(long timestamp, IObserver handler) : base(timestamp, handler) + { + } + } + + /// + /// Class used to define a timeout on the task set triggering an alarm. + /// + private sealed class TaskSetTimeout : ITimeout + { + private readonly IObserver _handler; + private readonly long _offset; + private readonly string _id; + + /// + /// Constructor. + /// + /// The offset used to define when the timeout will be triggered + /// The handler for the alarm + public TaskSetTimeout(long offset, IObserver handler) + { + _handler = handler ?? throw new ArgumentNullException(nameof(handler)); + _offset = offset; + } + + /// + /// Get the actual alarm to be scheduled. 
+ /// + /// The current time + /// + public Alarm GetAlarm(long time) + { + return new TasksetAlarm(time + _offset, _handler); + } + } + + #endregion + + private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultElasticTaskSetManager)); + + private bool _finalized; + private volatile bool _disposed; + private volatile bool _scheduled; + private volatile bool _completed; + private readonly DefaultElasticTaskSetManagerParameters _parameters; + + private volatile int _contextsAdded; + private int _tasksAdded; + private int _tasksRunning; + private volatile int _totFailedTasks; + private volatile int _totFailedEvaluators; + + private readonly int _numTasks; + private readonly IEvaluatorRequestor _evaluatorRequestor; + private readonly string _driverId; + private readonly Func _masterTaskConfiguration; + private readonly Func _slaveTaskConfiguration; + + // Task info 0-indexed + private readonly List _taskInfos; + private readonly Dictionary _stages; + private readonly ConcurrentQueue _queuedTasks; + private readonly ConcurrentQueue _queuedContexts; + + // Used both for knowing which evaluator the task set is responsible for and to + // maintain a mapping betwween evaluators and contextes. + // This latter is necessary because evaluators may fail between context init + // and the time when the context is installed on the evaluator + private readonly ConcurrentDictionary _evaluatorToContextIdMapping; + private IFailureState _failureStatus; + private volatile bool _hasProgress; + + private readonly object _statusLock; + + /// + /// Constructor for the task set manager. 
+ /// + /// The total number of tasks in the task set + /// The requestor to spawn new evaluator + /// The id of the dirver + /// The configuration for the master task + /// The configuration for the slave tasks + /// Additional configurations + public DefaultElasticTaskSetManager( + int numTasks, + IEvaluatorRequestor evaluatorRequestor, + string driverId, + Func masterTaskConfiguration, + Func slaveTaskConfiguration = null, + params IConfiguration[] confs) + { + _finalized = false; + _scheduled = false; + _disposed = false; + _completed = false; + + _contextsAdded = 0; + _tasksAdded = 0; + _tasksRunning = 0; + _totFailedTasks = 0; + _totFailedEvaluators = 0; + + _numTasks = numTasks; + _evaluatorRequestor = evaluatorRequestor; + _driverId = driverId; + _masterTaskConfiguration = masterTaskConfiguration; + _slaveTaskConfiguration = slaveTaskConfiguration ?? masterTaskConfiguration; + + _taskInfos = new List(numTasks); + _stages = new Dictionary(); + _queuedTasks = new ConcurrentQueue(); + _queuedContexts = new ConcurrentQueue(); + _evaluatorToContextIdMapping = new ConcurrentDictionary(); + _failureStatus = new DefaultFailureState(); + _hasProgress = true; + + _statusLock = new object(); + + for (int i = 0; i < numTasks; i++) + { + _taskInfos.Add(null); + } + + var injector = TangFactory.GetTang().NewInjector(confs); + Type parametersType = typeof(DefaultElasticTaskSetManagerParameters); + _parameters = injector.GetInstance(parametersType) as DefaultElasticTaskSetManagerParameters; + + // Set up the timeout + List msgs = null; + var nextTimeouts = new List(); + + OnTimeout(new TasksetAlarm(0, this), ref msgs, ref nextTimeouts); + } + + /// + /// An identifier for the set of stages the task manager is subscribed to. + /// The task set has to be built before retrieving its stages id. 
+ /// + public string StagesId + { + get + { + if (_finalized != true) + { + throw new IllegalStateException("Task set have to be built before getting its stages"); + } + + return _stages.Keys.Aggregate((current, next) => current + "+" + next); + } + } + + /// + /// Subscribe the current task set manager to a new stage. + /// + /// The stage to subscribe to + public void AddStage(IElasticStage stage) + { + if (_finalized == true) + { + throw new IllegalStateException("Cannot add stage to an already built task set manager"); + } + + _stages.Add(stage.StageName, stage); + } + + /// + /// Decides whether more contexts have to be added to this Task Manger or not. + /// + /// True if the number of added contexts is less than the available slots + + public bool HasMoreContextToAdd() + { + return _contextsAdded < _numTasks; + } + + /// + /// Method used to generate unique context ids. + /// + /// The evaluator the context will run on + /// A new unique context id + /// True if an new context id is sucessufully created + public bool TryGetNextTaskContextId(IAllocatedEvaluator evaluator, out string identifier) + { + int id; + ContextInfo cinfo; + + if (_queuedTasks.TryDequeue(out id)) + { + identifier = Utils.BuildContextId(StagesId, id); + cinfo = new ContextInfo(id); + _evaluatorToContextIdMapping.TryAdd(evaluator.Id, cinfo); + return true; + } + + if (_queuedContexts.TryDequeue(out cinfo)) + { + identifier = Utils.BuildContextId(StagesId, cinfo.Id); + _evaluatorToContextIdMapping.TryAdd(evaluator.Id, cinfo); + return true; + } + + id = Interlocked.Increment(ref _contextsAdded); + + if (_contextsAdded > _numTasks) + { + LOGGER.Log(Level.Warning, "Trying to schedule too many contexts"); + identifier = string.Empty; + return false; + } + + identifier = Utils.BuildContextId(StagesId, id); + cinfo = new ContextInfo(id); + _evaluatorToContextIdMapping.TryAdd(evaluator.Id, cinfo); + + LOGGER.Log(Level.Info, $"Evaluator {evaluator.Id} is scheduled on node 
{evaluator.GetEvaluatorDescriptor().NodeDescriptor.HostName}"); + + return true; + } + + /// + /// Method used to generate unique task ids. + /// + /// The context the task will run on + /// A new task id + public string GetTaskId(IActiveContext context) + { + var id = Utils.GetContextNum(context); + return Utils.BuildTaskId(StagesId, id); + } + + /// + /// Retrieve all stages having the context passed as a parameter as master task context. + /// + /// The target context + /// A list of stages having the master task running on context + public IEnumerable IsMasterTaskContext(IActiveContext activeContext) + { + return _stages.Values.Where(stage => stage.IsMasterTaskContext(activeContext)); + } + + /// + /// Get the configuration of the codecs used for data transmission. + /// The codecs are automatically generated from the operator pipeline. + /// + /// A configuration object with the codecs for data transmission + public IConfiguration GetCodecConfiguration() + { + var conf = TangFactory.GetTang().NewConfigurationBuilder().Build(); + + foreach (var stage in _stages.Values) + { + stage.RootOperator.GetCodecConfiguration(ref conf); + } + + return conf; + } + + /// + /// Method implementing how the task set manager should react when a new context is active. 
+ /// + /// The new active context + public void OnNewActiveContext(IActiveContext activeContext) + { + if (_finalized != true) + { + throw new IllegalStateException("Task set have to be finalized before adding tasks"); + } + + if (Completed() || Failed()) + { + LOGGER.Log(Level.Warning, "Adding tasks to already completed task set: ignoring"); + activeContext.Dispose(); + return; + } + + _hasProgress = true; + var id = Utils.GetContextNum(activeContext) - 1; + var taskId = Utils.BuildTaskId(StagesId, id + 1); + + // We reschedule the task only if the context was active (_taskInfos[id] != null) and the task was actually scheduled at least once (_taskInfos[id].TaskStatus > TaskStatus.Init) + if (_taskInfos[id] != null && _taskInfos[id].TaskStatus > TaskState.Init) + { + LOGGER.Log(Level.Info, $"{taskId} already part of task set: going to directly submit it"); + + lock (_taskInfos[id].Lock) + { + _taskInfos[id].UpdateRuntime(activeContext, activeContext.EvaluatorId); + } + + SubmitTask(id); + } + else + { + bool isMaster = IsMasterTaskContext(activeContext).Any(); + + LOGGER.Log(Level.Info, $"Task {taskId} to be scheduled on {activeContext.EvaluatorId}"); + + List partialTaskConfs = new List(); + + if (isMaster) + { + partialTaskConfs.Add(_masterTaskConfiguration(taskId)); + } + else + { + partialTaskConfs.Add(_slaveTaskConfiguration(taskId)); + } + + AddTask(taskId, activeContext, partialTaskConfs); + } + } + + /// + /// Finalizes the task set manager. + /// After the task set has been finalized, no more stages can be added. + /// + /// The same finalized task set manager + public IElasticTaskSetManager Build() + { + if (_finalized == true) + { + throw new IllegalStateException("Task set manager cannot be built more than once"); + } + + _finalized = true; + + return this; + } + + /// + /// Method implementing how the task set manager should react when a notification that a task is running is received. 
+ /// + /// The running task + public void OnTaskRunning(IRunningTask task) + { + if (IsTaskManagedBy(task.Id)) + { + var id = Utils.GetTaskNum(task.Id) - 1; + _hasProgress = true; + + lock (_taskInfos[id].Lock) + { + _taskInfos[id].SetTaskRunner(task); + + if (Completed() || Failed()) + { + LOGGER.Log(Level.Info, $"Received running from task {task.Id} but task set is completed or failed: ignoring"); + _taskInfos[id].DisposeTask(); + + return; + } + if (!TaskStateUtils.IsRunnable(_taskInfos[id].TaskStatus)) + { + LOGGER.Log(Level.Info, $"Received running from task {task.Id} which is not runnable: ignoring"); + _taskInfos[id].DisposeTask(); + + return; + } + + if (_taskInfos[id].TaskStatus != TaskState.Running) + { + if (_taskInfos[id].TaskStatus == TaskState.Recovering) + { + foreach (var stage in _stages) + { + stage.Value.AddTask(task.Id); + } + } + + _taskInfos[id].SetTaskStatus(TaskState.Running); + Interlocked.Increment(ref _tasksRunning); + } + } + } + } + + /// + /// Method implementing how the task set manager should react when a notification that a task is completed is received. + /// + /// The completed task + public void OnTaskCompleted(ICompletedTask taskInfo) + { + if (IsTaskManagedBy(taskInfo.Id)) + { + Interlocked.Decrement(ref _tasksRunning); + var id = Utils.GetTaskNum(taskInfo.Id) - 1; + _hasProgress = true; + + lock (_taskInfos[id].Lock) + { + _taskInfos[id].SetTaskStatus(TaskState.Completed); + } + if (Completed()) + { + foreach (var info in _taskInfos.Where(info => info != null && info.TaskStatus < TaskState.Failed)) + { + info.DisposeTask(); + } + } + } + } + + /// + /// Method implementing how the task set manager should react when a task message is received. 
+ /// + /// A message from a task + public void OnTaskMessage(ITaskMessage message) + { + if (IsTaskManagedBy(message.TaskId)) + { + var id = Utils.GetTaskNum(message.TaskId) - 1; + var returnMessages = new List(); + _hasProgress = true; + + foreach (var stage in _stages.Values) + { + stage.OnTaskMessage(message, ref returnMessages); + } + + SendToTasks(returnMessages); + } + } + + /// + /// Whether this task set is done. + /// + public bool IsCompleted() + { + return Completed() && _tasksRunning == 0; + } + + #region Failure Response + + /// + /// Used to react on a task failure. + /// + /// The failed task + public void OnTaskFailure(IFailedTask task) + { + var failureEvents = new List(); + + OnTaskFailure(task, ref failureEvents); + } + + /// + /// Used to react when a timeout event is triggered. + /// + /// The alarm triggering the timeput + /// A list of messages encoding how remote tasks need to react + /// The next timeouts to be scheduled + public void OnTimeout(Alarm alarm, ref List msgs, ref List nextTimeouts) + { + var isInit = msgs == null; + + // Taskset is just started, init the timeouts + if (isInit) + { + _hasProgress = false; + LOGGER.Log(Level.Info, "Timeout alarm for task set initialized"); + nextTimeouts.Add(new TaskSetTimeout(_parameters.Timeout, this)); + + foreach (var stage in _stages.Values) + { + stage.OnTimeout(alarm, ref msgs, ref nextTimeouts); + } + } + else if (alarm.GetType() == typeof(TasksetAlarm)) + { + if (!_hasProgress) + { + if (Completed() || Failed()) + { + LOGGER.Log(Level.Warning, "Taskset made no progress in the last {0}ms. Forcing Disposal.", _parameters.Timeout); + Dispose(); + } + else + { + LOGGER.Log(Level.Error, "Taskset made no progress in the last {0}ms. 
Aborting.", _parameters.Timeout); + OnFail(); + return; + } + } + else + { + _hasProgress = false; + nextTimeouts.Add(new TaskSetTimeout(_parameters.Timeout, this)); + } + } + else + { + foreach (var stage in _stages.Values) + { + stage.OnTimeout(alarm, ref msgs, ref nextTimeouts); + } + + SendToTasks(msgs); + } + + foreach (var timeout in nextTimeouts) + { + _parameters.Clock.ScheduleAlarm(timeout); + } + } + + /// + /// Used to react on a failure occurred on a task. + /// It gets a failed task as input and in response it produces zero or more failure events. + /// + /// The failed task + /// A list of events encoding the type of actions to be triggered so far + + public void OnTaskFailure(IFailedTask task, ref List failureEvents) + { + if (IsTaskManagedBy(task.Id)) + { + LOGGER.Log(Level.Info, "Received a failure from " + task.Id, task.AsError()); + + Interlocked.Decrement(ref _tasksRunning); + _totFailedTasks++; + _hasProgress = true; + var id = Utils.GetTaskNum(task.Id) - 1; + + if (Completed() || Failed()) + { + LOGGER.Log(Level.Info, $"Received a failure from task {task.Id} but the task set is completed or failed: ignoring the failure", task.AsError()); + + lock (_taskInfos[id].Lock) + { + _taskInfos[id].SetTaskStatus(TaskState.Failed); + } + + _taskInfos[id].Dispose(); + + return; + } + + failureEvents = failureEvents ?? new List(); + + lock (_taskInfos[id].Lock) + { + if (_taskInfos[id].TaskStatus < TaskState.Failed) + { + _taskInfos[id].SetTaskStatus(TaskState.Failed); + } + + foreach (var stage in _taskInfos[id].Stages) + { + stage.OnTaskFailure(task, ref failureEvents); + } + + // Failures have to be propagated up to the context + _taskInfos[id].Stages.First().Context.OnTaskFailure(task, ref failureEvents); + } + + for (int i = 0; i < failureEvents.Count; i++) + { + var @event = failureEvents[i]; + EventDispatcher(ref @event); + } + } + } + + /// + /// Used to react of a failure event occurred on an evaluator. 
+ /// + /// The failed evaluator + public void OnEvaluatorFailure(IFailedEvaluator evaluator) + { + LOGGER.Log(Level.Info, "Received a failure from " + evaluator.Id, evaluator.EvaluatorException); + + _totFailedEvaluators++; + + if (evaluator.FailedTask.IsPresent()) + { + var failedTask = evaluator.FailedTask.Value; + var id = Utils.GetTaskNum(failedTask.Id) - 1; + + lock (_taskInfos[id].Lock) + { + _taskInfos[id].DropRuntime(); + } + + OnTaskFailure(failedTask); + _evaluatorToContextIdMapping.TryRemove(evaluator.Id, out ContextInfo cinfo); + } + else + { + _hasProgress = true; + + if (!Completed() && !Failed()) + { + if (_evaluatorToContextIdMapping.TryRemove(evaluator.Id, out ContextInfo cinfo)) + { + int id = cinfo.Id - 1; + + if (_taskInfos[id] != null) + { + lock (_taskInfos[id].Lock) + { + _taskInfos[id].DropRuntime(); + _taskInfos[id].SetTaskStatus(TaskState.Failed); + } + } + + cinfo.NumRetry++; + + if (cinfo.NumRetry > _parameters.NumEvaluatorFailures) + { + LOGGER.Log(Level.Error, $"Context {cinfo.Id} failed more than {_parameters.NumEvaluatorFailures} times: Aborting"); + OnFail(); + } + + _queuedContexts.Enqueue(cinfo); + } + SpawnNewEvaluator(cinfo.Id); + } + } + } + + /// + /// When a new failure state is reached, this method is used to dispatch + /// such event to the proper failure mitigation logic. + /// It gets a failure event as input and produces zero or more failure response messages + /// for tasks (appended into the event). 
+ /// + /// The failure event to react upon + public void EventDispatcher(ref IFailureEvent @event) + { + var id = Utils.GetTaskNum(@event.TaskId) - 1; + + _taskInfos[id].Stages.First().Context.EventDispatcher(ref @event); + + foreach (var stage in _taskInfos[id].Stages) + { + stage.EventDispatcher(ref @event); + } + + switch ((DefaultFailureStateEvents)@event.FailureEvent) + { + case DefaultFailureStateEvents.Reconfigure: + var rec = @event as ReconfigureEvent; + OnReconfigure(ref rec); + break; + case DefaultFailureStateEvents.Reschedule: + var res = @event as RescheduleEvent; + OnReschedule(ref res); + break; + case DefaultFailureStateEvents.Stop: + var stp = @event as StopEvent; + OnStop(ref stp); + break; + case DefaultFailureStateEvents.Fail: + OnFail(); + break; + default: + throw new IllegalStateException("Failure event not recognized"); + } + } + + /// + /// Mechanism to execute when a reconfigure event is triggered. + /// + /// + public void OnReconfigure(ref ReconfigureEvent reconfigureEvent) + { + lock (_statusLock) + { + _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReconfigure)); + } + + SendToTasks(reconfigureEvent.FailureResponse); + } + + /// + /// Mechanism to execute when a reschedule event is triggered. 
+ /// + /// + public void OnReschedule(ref RescheduleEvent rescheduleEvent) + { + lock (_statusLock) + { + _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReschedule)); + } + + SendToTasks(rescheduleEvent.FailureResponse); + + var id = Utils.GetTaskNum(rescheduleEvent.TaskId) - 1; + + lock (_taskInfos[id].Lock) + { + _taskInfos[id].NumRetry++; + + if (_taskInfos[id].NumRetry > _parameters.NumTaskFailures) + { + LOGGER.Log(Level.Error, $"Task {rescheduleEvent.TaskId} failed more than {_parameters.NumTaskFailures} times: aborting"); + OnFail(); + } + + if (rescheduleEvent.Reschedule) + { + LOGGER.Log(Level.Info, $"Rescheduling task {rescheduleEvent.TaskId}"); + + _taskInfos[id].RescheduleConfigurations = rescheduleEvent.RescheduleTaskConfigurations; + + SubmitTask(id); + } + } + } + + /// + /// Mechanism to execute when a stop event is triggered. + /// + /// + public void OnStop(ref StopEvent stopEvent) + { + lock (_statusLock) + { + _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.StopAndReschedule)); + } + + SendToTasks(stopEvent.FailureResponse); + } + + /// + /// Mechanism to execute when a fail event is triggered. + /// + public void OnFail() + { + LOGGER.Log(Level.Info, "Task set failed"); + + lock (_statusLock) + { + _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.Fail)); + } + + Dispose(); + } + + #endregion + + public void Dispose() + { + if (!_disposed) + { + _disposed = true; + LogFinalStatistics(); + + foreach (var info in _taskInfos) + { + if (info != null) + { + lock (info.Lock) + { + info.Dispose(); + } + } + } + } + } + + /// + /// Whether the imput task is managed by this task set manger. + /// + /// The task identifier + public bool IsTaskManagedBy(string id) + { + return Utils.GetTaskStages(id) == StagesId; + } + + /// + /// Whether the imput context is managed by this task set manger. 
+ /// + /// The context identifier + public bool IsContextManagedBy(string id) + { + return Utils.GetContextStages(id) == StagesId; + } + + /// + /// Whether the imput evaluator is managed by this task set manger. + /// + /// The context identifier + public bool IsEvaluatorManagedBy(string id) + { + return _evaluatorToContextIdMapping.ContainsKey(id); + } + + /// + /// Observer reacting to an alarm event. + /// + /// The alarm + public void OnNext(Alarm alarm) + { + var msgs = new List(); + var nextTimeouts = new List(); + + OnTimeout(alarm, ref msgs, ref nextTimeouts); + } + + public void OnError(Exception error) + { + } + + public void OnCompleted() + { + } + + private void AddTask(string taskId, IActiveContext activeContext, List partialTaskConfigs) + { + Interlocked.Increment(ref _tasksAdded); + var stageList = new List(); + var id = Utils.GetTaskNum(taskId) - 1; + + foreach (var stage in _stages) + { + if (stage.Value.AddTask(taskId)) + { + stageList.Add(stage.Value); + var partitionConf = stage.Value.GetPartitionConf(taskId); + + if (partitionConf.IsPresent()) + { + partialTaskConfigs.Add(partitionConf.Value); + } + } + else + { + LOGGER.Log(Level.Warning, $"{taskId} cannot be added to stage " + stage.Key); + activeContext.Dispose(); + return; + } + } + + var aggregatedConfs = partialTaskConfigs.Aggregate((x, y) => Configurations.Merge(x, y)); + + _taskInfos[id] = new TaskInfo(aggregatedConfs, activeContext, activeContext.EvaluatorId, TaskState.Init, stageList); + + if (_scheduled) + { + SubmitTask(id); + } + else if (StartSubmitTasks()) + { + SubmitTasks(); + } + } + + private bool StartSubmitTasks() + { + lock (_statusLock) + { + if (_scheduled) + { + return false; + } + + if (_stages.All(stage => stage.Value.ScheduleStage())) + { + _scheduled = true; + + LOGGER.Log(Level.Info, string.Format("Scheduling {0} tasks from Taskset {1}", _tasksAdded, StagesId)); + } + } + + return _scheduled; + } + + private void SubmitTasks() + { + for (int i = 0; i < _numTasks; 
i++)
+            {
+                if (_taskInfos[i] != null)
+                {
+                    SubmitTask(i);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Builds the configuration of the task stored at index <paramref name="id"/> and submits it
+        /// on that task's active context.
+        /// The call is ignored (and the task disposed) when the task set is already completed or failed,
+        /// and it is a no-op when the task is already in the Submitted state.
+        /// When the active context is disposed nothing is submitted: a task in the Failed state is
+        /// queued and a new evaluator is requested for it.
+        /// </summary>
+        /// <param name="id">Index into _taskInfos; the task number handed to the stages and to the queue is id + 1</param>
+        private void SubmitTask(int id)
+        {
+            if (Completed() || Failed())
+            {
+                LOGGER.Log(Level.Warning, "Task submit for a completed or failed Task Set: ignoring");
+                _taskInfos[id].DisposeTask();
+
+                return;
+            }
+
+            lock (_taskInfos[id].Lock)
+            {
+                // Check that the task was not already submitted. This may happen for instance if _scheduled is set to true
+                // and a new active context message is received.
+                if (_taskInfos[id].TaskStatus == TaskState.Submitted)
+                {
+                    return;
+                }
+
+                var stages = _taskInfos[id].Stages;
+                ICsConfigurationBuilder confBuilder = TangFactory.GetTang().NewConfigurationBuilder();
+                var rescheduleConfs = _taskInfos[id].RescheduleConfigurations;
+
+                foreach (var stage in stages)
+                {
+                    ICsConfigurationBuilder confSubBuilder = TangFactory.GetTang().NewConfigurationBuilder();
+                    var confSub = stage.GetTaskConfiguration(ref confSubBuilder, id + 1);
+
+                    // Configurations registered for a reschedule of this stage are merged on top of the stage's own.
+                    if (rescheduleConfs.TryGetValue(stage.StageName, out var confs))
+                    {
+                        foreach (var additionalConf in confs)
+                        {
+                            confSub = Configurations.Merge(confSub, additionalConf);
+                        }
+                    }
+
+                    _stages.Values.First().Context.SerializeStageConfiguration(ref confBuilder, confSub);
+                }
+
+                IConfiguration baseConf = confBuilder
+                    .BindNamedParameter(
+                        GenericType.Class,
+                        _driverId)
+                    .Build();
+
+                IConfiguration mergedTaskConf = Configurations.Merge(_taskInfos[id].TaskConfiguration, baseConf);
+
+                if (_taskInfos[id].IsActiveContextDisposed)
+                {
+                    LOGGER.Log(Level.Warning, string.Format("Task submit for {0} with a non-active context: spawning a new evaluator", id + 1));
+
+                    if (_taskInfos[id].TaskStatus == TaskState.Failed)
+                    {
+                        _queuedTasks.Enqueue(id + 1);
+                        _taskInfos[id].SetTaskStatus(TaskState.Queued);
+
+                        SpawnNewEvaluator(id);
+                    }
+
+                    return;
+                }
+
+                _taskInfos[id].ActiveContext.SubmitTask(mergedTaskConf);
+
+                if (TaskStateUtils.IsRecoverable(_taskInfos[id].TaskStatus))
+                {
+                    _taskInfos[id].SetTaskStatus(TaskState.Recovering);
+                }
+                else
+                {
+                    _taskInfos[id].SetTaskStatus(TaskState.Submitted);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Sends each non-null message to the task it is addressed to.
+        /// The destination index is the task number extracted from the message destination, minus one.
+        /// If the task set is completed or failed, the destination task is disposed and the method
+        /// returns without processing the remaining messages.
+        /// If the destination is not Running (or has no task runner): a Submitted task gets the message
+        /// re-sent on a background task after the configured wait time while retries are left; once
+        /// the retry budget is used up OnFail is invoked; any other case is logged and skipped.
+        /// </summary>
+        /// <param name="messages">The messages to deliver; null entries are skipped</param>
+        /// <param name="retry">How many delivery attempts were already made for these messages</param>
+        private void SendToTasks(IList messages, int retry = 0)
+        {
+            foreach (var returnMessage in messages)
+            {
+                if (returnMessage != null)
+                {
+                    var destination = Utils.GetTaskNum(returnMessage.Destination) - 1;
+
+                    if (_taskInfos[destination] == null)
+                    {
+                        throw new ArgumentNullException("Task Info");
+                    }
+
+                    lock (_taskInfos[destination].Lock)
+                    {
+                        if (Completed() || Failed())
+                        {
+                            // This path sends a message, it does not submit a task: say so in the log.
+                            LOGGER.Log(Level.Warning, "Message send for a completed or failed Task Set: ignoring");
+                            _taskInfos[destination].DisposeTask();
+
+                            return;
+                        }
+
+                        if (_taskInfos[destination].TaskStatus != TaskState.Running ||
+                            _taskInfos[destination].TaskRunner == null)
+                        {
+                            // The separator is added once, by the status suffix below.
+                            var msg = string.Format("Cannot send message to {0}", destination + 1);
+                            msg += ": Task Status is " + _taskInfos[destination].TaskStatus;
+
+                            if (_taskInfos[destination].TaskStatus == TaskState.Submitted && retry < _parameters.Retry)
+                            {
+                                LOGGER.Log(Level.Warning, msg + " Retry");
+                                System.Threading.Tasks.Task.Run(() =>
+                                {
+                                    Thread.Sleep(_parameters.WaitTime);
+                                    SendToTasks(new List() { returnMessage }, retry + 1);
+                                });
+                            }
+                            else if (retry >= _parameters.Retry)
+                            {
+                                LOGGER.Log(Level.Warning, msg + " Aborting");
+                                OnFail();
+                            }
+                            else
+                            {
+                                LOGGER.Log(Level.Warning, msg + " Ignoring");
+                            }
+
+                            continue;
+                        }
+
+                        _taskInfos[destination].TaskRunner.Send(returnMessage.Serialize());
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Requests one new evaluator, using the memory size, number of cores and rack name
+        /// found in the manager parameters.
+        /// </summary>
+        /// <param name="id">Identifier written to the log; it does not influence the request</param>
+        private void SpawnNewEvaluator(int id)
+        {
+            LOGGER.Log(Level.Warning, $"Spawning new evaluator for id {id}");
+
+            var request = _evaluatorRequestor.NewBuilder()
+                .SetNumber(1)
+                .SetMegabytes(_parameters.NewEvaluatorMemorySize)
+                .SetCores(_parameters.NewEvaluatorNumCores)
+                .SetRackName(_parameters.NewEvaluatorRackName)
+                .Build();
+
+            _evaluatorRequestor.Submit(request);
+        }
+
+        /// <summary>
+        /// Logs, at Info level, the total number of failed tasks and evaluators followed by the
+        /// final statistics of every stage, one stage per line.
+        /// </summary>
+        private void LogFinalStatistics()
+        {
+            var msg = string.Format("Total Failed Tasks: {0}\nTotal Failed Evaluators: {1}", _totFailedTasks, _totFailedEvaluators);
+
+            // string.Join yields the same text as the previous Aggregate but does not throw when there are no stages.
+            msg += string.Join("\n", _stages.Select(x => x.Value.LogFinalStatistics()));
+            LOGGER.Log(Level.Info, msg);
+        }
+
+        /// <summary>
+        /// Whether every stage is completed. Once true the answer is cached in _completed.
+        /// </summary>
+        /// <returns>True if all stages report completion (also when there are no stages)</returns>
+        private bool Completed()
+        {
+            if (!_completed)
+            {
+                // All stops at the first incomplete stage and, unlike a seedless Aggregate, accepts an empty collection.
+                _completed = _stages.Values.All(stage => stage.IsCompleted);
+
+                if (_completed)
+                {
+                    LOGGER.Log(Level.Info, "Task set completed");
+                }
+            }
+
+            return _completed;
+        }
+
+        /// <summary>
+        /// Whether the failure state of the task set is <see cref="DefaultFailureStates.Fail"/>.
+        /// </summary>
+        /// <returns>True if the task set is in the Fail state</returns>
+        private bool Failed()
+        {
+            return _failureStatus.FailureState == (int)DefaultFailureStates.Fail;
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs
new file mode 100644
index 0000000000..c94ccbed71
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs
@@ -0,0 +1,107 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Config;
+using System.Threading.Tasks;
+using Org.Apache.REEF.Network.Elastic.Failures;
+
+namespace Org.Apache.REEF.Network.Elastic.Driver.Default
+{
+    /// <summary>
+    /// Injectable bundle of the settings used by the default task set manager.
+    /// </summary>
+    internal sealed class DefaultElasticTaskSetManagerParameters
+    {
+        [Inject]
+        private DefaultElasticTaskSetManagerParameters(
+            FailuresClock clock,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.Timeout))] int timeout,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.SendRetry))] int retry,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.RetryWaitTime))] int waitTime,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.NumTaskFailures))] int numTaskFailures,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluatorFailures))] int numEvaluatorFailures,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.NewEvaluatorRackName))] string rackName,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.NewEvaluatorBatchId))] string batchId,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.NewEvaluatorNumCores))] int numCores,
+            [Parameter(typeof(ElasticServiceConfigurationOptions.NewEvaluatorMemorySize))] int memorySize)
+        {
+            Clock = clock;
+            Timeout = timeout;
+            Retry = retry;
+            WaitTime = waitTime;
+            NumTaskFailures = numTaskFailures;
+            NumEvaluatorFailures = numEvaluatorFailures;
+            NewEvaluatorRackName = rackName;
+            NewEvaluatorBatchId = batchId;
+            NewEvaluatorNumCores = numCores;
+            NewEvaluatorMemorySize = memorySize;
+            // The injected clock is started right away on a dedicated long-running task.
+            System.Threading.Tasks.Task.Factory.StartNew(() => Clock.Run(), TaskCreationOptions.LongRunning);
+        }
+
+        /// <summary>
+        /// Clock on which alarms are scheduled; started by the constructor.
+        /// </summary>
+        public FailuresClock Clock { get; }
+
+        /// <summary>
+        /// Time after which the computation is regarded as inactive.
+        /// </summary>
+        public int Timeout { get; }
+
+        /// <summary>
+        /// Maximum number of times the sending of a message is retried.
+        /// </summary>
+        public int Retry { get; }
+
+        /// <summary>
+        /// Waiting time between two attempts at sending a message.
+        /// </summary>
+        public int WaitTime { get; }
+
+        /// <summary>
+        /// Number of task failures that is supported.
+        /// </summary>
+        public int NumTaskFailures { get; }
+
+        /// <summary>
+        /// Number of evaluator failures that is supported.
+        /// </summary>
+        public int NumEvaluatorFailures { get; }
+
+        /// <summary>
+        /// Rack name requested for newly spawned evaluators.
+        /// </summary>
+        public string NewEvaluatorRackName { get; }
+
+        /// <summary>
+        /// Batch id requested for newly spawned evaluators.
+        /// </summary>
+        public string NewEvaluatorBatchId { get; }
+
+        /// <summary>
+        /// Number of cores requested for newly spawned evaluators.
+        /// </summary>
+        public int NewEvaluatorNumCores { get; }
+
+        /// <summary>
+        /// Memory size requested for newly spawned evaluators.
+        /// </summary>
+        public int NewEvaluatorMemorySize { get; }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticContext.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticContext.cs
index 9201767969..9ff4d288e0 100644
--- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticContext.cs
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticContext.cs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-using Org.Apache.REEF.Network.Elastic.Driver.Impl;
+using Org.Apache.REEF.Network.Elastic.Driver.Default;
 using Org.Apache.REEF.Network.Elastic.Failures;
 using Org.Apache.REEF.Tang.Annotations;
 using Org.Apache.REEF.Tang.Formats;
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs
index 1c4bfa3c17..a7db07ef81 100644
--- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs
@@ -16,7 +16,7 @@
 // under the License.
using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Network.Elastic.Operators.Logical.Impl; +using Org.Apache.REEF.Network.Elastic.Operators.Logical; using Org.Apache.REEF.Driver.Context; using Org.Apache.REEF.Network.Elastic.Failures; using Org.Apache.REEF.Utilities; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultFailureState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureState.cs similarity index 95% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultFailureState.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureState.cs index a7b9443fcc..c9aebcf7b0 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultFailureState.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureState.cs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. -using Org.Apache.REEF.Network.Elastic.Failures.Enum; using Org.Apache.REEF.Utilities.Attributes; using System; @@ -23,7 +22,7 @@ /// The default implementation for IFailureState. /// These events are generated based on the default failure states defined in the enum. 
/// -namespace Org.Apache.REEF.Network.Elastic.Failures.Impl +namespace Org.Apache.REEF.Network.Elastic.Failures.Default { [Unstable("0.16", "API may change")] internal sealed class DefaultFailureState : IFailureState diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/DefaultFailureStateEvents.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateEvents.cs similarity index 95% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/DefaultFailureStateEvents.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateEvents.cs index 83c120985c..e63673fa64 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/DefaultFailureStateEvents.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateEvents.cs @@ -17,7 +17,7 @@ using Org.Apache.REEF.Utilities.Attributes; -namespace Org.Apache.REEF.Network.Elastic.Failures.Enum +namespace Org.Apache.REEF.Network.Elastic.Failures.Default { /// /// The list of default failure events triggered by default state chages. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultFailureStateMachine.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs similarity index 98% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultFailureStateMachine.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs index a4641a4543..cc9d099ee2 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/DefaultFailureStateMachine.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-using Org.Apache.REEF.Network.Elastic.Failures.Enum; using Org.Apache.REEF.Tang.Annotations; using Org.Apache.REEF.Tang.Exceptions; using Org.Apache.REEF.Utilities.Attributes; @@ -32,10 +31,10 @@ /// - Stop the computation and try to reschedule the tasks /// - Fail. /// -namespace Org.Apache.REEF.Network.Elastic.Failures.Impl +namespace Org.Apache.REEF.Network.Elastic.Failures.Default { [Unstable("0.16", "API may change")] - public class DefaultFailureStateMachine : IFailureStateMachine + internal sealed class DefaultFailureStateMachine : IFailureStateMachine { private readonly object _statusLock; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/DefaultFailureStates.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStates.cs similarity index 96% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/DefaultFailureStates.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStates.cs index 319e5bb907..4ed161ce39 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Enum/DefaultFailureStates.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStates.cs @@ -18,7 +18,7 @@ using Org.Apache.REEF.Utilities.Attributes; -namespace Org.Apache.REEF.Network.Elastic.Failures.Enum +namespace Org.Apache.REEF.Network.Elastic.Failures.Default { /// /// The default failure states. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/FailEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/FailEvent.cs similarity index 93% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/FailEvent.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/FailEvent.cs index 313cbfa7c4..caf2265860 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/FailEvent.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/FailEvent.cs @@ -16,17 +16,16 @@ // under the License. 
using Org.Apache.REEF.Network.Elastic.Comm; -using Org.Apache.REEF.Network.Elastic.Failures.Enum; using Org.Apache.REEF.Utilities.Attributes; using System.Collections.Generic; -namespace Org.Apache.REEF.Network.Elastic.Failures.Impl +namespace Org.Apache.REEF.Network.Elastic.Failures.Default { /// /// Faile the current execution. /// [Unstable("0.16", "API may change")] - public class FailEvent : IFailureEvent + internal class FailEvent : IFailureEvent { /// /// Constructor for the faile event. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IDefaultFailureEventResponse.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/IDefaultFailureEventResponse.cs similarity index 85% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IDefaultFailureEventResponse.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/IDefaultFailureEventResponse.cs index 8af2ad3f62..6f3d5d7f56 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IDefaultFailureEventResponse.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/IDefaultFailureEventResponse.cs @@ -17,7 +17,7 @@ using Org.Apache.REEF.Utilities.Attributes; -namespace Org.Apache.REEF.Network.Elastic.Failures +namespace Org.Apache.REEF.Network.Elastic.Failures.Default { /// /// Default failures response interface. @@ -25,25 +25,25 @@ namespace Org.Apache.REEF.Network.Elastic.Failures /// Mechanisms implementing the default failure responses must extend this interface. /// [Unstable("0.16", "API may change")] - public interface IDefaultFailureEventResponse + internal interface IDefaultFailureEventResponse { /// /// Mechanism to execute when a reconfigure event is triggered. /// /// - void OnReconfigure(ref IReconfigure reconfigureEvent); + void OnReconfigure(ref ReconfigureEvent reconfigureEvent); /// /// Mechanism to execute when a reschedule event is triggered. 
/// /// - void OnReschedule(ref IReschedule rescheduleEvent); + void OnReschedule(ref RescheduleEvent rescheduleEvent); /// /// Mechanism to execute when a stop event is triggered. /// /// - void OnStop(ref IStop stopEvent); + void OnStop(ref StopEvent stopEvent); /// /// Mechanism to execute when a fail event is triggered. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/ReconfigureEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/ReconfigureEvent.cs similarity index 82% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/ReconfigureEvent.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/ReconfigureEvent.cs index 0752377791..4ef242eb91 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/ReconfigureEvent.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/ReconfigureEvent.cs @@ -17,18 +17,17 @@ using Org.Apache.REEF.Driver.Task; using Org.Apache.REEF.Network.Elastic.Comm; -using Org.Apache.REEF.Network.Elastic.Failures.Enum; using Org.Apache.REEF.Utilities; using Org.Apache.REEF.Utilities.Attributes; using System.Collections.Generic; -namespace Org.Apache.REEF.Network.Elastic.Failures.Impl +namespace Org.Apache.REEF.Network.Elastic.Failures.Default { /// /// Reconfigure the execution to work with fewer tasks. /// [Unstable("0.16", "API may change")] - public class ReconfigureEvent : IReconfigure + public class ReconfigureEvent : IFailureEvent { /// /// Constructor for a reconfigure event. 
@@ -37,17 +36,25 @@ public class ReconfigureEvent : IReconfigure /// The operator identifier in which the event was detected public ReconfigureEvent(IFailedTask failedTask, int opertorId) { - FailedTask = Optional.Of(failedTask); + if (failedTask != null) + { + FailedTask = Optional.Of(failedTask); + TaskId = failedTask.Id; + } + else + { + FailedTask = Optional.Empty(); + } + OperatorId = opertorId; FailureResponse = new List(); Iteration = Optional.Empty(); - TaskId = failedTask.Id; } /// /// The event / action raised by the transition to the new failure state. /// - public int FailureEvent + public virtual int FailureEvent { get { return (int)DefaultFailureStateEvents.Reconfigure; } } @@ -65,16 +72,16 @@ public int FailureEvent /// /// The identifier of the task triggering the event. /// - public string TaskId { get; private set; } + public string TaskId { get; protected set; } /// /// The opeartor id in which the failure is rised. /// - public int OperatorId { get; private set; } + public int OperatorId { get; protected set; } /// /// The response message generated to react to the failure event. /// - public List FailureResponse { get; private set; } + public List FailureResponse { get; protected set; } } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/RescheduleEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/RescheduleEvent.cs similarity index 61% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/RescheduleEvent.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/RescheduleEvent.cs index 3e9b4f84f8..8606f80f20 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/RescheduleEvent.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/RescheduleEvent.cs @@ -16,69 +16,36 @@ // under the License. 
using System.Collections.Generic; -using Org.Apache.REEF.Driver.Task; -using Org.Apache.REEF.Utilities; -using Org.Apache.REEF.Network.Elastic.Comm; using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Network.Elastic.Failures.Enum; using Org.Apache.REEF.Utilities.Attributes; -namespace Org.Apache.REEF.Network.Elastic.Failures.Impl +namespace Org.Apache.REEF.Network.Elastic.Failures.Default { /// /// Reconfigure the execution to work with fewer tasks and simultaneusly try to /// reschedule a new task. /// [Unstable("0.16", "API may change")] - public class RescheduleEvent : IReschedule + public sealed class RescheduleEvent : ReconfigureEvent { /// /// Constructor for the reschedule event. /// /// The identifier of the task triggering the failure event - public RescheduleEvent(string taskId) + public RescheduleEvent(string taskId) : base(null, -1) { TaskId = taskId; - OperatorId = -1; - FailureResponse = new List(); RescheduleTaskConfigurations = new Dictionary>(); - Iteration = Optional.Empty(); } /// /// The event / action raised by the transition to the new failure state. /// - public int FailureEvent + public override int FailureEvent { get { return (int)DefaultFailureStateEvents.Reschedule; } } - /// - /// The failed task triggering the event. - /// - public Optional FailedTask { get; set; } - - /// - /// The identifier of the task triggering the event. - /// - public string TaskId { get; private set; } - - /// - /// The opeartor id in which the failure is rised. - /// - public int OperatorId { get; private set; } - - /// - /// The iteration in which the failure is rised. - /// - public Optional Iteration { get; set; } - - /// - /// Messages implementing the response from the driver to the tasks - /// to reconfigure the compution. - /// - public List FailureResponse { get; private set; } - /// /// The configurations for the stages of the task. 
/// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/StopEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/StopEvent.cs similarity index 93% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/StopEvent.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/StopEvent.cs index 53605cd5e4..c9283cbbd6 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Impl/StopEvent.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/StopEvent.cs @@ -16,17 +16,16 @@ // under the License. using Org.Apache.REEF.Network.Elastic.Comm; -using Org.Apache.REEF.Network.Elastic.Failures.Enum; using Org.Apache.REEF.Utilities.Attributes; using System.Collections.Generic; -namespace Org.Apache.REEF.Network.Elastic.Failures.Impl +namespace Org.Apache.REEF.Network.Elastic.Failures.Default { /// /// Stop the execution and try to add new tasks. /// [Unstable("0.16", "API may change")] - public class StopEvent : IStop + public class StopEvent : IFailureEvent { /// /// Constructor for the stop event. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs new file mode 100644 index 0000000000..7ed5a5911d --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs @@ -0,0 +1,272 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Tang.Implementations.InjectionPlan; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Utilities.Collections; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Wake.RX.Impl; +using Org.Apache.REEF.Wake.Time.Event; +using Org.Apache.REEF.Wake.Time.Runtime.Event; +using Org.Apache.REEF.Wake.Time; +using Org.Apache.REEF.Wake.Time.Runtime; + +namespace Org.Apache.REEF.Network.Elastic.Failures +{ + /// + /// Clock used to trigger failures events. 
+ /// + internal sealed class FailuresClock : IClock + { + private static readonly Logger LOGGER = Logger.GetLogger(typeof(FailuresClock)); + + private static int numberOfInstantiations = 0; + + private readonly ITimer _timer; + private readonly PubSubSubject - public virtual void OnStop(ref IStop stopEvent) + public virtual void OnStop(ref StopEvent stopEvent) { } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/ElasticOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs similarity index 99% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/ElasticOperator.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs index 16ef19d2d5..f761a264c4 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Impl/ElasticOperator.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs @@ -38,7 +38,7 @@ using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; using Org.Apache.REEF.Wake.StreamingCodec.CommonStreamingCodecs; -namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Impl +namespace Org.Apache.REEF.Network.Elastic.Operators.Logical { /// /// Basic implementation for logical operators. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultBroadcast.cs similarity index 94% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultBroadcast.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultBroadcast.cs index 79ecdca863..48e682e2c0 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultBroadcast.cs @@ -16,13 +16,12 @@ // under the License. 
using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl; +using Org.Apache.REEF.Network.Elastic.Topology.Physical.Default; using Org.Apache.REEF.Network.Elastic.Config; using Org.Apache.REEF.Utilities.Attributes; using Org.Apache.REEF.Network.Elastic.Operators.Physical.Enum; -using Org.Apache.REEF.Network.Elastic.Failures; -namespace Org.Apache.REEF.Network.Elastic.Operators.Physical.Impl +namespace Org.Apache.REEF.Network.Elastic.Operators.Physical.Default { /// /// Default implementation of a group communication operator used to broadcast messages. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs similarity index 98% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultOneToN.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs index 4cf68480a2..ea032353e9 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Impl/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs @@ -17,7 +17,7 @@ using System.Threading; using System.Collections.Generic; -using Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl; +using Org.Apache.REEF.Network.Elastic.Topology.Physical.Default; using Org.Apache.REEF.Network.Elastic.Failures; using System; using Org.Apache.REEF.Network.Elastic.Comm.Impl; @@ -27,7 +27,7 @@ using Org.Apache.REEF.Network.Elastic.Operators.Physical.Enum; using Org.Apache.REEF.Network.Elastic.Comm; -namespace Org.Apache.REEF.Network.Elastic.Operators.Physical.Impl +namespace Org.Apache.REEF.Network.Elastic.Operators.Physical.Default { /// /// Generic implementation of a group communication operator where one node sends to N. 
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/IElasticBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticBroadcast.cs similarity index 100% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Operators/IElasticBroadcast.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticBroadcast.cs diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CancellationSource.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CancellationSource.cs similarity index 97% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CancellationSource.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Task/CancellationSource.cs index 76c3ad671e..f9facca194 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Impl/CancellationSource.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CancellationSource.cs @@ -19,7 +19,7 @@ using Org.Apache.REEF.Utilities.Attributes; using System.Threading; -namespace Org.Apache.REEF.Network.Elastic.Task.Impl +namespace Org.Apache.REEF.Network.Elastic.Task { /// /// Generic cancellation source for task operations. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs new file mode 100644 index 0000000000..843f37e811 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Common.Tasks;
+using Org.Apache.REEF.Common.Tasks.Events;
+using Org.Apache.REEF.Network.Elastic.Comm.Impl;
+using Org.Apache.REEF.Network.Elastic.Topology.Physical;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Tang.Exceptions;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+
+namespace Org.Apache.REEF.Network.Elastic.Task.Impl
+{
+    /// <summary>
+    /// Receives the messages sent by the driver and hands each one to the operator topology registered for it.
+    /// </summary>
+    internal sealed class ElasticDriverMessageHandler : IDriverMessageHandler
+    {
+        /// <summary>
+        /// Injectable constructor; starts with no registered observer.
+        /// </summary>
+        [Inject]
+        private ElasticDriverMessageHandler()
+        {
+            DriverMessageObservers = new ConcurrentDictionary();
+        }
+
+        /// <summary>
+        /// The operator topologies receiving driver messages, keyed by node identifier.
+        /// </summary>
+        internal ConcurrentDictionary DriverMessageObservers { get; set; }
+
+        /// <summary>
+        /// Decodes a driver message and forwards its content to the matching registered topology.
+        /// </summary>
+        /// <param name="message">The message from the driver</param>
+        public void Handle(IDriverMessage message)
+        {
+            // Without a payload there is nothing to decode.
+            if (!message.Message.IsPresent())
+            {
+                throw new IllegalStateException("Received message with no payload.");
+            }
+
+            var payload = ElasticDriverMessageImpl.From(message.Message.Value).Message;
+            var key = NodeObserverIdentifier.FromMessage(payload);
+
+            if (DriverMessageObservers.TryGetValue(key, out var topology))
+            {
+                topology.OnNext(payload);
+                return;
+            }
+
+            throw new KeyNotFoundException("Unable to find registered operator topology for stage " +
+                payload.StageName + " operator " + payload.OperatorId);
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs
new file mode 100644
index 0000000000..cb272d5c57
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Network.Elastic.Comm.Impl;
+using Org.Apache.REEF.Network.Elastic.Topology.Physical;
+using Org.Apache.REEF.Network.Elastic.Topology.Physical.Default;
+
+namespace Org.Apache.REEF.Network.Elastic.Task
+{
+    /// <summary>
+    /// An identifier for a given node in the group communication graph.
+    /// A node is uniquely identifiable by a combination of its
+    /// <see cref="StageName"/> and <see cref="OperatorId"/>.
+    /// </summary>
+    internal sealed class NodeObserverIdentifier
+    {
+        private readonly string _stageName;
+        private readonly int _operatorId;
+
+        /// <summary>
+        /// Creates a NodeObserverIdentifier from the stage name and operator id of an observer.
+        /// </summary>
+        public static NodeObserverIdentifier FromObserver(OperatorTopologyWithDefaultCommunication observer)
+        {
+            return new NodeObserverIdentifier(observer.StageName, observer.OperatorId);
+        }
+
+        /// <summary>
+        /// Creates a NodeObserverIdentifier from the stage name and operator id of a driver-aware observer.
+        /// </summary>
+        public static NodeObserverIdentifier FromObserver(DriverAwareOperatorTopology observer)
+        {
+            return new NodeObserverIdentifier(observer.StageName, observer.OperatorId);
+        }
+
+        /// <summary>
+        /// Creates a NodeObserverIdentifier from the stage name and operator id of a group communication message.
+        /// </summary>
+        public static NodeObserverIdentifier FromMessage(ElasticGroupCommunicationMessage message)
+        {
+            return new NodeObserverIdentifier(message.StageName, message.OperatorId);
+        }
+
+        /// <summary>
+        /// Basic constructor.
+        /// </summary>
+        /// <param name="stageName">The name of the stage</param>
+        /// <param name="operatorId">The identifier of the operator</param>
+        private NodeObserverIdentifier(string stageName, int operatorId)
+        {
+            _stageName = stageName;
+            _operatorId = operatorId;
+        }
+
+        /// <summary>
+        /// The stage name.
+        /// </summary>
+        public string StageName
+        {
+            get { return _stageName; }
+        }
+
+        /// <summary>
+        /// The operator identifier.
+        /// </summary>
+        public int OperatorId
+        {
+            get { return _operatorId; }
+        }
+
+        /// <summary>
+        /// Overrides <see cref="object.Equals(object)"/>. Two identifiers are equal when their instance fields are equal.
+        /// </summary>
+        public override bool Equals(object obj)
+        {
+            if (ReferenceEquals(null, obj))
+            {
+                return false;
+            }
+
+            if (ReferenceEquals(this, obj))
+            {
+                return true;
+            }
+
+            return obj is NodeObserverIdentifier && Equals((NodeObserverIdentifier)obj);
+        }
+
+        /// <summary>
+        /// Overrides <see cref="object.GetHashCode"/>. Generates the hash code from the instance fields.
+        /// </summary>
+        public override int GetHashCode()
+        {
+            // unchecked: wrap-around is intended even in checked builds; a null stage name hashes as 0 instead of throwing.
+            int hash = unchecked((17 * 31) + (_stageName == null ? 0 : _stageName.GetHashCode()));
+            return unchecked((hash * 31) + _operatorId.GetHashCode());
+        }
+
+        /// <summary>
+        /// Compares the instance fields; the stage names are compared ordinally and may be null.
+        /// </summary>
+        private bool Equals(NodeObserverIdentifier other)
+        {
+            return other != null && string.Equals(_stageName, other._stageName) &&
+                _operatorId == other._operatorId;
+        }
+    }
+}
\ No newline at end of file
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DefaultBroadcastTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs
similarity index 97%
rename from lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DefaultBroadcastTopology.cs
rename to lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs
index 4856ae5075..d7d4dbe71e 100644
--- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DefaultBroadcastTopology.cs
+++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs
@@ -25,10 +25,10 @@
 using Org.Apache.REEF.Utilities.Logging;
 using System.Linq;
 using Org.Apache.REEF.Utilities.Attributes;
-using Org.Apache.REEF.Network.Elastic.Failures.Impl;
 using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Failures;
 
-namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl
+namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Default
 {
     ///
     /// Topology class managing data communication for broadcast operators.
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OneToNTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs similarity index 98% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OneToNTopology.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs index d6d13cdf1e..5b9acb932d 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OneToNTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs @@ -19,7 +19,6 @@ using System.Collections.Generic; using System; using Org.Apache.REEF.Network.Elastic.Comm; -using Org.Apache.REEF.Network.Elastic.Failures; using Org.Apache.REEF.Tang.Exceptions; using System.Threading; using Org.Apache.REEF.Network.Elastic.Comm.Impl; @@ -27,17 +26,16 @@ using Org.Apache.REEF.Network.NetworkService; using System.Collections.Concurrent; using System.Linq; -using Org.Apache.REEF.Network.Elastic.Failures.Enum; using Org.Apache.REEF.Utilities.Attributes; using Org.Apache.REEF.Network.Elastic.Task; -namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl +namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Default { /// /// Base class for topologies following a one to N communication pattern. 
/// [Unstable("0.16", "API may change")] - internal abstract class OneToNTopology : OperatorTopologyWithCommunication + internal abstract class OneToNTopology : OperatorTopologyWithDefaultCommunication { protected static readonly Logger LOGGER = Logger.GetLogger(typeof(OneToNTopology)); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopologyWithCommunication.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs similarity index 98% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopologyWithCommunication.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs index aacca911ca..1b4b72ea24 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopologyWithCommunication.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs @@ -26,14 +26,14 @@ using Org.Apache.REEF.Network.Elastic.Comm.Impl; using Org.Apache.REEF.Utilities.Attributes; -namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl +namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Default { /// /// Base class for topologies where nodes are allowed to communicated between themselves /// and to the driver. 
/// [Unstable("0.16", "API may change")] - internal abstract class OperatorTopologyWithCommunication : + internal abstract class OperatorTopologyWithDefaultCommunication : DriverAwareOperatorTopology, IWaitForTaskRegistration, IDisposable, @@ -63,7 +63,7 @@ internal abstract class OperatorTopologyWithCommunication : /// After how long the topology waits for an event /// Maximum wait time for topology disposal /// Class responsible for communication - public OperatorTopologyWithCommunication( + public OperatorTopologyWithDefaultCommunication( string stageName, string taskId, string rootTaskId, diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DriverAwareOperatorTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/DriverAwareOperatorTopology.cs similarity index 93% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DriverAwareOperatorTopology.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/DriverAwareOperatorTopology.cs index 03747bf1ec..38dc9f9e87 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/DriverAwareOperatorTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/DriverAwareOperatorTopology.cs @@ -19,13 +19,13 @@ using Org.Apache.REEF.Utilities.Attributes; using System; -namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl +namespace Org.Apache.REEF.Network.Elastic.Topology.Physical { /// /// Abstract class for topologies able to receive messages from the driver. /// [Unstable("0.16", "API may change")] - internal abstract class DriverAwareOperatorTopology : OperatorTopology, IObserver + public abstract class DriverAwareOperatorTopology : OperatorTopology, IObserver { /// /// Constructor. 
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs similarity index 96% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopology.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs index edbf97daee..58fcfc37e0 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Impl/OperatorTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs @@ -17,14 +17,14 @@ using Org.Apache.REEF.Utilities.Attributes; -namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Impl +namespace Org.Apache.REEF.Network.Elastic.Topology.Physical { /// /// Base class for task-side topologies. Task-side topologies are /// not generic but directly related to the operators using them to communicate data. /// [Unstable("0.16", "API may change")] - internal abstract class OperatorTopology + public abstract class OperatorTopology { /// /// Constructor for an operator topology. 
From ef5f339a11e34d29e521f6bd8329e6b0a68ad561 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Tue, 1 Jan 2019 12:20:00 -0800 Subject: [PATCH 05/29] done with communication layer --- .../cs/Org.Apache.REEF.Common/AssemblyInfo.cs | 20 ++ .../Elastic/Comm/Enum/TaskMessageType.cs | 13 +- .../Driver/Default/DefaultElasticStage.cs | 13 +- .../Logical/Default/DefaultOneToN.cs | 9 +- .../Elastic/Task/CommunicationLayer.cs | 291 ++++++++++++++++++ .../Task/Default/DefaultCommunicationLayer.cs | 121 ++++++++ .../DefaultTaskToDriverMessageDispatcher.cs | 108 +++++++ .../Default/IDefaultTaskToDrivermessages.cs | 46 +++ .../Elastic/Task/NodeObserverIdentifier.cs | 15 +- .../Task/TaskToDriverMessageDispatcher.cs | 77 +++++ .../Default/DefaultBroadcastTopology.cs | 2 +- .../Physical/Default/OneToNTopology.cs | 2 +- ...peratorTopologyWithDefaultCommunication.cs | 16 +- .../IOperatorTopologyWithCommunication.cs | 43 +++ .../NetworkService/IConnection.cs | 5 + .../NetworkService/NsConnection.cs | 16 +- .../NetworkService/StreamingNetworkService.cs | 14 + pom.xml | 2 +- 18 files changed, 777 insertions(+), 36 deletions(-) create mode 100644 lang/cs/Org.Apache.REEF.Common/AssemblyInfo.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultTaskToDriverMessageDispatcher.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/IDefaultTaskToDrivermessages.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/TaskToDriverMessageDispatcher.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs diff --git a/lang/cs/Org.Apache.REEF.Common/AssemblyInfo.cs b/lang/cs/Org.Apache.REEF.Common/AssemblyInfo.cs new file mode 100644 index 0000000000..e60e48f257 --- /dev/null +++ 
b/lang/cs/Org.Apache.REEF.Common/AssemblyInfo.cs @@ -0,0 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System.Runtime.CompilerServices; + +[assembly: InternalsVisibleTo("Org.Apache.REEF.Network")] \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Enum/TaskMessageType.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Enum/TaskMessageType.cs index c506f66f9a..a55f945383 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Enum/TaskMessageType.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Enum/TaskMessageType.cs @@ -19,17 +19,16 @@ namespace Org.Apache.REEF.Network.Elastic.Comm.Enum { + /// + /// Supported type of messages between task and driver. 
+ /// [Unstable("0.16", "Types may change")] internal enum TaskMessageType : ushort { - IterationNumber = 0, + JoinTopology = 0, - JoinTopology = 1, + TopologyUpdateRequest = 1, - TopologyUpdateRequest = 2, - - NextDataRequest = 3, - - CompleteStage = 4 + CompleteStage = 2 } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs index 7c19c7e3fd..33cd736bbf 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs @@ -355,8 +355,17 @@ public string LogFinalStatistics() /// A list of messages containing the instructions for the task public void OnTaskMessage(ITaskMessage message, ref List returnMessages) { - // Messages have to be propagated down to the operators - RootOperator.OnTaskMessage(message, ref returnMessages); + int offset = 0; + var length = BitConverter.ToUInt16(message.Message, offset); + offset += sizeof(ushort); + var stageName = System.Text.Encoding.UTF8.GetString(message.Message, offset, length); // decode the name bytes; BitConverter.ToString would yield a hex dump ("53-74-61") that never equals StageName. NOTE(review): assumes the sender's ByteUtilities.StringToByteArrays encodes as UTF-8 - confirm + offset += length; + + if (stageName == StageName) + { + // Messages have to be propagated down to the operators + RootOperator.OnTaskMessage(message, ref returnMessages); + } } #region Failure Response diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs index 5a1bdd0641..46866ea11a 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs @@ -80,13 +80,16 @@ public DefaultOneToN( /// True if the operator has reacted to the task message protected override bool ReactOnTaskMessage(ITaskMessage message, ref List returnMessages) { - var msgReceived =
(TaskMessageType)BitConverter.ToUInt16(message.Message, 0); + var offset = BitConverter.ToUInt16(message.Message, 0); + offset += sizeof(ushort); + var msgReceived = (TaskMessageType)BitConverter.ToUInt16(message.Message, offset); + offset += sizeof(ushort); switch (msgReceived) { case TaskMessageType.JoinTopology: { - var operatorId = BitConverter.ToInt16(message.Message, sizeof(ushort)); + var operatorId = BitConverter.ToInt16(message.Message, offset); if (operatorId != _id) { @@ -105,7 +108,7 @@ protected override bool ReactOnTaskMessage(ITaskMessage message, ref List + /// Handles all incoming / outcoming messages for a given task. + /// + internal abstract class CommunicationLayer : + IObserver>> + { + private static readonly Logger LOGGER = Logger.GetLogger(typeof(CommunicationLayer)); + + private readonly int _timeout; + private readonly int _retryRegistration; + private readonly int _retrySending; + private readonly int _sleepTime; + private readonly StreamingNetworkService _networkService; + protected readonly DefaultTaskToDriverMessageDispatcher _taskToDriverDispatcher; + private readonly ElasticDriverMessageHandler _driverMessagesHandler; + private readonly IIdentifierFactory _idFactory; + private IDisposable _communicationObserver; + private readonly ConcurrentDictionary _driverMessageObservers; + + protected bool _disposed; + + protected readonly ConcurrentDictionary _groupMessageObservers = + new ConcurrentDictionary(); + + /// + /// Creates a new communication layer. 
+ /// + protected CommunicationLayer( + int timeout, + int retryRegistration, + int sleepTime, + int retrySending, + StreamingNetworkService networkService, + DefaultTaskToDriverMessageDispatcher taskToDriverDispatcher, + ElasticDriverMessageHandler driverMessagesHandler, + IIdentifierFactory idFactory) + { + _timeout = timeout; + _retryRegistration = retryRegistration; + _sleepTime = sleepTime; + _retrySending = retrySending; + _networkService = networkService; + _taskToDriverDispatcher = taskToDriverDispatcher; + _driverMessagesHandler = driverMessagesHandler; + _idFactory = idFactory; + + _disposed = false; + + _communicationObserver = _networkService.RemoteManager.RegisterObserver(this); + _driverMessageObservers = _driverMessagesHandler.DriverMessageObservers; + } + + /// + /// Registers a with the communication layer. + /// + /// The observer of the communicating topology operator + public void RegisterOperatorTopologyForTask(IOperatorTopologyWithCommunication operatorObserver) + { + var id = NodeObserverIdentifier.FromObserver(operatorObserver); + + if (_groupMessageObservers.ContainsKey(id)) + { + throw new IllegalStateException($"Topology for id {id} already added among listeners."); + } + + _groupMessageObservers.TryAdd(id, operatorObserver); + } + + /// + /// Registers a with the communication layer. + /// + /// The observer of the driver aware topology + internal void RegisterOperatorTopologyForDriver(DriverAwareOperatorTopology operatorObserver) + { + var id = NodeObserverIdentifier.FromObserver(operatorObserver); + + if (_driverMessageObservers.ContainsKey(id)) + { + throw new IllegalStateException($"Topology for id {id} already added among driver listeners."); + } + + _driverMessageObservers.TryAdd(id, operatorObserver); + } + + /// + /// Send the communication message to the task whose name is included in the message. 
+ /// + /// The message to send + internal void Send(string destination, ElasticGroupCommunicationMessage message, CancellationTokenSource cancellationSource) + { + if (message == null) + { + throw new ArgumentNullException(nameof(message)); + } + if (string.IsNullOrEmpty(destination)) + { + throw new ArgumentException("Message destination cannot be null or empty."); + } + if (_disposed) + { + LOGGER.Log(Level.Warning, "Received send message request after disposing: Ignoring."); + return; + } + + IIdentifier destId = _idFactory.Create(destination); + int retry = 0; + + while (!Send(destId, message)) + { + if (retry > _retrySending) + { + throw new IllegalStateException($"Unable to send message after retrying {retry} times."); + } + cancellationSource?.Token.ThrowIfCancellationRequested(); // stop retrying (OperationCanceledException) once the caller cancels; a null source means "not cancellable" + Thread.Sleep(_timeout); + retry++; + } + } + + /// + /// Forward the received message to the target . + /// + /// The received message + public abstract void OnNext(IRemoteMessage> remoteMessage); + + /// + /// Checks if the identifier is registered with the name server. + /// Throws exception if the operation fails more than the retry count.
+ /// + /// The identifier to look up + /// The token to cancel the operation + public void WaitForTaskRegistration(IList identifiers, CancellationTokenSource cancellationSource, ConcurrentDictionary removed = null) + { + if (removed == null) + { + removed = new ConcurrentDictionary(); + } + + IList foundList = new List(); + for (var i = 0; i < _retryRegistration; i++) + { + if (cancellationSource != null && cancellationSource.Token.IsCancellationRequested) + { + LOGGER.Log(Level.Warning, $"WaitForTaskRegistration is canceled in retryCount {i}."); + throw new OperationCanceledException("WaitForTaskRegistration is canceled"); + } + + LOGGER.Log(Level.Info, $"WaitForTaskRegistration, in retryCount {i}."); + foreach (var identifier in identifiers) + { + var notFound = !foundList.Contains(identifier); + if (notFound && removed.ContainsKey(identifier)) + { + foundList.Add(identifier); + LOGGER.Log(Level.Verbose, $"WaitForTaskRegistration, dependent id {identifier} was removed at loop {i}."); + } + else if (notFound && Lookup(identifier)) + { + foundList.Add(identifier); + LOGGER.Log(Level.Verbose, $"WaitForTaskRegistration, find a dependent id {identifier} at loop {i}."); + } + } + + if (foundList.Count >= identifiers.Count) + { + LOGGER.Log(Level.Info, $"WaitForTaskRegistration, found all {foundList.Count} dependent ids at loop {i}."); + return; + } + + Thread.Sleep(_sleepTime); + } + + ICollection leftovers = foundList.Count == 0 ? identifiers : identifiers.Where(e => !foundList.Contains(e)).ToList(); + var msg = string.Join(",", leftovers); + + LOGGER.Log(Level.Error, "Cannot find registered parent/children: {0}.", msg); + throw new Exception("Failed to find parent/children nodes"); + } + + /// + /// Look up an identifier with the name server. 
+ /// + /// The identifier to look up + /// + public bool Lookup(string identifier) + { + if (_disposed || _networkService == null) + { + return false; + } + return _networkService.NamingClient.Lookup(identifier) != null; + } + + /// + /// Remove the connection to the target destination. + /// + /// The node to remove the connection + public void RemoveConnection(string destination) + { + IIdentifier destId = _idFactory.Create(destination); + _networkService.RemoveConnection(destId); + } + + public void OnError(Exception error) + { + } + + public void OnCompleted() + { + foreach (var observer in _groupMessageObservers.Values) + { + observer.OnCompleted(); + } + } + + /// + /// Dispose the connection layer. + /// + public void Dispose() + { + if (!_disposed) + { + OnCompleted(); + + _groupMessageObservers.Clear(); + + _communicationObserver.Dispose(); + + _disposed = true; + + LOGGER.Log(Level.Info, "Communication layer disposed."); + } + } + + private bool Send(IIdentifier destId, ElasticGroupCommunicationMessage message) + { + var connection = _networkService.NewConnection(destId); + + try + { + if (!connection.IsOpen) + { + connection.Open(); + } + + connection.Write(message); + LOGGER.Log(Level.Verbose, $"message sent to {destId}"); + } + catch (Exception e) + { + LOGGER.Log(Level.Warning, "Unable to send message " + e.Message); + connection.Dispose(); + return false; + } + + return true; + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs new file mode 100644 index 0000000000..4d3b413669 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System.Collections.Generic; +using Org.Apache.REEF.Network.NetworkService; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Wake.Remote; +using Org.Apache.REEF.Wake; +using Org.Apache.REEF.Network.Elastic.Config; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Network.Elastic.Comm.Impl; +using Org.Apache.REEF.Network.Elastic.Topology.Physical; + +namespace Org.Apache.REEF.Network.Elastic.Task.Impl +{ + /// + /// Implementation of the communication layer with default task to driver messages. + /// + internal sealed class DefaultCommunicationLayer : + CommunicationLayer, + IDefaultTaskToDriverMessages + { + private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultCommunicationLayer)); + + /// + /// Creates a new communication layer. 
+ /// + [Inject] + private DefaultCommunicationLayer( + [Parameter(typeof(GroupCommunicationConfigurationOptions.Timeout))] int timeout, + [Parameter(typeof(GroupCommunicationConfigurationOptions.RetryCountWaitingForRegistration))] int retryRegistration, + [Parameter(typeof(GroupCommunicationConfigurationOptions.SleepTimeWaitingForRegistration))] int sleepTime, + [Parameter(typeof(ElasticServiceConfigurationOptions.SendRetry))] int retrySending, + StreamingNetworkService networkService, + DefaultTaskToDriverMessageDispatcher taskToDriverDispatcher, + ElasticDriverMessageHandler driverMessagesHandler, + IIdentifierFactory idFactory) : base( + timeout, + retryRegistration, + sleepTime, + retrySending, + networkService, + taskToDriverDispatcher, + driverMessagesHandler, + idFactory) + { + } + + /// + /// Forward the received message to the target . + /// + /// The received message + public override void OnNext(IRemoteMessage> remoteMessage) + { + if (_disposed) + { + LOGGER.Log(Level.Warning, "Received message after disposing: Ignoring."); + return; + } + + var nsMessage = remoteMessage.Message; + var gcm = nsMessage.Data; + var gcMessageTaskSource = nsMessage.SourceId.ToString(); + + // Data message + var id = NodeObserverIdentifier.FromMessage(gcm); + IOperatorTopologyWithCommunication operatorObserver; + + if (!_groupMessageObservers.TryGetValue(id, out operatorObserver)) + { + throw new KeyNotFoundException($"Unable to find registered operator topology for stage {gcm.StageName} operator {gcm.OperatorId}"); + } + + operatorObserver.OnNext(nsMessage); + } + + /// + /// Notify the driver that operator is ready to join the + /// group communication topology. 
+ /// + /// The current task + /// The identifier of the operator ready to join the topology + public void JoinTopology(string taskId, string stageName, int operatorId) + { + _taskToDriverDispatcher.JoinTopology(taskId, stageName, operatorId); + } + + /// + /// Send a notification to the driver for an update on topology state. + /// + /// The current task id + /// The operator requiring the topology update + public void TopologyUpdateRequest(string taskId, string stageName, int operatorId) + { + _taskToDriverDispatcher.TopologyUpdateRequest(taskId, stageName, operatorId); + } + + /// + /// Signal the driver that the current stage is completed. + /// + /// The current task identifier + public void StageComplete(string taskId, string stageName) + { + _taskToDriverDispatcher.StageComplete(taskId, stageName); + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultTaskToDriverMessageDispatcher.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultTaskToDriverMessageDispatcher.cs new file mode 100644 index 0000000000..0d89b65fd4 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultTaskToDriverMessageDispatcher.cs @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Tang.Annotations; +using System; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Network.Elastic.Comm.Enum; +using Org.Apache.REEF.Utilities; +using Org.Apache.REEF.Tang.Interface; + +namespace Org.Apache.REEF.Network.Elastic.Task.Impl +{ + /// + /// Implementation of with default + /// messages dispatcher. + /// + internal sealed class DefaultTaskToDriverMessageDispatcher : TaskToDriverMessageDispatcher, IDefaultTaskToDriverMessages + { + private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultTaskToDriverMessageDispatcher)); + + /// + /// Injectable constructor. + /// + /// + [Inject] + private DefaultTaskToDriverMessageDispatcher(IInjector injector) : base(injector) + { + } + + /// + /// Notify the driver that operator is ready to join the + /// group communication topology. + /// + /// The current task + /// The identifier of the operator ready to join the topology + public void JoinTopology(string taskId, string stageName, int operatorId) + { + int offset = 0; + byte[] message = new byte[sizeof(ushort) + stageName.Length + sizeof(ushort) + sizeof(ushort)]; + Buffer.BlockCopy(BitConverter.GetBytes(stageName.Length), 0, message, offset, sizeof(ushort)); + offset += sizeof(ushort); + Buffer.BlockCopy(ByteUtilities.StringToByteArrays(stageName), 0, message, offset, stageName.Length); + offset += stageName.Length; + Buffer.BlockCopy(BitConverter.GetBytes((ushort)TaskMessageType.JoinTopology), 0, message, offset, sizeof(ushort)); + offset += sizeof(ushort); + Buffer.BlockCopy(BitConverter.GetBytes((ushort)operatorId), 0, message, offset, sizeof(ushort)); + + LOGGER.Log(Level.Info, $"Operator {operatorId} requesting to join the topology through heartbeat."); + + Send(taskId, message); + } + + /// + /// Send a notification to the driver for an update on topology state.
+ /// + /// The current task id + /// The operator requiring the topology update + public void TopologyUpdateRequest(string taskId, string stageName, int operatorId) + { + int offset = 0; + byte[] message = new byte[sizeof(ushort) + stageName.Length + sizeof(ushort) + sizeof(ushort)]; + Buffer.BlockCopy(BitConverter.GetBytes(stageName.Length), 0, message, offset, sizeof(ushort)); + offset += sizeof(ushort); + Buffer.BlockCopy(ByteUtilities.StringToByteArrays(stageName), 0, message, offset, stageName.Length); + offset += stageName.Length; + Buffer.BlockCopy(BitConverter.GetBytes((ushort)TaskMessageType.TopologyUpdateRequest), 0, message, offset, sizeof(ushort)); + offset += sizeof(ushort); + Buffer.BlockCopy(BitConverter.GetBytes((ushort)operatorId), 0, message, offset, sizeof(ushort)); + + LOGGER.Log(Level.Info, string.Format("Operator {0} requesting a topology update through heartbeat", operatorId)); + + Send(taskId, message); + } + + /// + /// Signal the driver that the current stage is completed. 
+ /// + /// The current task identifier + public void StageComplete(string taskId, string stageName) + { + int offset = 0; + byte[] message = new byte[sizeof(ushort) + stageName.Length + sizeof(ushort)]; + Buffer.BlockCopy(BitConverter.GetBytes(stageName.Length), 0, message, offset, sizeof(ushort)); + offset += sizeof(ushort); + Buffer.BlockCopy(ByteUtilities.StringToByteArrays(stageName), 0, message, offset, stageName.Length); + offset += stageName.Length; + Buffer.BlockCopy(BitConverter.GetBytes((ushort)TaskMessageType.CompleteStage), 0, message, offset, sizeof(ushort)); + + LOGGER.Log(Level.Info, "Sending notification that the stage is completed."); + + Send(taskId, message); + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/IDefaultTaskToDrivermessages.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/IDefaultTaskToDrivermessages.cs new file mode 100644 index 0000000000..239cbede79 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/IDefaultTaskToDrivermessages.cs @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +namespace Org.Apache.REEF.Network.Elastic.Task +{ + /// + /// Interface defining the messages supported in tasks to driver communications. + /// + internal interface IDefaultTaskToDriverMessages + { + /// + /// Notify the driver that operator is ready to join the + /// group communication topology. + /// + /// The current task + /// The identifier of the operator ready to join the topology + void JoinTopology(string taskId, string stageName, int operatorId); + + /// + /// Send a notification to the driver for an update on topology state. + /// + /// The current task id + /// The operator requiring the topology update + void TopologyUpdateRequest(string taskId, string stageName, int operatorId); + + /// + /// Signal the driver that the current stage is completed. + /// + /// The current task identifier + void StageComplete(string taskId, string stageName); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs index cb272d5c57..1324ab492b 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs @@ -17,14 +17,13 @@ using Org.Apache.REEF.Network.Elastic.Comm.Impl; using Org.Apache.REEF.Network.Elastic.Topology.Physical; -using Org.Apache.REEF.Network.Elastic.Topology.Physical.Default; -namespace Org.Apache.REEF.Network.Elastic.Task +namespace Org.Apache.REEF.Network.Elastic.Task.Impl { /// - /// An identifier for a given node in the group communication graph. + /// An identifier for a given node in the group communication topology. /// A node is uniquely identifiable by a combination of its - /// , and . + /// , and . /// internal sealed class NodeObserverIdentifier { @@ -32,15 +31,15 @@ internal sealed class NodeObserverIdentifier private readonly int _operatorId; /// - /// Creates a NodeObserverIdentifier from an observer. 
+ /// Creates an identifier from an operator topology with communication. /// - public static NodeObserverIdentifier FromObserver(OperatorTopologyWithDefaultCommunication observer) + public static NodeObserverIdentifier FromObserver(IOperatorTopologyWithCommunication observer) { return new NodeObserverIdentifier(observer.StageName, observer.OperatorId); } /// - /// Creates a NodeObserverIdentifier from an observer. + /// Creates an from a driver aware topology. /// public static NodeObserverIdentifier FromObserver(DriverAwareOperatorTopology observer) { @@ -48,7 +47,7 @@ public static NodeObserverIdentifier FromObserver(DriverAwareOperatorTopology ob } /// - /// Creates a NodeObserverIdentifier from a group communication message. + /// Creates an identifier from a group communication message. /// public static NodeObserverIdentifier FromMessage(ElasticGroupCommunicationMessage message) { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/TaskToDriverMessageDispatcher.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/TaskToDriverMessageDispatcher.cs new file mode 100644 index 0000000000..7a27fc6ea8 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/TaskToDriverMessageDispatcher.cs @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Common.Runtime.Evaluator; +using Org.Apache.REEF.Common.Protobuf.ReefProtocol; +using Org.Apache.REEF.Tang.Interface; + +namespace Org.Apache.REEF.Network.Elastic.Task.Impl +{ + /// + /// Class used to manage messages going from tasks to the driver. + /// Messages are delivered through the heartbeat. + /// + internal abstract class TaskToDriverMessageDispatcher + { + private readonly IHeartBeatManager _heartBeatManager; + + /// + /// Constructor. + /// + /// Injector used to resolve the heartbeat manager + protected TaskToDriverMessageDispatcher(IInjector subInjector) + { + _heartBeatManager = subInjector.GetInstance(); + } + + /// + /// Send a serialized message to the driver. + /// + /// The id of the task sending the message + /// The serialized message to send + protected void Send(string taskId, byte[] message) + { + TaskStatusProto taskStatusProto = new TaskStatusProto() + { + task_id = taskId, + context_id = Utils.GetContextIdFromTaskId(taskId) + }; + + TaskStatusProto.TaskMessageProto taskMessageProto = new TaskStatusProto.TaskMessageProto() + { + source_id = taskId, + message = message, + }; + + taskStatusProto.task_message.Add(taskMessageProto); + + Heartbeat(taskStatusProto); + } + + private void Heartbeat(TaskStatusProto proto) + { + var state = _heartBeatManager.ContextManager.GetTaskStatus(); + + if (state.IsPresent()) + { + proto.state = state.Value.state; + } + + _heartBeatManager.OnNext(proto); + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs index d7d4dbe71e..791d23261a 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs +++
b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs @@ -47,7 +47,7 @@ private DefaultBroadcastTopology( [Parameter(typeof(GroupCommunicationConfigurationOptions.Retry))] int retry, [Parameter(typeof(GroupCommunicationConfigurationOptions.Timeout))] int timeout, [Parameter(typeof(GroupCommunicationConfigurationOptions.DisposeTimeout))] int disposeTimeout, - CommunicationLayer commLayer) : base( + DefaultCommunicationLayer commLayer) : base( stageName, taskId, Utils.BuildTaskId(stageName, rootId), diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs index 5b9acb932d..0aa9aff30a 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs @@ -68,7 +68,7 @@ public OneToNTopology( int retry, int timeout, int disposeTimeout, - CommunicationLayer commLayer) : base(stageName, taskId, rootTaskId, operatorId, commLayer, retry, timeout, disposeTimeout) + DefaultCommunicationLayer commLayer) : base(stageName, taskId, rootTaskId, operatorId, commLayer, retry, timeout, disposeTimeout) { _nodesToRemove = new ConcurrentDictionary(); _topologyUpdateReceived = new ManualResetEvent(RootTaskId == taskId ? 
false : true); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs index 1b4b72ea24..dac4b347d0 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs @@ -35,13 +35,11 @@ namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Default [Unstable("0.16", "API may change")] internal abstract class OperatorTopologyWithDefaultCommunication : DriverAwareOperatorTopology, - IWaitForTaskRegistration, - IDisposable, - IObserver> + IOperatorTopologyWithCommunication { protected bool _initialized; - protected CommunicationLayer _commLayer; + protected DefaultCommunicationLayer _commLayer; protected readonly int _disposeTimeout; protected readonly int _timeout; @@ -68,7 +66,7 @@ public OperatorTopologyWithDefaultCommunication( string taskId, string rootTaskId, int operatorId, - CommunicationLayer commLayer, + DefaultCommunicationLayer commLayer, int retry, int timeout, int disposeTimeout) : base(stageName, taskId, rootTaskId, operatorId) @@ -95,7 +93,7 @@ public void StageComplete() { if (TaskId == RootTaskId) { - _commLayer.StageComplete(TaskId); + _commLayer.StageComplete(TaskId, StageName); } } @@ -104,7 +102,7 @@ public void StageComplete() /// public void TopologyUpdateRequest() { - _commLayer.TopologyUpdateRequest(TaskId, OperatorId); + _commLayer.TopologyUpdateRequest(TaskId, StageName, OperatorId); } /// @@ -126,7 +124,7 @@ public override void WaitCompletionBeforeDisposing() /// public virtual void JoinTopology() { - _commLayer.JoinTopology(TaskId, OperatorId); + _commLayer.JoinTopology(TaskId, StageName, OperatorId); } /// @@ -172,8 +170,6 @@ public virtual ElasticGroupCommunicationMessage 
Receive(CancellationTokenSource { throw new Exception($"Failed to receive message after {_retry} try."); } - - _commLayer.NextDataRequest(TaskId, -1); } return message; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs new file mode 100644 index 0000000000..0b5635b85f --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Comm.Impl; +using Org.Apache.REEF.Network.Elastic.Task; +using Org.Apache.REEF.Network.NetworkService; +using System; + +namespace Org.Apache.REEF.Network.Elastic.Topology.Physical +{ + /// + /// Base interface for topologies where nodes communicate betwen themselves. + /// + internal interface IOperatorTopologyWithCommunication : + IWaitForTaskRegistration, + IDisposable, + IObserver> + { + /// + /// The stage name context in which the topology is running. + /// + string StageName { get; } + + /// + /// The identifier of the operator in which the topology is running. 
+ /// + int OperatorId { get; } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/NetworkService/IConnection.cs b/lang/cs/Org.Apache.REEF.Network/NetworkService/IConnection.cs index 70f814e827..f90df23360 100644 --- a/lang/cs/Org.Apache.REEF.Network/NetworkService/IConnection.cs +++ b/lang/cs/Org.Apache.REEF.Network/NetworkService/IConnection.cs @@ -24,6 +24,11 @@ namespace Org.Apache.REEF.Network.NetworkService /// public interface IConnection : IDisposable { + /// + /// Whether the connection is open or not. + /// + bool IsOpen { get; } + /// /// Opens the connection /// diff --git a/lang/cs/Org.Apache.REEF.Network/NetworkService/NsConnection.cs b/lang/cs/Org.Apache.REEF.Network/NetworkService/NsConnection.cs index 6355ed5f91..fc99f35e54 100644 --- a/lang/cs/Org.Apache.REEF.Network/NetworkService/NsConnection.cs +++ b/lang/cs/Org.Apache.REEF.Network/NetworkService/NsConnection.cs @@ -40,7 +40,6 @@ public class NsConnection : IConnection private readonly IIdentifier _destId; private readonly INameClient _nameClient; private readonly IRemoteManager> _remoteManager; - private readonly Dictionary> _connectionMap; private IObserver> _remoteSender; /// @@ -63,9 +62,14 @@ public NsConnection( _destId = destId; _nameClient = nameClient; _remoteManager = remoteManager; - _connectionMap = connectionMap; + IsOpen = false; } + /// + /// Whether the connection is open or not. + /// + public bool IsOpen { get; private set; } + /// /// Opens the connection to the remote host. 
/// @@ -83,6 +87,7 @@ public void Open() try { _remoteSender = _remoteManager.GetRemoteObserver(destAddr); + IsOpen = true; LOGGER.Log(Level.Verbose, "Network service completed connection to {0}.", destStr); } catch (SocketException) @@ -129,7 +134,12 @@ public void Write(T message) /// public void Dispose() { - _connectionMap.Remove(_destId); + if (_remoteSender != null) + { + IsOpen = false; + var disposable = _remoteSender as IDisposable; + disposable?.Dispose(); // `as` yields null when the sender is not IDisposable; nothing to release then + } } } } diff --git a/lang/cs/Org.Apache.REEF.Network/NetworkService/StreamingNetworkService.cs b/lang/cs/Org.Apache.REEF.Network/NetworkService/StreamingNetworkService.cs index a34e8cb7ea..ab20cb826b 100644 --- a/lang/cs/Org.Apache.REEF.Network/NetworkService/StreamingNetworkService.cs +++ b/lang/cs/Org.Apache.REEF.Network/NetworkService/StreamingNetworkService.cs @@ -162,6 +162,20 @@ public IConnection NewConnection(IIdentifier destinationId) } } + /// + /// Remove the connection to the destination node from the connection map. + /// + /// The id of the node to disconnect + public void RemoveConnection(IIdentifier destinationId) + { + IConnection connection; + if (_connectionMap.TryGetValue(destinationId, out connection)) + { + connection.Dispose(); + _connectionMap.Remove(destinationId); + } + } + /// /// Register the identifier for the NetworkService with the NameService. /// diff --git a/pom.xml b/pom.xml index 3d185e8ef8..ac2db38e5f 100644 --- a/pom.xml +++ b/pom.xml @@ -60,7 +60,7 @@ under the License.
2.17 1.20.0 6.17 - 3.0.2 + 3.0.5 0.9.9-RC1 3.0.1 3.0.3 From d79e997e8190bc63e4eec9fc690eaadf54c70437 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Tue, 1 Jan 2019 18:24:58 -0800 Subject: [PATCH 06/29] Done with the task side --- .../Elastic/Task/CommunicationLayer.cs | 2 +- .../Task/Default/DefaultElasticContext.cs | 157 ++++++++++ .../Task/Default/DefaultElasticStage.cs | 141 +++++++++ .../Elastic/Task/IElasticContext.cs | 41 +++ .../Elastic/Task/IElasticStage.cs | 45 +++ .../Elastic/Task/Workflow.cs | 286 ++++++++++++++++++ 6 files changed, 671 insertions(+), 1 deletion(-) create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticContext.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticContext.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticStage.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs index 3ae9f1aeff..fa2652c7a3 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs @@ -126,7 +126,7 @@ internal void Send(string destination, ElasticGroupCommunicationMessage message, } if (string.IsNullOrEmpty(destination)) { - throw new ArgumentException("Message destination cannot be null or empty."); + throw new ArgumentNullException(nameof(destination)); } if (_disposed) { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticContext.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticContext.cs new file mode 100644 index 0000000000..76a4c20452 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticContext.cs @@ -0,0 +1,157 @@ +// 
Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using System.Collections.Generic; +using System.Threading; +using Org.Apache.REEF.Common.Tasks; +using Org.Apache.REEF.Network.NetworkService; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Formats; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Wake.Remote.Impl; +using Org.Apache.REEF.Network.Elastic.Config; +using Org.Apache.REEF.Network.Elastic.Comm.Impl; +using Org.Apache.REEF.Common.Tasks.Events; + +namespace Org.Apache.REEF.Network.Elastic.Task.Impl +{ + /// + /// Default implementation of the task-side context. + /// Used by REEF tasks to initialize group communication and fetch stages. + /// + internal sealed class DefaultElasticContext : IElasticContext + { + private readonly Dictionary _stages; + private readonly string _taskId; + + private readonly INetworkService _networkService; + + private readonly object _lock; + private bool _disposed; + + /// + /// Creates a new elastic context and registers the task id with the Name Server. 
+ /// + /// The set of serialized stages configurations + /// The identifier for this task + /// The writable network service used to send messages + /// Used to deserialize service configuration + /// Dependency injector + [Inject] + public DefaultElasticContext( + [Parameter(typeof(ElasticServiceConfigurationOptions.SerializedStageConfigs))] ISet stageConfigs, + [Parameter(typeof(TaskConfigurationOptions.Identifier))] string taskId, + StreamingNetworkService networkService, + AvroConfigurationSerializer configSerializer, + DefaultTaskToDriverMessageDispatcher taskToDriverDispatcher, // Otherwise the correct instance does not propagate through + ElasticDriverMessageHandler driverMessageHandler, + IInjector injector) + { + _stages = new Dictionary(); + _networkService = networkService; + _taskId = taskId; + + _disposed = false; + _lock = new object(); + + foreach (string serializedGroupConfig in stageConfigs) + { + IConfiguration stageConfig = configSerializer.FromString(serializedGroupConfig); + IInjector subInjector = injector.ForkInjector(stageConfig); + + var stageClient = subInjector.GetInstance(); + + _stages[stageClient.StageName] = stageClient; + } + + _networkService.Register(new StringIdentifier(_taskId)); + } + + /// + /// This is to ensure all the nodes in the groups are registered before starting communications. + /// + /// The token used to signal if the operation got cancelled + public void WaitForTaskRegistration(CancellationTokenSource cancellationSource = null) + { + foreach (var stage in _stages.Values) + { + stage.WaitForTaskRegistration(cancellationSource); + } + } + + /// + /// Gets the stage object for the given stage name. 
+ /// + /// The name of the stage + /// The task-side stage object + public IElasticStage GetStage(string stagepName) + { + if (string.IsNullOrEmpty(stagepName)) + { + throw new ArgumentNullException("stagepName"); + } + if (!_stages.ContainsKey(stagepName)) + { + throw new ArgumentException("No stage with name: " + stagepName); + } + + return _stages[stagepName]; + } + + /// + /// Disposes the services. + /// + public void Dispose() + { + lock (_lock) + { + if (!_disposed) + { + foreach (var sub in _stages.Values) + { + sub.Dispose(); + } + + _networkService.Unregister(); + + _disposed = true; + } + } + } + + /// + /// Action to trigger in case a is received. + /// + /// The close event + public void OnNext(ICloseEvent value) + { + foreach (var stage in _stages.Values) + { + stage.Cancel(); + } + } + + public void OnError(Exception error) + { + } + + public void OnCompleted() + { + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs new file mode 100644 index 0000000000..27fce06912 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using System.Threading; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Network.Elastic.Config; +using System.Collections.Generic; +using Org.Apache.REEF.Tang.Formats; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Network.Elastic.Operators.Physical; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Network.Elastic.Task.Impl; + +namespace Org.Apache.REEF.Network.Elastic.Task +{ + /// + /// Default implementation of the task-side stage. + /// + internal sealed class DefaultElasticStage : IElasticStage + { + private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultElasticStage)); + + private readonly CancellationSource _cancellationSource; + + private readonly object _lock; + private bool _disposed; + + /// + /// Injectable constructor. + /// + [Inject] + private DefaultElasticStage( + [Parameter(typeof(OperatorParameters.StageName))] string stageName, + [Parameter(typeof(OperatorParameters.SerializedOperatorConfigs))] IList operatorConfigs, + [Parameter(typeof(OperatorParameters.StartIteration))] int startIteration, + AvroConfigurationSerializer configSerializer, + Workflow workflow, + CommunicationLayer commLayer, + CancellationSource cancellationSource, + IInjector injector) + { + StageName = stageName; + Workflow = workflow; + + _cancellationSource = cancellationSource; + _disposed = false; + _lock = new object(); + + foreach (string operatorConfigStr in operatorConfigs) + { + IConfiguration operatorConfig = configSerializer.FromString(operatorConfigStr); + + IInjector operatorInjector = injector.ForkInjector(operatorConfig); + string msgType = operatorInjector.GetNamedInstance( + GenericType.Class); + + Type groupCommOperatorGenericInterface = typeof(IElasticTypedOperator<>); + Type groupCommOperatorInterface = 
groupCommOperatorGenericInterface.MakeGenericType(Type.GetType(msgType)); + var operatorObj = operatorInjector.GetInstance(groupCommOperatorInterface); + + Workflow.Add(operatorObj as IElasticOperator); + } + } + + /// + /// The stage name. + /// + public string StageName { get; private set; } + + /// + /// The workflow of the stage. + /// + public Workflow Workflow { get; private set; } + + /// + /// Initializes the communication group. + /// Computation blocks until all required tasks are registered in the group. + /// + /// The signal to cancel the operation; the stage's own cancellation source is used when null + public void WaitForTaskRegistration(CancellationTokenSource cancellationSource = null) + { + try + { + Workflow.WaitForTaskRegistration(cancellationSource ?? _cancellationSource.Source); + } + catch (OperationCanceledException) + { + LOGGER.Log(Level.Error, $"Stage {StageName} failed during registration."); + throw; // rethrow without resetting the original stack trace + } + } + + /// + /// Dispose the stage. + /// + public void Dispose() + { + lock (_lock) + { + if (!_disposed) + { + if (Workflow != null) + { + Workflow.Dispose(); + } + + _disposed = true; + } + } + } + + /// + /// Cancel the execution of the stage. + /// + public void Cancel() + { + if (!_cancellationSource.IsCancelled) + { + _cancellationSource.Cancel(); + + LOGGER.Log(Level.Info, "Received request to close stage {0}", StageName); + } + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticContext.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticContext.cs new file mode 100644 index 0000000000..7677029e75 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticContext.cs @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Network.Elastic.Task.Impl; +using Org.Apache.REEF.Common.Tasks.Events; + +namespace Org.Apache.REEF.Network.Elastic.Task +{ + /// + /// Used by REEF tasks to initialize group communication and fetch Stages. + /// + [DefaultImplementation(typeof(DefaultElasticContext))] + public interface IElasticContext : + IWaitForTaskRegistration, + IDisposable, + IObserver + { + /// + /// Gets the stage with the given name. + /// + /// The name of the stage + /// The task-side configured stage + IElasticStage GetStage(string stageName); + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticStage.cs new file mode 100644 index 0000000000..c7f72507bb --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticStage.cs @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Task.Impl; +using Org.Apache.REEF.Tang.Annotations; +using System; + +namespace Org.Apache.REEF.Network.Elastic.Task +{ + /// + /// Used by tasks to fetch the workflow of the stages configured in the driver. + /// + //[DefaultImplementation(typeof(DefaultTaskSetStage))] + public interface IElasticStage : IWaitForTaskRegistration, IDisposable + { + /// + /// The name of the stage. + /// + string StageName { get; } + + /// + /// Cancel the execution of the stage. + /// + void Cancel(); + + /// + /// The workflow of operators. + /// + Workflow Workflow { get; } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs new file mode 100644 index 0000000000..816c1392b7 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs @@ -0,0 +1,286 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Network.Elastic.Operators; +using Org.Apache.REEF.Network.Elastic.Operators.Physical; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Utilities.Logging; +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using System.Threading; + +namespace Org.Apache.REEF.Network.Elastic.Task.Impl +{ + /// + /// Task-side representation of the sequence of group communication operations to execute. + /// Exceptions raised during execution are managed by the framework and recovered through the user-defined + /// policies / mechanisms. + /// + public sealed class Workflow : IEnumerator + { + private static readonly Logger LOGGER = Logger.GetLogger(typeof(Workflow)); + + private int _position = -1; + private bool _failed; + private bool _disposed; + private List _iteratorsPosition; + + private readonly object _lock; + private readonly IList _operators; + private readonly CancellationSource _cancellationSource; + + /// + /// Injectable constructor. + /// + /// + [Inject] + private Workflow(CancellationSource cancellationSource) + { + _operators = new List(); + _failed = false; + _disposed = false; + _lock = new object(); + _iteratorsPosition = new List(); + _cancellationSource = cancellationSource; + } + + /// + /// The current iteration value.
+ /// + public object Iteration + { + get + { + if (_iteratorsPosition.Count == 0) + { + return 0; + } + else + { + var iterPos = _iteratorsPosition[0]; + var iterator = _operators[iterPos] as IElasticIterator; + return iterator.Current; + } + } + } + + /// + /// Try to move to the next operation in the workflow. + /// + /// + public bool MoveNext() + { + _position++; + + if (_failed || _cancellationSource.IsCancelled) + { + return false; + } + + // Check if we need to iterate + if (_iteratorsPosition.Count > 0 && _position == _iteratorsPosition[0]) + { + var iteratorOperator = _operators[_position] as IElasticIterator; + + if (iteratorOperator.MoveNext()) + { + _position++; + ResetOperatorPositions(); + + return true; + } + else + { + if (_iteratorsPosition.Count > 1) + { + _iteratorsPosition.RemoveAt(0); + _position = _iteratorsPosition[0] - 1; + } + + return false; + } + } + + // In case we have one or zero iterators (or we are at the last iterator when multiple iterators exist) + if (_position >= _operators.Count || (_iteratorsPosition.Count > 1 && _position == _iteratorsPosition[1])) + { + if (_iteratorsPosition.Count == 0) + { + return false; + } + else + { + _position = _iteratorsPosition[0] - 1; + + return MoveNext(); + } + } + + return true; + } + + /// + /// Method used to make the framework aware that an exception has been thrown during execution. + /// + /// The raised exception + public void Throw(Exception e) + { + if (_cancellationSource.IsCancelled) + { + LOGGER.Log(Level.Warning, "Workflow captured an exception while cancellation source was true.", e); + } + else + { + LOGGER.Log(Level.Error, "Workflow captured an exception.", e); + _failed = true; + + throw new OperatorException( + "Workflow captured an exception", Current.OperatorId, e, Current.FailureInfo); + } + } + + /// + /// Start the execution of the workflow from the first operator / iterator.
+ /// + public void Reset() + { + if (_iteratorsPosition.Count > 0) + { + _position = _iteratorsPosition[0]; + } + else + { + _position = 0; + } + } + + /// + /// Get the current elastic operator. + /// + public IElasticOperator Current + { + get + { + return _position == -1 ? _operators[0] : _operators[_position]; + } + } + + object IEnumerator.Current + { + get { return Current; } + } + + /// + /// Dispose the workflow. + /// + public void Dispose() + { + lock (_lock) + { + if (!_disposed) + { + if (_operators != null) + { + // Clean dispose, check that the computation is completed + if (_failed == false) + { + foreach (var op in _operators) + { + if (op != null) + { + op.WaitCompletionBeforeDisposing(); + } + } + } + + foreach (var op in _operators) + { + if (op != null) + { + var disposableOperator = op as IDisposable; + + disposableOperator?.Dispose(); // `as` yields null for operators that are not IDisposable + } + } + } + + _disposed = true; + } + } + } + + /// + /// Add an elastic operator to the workflow. + /// + /// + internal void Add(IElasticOperator op) + { + op.CancellationSource = _cancellationSource.Source; + + _operators.Add(op); + + if (_iteratorsPosition.Count > 0) + { + var iterPos = _iteratorsPosition.Last(); + var iterator = _operators[iterPos] as IElasticIterator; + + op.IteratorReference = iterator; + iterator.RegisterActionOnTaskRescheduled(op.OnTaskRescheduled); + } + + if (op.OperatorName == Constants.Iterate) + { + _iteratorsPosition.Add(_operators.Count - 1); + } + } + + /// + /// Initializes the communication group. + /// Computation blocks until all required tasks are registered in the group. + /// + /// The signal to cancel the operation + internal void WaitForTaskRegistration(CancellationTokenSource cancellationSource = null) + { + try + { + foreach (var op in _operators) + { + op.WaitForTaskRegistration(cancellationSource); + } + } + catch (OperationCanceledException) + { + throw; // propagate the cancellation with its original stack trace + } + } + + /// + /// Reset the position tracker for all operators in the workflow.
+ /// + private void ResetOperatorPositions() + { + for (int pos = _position; pos < _operators.Count; pos++) + { + _operators[pos].ResetPosition(); + } + } + } +} From 1ce68544404dacc1b56c4708f881822d05347bf9 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Wed, 2 Jan 2019 13:51:40 -0800 Subject: [PATCH 07/29] Added boradcast example plus few bug fixes --- .../ElasticBroadcastClient.cs | 112 +++++++++++++++ .../Run.cs | 12 +- .../Elastic/BroadcastMasterTask.cs | 59 ++++++++ .../Elastic/BroadcastSlaveTask.cs | 55 +++++++ .../Elastic/DefaultElasticDriver.cs | 135 ++++++++++++++++++ .../Elastic/DefaultElasticTask.cs | 88 ++++++++++++ .../Elastic/ElasticBroadcastDriver.cs | 72 ++++++++++ .../Org.Apache.REEF.Network.Examples.csproj | 2 +- .../Driver/Default/DefaultElasticStage.cs | 14 +- .../Default/DefaultElasticTaskSetManager.cs | 3 +- .../Elastic/Driver/IElasticStage.cs | 2 +- .../Logical/Default/DefaultBroadcast.cs | 21 ++- .../Operators/Logical/ElasticOperator.cs | 5 +- .../Elastic/Task/CancellationSource.cs | 4 +- .../Task/Default/DefaultElasticContext.cs | 2 + .../Task/Default/DefaultElasticStage.cs | 2 +- .../Elastic/Task/IElasticStage.cs | 2 +- .../Topology/Logical/Impl/FlatTopology.cs | 4 +- .../NetworkService/StreamingNetworkService.cs | 3 +- .../Formats/AvroConfigurationSerializer.cs | 44 ++++-- .../Formats/ConfigurationFile.cs | 91 +++++++----- .../Configuration/ConfigurationBuilderImpl.cs | 82 +++++++---- .../CsConfigurationBuilderImpl.cs | 18 ++- 23 files changed, 725 insertions(+), 107 deletions(-) create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/DefaultElasticDriver.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/DefaultElasticTask.cs 
create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs new file mode 100644 index 0000000000..ec50f09278 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+using System;
+using System.Globalization;
+using System.IO;
+using Org.Apache.REEF.Driver;
+using Org.Apache.REEF.Tang.Implementations.Configuration;
+using Org.Apache.REEF.Tang.Implementations.Tang;
+using Org.Apache.REEF.Tang.Interface;
+using Org.Apache.REEF.Tang.Util;
+using Org.Apache.REEF.Utilities.Logging;
+using Org.Apache.REEF.Client.API;
+using Org.Apache.REEF.Client.Local;
+using Org.Apache.REEF.Client.Yarn;
+using Org.Apache.REEF.Network.Elastic.Config;
+using Org.Apache.REEF.Network.Examples.Elastic;
+
+namespace Org.Apache.REEF.Network.Examples.Client
+{
+    // NOTE(review): generic type arguments (on GenericType.Class, BindNamedParameter,
+    // BindStringNamedParam, BindIntNamedParam and GetInstance) appear to have been lost in this
+    // copy of the patch. They are left exactly as found and must be restored from the repository
+    // before this file can compile.
+
+    /// <summary>
+    /// Client that builds the driver configuration for the elastic broadcast example
+    /// and submits the job to the local or the YARN runtime.
+    /// </summary>
+    public class ElasticBroadcastClient
+    {
+        private const string Local = "local";
+        private const string Yarn = "yarn";
+        private const string DefaultRuntimeFolder = "REEF_LOCAL_RUNTIME";
+
+        /// <summary>
+        /// Builds the driver configuration for the elastic broadcast example and submits the job.
+        /// </summary>
+        /// <param name="runOnYarn">True to submit to YARN, false to use the local runtime.</param>
+        /// <param name="numTasks">Number of tasks; also passed on as the number of evaluators.</param>
+        /// <param name="startingPortNo">Bound as a named parameter of the driver configuration.</param>
+        /// <param name="portRange">Bound as a named parameter of the driver configuration.</param>
+        public void RunElasticBroadcast(bool runOnYarn, int numTasks, int startingPortNo, int portRange)
+        {
+            const string driverId = "ElasticBroadcastDriver";
+            const string stage = "Broadcast";
+
+            IConfiguration driverConfig = TangFactory.GetTang().NewConfigurationBuilder(
+                DriverConfiguration.ConfigurationModule
+                    .Set(DriverConfiguration.OnDriverStarted, GenericType.Class)
+                    .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class)
+                    .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class)
+                    .Set(DriverConfiguration.OnContextActive, GenericType.Class)
+                    .Set(DriverConfiguration.OnTaskRunning, GenericType.Class)
+                    .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class)
+                    .Set(DriverConfiguration.OnTaskFailed, GenericType.Class)
+                    .Set(DriverConfiguration.OnTaskMessage, GenericType.Class)
+                    .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString())
+                    .Build())
+                .BindNamedParameter(
+                    GenericType.Class,
+                    numTasks.ToString(CultureInfo.InvariantCulture))
+                .BindNamedParameter(
+                    GenericType.Class,
+                    startingPortNo.ToString(CultureInfo.InvariantCulture))
+                .BindNamedParameter(
+                    GenericType.Class,
+                    portRange.ToString(CultureInfo.InvariantCulture))
+                .Build();
+
+            IConfiguration elasticGroupCommServiceDriverConfig = TangFactory.GetTang().NewConfigurationBuilder()
+                .BindStringNamedParam(driverId)
+                .BindStringNamedParam(stage)
+                .BindIntNamedParam(numTasks.ToString(CultureInfo.InvariantCulture))
+                .Build();
+
+            IConfiguration merged = Configurations.Merge(driverConfig, elasticGroupCommServiceDriverConfig);
+
+            // Reuse the constants GetRuntimeConfiguration switches on, and the driver id declared
+            // above, instead of repeating the same string literals.
+            string runPlatform = runOnYarn ? Yarn : Local;
+            TestRun(merged, typeof(ElasticBroadcastDriver), numTasks, driverId, runPlatform);
+        }
+
+        /// <summary>
+        /// Creates a REEF client for the requested runtime and submits the driver configuration as a job.
+        /// </summary>
+        /// <param name="driverConfig">The driver configuration to submit.</param>
+        /// <param name="globalAssemblyType">Type whose assembly is added to the job as a global assembly.</param>
+        /// <param name="numberOfEvaluator">Number of evaluators configured for the local runtime.</param>
+        /// <param name="jobIdentifier">Identifier set on the job request.</param>
+        /// <param name="runOnYarn">Runtime name: "local" or "yarn".</param>
+        /// <param name="runtimeFolder">Folder (relative to the current directory) used by the local runtime.</param>
+        internal static void TestRun(IConfiguration driverConfig, Type globalAssemblyType, int numberOfEvaluator, string jobIdentifier = "myDriver", string runOnYarn = Local, string runtimeFolder = DefaultRuntimeFolder)
+        {
+            IInjector injector = TangFactory.GetTang().NewInjector(GetRuntimeConfiguration(runOnYarn, numberOfEvaluator, runtimeFolder));
+            var reefClient = injector.GetInstance();
+            var jobRequestBuilder = injector.GetInstance();
+            var jobSubmission = jobRequestBuilder
+                .AddDriverConfiguration(driverConfig)
+                .AddGlobalAssemblyForType(globalAssemblyType)
+                .SetJobIdentifier(jobIdentifier)
+                .Build();
+
+            reefClient.SubmitAndGetJobStatus(jobSubmission);
+        }
+
+        /// <summary>
+        /// Returns the client configuration of the requested runtime.
+        /// </summary>
+        /// <param name="runOnYarn">Runtime name: "local" or "yarn".</param>
+        /// <param name="numberOfEvaluator">Number of evaluators for the local runtime.</param>
+        /// <param name="runtimeFolder">Folder (relative to the current directory) used by the local runtime.</param>
+        /// <exception cref="ArgumentException">The runtime name is neither "local" nor "yarn".</exception>
+        internal static IConfiguration GetRuntimeConfiguration(string runOnYarn, int numberOfEvaluator, string runtimeFolder)
+        {
+            switch (runOnYarn)
+            {
+                case Local:
+                    var dir = Path.Combine(".", runtimeFolder);
+                    return LocalRuntimeClientConfiguration.ConfigurationModule
+                        // Invariant culture, like every other number-to-string conversion in this class.
+                        .Set(LocalRuntimeClientConfiguration.NumberOfEvaluators, numberOfEvaluator.ToString(CultureInfo.InvariantCulture))
+                        .Set(LocalRuntimeClientConfiguration.RuntimeFolder, dir)
+                        .Build();
+                case Yarn:
+                    return YARNClientConfiguration.ConfigurationModule.Build();
+                default:
+                    // ArgumentException derives from Exception, so existing catch blocks keep working.
+                    throw new ArgumentException("Unknown runtime: " + runOnYarn);
+            }
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs
index 6b989f1057..d57c25d168 100644
--- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs
+++
b/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs @@ -26,10 +26,10 @@ public static void Main(string[] args) { Console.WriteLine("start running client: " + DateTime.Now); bool runOnYarn = false; - int numNodes = 9; + int numNodes = 3; int startPort = 8900; int portRange = 1000; - string testToRun = "RunBroadcastAndReduce"; + string testToRun = "ElasticBroadcast"; testToRun = testToRun.ToLower(); if (args != null) @@ -80,7 +80,13 @@ public static void Main(string[] args) { new BroadcastAndReduceClient().RunBroadcastAndReduce(runOnYarn, numNodes, startPort, portRange); Console.WriteLine("RunBroadcastAndReduce completed!!!"); - } + } + + if (testToRun.Equals("ElasticBroadcast".ToLower()) || testToRun.Equals("all")) + { + new ElasticBroadcastClient().RunElasticBroadcast(runOnYarn, numNodes, startPort, portRange); + Console.WriteLine("ElasticRunBroadcast completed!!!"); + } } } } diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs new file mode 100644 index 0000000000..24d5292fd9 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Task.Impl;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Master task of the elastic broadcast example: for every broadcast operator
+    /// in the workflow it sends a freshly drawn random integer.
+    /// </summary>
+    public sealed class BroadcastMasterTask : DefaultElasticTask
+    {
+        [Inject]
+        private BroadcastMasterTask(CancellationSource source, IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+        }
+
+        /// <summary>
+        /// Walks the workflow and sends one random integer per broadcast operator.
+        /// </summary>
+        /// <param name="memento">Not used by this task.</param>
+        /// <param name="workflow">The workflow whose operators are visited in order.</param>
+        /// <exception cref="InvalidOperationException">The workflow contains an operator other than broadcast.</exception>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            var rand = new Random();
+
+            while (workflow.MoveNext())
+            {
+                int number = rand.Next();
+
+                switch (workflow.Current.OperatorName)
+                {
+                    case Constants.Broadcast:
+                        // Direct cast instead of `as`: an operator of the wrong type now fails here with
+                        // an InvalidCastException rather than with a NullReferenceException on Send.
+                        // NOTE(review): the type argument was missing in this copy of the patch; int is
+                        // assumed because an int is passed to Send - confirm against the repository.
+                        var sender = (IElasticBroadcast<int>)workflow.Current;
+
+                        sender.Send(number);
+
+                        Console.WriteLine($"Master has sent {number}");
+                        break;
+                    default:
+                        throw new InvalidOperationException($"Operation {workflow.Current} in workflow not implemented.");
+                }
+            }
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs
new file mode 100644
index 0000000000..901252126f
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Task.Impl;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Slave task of the elastic broadcast example: for every broadcast operator
+    /// in the workflow it receives one value and prints it.
+    /// </summary>
+    public sealed class BroadcastSlaveTask : DefaultElasticTask
+    {
+        [Inject]
+        public BroadcastSlaveTask(CancellationSource source, IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+        }
+
+        /// <summary>
+        /// Walks the workflow and receives one value per broadcast operator.
+        /// </summary>
+        /// <param name="memento">Not used by this task.</param>
+        /// <param name="workflow">The workflow whose operators are visited in order.</param>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            while (workflow.MoveNext())
+            {
+                switch (workflow.Current.OperatorName)
+                {
+                    case Constants.Broadcast:
+                        // Direct cast instead of `as`: an operator of the wrong type now fails here with
+                        // an InvalidCastException rather than with a NullReferenceException on Receive.
+                        // NOTE(review): the type argument was missing in this copy of the patch; int is
+                        // assumed to match the master task, which sends an int - confirm.
+                        var receiver = (IElasticBroadcast<int>)workflow.Current;
+
+                        var rec = receiver.Receive();
+
+                        // The interpolated string is already final. Passing it together with an extra
+                        // argument would make WriteLine treat it as a composite format string.
+                        Console.WriteLine($"Slave has received {rec}");
+                        break;
+                    default:
+                        // Operators other than broadcast are ignored.
+                        break;
+                }
+            }
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/DefaultElasticDriver.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/DefaultElasticDriver.cs
new file mode 100644
index 0000000000..e8d1eaa068
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/DefaultElasticDriver.cs
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using Org.Apache.REEF.Driver; +using Org.Apache.REEF.Driver.Context; +using Org.Apache.REEF.Driver.Evaluator; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Implementations.Configuration; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Network.Elastic.Driver; +using Org.Apache.REEF.Driver.Task; +using Org.Apache.REEF.Common.Context; + +namespace Org.Apache.REEF.Network.Examples.Elastic +{ + /// + /// Default implementation of the elastic driver. 
+    ///
+    public abstract class DefaultElasticDriver :
+        IObserver<IDriverStarted>,
+        IObserver<IAllocatedEvaluator>,
+        IObserver<IActiveContext>,
+        IObserver<IRunningTask>,
+        IObserver<ICompletedTask>,
+        IObserver<IFailedEvaluator>,
+        IObserver<IFailedTask>,
+        IObserver<ITaskMessage>
+    {
+        // NOTE(review): the IObserver type arguments were missing in this copy of the patch. They
+        // are restored above from the eight OnNext overloads this class defines.
+
+        [Inject]
+        protected DefaultElasticDriver(IElasticContext context)
+        {
+            Context = context;
+        }
+
+        /// <summary>
+        /// The elastic context injected into this driver.
+        /// </summary>
+        public IElasticContext Context { get; }
+
+        /// <summary>
+        /// The task set manager all task, context and evaluator events are forwarded to.
+        /// It is not set by this class; every handler except the driver-started one dereferences it.
+        /// </summary>
+        public IElasticTaskSetManager TaskSetManager { get; set; }
+
+        /// <summary>
+        /// Starts the elastic context when the driver starts.
+        /// </summary>
+        public void OnNext(IDriverStarted value)
+        {
+            Context.Start();
+        }
+
+        /// <summary>
+        /// Submits context and service configurations to the evaluator if the task set manager
+        /// hands out a context id for it; otherwise releases the evaluator.
+        /// </summary>
+        public void OnNext(IAllocatedEvaluator allocatedEvaluator)
+        {
+            // NOTE(review): hard-coded 10 s delay with no stated purpose - looks like a debugging
+            // aid; confirm whether it is still required before keeping it.
+            System.Threading.Thread.Sleep(10000);
+            if (TaskSetManager.TryGetNextTaskContextId(allocatedEvaluator, out string identifier))
+            {
+                IConfiguration contextConf = ContextConfiguration.ConfigurationModule
+                    .Set(ContextConfiguration.Identifier, identifier)
+                    .Build();
+                IConfiguration serviceConf = Context.GetElasticServiceConfiguration();
+                IConfiguration codecConf = TaskSetManager.GetCodecConfiguration();
+
+                serviceConf = Configurations.Merge(serviceConf, codecConf);
+                allocatedEvaluator.SubmitContextAndService(contextConf, serviceConf);
+            }
+            else
+            {
+                allocatedEvaluator.Dispose();
+            }
+        }
+
+        /// <summary>
+        /// Forwards the new active context to the task set manager.
+        /// </summary>
+        public void OnNext(IActiveContext activeContext)
+        {
+            // NOTE(review): same unexplained 10 s delay as in the allocated-evaluator handler.
+            System.Threading.Thread.Sleep(10000);
+            TaskSetManager.OnNewActiveContext(activeContext);
+        }
+
+        /// <summary>
+        /// Forwards the running task to the task set manager.
+        /// </summary>
+        public void OnNext(IRunningTask value)
+        {
+            TaskSetManager.OnTaskRunning(value);
+        }
+
+        /// <summary>
+        /// Forwards the completed task and disposes the task set manager once it reports completion.
+        /// </summary>
+        public void OnNext(ICompletedTask value)
+        {
+            TaskSetManager.OnTaskCompleted(value);
+            DisposeIfCompleted();
+        }
+
+        /// <summary>
+        /// Forwards the evaluator failure and disposes the task set manager once it reports completion.
+        /// </summary>
+        public void OnNext(IFailedEvaluator failedEvaluator)
+        {
+            TaskSetManager.OnEvaluatorFailure(failedEvaluator);
+            DisposeIfCompleted();
+        }
+
+        /// <summary>
+        /// Forwards the task failure and disposes the task set manager once it reports completion.
+        /// </summary>
+        public void OnNext(IFailedTask failedTask)
+        {
+            TaskSetManager.OnTaskFailure(failedTask);
+            DisposeIfCompleted();
+        }
+
+        /// <summary>
+        /// Forwards the task message to the task set manager.
+        /// </summary>
+        public void OnNext(ITaskMessage taskMessage)
+        {
+            TaskSetManager.OnTaskMessage(taskMessage);
+        }
+
+        /// <summary>
+        /// Disposes the task set manager.
+        /// </summary>
+        public void OnCompleted()
+        {
+            TaskSetManager.Dispose();
+        }
+
+        /// <summary>
+        /// Disposes the task set manager; the error itself is not inspected.
+        /// </summary>
+        public void OnError(Exception error)
+        {
+            TaskSetManager.Dispose();
+        }
+
+        /// <summary>
+        /// Disposes the task set manager if it reports that it is completed.
+        /// </summary>
+        private void DisposeIfCompleted()
+        {
+            if (TaskSetManager.IsCompleted())
+            {
+                TaskSetManager.Dispose();
+            }
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/DefaultElasticTask.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/DefaultElasticTask.cs
new file mode 100644
index 0000000000..21121e9273
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/DefaultElasticTask.cs
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using Org.Apache.REEF.Common.Tasks;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Task.Impl;
+using Org.Apache.REEF.Common.Tasks.Events;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    ///
+    /// Default implementation of a task using the elastic group communication service.
+    ///
+    public abstract class DefaultElasticTask : ITask, IObserver<ICloseEvent>
+    {
+        // NOTE(review): IObserver's type argument was missing in this copy of the patch;
+        // ICloseEvent is restored from the OnNext(ICloseEvent) overload below.
+        private readonly IElasticContext _context;
+        private readonly IElasticStage _stage;
+
+        private readonly CancellationSource _cancellationSource;
+
+        /// <summary>
+        /// Stores the injected services and looks up the stage this task belongs to.
+        /// Protected because the class is abstract and can only be constructed through a subclass.
+        /// </summary>
+        /// <param name="source">The cancellation source used while waiting for registration and on dispose.</param>
+        /// <param name="context">The elastic context the stage is fetched from.</param>
+        /// <param name="stageName">Name of the stage whose workflow this task executes.</param>
+        protected DefaultElasticTask(
+            CancellationSource source,
+            IElasticContext context,
+            string stageName)
+        {
+            _context = context;
+            _cancellationSource = source;
+
+            _stage = _context.GetStage(stageName);
+        }
+
+        /// <summary>
+        /// Waits for task registration, then runs <see cref="Execute"/> on the stage workflow.
+        /// Any exception thrown by <see cref="Execute"/> is handed to the workflow.
+        /// </summary>
+        /// <param name="memento">Passed through unchanged to <see cref="Execute"/>.</param>
+        /// <returns>Always null.</returns>
+        public byte[] Call(byte[] memento)
+        {
+            _context.WaitForTaskRegistration(_cancellationSource.Source);
+
+            using (var workflow = _stage.Workflow)
+            {
+                try
+                {
+                    Execute(memento, workflow);
+                }
+                catch (Exception e)
+                {
+                    workflow.Throw(e);
+                }
+            }
+
+            return null;
+        }
+
+        /// <summary>
+        /// Cancels the cancellation source and disposes the elastic context.
+        /// </summary>
+        public void Dispose()
+        {
+            _cancellationSource.Cancel();
+            _context.Dispose();
+        }
+
+        /// <summary>
+        /// Cancels the stage when a close event is received.
+        /// </summary>
+        public void OnNext(ICloseEvent value)
+        {
+            _stage.Cancel();
+        }
+
+        /// <summary>
+        /// Intentionally empty.
+        /// </summary>
+        public void OnError(Exception error)
+        {
+        }
+
+        /// <summary>
+        /// Intentionally empty.
+        /// </summary>
+        public void OnCompleted()
+        {
+        }
+
+        /// <summary>
+        /// The task logic, implemented by subclasses.
+        /// </summary>
+        /// <param name="memento">The memento received by <see cref="Call"/>.</param>
+        /// <param name="workflow">The workflow of the stage this task belongs to.</param>
+        protected abstract void Execute(byte[] memento, Workflow workflow);
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs
new file mode 100644
index 0000000000..2c8288f867
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Implementations.Tang; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Network.Elastic.Driver; +using Org.Apache.REEF.Common.Tasks; +using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; +using Org.Apache.REEF.Network.Elastic.Failures.Enum; +using Org.Apache.REEF.Network.Elastic.Operators.Logical; + +namespace Org.Apache.REEF.Network.Examples.Elastic +{ + /// + /// Example implementation of broadcasting using the elastic group communication service. 
+    ///
+    public class ElasticBroadcastDriver : DefaultElasticDriver
+    {
+        // The constructor wires up the whole example: two task configuration factories, a stage
+        // holding a single flat-topology broadcast operator, and the task set manager the stage
+        // is registered with.
+        // NOTE(review): generic type arguments (the Func signature, the argument of GenericType and
+        // possibly of Broadcast) appear to be missing in this copy of the patch - restore them from
+        // the repository before applying.
+        [Inject]
+        private ElasticBroadcastDriver(IElasticContext context) : base(context)
+        {
+            // Factory producing the master task configuration for a given task id.
+            Func masterTaskConfiguration = (taskId) => TangFactory.GetTang().NewConfigurationBuilder(
+                Context.GetTaskConfigurationModule(taskId)
+                    .Set(TaskConfiguration.Task, GenericType.Class)
+                    .Build())
+                .Build();
+
+            // Factory producing the slave task configuration for a given task id.
+            Func slaveTaskConfiguration = (taskId) => TangFactory.GetTang().NewConfigurationBuilder(
+                Context.GetTaskConfigurationModule(taskId)
+                    .Set(TaskConfiguration.Task, GenericType.Class)
+                    .Build())
+                .Build();
+
+            IElasticStage stage = Context.DefaultStage();
+
+            ElasticOperator pipeline = stage.RootOperator;
+
+            // Create and build the pipeline
+            pipeline.Broadcast(TopologyType.Flat)
+                .Build();
+
+            // Build the stage
+            stage = stage.Build();
+
+            // Create the task manager
+            TaskSetManager = Context.CreateNewTaskSetManager(masterTaskConfiguration, slaveTaskConfiguration);
+
+            // Register the stage to the task manager
+            TaskSetManager.AddStage(stage);
+
+            // Build the task set manager
+            TaskSetManager.Build();
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Org.Apache.REEF.Network.Examples.csproj b/lang/cs/Org.Apache.REEF.Network.Examples/Org.Apache.REEF.Network.Examples.csproj
index 0855e6d076..ba72c8d26b 100644
--- a/lang/cs/Org.Apache.REEF.Network.Examples/Org.Apache.REEF.Network.Examples.csproj
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Org.Apache.REEF.Network.Examples.csproj
@@ -31,6 +31,6 @@ under the License.
- + diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs index 33cd736bbf..062b926514 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs @@ -34,6 +34,7 @@ using Org.Apache.REEF.Network.Elastic.Failures.Default; using Org.Apache.REEF.Network.Elastic.Operators.Logical.Default; using Org.Apache.REEF.Network.Elastic.Operators.Logical; +using Org.Apache.REEF.Tang.Implementations.Tang; namespace Org.Apache.REEF.Network.Elastic.Driver.Default { @@ -288,23 +289,22 @@ public bool IsMasterTaskContext(IActiveContext activeContext) /// The configuration builder the configuration will be appended to /// The task id of the task that belongs to this stages /// The configuration for the Task with added stages informations - public IConfiguration GetTaskConfiguration(ref ICsConfigurationBuilder builder, int taskId) + public IConfiguration GetTaskConfiguration(int taskId) { + ICsConfigurationBuilder confBuilder = TangFactory.GetTang().NewConfigurationBuilder(); IList serializedOperatorsConfs = new List(); - builder = builder - .BindNamedParameter( + + confBuilder.BindNamedParameter( GenericType.Class, StageName); RootOperator.GetTaskConfiguration(ref serializedOperatorsConfs, taskId); - var subConf = builder + return confBuilder .BindList( GenericType.Class, serializedOperatorsConfs) .Build(); - - return subConf; } /// @@ -358,7 +358,7 @@ public void OnTaskMessage(ITaskMessage message, ref List int offset = 0; var length = BitConverter.ToUInt16(message.Message, offset); offset += sizeof(ushort); - var stageName = BitConverter.ToString(message.Message, sizeof(ushort), length); + var stageName = ByteUtilities.ByteArraysToString(message.Message, offset, length); offset += length; if (stageName == StageName) diff --git 
a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs index 656695ec92..d3d71ee3c0 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs @@ -1212,8 +1212,7 @@ private void SubmitTask(int id) foreach (var stage in stages) { - ICsConfigurationBuilder confSubBuilder = TangFactory.GetTang().NewConfigurationBuilder(); - var confSub = stage.GetTaskConfiguration(ref confSubBuilder, id + 1); + var confSub = stage.GetTaskConfiguration(id + 1); if (rescheduleConfs.TryGetValue(stage.StageName, out var confs)) { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs index a7db07ef81..d82fe99108 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs @@ -122,7 +122,7 @@ public interface IElasticStage : IFailureResponse, ITaskMessageResponse /// The configuration builder the configuration will be appended to /// The task id of the task that belongs to this stages /// The configuration for the Task with added stages informations - IConfiguration GetTaskConfiguration(ref ICsConfigurationBuilder builder, int taskId); + IConfiguration GetTaskConfiguration(int taskId); /// /// Given a task id, this method returns the configuration of the task's data partition diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultBroadcast.cs index 5a7fbb9cf1..5f5e5f7ccf 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultBroadcast.cs +++ 
b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultBroadcast.cs
@@ -22,6 +22,8 @@
 using Org.Apache.REEF.Network.Elastic.Topology.Logical;
 using Org.Apache.REEF.Network.Elastic.Failures.Enum;
 using Org.Apache.REEF.Utilities.Attributes;
+using Org.Apache.REEF.Tang.Exceptions;
+using Org.Apache.REEF.Tang.Implementations.Configuration;
 
 namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Default
 {
@@ -57,10 +59,27 @@ public DefaultBroadcast(
             OperatorName = Constants.Broadcast;
         }
 
+        /// <summary>
+        /// Merges the codec configuration registered for typeof(T) into the given
+        /// configuration, then delegates to the base implementation.
+        /// </summary>
+        /// <param name="conf">The configuration the codec configuration is merged into</param>
+        /// <exception cref="IllegalStateException">No codec is registered for typeof(T)</exception>
+        internal override void GetCodecConfiguration(ref IConfiguration conf)
+        {
+            if (!CODECMAP.TryGetValue(typeof(T), out IConfiguration codecConf))
+            {
+                throw new IllegalStateException($"Codec for type {typeof(T)} not found.");
+            }
+
+            conf = Configurations.Merge(conf, codecConf);
+            base.GetCodecConfiguration(ref conf);
+        }
+
         /// <summary>
         /// Binding from logical to physical operator.
/// - /// The configuration builder the binding will be added to + /// The configuration builder the binding will be added to protected override void PhysicalOperatorConfiguration(ref ICsConfigurationBuilder confBuilder) { confBuilder diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs index f761a264c4..a32efc00f8 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs @@ -32,7 +32,6 @@ using Org.Apache.REEF.Network.Elastic.Config; using Org.Apache.REEF.Network.Elastic.Comm; using Org.Apache.REEF.Wake.Time.Event; -using System.Linq; using Org.Apache.REEF.Utilities.Attributes; using Org.Apache.REEF.Network.Elastic.Failures.Enum; using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; @@ -287,6 +286,10 @@ public virtual ElasticOperator BuildState() return this; } + /// + /// Generate the data serializer configuration for the target operator. + /// + /// The conf builder where to attach the codec configuration internal virtual void GetCodecConfiguration(ref IConfiguration confBuilder) { if (_next != null) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CancellationSource.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CancellationSource.cs index f9facca194..9a238c31af 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CancellationSource.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CancellationSource.cs @@ -27,10 +27,10 @@ namespace Org.Apache.REEF.Network.Elastic.Task /// to inject the same source through the elastic communication services. 
/// [Unstable("0.16", "API may change")] - internal sealed class CancellationSource + public sealed class CancellationSource { [Inject] - public CancellationSource() + private CancellationSource() { Source = new CancellationTokenSource(); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticContext.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticContext.cs index 76a4c20452..41780ca439 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticContext.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticContext.cs @@ -69,6 +69,8 @@ public DefaultElasticContext( _disposed = false; _lock = new object(); + System.Threading.Thread.Sleep(10000); + foreach (string serializedGroupConfig in stageConfigs) { IConfiguration stageConfig = configSerializer.FromString(serializedGroupConfig); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs index 27fce06912..a198ea2ca6 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs @@ -51,7 +51,7 @@ private DefaultElasticStage( [Parameter(typeof(OperatorParameters.StartIteration))] int startIteration, AvroConfigurationSerializer configSerializer, Workflow workflow, - CommunicationLayer commLayer, + DefaultCommunicationLayer commLayer, CancellationSource cancellationSource, IInjector injector) { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticStage.cs index c7f72507bb..b66a1668a3 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticStage.cs @@ -24,7 +24,7 @@ namespace Org.Apache.REEF.Network.Elastic.Task /// /// Used by tasks to fetch the workflow of the 
stages configured in the driver. /// - //[DefaultImplementation(typeof(DefaultTaskSetStage))] + [DefaultImplementation(typeof(DefaultElasticStage))] public interface IElasticStage : IWaitForTaskRegistration, IDisposable { /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs index c345d8f6e0..f134c93d41 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs @@ -347,7 +347,7 @@ public void TopologyUpdateResponse(string taskId, ref List - /// Action to trigger when the operator recdeives a notification that a new iteration is started. + /// Action to trigger when the operator receives a notification that a new iteration is started. /// /// The new iteration number public void OnNewIteration(int iteration) @@ -414,7 +414,7 @@ public IList Reconfigure(string taskId, Optional /// public string LogFinalStatistics() { - return $"\nAverage number of nodes in the topology of Operator {OperatorId}: {(float)_totNumberofNodes / (_iteration > 2 ? _iteration - 1 : 1)}"; + return $"\nAverage number of nodes in the topology of Operator {OperatorId}: {(_iteration >= 2 ? 
(float)_totNumberofNodes / (_iteration - 1) : _availableDataPoints)}"; } private void BuildTopology() diff --git a/lang/cs/Org.Apache.REEF.Network/NetworkService/StreamingNetworkService.cs b/lang/cs/Org.Apache.REEF.Network/NetworkService/StreamingNetworkService.cs index ab20cb826b..9a8e2208cc 100644 --- a/lang/cs/Org.Apache.REEF.Network/NetworkService/StreamingNetworkService.cs +++ b/lang/cs/Org.Apache.REEF.Network/NetworkService/StreamingNetworkService.cs @@ -56,12 +56,11 @@ public class StreamingNetworkService : INetworkService /// The local address provider [Inject] private StreamingNetworkService( - IObserver> universalObserver, INameClient nameClient, StreamingRemoteManagerFactory remoteManagerFactory, NsMessageStreamingCodec codec, ILocalAddressProvider localAddressProvider) - : this(universalObserver, null, nameClient, remoteManagerFactory, codec, localAddressProvider) + : this(null, null, nameClient, remoteManagerFactory, codec, localAddressProvider) { } diff --git a/lang/cs/Org.Apache.REEF.Tang/Formats/AvroConfigurationSerializer.cs b/lang/cs/Org.Apache.REEF.Tang/Formats/AvroConfigurationSerializer.cs index 565d4fdd57..86df2bea29 100644 --- a/lang/cs/Org.Apache.REEF.Tang/Formats/AvroConfigurationSerializer.cs +++ b/lang/cs/Org.Apache.REEF.Tang/Formats/AvroConfigurationSerializer.cs @@ -15,13 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-using System; -using System.Collections; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Runtime.Serialization; -using System.Text; using Microsoft.Hadoop.Avro; using Microsoft.Hadoop.Avro.Container; using Newtonsoft.Json; @@ -34,6 +27,13 @@ using Org.Apache.REEF.Tang.Types; using Org.Apache.REEF.Tang.Util; using Org.Apache.REEF.Utilities.Logging; +using System; +using System.Collections; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Runtime.Serialization; +using System.Text; namespace Org.Apache.REEF.Tang.Formats { @@ -99,7 +99,7 @@ public void ToFile(IConfiguration c, string fileName) var e = new TangApplicationException("Error during file operation. Quitting method: " + fileName); Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Throw(e, LOGGER); } - } + } } public IConfiguration FromByteArray(byte[] bytes) @@ -178,7 +178,7 @@ public AvroConfiguration AvroDeserializeFromFile(string fileName) } buffer.Seek(0, SeekOrigin.Begin); - using (var reader = new SequentialReader(AvroContainer.CreateReader(buffer, true))) + using (var reader = new SequentialReader(AvroContainer.CreateReader(buffer, true))) { var results = reader.Objects;
@@ -254,15 +254,35 @@ public AvroConfiguration ToAvroConfiguration(IConfiguration c)
                 }
                 else
                 {
-                    Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Throw(new IllegalStateException(), LOGGER);
+                    throw new TangApplicationException($"Unable to serialize set of type {e.Value.GetType()}");
                 }
 
                 l.Add(new ConfigurationEntry(e.Key.GetFullName(), val));
             }
+            foreach (var kvp in conf.GetBoundList())
+            {
+                foreach (var item in kvp.Value)
+                {
+                    string val = null;
+                    if (item is string)
+                    {
+                        val = (string)item;
+                    }
+                    else if (item is INode)
+                    {
+                        val = ((INode)item).GetFullName();
+                    }
+                    else
+                    {
+                        throw new TangApplicationException($"Unable to serialize list of type {item.GetType()}");
+                    }
+                    l.Add(new ConfigurationEntry(kvp.Key.GetFullName(), val));
+                }
+            }
 
             return new
AvroConfiguration(Language.Cs.ToString(), l); } - + private byte[] AvroSerialize(AvroConfiguration obj) { var serializer = AvroSerializer.Create(); @@ -327,7 +347,7 @@ private IConfiguration AddFromAvro(IConfigurationBuilder cb, AvroConfiguration a { settings.Add(new KeyValuePair(e.key, e.value)); } - ConfigurationFile.ProcessConfigData(cb, settings, avroConfiguration.language); + ConfigurationFile.ProcessConfigData(cb, settings, avroConfiguration.language); return cb.Build(); } } diff --git a/lang/cs/Org.Apache.REEF.Tang/Formats/ConfigurationFile.cs b/lang/cs/Org.Apache.REEF.Tang/Formats/ConfigurationFile.cs index 01a20c8a26..86a31b615e 100644 --- a/lang/cs/Org.Apache.REEF.Tang/Formats/ConfigurationFile.cs +++ b/lang/cs/Org.Apache.REEF.Tang/Formats/ConfigurationFile.cs @@ -15,12 +15,6 @@ // specific language governing permissions and limitations // under the License. -using System; -using System.Collections; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; using Org.Apache.REEF.Tang.Exceptions; using Org.Apache.REEF.Tang.Implementations.Configuration; using Org.Apache.REEF.Tang.Implementations.Tang; @@ -28,6 +22,12 @@ using Org.Apache.REEF.Tang.Types; using Org.Apache.REEF.Tang.Util; using Org.Apache.REEF.Utilities.Logging; +using System; +using System.Collections; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; namespace Org.Apache.REEF.Tang.Formats { @@ -47,10 +47,10 @@ public static void WriteConfigurationFile(IConfiguration c, string fileName) } } - public static string ToConfigurationString(IConfiguration c) + public static string ToConfigurationString(IConfiguration c) { StringBuilder sb = new StringBuilder(); - foreach (string s in ToConfigurationStringList(c)) + foreach (string s in ToConfigurationStringList(c)) { sb.Append(s); sb.Append('\n'); @@ -64,7 +64,7 @@ private static string GetFullName(INode n) Type t = ReflectionUtilities.GetTypeByName(s); return t.FullName; 
} - + private static string GetFullName(string name) { try @@ -93,20 +93,20 @@ private static string GetAssemblyName(string s) } } - public static HashSet ToConfigurationStringList(IConfiguration c) + public static HashSet ToConfigurationStringList(IConfiguration c) { ConfigurationImpl conf = (ConfigurationImpl)c; HashSet l = new HashSet(); - foreach (IClassNode opt in conf.GetBoundImplementations()) + foreach (IClassNode opt in conf.GetBoundImplementations()) { l.Add(GetFullName(opt) + '=' + Escape(GetFullName(conf.GetBoundImplementation(opt)))); } - - foreach (IClassNode opt in conf.GetBoundConstructors()) + + foreach (IClassNode opt in conf.GetBoundConstructors()) { l.Add(GetFullName(opt) + '=' + Escape(GetFullName(conf.GetBoundConstructor(opt)))); } - foreach (INamedParameterNode opt in conf.GetNamedParameters()) + foreach (INamedParameterNode opt in conf.GetNamedParameters()) { l.Add(GetFullName(opt) + '=' + Escape(GetFullName(conf.GetNamedParameter(opt)))); } @@ -123,22 +123,43 @@ public static HashSet ToConfigurationStringList(IConfiguration c) KeyValuePair e = (KeyValuePair)bs.Current; string val = null; - if (e.Value is string) + if (e.Value is string) { val = GetFullName((string)e.Value); - } - else if (e.Value is INode) + } + else if (e.Value is INode) { val = GetFullName((INode)e.Value); - } - else + } + else { - Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Throw(new IllegalStateException(), LOGGER); + throw new BindException($"Failed to serialize set of unsupported type {e.Value.GetType()}"); } - + l.Add(GetFullName(e.Key) + '=' + Escape(val)); } + foreach (var kvp in conf.GetBoundList()) + { + foreach (var item in kvp.Value) + { + string val = null; + if (item is string) + { + val = GetFullName((string)item); + } + else if (kvp.Value is INode) + { + val = GetFullName((INode)kvp.Value); + } + else + { + throw new BindException($"Failed to serialize list of unsupported type {item.GetType()}"); + } + l.Add(GetFullName(kvp.Key) + '=' + Escape(val)); + 
} + } + return l; } @@ -159,12 +180,12 @@ public static void AddConfigurationFromStream(IConfigurationBuilder conf, byte[] { using (StreamReader reader = new StreamReader(new MemoryStream(configData), Encoding.GetEncoding(0))) { - AddConfiguration(conf, reader); + AddConfiguration(conf, reader); } } public static void AddConfigurationFromFile(IConfigurationBuilder conf, string configFileName) - { + { using (StreamReader reader = File.OpenText(configFileName)) { AddConfiguration(conf, reader); @@ -188,7 +209,7 @@ private static void AddConfiguration(IConfigurationBuilder conf, StreamReader re if (p.Length == 2) { settings.Add(new KeyValuePair(GetAssemblyName(p[0]), GetAssemblyName(p[1]))); - } + } else if (p.Length > 2) { string v = line.Substring(p[0].Length + 1, line.Length - p[0].Length - 1); @@ -198,7 +219,7 @@ private static void AddConfiguration(IConfigurationBuilder conf, StreamReader re { var e = new TangApplicationException("Config data is not in format of KeyValuePair: " + line); Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Throw(e, LOGGER); - } + } } ProcessConfigData(conf, settings); } @@ -275,13 +296,13 @@ public static void ProcessConfigData(IConfigurationBuilder conf, IDictionary 0) + if (types.Length > 0) { sb.Append(types[0].GetType()); - for (int i = 1; i < types.Length; i++) + for (int i = 1; i < types.Length; i++) { sb.Append(sep).Append(types[i].GetType()); } diff --git a/lang/cs/Org.Apache.REEF.Tang/Implementations/Configuration/ConfigurationBuilderImpl.cs b/lang/cs/Org.Apache.REEF.Tang/Implementations/Configuration/ConfigurationBuilderImpl.cs index 4eab9a1e8a..290d6fb8a3 100644 --- a/lang/cs/Org.Apache.REEF.Tang/Implementations/Configuration/ConfigurationBuilderImpl.cs +++ b/lang/cs/Org.Apache.REEF.Tang/Implementations/Configuration/ConfigurationBuilderImpl.cs @@ -44,7 +44,7 @@ internal class ConfigurationBuilderImpl : IConfigurationBuilder public const string DuplicatedEntryForNamedParamater = "Duplicated entries: "; private static 
readonly Logger LOGGER = Logger.GetLogger(typeof(ConfigurationBuilderImpl)); - protected ConfigurationBuilderImpl() + protected ConfigurationBuilderImpl() { this.ClassHierarchy = TangFactory.GetTang().GetDefaultClassHierarchy(); } @@ -57,28 +57,28 @@ public ConfigurationBuilderImpl(IClassHierarchy classHierarchy) protected ConfigurationBuilderImpl(string[] assemblies, IConfiguration[] confs, Type[] parsers) { this.ClassHierarchy = TangFactory.GetTang().GetDefaultClassHierarchy(assemblies, parsers); - foreach (IConfiguration tc in confs) + foreach (IConfiguration tc in confs) { if (tc == null) { throw new ArgumentNullException("One of specified configurations is null"); - } - + } + AddConfiguration((ConfigurationImpl)tc); } } - public ConfigurationBuilderImpl(ConfigurationBuilderImpl t) + public ConfigurationBuilderImpl(ConfigurationBuilderImpl t) { this.ClassHierarchy = t.GetClassHierarchy(); - try + try { AddConfiguration(t.GetClassHierarchy(), t); - } - catch (BindException e) + } + catch (BindException e) { Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Caught(e, Level.Error, LOGGER); - Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Throw(new IllegalStateException("Could not copy builder", e), LOGGER); + Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Throw(new IllegalStateException("Could not copy builder", e), LOGGER); } } @@ -98,20 +98,20 @@ public void AddConfiguration(IConfiguration conf) private void AddConfiguration(IClassHierarchy ns, ConfigurationBuilderImpl builder) { this.ClassHierarchy = this.ClassHierarchy.Merge(ns); - - if (ClassHierarchy is ClassHierarchyImpl || builder.ClassHierarchy is ClassHierarchyImpl) + + if (ClassHierarchy is ClassHierarchyImpl || builder.ClassHierarchy is ClassHierarchyImpl) { if (ClassHierarchy is ClassHierarchyImpl && builder.ClassHierarchy is ClassHierarchyImpl) { ((ClassHierarchyImpl)ClassHierarchy).Parameterparser.MergeIn(((ClassHierarchyImpl)builder.ClassHierarchy).Parameterparser); - } - else + } + else { 
Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Throw(new ArgumentException("Attempt to merge Java and non-Java class hierarchy! Not supported."), LOGGER); } } - foreach (IClassNode cn in builder.BoundImpls.Keys) + foreach (IClassNode cn in builder.BoundImpls.Keys) { IClassNode n = null; builder.BoundImpls.TryGetValue(cn, out n); @@ -121,7 +121,7 @@ private void AddConfiguration(IClassHierarchy ns, ConfigurationBuilderImpl build } } - foreach (IClassNode cn in builder.BoundConstructors.Keys) + foreach (IClassNode cn in builder.BoundConstructors.Keys) { IClassNode n = null; builder.BoundConstructors.TryGetValue(cn, out n); @@ -134,32 +134,32 @@ private void AddConfiguration(IClassHierarchy ns, ConfigurationBuilderImpl build // The namedParameters set contains the strings that can be used to // instantiate new // named parameter instances. Create new ones where we can. - foreach (INamedParameterNode np in builder.NamedParameters.Keys) + foreach (INamedParameterNode np in builder.NamedParameters.Keys) { string v = null; builder.NamedParameters.TryGetValue(np, out v); Bind(np.GetFullName(), v); } - - foreach (IClassNode cn in builder.LegacyConstructors.Keys) + + foreach (IClassNode cn in builder.LegacyConstructors.Keys) { IConstructorDef cd = null; builder.LegacyConstructors.TryGetValue(cn, out cd); - RegisterLegacyConstructor(cn, cd.GetArgs()); + RegisterLegacyConstructor(cn, cd.GetArgs()); } - foreach (KeyValuePair e in builder.BoundSetEntries) + foreach (KeyValuePair e in builder.BoundSetEntries) { string name = ((INamedParameterNode)e.Key).GetFullName(); - if (e.Value is INode) + if (e.Value is INode) { BindSetEntry(name, (INode)e.Value); - } - else if (e.Value is string) + } + else if (e.Value is string) { BindSetEntry(name, (string)e.Value); - } - else + } + else { var ex = new IllegalStateException(string.Format(CultureInfo.CurrentCulture, "The value {0} set to the named parameter {1} is illegel.", e.Value, name)); 
Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Throw(ex, LOGGER); @@ -266,16 +266,20 @@ public void Bind(Types.INode key, Types.INode value) public void BindParameter(INamedParameterNode name, string value) { /* Parse and discard value; this is just for type checking, skip for now*/ - if (this.ClassHierarchy is ICsClassHierarchy) + if (this.ClassHierarchy is ICsClassHierarchy) { ((ICsClassHierarchy)ClassHierarchy).Parse(name, value); } - if (name.IsSet()) + if (name.IsSet()) { BindSetEntry((INamedParameterNode)name, value); - } - else + } + else if (name.IsList()) + { + BindList((INamedParameterNode)name, value); + } + else { try { @@ -289,6 +293,17 @@ public void BindParameter(INamedParameterNode name, string value) } } + public void BindList(INamedParameterNode iface, string impl) + { + IList l; + if (!BoundLists.TryGetValue(iface, out l)) + { + l = new List(); + BoundLists.Add(iface, l); + } + l.Add((object)impl); + } + public void BindImplementation(IClassNode n, IClassNode m) { if (this.ClassHierarchy.IsImplementation(n, m)) @@ -337,6 +352,11 @@ public void BindList(INamedParameterNode iface, IList impl) IList l = new List(); foreach (var n in impl) { + if (string.IsNullOrEmpty(n)) + { + throw new ArgumentException("List cannot contain string that are null or empty"); + } + l.Add((object)n); } BoundLists.Add(iface, l); @@ -344,7 +364,7 @@ public void BindList(INamedParameterNode iface, IList impl) public void BindList(string iface, IList impl) { - BindList((INamedParameterNode)ClassHierarchy.GetNode(iface), impl); + BindList((INamedParameterNode)ClassHierarchy.GetNode(iface), impl); } public void BindList(string iface, IList impl) @@ -387,4 +407,4 @@ public string ClassPrettyDescriptionString(string fullName) return param.GetDocumentation() + "\n" + param.GetFullName(); } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Tang/Implementations/Configuration/CsConfigurationBuilderImpl.cs 
b/lang/cs/Org.Apache.REEF.Tang/Implementations/Configuration/CsConfigurationBuilderImpl.cs index 569d4670a1..9f94ab1266 100644 --- a/lang/cs/Org.Apache.REEF.Tang/Implementations/Configuration/CsConfigurationBuilderImpl.cs +++ b/lang/cs/Org.Apache.REEF.Tang/Implementations/Configuration/CsConfigurationBuilderImpl.cs @@ -49,7 +49,7 @@ public CsConfigurationBuilderImpl(ICsClassHierarchy classHierarchy) : base(classHierarchy) { } - + public CsConfigurationBuilderImpl(string[] assemblies) : base(assemblies) { @@ -225,13 +225,13 @@ public ICsConfigurationBuilder BindList(GenericType iface, IList(GenericType iface, IList impl) where U : Name> { - return ((ICsInternalConfigurationBuilder)this).BindList(typeof(U), impl); + return ((ICsInternalConfigurationBuilder)this).BindList(typeof(U), impl); } public ICsConfigurationBuilder BindList(Type iface, IList implList) @@ -399,7 +399,15 @@ ICsInternalConfigurationBuilder ICsInternalConfigurationBuilder.BindList(Type if Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Throw(ex, LOGGER); } - BindList((INamedParameterNode)n, implList); + try + { + BindList((INamedParameterNode)n, implList); + } + catch (ArgumentException ex) + { + throw new BindException($"BindList failed to bind for {iface.Name}, reason: {ex.Message}"); + } + return this; } @@ -433,7 +441,7 @@ ICsInternalConfigurationBuilder ICsInternalConfigurationBuilder.BindConstructor( #endregion ICsInternalConfigurationBuilder #region extension methods - + public ICsConfigurationBuilder BindNamedParam(string str) where TName : Name { return BindNamedParameter(GenericType.Class, str); From d86b265260da1ea2ff5754ca8e4ad748362fe7a3 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Thu, 3 Jan 2019 22:09:04 -0800 Subject: [PATCH 08/29] fixed bugs --- .../ElasticBroadcastClient.cs | 37 ++++--- ...stClientWithFailEvaluatorBeforeWorkflow.cs | 53 ++++++++++ ...roadcastClientWithFailureAfterBroadcast.cs | 53 ++++++++++ ...oadcastClientWithFailureBeforeBroadcast.cs | 53 ++++++++++ 
...roadcastClientWithFailureBeforeWorkflow.cs | 53 ++++++++++ ...BroadcastClientWithFailureInConstructor.cs | 53 ++++++++++ ...sticBroadcastClientWithMultipleFailures.cs | 53 ++++++++++ ...Apache.REEF.Network.Examples.Client.csproj | 3 +- .../Run.cs | 49 +++++++++- .../Elastic/BroadcastMasterTask.cs | 2 +- .../Elastic/BroadcastSlaveTask.cs | 4 +- .../Elastic/ElasticBroadcastDriver.cs | 50 ++++++---- .../ElasticBroadcastDriverWithFailures.cs | 97 +++++++++++++++++++ .../BroadcastSlaveTaskDieAfterBroadcast.cs | 68 +++++++++++++ .../BroadcastSlaveTaskDieBeforeBroadcast.cs | 67 +++++++++++++ .../BroadcastSlaveTaskDieBeforeWorkflow.cs | 66 +++++++++++++ ...castSlaveTaskDieEvaluatorBeforeWorkflow.cs | 67 +++++++++++++ .../BroadcastSlaveTaskDieInConstructor.cs | 62 ++++++++++++ .../BroadcastSlaveTaskDieMultiple.cs | 70 +++++++++++++ ...stDriverWithFailEvaluatorBeforeWorkflow.cs | 54 +++++++++++ ...roadcastDriverWithFailureAfterBroadcast.cs | 54 +++++++++++ ...oadcastDriverWithFailureBeforeBroadcast.cs | 54 +++++++++++ ...roadcastDriverWithFailureBeforeWorkflow.cs | 54 +++++++++++ ...BroadcastDriverWithFailureInConstructor.cs | 54 +++++++++++ ...sticBroadcastDriverWithMultipleFailures.cs | 54 +++++++++++ .../Org.Apache.REEF.Network.Examples.csproj | 1 + .../Org.Apache.REEF.Network.Tests.csproj | 1 + .../GroupCommunicationConfigurationOptions.cs | 7 +- .../Driver/Default}/DefaultElasticDriver.cs | 9 +- .../Driver/Default/DefaultElasticStage.cs | 42 ++++---- .../Default/DefaultElasticTaskSetManager.cs | 92 +++++++++++------- .../DefaultElasticTaskSetManagerParameters.cs | 2 + .../Elastic/Driver/IElasticStage.cs | 2 +- .../Failures/Default/DefaultFailureState.cs | 2 +- .../Default/DefaultFailureStateMachine.cs | 14 +-- .../Failures/Default/RescheduleEvent.cs | 2 +- .../Elastic/Failures/Default/StopEvent.cs | 21 +--- .../Logical/Default/DefaultOneToN.cs | 40 ++++++-- .../ElastiOperatorWithDefaultDispatcher.cs | 4 +- .../Operators/Logical/ElasticOperator.cs | 11 ++- 
.../Physical/Default/DefaultOneToN.cs | 2 - .../Elastic/Task/CommunicationLayer.cs | 3 + .../Task/Default/DefaultCommunicationLayer.cs | 2 + .../Task/Default/DefaultElasticContext.cs | 4 +- .../Task/Default/DefaultElasticStage.cs | 2 + .../Task/Default}/DefaultElasticTask.cs | 28 +++++- .../DefaultTaskToDriverMessageDispatcher.cs | 4 +- .../Default/IDefaultTaskToDrivermessages.cs | 3 + .../Task/ElasticDriverMessageHandler.cs | 2 + .../Elastic/Task/IElasticContext.cs | 2 + .../Elastic/Task/IElasticStage.cs | 3 +- .../Elastic/Task/NodeObserverIdentifier.cs | 2 + .../Task/TaskToDriverMessageDispatcher.cs | 2 + .../Elastic/Task/Workflow.cs | 16 ++- .../Topology/Logical/Impl/FlatTopology.cs | 2 +- .../Physical/Default/OneToNTopology.cs | 2 +- ...peratorTopologyWithDefaultCommunication.cs | 1 - .../IOperatorTopologyWithCommunication.cs | 2 + .../Org.Apache.REEF.Network/Elastic/Utils.cs | 2 +- 59 files changed, 1458 insertions(+), 160 deletions(-) create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailEvaluatorBeforeWorkflow.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureAfterBroadcast.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureBeforeBroadcast.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureBeforeWorkflow.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureInConstructor.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithMultipleFailures.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs create mode 100644 
lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureAfterBroadcast.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeBroadcast.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeWorkflow.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureInConstructor.cs create mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithMultipleFailures.cs rename lang/cs/{Org.Apache.REEF.Network.Examples/Elastic => Org.Apache.REEF.Network/Elastic/Driver/Default}/DefaultElasticDriver.cs (95%) rename lang/cs/{Org.Apache.REEF.Network.Examples/Elastic => Org.Apache.REEF.Network/Elastic/Task/Default}/DefaultElasticTask.cs (66%) diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs index ec50f09278..5dbb175520 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs +++ 
b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs @@ -38,23 +38,12 @@ public class ElasticBroadcastClient const string Yarn = "yarn"; const string DefaultRuntimeFolder = "REEF_LOCAL_RUNTIME"; - public void RunElasticBroadcast(bool runOnYarn, int numTasks, int startingPortNo, int portRange) + public ElasticBroadcastClient(bool runOnYarn, int numTasks, int startingPortNo, int portRange) { const string driverId = "ElasticBroadcastDriver"; const string stage = "Broadcast"; - IConfiguration driverConfig = TangFactory.GetTang().NewConfigurationBuilder( - DriverConfiguration.ConfigurationModule - .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class) - .Set(DriverConfiguration.OnContextActive, GenericType.Class) - .Set(DriverConfiguration.OnTaskRunning, GenericType.Class) - .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class) - .Set(DriverConfiguration.OnTaskFailed, GenericType.Class) - .Set(DriverConfiguration.OnTaskMessage, GenericType.Class) - .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString()) - .Build()) + IConfiguration driverConfig = TangFactory.GetTang().NewConfigurationBuilder(GetDriverConf()) .BindNamedParameter( GenericType.Class, numTasks.ToString(CultureInfo.InvariantCulture)) @@ -75,7 +64,7 @@ public void RunElasticBroadcast(bool runOnYarn, int numTasks, int startingPortNo IConfiguration merged = Configurations.Merge(driverConfig, elsticGroupCommServiceDriverConfig); string runPlatform = runOnYarn ? 
"yarn" : "local"; - TestRun(merged, typeof(ElasticBroadcastDriver), numTasks, "ElasticBroadcastDriver", runPlatform); + TestRun(merged, typeof(ElasticBroadcastDriver), numTasks, JobIdentifier, runPlatform); } internal static void TestRun(IConfiguration driverConfig, Type globalAssemblyType, int numberOfEvaluator, string jobIdentifier = "myDriver", string runOnYarn = "local", string runtimeFolder = DefaultRuntimeFolder) @@ -108,5 +97,25 @@ internal static IConfiguration GetRuntimeConfiguration(string runOnYarn, int num throw new Exception("Unknown runtime: " + runOnYarn); } } + + protected virtual string JobIdentifier + { + get { return "ElasticBroadcast"; } + } + + protected virtual IConfiguration GetDriverConf() + { + return DriverConfiguration.ConfigurationModule + .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) + .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class) + .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class) + .Set(DriverConfiguration.OnContextActive, GenericType.Class) + .Set(DriverConfiguration.OnTaskRunning, GenericType.Class) + .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class) + .Set(DriverConfiguration.OnTaskFailed, GenericType.Class) + .Set(DriverConfiguration.OnTaskMessage, GenericType.Class) + .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString()) + .Build(); + } } } diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailEvaluatorBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailEvaluatorBeforeWorkflow.cs new file mode 100644 index 0000000000..c1ea6ce626 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailEvaluatorBeforeWorkflow.cs @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Driver; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Network.Examples.Elastic; + +namespace Org.Apache.REEF.Network.Examples.Client +{ + public sealed class ElasticBroadcastClientWithFailEvaluatorBeforeWorkflow : ElasticBroadcastClient + { + public ElasticBroadcastClientWithFailEvaluatorBeforeWorkflow(bool runOnYarn, int numTasks, int startingPortNo, int portRange) + : base (runOnYarn, numTasks, startingPortNo, portRange) + { + } + + protected override string JobIdentifier + { + get { return "ElasticBroadcastWithFailure"; } + } + + protected override IConfiguration GetDriverConf() + { + return DriverConfiguration.ConfigurationModule + .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) + .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class) + .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class) + .Set(DriverConfiguration.OnContextActive, GenericType.Class) + .Set(DriverConfiguration.OnTaskRunning, GenericType.Class) + .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class) + .Set(DriverConfiguration.OnTaskFailed, GenericType.Class) + .Set(DriverConfiguration.OnTaskMessage, GenericType.Class) + 
.Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString()) + .Build(); + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureAfterBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureAfterBroadcast.cs new file mode 100644 index 0000000000..db3872d776 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureAfterBroadcast.cs @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +using Org.Apache.REEF.Driver; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Network.Examples.Elastic; + +namespace Org.Apache.REEF.Network.Examples.Client +{ + public sealed class ElasticBroadcastClientWithFailureAfterBroadcast : ElasticBroadcastClient + { + public ElasticBroadcastClientWithFailureAfterBroadcast(bool runOnYarn, int numTasks, int startingPortNo, int portRange) + : base (runOnYarn, numTasks, startingPortNo, portRange) + { + } + + protected override string JobIdentifier + { + get { return "ElasticBroadcastWithFailure"; } + } + + protected override IConfiguration GetDriverConf() + { + return DriverConfiguration.ConfigurationModule + .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) + .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class) + .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class) + .Set(DriverConfiguration.OnContextActive, GenericType.Class) + .Set(DriverConfiguration.OnTaskRunning, GenericType.Class) + .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class) + .Set(DriverConfiguration.OnTaskFailed, GenericType.Class) + .Set(DriverConfiguration.OnTaskMessage, GenericType.Class) + .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString()) + .Build(); + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureBeforeBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureBeforeBroadcast.cs new file mode 100644 index 0000000000..13e5d9de55 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureBeforeBroadcast.cs @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Driver; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Network.Examples.Elastic; + +namespace Org.Apache.REEF.Network.Examples.Client +{ + public sealed class ElasticBroadcastClientWithFailureBeforeBroadcast : ElasticBroadcastClient + { + public ElasticBroadcastClientWithFailureBeforeBroadcast(bool runOnYarn, int numTasks, int startingPortNo, int portRange) + : base (runOnYarn, numTasks, startingPortNo, portRange) + { + } + + protected override string JobIdentifier + { + get { return "ElasticBroadcastWithFailure"; } + } + + protected override IConfiguration GetDriverConf() + { + return DriverConfiguration.ConfigurationModule + .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) + .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class) + .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class) + .Set(DriverConfiguration.OnContextActive, GenericType.Class) + .Set(DriverConfiguration.OnTaskRunning, GenericType.Class) + .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class) + .Set(DriverConfiguration.OnTaskFailed, GenericType.Class) + .Set(DriverConfiguration.OnTaskMessage, GenericType.Class) + .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString()) + .Build(); + } + } +} diff --git 
a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureBeforeWorkflow.cs new file mode 100644 index 0000000000..f3b8c3dbd5 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureBeforeWorkflow.cs @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +using Org.Apache.REEF.Driver; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Utilities.Logging; +using Org.Apache.REEF.Network.Examples.Elastic; + +namespace Org.Apache.REEF.Network.Examples.Client +{ + public sealed class ElasticBroadcastClientWithFailureBeforeWorkflow : ElasticBroadcastClient + { + public ElasticBroadcastClientWithFailureBeforeWorkflow(bool runOnYarn, int numTasks, int startingPortNo, int portRange) + : base (runOnYarn, numTasks, startingPortNo, portRange) + { + } + + protected override string JobIdentifier + { + get { return "ElasticBroadcastWithFailure"; } + } + + protected override IConfiguration GetDriverConf() + { + return DriverConfiguration.ConfigurationModule + .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) + .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class) + .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class) + .Set(DriverConfiguration.OnContextActive, GenericType.Class) + .Set(DriverConfiguration.OnTaskRunning, GenericType.Class) + .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class) + .Set(DriverConfiguration.OnTaskFailed, GenericType.Class) + .Set(DriverConfiguration.OnTaskMessage, GenericType.Class) + .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString()) + .Build(); + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureInConstructor.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureInConstructor.cs new file mode 100644 index 0000000000..6f7e1815e3 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureInConstructor.cs @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using Org.Apache.REEF.Driver;
+using Org.Apache.REEF.Tang.Interface;
+using Org.Apache.REEF.Tang.Util;
+using Org.Apache.REEF.Utilities.Logging;
+using Org.Apache.REEF.Network.Examples.Elastic;
+
+namespace Org.Apache.REEF.Network.Examples.Client
+{
+    /// <summary>
+    /// Variant of <see cref="ElasticBroadcastClient"/> that supplies its own job identifier and driver configuration.
+    /// </summary>
+    public sealed class ElasticBroadcastClientWithFailureInConstructor : ElasticBroadcastClient
+    {
+        public ElasticBroadcastClientWithFailureInConstructor(bool runOnYarn, int numTasks, int startingPortNo, int portRange)
+            : base(runOnYarn, numTasks, startingPortNo, portRange)
+        {
+        }
+
+        /// <summary>Job identifier returned for this client.</summary>
+        protected override string JobIdentifier => "ElasticBroadcastWithFailure";
+
+        /// <summary>Driver configuration: one binding per driver event plus the Info trace level.</summary>
+        protected override IConfiguration GetDriverConf() =>
+            DriverConfiguration.ConfigurationModule
+                .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) // NOTE(review): GenericType.Class has no type argument in this copy of the patch - confirm the handler type
+                .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class)
+                .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class)
+                .Set(DriverConfiguration.OnContextActive, GenericType.Class)
+                .Set(DriverConfiguration.OnTaskRunning, GenericType.Class)
+                .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class)
+                .Set(DriverConfiguration.OnTaskFailed, GenericType.Class)
+                .Set(DriverConfiguration.OnTaskMessage, GenericType.Class)
+                .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString())
+                .Build();
+    }
+}
diff --git
a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithMultipleFailures.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithMultipleFailures.cs new file mode 100644 index 0000000000..550964d3e7 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithMultipleFailures.cs @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+using Org.Apache.REEF.Driver;
+using Org.Apache.REEF.Tang.Interface;
+using Org.Apache.REEF.Tang.Util;
+using Org.Apache.REEF.Utilities.Logging;
+using Org.Apache.REEF.Network.Examples.Elastic;
+
+namespace Org.Apache.REEF.Network.Examples.Client
+{
+    /// <summary>
+    /// Variant of <see cref="ElasticBroadcastClient"/> that supplies its own job identifier and driver configuration.
+    /// </summary>
+    public sealed class ElasticBroadcastClientWithMultipleFailures : ElasticBroadcastClient
+    {
+        public ElasticBroadcastClientWithMultipleFailures(bool runOnYarn, int numTasks, int startingPortNo, int portRange)
+            : base(runOnYarn, numTasks, startingPortNo, portRange)
+        {
+        }
+
+        /// <summary>Job identifier returned for this client.</summary>
+        protected override string JobIdentifier => "ElasticBroadcastWithFailure";
+
+        /// <summary>Driver configuration: one binding per driver event plus the Info trace level.</summary>
+        protected override IConfiguration GetDriverConf() =>
+            DriverConfiguration.ConfigurationModule
+                .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) // NOTE(review): GenericType.Class has no type argument in this copy of the patch - confirm the handler type
+                .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class)
+                .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class)
+                .Set(DriverConfiguration.OnContextActive, GenericType.Class)
+                .Set(DriverConfiguration.OnTaskRunning, GenericType.Class)
+                .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class)
+                .Set(DriverConfiguration.OnTaskFailed, GenericType.Class)
+                .Set(DriverConfiguration.OnTaskMessage, GenericType.Class)
+                .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString())
+                .Build();
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/Org.Apache.REEF.Network.Examples.Client.csproj b/lang/cs/Org.Apache.REEF.Network.Examples.Client/Org.Apache.REEF.Network.Examples.Client.csproj
index 42f75ce322..c2db3a7fbf 100644
--- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/Org.Apache.REEF.Network.Examples.Client.csproj
+++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/Org.Apache.REEF.Network.Examples.Client.csproj
@@ -21,6 +21,7 @@ under the License. Org.Apache.REEF.Network.Examples.Client REEF Network Client examples REEF Examples Network Client + @@ -34,5 +35,5 @@ under the License.
- + \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs index d57c25d168..a161d967f1 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs @@ -26,10 +26,10 @@ public static void Main(string[] args) { Console.WriteLine("start running client: " + DateTime.Now); bool runOnYarn = false; - int numNodes = 3; + int numNodes = 5; int startPort = 8900; int portRange = 1000; - string testToRun = "ElasticBroadcast"; + string testToRun = "ElasticBroadcastWithMultipleFailures"; testToRun = testToRun.ToLower(); if (args != null) @@ -84,9 +84,47 @@ public static void Main(string[] args) if (testToRun.Equals("ElasticBroadcast".ToLower()) || testToRun.Equals("all")) { - new ElasticBroadcastClient().RunElasticBroadcast(runOnYarn, numNodes, startPort, portRange); + new ElasticBroadcastClient(runOnYarn, numNodes, startPort, portRange); Console.WriteLine("ElasticRunBroadcast completed!!!"); } + + // Failure variants of the elastic broadcast example; each variant is listed exactly once. + if (testToRun.Equals("ElasticBroadcastWithFailureInConstructor".ToLower()) || testToRun.Equals("all")) + { + // This stage should fail + new ElasticBroadcastClientWithFailureInConstructor(runOnYarn, numNodes, startPort, portRange); + Console.WriteLine("ElasticBroadcastWithFailureInConstructor completed!!!"); + } + + if (testToRun.Equals("ElasticBroadcastWithFailureBeforeWorkflow".ToLower()) || testToRun.Equals("all")) + { + new ElasticBroadcastClientWithFailureBeforeWorkflow(runOnYarn, numNodes, startPort, portRange); + Console.WriteLine("ElasticBroadcastWithFailureBeforeWorkflow completed!!!"); + } + + if (testToRun.Equals("ElasticBroadcastWithFailEvaluatorBeforeWorkflow".ToLower()) || testToRun.Equals("all")) + { + new ElasticBroadcastClientWithFailEvaluatorBeforeWorkflow(runOnYarn, numNodes, startPort, portRange); + Console.WriteLine("ElasticBroadcastWithFailEvaluatorBeforeWorkflow completed!!!"); + } + + if
(testToRun.Equals("ElasticBroadcastWithFailureBeforeBroadcast".ToLower()) || testToRun.Equals("all")) + { + new ElasticBroadcastClientWithFailureBeforeBroadcast(runOnYarn, numNodes, startPort, portRange); + Console.WriteLine("ElasticBroadcastWithFailureBeforeBroadcast completed!!!"); + } + + if (testToRun.Equals("ElasticBroadcastWithFailureAfterBroadcast".ToLower()) || testToRun.Equals("all")) + { + new ElasticBroadcastClientWithFailureAfterBroadcast(runOnYarn, numNodes, startPort, portRange); + Console.WriteLine("ElasticBroadcastWithFailureAfterBroadcast completed!!!"); + } + + if (testToRun.Equals("ElasticBroadcastWithMultipleFailures".ToLower()) || testToRun.Equals("all")) + { + new ElasticBroadcastClientWithMultipleFailures(runOnYarn, numNodes, startPort, portRange); + Console.WriteLine("ElasticBroadcastWithMultipleFailures completed!!!"); + } } } } diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs index 24d5292fd9..c00ba3995a 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs @@ -20,7 +20,7 @@ using Org.Apache.REEF.Network.Elastic.Task; using Org.Apache.REEF.Network.Elastic.Operators; using Org.Apache.REEF.Network.Elastic.Operators.Physical; -using Org.Apache.REEF.Network.Elastic.Task.Impl; +using Org.Apache.REEF.Network.Elastic.Task.Default; namespace Org.Apache.REEF.Network.Examples.Elastic { diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs index
901252126f..e5a2d092ff 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs @@ -20,7 +20,7 @@ using Org.Apache.REEF.Network.Elastic.Task; using Org.Apache.REEF.Network.Elastic.Operators.Physical; using Org.Apache.REEF.Network.Elastic.Operators; -using Org.Apache.REEF.Network.Elastic.Task.Impl; +using Org.Apache.REEF.Network.Elastic.Task.Default; namespace Org.Apache.REEF.Network.Examples.Elastic { @@ -44,7 +44,7 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - Console.WriteLine($"Slave has received {rec}", rec); + Console.WriteLine($"Slave has received {rec}"); break; default: break; diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs index 2c8288f867..1e7ea6d892 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs @@ -23,8 +23,7 @@ using Org.Apache.REEF.Network.Elastic.Driver; using Org.Apache.REEF.Common.Tasks; using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; -using Org.Apache.REEF.Network.Elastic.Failures.Enum; -using Org.Apache.REEF.Network.Elastic.Operators.Logical; +using Org.Apache.REEF.Network.Elastic.Driver.Default; namespace Org.Apache.REEF.Network.Examples.Elastic { @@ -34,33 +33,20 @@ namespace Org.Apache.REEF.Network.Examples.Elastic public class ElasticBroadcastDriver : DefaultElasticDriver { [Inject] - private ElasticBroadcastDriver(IElasticContext context) : base(context) + protected ElasticBroadcastDriver(IElasticContext context) : base(context) // protected instead of private: a derived driver can now chain to this constructor { - Func masterTaskConfiguration = (taskId) => TangFactory.GetTang().NewConfigurationBuilder( - Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) - .Build()) - .Build();
- - Func slaveTaskConfiguration = (taskId) => TangFactory.GetTang().NewConfigurationBuilder( - Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) - .Build()) - .Build(); - IElasticStage stage = Context.DefaultStage(); - ElasticOperator pipeline = stage.RootOperator; - // Create and build the pipeline - pipeline.Broadcast(TopologyType.Flat) - .Build(); + stage.PipelineRoot + .Broadcast(TopologyType.Flat) + .Build(); // Build the stage stage = stage.Build(); // Create the task manager - TaskSetManager = Context.CreateNewTaskSetManager(masterTaskConfiguration, slaveTaskConfiguration); + TaskSetManager = Context.CreateNewTaskSetManager(MasterTaskConfiguration, SlaveTaskConfiguration); // the two task configurations are now read from the virtual properties added below // Register the stage to the task manager TaskSetManager.AddStage(stage); @@ -68,5 +54,29 @@ private ElasticBroadcastDriver(IElasticContext context) : base(context) // Build the task set manager TaskSetManager.Build(); } + + protected virtual Func MasterTaskConfiguration // function from a task id to the Tang configuration of the master task; virtual, so a derived driver may replace it + { + get + { + return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( + Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) // NOTE(review): GenericType.Class has no type argument in this copy of the patch - confirm the task type + .Build()) + .Build(); + } + } + + protected virtual Func SlaveTaskConfiguration // function from a task id to the Tang configuration of a slave task; virtual, so a derived driver may replace it + { + get + { + return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( + Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) // NOTE(review): GenericType.Class has no type argument in this copy of the patch - confirm the task type + .Build()) + .Build(); + } + } } } diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs new file mode 100644 index 0000000000..508fe3402e --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Implementations.Tang; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Network.Elastic.Driver; +using Org.Apache.REEF.Common.Tasks; +using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; +using Org.Apache.REEF.Network.Elastic.Driver.Default; +using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Network.Elastic.Failures.Default; +using Org.Apache.REEF.Network.Elastic.Config; + +namespace Org.Apache.REEF.Network.Examples.Elastic +{ + /// + /// Example implementation of broadcasting using the elastic group communication service. 
+    /// 
+    /// <remarks>
+    /// The constructor hands failure thresholds to a <c>DefaultFailureStateMachine</c>, builds one stage whose
+    /// pipeline is a flat-topology broadcast and registers that stage with the task set manager.
+    /// </remarks>
+    public abstract class ElasticBroadcastDriverWithFailures : DefaultElasticDriver
+    {
+        /// <summary>
+        /// Creates the stage and the task set manager of the example.
+        /// </summary>
+        /// <param name="stageName">Name given to the new stage.</param>
+        /// <param name="numEvaluators">Evaluator count given to the new stage.</param>
+        /// <param name="context">Elastic context, forwarded to the base driver.</param>
+        protected ElasticBroadcastDriverWithFailures(string stageName, int numEvaluators, IElasticContext context)
+            : base(context)
+        {
+            IFailureStateMachine failureMachine = new DefaultFailureStateMachine();
+
+            // Pairs of (failure state, threshold value) handed to the state machine.
+            // NOTE(review): generic type arguments (Tuple, Func, GenericType) are missing in this copy of the patch - confirm against the repository.
+            var thresholds = new Tuple[]
+            {
+                Tuple.Create(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReconfigure) as IFailureState, 0.01F),
+                Tuple.Create(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReschedule) as IFailureState, 0.40F),
+                Tuple.Create(new DefaultFailureState((int)DefaultFailureStates.StopAndReschedule) as IFailureState, 0.60F),
+                Tuple.Create(new DefaultFailureState((int)DefaultFailureStates.Fail) as IFailureState, 0.80F)
+            };
+            failureMachine.SetThreasholds(thresholds);
+
+            // Create the stage, then create and build its pipeline
+            IElasticStage stage = Context.CreateNewStage(stageName, numEvaluators, failureMachine);
+            stage.PipelineRoot.Broadcast(TopologyType.Flat).Build();
+
+            // Build the stage
+            stage = stage.Build();
+
+            // Create the task manager, register the stage with it, then build it
+            TaskSetManager = Context.CreateNewTaskSetManager(MasterTaskConfiguration, SlaveTaskConfiguration);
+            TaskSetManager.AddStage(stage);
+            TaskSetManager.Build();
+        }
+
+        /// <summary>
+        /// Function mapping a task id to the configuration of the master task; a derived driver may override it.
+        /// </summary>
+        protected virtual Func MasterTaskConfiguration =>
+            (taskId) => TangFactory.GetTang().NewConfigurationBuilder(
+                Context.GetTaskConfigurationModule(taskId)
+                    .Set(TaskConfiguration.Task, GenericType.Class)
+                    .Build())
+                .Build();
+
+        /// <summary>
+        /// Function mapping a task id to the configuration of a slave task; a derived driver may override it.
+        /// </summary>
+        protected virtual Func SlaveTaskConfiguration =>
+            (taskId) => TangFactory.GetTang().NewConfigurationBuilder(
+                Context.GetTaskConfigurationModule(taskId)
+                    .Set(TaskConfiguration.Task, GenericType.Class)
+                    .Build())
+                .Build();
+    }
+}
diff --git
b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs new file mode 100644 index 0000000000..e9eac4ba69 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Task.Default;
+using Org.Apache.REEF.Common.Tasks;
+using Org.Apache.REEF.Network.Elastic;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Broadcast slave task that throws right after it has received a broadcast when its task number is 2.
+    /// </summary>
+    public sealed class BroadcastSlaveTaskDieAfterBroadcast : DefaultElasticTask
+    {
+        private readonly string _taskId;
+
+        /// <summary>Keeps the task identifier; the remaining arguments go to the base task.</summary>
+        [Inject]
+        public BroadcastSlaveTaskDieAfterBroadcast(
+            [Parameter(typeof(TaskConfigurationOptions.Identifier))] string taskId,
+            CancellationSource source,
+            IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+            _taskId = taskId;
+        }
+
+        /// <summary>
+        /// Walks the workflow; for each broadcast operator it receives and prints the value, then throws if this is task number 2.
+        /// </summary>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            while (workflow.MoveNext())
+            {
+                if (workflow.Current.OperatorName == Constants.Broadcast)
+                {
+                    var broadcast = workflow.Current as IElasticBroadcast; // NOTE(review): a type argument looks stripped here - confirm
+                    var rec = broadcast.Receive();
+                    Console.WriteLine($"Slave has received {rec}");
+
+                    if (Utils.GetTaskNum(_taskId) == 2)
+                    {
+                        throw new Exception("Die after broadcast.");
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs
new file mode 100644
index 0000000000..e16a9b6b9b
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Task.Default;
+using Org.Apache.REEF.Common.Tasks;
+using Org.Apache.REEF.Network.Elastic;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Broadcast slave task that throws when it reaches a broadcast operator, before receiving, if its task number is 2.
+    /// </summary>
+    public sealed class BroadcastSlaveTaskDieBeforeBroadcast : DefaultElasticTask
+    {
+        private readonly string _taskId;
+
+        /// <summary>Keeps the task identifier; the remaining arguments go to the base task.</summary>
+        [Inject]
+        public BroadcastSlaveTaskDieBeforeBroadcast(
+            [Parameter(typeof(TaskConfigurationOptions.Identifier))] string taskId,
+            CancellationSource source,
+            IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+            _taskId = taskId;
+        }
+
+        /// <summary>Walks the workflow; for each broadcast operator it receives and prints the value unless this is task number 2.</summary>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            while (workflow.MoveNext())
+            {
+                if (workflow.Current.OperatorName == Constants.Broadcast)
+                {
+                    // The failure is raised before the receive call is made.
+                    if (Utils.GetTaskNum(_taskId) == 2)
+                    {
+                        throw new Exception("Die before broadcast.");
+                    }
+
+                    var broadcast = workflow.Current as IElasticBroadcast; // NOTE(review): a type argument looks stripped here - confirm
+                    var rec = broadcast.Receive();
+                    Console.WriteLine($"Slave has received {rec}");
+                }
+            }
+        }
+    }
+}
diff --git
b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs new file mode 100644 index 0000000000..826c94493c --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Task.Default;
+using Org.Apache.REEF.Common.Tasks;
+using Org.Apache.REEF.Network.Elastic;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Broadcast slave task that throws at the start of <c>Execute</c>, before touching the workflow, if its task number is 2.
+    /// </summary>
+    public sealed class BroadcastSlaveTaskDieBeforeWorkflow : DefaultElasticTask
+    {
+        private readonly string _taskId;
+
+        /// <summary>Keeps the task identifier; the remaining arguments go to the base task.</summary>
+        [Inject]
+        public BroadcastSlaveTaskDieBeforeWorkflow(
+            [Parameter(typeof(TaskConfigurationOptions.Identifier))] string taskId,
+            CancellationSource source,
+            IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+            _taskId = taskId;
+        }
+
+        /// <summary>Throws for task number 2; otherwise receives and prints the value of each broadcast operator.</summary>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            if (Utils.GetTaskNum(_taskId) == 2)
+            {
+                throw new Exception("Die before workflow.");
+            }
+
+            while (workflow.MoveNext())
+            {
+                if (workflow.Current.OperatorName == Constants.Broadcast)
+                {
+                    var broadcast = workflow.Current as IElasticBroadcast; // NOTE(review): a type argument looks stripped here - confirm
+                    var rec = broadcast.Receive();
+                    Console.WriteLine($"Slave has received {rec}");
+                }
+            }
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs
new file mode 100644
index 0000000000..a7a5d3353d
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Task.Default;
+using Org.Apache.REEF.Common.Tasks;
+using Org.Apache.REEF.Network.Elastic;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Broadcast slave task that ends the whole process with <c>Environment.Exit(0)</c> at the start of <c>Execute</c> if its task number is 2.
+    /// </summary>
+    public sealed class BroadcastSlaveTaskDieEvaluatorBeforeWorkflow : DefaultElasticTask
+    {
+        private readonly string _taskId;
+
+        /// <summary>Keeps the task identifier; the remaining arguments go to the base task.</summary>
+        [Inject]
+        public BroadcastSlaveTaskDieEvaluatorBeforeWorkflow(
+            [Parameter(typeof(TaskConfigurationOptions.Identifier))] string taskId,
+            CancellationSource source,
+            IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+            _taskId = taskId;
+        }
+
+        /// <summary>Exits the process for task number 2; otherwise receives and prints the value of each broadcast operator.</summary>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            if (Utils.GetTaskNum(_taskId) == 2)
+            {
+                Console.WriteLine("Die before workflow.");
+                Environment.Exit(0);
+            }
+
+            while (workflow.MoveNext())
+            {
+                if (workflow.Current.OperatorName == Constants.Broadcast)
+                {
+                    var broadcast = workflow.Current as IElasticBroadcast; // NOTE(review): a type argument looks stripped here - confirm
+                    var rec = broadcast.Receive();
+                    Console.WriteLine($"Slave has received {rec}");
+                }
+            }
+        }
+    }
+}
diff --git
b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs new file mode 100644 index 0000000000..07dad15f4b --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Task.Default;
+using Org.Apache.REEF.Common.Tasks;
+using Org.Apache.REEF.Network.Elastic;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Broadcast slave task whose constructor throws if its task number is 2.
+    /// </summary>
+    public sealed class BroadcastSlaveTaskDieInConstructor : DefaultElasticTask
+    {
+        /// <summary>Passes source and context to the base task, then throws for task number 2.</summary>
+        [Inject]
+        public BroadcastSlaveTaskDieInConstructor(
+            [Parameter(typeof(TaskConfigurationOptions.Identifier))] string taskId,
+            CancellationSource source,
+            IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+            if (Utils.GetTaskNum(taskId) == 2)
+            {
+                throw new Exception("Die in Constructor.");
+            }
+        }
+
+        /// <summary>Receives and prints the value of each broadcast operator in the workflow.</summary>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            while (workflow.MoveNext())
+            {
+                if (workflow.Current.OperatorName == Constants.Broadcast)
+                {
+                    var broadcast = workflow.Current as IElasticBroadcast; // NOTE(review): a type argument looks stripped here - confirm
+                    var rec = broadcast.Receive();
+                    Console.WriteLine($"Slave has received {rec}");
+                }
+            }
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs
new file mode 100644
index 0000000000..e53d76e4c8
--- /dev/null
+++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+using System;
+using Org.Apache.REEF.Tang.Annotations;
+using Org.Apache.REEF.Network.Elastic.Task;
+using Org.Apache.REEF.Network.Elastic.Operators.Physical;
+using Org.Apache.REEF.Network.Elastic.Operators;
+using Org.Apache.REEF.Network.Elastic.Task.Default;
+
+namespace Org.Apache.REEF.Network.Examples.Elastic
+{
+    /// <summary>
+    /// Broadcast slave task that throws at random: a draw from 0..99 below 80 raises an exception.
+    /// </summary>
+    public sealed class BroadcastSlaveTaskDieMultiple : DefaultElasticTask
+    {
+        private const int _failProb = 80;
+        private readonly Random _rand = new Random();
+
+        /// <summary>Forwards source and context to the base task.</summary>
+        [Inject]
+        public BroadcastSlaveTaskDieMultiple(
+            CancellationSource source,
+            IElasticContext context)
+            : base(source, context, "Broadcast")
+        {
+        }
+
+        /// <summary>
+        /// Draws once before the workflow and once per broadcast operator; a losing draw throws, otherwise the value is received and printed.
+        /// </summary>
+        protected override void Execute(byte[] memento, Workflow workflow)
+        {
+            if (_rand.Next(100) < _failProb)
+            {
+                throw new Exception("Die.");
+            }
+
+            while (workflow.MoveNext())
+            {
+                if (workflow.Current.OperatorName == Constants.Broadcast)
+                {
+                    if (_rand.Next(100) < _failProb)
+                    {
+                        throw new Exception("Die");
+                    }
+
+                    var broadcast = workflow.Current as IElasticBroadcast; // NOTE(review): a type argument looks stripped here - confirm
+                    var rec = broadcast.Receive();
+                    Console.WriteLine($"Slave has received {rec}");
+                }
+            }
+        }
+    }
+}
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow.cs
new file mode 100644
index 0000000000..91ae6a5b2c
--- /dev/null
+++
b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow.cs @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Implementations.Tang; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Network.Elastic.Driver; +using Org.Apache.REEF.Common.Tasks; +using Org.Apache.REEF.Network.Elastic.Config; + +namespace Org.Apache.REEF.Network.Examples.Elastic +{ + /// + /// Example implementation of broadcasting using the elastic group communication service. 
+ /// + public sealed class ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow : ElasticBroadcastDriverWithFailures + { + [Inject] + private ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow( + [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultStageName))] string stageName, + [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int numEvaluators, + IElasticContext context) : base(stageName, numEvaluators, context) + { + } + + protected override Func SlaveTaskConfiguration + { + get + { + return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( + Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) + .Build()) + .Build(); + } + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureAfterBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureAfterBroadcast.cs new file mode 100644 index 0000000000..950a4e86c4 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureAfterBroadcast.cs @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +using System; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Implementations.Tang; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Network.Elastic.Driver; +using Org.Apache.REEF.Common.Tasks; +using Org.Apache.REEF.Network.Elastic.Config; + +namespace Org.Apache.REEF.Network.Examples.Elastic +{ + /// + /// Example implementation of broadcasting using the elastic group communication service. + /// + public sealed class ElasticBroadcastDriverWithFailureAfterBroadcast : ElasticBroadcastDriverWithFailures + { + [Inject] + private ElasticBroadcastDriverWithFailureAfterBroadcast( + [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultStageName))] string stageName, + [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int numEvaluators, + IElasticContext context) : base(stageName, numEvaluators, context) + { + } + + protected override Func SlaveTaskConfiguration + { + get + { + return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( + Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) + .Build()) + .Build(); + } + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeBroadcast.cs new file mode 100644 index 0000000000..5c2e0fc621 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeBroadcast.cs @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Implementations.Tang; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Network.Elastic.Driver; +using Org.Apache.REEF.Common.Tasks; +using Org.Apache.REEF.Network.Elastic.Config; + +namespace Org.Apache.REEF.Network.Examples.Elastic +{ + /// + /// Example implementation of broadcasting using the elastic group communication service. 
+ /// + public sealed class ElasticBroadcastDriverWithFailureBeforeBroadcast : ElasticBroadcastDriverWithFailures + { + [Inject] + private ElasticBroadcastDriverWithFailureBeforeBroadcast( + [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultStageName))] string stageName, + [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int numEvaluators, + IElasticContext context) : base(stageName, numEvaluators, context) + { + } + + protected override Func SlaveTaskConfiguration + { + get + { + return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( + Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) + .Build()) + .Build(); + } + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeWorkflow.cs new file mode 100644 index 0000000000..c20e9b5f03 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeWorkflow.cs @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +using System; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Implementations.Tang; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Network.Elastic.Driver; +using Org.Apache.REEF.Common.Tasks; +using Org.Apache.REEF.Network.Elastic.Config; + +namespace Org.Apache.REEF.Network.Examples.Elastic +{ + /// + /// Example implementation of broadcasting using the elastic group communication service. + /// + public sealed class ElasticBroadcastDriverWithFailureBeforeWorkflow : ElasticBroadcastDriverWithFailures + { + [Inject] + private ElasticBroadcastDriverWithFailureBeforeWorkflow( + [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultStageName))] string stageName, + [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int numEvaluators, + IElasticContext context) : base(stageName, numEvaluators, context) + { + } + + protected override Func SlaveTaskConfiguration + { + get + { + return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( + Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) + .Build()) + .Build(); + } + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureInConstructor.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureInConstructor.cs new file mode 100644 index 0000000000..98a596fbda --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureInConstructor.cs @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Implementations.Tang; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Network.Elastic.Driver; +using Org.Apache.REEF.Common.Tasks; +using Org.Apache.REEF.Network.Elastic.Config; + +namespace Org.Apache.REEF.Network.Examples.Elastic +{ + /// + /// Example implementation of broadcasting using the elastic group communication service. 
+ /// + public sealed class ElasticBroadcastDriverWithFailureInConstructor : ElasticBroadcastDriverWithFailures + { + [Inject] + private ElasticBroadcastDriverWithFailureInConstructor( + [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultStageName))] string stageName, + [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int numEvaluators, + IElasticContext context) : base(stageName, numEvaluators, context) + { + } + + protected override Func SlaveTaskConfiguration + { + get + { + return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( + Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) + .Build()) + .Build(); + } + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithMultipleFailures.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithMultipleFailures.cs new file mode 100644 index 0000000000..9df2f290f6 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithMultipleFailures.cs @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +using System; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Tang.Implementations.Tang; +using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Network.Elastic.Driver; +using Org.Apache.REEF.Common.Tasks; +using Org.Apache.REEF.Network.Elastic.Config; + +namespace Org.Apache.REEF.Network.Examples.Elastic +{ + /// + /// Example implementation of broadcasting using the elastic group communication service. + /// + public sealed class ElasticBroadcastDriverWithMultipleFailures : ElasticBroadcastDriverWithFailures + { + [Inject] + private ElasticBroadcastDriverWithMultipleFailures( + [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultStageName))] string stageName, + [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int numEvaluators, + IElasticContext context) : base(stageName, numEvaluators, context) + { + } + + protected override Func SlaveTaskConfiguration + { + get + { + return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( + Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) + .Build()) + .Build(); + } + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Org.Apache.REEF.Network.Examples.csproj b/lang/cs/Org.Apache.REEF.Network.Examples/Org.Apache.REEF.Network.Examples.csproj index ba72c8d26b..30ad4f6842 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Org.Apache.REEF.Network.Examples.csproj +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Org.Apache.REEF.Network.Examples.csproj @@ -24,6 +24,7 @@ under the License. + diff --git a/lang/cs/Org.Apache.REEF.Network.Tests/Org.Apache.REEF.Network.Tests.csproj b/lang/cs/Org.Apache.REEF.Network.Tests/Org.Apache.REEF.Network.Tests.csproj index 2cd937f690..1352026b20 100644 --- a/lang/cs/Org.Apache.REEF.Network.Tests/Org.Apache.REEF.Network.Tests.csproj +++ b/lang/cs/Org.Apache.REEF.Network.Tests/Org.Apache.REEF.Network.Tests.csproj @@ -24,6 +24,7 @@ under the License. 
+ diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Config/GroupCommunicationConfigurationOptions.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/GroupCommunicationConfigurationOptions.cs index 4972d074e6..b433709b4d 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Config/GroupCommunicationConfigurationOptions.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Config/GroupCommunicationConfigurationOptions.cs @@ -24,7 +24,7 @@ namespace Org.Apache.REEF.Network.Elastic.Config /// public sealed class GroupCommunicationConfigurationOptions { - [NamedParameter("Timeout for sending or receiving messages", defaultValue: "600000")] + [NamedParameter("Timeout for sending or receiving messages", defaultValue: "10000")] public class Timeout : Name { } @@ -59,5 +59,10 @@ internal sealed class SleepTimeWaitingForRegistration : Name internal sealed class RetryCountWaitingForRegistration : Name { } + + [NamedParameter("Whether the operator is in a rescheduled task", defaultValue: "false")] + public sealed class IsRescheduled : Name + { + } } } diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/DefaultElasticDriver.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticDriver.cs similarity index 95% rename from lang/cs/Org.Apache.REEF.Network.Examples/Elastic/DefaultElasticDriver.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticDriver.cs index e8d1eaa068..f975f65398 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/DefaultElasticDriver.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticDriver.cs @@ -22,15 +22,17 @@ using Org.Apache.REEF.Tang.Annotations; using Org.Apache.REEF.Tang.Implementations.Configuration; using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Network.Elastic.Driver; using Org.Apache.REEF.Driver.Task; using Org.Apache.REEF.Common.Context; +using Org.Apache.REEF.Utilities.Attributes; +using System.Threading; -namespace 
Org.Apache.REEF.Network.Examples.Elastic +namespace Org.Apache.REEF.Network.Elastic.Driver.Default { /// /// Default implementation of the elastic driver. /// + [Unstable("0.16", "API may change")] public abstract class DefaultElasticDriver : IObserver, IObserver, @@ -58,7 +60,7 @@ public void OnNext(IDriverStarted value) public void OnNext(IAllocatedEvaluator allocatedEvaluator) { - System.Threading.Thread.Sleep(10000); + Thread.Sleep(10000); if (TaskSetManager.TryGetNextTaskContextId(allocatedEvaluator, out string identifier)) { IConfiguration contextConf = ContextConfiguration.ConfigurationModule @@ -78,7 +80,6 @@ public void OnNext(IAllocatedEvaluator allocatedEvaluator) public void OnNext(IActiveContext activeContext) { - System.Threading.Thread.Sleep(10000); TaskSetManager.OnNewActiveContext(activeContext); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs index 062b926514..459f82efdf 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs @@ -56,7 +56,7 @@ internal sealed class DefaultElasticStage : IElasticStage, IDefaultFailureEventR private int _tasksAdded; private HashSet _missingMasterTasks; private HashSet _masterTasks; - private readonly IFailureStateMachine _defaultFailureMachine; + private readonly IFailureStateMachine _failureMachine; private int _numOperators; private Optional _datasetConfiguration; @@ -87,9 +87,9 @@ internal DefaultElasticStage( _datasetConfiguration = Optional.Empty(); IsCompleted = false; Context = elasticService; - _defaultFailureMachine = failureMachine ?? new DefaultFailureStateMachine(numTasks, DefaultFailureStates.Fail); - FailureState = _defaultFailureMachine.State; - RootOperator = new DefaultEmpty(this, _defaultFailureMachine.Clone()); + _failureMachine = failureMachine ?? 
new DefaultFailureStateMachine(numTasks, DefaultFailureStates.Fail); + FailureState = _failureMachine.State; + PipelineRoot = new DefaultEmpty(this, _failureMachine.Clone()); IsIterative = false; } @@ -102,7 +102,7 @@ internal DefaultElasticStage( /// /// The operator at the beginning of the computation workflow. /// - public ElasticOperator RootOperator { get; private set; } + public ElasticOperator PipelineRoot { get; private set; } /// /// The service managing the stages. @@ -178,7 +178,7 @@ public IElasticStage Build() } } - RootOperator.GatherMasterIds(ref _masterTasks); + PipelineRoot.GatherMasterIds(ref _masterTasks); _missingMasterTasks = new HashSet(_masterTasks); _finalized = true; @@ -188,7 +188,7 @@ public IElasticStage Build() /// /// Add a task to the stages. - /// The stages must have been buit before tasks can be added. + /// The stages must have been built before tasks can be added. /// /// The id of the task to add /// True if the task is correctly added to the stages @@ -231,7 +231,7 @@ public bool AddTask(string taskId) return false; } - if (!RootOperator.AddTask(taskId)) + if (!PipelineRoot.AddTask(taskId)) { return true; } @@ -240,7 +240,7 @@ public bool AddTask(string taskId) _missingMasterTasks.Remove(taskId); - _defaultFailureMachine.AddDataPoints(1, false); + _failureMachine.AddDataPoints(1, false); } return true; @@ -256,11 +256,11 @@ public bool ScheduleStage() { // Schedule if we reach the number of requested tasks or the stage contains an iterative pipeline that is ready to be scheduled and the // policy requested by the user allow early start with ramp up. 
- if (!_scheduled && (_numTasks == _tasksAdded || (IsIterative && _defaultFailureMachine.State.FailureState < (int)DefaultFailureStates.StopAndReschedule && RootOperator.CanBeScheduled()))) + if (!_scheduled && (_numTasks == _tasksAdded || (IsIterative && _failureMachine.State.FailureState < (int)DefaultFailureStates.StopAndReschedule && PipelineRoot.CanBeScheduled()))) { _scheduled = true; - RootOperator.BuildState(); + PipelineRoot.BuildState(); } return _scheduled; @@ -298,7 +298,7 @@ public IConfiguration GetTaskConfiguration(int taskId) GenericType.Class, StageName); - RootOperator.GetTaskConfiguration(ref serializedOperatorsConfs, taskId); + PipelineRoot.GetTaskConfiguration(ref serializedOperatorsConfs, taskId); return confBuilder .BindList( @@ -340,12 +340,14 @@ public Optional GetPartitionConf(string taskId) /// The final statistics for the computation public string LogFinalStatistics() { - if (!IsCompleted) + if (IsCompleted || FailureState.FailureState == (int)DefaultFailureStates.Fail) { - throw new IllegalStateException($"Cannot log statistics before Stage {StageName} is completed"); + return PipelineRoot.LogFinalStatistics(); + } + else + { + throw new IllegalStateException($"Cannot log statistics before Stage {StageName} is completed or failed."); } - - return RootOperator.LogFinalStatistics(); } /// @@ -364,7 +366,7 @@ public void OnTaskMessage(ITaskMessage message, ref List if (stageName == StageName) { // Messages have to be propagated down to the operators - RootOperator.OnTaskMessage(message, ref returnMessages); + PipelineRoot.OnTaskMessage(message, ref returnMessages); } } @@ -378,7 +380,7 @@ public void OnTaskMessage(ITaskMessage message, ref List /// The next timeouts to be scheduled public void OnTimeout(Alarm alarm, ref List msgs, ref List nextTimeouts) { - RootOperator.OnTimeout(alarm, ref msgs, ref nextTimeouts); + PipelineRoot.OnTimeout(alarm, ref msgs, ref nextTimeouts); } /// @@ -390,7 +392,7 @@ public void OnTimeout(Alarm alarm, ref 
List msgs, ref Lis public void OnTaskFailure(IFailedTask task, ref List failureEvents) { // Failures have to be propagated down to the operators - RootOperator.OnTaskFailure(task, ref failureEvents); + PipelineRoot.OnTaskFailure(task, ref failureEvents); } /// @@ -421,7 +423,7 @@ public void EventDispatcher(ref IFailureEvent @event) break; } - RootOperator.EventDispatcher(ref @event); + PipelineRoot.EventDispatcher(ref @event); } #endregion diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs index d3d71ee3c0..34ecbe7d88 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs @@ -34,12 +34,14 @@ using Org.Apache.REEF.Network.Elastic.Config; using Org.Apache.REEF.Tang.Util; using Org.Apache.REEF.Network.Elastic.Failures.Default; +using Org.Apache.REEF.Utilities.Attributes; namespace Org.Apache.REEF.Network.Elastic.Driver.Default { /// /// Class managing the scheduling of tasks and task-related events. 
/// + [Unstable("0.16", "API may change")] internal sealed class DefaultElasticTaskSetManager : IElasticTaskSetManager, IDefaultFailureEventResponse, @@ -560,7 +562,7 @@ public IConfiguration GetCodecConfiguration() foreach (var stage in _stages.Values) { - stage.RootOperator.GetCodecConfiguration(ref conf); + stage.PipelineRoot.GetCodecConfiguration(ref conf); } return conf; @@ -574,12 +576,12 @@ public void OnNewActiveContext(IActiveContext activeContext) { if (_finalized != true) { - throw new IllegalStateException("Task set have to be finalized before adding tasks"); + throw new IllegalStateException("Task set have to be finalized before adding tasks."); } if (Completed() || Failed()) { - LOGGER.Log(Level.Warning, "Adding tasks to already completed task set: ignoring"); + LOGGER.Log(Level.Warning, "Adding tasks to already completed task set: ignoring."); activeContext.Dispose(); return; } @@ -591,7 +593,7 @@ public void OnNewActiveContext(IActiveContext activeContext) // We reschedule the task only if the context was active (_taskInfos[id] != null) and the task was actually scheduled at least once (_taskInfos[id].TaskStatus > TaskStatus.Init) if (_taskInfos[id] != null && _taskInfos[id].TaskStatus > TaskState.Init) { - LOGGER.Log(Level.Info, $"{taskId} already part of task set: going to directly submit it"); + LOGGER.Log(Level.Info, $"{taskId} already part of task set: going to directly submit it."); lock (_taskInfos[id].Lock) { @@ -655,15 +657,15 @@ public void OnTaskRunning(IRunningTask task) if (Completed() || Failed()) { - LOGGER.Log(Level.Info, $"Received running from task {task.Id} but task set is completed or failed: ignoring"); - _taskInfos[id].DisposeTask(); + LOGGER.Log(Level.Info, $"Received running from task {task.Id} but task set is completed or failed: ignoring."); + _taskInfos[id].Dispose(); return; } if (!TaskStateUtils.IsRunnable(_taskInfos[id].TaskStatus)) { - LOGGER.Log(Level.Info, $"Received running from task {task.Id} which is not runnable: 
ignoring"); - _taskInfos[id].DisposeTask(); + LOGGER.Log(Level.Info, $"Received running from task {task.Id} which is not runnable: ignoring."); + _taskInfos[id].Dispose(); return; } @@ -781,13 +783,13 @@ public void OnTimeout(Alarm alarm, ref List msgs, ref Lis { if (Completed() || Failed()) { - LOGGER.Log(Level.Warning, "Taskset made no progress in the last {0}ms. Forcing Disposal.", _parameters.Timeout); + LOGGER.Log(Level.Warning, $"Taskset made no progress in the last {_parameters.Timeout}ms. Forcing Disposal."); Dispose(); } else { - LOGGER.Log(Level.Error, "Taskset made no progress in the last {0}ms. Aborting.", _parameters.Timeout); - OnFail(); + LOGGER.Log(Level.Error, $"Taskset made no progress in the last {_parameters.Timeout}ms. Aborting."); + Fail(); return; } } @@ -918,7 +920,7 @@ public void OnEvaluatorFailure(IFailedEvaluator evaluator) if (cinfo.NumRetry > _parameters.NumEvaluatorFailures) { LOGGER.Log(Level.Error, $"Context {cinfo.Id} failed more than {_parameters.NumEvaluatorFailures} times: Aborting"); - OnFail(); + Fail(); } _queuedContexts.Enqueue(cinfo); @@ -964,7 +966,7 @@ public void EventDispatcher(ref IFailureEvent @event) OnFail(); break; default: - throw new IllegalStateException("Failure event not recognized"); + throw new IllegalStateException("Failure event not recognized."); } } @@ -995,27 +997,7 @@ public void OnReschedule(ref RescheduleEvent rescheduleEvent) SendToTasks(rescheduleEvent.FailureResponse); - var id = Utils.GetTaskNum(rescheduleEvent.TaskId) - 1; - - lock (_taskInfos[id].Lock) - { - _taskInfos[id].NumRetry++; - - if (_taskInfos[id].NumRetry > _parameters.NumTaskFailures) - { - LOGGER.Log(Level.Error, $"Task {rescheduleEvent.TaskId} failed more than {_parameters.NumTaskFailures} times: aborting"); - OnFail(); - } - - if (rescheduleEvent.Reschedule) - { - LOGGER.Log(Level.Info, $"Rescheduling task {rescheduleEvent.TaskId}"); - - _taskInfos[id].RescheduleConfigurations = rescheduleEvent.RescheduleTaskConfigurations; - - 
SubmitTask(id); - } - } + Reschedule(rescheduleEvent); } /// @@ -1030,6 +1012,10 @@ public void OnStop(ref StopEvent stopEvent) } SendToTasks(stopEvent.FailureResponse); + + var rescheduleEvent = stopEvent as RescheduleEvent; + + Reschedule(rescheduleEvent); } /// @@ -1169,7 +1155,7 @@ private bool StartSubmitTasks() { _scheduled = true; - LOGGER.Log(Level.Info, string.Format("Scheduling {0} tasks from Taskset {1}", _tasksAdded, StagesId)); + LOGGER.Log(Level.Info, $"Scheduling {_tasksAdded} tasks from Taskset {StagesId}"); } } @@ -1235,7 +1221,7 @@ private void SubmitTask(int id) if (_taskInfos[id].IsActiveContextDisposed) { - LOGGER.Log(Level.Warning, string.Format("Task submit for {0} with a non-active context: spawning a new evaluator", id + 1)); + LOGGER.Log(Level.Warning, $"Task submit for {id + 1} with a non-active context: spawning a new evaluator."); if (_taskInfos[id].TaskStatus == TaskState.Failed) { @@ -1300,7 +1286,7 @@ private void SendToTasks(IList messages, int retry = 0) else if (retry >= _parameters.Retry) { LOGGER.Log(Level.Warning, msg + " Aborting"); - OnFail(); + Fail(returnMessage.Destination); } else { @@ -1330,6 +1316,38 @@ private void SpawnNewEvaluator(int id) _evaluatorRequestor.Submit(request); } + private void Reschedule(RescheduleEvent rescheduleEvent) + { + var id = Utils.GetTaskNum(rescheduleEvent.TaskId) - 1; + + lock (_taskInfos[id].Lock) + { + _taskInfos[id].NumRetry++; + + if (_taskInfos[id].NumRetry > _parameters.NumTaskFailures) + { + LOGGER.Log(Level.Error, $"Task {rescheduleEvent.TaskId} failed more than {_parameters.NumTaskFailures} times: aborting"); + Fail(rescheduleEvent.TaskId); + } + + if (rescheduleEvent.Reschedule) + { + LOGGER.Log(Level.Info, $"Rescheduling task {rescheduleEvent.TaskId}"); + + _taskInfos[id].RescheduleConfigurations = rescheduleEvent.RescheduleTaskConfigurations; + + SubmitTask(id); + } + } + } + + private void Fail(string taskId = "") + { + IFailureEvent @event = new FailEvent(taskId); + + 
EventDispatcher(ref @event); + } + private void LogFinalStatistics() { var msg = string.Format("Total Failed Tasks: {0}\nTotal Failed Evaluators: {1}", _totFailedTasks, _totFailedEvaluators); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs index c94ccbed71..1447a5cfd1 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs @@ -19,12 +19,14 @@ using Org.Apache.REEF.Network.Elastic.Config; using System.Threading.Tasks; using Org.Apache.REEF.Network.Elastic.Failures; +using Org.Apache.REEF.Utilities.Attributes; namespace Org.Apache.REEF.Network.Elastic.Driver.Default { /// /// Injectable class containing all the parameters for the default task set manager. /// + [Unstable("0.16", "API may change")] internal sealed class DefaultElasticTaskSetManagerParameters { [Inject] diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs index d82fe99108..a937f17f9b 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs @@ -42,7 +42,7 @@ public interface IElasticStage : IFailureResponse, ITaskMessageResponse /// /// The operator at the beginning of the computation workflow. /// - ElasticOperator RootOperator { get; } + ElasticOperator PipelineRoot { get; } /// /// The failure state of the target stages. 
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureState.cs index c9aebcf7b0..72f2d5987e 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureState.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureState.cs @@ -25,7 +25,7 @@ namespace Org.Apache.REEF.Network.Elastic.Failures.Default { [Unstable("0.16", "API may change")] - internal sealed class DefaultFailureState : IFailureState + public sealed class DefaultFailureState : IFailureState { /// /// Create a default failure state for 0 (Continue). diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs index cc9d099ee2..258d7a8349 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs @@ -34,7 +34,7 @@ namespace Org.Apache.REEF.Network.Elastic.Failures.Default { [Unstable("0.16", "API may change")] - internal sealed class DefaultFailureStateMachine : IFailureStateMachine + public sealed class DefaultFailureStateMachine : IFailureStateMachine { private readonly object _statusLock; @@ -56,10 +56,10 @@ internal sealed class DefaultFailureStateMachine : IFailureStateMachine private readonly IDictionary transitionWeights = new Dictionary() { - { DefaultFailureStates.ContinueAndReconfigure, 0.0F }, - { DefaultFailureStates.ContinueAndReschedule, 0.000001F }, - { DefaultFailureStates.StopAndReschedule, 0.5F }, - { DefaultFailureStates.Fail, 0.5F } + { DefaultFailureStates.ContinueAndReconfigure, 0.01F }, + { DefaultFailureStates.ContinueAndReschedule, 0.40F }, + { DefaultFailureStates.StopAndReschedule, 0.60F }, + { DefaultFailureStates.Fail, 0.80F } }; /// @@ -192,12 +192,12 @@ public 
void SetThreasholds(Tuple[] weights) { if (!weights.All(weight => weight.Item1 is DefaultFailureState)) { - throw new ArgumentException("Input is not of type DefaultFailureStateMachine"); + throw new ArgumentException("Input is not of type DefaultFailureStateMachine,"); } if (weights.Any(weight => weight.Item1.FailureState == (int)DefaultFailureStates.Continue)) { - throw new ArgumentException("Cannot change the threshold for Continue state"); + throw new ArgumentException("Cannot change the threshold for Continue state."); } lock (_statusLock) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/RescheduleEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/RescheduleEvent.cs index 8606f80f20..fc4247d4b7 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/RescheduleEvent.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/RescheduleEvent.cs @@ -26,7 +26,7 @@ namespace Org.Apache.REEF.Network.Elastic.Failures.Default /// reschedule a new task. /// [Unstable("0.16", "API may change")] - public sealed class RescheduleEvent : ReconfigureEvent + public class RescheduleEvent : ReconfigureEvent { /// /// Constructor for the reschedule event. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/StopEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/StopEvent.cs index c9283cbbd6..eda798ac9a 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/StopEvent.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/StopEvent.cs @@ -25,13 +25,13 @@ namespace Org.Apache.REEF.Network.Elastic.Failures.Default /// Stop the execution and try to add new tasks. /// [Unstable("0.16", "API may change")] - public class StopEvent : IFailureEvent + public sealed class StopEvent : RescheduleEvent { /// /// Constructor for the stop event. 
/// /// The identifier of the task triggering the failure event - public StopEvent(string taskId) + public StopEvent(string taskId) : base(taskId) { TaskId = taskId; OperatorId = -1; @@ -41,24 +41,9 @@ public StopEvent(string taskId) /// /// The event / action raised by the transition to the new failure state. /// - public int FailureEvent + public override int FailureEvent { get { return (int)DefaultFailureStateEvents.Stop; } } - - /// - /// The identifier of the task triggering the event. - /// - public string TaskId { get; private set; } - - /// - /// The opeartor id in which the failure is rised. - /// - public int OperatorId { get; private set; } - - /// - /// The response message generated to react to the failure event. - /// - public List FailureResponse { get; private set; } } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs index 46866ea11a..6608b37fbe 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs @@ -28,6 +28,10 @@ using Org.Apache.REEF.Utilities.Attributes; using Org.Apache.REEF.Network.Elastic.Comm.Enum; using Org.Apache.REEF.Network.Elastic.Failures.Default; +using Org.Apache.REEF.Tang.Implementations.Tang; +using Org.Apache.REEF.Network.Elastic.Config; +using System.Globalization; +using Org.Apache.REEF.Tang.Util; namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Default { @@ -102,6 +106,14 @@ protected override bool ReactOnTaskMessage(ITaskMessage message, ref List public override void OnReschedule(ref RescheduleEvent rescheduleEvent) { + // Iterators manage the re-schuedling of tasks. If not iterator exists, setup the rescheduling. 
+ if (!WithinIteration) + { + LOGGER.Log(Level.Info, "Going to reschedule task " + rescheduleEvent.TaskId); + + if (!rescheduleEvent.RescheduleTaskConfigurations.TryGetValue(Stage.StageName, out IList confs)) + { + confs = new List(); + rescheduleEvent.RescheduleTaskConfigurations.Add(Stage.StageName, confs); + } + confs.Add(TangFactory.GetTang().NewConfigurationBuilder() + .BindNamedParameter( + GenericType.Class, + true.ToString(CultureInfo.InvariantCulture)) + .Build()); + } + var reconfigureEvent = rescheduleEvent as ReconfigureEvent; OnReconfigure(ref reconfigureEvent); @@ -192,6 +216,10 @@ public override void OnStop(ref StopEvent stopEvent) { _stop = true; } + + var rescheduleEvent = stopEvent as RescheduleEvent; + + OnReschedule(ref rescheduleEvent); } } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs index e15d01b1b5..b4463d732c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs @@ -92,7 +92,7 @@ public override void OnTaskFailure(IFailedTask task, ref List fai } else { - LOGGER.Log(Level.Info, $"Failure from {task.Id} cannot be properly managed: failing"); + LOGGER.Log(Level.Info, $"Failure from {task.Id} cannot be properly managed: failing."); failureEvents.Add(new FailEvent(task.Id)); } @@ -159,7 +159,7 @@ public override void OnTimeout(Alarm alarm, ref List msgs /// The failure event to react upon public override void EventDispatcher(ref IFailureEvent @event) { - if (@event.OperatorId == _id || (@event.OperatorId < 0 && WithinIteration)) + if (@event.OperatorId == _id || @event.OperatorId < 0) { switch ((DefaultFailureStateEvents)@event.FailureEvent) { diff --git 
a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs index a32efc00f8..859d8abd48 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs @@ -195,6 +195,7 @@ public ElasticOperator Broadcast(TopologyType topology, CheckpointLevel check /// /// The task message for the operator /// A list of messages containing the instructions for the task + /// True if the message was managed correctly, false otherwise public void OnTaskMessage(ITaskMessage message, ref List returnMessages) { var hasReacted = ReactOnTaskMessage(message, ref returnMessages); @@ -207,7 +208,7 @@ public void OnTaskMessage(ITaskMessage message, ref List /// /// Add a task to the operator. - /// The bperator must have called Build() before adding tasks. + /// The operator must have called Build() before adding tasks. 
/// /// The id of the task to add /// True if the task is new and is added to the operator @@ -218,10 +219,10 @@ public virtual bool AddTask(string taskId) throw new IllegalStateException("Operator needs to be finalized before adding tasks."); } - if (_operatorStateFinalized) - { - throw new IllegalStateException("Task cannot be added to an operator with finalized state."); - } + //if (_operatorStateFinalized) + //{ + // throw new IllegalStateException("Task cannot be added to an operator with finalized state."); + //} var newTask = _topology.AddTask(taskId, _failureMachine); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs index ea032353e9..5276924e96 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs @@ -18,11 +18,9 @@ using System.Threading; using System.Collections.Generic; using Org.Apache.REEF.Network.Elastic.Topology.Physical.Default; -using Org.Apache.REEF.Network.Elastic.Failures; using System; using Org.Apache.REEF.Network.Elastic.Comm.Impl; using Org.Apache.REEF.Utilities.Logging; -using Org.Apache.REEF.Network.Elastic.Failures.Enum; using Org.Apache.REEF.Utilities.Attributes; using Org.Apache.REEF.Network.Elastic.Operators.Physical.Enum; using Org.Apache.REEF.Network.Elastic.Comm; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs index fa2652c7a3..f3373dcb96 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs @@ -27,12 +27,14 @@ using Org.Apache.REEF.Tang.Exceptions; using Org.Apache.REEF.Network.Elastic.Comm.Impl; using Org.Apache.REEF.Network.Elastic.Topology.Physical; +using 
Org.Apache.REEF.Utilities.Attributes; namespace Org.Apache.REEF.Network.Elastic.Task.Impl { /// /// Handles all incoming / outcoming messages for a given task. /// + [Unstable("0.16", "API may change")] internal abstract class CommunicationLayer : IObserver>> { @@ -169,6 +171,7 @@ public void WaitForTaskRegistration(IList identifiers, CancellationToken } IList foundList = new List(); + for (var i = 0; i < _retryRegistration; i++) { if (cancellationSource != null && cancellationSource.Token.IsCancellationRequested) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs index 4d3b413669..df22ea4069 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs @@ -24,12 +24,14 @@ using Org.Apache.REEF.Utilities.Logging; using Org.Apache.REEF.Network.Elastic.Comm.Impl; using Org.Apache.REEF.Network.Elastic.Topology.Physical; +using Org.Apache.REEF.Utilities.Attributes; namespace Org.Apache.REEF.Network.Elastic.Task.Impl { /// /// Implementation of the communication layer with default task to driver messages. 
/// + [Unstable("0.16", "API may change")] internal sealed class DefaultCommunicationLayer : CommunicationLayer, IDefaultTaskToDriverMessages diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticContext.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticContext.cs index 41780ca439..556a734383 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticContext.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticContext.cs @@ -27,6 +27,7 @@ using Org.Apache.REEF.Network.Elastic.Config; using Org.Apache.REEF.Network.Elastic.Comm.Impl; using Org.Apache.REEF.Common.Tasks.Events; +using Org.Apache.REEF.Utilities.Attributes; namespace Org.Apache.REEF.Network.Elastic.Task.Impl { @@ -34,6 +35,7 @@ namespace Org.Apache.REEF.Network.Elastic.Task.Impl /// Default implementation of the task-side context. /// Used by REEF tasks to initialize group communication and fetch stages. /// + [Unstable("0.16", "API may change")] internal sealed class DefaultElasticContext : IElasticContext { private readonly Dictionary _stages; @@ -69,8 +71,6 @@ public DefaultElasticContext( _disposed = false; _lock = new object(); - System.Threading.Thread.Sleep(10000); - foreach (string serializedGroupConfig in stageConfigs) { IConfiguration stageConfig = configSerializer.FromString(serializedGroupConfig); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs index a198ea2ca6..4c7c3a3a1f 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs @@ -26,12 +26,14 @@ using Org.Apache.REEF.Network.Elastic.Operators.Physical; using Org.Apache.REEF.Utilities.Logging; using Org.Apache.REEF.Network.Elastic.Task.Impl; +using Org.Apache.REEF.Utilities.Attributes; namespace 
Org.Apache.REEF.Network.Elastic.Task { /// /// Default implementation of the task-side stage. /// + [Unstable("0.16", "API may change")] internal sealed class DefaultElasticStage : IElasticStage { private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultElasticStage)); diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/DefaultElasticTask.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticTask.cs similarity index 66% rename from lang/cs/Org.Apache.REEF.Network.Examples/Elastic/DefaultElasticTask.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticTask.cs index 21121e9273..fc05974722 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/DefaultElasticTask.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticTask.cs @@ -17,16 +17,15 @@ using System; using Org.Apache.REEF.Common.Tasks; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Network.Elastic.Task; -using Org.Apache.REEF.Network.Elastic.Task.Impl; using Org.Apache.REEF.Common.Tasks.Events; +using Org.Apache.REEF.Utilities.Attributes; -namespace Org.Apache.REEF.Network.Examples.Elastic +namespace Org.Apache.REEF.Network.Elastic.Task.Default { /// /// Default implementation of a task using the elastic group communication service. /// + [Unstable("0.16", "API may change")] public abstract class DefaultElasticTask : ITask, IObserver { private readonly IElasticContext _context; @@ -34,6 +33,12 @@ public abstract class DefaultElasticTask : ITask, IObserver private readonly CancellationSource _cancellationSource; + /// + /// Constructor for the default task implementation using the elastic group communication service. 
+ /// + /// A cancellation source + /// The elastic context + /// The name of the stage to execute public DefaultElasticTask( CancellationSource source, IElasticContext context, @@ -45,6 +50,11 @@ public DefaultElasticTask( _stage = _context.GetStage(stageName); } + /// + /// Implementation of the Call method of . + /// + /// + /// public byte[] Call(byte[] memento) { _context.WaitForTaskRegistration(_cancellationSource.Source); @@ -64,6 +74,9 @@ public byte[] Call(byte[] memento) return null; } + /// + /// Default implementation of the interface. + /// public void Dispose() { _cancellationSource.Cancel(); @@ -83,6 +96,13 @@ public void OnCompleted() { } + /// + /// Method wrapping the actual task logic. + /// Whatever exception happen inside this method call is managed by + /// the elastic framework. + /// + /// The memento object inherited from the Call method + /// The workflow object managing the sequence of operation to execute protected abstract void Execute(byte[] memento, Workflow workflow); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultTaskToDriverMessageDispatcher.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultTaskToDriverMessageDispatcher.cs index 0d89b65fd4..3190041048 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultTaskToDriverMessageDispatcher.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultTaskToDriverMessageDispatcher.cs @@ -21,6 +21,7 @@ using Org.Apache.REEF.Network.Elastic.Comm.Enum; using Org.Apache.REEF.Utilities; using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Utilities.Attributes; namespace Org.Apache.REEF.Network.Elastic.Task.Impl { @@ -28,6 +29,7 @@ namespace Org.Apache.REEF.Network.Elastic.Task.Impl /// Implemention of with default /// messages dispatcher. 
/// + [Unstable("0.16", "API may change")] internal sealed class DefaultTaskToDriverMessageDispatcher : TaskToDriverMessageDispatcher, IDefaultTaskToDriverMessages { private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultTaskToDriverMessageDispatcher)); @@ -81,7 +83,7 @@ public void TopologyUpdateRequest(string taskId, string stageName, int operatorI offset += sizeof(ushort); Buffer.BlockCopy(BitConverter.GetBytes((ushort)operatorId), 0, message, offset, sizeof(ushort)); - LOGGER.Log(Level.Info, string.Format("Operator {0} requesting a topology update through heartbeat", operatorId)); + LOGGER.Log(Level.Info, string.Format($"Operator {operatorId} requesting a topology update through heartbeat.")); Send(taskId, message); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/IDefaultTaskToDrivermessages.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/IDefaultTaskToDrivermessages.cs index 239cbede79..c3dc804b66 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/IDefaultTaskToDrivermessages.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/IDefaultTaskToDrivermessages.cs @@ -15,11 +15,14 @@ // specific language governing permissions and limitations // under the License. +using Org.Apache.REEF.Utilities.Attributes; + namespace Org.Apache.REEF.Network.Elastic.Task { /// /// Interface defining the messages supported in tasks to driver communications. 
/// + [Unstable("0.16", "API may change")] internal interface IDefaultTaskToDriverMessages { /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs index 843f37e811..bfa533a7aa 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs @@ -21,6 +21,7 @@ using Org.Apache.REEF.Network.Elastic.Topology.Physical; using Org.Apache.REEF.Tang.Annotations; using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Utilities.Attributes; using System.Collections.Concurrent; using System.Collections.Generic; @@ -29,6 +30,7 @@ namespace Org.Apache.REEF.Network.Elastic.Task.Impl /// /// Handler for incoming messages from the driver. /// + [Unstable("0.16", "API may change")] internal sealed class ElasticDriverMessageHandler : IDriverMessageHandler { /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticContext.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticContext.cs index 7677029e75..ac5c908ae4 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticContext.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticContext.cs @@ -19,12 +19,14 @@ using Org.Apache.REEF.Tang.Annotations; using Org.Apache.REEF.Network.Elastic.Task.Impl; using Org.Apache.REEF.Common.Tasks.Events; +using Org.Apache.REEF.Utilities.Attributes; namespace Org.Apache.REEF.Network.Elastic.Task { /// /// Used by REEF tasks to initialize group communication and fetch Stages. 
/// + [Unstable("0.16", "API may change")] [DefaultImplementation(typeof(DefaultElasticContext))] public interface IElasticContext : IWaitForTaskRegistration, diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticStage.cs index b66a1668a3..74b5fac709 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IElasticStage.cs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -using Org.Apache.REEF.Network.Elastic.Task.Impl; using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Utilities.Attributes; using System; namespace Org.Apache.REEF.Network.Elastic.Task @@ -24,6 +24,7 @@ namespace Org.Apache.REEF.Network.Elastic.Task /// /// Used by tasks to fetch the workflow of the stages configured in the driver. /// + [Unstable("0.16", "API may change")] [DefaultImplementation(typeof(DefaultElasticStage))] public interface IElasticStage : IWaitForTaskRegistration, IDisposable { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs index 1324ab492b..e9cd505933 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs @@ -17,6 +17,7 @@ using Org.Apache.REEF.Network.Elastic.Comm.Impl; using Org.Apache.REEF.Network.Elastic.Topology.Physical; +using Org.Apache.REEF.Utilities.Attributes; namespace Org.Apache.REEF.Network.Elastic.Task.Impl { @@ -25,6 +26,7 @@ namespace Org.Apache.REEF.Network.Elastic.Task.Impl /// A node is uniquely identifiable by a combination of its /// , and . 
/// + [Unstable("0.16", "API may change")] internal sealed class NodeObserverIdentifier { private readonly string _stageName; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/TaskToDriverMessageDispatcher.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/TaskToDriverMessageDispatcher.cs index 7a27fc6ea8..ad92fcc610 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/TaskToDriverMessageDispatcher.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/TaskToDriverMessageDispatcher.cs @@ -18,6 +18,7 @@ using Org.Apache.REEF.Common.Runtime.Evaluator; using Org.Apache.REEF.Common.Protobuf.ReefProtocol; using Org.Apache.REEF.Tang.Interface; +using Org.Apache.REEF.Utilities.Attributes; namespace Org.Apache.REEF.Network.Elastic.Task.Impl { @@ -25,6 +26,7 @@ namespace Org.Apache.REEF.Network.Elastic.Task.Impl /// Class used to manage messages going from tasks to the driver. /// Messages are notifying through the heartbeat. /// + [Unstable("0.16", "API may change")] internal abstract class TaskToDriverMessageDispatcher { private readonly IHeartBeatManager _heartBeatManager; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs index 816c1392b7..8d6a8a05b4 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs @@ -15,11 +15,13 @@ // specific language governing permissions and limitations // under the License. 
+using Org.Apache.REEF.Network.Elastic.Config; using Org.Apache.REEF.Network.Elastic.Failures; using Org.Apache.REEF.Network.Elastic.Operators; using Org.Apache.REEF.Network.Elastic.Operators.Physical; using Org.Apache.REEF.Tang.Annotations; using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Utilities.Attributes; using Org.Apache.REEF.Utilities.Logging; using System; using System.Collections; @@ -27,13 +29,14 @@ using System.Linq; using System.Threading; -namespace Org.Apache.REEF.Network.Elastic.Task.Impl +namespace Org.Apache.REEF.Network.Elastic.Task { /// /// Task-side representation of the the sequence of group communication operations to execute. /// Exception rised during execution are managed by the framework and recovered through the user-defined /// policies / mechanisms. /// + [Unstable("0.16", "API may change")] public sealed class Workflow : IEnumerator { private static readonly Logger LOGGER = Logger.GetLogger(typeof(Workflow)); @@ -46,13 +49,16 @@ public sealed class Workflow : IEnumerator private readonly object _lock; private readonly IList _operators; private readonly CancellationSource _cancellationSource; + private readonly bool _isRescheduled; /// /// Injectable constructor. 
/// /// [Inject] - private Workflow(CancellationSource cancellationSource) + private Workflow( + [Parameter(typeof(GroupCommunicationConfigurationOptions.IsRescheduled))] bool isRescheduled, + CancellationSource cancellationSource) { _operators = new List(); _failed = false; @@ -60,6 +66,7 @@ private Workflow(CancellationSource cancellationSource) _lock = new object(); _iteratorsPosition = new List(); _cancellationSource = cancellationSource; + _isRescheduled = isRescheduled; } /// @@ -134,6 +141,11 @@ public bool MoveNext() } } + if (_isRescheduled) + { + Current.OnTaskRescheduled.Invoke(); + } + return true; } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs index f134c93d41..196f82b910 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs @@ -128,7 +128,7 @@ public bool AddTask(string taskId, IFailureStateMachine failureMachine) return false; } - throw new ArgumentException("Task already added to the topology"); + throw new ArgumentException("Task already added to the topology."); } DataNode node = new DataNode(id, false); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs index 0aa9aff30a..7285907fea 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs @@ -27,7 +27,6 @@ using System.Collections.Concurrent; using System.Linq; using Org.Apache.REEF.Utilities.Attributes; -using Org.Apache.REEF.Network.Elastic.Task; namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Default { @@ -183,6 +182,7 @@ public override void OnNext(DriverMessagePayload message) { 
foreach (var node in updates.Children) { + LOGGER.Log(Level.Info, $"Removing task {node} from the topology."); _nodesToRemove.TryAdd(node, new byte()); _commLayer.RemoveConnection(node); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs index dac4b347d0..8f184c8d7c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs @@ -21,7 +21,6 @@ using System.Threading; using System.Linq; using Org.Apache.REEF.Network.NetworkService; -using Org.Apache.REEF.Network.Elastic.Task; using Org.Apache.REEF.Tang.Exceptions; using Org.Apache.REEF.Network.Elastic.Comm.Impl; using Org.Apache.REEF.Utilities.Attributes; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs index 0b5635b85f..5a56df14f4 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs @@ -18,6 +18,7 @@ using Org.Apache.REEF.Network.Elastic.Comm.Impl; using Org.Apache.REEF.Network.Elastic.Task; using Org.Apache.REEF.Network.NetworkService; +using Org.Apache.REEF.Utilities.Attributes; using System; namespace Org.Apache.REEF.Network.Elastic.Topology.Physical @@ -25,6 +26,7 @@ namespace Org.Apache.REEF.Network.Elastic.Topology.Physical /// /// Base interface for topologies where nodes communicate betwen themselves. 
/// + [Unstable("0.16", "API may change")] internal interface IOperatorTopologyWithCommunication : IWaitForTaskRegistration, IDisposable, diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs index d49f204812..a2fdf504c2 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs @@ -26,7 +26,7 @@ namespace Org.Apache.REEF.Network.Elastic /// Utility class. /// [Unstable("0.16", "API may change")] - internal static class Utils + public static class Utils { /// /// Gets the context number associated with the active context id. From 897247a4268638cb466ae3692718432433a4a469 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Fri, 4 Jan 2019 15:07:52 -0800 Subject: [PATCH 09/29] bug fixes --- .../Run.cs | 3 +- .../ElasticBroadcastDriverWithFailures.cs | 2 +- .../BroadcastSlaveTaskDieMultiple.cs | 2 +- .../Elastic/Comm/ITaskMessageResponse.cs | 1 + .../Driver/Default/DefaultElasticContext.cs | 1 + .../Driver/Default/DefaultElasticDriver.cs | 1 - .../Driver/Default/DefaultElasticStage.cs | 25 ++++++++--- .../Default/DefaultElasticTaskSetManager.cs | 25 +++++++++-- .../Elastic/Driver/IElasticStage.cs | 7 ++- .../Default/DefaultFailureStateMachine.cs | 43 +++++++++++++++++++ .../Failures/Default/DefaultFailureStates.cs | 6 ++- .../Elastic/Failures/IFailureResponse.cs | 1 + .../Elastic/Failures/IFailureStateMachine.cs | 5 +++ .../Operators/Logical/Default/DefaultEmpty.cs | 7 +++ .../Logical/Default/DefaultOneToN.cs | 31 ++++++------- .../ElastiOperatorWithDefaultDispatcher.cs | 20 ++++++--- .../Operators/Logical/ElasticOperator.cs | 15 ++++--- .../Physical/Default/DefaultOneToN.cs | 5 ++- .../Elastic/Task/Workflow.cs | 2 +- 19 files changed, 157 insertions(+), 45 deletions(-) diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs index a161d967f1..4b7954ac2a 100644 --- 
a/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs @@ -29,7 +29,7 @@ public static void Main(string[] args) int numNodes = 5; int startPort = 8900; int portRange = 1000; - string testToRun = "ElasticBroadcastWithMultipleFailures"; + string testToRun = "ElasticBroadcastWithFailEvaluatorBeforeWorkflow"; testToRun = testToRun.ToLower(); if (args != null) @@ -90,7 +90,6 @@ public static void Main(string[] args) if (testToRun.Equals("ElasticBroadcastWithFailureInConstructor".ToLower()) || testToRun.Equals("all")) { - // This stage should fail new ElasticBroadcastClientWithFailureInConstructor(runOnYarn, numNodes, startPort, portRange); Console.WriteLine("ElasticBroadcastWithFailureInConstructor completed!!!"); } diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs index 508fe3402e..3e85ae9fb8 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs @@ -47,7 +47,7 @@ protected ElasticBroadcastDriverWithFailures( Tuple.Create(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReconfigure) as IFailureState, 0.01F), Tuple.Create(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReschedule) as IFailureState, 0.40F), Tuple.Create(new DefaultFailureState((int)DefaultFailureStates.StopAndReschedule) as IFailureState, 0.60F), - Tuple.Create(new DefaultFailureState((int)DefaultFailureStates.Fail) as IFailureState, 0.80F) + Tuple.Create(new DefaultFailureState((int)DefaultFailureStates.Fail) as IFailureState, 0.601F) }); IElasticStage stage = Context.CreateNewStage(stageName, numEvaluators, failureMachine); diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs 
b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs index e53d76e4c8..4843d7bb22 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs @@ -26,7 +26,7 @@ namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class BroadcastSlaveTaskDieMultiple : DefaultElasticTask { - private const int _failProb = 80; + private const int _failProb = 70; private readonly Random _rand = new Random(); [Inject] diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs index 7dd8db1732..f59572d228 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs @@ -32,6 +32,7 @@ public interface ITaskMessageResponse /// /// The task message for the operator /// A list of messages containing the instructions for the task + /// If the message cannot be handled correctly or generate an incorrent state void OnTaskMessage(ITaskMessage message, ref List returnMessages); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs index 91af722405..4e83ea02ad 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs @@ -307,6 +307,7 @@ public void SerializeOperatorConfiguration(ref IList serializedOperators /// /// The failed task /// A list of events encoding the type of actions to be triggered so far + /// If the task failure cannot be properly handled public void OnTaskFailure(IFailedTask value, ref List failureEvents) { var task = value.Id; diff --git 
a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticDriver.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticDriver.cs index f975f65398..4da4bfe303 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticDriver.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticDriver.cs @@ -60,7 +60,6 @@ public void OnNext(IDriverStarted value) public void OnNext(IAllocatedEvaluator allocatedEvaluator) { - Thread.Sleep(10000); if (TaskSetManager.TryGetNextTaskContextId(allocatedEvaluator, out string identifier)) { IConfiguration contextConf = ContextConfiguration.ConfigurationModule diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs index 459f82efdf..09a6cc89dc 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs @@ -85,7 +85,6 @@ internal DefaultElasticStage( _tasksAdded = 0; _masterTasks = new HashSet(); _datasetConfiguration = Optional.Empty(); - IsCompleted = false; Context = elasticService; _failureMachine = failureMachine ?? new DefaultFailureStateMachine(numTasks, DefaultFailureStates.Fail); FailureState = _failureMachine.State; @@ -122,7 +121,10 @@ internal DefaultElasticStage( /// /// Whether the stages is completed or not. /// - public bool IsCompleted { get; set; } + public bool IsCompleted + { + get { return FailureState.FailureState == (int)DefaultFailureStates.Complete; } + } /// /// Generates an id to uniquely identify operators in the stages. @@ -332,6 +334,17 @@ public Optional GetPartitionConf(string taskId) return Optional.Of(_datasetConfiguration.Value[index]); } + /// + /// Method used to signal that the stage state can be moved to complete. 
+ /// + public void Complete() + { + lock (_statusLock) + { + FailureState = FailureState.Merge(_failureMachine.Complete()); + } + } + /// /// Retrieve the log the final statistics of the computation: this is the sum of all /// the stats of all the Operators compising the stage. This method can be called @@ -355,6 +368,7 @@ public string LogFinalStatistics() /// /// The task message for the operator /// A list of messages containing the instructions for the task + /// If the message cannot be handled correctly or generate an incorrent state public void OnTaskMessage(ITaskMessage message, ref List returnMessages) { int offset = 0; @@ -389,6 +403,7 @@ public void OnTimeout(Alarm alarm, ref List msgs, ref Lis /// /// The failed task /// A list of events encoding the type of actions to be triggered so far + /// If the task failure cannot be properly handled public void OnTaskFailure(IFailedTask task, ref List failureEvents) { // Failures have to be propagated down to the operators @@ -438,7 +453,7 @@ public void OnReconfigure(ref ReconfigureEvent reconfigureEvent) { lock (_statusLock) { - FailureState.Merge(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReconfigure)); + FailureState = FailureState.Merge(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReconfigure)); } } @@ -450,7 +465,7 @@ public void OnReschedule(ref RescheduleEvent rescheduleEvent) { lock (_statusLock) { - FailureState.Merge(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReschedule)); + FailureState = FailureState.Merge(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReschedule)); } } @@ -462,7 +477,7 @@ public void OnStop(ref StopEvent stopEvent) { lock (_statusLock) { - FailureState.Merge(new DefaultFailureState((int)DefaultFailureStates.StopAndReschedule)); + FailureState = FailureState.Merge(new DefaultFailureState((int)DefaultFailureStates.StopAndReschedule)); } } diff --git 
a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs index 34ecbe7d88..f315beeeec 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs @@ -725,9 +725,17 @@ public void OnTaskMessage(ITaskMessage message) var returnMessages = new List(); _hasProgress = true; - foreach (var stage in _stages.Values) + try + { + foreach (var stage in _stages.Values) + { + stage.OnTaskMessage(message, ref returnMessages); + } + } + catch (IllegalStateException e) { - stage.OnTaskMessage(message, ref returnMessages); + LOGGER.Log(Level.Error, e.Message, e); + Fail(message.TaskId); } SendToTasks(returnMessages); @@ -821,6 +829,7 @@ public void OnTimeout(Alarm alarm, ref List msgs, ref Lis /// /// The failed task /// A list of events encoding the type of actions to be triggered so far + /// If the task failure cannot be properly handled public void OnTaskFailure(IFailedTask task, ref List failureEvents) { @@ -856,9 +865,17 @@ public void OnTaskFailure(IFailedTask task, ref List failureEvent _taskInfos[id].SetTaskStatus(TaskState.Failed); } - foreach (var stage in _taskInfos[id].Stages) + try + { + foreach (var stage in _taskInfos[id].Stages) + { + stage.OnTaskFailure(task, ref failureEvents); + } + } + catch (Exception e) { - stage.OnTaskFailure(task, ref failureEvents); + LOGGER.Log(Level.Error, e.Message, e); + Fail(task.Id); } // Failures have to be propagated up to the context diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs index a937f17f9b..a888330ffa 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticStage.cs @@ -57,7 +57,7 @@ public interface 
IElasticStage : IFailureResponse, ITaskMessageResponse /// /// Whether the stages is completed or not. /// - bool IsCompleted { get; set; } + bool IsCompleted { get; } /// /// Whether the stages contains iterations or not. @@ -133,6 +133,11 @@ public interface IElasticStage : IFailureResponse, ITaskMessageResponse /// The configuration of the data partition (if any) of the task Optional GetPartitionConf(string taskId); + /// + /// Method used to signal that the stage state can be moved to complete. + /// + void Complete(); + /// /// Retrieve the log the final statistics of the computation: this is the sum of all /// the stats of all the Operators compising the stage. This method can be called diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs index 258d7a8349..4dba3b5e4f 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs @@ -62,6 +62,19 @@ public sealed class DefaultFailureStateMachine : IFailureStateMachine { DefaultFailureStates.Fail, 0.80F } }; + private static List canMoveToComplete = new List() + { + (int)DefaultFailureStates.Continue, + (int)DefaultFailureStates.ContinueAndReconfigure, + (int)DefaultFailureStates.ContinueAndReschedule, + (int)DefaultFailureStates.Complete + }; + + private static List isFinalState = new List() + { + (int)DefaultFailureStates.Complete + }; + /// /// Default failure state machine starting with 0 data points and in continue state. 
/// @@ -112,6 +125,11 @@ public IFailureState AddDataPoints(int points, bool isNew) { lock (_statusLock) { + if (isFinalState.Contains(State.FailureState)) + { + return State; + } + if (isNew) { NumOfDataPoints += points; @@ -147,6 +165,11 @@ public IFailureState RemoveDataPoints(int points) float currentRate = (float)NumOfFailedDataPoints / NumOfDataPoints; + if (isFinalState.Contains(State.FailureState) && currentRate >= transitionWeights[DefaultFailureStates.StopAndReschedule]) + { + throw new IllegalStateException("Received remove data point when state is complete: failing."); + } + while (State.FailureState < (int)DefaultFailureStates.Fail && currentRate > transitionWeights[transitionMapUp[(DefaultFailureStates)State.FailureState]]) { @@ -157,6 +180,26 @@ public IFailureState RemoveDataPoints(int points) } } + /// + /// Signal the state machine to move into complete state. + /// + public IFailureState Complete() + { + lock (_statusLock) + { + if (canMoveToComplete.Contains(State.FailureState)) + { + State.FailureState = (int)DefaultFailureStates.Complete; + } + else + { + throw new IllegalStateException($"Failure machine cannot move from state {State.FailureState} to Complete: failing."); + } + } + + return State; + } + /// /// Method used to set or update the current threshold connected with /// a target failure state. 
The assumption is that higher failure states diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStates.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStates.cs index 4ed161ce39..9d6d1fa0cb 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStates.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStates.cs @@ -34,6 +34,10 @@ public enum DefaultFailureStates : int StopAndReschedule = 3, // When a failre is detected, stop the computation and try to reschedule the task - Fail = 4 // Fail + Fail = 4, // Fail + + Complete = 5 // Complete, final state + + } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureResponse.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureResponse.cs index d2214f75d0..efa5462674 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureResponse.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureResponse.cs @@ -36,6 +36,7 @@ public interface IFailureResponse /// /// The failed task /// A list of events encoding the type of actions to be triggered so far + /// If the task failure cannot be properly handled void OnTaskFailure(IFailedTask task, ref List failureEvents); /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs index c9aa1a5321..f59d324bd0 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs @@ -83,6 +83,11 @@ public interface IFailureStateMachine /// A failure event resulting from the removal of the data points IFailureState RemoveDataPoints(int points); + /// + /// Signal the state machine to move into complete state. 
+ /// + IFailureState Complete(); + /// /// Utility method used to clone the target failure machine. /// Only the thresholds are cloned, while the machine state is not. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultEmpty.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultEmpty.cs index 4adef843c1..2e55c2bd84 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultEmpty.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultEmpty.cs @@ -45,6 +45,13 @@ public DefaultEmpty(IElasticStage stage, IFailureStateMachine failureMachine) : WithinIteration = false; } + /// + /// Used to react on a failure occurred on a task. + /// It gets a failed task as input and in response it produces zero or more failure events. + /// + /// The failed task + /// A list of events encoding the type of actions to be triggered so far + /// If the task failure cannot be properly handled public override void OnTaskFailure(IFailedTask task, ref List failureEvents) { if (_next != null) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs index 6608b37fbe..63bd4d30cf 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs @@ -32,6 +32,8 @@ using Org.Apache.REEF.Network.Elastic.Config; using System.Globalization; using Org.Apache.REEF.Tang.Util; +using Org.Apache.REEF.Tang.Exceptions; +using Org.Apache.REEF.Network.Elastic.Operators.Physical.Enum; namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Default { @@ -106,14 +108,6 @@ protected override bool ReactOnTaskMessage(ITaskMessage message, ref List.Of(_failureMachine)); - } - else + _topology.TopologyUpdateResponse(message.TaskId, ref returnMessages, 
Optional.Of(_failureMachine)); + + if (_stop) { - LOGGER.Log(Level.Info, $"Operator {OperatorName} is in stopped: Waiting."); + if (_failureMachine.State.FailureState < (int)DefaultFailureStates.StopAndReschedule) + { + _stop = false; + } + else + { + returnMessages.Clear(); + LOGGER.Log(Level.Info, $"Operator {OperatorName} is in stopped: Waiting."); + } } return true; } case TaskMessageType.CompleteStage: { - Stage.IsCompleted = true; + _failureMachine.Complete(); + Stage.Complete(); return true; } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs index b4463d732c..30e7a7bef5 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs @@ -81,6 +81,7 @@ public override ElasticOperator Broadcast(int senderId, ITopology topology, I /// /// The failed task /// A list of events encoding the type of actions to be triggered so far + /// If the task failure cannot be properly handled public override void OnTaskFailure(IFailedTask task, ref List failureEvents) { var failedOperatorId = _id; @@ -90,11 +91,11 @@ public override void OnTaskFailure(IFailedTask task, ref List fai var opException = task.AsError() as OperatorException; failedOperatorId = opException.OperatorId; } - else - { - LOGGER.Log(Level.Info, $"Failure from {task.Id} cannot be properly managed: failing."); - failureEvents.Add(new FailEvent(task.Id)); - } + //else + //{ + // LOGGER.Log(Level.Info, $"Failure from {task.Id} cannot be properly managed: failing."); + // failureEvents.Add(new FailEvent(task.Id)); + //} if (WithinIteration || failedOperatorId <= _id) { @@ -117,7 +118,14 @@ public override void OnTaskFailure(IFailedTask task, ref List fai } break; case 
DefaultFailureStates.StopAndReschedule: - failureEvents.Add(new Failures.Default.StopEvent(task.Id)); + { + var @event = new StopEvent(task.Id); + if (failedOperatorId == _id) + { + @event.FailedTask = Optional.Of(task); + } + failureEvents.Add(@event); + } break; case DefaultFailureStates.Fail: failureEvents.Add(new FailEvent(task.Id)); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs index 859d8abd48..6e92de0591 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs @@ -196,6 +196,7 @@ public ElasticOperator Broadcast(TopologyType topology, CheckpointLevel check /// The task message for the operator /// A list of messages containing the instructions for the task /// True if the message was managed correctly, false otherwise + /// If the message cannot be handled correctly or generate an incorrent state public void OnTaskMessage(ITaskMessage message, ref List returnMessages) { var hasReacted = ReactOnTaskMessage(message, ref returnMessages); @@ -214,17 +215,18 @@ public void OnTaskMessage(ITaskMessage message, ref List /// True if the task is new and is added to the operator public virtual bool AddTask(string taskId) { + var newTask = false; + if (!_operatorFinalized) { throw new IllegalStateException("Operator needs to be finalized before adding tasks."); } - //if (_operatorStateFinalized) - //{ - // throw new IllegalStateException("Task cannot be added to an operator with finalized state."); - //} - - var newTask = _topology.AddTask(taskId, _failureMachine); + if (!_operatorStateFinalized) + { + // If state is finalized tasks can join the topology only explicitly. 
+ newTask = _topology.AddTask(taskId, _failureMachine); + } if (_next != null) { @@ -331,6 +333,7 @@ public virtual bool CheckIfLastIterator() /// /// The failed task /// A list of events encoding the type of actions to be triggered so far + /// If the task failure cannot be properly handled public abstract void OnTaskFailure(IFailedTask task, ref List failureEvents); /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs index 5276924e96..bda3429de3 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs @@ -40,6 +40,7 @@ public abstract class DefaultOneToN internal volatile PositionTracker _position; private readonly bool _isLast; + private bool _cleanDisposal; /// /// Creates a new one to N operator. @@ -54,6 +55,7 @@ internal DefaultOneToN(int id, bool isLast, OneToNTopology topology) _isLast = isLast; _topology = topology; _position = PositionTracker.Nil; + _cleanDisposal = false; OnTaskRescheduled = new Action(() => { @@ -175,6 +177,7 @@ public void WaitForTaskRegistration(CancellationTokenSource cancellationSource) public void WaitCompletionBeforeDisposing() { _topology.WaitCompletionBeforeDisposing(CancellationSource); + _cleanDisposal = true; } /// @@ -182,7 +185,7 @@ public void WaitCompletionBeforeDisposing() /// public void Dispose() { - if (_isLast) + if (_isLast && _cleanDisposal) { _topology.StageComplete(); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs index 8d6a8a05b4..5ad43a9edd 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs @@ -212,7 +212,7 @@ public void Dispose() if (_operators != null) { // Clean dispose, check that 
the computation is completed - if (_failed == false) + if (!_failed) { foreach (var op in _operators) { From 87f0e7bf50ada3604f232be9be697e65ae12d0c3 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Tue, 15 Jan 2019 13:42:04 -0800 Subject: [PATCH 10/29] Addressing frist round of reviews from Sergiy. --- .../ElasticBroadcastClient.cs | 67 +++++++---- .../Run.cs | 110 +++++++++++++----- .../Elastic/BroadcastMasterTask.cs | 17 ++- .../Elastic/BroadcastSlaveTask.cs | 16 ++- .../Elastic/ElasticBroadcastDriver.cs | 3 +- .../ElasticBroadcastDriverWithFailures.cs | 21 +++- .../BroadcastSlaveTaskDieAfterBroadcast.cs | 27 +++-- .../BroadcastSlaveTaskDieBeforeBroadcast.cs | 27 +++-- .../BroadcastSlaveTaskDieBeforeWorkflow.cs | 15 ++- ...castSlaveTaskDieEvaluatorBeforeWorkflow.cs | 15 ++- .../BroadcastSlaveTaskDieInConstructor.cs | 15 ++- .../BroadcastSlaveTaskDieMultiple.cs | 15 ++- ...BroadcastSlaveTaskDieMultipleEvaluators.cs | 79 +++++++++++++ ...stDriverWithFailEvaluatorBeforeWorkflow.cs | 22 ++-- ...roadcastDriverWithFailureAfterBroadcast.cs | 21 ++-- ...oadcastDriverWithFailureBeforeBroadcast.cs | 15 ++- ...roadcastDriverWithFailureBeforeWorkflow.cs | 15 ++- ...BroadcastDriverWithFailureInConstructor.cs | 15 ++- ...sticBroadcastDriverWithMultipleFailures.cs | 11 +- .../Driver/Default/DefaultElasticDriver.cs | 3 +- .../Logical/Default/DefaultBroadcast.cs | 6 +- .../Operators/Logical/Default/DefaultEmpty.cs | 10 +- .../Logical/Default/DefaultOneToN.cs | 8 +- .../ElastiOperatorWithDefaultDispatcher.cs | 14 ++- .../Operators/Logical/ElasticOperator.cs | 21 ++-- .../{Constants.cs => OperatorType.cs} | 37 ++++-- .../Physical/Default/DefaultBroadcast.cs | 6 +- .../Physical/Default/DefaultOneToN.cs | 10 +- .../Operators/Physical/IElasticOperator.cs | 8 +- .../Elastic/Task/Workflow.cs | 105 +++++++---------- 30 files changed, 486 insertions(+), 268 deletions(-) create mode 100644 
lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultipleEvaluators.cs rename lang/cs/Org.Apache.REEF.Network/Elastic/Operators/{Constants.cs => OperatorType.cs} (53%) diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs index 5dbb175520..fab5e70fc6 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs @@ -38,12 +38,17 @@ public class ElasticBroadcastClient const string Yarn = "yarn"; const string DefaultRuntimeFolder = "REEF_LOCAL_RUNTIME"; - public ElasticBroadcastClient(bool runOnYarn, int numTasks, int startingPortNo, int portRange) + public ElasticBroadcastClient( + bool runOnYarn, + int numTasks, + int startingPortNo, + int portRange) { const string driverId = "ElasticBroadcastDriver"; const string stage = "Broadcast"; - IConfiguration driverConfig = TangFactory.GetTang().NewConfigurationBuilder(GetDriverConf()) + IConfiguration driverConfig = TangFactory.GetTang() + .NewConfigurationBuilder(GetDriverConf()) .BindNamedParameter( GenericType.Class, numTasks.ToString(CultureInfo.InvariantCulture)) @@ -55,21 +60,36 @@ public ElasticBroadcastClient(bool runOnYarn, int numTasks, int startingPortNo, portRange.ToString(CultureInfo.InvariantCulture)) .Build(); - IConfiguration elsticGroupCommServiceDriverConfig = TangFactory.GetTang().NewConfigurationBuilder() + IConfiguration elsticGroupCommServiceDriverConfig = TangFactory.GetTang() + .NewConfigurationBuilder() .BindStringNamedParam(driverId) .BindStringNamedParam(stage) - .BindIntNamedParam(numTasks.ToString(CultureInfo.InvariantCulture)) + .BindIntNamedParam( + numTasks.ToString(CultureInfo.InvariantCulture)) .Build(); - IConfiguration merged = Configurations.Merge(driverConfig, elsticGroupCommServiceDriverConfig); + IConfiguration merged = Configurations + 
.Merge(driverConfig, elsticGroupCommServiceDriverConfig); string runPlatform = runOnYarn ? "yarn" : "local"; - TestRun(merged, typeof(ElasticBroadcastDriver), numTasks, JobIdentifier, runPlatform); + TestRun( + merged, + typeof(ElasticBroadcastDriver), + numTasks, + JobIdentifier, + runPlatform); } - internal static void TestRun(IConfiguration driverConfig, Type globalAssemblyType, int numberOfEvaluator, string jobIdentifier = "myDriver", string runOnYarn = "local", string runtimeFolder = DefaultRuntimeFolder) + internal static void TestRun( + IConfiguration driverConfig, + Type globalAssemblyType, + int numberOfEvaluator, + string jobIdentifier = "myDriver", + string runOnYarn = "local", + string runtimeFolder = DefaultRuntimeFolder) { - IInjector injector = TangFactory.GetTang().NewInjector(GetRuntimeConfiguration(runOnYarn, numberOfEvaluator, runtimeFolder)); + IInjector injector = TangFactory.GetTang() + .NewInjector(GetRuntimeConfiguration(runOnYarn, numberOfEvaluator, runtimeFolder)); var reefClient = injector.GetInstance(); var jobRequestBuilder = injector.GetInstance(); var jobSubmission = jobRequestBuilder @@ -81,20 +101,25 @@ internal static void TestRun(IConfiguration driverConfig, Type globalAssemblyTyp reefClient.SubmitAndGetJobStatus(jobSubmission); } - internal static IConfiguration GetRuntimeConfiguration(string runOnYarn, int numberOfEvaluator, string runtimeFolder) + internal static IConfiguration GetRuntimeConfiguration( + string runOnYarn, + int numberOfEvaluator, + string runtimeFolder) { switch (runOnYarn) { case Local: var dir = Path.Combine(".", runtimeFolder); return LocalRuntimeClientConfiguration.ConfigurationModule - .Set(LocalRuntimeClientConfiguration.NumberOfEvaluators, numberOfEvaluator.ToString()) + .Set( + LocalRuntimeClientConfiguration.NumberOfEvaluators, + numberOfEvaluator.ToString()) .Set(LocalRuntimeClientConfiguration.RuntimeFolder, dir) .Build(); case Yarn: return YARNClientConfiguration.ConfigurationModule.Build(); 
default: - throw new Exception("Unknown runtime: " + runOnYarn); + throw new ArgumentException("Unknown runtime: " + runOnYarn); } } @@ -106,16 +131,16 @@ protected virtual string JobIdentifier protected virtual IConfiguration GetDriverConf() { return DriverConfiguration.ConfigurationModule - .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class) - .Set(DriverConfiguration.OnContextActive, GenericType.Class) - .Set(DriverConfiguration.OnTaskRunning, GenericType.Class) - .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class) - .Set(DriverConfiguration.OnTaskFailed, GenericType.Class) - .Set(DriverConfiguration.OnTaskMessage, GenericType.Class) - .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString()) - .Build(); + .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) + .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class) + .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class) + .Set(DriverConfiguration.OnContextActive, GenericType.Class) + .Set(DriverConfiguration.OnTaskRunning, GenericType.Class) + .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class) + .Set(DriverConfiguration.OnTaskFailed, GenericType.Class) + .Set(DriverConfiguration.OnTaskMessage, GenericType.Class) + .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString()) + .Build(); } } } diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs index 4b7954ac2a..bab9f96f31 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs @@ -15,11 +15,24 @@ // specific language governing permissions and limitations // under the License. 
-using System; using Org.Apache.REEF.Network.Examples.GroupCommunication; +using System; namespace Org.Apache.REEF.Network.Examples.Client { + internal enum TestType + { + PipelineBroadcastAndReduce, + BroadcastAndReduce, + ElasticBroadcast, + ElasticBroadcastWithFailureInConstructor, + ElasticBroadcastWithFailureBeforeWorkflow, + ElasticBroadcastWithFailEvaluatorBeforeWorkflow, + ElasticBroadcastWithFailureBeforeBroadcast, + ElasticBroadcastWithFailureAfterBroadcast, + ElasticBroadcastWithMultipleFailures + } + public class Run { public static void Main(string[] args) @@ -30,7 +43,6 @@ public static void Main(string[] args) int startPort = 8900; int portRange = 1000; string testToRun = "ElasticBroadcastWithFailEvaluatorBeforeWorkflow"; - testToRun = testToRun.ToLower(); if (args != null) { @@ -56,11 +68,11 @@ public static void Main(string[] args) if (args.Length > 4) { - testToRun = args[4].ToLower(); + testToRun = args[4]; } } - if (testToRun.Equals("RunPipelineBroadcastAndReduce".ToLower()) || testToRun.Equals("all")) + if (TestType.PipelineBroadcastAndReduce.Match(testToRun)) { int arraySize = GroupTestConstants.ArrayLength; int chunkSize = GroupTestConstants.ChunkSize; @@ -71,64 +83,100 @@ public static void Main(string[] args) chunkSize = int.Parse(args[6]); } - new PipelineBroadcastAndReduceClient().RunPipelineBroadcastAndReduce(runOnYarn, numNodes, startPort, - portRange, arraySize, chunkSize); - Console.WriteLine("RunPipelineBroadcastAndReduce completed!!!"); + new PipelineBroadcastAndReduceClient().RunPipelineBroadcastAndReduce( + runOnYarn, + numNodes, + startPort, + portRange, + arraySize, + chunkSize); + Console.WriteLine("PipelineBroadcastAndReduce completed!!!"); } - if (testToRun.Equals("RunBroadcastAndReduce".ToLower()) || testToRun.Equals("all")) + if (TestType.BroadcastAndReduce.Match(testToRun)) { - new BroadcastAndReduceClient().RunBroadcastAndReduce(runOnYarn, numNodes, startPort, portRange); - Console.WriteLine("RunBroadcastAndReduce 
completed!!!"); + new BroadcastAndReduceClient().RunBroadcastAndReduce( + runOnYarn, + numNodes, + startPort, + portRange); + Console.WriteLine("BroadcastAndReduce completed!!!"); } - if (testToRun.Equals("ElasticBroadcast".ToLower()) || testToRun.Equals("all")) + if (TestType.ElasticBroadcast.Match(testToRun)) { new ElasticBroadcastClient(runOnYarn, numNodes, startPort, portRange); - Console.WriteLine("ElasticRunBroadcast completed!!!"); + Console.WriteLine("ElasticBroadcast completed!!!"); } - if (testToRun.Equals("ElasticBroadcastWithFailureInConstructor".ToLower()) || testToRun.Equals("all")) + if (TestType.ElasticBroadcastWithFailureInConstructor.Match(testToRun)) { - new ElasticBroadcastClientWithFailureInConstructor(runOnYarn, numNodes, startPort, portRange); + new ElasticBroadcastClientWithFailureInConstructor( + runOnYarn, + numNodes, + startPort, + portRange); Console.WriteLine("ElasticBroadcastWithFailureInConstructor completed!!!"); } - if (testToRun.Equals("ElasticBroadcastWithFailureBeforeWorkflow".ToLower()) || testToRun.Equals("all")) + if (TestType.ElasticBroadcastWithFailureBeforeWorkflow.Match(testToRun)) { - new ElasticBroadcastClientWithFailureBeforeWorkflow(runOnYarn, numNodes, startPort, portRange); + new ElasticBroadcastClientWithFailureBeforeWorkflow( + runOnYarn, + numNodes, + startPort, + portRange); Console.WriteLine("ElasticBroadcastWithFailureBeforeWorkflow completed!!!"); } - if (testToRun.Equals("ElasticBroadcastWithFailEvaluatorBeforeWorkflow".ToLower()) || testToRun.Equals("all")) + if (TestType.ElasticBroadcastWithFailEvaluatorBeforeWorkflow.Match(testToRun)) { - new ElasticBroadcastClientWithFailEvaluatorBeforeWorkflow(runOnYarn, numNodes, startPort, portRange); + new ElasticBroadcastClientWithFailEvaluatorBeforeWorkflow( + runOnYarn, + numNodes, + startPort, + portRange); Console.WriteLine("ElasticBroadcastWithFailEvaluatorBeforeWorkflow completed!!!"); } - if 
(testToRun.Equals("ElasticBroadcastWithFailureBeforeBroadcast".ToLower()) || testToRun.Equals("all")) + if (TestType.ElasticBroadcastWithFailureBeforeBroadcast.Match(testToRun)) { - new ElasticBroadcastClientWithFailureBeforeBroadcast(runOnYarn, numNodes, startPort, portRange); + new ElasticBroadcastClientWithFailureBeforeBroadcast( + runOnYarn, + numNodes, + startPort, + portRange); Console.WriteLine("ElasticBroadcastWithFailureBeforeBroadcast completed!!!"); } - if (testToRun.Equals("ElasticBroadcastWithFailureAfterBroadcast".ToLower()) || testToRun.Equals("all")) - { - new ElasticBroadcastClientWithFailureAfterBroadcast(runOnYarn, numNodes, startPort, portRange); - Console.WriteLine("ElasticBroadcastWithFailureAfterBroadcast completed!!!"); - } - - if (testToRun.Equals("ElasticBroadcastWithFailureAfterBroadcast".ToLower()) || testToRun.Equals("all")) + if (TestType.ElasticBroadcastWithFailureAfterBroadcast.Match(testToRun)) { - new ElasticBroadcastClientWithFailureAfterBroadcast(runOnYarn, numNodes, startPort, portRange); + new ElasticBroadcastClientWithFailureAfterBroadcast( + runOnYarn, + numNodes, + startPort, + portRange); Console.WriteLine("ElasticBroadcastWithFailureAfterBroadcast completed!!!"); } - if (testToRun.Equals("ElasticBroadcastWithMultipleFailures".ToLower()) || testToRun.Equals("all")) + if (TestType.ElasticBroadcastWithMultipleFailures.Match(testToRun)) { - new ElasticBroadcastClientWithMultipleFailures(runOnYarn, numNodes, startPort, portRange); + new ElasticBroadcastClientWithMultipleFailures( + runOnYarn, + numNodes, + startPort, + portRange); Console.WriteLine("ElasticBroadcastWithMultipleFailures completed!!!"); } } } -} + + internal static class TestTypeMatcher + { + public static bool Match(this TestType test, string name) + { + name = name.ToLower(); + return name.Equals("all") || test.ToString().ToLower().Equals(name); + } + } +} \ No newline at end of file diff --git 
a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs index c00ba3995a..aa65b066d3 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs @@ -21,11 +21,14 @@ using Org.Apache.REEF.Network.Elastic.Operators; using Org.Apache.REEF.Network.Elastic.Operators.Physical; using Org.Apache.REEF.Network.Elastic.Task.Default; +using Org.Apache.REEF.Utilities.Logging; namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class BroadcastMasterTask : DefaultElasticTask { + private static readonly Logger LOGGER = Logger.GetLogger(typeof(BroadcastMasterTask)); + [Inject] private BroadcastMasterTask(CancellationSource source, IElasticContext context) : base(source, context, "Broadcast") @@ -37,23 +40,25 @@ protected override void Execute(byte[] memento, Workflow workflow) var rand = new Random(); int number = 0; - while (workflow.MoveNext()) + foreach (var op in workflow) { number = rand.Next(); - switch (workflow.Current.OperatorName) + switch (op.OperatorType) { - case Constants.Broadcast: + case OperatorType.Broadcast: var sender = workflow.Current as IElasticBroadcast; sender.Send(number); - Console.WriteLine($"Master has sent {number}"); + LOGGER.Log(Level.Info, $"Master has sent {number}"); break; + default: - throw new InvalidOperationException($"Operation {workflow.Current} in workflow not implemented."); + throw new InvalidOperationException( + $"Operation {workflow.Current} in workflow not implemented."); } } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs index e5a2d092ff..2057bc4db3 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs +++ 
b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs @@ -21,11 +21,13 @@ using Org.Apache.REEF.Network.Elastic.Operators.Physical; using Org.Apache.REEF.Network.Elastic.Operators; using Org.Apache.REEF.Network.Elastic.Task.Default; +using Org.Apache.REEF.Utilities.Logging; namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class BroadcastSlaveTask : DefaultElasticTask { + private static readonly Logger LOGGER = Logger.GetLogger(typeof(BroadcastSlaveTask)); [Inject] public BroadcastSlaveTask(CancellationSource source, IElasticContext context) @@ -35,21 +37,23 @@ public BroadcastSlaveTask(CancellationSource source, IElasticContext context) protected override void Execute(byte[] memento, Workflow workflow) { - while (workflow.MoveNext()) + foreach (var op in workflow) { - switch (workflow.Current.OperatorName) + switch (op.OperatorType) { - case Constants.Broadcast: + case OperatorType.Broadcast: var receiver = workflow.Current as IElasticBroadcast; var rec = receiver.Receive(); - Console.WriteLine($"Slave has received {rec}"); + LOGGER.Log(Level.Info, $"Slave has received {rec}"); break; + default: - break; + throw new InvalidOperationException( + $"Operation {workflow.Current} in workflow not implemented."); } } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs index 1e7ea6d892..a9cc1ccb97 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs @@ -46,7 +46,8 @@ protected ElasticBroadcastDriver(IElasticContext context) : base(context) stage = stage.Build(); // Create the task manager - TaskSetManager = Context.CreateNewTaskSetManager(MasterTaskConfiguration, SlaveTaskConfiguration); + TaskSetManager = Context.CreateNewTaskSetManager( + MasterTaskConfiguration, 
SlaveTaskConfiguration); // Register the stage to the task manager TaskSetManager.AddStage(stage); diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs index 3e85ae9fb8..bfa0c612f4 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs @@ -44,10 +44,18 @@ protected ElasticBroadcastDriverWithFailures( failureMachine.SetThreasholds(new Tuple[] { - Tuple.Create(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReconfigure) as IFailureState, 0.01F), - Tuple.Create(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReschedule) as IFailureState, 0.40F), - Tuple.Create(new DefaultFailureState((int)DefaultFailureStates.StopAndReschedule) as IFailureState, 0.60F), - Tuple.Create(new DefaultFailureState((int)DefaultFailureStates.Fail) as IFailureState, 0.601F) + Tuple.Create( + new DefaultFailureState( + (int)DefaultFailureStates.ContinueAndReconfigure) as IFailureState, 0.01F), + Tuple.Create( + new DefaultFailureState( + (int)DefaultFailureStates.ContinueAndReschedule) as IFailureState, 0.40F), + Tuple.Create( + new DefaultFailureState( + (int)DefaultFailureStates.StopAndReschedule) as IFailureState, 0.60F), + Tuple.Create( + new DefaultFailureState( + (int)DefaultFailureStates.Fail) as IFailureState, 0.80F) }); IElasticStage stage = Context.CreateNewStage(stageName, numEvaluators, failureMachine); @@ -61,7 +69,8 @@ protected ElasticBroadcastDriverWithFailures( stage = stage.Build(); // Create the task manager - TaskSetManager = Context.CreateNewTaskSetManager(MasterTaskConfiguration, SlaveTaskConfiguration); + TaskSetManager = Context.CreateNewTaskSetManager( + MasterTaskConfiguration, SlaveTaskConfiguration); // Register the stage to the task manager TaskSetManager.AddStage(stage); 
@@ -94,4 +103,4 @@ protected virtual Func SlaveTaskConfiguration } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs index e9eac4ba69..d14e8b87db 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs @@ -15,19 +15,23 @@ // specific language governing permissions and limitations // under the License. -using System; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Network.Elastic.Task; -using Org.Apache.REEF.Network.Elastic.Operators.Physical; -using Org.Apache.REEF.Network.Elastic.Operators; -using Org.Apache.REEF.Network.Elastic.Task.Default; using Org.Apache.REEF.Common.Tasks; using Org.Apache.REEF.Network.Elastic; +using Org.Apache.REEF.Network.Elastic.Operators; +using Org.Apache.REEF.Network.Elastic.Operators.Physical; +using Org.Apache.REEF.Network.Elastic.Task; +using Org.Apache.REEF.Network.Elastic.Task.Default; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Utilities.Logging; +using System; namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class BroadcastSlaveTaskDieAfterBroadcast : DefaultElasticTask { + private static readonly Logger LOGGER = Logger.GetLogger( + typeof(BroadcastSlaveTaskDieAfterBroadcast)); + private readonly string _taskId; [Inject] @@ -41,17 +45,17 @@ public BroadcastSlaveTaskDieAfterBroadcast( protected override void Execute(byte[] memento, Workflow workflow) { - while (workflow.MoveNext()) + foreach (var op in workflow) { - switch (workflow.Current.OperatorName) + switch (op.OperatorType) { - case Constants.Broadcast: + case OperatorType.Broadcast: var receiver = workflow.Current as IElasticBroadcast; var rec = 
receiver.Receive(); - Console.WriteLine($"Slave has received {rec}"); + LOGGER.Log(Level.Info, $"Slave has received {rec}"); if (Utils.GetTaskNum(_taskId) == 2) { @@ -59,10 +63,11 @@ protected override void Execute(byte[] memento, Workflow workflow) } break; + default: break; } } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs index e16a9b6b9b..a474ca9072 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs @@ -15,19 +15,23 @@ // specific language governing permissions and limitations // under the License. -using System; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Network.Elastic.Task; -using Org.Apache.REEF.Network.Elastic.Operators.Physical; -using Org.Apache.REEF.Network.Elastic.Operators; -using Org.Apache.REEF.Network.Elastic.Task.Default; using Org.Apache.REEF.Common.Tasks; using Org.Apache.REEF.Network.Elastic; +using Org.Apache.REEF.Network.Elastic.Operators; +using Org.Apache.REEF.Network.Elastic.Operators.Physical; +using Org.Apache.REEF.Network.Elastic.Task; +using Org.Apache.REEF.Network.Elastic.Task.Default; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Utilities.Logging; +using System; namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class BroadcastSlaveTaskDieBeforeBroadcast : DefaultElasticTask { + private static readonly Logger LOGGER = Logger.GetLogger( + typeof(BroadcastSlaveTaskDieBeforeBroadcast)); + private readonly string _taskId; [Inject] @@ -41,11 +45,11 @@ public BroadcastSlaveTaskDieBeforeBroadcast( protected override void Execute(byte[] memento, Workflow workflow) { - while (workflow.MoveNext()) + foreach (var 
op in workflow) { - switch (workflow.Current.OperatorName) + switch (op.OperatorType) { - case Constants.Broadcast: + case OperatorType.Broadcast: if (Utils.GetTaskNum(_taskId) == 2) { @@ -56,12 +60,13 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - Console.WriteLine($"Slave has received {rec}"); + LOGGER.Log(Level.Info, $"Slave has received {rec}"); break; + default: break; } } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs index 826c94493c..8570a85838 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs @@ -23,11 +23,15 @@ using Org.Apache.REEF.Network.Elastic.Task.Default; using Org.Apache.REEF.Common.Tasks; using Org.Apache.REEF.Network.Elastic; +using Org.Apache.REEF.Utilities.Logging; namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class BroadcastSlaveTaskDieBeforeWorkflow : DefaultElasticTask { + private static readonly Logger LOGGER = Logger.GetLogger( + typeof(BroadcastSlaveTaskDieBeforeWorkflow)); + private readonly string _taskId; [Inject] @@ -46,21 +50,22 @@ protected override void Execute(byte[] memento, Workflow workflow) throw new Exception("Die before workflow."); } - while (workflow.MoveNext()) + foreach (var op in workflow) { - switch (workflow.Current.OperatorName) + switch (op.OperatorType) { - case Constants.Broadcast: + case OperatorType.Broadcast: var receiver = workflow.Current as IElasticBroadcast; var rec = receiver.Receive(); - Console.WriteLine($"Slave has received {rec}"); + LOGGER.Log(Level.Info, $"Slave has received {rec}"); break; + default: break; } } } } -} +} \ No newline at end of file 
diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs index a7a5d3353d..0096142d2d 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs @@ -23,11 +23,15 @@ using Org.Apache.REEF.Network.Elastic.Task.Default; using Org.Apache.REEF.Common.Tasks; using Org.Apache.REEF.Network.Elastic; +using Org.Apache.REEF.Utilities.Logging; namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class BroadcastSlaveTaskDieEvaluatorBeforeWorkflow : DefaultElasticTask { + private static readonly Logger LOGGER = Logger.GetLogger( + typeof(BroadcastSlaveTaskDieEvaluatorBeforeWorkflow)); + private readonly string _taskId; [Inject] @@ -47,21 +51,22 @@ protected override void Execute(byte[] memento, Workflow workflow) Environment.Exit(0); } - while (workflow.MoveNext()) + foreach (var op in workflow) { - switch (workflow.Current.OperatorName) + switch (op.OperatorType) { - case Constants.Broadcast: + case OperatorType.Broadcast: var receiver = workflow.Current as IElasticBroadcast; var rec = receiver.Receive(); - Console.WriteLine($"Slave has received {rec}"); + LOGGER.Log(Level.Info, $"Slave has received {rec}"); break; + default: break; } } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs index 07dad15f4b..562ac92b94 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs +++ 
b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs @@ -23,11 +23,15 @@ using Org.Apache.REEF.Network.Elastic.Task.Default; using Org.Apache.REEF.Common.Tasks; using Org.Apache.REEF.Network.Elastic; +using Org.Apache.REEF.Utilities.Logging; namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class BroadcastSlaveTaskDieInConstructor : DefaultElasticTask { + private static readonly Logger LOGGER = Logger.GetLogger( + typeof(BroadcastSlaveTaskDieInConstructor)); + [Inject] public BroadcastSlaveTaskDieInConstructor( [Parameter(typeof(TaskConfigurationOptions.Identifier))] string taskId, @@ -42,21 +46,22 @@ public BroadcastSlaveTaskDieInConstructor( protected override void Execute(byte[] memento, Workflow workflow) { - while (workflow.MoveNext()) + foreach (var op in workflow) { - switch (workflow.Current.OperatorName) + switch (op.OperatorType) { - case Constants.Broadcast: + case OperatorType.Broadcast: var receiver = workflow.Current as IElasticBroadcast; var rec = receiver.Receive(); - Console.WriteLine($"Slave has received {rec}"); + LOGGER.Log(Level.Info, $"Slave has received {rec}"); break; + default: break; } } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs index 4843d7bb22..188d3f8f63 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs @@ -21,11 +21,15 @@ using Org.Apache.REEF.Network.Elastic.Operators.Physical; using Org.Apache.REEF.Network.Elastic.Operators; using Org.Apache.REEF.Network.Elastic.Task.Default; +using Org.Apache.REEF.Utilities.Logging; namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class 
BroadcastSlaveTaskDieMultiple : DefaultElasticTask { + private static readonly Logger LOGGER = Logger.GetLogger( + typeof(BroadcastSlaveTaskDieMultiple)); + private const int _failProb = 70; private readonly Random _rand = new Random(); @@ -43,11 +47,11 @@ protected override void Execute(byte[] memento, Workflow workflow) throw new Exception("Die."); } - while (workflow.MoveNext()) + foreach (var op in workflow) { - switch (workflow.Current.OperatorName) + switch (op.OperatorType) { - case Constants.Broadcast: + case OperatorType.Broadcast: if (_rand.Next(100) < _failProb) { @@ -58,13 +62,14 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - Console.WriteLine($"Slave has received {rec}"); + LOGGER.Log(Level.Info, $"Slave has received {rec}"); break; + default: break; } } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultipleEvaluators.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultipleEvaluators.cs new file mode 100644 index 0000000000..702b86e35b --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultipleEvaluators.cs @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using Org.Apache.REEF.Tang.Annotations; +using Org.Apache.REEF.Network.Elastic.Task; +using Org.Apache.REEF.Network.Elastic.Operators.Physical; +using Org.Apache.REEF.Network.Elastic.Operators; +using Org.Apache.REEF.Network.Elastic.Task.Default; +using Org.Apache.REEF.Utilities.Logging; + +namespace Org.Apache.REEF.Network.Examples.Elastic +{ + public sealed class BroadcastSlaveTaskDieMultipleEvaluators : DefaultElasticTask + { + private static readonly Logger LOGGER = Logger.GetLogger( + typeof(BroadcastSlaveTaskDieMultipleEvaluators)); + + private const int _failProb = 50; + private readonly Random _rand = new Random(); + + [Inject] + public BroadcastSlaveTaskDieMultipleEvaluators( + CancellationSource source, IElasticContext context) + : base(source, context, "Broadcast") + { + } + + protected override void Execute(byte[] memento, Workflow workflow) + { + if (_rand.Next(100) < _failProb) + { + Environment.Exit(0); + } + + foreach (var op in workflow) + { + switch (op.OperatorType) + { + case OperatorType.Broadcast: + + if (_rand.Next(100) < _failProb) + { + Environment.Exit(0); + } + + var receiver = workflow.Current as IElasticBroadcast; + + var rec = receiver.Receive(); + + LOGGER.Log(Level.Info, $"Slave has received {rec}"); + + if (_rand.Next(100) < _failProb) + { + Environment.Exit(0); + } + break; + + default: + break; + } + } + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow.cs index 91ae6a5b2c..56fac012d6 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow.cs +++ 
b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow.cs @@ -15,26 +15,28 @@ // specific language governing permissions and limitations // under the License. -using System; +using Org.Apache.REEF.Common.Tasks; +using Org.Apache.REEF.Network.Elastic.Driver; using Org.Apache.REEF.Tang.Annotations; using Org.Apache.REEF.Tang.Implementations.Tang; using Org.Apache.REEF.Tang.Interface; using Org.Apache.REEF.Tang.Util; -using Org.Apache.REEF.Network.Elastic.Driver; -using Org.Apache.REEF.Common.Tasks; -using Org.Apache.REEF.Network.Elastic.Config; +using Org.Apache.REEF.Utilities.Logging; +using System; +using static Org.Apache.REEF.Network.Elastic.Config.ElasticServiceConfigurationOptions; namespace Org.Apache.REEF.Network.Examples.Elastic { /// /// Example implementation of broadcasting using the elastic group communication service. /// - public sealed class ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow : ElasticBroadcastDriverWithFailures + public sealed class ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow : + ElasticBroadcastDriverWithFailures { [Inject] private ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow( - [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultStageName))] string stageName, - [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int numEvaluators, + [Parameter(typeof(DefaultStageName))] string stageName, + [Parameter(typeof(NumEvaluators))] int numEvaluators, IElasticContext context) : base(stageName, numEvaluators, context) { } @@ -45,10 +47,12 @@ protected override Func SlaveTaskConfiguration { return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) + .Set( + TaskConfiguration.Task, + GenericType.Class) .Build()) .Build(); } } } -} +} \ No newline at end of file diff --git 
a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureAfterBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureAfterBroadcast.cs index 950a4e86c4..4099072f5d 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureAfterBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureAfterBroadcast.cs @@ -15,26 +15,27 @@ // specific language governing permissions and limitations // under the License. -using System; +using Org.Apache.REEF.Common.Tasks; +using Org.Apache.REEF.Network.Elastic.Driver; using Org.Apache.REEF.Tang.Annotations; using Org.Apache.REEF.Tang.Implementations.Tang; using Org.Apache.REEF.Tang.Interface; using Org.Apache.REEF.Tang.Util; -using Org.Apache.REEF.Network.Elastic.Driver; -using Org.Apache.REEF.Common.Tasks; -using Org.Apache.REEF.Network.Elastic.Config; +using System; +using static Org.Apache.REEF.Network.Elastic.Config.ElasticServiceConfigurationOptions; namespace Org.Apache.REEF.Network.Examples.Elastic { /// /// Example implementation of broadcasting using the elastic group communication service. 
/// - public sealed class ElasticBroadcastDriverWithFailureAfterBroadcast : ElasticBroadcastDriverWithFailures + public sealed class ElasticBroadcastDriverWithFailureAfterBroadcast : + ElasticBroadcastDriverWithFailures { [Inject] private ElasticBroadcastDriverWithFailureAfterBroadcast( - [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultStageName))] string stageName, - [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int numEvaluators, + [Parameter(typeof(DefaultStageName))] string stageName, + [Parameter(typeof(NumEvaluators))] int numEvaluators, IElasticContext context) : base(stageName, numEvaluators, context) { } @@ -45,10 +46,12 @@ protected override Func SlaveTaskConfiguration { return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) + .Set( + TaskConfiguration.Task, + GenericType.Class) .Build()) .Build(); } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeBroadcast.cs index 5c2e0fc621..abce50edc9 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeBroadcast.cs @@ -22,19 +22,20 @@ using Org.Apache.REEF.Tang.Util; using Org.Apache.REEF.Network.Elastic.Driver; using Org.Apache.REEF.Common.Tasks; -using Org.Apache.REEF.Network.Elastic.Config; +using static Org.Apache.REEF.Network.Elastic.Config.ElasticServiceConfigurationOptions; namespace Org.Apache.REEF.Network.Examples.Elastic { /// /// Example implementation of broadcasting using the elastic group communication service. 
/// - public sealed class ElasticBroadcastDriverWithFailureBeforeBroadcast : ElasticBroadcastDriverWithFailures + public sealed class ElasticBroadcastDriverWithFailureBeforeBroadcast : + ElasticBroadcastDriverWithFailures { [Inject] private ElasticBroadcastDriverWithFailureBeforeBroadcast( - [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultStageName))] string stageName, - [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int numEvaluators, + [Parameter(typeof(DefaultStageName))] string stageName, + [Parameter(typeof(NumEvaluators))] int numEvaluators, IElasticContext context) : base(stageName, numEvaluators, context) { } @@ -45,10 +46,12 @@ protected override Func SlaveTaskConfiguration { return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) + .Set( + TaskConfiguration.Task, + GenericType.Class) .Build()) .Build(); } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeWorkflow.cs index c20e9b5f03..3a54057013 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeWorkflow.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeWorkflow.cs @@ -22,19 +22,20 @@ using Org.Apache.REEF.Tang.Util; using Org.Apache.REEF.Network.Elastic.Driver; using Org.Apache.REEF.Common.Tasks; -using Org.Apache.REEF.Network.Elastic.Config; +using static Org.Apache.REEF.Network.Elastic.Config.ElasticServiceConfigurationOptions; namespace Org.Apache.REEF.Network.Examples.Elastic { /// /// Example implementation of broadcasting using the elastic group communication service. 
/// - public sealed class ElasticBroadcastDriverWithFailureBeforeWorkflow : ElasticBroadcastDriverWithFailures + public sealed class ElasticBroadcastDriverWithFailureBeforeWorkflow : + ElasticBroadcastDriverWithFailures { [Inject] private ElasticBroadcastDriverWithFailureBeforeWorkflow( - [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultStageName))] string stageName, - [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int numEvaluators, + [Parameter(typeof(DefaultStageName))] string stageName, + [Parameter(typeof(NumEvaluators))] int numEvaluators, IElasticContext context) : base(stageName, numEvaluators, context) { } @@ -45,10 +46,12 @@ protected override Func SlaveTaskConfiguration { return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) + .Set( + TaskConfiguration.Task, + GenericType.Class) .Build()) .Build(); } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureInConstructor.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureInConstructor.cs index 98a596fbda..9f4cadf585 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureInConstructor.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureInConstructor.cs @@ -22,19 +22,20 @@ using Org.Apache.REEF.Tang.Util; using Org.Apache.REEF.Network.Elastic.Driver; using Org.Apache.REEF.Common.Tasks; -using Org.Apache.REEF.Network.Elastic.Config; +using static Org.Apache.REEF.Network.Elastic.Config.ElasticServiceConfigurationOptions; namespace Org.Apache.REEF.Network.Examples.Elastic { /// /// Example implementation of broadcasting using the elastic group communication service. 
/// - public sealed class ElasticBroadcastDriverWithFailureInConstructor : ElasticBroadcastDriverWithFailures + public sealed class ElasticBroadcastDriverWithFailureInConstructor : + ElasticBroadcastDriverWithFailures { [Inject] private ElasticBroadcastDriverWithFailureInConstructor( - [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultStageName))] string stageName, - [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int numEvaluators, + [Parameter(typeof(DefaultStageName))] string stageName, + [Parameter(typeof(NumEvaluators))] int numEvaluators, IElasticContext context) : base(stageName, numEvaluators, context) { } @@ -45,10 +46,12 @@ protected override Func SlaveTaskConfiguration { return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) + .Set( + TaskConfiguration.Task, + GenericType.Class) .Build()) .Build(); } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithMultipleFailures.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithMultipleFailures.cs index 9df2f290f6..887fa913cf 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithMultipleFailures.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithMultipleFailures.cs @@ -22,19 +22,20 @@ using Org.Apache.REEF.Tang.Util; using Org.Apache.REEF.Network.Elastic.Driver; using Org.Apache.REEF.Common.Tasks; -using Org.Apache.REEF.Network.Elastic.Config; +using static Org.Apache.REEF.Network.Elastic.Config.ElasticServiceConfigurationOptions; namespace Org.Apache.REEF.Network.Examples.Elastic { /// /// Example implementation of broadcasting using the elastic group communication service. 
/// - public sealed class ElasticBroadcastDriverWithMultipleFailures : ElasticBroadcastDriverWithFailures + public sealed class ElasticBroadcastDriverWithMultipleFailures : + ElasticBroadcastDriverWithFailures { [Inject] private ElasticBroadcastDriverWithMultipleFailures( - [Parameter(typeof(ElasticServiceConfigurationOptions.DefaultStageName))] string stageName, - [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int numEvaluators, + [Parameter(typeof(DefaultStageName))] string stageName, + [Parameter(typeof(NumEvaluators))] int numEvaluators, IElasticContext context) : base(stageName, numEvaluators, context) { } @@ -51,4 +52,4 @@ protected override Func SlaveTaskConfiguration } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticDriver.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticDriver.cs index 4da4bfe303..d7bc0e48b3 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticDriver.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticDriver.cs @@ -25,7 +25,6 @@ using Org.Apache.REEF.Driver.Task; using Org.Apache.REEF.Common.Context; using Org.Apache.REEF.Utilities.Attributes; -using System.Threading; namespace Org.Apache.REEF.Network.Elastic.Driver.Default { @@ -132,4 +131,4 @@ public void OnError(Exception error) TaskSetManager.Dispose(); } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultBroadcast.cs index 5f5e5f7ccf..272db5cc9c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultBroadcast.cs @@ -56,7 +56,7 @@ public DefaultBroadcast( checkpointLevel, configurations) { - OperatorName = Constants.Broadcast; + 
OperatorType = OperatorType.Broadcast; } /// @@ -77,7 +77,7 @@ internal override void GetCodecConfiguration(ref IConfiguration conf) } /// - /// Binding from logical to physical operator. + /// Binding from logical to physical operator. /// /// The configuration builder the binding will be added to protected override void PhysicalOperatorConfiguration(ref ICsConfigurationBuilder confBuilder) @@ -87,4 +87,4 @@ protected override void PhysicalOperatorConfiguration(ref ICsConfigurationBuilde SetMessageType(typeof(Physical.Default.DefaultBroadcast), ref confBuilder); } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultEmpty.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultEmpty.cs index 2e55c2bd84..585bc37f4a 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultEmpty.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultEmpty.cs @@ -30,7 +30,7 @@ namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Default /// Empty operator implementing the default failure logic. To use only as root of pipelines. /// [Unstable("0.16", "API may change")] - class DefaultEmpty : ElasticOperatorWithDefaultDispatcher + internal class DefaultEmpty : ElasticOperatorWithDefaultDispatcher { /// /// Basic constructor for the empty operator. @@ -40,7 +40,7 @@ class DefaultEmpty : ElasticOperatorWithDefaultDispatcher public DefaultEmpty(IElasticStage stage, IFailureStateMachine failureMachine) : base(stage, null, new EmptyTopology(), failureMachine) { - OperatorName = Constants.Empty; + OperatorType = OperatorType.Empty; MasterId = 1; WithinIteration = false; } @@ -61,7 +61,7 @@ public override void OnTaskFailure(IFailedTask task, ref List fai } /// - /// Logs the current operator state. + /// Logs the current operator state. 
/// protected override void LogOperatorState() { @@ -77,7 +77,7 @@ protected override void GetOperatorConfiguration(ref IList serializedOpe } /// - /// Binding from logical to physical operator. + /// Binding from logical to physical operator. /// /// The configuration builder the binding will be added to protected override void PhysicalOperatorConfiguration(ref ICsConfigurationBuilder confBuilder) @@ -101,4 +101,4 @@ internal override void GatherMasterIds(ref HashSet masterTasks) } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs index 63bd4d30cf..4f6c2926b7 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs @@ -121,7 +121,7 @@ protected override bool ReactOnTaskMessage(ITaskMessage message, ref List.Of(_failureMachine)); @@ -134,7 +134,7 @@ protected override bool ReactOnTaskMessage(ITaskMessage message, ref List public override void OnReconfigure(ref ReconfigureEvent reconfigureEvent) { - LOGGER.Log(Level.Info, $"Going to reconfigure the {OperatorName} operator"); + LOGGER.Log(Level.Info, $"Going to reconfigure the {OperatorType.ToString()} operator"); if (reconfigureEvent.FailedTask.IsPresent()) { @@ -223,4 +223,4 @@ public override void OnStop(ref StopEvent stopEvent) OnReschedule(ref rescheduleEvent); } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs index 30e7a7bef5..4d3df764bb 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs +++ 
b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs @@ -107,6 +107,7 @@ public override void OnTaskFailure(IFailedTask task, ref List fai case DefaultFailureStates.ContinueAndReconfigure: failureEvents.Add(new ReconfigureEvent(task, _id)); break; + case DefaultFailureStates.ContinueAndReschedule: if (failedOperatorId == _id) { @@ -117,6 +118,7 @@ public override void OnTaskFailure(IFailedTask task, ref List fai failureEvents.Add(@event); } break; + case DefaultFailureStates.StopAndReschedule: { var @event = new StopEvent(task.Id); @@ -127,9 +129,11 @@ public override void OnTaskFailure(IFailedTask task, ref List fai failureEvents.Add(@event); } break; + case DefaultFailureStates.Fail: failureEvents.Add(new FailEvent(task.Id)); break; + default: LOGGER.Log(Level.Info, $"Failure from {task.Id} requires no action"); break; @@ -175,14 +179,17 @@ public override void EventDispatcher(ref IFailureEvent @event) var rec = @event as ReconfigureEvent; OnReconfigure(ref rec); break; + case DefaultFailureStateEvents.Reschedule: var res = @event as RescheduleEvent; OnReschedule(ref res); break; + case DefaultFailureStateEvents.Stop: var stp = @event as StopEvent; OnStop(ref stp); break; + default: OnFail(); break; @@ -238,17 +245,18 @@ protected override bool PropagateFailureDownstream() case (int)DefaultFailureStates.ContinueAndReconfigure: case (int)DefaultFailureStates.ContinueAndReschedule: return true; + default: return false; } } /// - /// Logs the current operator state. + /// Logs the current operator state. 
/// protected override void LogOperatorState() { - string intro = $"State for Operator {OperatorName} in Stage {Stage.StageName}:\n"; + string intro = $"State for Operator {OperatorType.ToString()} in Stage {Stage.StageName}:\n"; string topologyState = $"Topology:\n{_topology.LogTopologyState()}\n"; string failureMachineState = $"Failure State: {(DefaultFailureStates)_failureMachine.State.FailureState}" + $"\nFailure(s) Reported: {_failureMachine.NumOfFailedDataPoints}/{_failureMachine.NumOfDataPoints}"; @@ -256,4 +264,4 @@ protected override void LogOperatorState() LOGGER.Log(Level.Info, intro + topologyState + failureMachineState); } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs index 6e92de0591..b6617ea1c4 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs @@ -76,11 +76,11 @@ public abstract class ElasticOperator : IFailureResponse, ITaskMessageResponse .Set(StreamingCodecConfiguration.Codec, GenericType.Class) .Build() } - }; // For the moment we consider only linear sequences (pipelines) of operators (no branching for e.g., joins) protected ElasticOperator _next = null; + protected readonly ElasticOperator _prev; protected readonly IFailureStateMachine _failureMachine; @@ -130,9 +130,9 @@ public ElasticOperator( public int MasterId { get; protected set; } /// - /// An operator type specific name. + /// The operator type. /// - public string OperatorName { get; protected set; } + public OperatorType OperatorType { get; protected set; } /// /// Whether the current operator is or is preeceded by an iterator operator. @@ -190,7 +190,7 @@ public ElasticOperator Broadcast(TopologyType topology, CheckpointLevel check } /// - /// Method triggered when a task to driver message is received. 
+ /// Method triggered when a task to driver message is received. /// This method eventually propagate tasks message through the pipeline. /// /// The task message for the operator @@ -431,7 +431,7 @@ internal virtual string LogFinalStatistics() } /// - /// Appends the message type to the configuration. + /// Appends the message type to the configuration. /// /// The type of the messages the operator is configured to accept /// The configuration builder the message type will be added to @@ -522,11 +522,11 @@ protected virtual bool ReactOnTaskMessage(ITaskMessage message, ref List - /// Logs the current operator state. + /// Logs the current operator state. /// protected virtual void LogOperatorState() { - string intro = $"State for Operator {OperatorName} in Stage {Stage.StageName}:\n"; + string intro = $"State for Operator {OperatorType.ToString()} in Stage {Stage.StageName}:\n"; string topologyState = $"Topology:\n{_topology.LogTopologyState()}"; string failureMachineState = "Failure State: " + _failureMachine.State.FailureState + "\nFailure(s) Reported: " + _failureMachine.NumOfFailedDataPoints; @@ -543,7 +543,7 @@ protected virtual string LogInternalStatistics() } /// - /// Binding from logical to physical operator. + /// Binding from logical to physical operator. 
/// /// The configuration builder the binding will be added to protected abstract void PhysicalOperatorConfiguration(ref ICsConfigurationBuilder builder); @@ -557,10 +557,11 @@ private ITopology GetTopology(TopologyType topologyType) case TopologyType.Flat: topology = new FlatTopology(MasterId); break; - default: throw new ArgumentException(nameof(topologyType), $"Topology type {topologyType} not supported by {OperatorName}."); + + default: throw new ArgumentException(nameof(topologyType), $"Topology type {topologyType} not supported by {OperatorType.ToString()}."); } return topology; } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Constants.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/OperatorType.cs similarity index 53% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Constants.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Operators/OperatorType.cs index 4d833527f4..4d039137f7 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Constants.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/OperatorType.cs @@ -16,6 +16,7 @@ // under the License. using Org.Apache.REEF.Utilities.Attributes; +using System; namespace Org.Apache.REEF.Network.Elastic.Operators { @@ -23,14 +24,32 @@ namespace Org.Apache.REEF.Network.Elastic.Operators /// Constants labeling the set of available operators. 
/// [Unstable("0.16", "Constants may change")] - public static class Constants + public enum OperatorType : int { - public const string Broadcast = "broadcast"; - public const string Reduce = "reduce"; - public const string AggregationRing = "aggregation ring"; - public const string Iterate = "iterate"; - public const string Scatter = "scatter"; - public const string Gather = "gather"; - public const string Empty = "empty"; + Empty = 0, + Broadcast = 1, + Reduce = 2, + AggregationRing = 3, + Iterate = 4, + Scatter = 5, + Gather = 6 } -} + + public static class OperatorTypeToString + { + public static string ToString(this OperatorType type) + { + switch (type) + { + case OperatorType.Empty: return "Empty"; + case OperatorType.Broadcast: return "Broadcast"; + case OperatorType.Reduce: return "Reduce"; + case OperatorType.AggregationRing: return "AggregationRing"; + case OperatorType.Iterate: return "Iterate"; + case OperatorType.Scatter: return "Scatter"; + case OperatorType.Gather: return "Gather"; + default: throw new ArgumentException($"Operator type {type} not found"); + } + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultBroadcast.cs index 48e682e2c0..f5facde679 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultBroadcast.cs @@ -41,14 +41,14 @@ private DefaultBroadcast( [Parameter(typeof(OperatorParameters.IsLast))] bool isLast, DefaultBroadcastTopology topology) : base(id, isLast, topology) { - OperatorName = Constants.Broadcast; + OperatorType = OperatorType.Broadcast; } /// /// Send the data to all child receivers. 
/// Send is asynchronous but works in 3 phases: /// 1-The task asks the driver for updates to the topology - /// 2-Updates are received and added to the local topology + /// 2-Updates are received and added to the local topology /// --(Note that altough the method is non-blocking, no message will be sent until /// updates are not received) /// 3-Send the message. @@ -68,4 +68,4 @@ public void Send(T data) _position = PositionTracker.AfterSend; } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs index bda3429de3..1443a026c4 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs @@ -69,9 +69,9 @@ internal DefaultOneToN(int id, bool isLast, OneToNTopology topology) public int OperatorId { get; private set; } /// - /// The operator name. + /// The operator type. /// - public string OperatorName { get; protected set; } + public OperatorType OperatorType { get; protected set; } /// /// Operator-specific information that is sent to the driver in case of failure. @@ -166,12 +166,12 @@ public void ResetPosition() /// public void WaitForTaskRegistration(CancellationTokenSource cancellationSource) { - LOGGER.Log(Level.Info, $"Waiting for task registration for {OperatorName} operator."); + LOGGER.Log(Level.Info, $"Waiting for task registration for {OperatorType.ToString()} operator."); _topology.WaitForTaskRegistration(cancellationSource); } /// - /// Wait until computation is globally completed for this operator + /// Wait until computation is globally completed for this operator /// before disposing the object. 
/// public void WaitCompletionBeforeDisposing() @@ -192,4 +192,4 @@ public void Dispose() _topology.Dispose(); } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticOperator.cs index 74759f95f9..58d6cfc97a 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticOperator.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticOperator.cs @@ -29,9 +29,9 @@ namespace Org.Apache.REEF.Network.Elastic.Operators.Physical public interface IElasticOperator : IWaitForTaskRegistration, IDisposable { /// - /// The operator name. + /// The operator type. /// - string OperatorName { get; } + OperatorType OperatorType { get; } /// /// The operator identifier. @@ -54,7 +54,7 @@ public interface IElasticOperator : IWaitForTaskRegistration, IDisposable CancellationTokenSource CancellationSource { get; set; } /// - /// Wait until computation is globally completed for this operator + /// Wait until computation is globally completed for this operator /// before disposing the object. /// void WaitCompletionBeforeDisposing(); @@ -70,4 +70,4 @@ public interface IElasticOperator : IWaitForTaskRegistration, IDisposable /// Action OnTaskRescheduled { get; } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs index 5ad43a9edd..87f56d3dc1 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs @@ -15,12 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-using Org.Apache.REEF.Network.Elastic.Config; using Org.Apache.REEF.Network.Elastic.Failures; using Org.Apache.REEF.Network.Elastic.Operators; using Org.Apache.REEF.Network.Elastic.Operators.Physical; using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Tang.Exceptions; using Org.Apache.REEF.Utilities.Attributes; using Org.Apache.REEF.Utilities.Logging; using System; @@ -28,26 +26,27 @@ using System.Collections.Generic; using System.Linq; using System.Threading; +using static Org.Apache.REEF.Network.Elastic.Config.GroupCommunicationConfigurationOptions; namespace Org.Apache.REEF.Network.Elastic.Task { /// /// Task-side representation of the the sequence of group communication operations to execute. - /// Exception rised during execution are managed by the framework and recovered through the user-defined - /// policies / mechanisms. + /// Exception rised during execution are managed by the framework and recovered through + /// the user-defined policies / mechanisms. /// [Unstable("0.16", "API may change")] - public sealed class Workflow : IEnumerator + public sealed class Workflow : IEnumerator, IEnumerable { private static readonly Logger LOGGER = Logger.GetLogger(typeof(Workflow)); private int _position = -1; - private bool _failed; - private bool _disposed; - private List _iteratorsPosition; + private bool _failed = false; + private bool _disposed = false; + private List _iteratorsPosition = new List(); - private readonly object _lock; - private readonly IList _operators; + private readonly object _lock = new object(); + private readonly IList _operators = new List(); private readonly CancellationSource _cancellationSource; private readonly bool _isRescheduled; @@ -57,38 +56,13 @@ public sealed class Workflow : IEnumerator /// [Inject] private Workflow( - [Parameter(typeof(GroupCommunicationConfigurationOptions.IsRescheduled))] bool isRescheduled, + [Parameter(typeof(IsRescheduled))] bool isRescheduled, CancellationSource cancellationSource) { - 
_operators = new List(); - _failed = false; - _disposed = false; - _lock = new object(); - _iteratorsPosition = new List(); _cancellationSource = cancellationSource; _isRescheduled = isRescheduled; } - /// - /// The current iteration value. - /// - public object Iteration - { - get - { - if (_iteratorsPosition.Count == 0) - { - return 0; - } - else - { - var iterPos = _iteratorsPosition[0]; - var iterator = _operators[iterPos] as IElasticIterator; - return iterator.Current; - } - } - } - /// /// Try to move to the next operation in the workflow. /// @@ -126,8 +100,10 @@ public bool MoveNext() } } - // In case we have one or zero iterators (or we are at the last iterator when multiple iterators exists) - if (_position >= _operators.Count || (_iteratorsPosition.Count > 1 && _position == _iteratorsPosition[1])) + // In case we have one or zero iterators + // (or we are at the last iterator when multiple iterators exists) + if (_position >= _operators.Count || + (_iteratorsPosition.Count > 1 && _position == _iteratorsPosition[1])) { if (_iteratorsPosition.Count == 0) { @@ -150,14 +126,16 @@ public bool MoveNext() } /// - /// Method used to make the framework aware that an exception as been thrown during execution. + /// Method used to make the framework aware that an exception as been thrown + /// during execution. 
/// /// The rised exception public void Throw(Exception e) { if (_cancellationSource.IsCancelled) { - LOGGER.Log(Level.Warning, "Workflow captured an exception while cancellation source was true.", e); + LOGGER.Log(Level.Warning, + "Workflow captured an exception while cancellation source was true.", e); } else { @@ -216,26 +194,18 @@ public void Dispose() { foreach (var op in _operators) { - if (op != null) - { - op.WaitCompletionBeforeDisposing(); - } - } - } - - foreach (var op in _operators) - { - if (op != null) - { - var disposableOperator = op as IDisposable; - - disposableOperator.Dispose(); + op?.WaitCompletionBeforeDisposing(); } } } - _disposed = true; + foreach (var op in _operators) + { + op?.Dispose(); + } } + + _disposed = true; } } @@ -258,7 +228,7 @@ internal void Add(IElasticOperator op) iterator.RegisterActionOnTaskRescheduled(op.OnTaskRescheduled); } - if (op.OperatorName == Constants.Iterate) + if (op.OperatorType == OperatorType.Iterate) { _iteratorsPosition.Add(_operators.Count - 1); } @@ -271,16 +241,9 @@ internal void Add(IElasticOperator op) /// The signal to cancel the operation internal void WaitForTaskRegistration(CancellationTokenSource cancellationSource = null) { - try - { - foreach (var op in _operators) - { - op.WaitForTaskRegistration(cancellationSource); - } - } - catch (OperationCanceledException e) + foreach (var op in _operators) { - throw e; + op.WaitForTaskRegistration(cancellationSource); } } @@ -294,5 +257,15 @@ private void ResetOperatorPositions() _operators[pos].ResetPosition(); } } + + public IEnumerator GetEnumerator() + { + return this; + } + + IEnumerator IEnumerable.GetEnumerator() + { + return this; + } } -} +} \ No newline at end of file From bc544166e54f331eb726613ba51c5f91d5685f97 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Wed, 30 Jan 2019 17:58:30 -0800 Subject: [PATCH 11/29] - Swtiched log entries from $ to -{0}. 
- Fixed some line length< 120 - Fixed some var initilization in constructors --- .../Elastic/BroadcastMasterTask.cs | 2 +- .../ElasticBroadcastDriverWithFailures.cs | 1 - .../BroadcastSlaveTaskDieAfterBroadcast.cs | 2 +- .../BroadcastSlaveTaskDieBeforeBroadcast.cs | 2 +- .../BroadcastSlaveTaskDieBeforeWorkflow.cs | 2 +- ...castSlaveTaskDieEvaluatorBeforeWorkflow.cs | 2 +- .../BroadcastSlaveTaskDieInConstructor.cs | 2 +- .../BroadcastSlaveTaskDieMultiple.cs | 2 +- ...BroadcastSlaveTaskDieMultipleEvaluators.cs | 2 +- .../Driver/Default/DefaultElasticContext.cs | 62 ++++--- .../Driver/Default/DefaultElasticStage.cs | 76 ++++---- .../Default/DefaultElasticTaskSetManager.cs | 175 ++++++++++-------- .../Elastic/Failures/FailuresClock.cs | 9 +- .../Logical/Default/DefaultOneToN.cs | 44 +++-- .../ElastiOperatorWithDefaultDispatcher.cs | 19 +- .../Operators/Logical/ElasticOperator.cs | 48 +++-- .../Physical/Default/DefaultOneToN.cs | 5 +- .../Elastic/Task/CommunicationLayer.cs | 34 ++-- .../Task/Default/DefaultCommunicationLayer.cs | 10 +- .../Task/Default/DefaultElasticStage.cs | 5 +- .../DefaultTaskToDriverMessageDispatcher.cs | 23 ++- .../Elastic/Task/Workflow.cs | 2 +- .../Topology/Logical/Impl/FlatTopology.cs | 70 +++---- .../Physical/Default/OneToNTopology.cs | 30 ++- 24 files changed, 382 insertions(+), 247 deletions(-) diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs index aa65b066d3..6f2aafb530 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs @@ -51,7 +51,7 @@ protected override void Execute(byte[] memento, Workflow workflow) sender.Send(number); - LOGGER.Log(Level.Info, $"Master has sent {number}"); + LOGGER.Log(Level.Info, "Master has sent {0}", number); break; default: diff --git 
a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs index bfa0c612f4..3a65836acb 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs @@ -26,7 +26,6 @@ using Org.Apache.REEF.Network.Elastic.Driver.Default; using Org.Apache.REEF.Network.Elastic.Failures; using Org.Apache.REEF.Network.Elastic.Failures.Default; -using Org.Apache.REEF.Network.Elastic.Config; namespace Org.Apache.REEF.Network.Examples.Elastic { diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs index d14e8b87db..d9bd5de6d1 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs @@ -55,7 +55,7 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - LOGGER.Log(Level.Info, $"Slave has received {rec}"); + LOGGER.Log(Level.Info, "Slave has received {0}", rec); if (Utils.GetTaskNum(_taskId) == 2) { diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs index a474ca9072..8c8c9dd68d 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs @@ -60,7 +60,7 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - 
LOGGER.Log(Level.Info, $"Slave has received {rec}"); + LOGGER.Log(Level.Info, "Slave has received {0}", rec); break; default: diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs index 8570a85838..15cdc82134 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs @@ -59,7 +59,7 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - LOGGER.Log(Level.Info, $"Slave has received {rec}"); + LOGGER.Log(Level.Info, "Slave has received {0}", rec); break; default: diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs index 0096142d2d..224e875133 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs @@ -60,7 +60,7 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - LOGGER.Log(Level.Info, $"Slave has received {rec}"); + LOGGER.Log(Level.Info, "Slave has received {0}", rec); break; default: diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs index 562ac92b94..f26155fa21 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs +++ 
b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs @@ -55,7 +55,7 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - LOGGER.Log(Level.Info, $"Slave has received {rec}"); + LOGGER.Log(Level.Info, "Slave has received {0}", rec); break; default: diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs index 188d3f8f63..4cdcc8d8f9 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs @@ -62,7 +62,7 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - LOGGER.Log(Level.Info, $"Slave has received {rec}"); + LOGGER.Log(Level.Info, "Slave has received {0}", rec); break; diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultipleEvaluators.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultipleEvaluators.cs index 702b86e35b..9dcb065a58 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultipleEvaluators.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultipleEvaluators.cs @@ -62,7 +62,7 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - LOGGER.Log(Level.Info, $"Slave has received {rec}"); + LOGGER.Log(Level.Info, "Slave has received {0}", rec); if (_rand.Next(100) < _failProb) { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs index 4e83ea02ad..390db11eb2 100644 --- 
a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs @@ -52,8 +52,6 @@ namespace Org.Apache.REEF.Network.Elastic.Driver.Default [Unstable("0.16", "API may change")] internal sealed class DefaultElasticContext : IElasticContext, IDefaultFailureEventResponse { - private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultElasticContext)); - private readonly int _startingPort; private readonly int _portRange; private readonly string _driverId; @@ -69,13 +67,13 @@ internal sealed class DefaultElasticContext : IElasticContext, IDefaultFailureEv private readonly string _batchId; private readonly string _rackName; - private readonly Dictionary _stages; + private readonly Dictionary _stages = new Dictionary(); private readonly AvroConfigurationSerializer _configSerializer; private readonly object _subsLock = new object(); private readonly object _statusLock = new object(); - private IFailureState _failureStatus; + private IFailureState _failureStatus = new DefaultFailureState(); [Inject] private DefaultElasticContext( @@ -104,11 +102,7 @@ private DefaultElasticContext( _cores = cores; _batchId = batchId; _rackName = rackName; - - _failureStatus = new DefaultFailureState(); _configSerializer = configSerializer; - _stages = new Dictionary(); - _nameServer = nameServer; IPEndPoint localEndpoint = nameServer.LocalEndpoint; _nameServerAddr = localEndpoint.Address.ToString(); @@ -128,7 +122,10 @@ public IElasticStage DefaultStage() if (defaultStage == null) { - CreateNewStage(_defaultStageName, _numEvaluators, _defaultFailureMachine.Clone(_numEvaluators, (int)DefaultFailureStates.Fail)); + CreateNewStage( + _defaultStageName, + _numEvaluators, + _defaultFailureMachine.Clone(_numEvaluators, (int)DefaultFailureStates.Fail)); } return _stages[_defaultStageName]; @@ -194,7 +191,7 @@ public void RemoveElasticStage(string stageName) } /// - /// Generate the 
base configuration module for tasks. + /// Generate the base configuration module for tasks. /// This method is method can be used to generate configurations for the task set menager. /// /// The id of the task the configuration is generate for @@ -231,7 +228,9 @@ public void Start() /// The configuration for the slave task /// A new task set manager - public IElasticTaskSetManager CreateNewTaskSetManager(Func masterTaskConfiguration, Func slaveTaskConfiguration = null) + public IElasticTaskSetManager CreateNewTaskSetManager( + Func masterTaskConfiguration, + Func slaveTaskConfiguration = null) { return CreateNewTaskSetManager(_numEvaluators, masterTaskConfiguration, slaveTaskConfiguration); } @@ -243,9 +242,17 @@ public IElasticTaskSetManager CreateNewTaskSetManager(FuncThe configuration for the master task /// The configuration for the slave task /// A new task set manager - public IElasticTaskSetManager CreateNewTaskSetManager(int numOfTasks, Func masterTaskConfiguration, Func slaveTaskConfiguration = null) + public IElasticTaskSetManager CreateNewTaskSetManager( + int numOfTasks, + Func masterTaskConfiguration, + Func slaveTaskConfiguration = null) { - return new DefaultElasticTaskSetManager(numOfTasks, _evaluatorRequestor, _driverId, masterTaskConfiguration, slaveTaskConfiguration); + return new DefaultElasticTaskSetManager( + numOfTasks, + _evaluatorRequestor, + _driverId, + masterTaskConfiguration, + slaveTaskConfiguration); } /// @@ -282,7 +289,9 @@ public IConfiguration GetElasticServiceConfiguration() /// The configuration where the stage configuration will be appended to /// The stage configuration at hand /// The configuration containing the serialized stage configuration - public void SerializeStageConfiguration(ref ICsConfigurationBuilder confBuilder, IConfiguration stageConfiguration) + public void SerializeStageConfiguration( + ref ICsConfigurationBuilder confBuilder, + IConfiguration stageConfiguration) { confBuilder.BindSetEntry( 
GenericType.Class, @@ -292,15 +301,19 @@ public void SerializeStageConfiguration(ref ICsConfigurationBuilder confBuilder, /// /// Append an operator configuration to a configuration builder object. /// - /// The list where the operator configuration will be appended to + /// The list where the operator configuration + /// will be appended to /// The operator configuration at hand /// The configuration containing the serialized operator configuration - public void SerializeOperatorConfiguration(ref IList serializedOperatorsConfs, IConfiguration operatorConfiguration) + public void SerializeOperatorConfiguration( + ref IList serializedOperatorsConfs, + IConfiguration operatorConfiguration) { serializedOperatorsConfs.Add(_configSerializer.ToString(operatorConfiguration)); } #region Failure Response + /// /// Used to react on a failure occurred on a task. /// It gets a failed task as input and in response it produces zero or more failure events. @@ -340,23 +353,27 @@ public void EventDispatcher(ref IFailureEvent @event) var rec = @event as ReconfigureEvent; OnReconfigure(ref rec); break; + case DefaultFailureStateEvents.Reschedule: var res = @event as RescheduleEvent; OnReschedule(ref res); break; + case DefaultFailureStateEvents.Stop: var stp = @event as StopEvent; OnStop(ref stp); break; + default: OnFail(); break; } } - #endregion + #endregion Failure Response #region Default Failure event Response + /// /// Mechanism to execute when a reconfigure event is triggered. 
/// @@ -365,7 +382,8 @@ public void OnReconfigure(ref ReconfigureEvent reconfigureEvent) { lock (_statusLock) { - _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReconfigure)); + _failureStatus = _failureStatus.Merge( + new DefaultFailureState((int)DefaultFailureStates.ContinueAndReconfigure)); } } @@ -377,7 +395,8 @@ public void OnReschedule(ref RescheduleEvent rescheduleEvent) { lock (_statusLock) { - _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReschedule)); + _failureStatus = _failureStatus.Merge( + new DefaultFailureState((int)DefaultFailureStates.ContinueAndReschedule)); } } @@ -403,6 +422,7 @@ public void OnFail() _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.Fail)); } } - #endregion + + #endregion Default Failure event Response } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs index 09a6cc89dc..e8cf7c518c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs @@ -39,8 +39,8 @@ namespace Org.Apache.REEF.Network.Elastic.Driver.Default { /// - /// Used to group elastic operators into logical units. - /// All operators in the same stages share similar semantics and behavior + /// Used to group elastic operators into logical units. + /// All operators in the same stages share similar semantics and behavior /// under failures. Stages can only be created by a service. /// This class is used to create stages able to manage default failure events. 
/// @@ -49,13 +49,13 @@ internal sealed class DefaultElasticStage : IElasticStage, IDefaultFailureEventR { private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultElasticStage)); - private bool _finalized; - private volatile bool _scheduled; + private bool _finalized = false; + private volatile bool _scheduled = false; private readonly int _numTasks; - private int _tasksAdded; - private HashSet _missingMasterTasks; - private HashSet _masterTasks; + private int _tasksAdded = 0; + private HashSet _missingMasterTasks = new HashSet(); + private HashSet _masterTasks = new HashSet(); private readonly IFailureStateMachine _failureMachine; private int _numOperators; @@ -79,11 +79,7 @@ internal DefaultElasticStage( IFailureStateMachine failureMachine = null) { StageName = stageName; - _finalized = false; - _scheduled = false; _numTasks = numTasks; - _tasksAdded = 0; - _masterTasks = new HashSet(); _datasetConfiguration = Optional.Empty(); Context = elasticService; _failureMachine = failureMachine ?? new DefaultFailureStateMachine(numTasks, DefaultFailureStates.Fail); @@ -114,7 +110,7 @@ internal DefaultElasticStage( public bool IsIterative { get; set; } /// - /// The failure state of the target stages. + /// The failure state of the target stages. 
/// public IFailureState FailureState { get; private set; } @@ -176,12 +172,13 @@ public IElasticStage Build() if (_datasetConfiguration.Value.Length + adjust < _numTasks) { - throw new IllegalStateException($"Dataset is smaller than the number of tasks: re-submit with {_datasetConfiguration.Value.Length + adjust} tasks"); + throw new IllegalStateException( + $"Dataset is smaller than the number of tasks: " + + "re-submit with {_datasetConfiguration.Value.Length + adjust} tasks"); } } PipelineRoot.GatherMasterIds(ref _masterTasks); - _missingMasterTasks = new HashSet(_masterTasks); _finalized = true; @@ -216,18 +213,21 @@ public bool AddTask(string taskId) { // We don't add a task if eventually we end up by not adding the master task var tooManyTasks = _tasksAdded >= _numTasks; - var notAddingMaster = _tasksAdded + _missingMasterTasks.Count >= _numTasks && !_missingMasterTasks.Contains(taskId); + var notAddingMaster = _tasksAdded + _missingMasterTasks.Count >= _numTasks && + !_missingMasterTasks.Contains(taskId); if (!_scheduled && (tooManyTasks || notAddingMaster)) { if (tooManyTasks) { - LOGGER.Log(Level.Warning, $"Already added {_tasksAdded} tasks when total tasks request is {_numTasks}"); + LOGGER.Log(Level.Warning, + "Already added {0} tasks when total tasks request is {1}", _tasksAdded, _numTasks); } if (notAddingMaster) { - LOGGER.Log(Level.Warning, $"Already added {_tasksAdded} over {_numTasks} but missing master task(s)"); + LOGGER.Log(Level.Warning, + "Already added {0} over {1} but missing master task(s)", _tasksAdded, _numTasks); } return false; @@ -250,15 +250,19 @@ public bool AddTask(string taskId) /// /// Decides if the tasks added to the stages can be scheduled for execution - /// or not. This method is used for implementing different policies for + /// or not. This method is used for implementing different policies for /// triggering the scheduling of tasks. 
/// /// True if the previously added tasks can be scheduled for execution public bool ScheduleStage() { - // Schedule if we reach the number of requested tasks or the stage contains an iterative pipeline that is ready to be scheduled and the - // policy requested by the user allow early start with ramp up. - if (!_scheduled && (_numTasks == _tasksAdded || (IsIterative && _failureMachine.State.FailureState < (int)DefaultFailureStates.StopAndReschedule && PipelineRoot.CanBeScheduled()))) + // Schedule if we reach the number of requested tasks or the stage contains an iterative pipeline + // that is ready to be scheduled and the policy requested by the user allow early start with ramp up. + if (!_scheduled && + (_numTasks == _tasksAdded || + (IsIterative && + _failureMachine.State.FailureState < (int)DefaultFailureStates.StopAndReschedule && + PipelineRoot.CanBeScheduled()))) { _scheduled = true; @@ -313,7 +317,7 @@ public IConfiguration GetTaskConfiguration(int taskId) /// Given a task id, this method returns the configuration of the task's data partition /// (if any). /// - /// The task id of the task we wanto to retrieve the data partition. + /// The task id of the task we wanto to retrieve the data partition. /// The task is required to belong to thq stages /// The configuration of the data partition (if any) of the task public Optional GetPartitionConf(string taskId) @@ -341,12 +345,12 @@ public void Complete() { lock (_statusLock) { - FailureState = FailureState.Merge(_failureMachine.Complete()); + FailureState = FailureState.Merge(_failureMachine.Complete()); } } /// - /// Retrieve the log the final statistics of the computation: this is the sum of all + /// Retrieve the log the final statistics of the computation: this is the sum of all /// the stats of all the Operators compising the stage. This method can be called /// only once the stages is completed. 
/// @@ -359,16 +363,18 @@ public string LogFinalStatistics() } else { - throw new IllegalStateException($"Cannot log statistics before Stage {StageName} is completed or failed."); + throw new IllegalStateException( + $"Cannot log statistics before Stage {StageName} is completed or failed."); } } /// - /// Method triggered when a task to driver message is received. + /// Method triggered when a task to driver message is received. /// /// The task message for the operator /// A list of messages containing the instructions for the task - /// If the message cannot be handled correctly or generate an incorrent state + /// If the message cannot be handled correctly or generate + /// an incorrent state public void OnTaskMessage(ITaskMessage message, ref List returnMessages) { int offset = 0; @@ -425,14 +431,17 @@ public void EventDispatcher(ref IFailureEvent @event) var rec = @event as ReconfigureEvent; OnReconfigure(ref rec); break; + case DefaultFailureStateEvents.Reschedule: var res = @event as RescheduleEvent; OnReschedule(ref res); break; + case DefaultFailureStateEvents.Stop: var stp = @event as StopEvent; OnStop(ref stp); break; + default: OnFail(); break; @@ -441,7 +450,7 @@ public void EventDispatcher(ref IFailureEvent @event) PipelineRoot.EventDispatcher(ref @event); } - #endregion + #endregion Failure Response #region Default Failure Events Response @@ -453,7 +462,8 @@ public void OnReconfigure(ref ReconfigureEvent reconfigureEvent) { lock (_statusLock) { - FailureState = FailureState.Merge(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReconfigure)); + FailureState = FailureState.Merge( + new DefaultFailureState((int)DefaultFailureStates.ContinueAndReconfigure)); } } @@ -465,7 +475,8 @@ public void OnReschedule(ref RescheduleEvent rescheduleEvent) { lock (_statusLock) { - FailureState = FailureState.Merge(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReschedule)); + FailureState = FailureState.Merge( + new 
DefaultFailureState((int)DefaultFailureStates.ContinueAndReschedule)); } } @@ -477,7 +488,8 @@ public void OnStop(ref StopEvent stopEvent) { lock (_statusLock) { - FailureState = FailureState.Merge(new DefaultFailureState((int)DefaultFailureStates.StopAndReschedule)); + FailureState = FailureState.Merge( + new DefaultFailureState((int)DefaultFailureStates.StopAndReschedule)); } } @@ -492,6 +504,6 @@ public void OnFail() } } - #endregion + #endregion Default Failure Events Response } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs index f315beeeec..376f5851d2 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs @@ -89,7 +89,7 @@ private enum TaskState Completed = 7 } - #endregion + #endregion Private structs #region Private classes @@ -98,9 +98,9 @@ private enum TaskState /// private sealed class TaskInfo : IDisposable { - private volatile bool _isTaskDisposed; - private volatile bool _isActiveContextDisposed; - private volatile bool _isDisposed; + private volatile bool _isTaskDisposed = false; + private volatile bool _isActiveContextDisposed = false; + private volatile bool _isDisposed = false; /// /// Constructor. 
@@ -110,11 +110,13 @@ private sealed class TaskInfo : IDisposable /// The evalutor id /// The task status /// The stage the task belongs to - public TaskInfo(IConfiguration config, IActiveContext context, string evaluatorId, TaskState status, IList stages) + public TaskInfo( + IConfiguration config, + IActiveContext context, + string evaluatorId, + TaskState status, + IList stages) { - _isTaskDisposed = false; - _isActiveContextDisposed = false; - _isDisposed = false; TaskConfiguration = config; ActiveContext = context; EvaluatorId = evaluatorId; @@ -256,7 +258,7 @@ public void DisposeActiveContext() } /// - /// Dipose the task info. + /// Dipose the task info. /// public void Dispose() { @@ -276,9 +278,15 @@ public void Dispose() /// private static class TaskStateUtils { - private static List recoverable = new List() { TaskState.Failed, TaskState.Queued }; + private static List recoverable = new List() + { + TaskState.Failed, TaskState.Queued + }; - private static List notRunnable = new List() { TaskState.Failed, TaskState.Completed }; + private static List notRunnable = new List() + { + TaskState.Failed, TaskState.Completed + }; /// /// Whether a task is recoverable or not. 
@@ -347,21 +355,21 @@ public Alarm GetAlarm(long time) } } - #endregion + #endregion Private classes private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultElasticTaskSetManager)); - private bool _finalized; - private volatile bool _disposed; - private volatile bool _scheduled; - private volatile bool _completed; + private bool _finalized = false; + private volatile bool _disposed = false; + private volatile bool _scheduled = false; + private volatile bool _completed = false; private readonly DefaultElasticTaskSetManagerParameters _parameters; - private volatile int _contextsAdded; - private int _tasksAdded; - private int _tasksRunning; - private volatile int _totFailedTasks; - private volatile int _totFailedEvaluators; + private volatile int _contextsAdded = 0; + private int _tasksAdded = 0; + private int _tasksRunning = 0; + private volatile int _totFailedTasks = 0; + private volatile int _totFailedEvaluators = 0; private readonly int _numTasks; private readonly IEvaluatorRequestor _evaluatorRequestor; @@ -371,19 +379,22 @@ public Alarm GetAlarm(long time) // Task info 0-indexed private readonly List _taskInfos; - private readonly Dictionary _stages; - private readonly ConcurrentQueue _queuedTasks; - private readonly ConcurrentQueue _queuedContexts; - // Used both for knowing which evaluator the task set is responsible for and to + private readonly Dictionary _stages = new Dictionary(); + private readonly ConcurrentQueue _queuedTasks = new ConcurrentQueue(); + private readonly ConcurrentQueue _queuedContexts = new ConcurrentQueue(); + + // Used both for knowing which evaluator the task set is responsible for and to // maintain a mapping betwween evaluators and contextes. 
// This latter is necessary because evaluators may fail between context init // and the time when the context is installed on the evaluator - private readonly ConcurrentDictionary _evaluatorToContextIdMapping; - private IFailureState _failureStatus; - private volatile bool _hasProgress; + private readonly ConcurrentDictionary _evaluatorToContextIdMapping = + new ConcurrentDictionary(); + + private IFailureState _failureStatus = new DefaultFailureState(); + private volatile bool _hasProgress = true; - private readonly object _statusLock; + private readonly object _statusLock = new object(); /// /// Constructor for the task set manager. @@ -402,17 +413,6 @@ public DefaultElasticTaskSetManager( Func slaveTaskConfiguration = null, params IConfiguration[] confs) { - _finalized = false; - _scheduled = false; - _disposed = false; - _completed = false; - - _contextsAdded = 0; - _tasksAdded = 0; - _tasksRunning = 0; - _totFailedTasks = 0; - _totFailedEvaluators = 0; - _numTasks = numTasks; _evaluatorRequestor = evaluatorRequestor; _driverId = driverId; @@ -420,14 +420,6 @@ public DefaultElasticTaskSetManager( _slaveTaskConfiguration = slaveTaskConfiguration ?? 
masterTaskConfiguration; _taskInfos = new List(numTasks); - _stages = new Dictionary(); - _queuedTasks = new ConcurrentQueue(); - _queuedContexts = new ConcurrentQueue(); - _evaluatorToContextIdMapping = new ConcurrentDictionary(); - _failureStatus = new DefaultFailureState(); - _hasProgress = true; - - _statusLock = new object(); for (int i = 0; i < numTasks; i++) { @@ -525,7 +517,9 @@ public bool TryGetNextTaskContextId(IAllocatedEvaluator evaluator, out string id cinfo = new ContextInfo(id); _evaluatorToContextIdMapping.TryAdd(evaluator.Id, cinfo); - LOGGER.Log(Level.Info, $"Evaluator {evaluator.Id} is scheduled on node {evaluator.GetEvaluatorDescriptor().NodeDescriptor.HostName}"); + LOGGER.Log(Level.Info, "Evaluator {0} is scheduled on node {1}", + evaluator.Id, + evaluator.GetEvaluatorDescriptor().NodeDescriptor.HostName); return true; } @@ -590,10 +584,11 @@ public void OnNewActiveContext(IActiveContext activeContext) var id = Utils.GetContextNum(activeContext) - 1; var taskId = Utils.BuildTaskId(StagesId, id + 1); - // We reschedule the task only if the context was active (_taskInfos[id] != null) and the task was actually scheduled at least once (_taskInfos[id].TaskStatus > TaskStatus.Init) + // We reschedule the task only if the context was active (_taskInfos[id] != null) and the task was + // actually scheduled at least once (_taskInfos[id].TaskStatus > TaskStatus.Init) if (_taskInfos[id] != null && _taskInfos[id].TaskStatus > TaskState.Init) { - LOGGER.Log(Level.Info, $"{taskId} already part of task set: going to directly submit it."); + LOGGER.Log(Level.Info, "{0} already part of task set: going to directly submit it.", taskId); lock (_taskInfos[id].Lock) { @@ -606,7 +601,7 @@ public void OnNewActiveContext(IActiveContext activeContext) { bool isMaster = IsMasterTaskContext(activeContext).Any(); - LOGGER.Log(Level.Info, $"Task {taskId} to be scheduled on {activeContext.EvaluatorId}"); + LOGGER.Log(Level.Info, "Task {0} to be scheduled on {1}", taskId, 
activeContext.EvaluatorId); List partialTaskConfs = new List(); @@ -641,7 +636,8 @@ public IElasticTaskSetManager Build() } /// - /// Method implementing how the task set manager should react when a notification that a task is running is received. + /// Method implementing how the task set manager should react when a notification that a task is + /// running is received. /// /// The running task public void OnTaskRunning(IRunningTask task) @@ -657,14 +653,16 @@ public void OnTaskRunning(IRunningTask task) if (Completed() || Failed()) { - LOGGER.Log(Level.Info, $"Received running from task {task.Id} but task set is completed or failed: ignoring."); + LOGGER.Log(Level.Info, "Received running from task {0} but task set is completed " + + "or failed: ignoring.", task.Id); _taskInfos[id].Dispose(); return; } if (!TaskStateUtils.IsRunnable(_taskInfos[id].TaskStatus)) { - LOGGER.Log(Level.Info, $"Received running from task {task.Id} which is not runnable: ignoring."); + LOGGER.Log(Level.Info, "Received running from task {0} which is not runnable: ignoring.", + task.Id); _taskInfos[id].Dispose(); return; @@ -688,7 +686,8 @@ public void OnTaskRunning(IRunningTask task) } /// - /// Method implementing how the task set manager should react when a notification that a task is completed is received. + /// Method implementing how the task set manager should react when a notification that a task + /// is completed is received. /// /// The completed task public void OnTaskCompleted(ICompletedTask taskInfo) @@ -791,12 +790,14 @@ public void OnTimeout(Alarm alarm, ref List msgs, ref Lis { if (Completed() || Failed()) { - LOGGER.Log(Level.Warning, $"Taskset made no progress in the last {_parameters.Timeout}ms. Forcing Disposal."); + LOGGER.Log(Level.Warning, "Taskset made no progress in the last {0}ms. Forcing Disposal.", + _parameters.Timeout); Dispose(); } else { - LOGGER.Log(Level.Error, $"Taskset made no progress in the last {_parameters.Timeout}ms. 
Aborting."); + LOGGER.Log(Level.Error, "Taskset made no progress in the last {0}ms. Aborting.", + _parameters.Timeout); Fail(); return; } @@ -835,7 +836,7 @@ public void OnTaskFailure(IFailedTask task, ref List failureEvent { if (IsTaskManagedBy(task.Id)) { - LOGGER.Log(Level.Info, "Received a failure from " + task.Id, task.AsError()); + LOGGER.Log(Level.Info, "Received a failure from {0}", task.Id, task.AsError()); Interlocked.Decrement(ref _tasksRunning); _totFailedTasks++; @@ -844,7 +845,8 @@ public void OnTaskFailure(IFailedTask task, ref List failureEvent if (Completed() || Failed()) { - LOGGER.Log(Level.Info, $"Received a failure from task {task.Id} but the task set is completed or failed: ignoring the failure", task.AsError()); + LOGGER.Log(Level.Info, "Received a failure from task {0} but the task set is completed or " + + "failed: ignoring the failure", task.Id, task.AsError()); lock (_taskInfos[id].Lock) { @@ -896,7 +898,7 @@ public void OnTaskFailure(IFailedTask task, ref List failureEvent /// The failed evaluator public void OnEvaluatorFailure(IFailedEvaluator evaluator) { - LOGGER.Log(Level.Info, "Received a failure from " + evaluator.Id, evaluator.EvaluatorException); + LOGGER.Log(Level.Info, "Received a failure from {0}", evaluator.Id, evaluator.EvaluatorException); _totFailedEvaluators++; @@ -936,7 +938,9 @@ public void OnEvaluatorFailure(IFailedEvaluator evaluator) if (cinfo.NumRetry > _parameters.NumEvaluatorFailures) { - LOGGER.Log(Level.Error, $"Context {cinfo.Id} failed more than {_parameters.NumEvaluatorFailures} times: Aborting"); + LOGGER.Log(Level.Error, "Context {0} failed more than {1} times: Aborting", + cinfo.Id, + _parameters.NumEvaluatorFailures); Fail(); } @@ -971,17 +975,21 @@ public void EventDispatcher(ref IFailureEvent @event) var rec = @event as ReconfigureEvent; OnReconfigure(ref rec); break; + case DefaultFailureStateEvents.Reschedule: var res = @event as RescheduleEvent; OnReschedule(ref res); break; + case 
DefaultFailureStateEvents.Stop: var stp = @event as StopEvent; OnStop(ref stp); break; + case DefaultFailureStateEvents.Fail: OnFail(); break; + default: throw new IllegalStateException("Failure event not recognized."); } @@ -995,7 +1003,8 @@ public void OnReconfigure(ref ReconfigureEvent reconfigureEvent) { lock (_statusLock) { - _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReconfigure)); + _failureStatus = _failureStatus.Merge( + new DefaultFailureState((int)DefaultFailureStates.ContinueAndReconfigure)); } SendToTasks(reconfigureEvent.FailureResponse); @@ -1009,7 +1018,8 @@ public void OnReschedule(ref RescheduleEvent rescheduleEvent) { lock (_statusLock) { - _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.ContinueAndReschedule)); + _failureStatus = _failureStatus.Merge( + new DefaultFailureState((int)DefaultFailureStates.ContinueAndReschedule)); } SendToTasks(rescheduleEvent.FailureResponse); @@ -1025,7 +1035,8 @@ public void OnStop(ref StopEvent stopEvent) { lock (_statusLock) { - _failureStatus = _failureStatus.Merge(new DefaultFailureState((int)DefaultFailureStates.StopAndReschedule)); + _failureStatus = _failureStatus.Merge( + new DefaultFailureState((int)DefaultFailureStates.StopAndReschedule)); } SendToTasks(stopEvent.FailureResponse); @@ -1050,7 +1061,7 @@ public void OnFail() Dispose(); } - #endregion + #endregion Failure Response public void Dispose() { @@ -1139,7 +1150,7 @@ private void AddTask(string taskId, IActiveContext activeContext, List Configurations.Merge(x, y)); - _taskInfos[id] = new TaskInfo(aggregatedConfs, activeContext, activeContext.EvaluatorId, TaskState.Init, stageList); + _taskInfos[id] = new TaskInfo( + aggregatedConfs, + activeContext, + activeContext.EvaluatorId, + TaskState.Init, + stageList); if (_scheduled) { @@ -1172,7 +1188,7 @@ private bool StartSubmitTasks() { _scheduled = true; - LOGGER.Log(Level.Info, $"Scheduling {_tasksAdded} 
tasks from Taskset {StagesId}"); + LOGGER.Log(Level.Info, "Scheduling {0} tasks from Taskset {1}", _tasksAdded, StagesId); } } @@ -1202,8 +1218,8 @@ private void SubmitTask(int id) lock (_taskInfos[id].Lock) { - // Check that the task was not already submitted. This may happen for instance if _scheduled is set to true - // and a new active context message is received. + // Check that the task was not already submitted. This may happen for instance if + // _scheduled is set to true and a new active context message is received. if (_taskInfos[id].TaskStatus == TaskState.Submitted) { return; @@ -1238,7 +1254,8 @@ private void SubmitTask(int id) if (_taskInfos[id].IsActiveContextDisposed) { - LOGGER.Log(Level.Warning, $"Task submit for {id + 1} with a non-active context: spawning a new evaluator."); + LOGGER.Log(Level.Warning, + "Task submit for {0} with a non-active context: spawning a new evaluator.", id + 1); if (_taskInfos[id].TaskStatus == TaskState.Failed) { @@ -1321,7 +1338,7 @@ private void SendToTasks(IList messages, int retry = 0) private void SpawnNewEvaluator(int id) { - LOGGER.Log(Level.Warning, $"Spawning new evaluator for id {id}"); + LOGGER.Log(Level.Warning, "Spawning new evaluator for id {0}", id); var request = _evaluatorRequestor.NewBuilder() .SetNumber(1) @@ -1343,13 +1360,15 @@ private void Reschedule(RescheduleEvent rescheduleEvent) if (_taskInfos[id].NumRetry > _parameters.NumTaskFailures) { - LOGGER.Log(Level.Error, $"Task {rescheduleEvent.TaskId} failed more than {_parameters.NumTaskFailures} times: aborting"); + LOGGER.Log(Level.Error, "Task {0} failed more than {1} times: aborting", + rescheduleEvent.TaskId, + _parameters.NumTaskFailures); Fail(rescheduleEvent.TaskId); } if (rescheduleEvent.Reschedule) { - LOGGER.Log(Level.Info, $"Rescheduling task {rescheduleEvent.TaskId}"); + LOGGER.Log(Level.Info, "Rescheduling task {0}", rescheduleEvent.TaskId); _taskInfos[id].RescheduleConfigurations = rescheduleEvent.RescheduleTaskConfigurations; @@ 
-1367,7 +1386,9 @@ private void Fail(string taskId = "") private void LogFinalStatistics() { - var msg = string.Format("Total Failed Tasks: {0}\nTotal Failed Evaluators: {1}", _totFailedTasks, _totFailedEvaluators); + var msg = string.Format("Total Failed Tasks: {0}\nTotal Failed Evaluators: {1}", + _totFailedTasks, + _totFailedEvaluators); msg += _stages.Select(x => x.Value.LogFinalStatistics()).Aggregate((a, b) => a + "\n" + b); LOGGER.Log(Level.Info, msg); } @@ -1392,4 +1413,4 @@ private bool Failed() return _failureStatus.FailureState == (int)DefaultFailureStates.Fail; } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs index 7ed5a5911d..f14e86bd90 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs @@ -85,7 +85,7 @@ private FailuresClock( ++numberOfInstantiations; if (numberOfInstantiations > 1) { - LOGGER.Log(Level.Warning, "Instantiated `RuntimeClock` instance number " + numberOfInstantiations); + LOGGER.Log(Level.Warning, "Instantiated `RuntimeClock` instance number {0}", numberOfInstantiations); } } @@ -193,7 +193,8 @@ public void Run() } catch (Exception e) { - runtimeException = Optional.Of(new ReefRuntimeException("Caught Exception in clock, failing the Evaluator.", e)); + runtimeException = Optional.Of( + new ReefRuntimeException("Caught Exception in clock, failing the Evaluator.", e)); } var runtimeStop = runtimeException.IsPresent() @@ -253,7 +254,7 @@ private Time GetNextEvent() } /// - /// Process the next Time event. + /// Process the next Time event. 
/// /// The Time event to handle private void ProcessEvent(Time time) @@ -269,4 +270,4 @@ private void ProcessEvent(Time time) } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs index 4f6c2926b7..270c647ee6 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs @@ -32,8 +32,6 @@ using Org.Apache.REEF.Network.Elastic.Config; using System.Globalization; using Org.Apache.REEF.Tang.Util; -using Org.Apache.REEF.Tang.Exceptions; -using Org.Apache.REEF.Network.Elastic.Operators.Physical.Enum; namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Default { @@ -46,7 +44,7 @@ internal abstract class DefaultOneToN : ElasticOperatorWithDefaultDispatcher { private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultOneToN<>)); - private volatile bool _stop; + private volatile bool _stop = false; /// /// Constructor for an operator where one node sends to N nodes and with default @@ -74,8 +72,6 @@ public DefaultOneToN( { MasterId = senderId; WithinIteration = prev.WithinIteration; - - _stop = false; } /// @@ -84,7 +80,9 @@ public DefaultOneToN( /// Incoming message from a task /// Zero or more reply messages for the task /// True if the operator has reacted to the task message - protected override bool ReactOnTaskMessage(ITaskMessage message, ref List returnMessages) + protected override bool ReactOnTaskMessage( + ITaskMessage message, + ref List returnMessages) { var offset = BitConverter.ToUInt16(message.Message, 0); offset += sizeof(ushort); @@ -105,7 +103,7 @@ protected override bool ReactOnTaskMessage(ITaskMessage message, ref List.Of(_failureMachine)); + _topology.TopologyUpdateResponse( + message.TaskId, + ref returnMessages, + Optional.Of(_failureMachine)); if 
(_stop) { @@ -134,7 +136,8 @@ protected override bool ReactOnTaskMessage(ITaskMessage message, ref List public override void OnReconfigure(ref ReconfigureEvent reconfigureEvent) { - LOGGER.Log(Level.Info, $"Going to reconfigure the {OperatorType.ToString()} operator"); + LOGGER.Log(Level.Info, "Going to reconfigure the {0} operator", OperatorType.ToString()); if (reconfigureEvent.FailedTask.IsPresent()) { if (reconfigureEvent.FailedTask.Value.AsError() is OperatorException) { - var info = Optional.Of(((OperatorException)reconfigureEvent.FailedTask.Value.AsError()).AdditionalInfo); - var msg = _topology.Reconfigure(reconfigureEvent.FailedTask.Value.Id, info, reconfigureEvent.Iteration); + var info = Optional.Of( + ((OperatorException)reconfigureEvent.FailedTask.Value.AsError()).AdditionalInfo); + var msg = _topology.Reconfigure( + reconfigureEvent.FailedTask.Value.Id, + info, + reconfigureEvent.Iteration); reconfigureEvent.FailureResponse.AddRange(msg); } else { - var msg = _topology.Reconfigure(reconfigureEvent.FailedTask.Value.Id, Optional.Empty(), reconfigureEvent.Iteration); + var msg = _topology.Reconfigure( + reconfigureEvent.FailedTask.Value.Id, + Optional.Empty(), + reconfigureEvent.Iteration); reconfigureEvent.FailureResponse.AddRange(msg); } @@ -188,9 +198,11 @@ public override void OnReschedule(ref RescheduleEvent rescheduleEvent) // Iterators manage the re-schuedling of tasks. If not iterator exists, setup the rescheduling. 
if (!WithinIteration) { - LOGGER.Log(Level.Info, "Going to reschedule task " + rescheduleEvent.TaskId); + LOGGER.Log(Level.Info, "Going to reschedule task {0}", rescheduleEvent.TaskId); - if (!rescheduleEvent.RescheduleTaskConfigurations.TryGetValue(Stage.StageName, out IList confs)) + if (!rescheduleEvent.RescheduleTaskConfigurations.TryGetValue( + Stage.StageName, + out IList confs)) { confs = new List(); rescheduleEvent.RescheduleTaskConfigurations.Add(Stage.StageName, confs); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs index 4d3df764bb..94ad9f3b9f 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs @@ -69,7 +69,12 @@ protected ElasticOperatorWithDefaultDispatcher( /// The checkpoint policy for the operator /// Additional configurations for the operator /// The same operator pipeline with the added broadcast operator - public override ElasticOperator Broadcast(int senderId, ITopology topology, IFailureStateMachine failureMachine, CheckpointLevel checkpointLevel, params IConfiguration[] configurations) + public override ElasticOperator Broadcast( + int senderId, + ITopology topology, + IFailureStateMachine failureMachine, + CheckpointLevel checkpointLevel, + params IConfiguration[] configurations) { _next = new DefaultBroadcast(senderId, this, topology, failureMachine, checkpointLevel, configurations); return _next; @@ -91,11 +96,6 @@ public override void OnTaskFailure(IFailedTask task, ref List fai var opException = task.AsError() as OperatorException; failedOperatorId = opException.OperatorId; } - //else - //{ - // LOGGER.Log(Level.Info, $"Failure from {task.Id} cannot be properly managed: failing."); - // 
failureEvents.Add(new FailEvent(task.Id)); - //} if (WithinIteration || failedOperatorId <= _id) { @@ -135,7 +135,7 @@ public override void OnTaskFailure(IFailedTask task, ref List fai break; default: - LOGGER.Log(Level.Info, $"Failure from {task.Id} requires no action"); + LOGGER.Log(Level.Info, "Failure from {0} requires no action", task.Id); break; } @@ -154,7 +154,10 @@ public override void OnTaskFailure(IFailedTask task, ref List fai /// The alarm triggering the timeput /// A list of messages encoding how remote tasks need to react /// The next timeouts to be scheduled - public override void OnTimeout(Alarm alarm, ref List msgs, ref List nextTimeouts) + public override void OnTimeout( + Alarm alarm, + ref List msgs, + ref List nextTimeouts) { if (_next != null) { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs index b6617ea1c4..137cde4862 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs @@ -89,8 +89,8 @@ public abstract class ElasticOperator : IFailureResponse, ITaskMessageResponse protected readonly int _id; protected readonly IConfiguration[] _configurations; - protected bool _operatorFinalized; - protected volatile bool _operatorStateFinalized; + protected bool _operatorFinalized = false; + protected volatile bool _operatorStateFinalized = false; protected IElasticStage _stage; /// @@ -117,8 +117,6 @@ public ElasticOperator( _failureMachine = failureMachine; _checkpointLevel = checkpointLevel; _configurations = configurations; - _operatorFinalized = false; - _operatorStateFinalized = false; _topology.OperatorId = _id; _topology.StageName = Stage.StageName; @@ -172,7 +170,12 @@ public IElasticStage Stage /// The same operator pipeline with the added broadcast operator public ElasticOperator Broadcast(TopologyType 
topology, params IConfiguration[] configurations) { - return Broadcast(MasterId, GetTopology(topology), _failureMachine.Clone(), CheckpointLevel.None, configurations); + return Broadcast( + MasterId, + GetTopology(topology), + _failureMachine.Clone(), + CheckpointLevel.None, + configurations); } /// @@ -184,9 +187,17 @@ public ElasticOperator Broadcast(TopologyType topology, params IConfiguration /// The checkpoint policy for the operator /// Additional configurations for the operator /// The same operator pipeline with the added broadcast operator - public ElasticOperator Broadcast(TopologyType topology, CheckpointLevel checkpointLevel, params IConfiguration[] configurations) + public ElasticOperator Broadcast( + TopologyType topology, + CheckpointLevel checkpointLevel, + params IConfiguration[] configurations) { - return Broadcast(MasterId, GetTopology(topology), _failureMachine.Clone(), checkpointLevel, configurations); + return Broadcast( + MasterId, + GetTopology(topology), + _failureMachine.Clone(), + checkpointLevel, + configurations); } /// @@ -196,7 +207,8 @@ public ElasticOperator Broadcast(TopologyType topology, CheckpointLevel check /// The task message for the operator /// A list of messages containing the instructions for the task /// True if the message was managed correctly, false otherwise - /// If the message cannot be handled correctly or generate an incorrent state + /// If the message cannot be handled correctly or + /// generate an incorrent state public void OnTaskMessage(ITaskMessage message, ref List returnMessages) { var hasReacted = ReactOnTaskMessage(message, ref returnMessages); @@ -325,7 +337,12 @@ public virtual bool CheckIfLastIterator() /// The checkpoint policy for the operator /// Additional configurations for the operator /// The same operator pipeline with the added broadcast operator - public abstract ElasticOperator Broadcast(int senderId, ITopology topology, IFailureStateMachine failureMachine, CheckpointLevel checkpointLevel 
= CheckpointLevel.None, params IConfiguration[] configurations); + public abstract ElasticOperator Broadcast( + int senderId, + ITopology topology, + IFailureStateMachine failureMachine, + CheckpointLevel checkpointLevel = CheckpointLevel.None, + params IConfiguration[] configurations); /// /// Used to react on a failure occurred on a task. @@ -343,7 +360,10 @@ public virtual bool CheckIfLastIterator() /// The alarm triggering the timeput /// A list of messages encoding how remote Tasks need to reach /// The next timeouts to be scheduled - public abstract void OnTimeout(Alarm alarm, ref List msgs, ref List nextTimeouts); + public abstract void OnTimeout( + Alarm alarm, + ref List msgs, + ref List nextTimeouts); /// /// When a new failure state is reached, this method is used to dispatch @@ -467,7 +487,8 @@ protected void OnNewIteration(int iteration) /// /// This method is operator specific and serializes the operator configuration into the input list. /// - /// A list the serialized operator configuration will be appended to + /// A list the serialized operator configuration will be + /// appended to /// The task id of the task that belongs to this operator protected virtual void GetOperatorConfiguration(ref IList serializedOperatorsConfs, int taskId) { @@ -558,7 +579,10 @@ private ITopology GetTopology(TopologyType topologyType) topology = new FlatTopology(MasterId); break; - default: throw new ArgumentException(nameof(topologyType), $"Topology type {topologyType} not supported by {OperatorType.ToString()}."); + default: + throw new ArgumentException( + nameof(topologyType), + $"Topology type {topologyType} not supported by {OperatorType.ToString()}."); } return topology; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs index 1443a026c4..52127a1927 100644 --- 
a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs @@ -127,7 +127,8 @@ public T Receive() if (isIterative && typedDataMessage.Iteration < (int)IteratorReference.Current) { - LOGGER.Log(Level.Warning, $"Received message for iteration {typedDataMessage.Iteration} but I am already in iteration {(int)IteratorReference.Current}: ignoring."); + LOGGER.Log(Level.Warning, "Received message for iteration {0} but I am already in iteration " + + "{1}: ignoring.", typedDataMessage.Iteration, (int)IteratorReference.Current); } else { @@ -166,7 +167,7 @@ public void ResetPosition() /// public void WaitForTaskRegistration(CancellationTokenSource cancellationSource) { - LOGGER.Log(Level.Info, $"Waiting for task registration for {OperatorType.ToString()} operator."); + LOGGER.Log(Level.Info, "Waiting for task registration for {0} operator.", OperatorType.ToString()); _topology.WaitForTaskRegistration(cancellationSource); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs index f3373dcb96..60e42e842e 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs @@ -51,7 +51,7 @@ internal abstract class CommunicationLayer : private IDisposable _communicationObserver; private readonly ConcurrentDictionary _driverMessageObservers; - protected bool _disposed; + protected bool _disposed = false; protected readonly ConcurrentDictionary _groupMessageObservers = new ConcurrentDictionary(); @@ -78,8 +78,6 @@ protected CommunicationLayer( _driverMessagesHandler = driverMessagesHandler; _idFactory = idFactory; - _disposed = false; - _communicationObserver = _networkService.RemoteManager.RegisterObserver(this); _driverMessageObservers = _driverMessagesHandler.DriverMessageObservers; } 
@@ -119,8 +117,13 @@ internal void RegisterOperatorTopologyForDriver(DriverAwareOperatorTopology oper /// /// Send the communication message to the task whose name is included in the message. /// + /// The destination node for the message /// The message to send - internal void Send(string destination, ElasticGroupCommunicationMessage message, CancellationTokenSource cancellationSource) + /// The token to cancel the operation + internal void Send( + string destination, + ElasticGroupCommunicationMessage message, + CancellationTokenSource cancellationSource) { if (message == null) { @@ -163,7 +166,11 @@ internal void Send(string destination, ElasticGroupCommunicationMessage message, /// /// The identifier to look up /// The token to cancel the operation - public void WaitForTaskRegistration(IList identifiers, CancellationTokenSource cancellationSource, ConcurrentDictionary removed = null) + /// Nodes that got removed during task registration + public void WaitForTaskRegistration( + IList identifiers, + CancellationTokenSource cancellationSource, + ConcurrentDictionary removed = null) { if (removed == null) { @@ -176,36 +183,40 @@ public void WaitForTaskRegistration(IList identifiers, CancellationToken { if (cancellationSource != null && cancellationSource.Token.IsCancellationRequested) { - LOGGER.Log(Level.Warning, $"WaitForTaskRegistration is canceled in retryCount {i}."); + LOGGER.Log(Level.Warning, "WaitForTaskRegistration is canceled in retryCount {0}.", i); throw new OperationCanceledException("WaitForTaskRegistration is canceled"); } - LOGGER.Log(Level.Info, $"WaitForTaskRegistration, in retryCount {i}."); + LOGGER.Log(Level.Info, "WaitForTaskRegistration, in retryCount {0}.", i); foreach (var identifier in identifiers) { var notFound = !foundList.Contains(identifier); if (notFound && removed.ContainsKey(identifier)) { foundList.Add(identifier); - LOGGER.Log(Level.Verbose, $"WaitForTaskRegistration, dependent id {identifier} was removed at loop {i}."); + 
LOGGER.Log(Level.Verbose, + "WaitForTaskRegistration, dependent id {0} was removed at loop {1}.", identifier, i); } else if (notFound && Lookup(identifier)) { foundList.Add(identifier); - LOGGER.Log(Level.Verbose, $"WaitForTaskRegistration, find a dependent id {identifier} at loop {i}."); + LOGGER.Log(Level.Verbose, + "WaitForTaskRegistration, find a dependent id {0} at loop {1}.", identifier, i); } } if (foundList.Count >= identifiers.Count) { - LOGGER.Log(Level.Info, $"WaitForTaskRegistration, found all {foundList.Count} dependent ids at loop {i}."); + LOGGER.Log(Level.Info, + "WaitForTaskRegistration, found all {0} dependent ids at loop {1}.", foundList.Count, i); return; } Thread.Sleep(_sleepTime); } - ICollection leftovers = foundList.Count == 0 ? identifiers : identifiers.Where(e => !foundList.Contains(e)).ToList(); + ICollection leftovers = + foundList.Count == 0 ? identifiers : identifiers.Where(e => !foundList.Contains(e)).ToList(); var msg = string.Join(",", leftovers); LOGGER.Log(Level.Error, "Cannot find registered parent/children: {0}.", msg); @@ -279,7 +290,6 @@ private bool Send(IIdentifier destId, ElasticGroupCommunicationMessage message) } connection.Write(message); - LOGGER.Log(Level.Verbose, $"message sent to {destId}"); } catch (Exception e) { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs index df22ea4069..230a6d628c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs @@ -25,6 +25,7 @@ using Org.Apache.REEF.Network.Elastic.Comm.Impl; using Org.Apache.REEF.Network.Elastic.Topology.Physical; using Org.Apache.REEF.Utilities.Attributes; +using static Org.Apache.REEF.Network.Elastic.Config.GroupCommunicationConfigurationOptions; namespace Org.Apache.REEF.Network.Elastic.Task.Impl { @@ -43,9 
+44,9 @@ internal sealed class DefaultCommunicationLayer : /// [Inject] private DefaultCommunicationLayer( - [Parameter(typeof(GroupCommunicationConfigurationOptions.Timeout))] int timeout, - [Parameter(typeof(GroupCommunicationConfigurationOptions.RetryCountWaitingForRegistration))] int retryRegistration, - [Parameter(typeof(GroupCommunicationConfigurationOptions.SleepTimeWaitingForRegistration))] int sleepTime, + [Parameter(typeof(Timeout))] int timeout, + [Parameter(typeof(RetryCountWaitingForRegistration))] int retryRegistration, + [Parameter(typeof(SleepTimeWaitingForRegistration))] int sleepTime, [Parameter(typeof(ElasticServiceConfigurationOptions.SendRetry))] int retrySending, StreamingNetworkService networkService, DefaultTaskToDriverMessageDispatcher taskToDriverDispatcher, @@ -84,7 +85,8 @@ public override void OnNext(IRemoteMessage.Class); Type groupCommOperatorGenericInterface = typeof(IElasticTypedOperator<>); - Type groupCommOperatorInterface = groupCommOperatorGenericInterface.MakeGenericType(Type.GetType(msgType)); + Type groupCommOperatorInterface = + groupCommOperatorGenericInterface.MakeGenericType(Type.GetType(msgType)); var operatorObj = operatorInjector.GetInstance(groupCommOperatorInterface); Workflow.Add(operatorObj as IElasticOperator); @@ -103,7 +104,7 @@ public void WaitForTaskRegistration(CancellationTokenSource cancellationSource = } catch (OperationCanceledException e) { - LOGGER.Log(Level.Error, $"Stage {StageName} failed during registration."); + LOGGER.Log(Level.Error, "Stage {0} failed during registration.", StageName); throw e; } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultTaskToDriverMessageDispatcher.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultTaskToDriverMessageDispatcher.cs index 3190041048..143a514160 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultTaskToDriverMessageDispatcher.cs +++ 
b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultTaskToDriverMessageDispatcher.cs @@ -30,7 +30,9 @@ namespace Org.Apache.REEF.Network.Elastic.Task.Impl /// messages dispatcher. /// [Unstable("0.16", "API may change")] - internal sealed class DefaultTaskToDriverMessageDispatcher : TaskToDriverMessageDispatcher, IDefaultTaskToDriverMessages + internal sealed class DefaultTaskToDriverMessageDispatcher : + TaskToDriverMessageDispatcher, + IDefaultTaskToDriverMessages { private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultTaskToDriverMessageDispatcher)); @@ -57,11 +59,12 @@ public void JoinTopology(string taskId, string stageName, int operatorId) offset += sizeof(ushort); Buffer.BlockCopy(ByteUtilities.StringToByteArrays(stageName), 0, message, offset, stageName.Length); offset += stageName.Length; - Buffer.BlockCopy(BitConverter.GetBytes((ushort)TaskMessageType.JoinTopology), 0, message, offset, sizeof(ushort)); + Buffer.BlockCopy( + BitConverter.GetBytes((ushort)TaskMessageType.JoinTopology), 0, message, offset, sizeof(ushort)); offset += sizeof(ushort); Buffer.BlockCopy(BitConverter.GetBytes((ushort)operatorId), 0, message, offset, sizeof(ushort)); - LOGGER.Log(Level.Info, $"Operator {operatorId} requesting to join the topology through heartbeat."); + LOGGER.Log(Level.Info, "Operator {0} requesting to join the topology through heartbeat.", operatorId); Send(taskId, message); } @@ -79,11 +82,16 @@ public void TopologyUpdateRequest(string taskId, string stageName, int operatorI offset += sizeof(ushort); Buffer.BlockCopy(ByteUtilities.StringToByteArrays(stageName), 0, message, offset, stageName.Length); offset += stageName.Length; - Buffer.BlockCopy(BitConverter.GetBytes((ushort)TaskMessageType.TopologyUpdateRequest), 0, message, offset, sizeof(ushort)); + Buffer.BlockCopy( + BitConverter.GetBytes((ushort)TaskMessageType.TopologyUpdateRequest), + 0, + message, + offset, + sizeof(ushort)); offset += sizeof(ushort); 
Buffer.BlockCopy(BitConverter.GetBytes((ushort)operatorId), 0, message, offset, sizeof(ushort)); - LOGGER.Log(Level.Info, string.Format($"Operator {operatorId} requesting a topology update through heartbeat.")); + LOGGER.Log(Level.Info, "Operator {0} requesting a topology update through heartbeat.", operatorId); Send(taskId, message); } @@ -100,11 +108,12 @@ public void StageComplete(string taskId, string stageName) offset += sizeof(ushort); Buffer.BlockCopy(ByteUtilities.StringToByteArrays(stageName), 0, message, offset, stageName.Length); offset += stageName.Length; - Buffer.BlockCopy(BitConverter.GetBytes((ushort)TaskMessageType.CompleteStage), 0, message, offset, sizeof(ushort)); + Buffer.BlockCopy( + BitConverter.GetBytes((ushort)TaskMessageType.CompleteStage), 0, message, offset, sizeof(ushort)); LOGGER.Log(Level.Info, "Sending notification that the stage is completed."); Send(taskId, message); } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs index 87f56d3dc1..656830ce70 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs @@ -43,7 +43,7 @@ public sealed class Workflow : IEnumerator, IEnumerable _iteratorsPosition = new List(); + private readonly List _iteratorsPosition = new List(); private readonly object _lock = new object(); private readonly IList _operators = new List(); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs index 196f82b910..6cd75193da 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs @@ -41,22 +41,22 @@ public class FlatTopology : ITopology { private static readonly Logger LOGGER = 
Logger.GetLogger(typeof(FlatTopology)); - private string _rootTaskId; + private string _rootTaskId = string.Empty; private int _rootId; - private string _taskStage; - private volatile int _iteration; - private bool _finalized; + private string _taskStage = string.Empty; + private volatile int _iteration = 1; + private bool _finalized = false; private readonly bool _sorted; - private readonly Dictionary _nodes; - private readonly HashSet _lostNodesToBeRemoved; - private HashSet _nodesWaitingToJoinTopologyNextIteration; - private HashSet _nodesWaitingToJoinTopology; + private readonly Dictionary _nodes = new Dictionary(); + private readonly HashSet _lostNodesToBeRemoved = new HashSet(); + private HashSet _nodesWaitingToJoinTopologyNextIteration = new HashSet(); + private HashSet _nodesWaitingToJoinTopology = new HashSet(); - private volatile int _availableDataPoints; + private volatile int _availableDataPoints = 0; private int _totNumberofNodes; - private readonly object _lock; + private readonly object _lock = new object(); /// /// Constructor for flat topology. 
After construction the graph is empty @@ -66,21 +66,9 @@ public class FlatTopology : ITopology /// Whether the leaf nodes need to be ordered or not public FlatTopology(int rootId, bool sorted = false) { - _rootTaskId = string.Empty; - _taskStage = string.Empty; _rootId = rootId; - _finalized = false; _sorted = sorted; OperatorId = -1; - _iteration = 1; - _availableDataPoints = 0; - - _lock = new object(); - - _nodes = new Dictionary(); - _lostNodesToBeRemoved = new HashSet(); - _nodesWaitingToJoinTopologyNextIteration = new HashSet(); - _nodesWaitingToJoinTopology = new HashSet(); } /// @@ -307,7 +295,10 @@ public void GetTaskConfiguration(ref ICsConfigurationBuilder confBuilder, int ta /// The identifier of the task asking for the update /// A list of message containing the topology update /// An optional failure machine to log updates - public void TopologyUpdateResponse(string taskId, ref List returnMessages, Optional failureStateMachine) + public void TopologyUpdateResponse( + string taskId, + ref List returnMessages, + Optional failureStateMachine) { if (taskId != _rootTaskId) { @@ -323,14 +314,18 @@ public void TopologyUpdateResponse(string taskId, ref List() { update }, StageName, OperatorId, _iteration); + var data = new UpdateMessagePayload( + new List() { update }, StageName, OperatorId, _iteration); var returnMessage = new ElasticDriverMessageImpl(_rootTaskId, data); returnMessages.Add(returnMessage); if (_nodesWaitingToJoinTopology.Count > 0) { - LOGGER.Log(Level.Info, $"Tasks [{string.Join(",", _nodesWaitingToJoinTopology)}] are added to topology in iteration {_iteration}"); + LOGGER.Log(Level.Info, + "Tasks [{0}] are added to topology in iteration {1}", + string.Join(",", _nodesWaitingToJoinTopology), + _iteration); _availableDataPoints += _nodesWaitingToJoinTopology.Count; failureStateMachine.Value.AddDataPoints(_nodesWaitingToJoinTopology.Count, false); @@ -352,7 +347,11 @@ public void TopologyUpdateResponse(string taskId, ref ListThe new iteration 
number public void OnNewIteration(int iteration) { - LOGGER.Log(Level.Info, $"Flat Topology for Operator {OperatorId} in Iteration {iteration - 1} is closed with {_availableDataPoints} nodes"); + LOGGER.Log(Level.Info, + "Flat Topology for Operator {0} in Iteration {1} is closed with {2} nodes", + OperatorId, + iteration - 1, + _availableDataPoints); _iteration = iteration; _totNumberofNodes += _availableDataPoints; @@ -370,7 +369,10 @@ public void OnNewIteration(int iteration) /// Some additional topology-specific information /// The optional iteration number in which the event occurred /// One or more messages for reconfiguring the Tasks - public IList Reconfigure(string taskId, Optional info, Optional iteration) + public IList Reconfigure( + string taskId, + Optional info, + Optional iteration) { if (taskId == _rootTaskId) { @@ -400,7 +402,7 @@ public IList Reconfigure(string taskId, Optional var data = new FailureMessagePayload(update, StageName, OperatorId, -1); var returnMessage = new ElasticDriverMessageImpl(_rootTaskId, data); - LOGGER.Log(Level.Info, $"Task {taskId} is removed from topology"); + LOGGER.Log(Level.Info, "Task {0} is removed from topology", taskId); messages.Add(returnMessage); _lostNodesToBeRemoved.Clear(); } @@ -414,12 +416,18 @@ public IList Reconfigure(string taskId, Optional /// public string LogFinalStatistics() { - return $"\nAverage number of nodes in the topology of Operator {OperatorId}: {(_iteration >= 2 ? (float)_totNumberofNodes / (_iteration - 1) : _availableDataPoints)}"; + return string.Format( + "\nAverage number of nodes in the topology of Operator {0}: {1}", + OperatorId, + _iteration >= 2 ? (float)_totNumberofNodes / (_iteration - 1) : _availableDataPoints); } private void BuildTopology() { - IEnumerator iter = _sorted ? _nodes.OrderBy(kv => kv.Key).Select(kv => kv.Value).GetEnumerator() : _nodes.Values.GetEnumerator(); + IEnumerator iter = + _sorted ? 
+ _nodes.OrderBy(kv => kv.Key).Select(kv => kv.Value).GetEnumerator() : + _nodes.Values.GetEnumerator(); var root = _nodes[_rootId]; while (iter.MoveNext()) @@ -431,4 +439,4 @@ private void BuildTopology() } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs index 7285907fea..279fe4fa8f 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs @@ -38,7 +38,8 @@ internal abstract class OneToNTopology : OperatorTopologyWithDefaultCommunicatio { protected static readonly Logger LOGGER = Logger.GetLogger(typeof(OneToNTopology)); - protected readonly ConcurrentDictionary _nodesToRemove; + protected readonly ConcurrentDictionary _nodesToRemove = + new ConcurrentDictionary(); protected readonly ManualResetEvent _topologyUpdateReceived; protected readonly bool _piggybackTopologyUpdates; @@ -67,9 +68,16 @@ public OneToNTopology( int retry, int timeout, int disposeTimeout, - DefaultCommunicationLayer commLayer) : base(stageName, taskId, rootTaskId, operatorId, commLayer, retry, timeout, disposeTimeout) + DefaultCommunicationLayer commLayer) : base( + stageName, + taskId, + rootTaskId, + operatorId, + commLayer, + retry, + timeout, + disposeTimeout) { - _nodesToRemove = new ConcurrentDictionary(); _topologyUpdateReceived = new ManualResetEvent(RootTaskId == taskId ? false : true); _commLayer.RegisterOperatorTopologyForTask(this); @@ -94,7 +102,7 @@ public bool IsSending } /// - /// Waiting logic before disposing topologies. + /// Waiting logic before disposing topologies. 
/// public void WaitCompletionBeforeDisposing(CancellationTokenSource cancellationSource) { @@ -125,7 +133,8 @@ public override void WaitForTaskRegistration(CancellationTokenSource cancellatio } catch (Exception e) { - throw new IllegalStateException("Failed to find parent/children nodes in operator topology for node: " + TaskId, e); + throw new IllegalStateException( + "Failed to find parent/children nodes in operator topology for node: " + TaskId, e); } _initialized = true; @@ -182,7 +191,7 @@ public override void OnNext(DriverMessagePayload message) { foreach (var node in updates.Children) { - LOGGER.Log(Level.Info, $"Removing task {node} from the topology."); + LOGGER.Log(Level.Info, "Removing task {0} from the topology.", node); _nodesToRemove.TryAdd(node, new byte()); _commLayer.RemoveConnection(node); } @@ -219,12 +228,15 @@ public override void OnNext(DriverMessagePayload message) } else { - LOGGER.Log(Level.Warning, "Received a topology update message from driver but sending queue is empty: ignoring."); + LOGGER.Log(Level.Warning, "Received a topology update message from driver " + + "but sending queue is empty: ignoring."); } } break; + default: - throw new ArgumentException($"Message type {message.PayloadType} not supported by N to one topologies."); + throw new ArgumentException( + $"Message type {message.PayloadType} not supported by N to one topologies."); } } @@ -254,4 +266,4 @@ private void UpdateTopology(ref List updates) } } } -} +} \ No newline at end of file From efb630f2b5ecdda05fee1c406ae9b707717af8e5 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Thu, 31 Jan 2019 11:22:42 -0800 Subject: [PATCH 12/29] Fixed: - Log notation - Added generics to drivers and client to decrease the number of files --- .../ElasticBroadcastClient.cs | 85 +++++++++--------- ...stClientWithFailEvaluatorBeforeWorkflow.cs | 53 ----------- ...roadcastClientWithFailureAfterBroadcast.cs | 53 ----------- ...oadcastClientWithFailureBeforeBroadcast.cs | 53 ----------- 
...roadcastClientWithFailureBeforeWorkflow.cs | 53 ----------- ...BroadcastClientWithFailureInConstructor.cs | 53 ----------- ...sticBroadcastClientWithMultipleFailures.cs | 53 ----------- .../Run.cs | 90 +++++++++++-------- .../Elastic/BroadcastMasterTask.cs | 4 +- .../Elastic/BroadcastSlaveTask.cs | 4 +- .../Elastic/ElasticBroadcastDriver.cs | 11 +-- .../ElasticBroadcastDriverWithFailures.cs | 14 +-- .../BroadcastSlaveTaskDieAfterBroadcast.cs | 4 +- .../BroadcastSlaveTaskDieBeforeBroadcast.cs | 4 +- .../BroadcastSlaveTaskDieBeforeWorkflow.cs | 4 +- ...castSlaveTaskDieEvaluatorBeforeWorkflow.cs | 4 +- .../BroadcastSlaveTaskDieInConstructor.cs | 4 +- .../BroadcastSlaveTaskDieMultiple.cs | 4 +- ...BroadcastSlaveTaskDieMultipleEvaluators.cs | 4 +- ...stDriverWithFailEvaluatorBeforeWorkflow.cs | 58 ------------ ...roadcastDriverWithFailureAfterBroadcast.cs | 57 ------------ ...oadcastDriverWithFailureBeforeBroadcast.cs | 57 ------------ ...roadcastDriverWithFailureBeforeWorkflow.cs | 57 ------------ ...BroadcastDriverWithFailureInConstructor.cs | 57 ------------ ...sticBroadcastDriverWithMultipleFailures.cs | 55 ------------ .../ScatterReduceDriver.cs | 16 ++-- .../Driver/Default/DefaultElasticStage.cs | 8 +- .../Default/DefaultElasticTaskSetManager.cs | 62 ++++++------- .../Elastic/Failures/FailuresClock.cs | 4 +- .../Logical/Default/DefaultOneToN.cs | 12 +-- .../ElastiOperatorWithDefaultDispatcher.cs | 6 +- .../Operators/Logical/ElasticOperator.cs | 4 +- .../Physical/Default/DefaultOneToN.cs | 6 +- .../Elastic/Task/CommunicationLayer.cs | 20 ++--- .../Task/Default/DefaultCommunicationLayer.cs | 4 +- .../Task/Default/DefaultElasticStage.cs | 6 +- .../DefaultTaskToDriverMessageDispatcher.cs | 8 +- .../Elastic/Task/Workflow.cs | 6 +- .../Topology/Logical/Impl/FlatTopology.cs | 8 +- .../Default/DefaultBroadcastTopology.cs | 2 +- .../Physical/Default/OneToNTopology.cs | 6 +- .../Utilities/Utils.cs | 6 +- 42 files changed, 220 insertions(+), 859 deletions(-) delete mode 
100644 lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailEvaluatorBeforeWorkflow.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureAfterBroadcast.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureBeforeBroadcast.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureBeforeWorkflow.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureInConstructor.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithMultipleFailures.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureAfterBroadcast.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeBroadcast.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeWorkflow.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureInConstructor.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithMultipleFailures.cs diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs index fab5e70fc6..e8f98fdf88 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs @@ -28,24 +28,32 @@ using Org.Apache.REEF.Client.Local; using Org.Apache.REEF.Client.Yarn; using Org.Apache.REEF.Network.Elastic.Config; -using 
Org.Apache.REEF.Network.Examples.Elastic; +using Org.Apache.REEF.Network.Elastic.Driver.Default; namespace Org.Apache.REEF.Network.Examples.Client { - public class ElasticBroadcastClient + internal class JobIdentifiers { - const string Local = "local"; - const string Yarn = "yarn"; - const string DefaultRuntimeFolder = "REEF_LOCAL_RUNTIME"; + public const string ElastiBroadcast = "ElasticBroadcast"; + public const string ElastiBroadcastWithFailure = "ElasticBroadcastWithFailure"; + } + + public sealed class ElasticBroadcastClient where T : DefaultElasticDriver + { + private const string Local = "local"; + private const string Yarn = "yarn"; + private const string DefaultRuntimeFolder = "REEF_LOCAL_RUNTIME"; + private const string stage = "Broadcast"; public ElasticBroadcastClient( - bool runOnYarn, - int numTasks, - int startingPortNo, - int portRange) + bool runOnYarn, + int numTasks, + int startingPortNo, + int portRange, + string jobIdentifier) { - const string driverId = "ElasticBroadcastDriver"; - const string stage = "Broadcast"; + string driverId = GenericType.Class.ToString(); + JobIdentifier = jobIdentifier; IConfiguration driverConfig = TangFactory.GetTang() .NewConfigurationBuilder(GetDriverConf()) @@ -72,20 +80,16 @@ public ElasticBroadcastClient( .Merge(driverConfig, elsticGroupCommServiceDriverConfig); string runPlatform = runOnYarn ? 
"yarn" : "local"; - TestRun( - merged, - typeof(ElasticBroadcastDriver), - numTasks, - JobIdentifier, - runPlatform); + + TestRun(merged, typeof(T), numTasks, JobIdentifier, runPlatform); } - internal static void TestRun( - IConfiguration driverConfig, - Type globalAssemblyType, - int numberOfEvaluator, - string jobIdentifier = "myDriver", - string runOnYarn = "local", + private static void TestRun( + IConfiguration driverConfig, + Type globalAssemblyType, + int numberOfEvaluator, + string jobIdentifier = "myDriver", + string runOnYarn = "local", string runtimeFolder = DefaultRuntimeFolder) { IInjector injector = TangFactory.GetTang() @@ -101,9 +105,9 @@ internal static void TestRun( reefClient.SubmitAndGetJobStatus(jobSubmission); } - internal static IConfiguration GetRuntimeConfiguration( - string runOnYarn, - int numberOfEvaluator, + private static IConfiguration GetRuntimeConfiguration( + string runOnYarn, + int numberOfEvaluator, string runtimeFolder) { switch (runOnYarn) @@ -112,35 +116,34 @@ internal static IConfiguration GetRuntimeConfiguration( var dir = Path.Combine(".", runtimeFolder); return LocalRuntimeClientConfiguration.ConfigurationModule .Set( - LocalRuntimeClientConfiguration.NumberOfEvaluators, + LocalRuntimeClientConfiguration.NumberOfEvaluators, numberOfEvaluator.ToString()) .Set(LocalRuntimeClientConfiguration.RuntimeFolder, dir) .Build(); + case Yarn: return YARNClientConfiguration.ConfigurationModule.Build(); + default: throw new ArgumentException("Unknown runtime: " + runOnYarn); } } - protected virtual string JobIdentifier - { - get { return "ElasticBroadcast"; } - } + private string JobIdentifier { get; set; } - protected virtual IConfiguration GetDriverConf() + private IConfiguration GetDriverConf() { return DriverConfiguration.ConfigurationModule - .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorFailed, 
GenericType.Class) - .Set(DriverConfiguration.OnContextActive, GenericType.Class) - .Set(DriverConfiguration.OnTaskRunning, GenericType.Class) - .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class) - .Set(DriverConfiguration.OnTaskFailed, GenericType.Class) - .Set(DriverConfiguration.OnTaskMessage, GenericType.Class) + .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) + .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class) + .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class) + .Set(DriverConfiguration.OnContextActive, GenericType.Class) + .Set(DriverConfiguration.OnTaskRunning, GenericType.Class) + .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class) + .Set(DriverConfiguration.OnTaskFailed, GenericType.Class) + .Set(DriverConfiguration.OnTaskMessage, GenericType.Class) .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString()) .Build(); } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailEvaluatorBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailEvaluatorBeforeWorkflow.cs deleted file mode 100644 index c1ea6ce626..0000000000 --- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailEvaluatorBeforeWorkflow.cs +++ /dev/null @@ -1,53 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using Org.Apache.REEF.Driver; -using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Tang.Util; -using Org.Apache.REEF.Utilities.Logging; -using Org.Apache.REEF.Network.Examples.Elastic; - -namespace Org.Apache.REEF.Network.Examples.Client -{ - public sealed class ElasticBroadcastClientWithFailEvaluatorBeforeWorkflow : ElasticBroadcastClient - { - public ElasticBroadcastClientWithFailEvaluatorBeforeWorkflow(bool runOnYarn, int numTasks, int startingPortNo, int portRange) - : base (runOnYarn, numTasks, startingPortNo, portRange) - { - } - - protected override string JobIdentifier - { - get { return "ElasticBroadcastWithFailure"; } - } - - protected override IConfiguration GetDriverConf() - { - return DriverConfiguration.ConfigurationModule - .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class) - .Set(DriverConfiguration.OnContextActive, GenericType.Class) - .Set(DriverConfiguration.OnTaskRunning, GenericType.Class) - .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class) - .Set(DriverConfiguration.OnTaskFailed, GenericType.Class) - .Set(DriverConfiguration.OnTaskMessage, GenericType.Class) - .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString()) - .Build(); - } - } -} diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureAfterBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureAfterBroadcast.cs deleted 
file mode 100644 index db3872d776..0000000000 --- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureAfterBroadcast.cs +++ /dev/null @@ -1,53 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using Org.Apache.REEF.Driver; -using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Tang.Util; -using Org.Apache.REEF.Utilities.Logging; -using Org.Apache.REEF.Network.Examples.Elastic; - -namespace Org.Apache.REEF.Network.Examples.Client -{ - public sealed class ElasticBroadcastClientWithFailureAfterBroadcast : ElasticBroadcastClient - { - public ElasticBroadcastClientWithFailureAfterBroadcast(bool runOnYarn, int numTasks, int startingPortNo, int portRange) - : base (runOnYarn, numTasks, startingPortNo, portRange) - { - } - - protected override string JobIdentifier - { - get { return "ElasticBroadcastWithFailure"; } - } - - protected override IConfiguration GetDriverConf() - { - return DriverConfiguration.ConfigurationModule - .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class) - .Set(DriverConfiguration.OnContextActive, GenericType.Class) 
- .Set(DriverConfiguration.OnTaskRunning, GenericType.Class) - .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class) - .Set(DriverConfiguration.OnTaskFailed, GenericType.Class) - .Set(DriverConfiguration.OnTaskMessage, GenericType.Class) - .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString()) - .Build(); - } - } -} diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureBeforeBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureBeforeBroadcast.cs deleted file mode 100644 index 13e5d9de55..0000000000 --- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureBeforeBroadcast.cs +++ /dev/null @@ -1,53 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -using Org.Apache.REEF.Driver; -using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Tang.Util; -using Org.Apache.REEF.Utilities.Logging; -using Org.Apache.REEF.Network.Examples.Elastic; - -namespace Org.Apache.REEF.Network.Examples.Client -{ - public sealed class ElasticBroadcastClientWithFailureBeforeBroadcast : ElasticBroadcastClient - { - public ElasticBroadcastClientWithFailureBeforeBroadcast(bool runOnYarn, int numTasks, int startingPortNo, int portRange) - : base (runOnYarn, numTasks, startingPortNo, portRange) - { - } - - protected override string JobIdentifier - { - get { return "ElasticBroadcastWithFailure"; } - } - - protected override IConfiguration GetDriverConf() - { - return DriverConfiguration.ConfigurationModule - .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class) - .Set(DriverConfiguration.OnContextActive, GenericType.Class) - .Set(DriverConfiguration.OnTaskRunning, GenericType.Class) - .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class) - .Set(DriverConfiguration.OnTaskFailed, GenericType.Class) - .Set(DriverConfiguration.OnTaskMessage, GenericType.Class) - .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString()) - .Build(); - } - } -} diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureBeforeWorkflow.cs deleted file mode 100644 index f3b8c3dbd5..0000000000 --- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureBeforeWorkflow.cs +++ /dev/null @@ -1,53 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using Org.Apache.REEF.Driver; -using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Tang.Util; -using Org.Apache.REEF.Utilities.Logging; -using Org.Apache.REEF.Network.Examples.Elastic; - -namespace Org.Apache.REEF.Network.Examples.Client -{ - public sealed class ElasticBroadcastClientWithFailureBeforeWorkflow : ElasticBroadcastClient - { - public ElasticBroadcastClientWithFailureBeforeWorkflow(bool runOnYarn, int numTasks, int startingPortNo, int portRange) - : base (runOnYarn, numTasks, startingPortNo, portRange) - { - } - - protected override string JobIdentifier - { - get { return "ElasticBroadcastWithFailure"; } - } - - protected override IConfiguration GetDriverConf() - { - return DriverConfiguration.ConfigurationModule - .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class) - .Set(DriverConfiguration.OnContextActive, GenericType.Class) - .Set(DriverConfiguration.OnTaskRunning, GenericType.Class) - .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class) - .Set(DriverConfiguration.OnTaskFailed, GenericType.Class) - .Set(DriverConfiguration.OnTaskMessage, GenericType.Class) - .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString()) - .Build(); - } - } -} diff --git 
a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureInConstructor.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureInConstructor.cs deleted file mode 100644 index 6f7e1815e3..0000000000 --- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithFailureInConstructor.cs +++ /dev/null @@ -1,53 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -using Org.Apache.REEF.Driver; -using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Tang.Util; -using Org.Apache.REEF.Utilities.Logging; -using Org.Apache.REEF.Network.Examples.Elastic; - -namespace Org.Apache.REEF.Network.Examples.Client -{ - public sealed class ElasticBroadcastClientWithFailureInConstructor : ElasticBroadcastClient - { - public ElasticBroadcastClientWithFailureInConstructor(bool runOnYarn, int numTasks, int startingPortNo, int portRange) - : base (runOnYarn, numTasks, startingPortNo, portRange) - { - } - - protected override string JobIdentifier - { - get { return "ElasticBroadcastWithFailure"; } - } - - protected override IConfiguration GetDriverConf() - { - return DriverConfiguration.ConfigurationModule - .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class) - .Set(DriverConfiguration.OnContextActive, GenericType.Class) - .Set(DriverConfiguration.OnTaskRunning, GenericType.Class) - .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class) - .Set(DriverConfiguration.OnTaskFailed, GenericType.Class) - .Set(DriverConfiguration.OnTaskMessage, GenericType.Class) - .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString()) - .Build(); - } - } -} diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithMultipleFailures.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithMultipleFailures.cs deleted file mode 100644 index 550964d3e7..0000000000 --- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClientWithMultipleFailures.cs +++ /dev/null @@ -1,53 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using Org.Apache.REEF.Driver; -using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Tang.Util; -using Org.Apache.REEF.Utilities.Logging; -using Org.Apache.REEF.Network.Examples.Elastic; - -namespace Org.Apache.REEF.Network.Examples.Client -{ - public sealed class ElasticBroadcastClientWithMultipleFailures : ElasticBroadcastClient - { - public ElasticBroadcastClientWithMultipleFailures(bool runOnYarn, int numTasks, int startingPortNo, int portRange) - : base (runOnYarn, numTasks, startingPortNo, portRange) - { - } - - protected override string JobIdentifier - { - get { return "ElasticBroadcastWithFailure"; } - } - - protected override IConfiguration GetDriverConf() - { - return DriverConfiguration.ConfigurationModule - .Set(DriverConfiguration.OnDriverStarted, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorAllocated, GenericType.Class) - .Set(DriverConfiguration.OnEvaluatorFailed, GenericType.Class) - .Set(DriverConfiguration.OnContextActive, GenericType.Class) - .Set(DriverConfiguration.OnTaskRunning, GenericType.Class) - .Set(DriverConfiguration.OnTaskCompleted, GenericType.Class) - .Set(DriverConfiguration.OnTaskFailed, GenericType.Class) - .Set(DriverConfiguration.OnTaskMessage, GenericType.Class) - .Set(DriverConfiguration.CustomTraceLevel, Level.Info.ToString()) - .Build(); - } - } -} diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs 
b/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs index bab9f96f31..c4bda166c2 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/Run.cs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +using Org.Apache.REEF.Network.Examples.Elastic; using Org.Apache.REEF.Network.Examples.GroupCommunication; using System; @@ -42,7 +43,7 @@ public static void Main(string[] args) int numNodes = 5; int startPort = 8900; int portRange = 1000; - string testToRun = "ElasticBroadcastWithFailEvaluatorBeforeWorkflow"; + string testToRun = TestType.ElasticBroadcastWithFailEvaluatorBeforeWorkflow.ToString(); if (args != null) { @@ -84,11 +85,11 @@ public static void Main(string[] args) } new PipelineBroadcastAndReduceClient().RunPipelineBroadcastAndReduce( - runOnYarn, - numNodes, + runOnYarn, + numNodes, startPort, - portRange, - arraySize, + portRange, + arraySize, chunkSize); Console.WriteLine("PipelineBroadcastAndReduce completed!!!"); } @@ -96,76 +97,87 @@ public static void Main(string[] args) if (TestType.BroadcastAndReduce.Match(testToRun)) { new BroadcastAndReduceClient().RunBroadcastAndReduce( - runOnYarn, - numNodes, - startPort, + runOnYarn, + numNodes, + startPort, portRange); Console.WriteLine("BroadcastAndReduce completed!!!"); } if (TestType.ElasticBroadcast.Match(testToRun)) { - new ElasticBroadcastClient(runOnYarn, numNodes, startPort, portRange); + new ElasticBroadcastClient( + runOnYarn, + numNodes, + startPort, + portRange, + JobIdentifiers.ElastiBroadcast); Console.WriteLine("ElasticBroadcast completed!!!"); } if (TestType.ElasticBroadcastWithFailureInConstructor.Match(testToRun)) { - new ElasticBroadcastClientWithFailureInConstructor( - runOnYarn, - numNodes, - startPort, - portRange); + new ElasticBroadcastClient>( + runOnYarn, + numNodes, + startPort, + portRange, + JobIdentifiers.ElastiBroadcastWithFailure); 
Console.WriteLine("ElasticBroadcastWithFailureInConstructor completed!!!"); } if (TestType.ElasticBroadcastWithFailureBeforeWorkflow.Match(testToRun)) { - new ElasticBroadcastClientWithFailureBeforeWorkflow( - runOnYarn, - numNodes, - startPort, - portRange); + new ElasticBroadcastClient>( + runOnYarn, + numNodes, + startPort, + portRange, + JobIdentifiers.ElastiBroadcastWithFailure); Console.WriteLine("ElasticBroadcastWithFailureBeforeWorkflow completed!!!"); } if (TestType.ElasticBroadcastWithFailEvaluatorBeforeWorkflow.Match(testToRun)) { - new ElasticBroadcastClientWithFailEvaluatorBeforeWorkflow( - runOnYarn, - numNodes, - startPort, - portRange); + new ElasticBroadcastClient>( + runOnYarn, + numNodes, + startPort, + portRange, + JobIdentifiers.ElastiBroadcastWithFailure); Console.WriteLine("ElasticBroadcastWithFailEvaluatorBeforeWorkflow completed!!!"); } if (TestType.ElasticBroadcastWithFailureBeforeBroadcast.Match(testToRun)) { - new ElasticBroadcastClientWithFailureBeforeBroadcast( - runOnYarn, - numNodes, - startPort, - portRange); + new ElasticBroadcastClient>( + runOnYarn, + numNodes, + startPort, + portRange, + JobIdentifiers.ElastiBroadcastWithFailure); Console.WriteLine("ElasticBroadcastWithFailureBeforeBroadcast completed!!!"); } if (TestType.ElasticBroadcastWithFailureAfterBroadcast.Match(testToRun)) { - new ElasticBroadcastClientWithFailureAfterBroadcast( - runOnYarn, - numNodes, - startPort, - portRange); + new ElasticBroadcastClient>( + runOnYarn, + numNodes, + startPort, + portRange, + JobIdentifiers.ElastiBroadcastWithFailure); Console.WriteLine("ElasticBroadcastWithFailureAfterBroadcast completed!!!"); } if (TestType.ElasticBroadcastWithMultipleFailures.Match(testToRun)) { - new ElasticBroadcastClientWithMultipleFailures( - runOnYarn, - numNodes, - startPort, - portRange); + new ElasticBroadcastClient>( + runOnYarn, + numNodes, + startPort, + portRange, + JobIdentifiers.ElastiBroadcastWithFailure); 
Console.WriteLine("ElasticBroadcastWithMultipleFailures completed!!!"); } } diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs index 6f2aafb530..4bdedbf04f 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs @@ -27,7 +27,7 @@ namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class BroadcastMasterTask : DefaultElasticTask { - private static readonly Logger LOGGER = Logger.GetLogger(typeof(BroadcastMasterTask)); + private static readonly Logger Log = Logger.GetLogger(typeof(BroadcastMasterTask)); [Inject] private BroadcastMasterTask(CancellationSource source, IElasticContext context) @@ -51,7 +51,7 @@ protected override void Execute(byte[] memento, Workflow workflow) sender.Send(number); - LOGGER.Log(Level.Info, "Master has sent {0}", number); + Log.Log(Level.Info, "Master has sent {0}", number); break; default: diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs index 2057bc4db3..988acba951 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastSlaveTask.cs @@ -27,7 +27,7 @@ namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class BroadcastSlaveTask : DefaultElasticTask { - private static readonly Logger LOGGER = Logger.GetLogger(typeof(BroadcastSlaveTask)); + private static readonly Logger Log = Logger.GetLogger(typeof(BroadcastSlaveTask)); [Inject] public BroadcastSlaveTask(CancellationSource source, IElasticContext context) @@ -46,7 +46,7 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - LOGGER.Log(Level.Info, $"Slave has received {rec}"); + Log.Log(Level.Info, $"Slave 
has received {rec}"); break; default: diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs index a9cc1ccb97..dafcc72dbd 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs @@ -24,16 +24,17 @@ using Org.Apache.REEF.Common.Tasks; using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; using Org.Apache.REEF.Network.Elastic.Driver.Default; +using Org.Apache.REEF.Network.Elastic.Task.Default; namespace Org.Apache.REEF.Network.Examples.Elastic { /// /// Example implementation of broadcasting using the elastic group communication service. /// - public class ElasticBroadcastDriver : DefaultElasticDriver + public sealed class ElasticBroadcastDriver : DefaultElasticDriver { [Inject] - protected ElasticBroadcastDriver(IElasticContext context) : base(context) + private ElasticBroadcastDriver(IElasticContext context) : base(context) { IElasticStage stage = Context.DefaultStage(); @@ -56,7 +57,7 @@ protected ElasticBroadcastDriver(IElasticContext context) : base(context) TaskSetManager.Build(); } - protected virtual Func MasterTaskConfiguration + private Func MasterTaskConfiguration { get { @@ -68,7 +69,7 @@ protected virtual Func MasterTaskConfiguration } } - protected virtual Func SlaveTaskConfiguration + private Func SlaveTaskConfiguration { get { @@ -80,4 +81,4 @@ protected virtual Func SlaveTaskConfiguration } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs index 3a65836acb..41dfdba1cc 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs +++ 
b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs @@ -26,15 +26,19 @@ using Org.Apache.REEF.Network.Elastic.Driver.Default; using Org.Apache.REEF.Network.Elastic.Failures; using Org.Apache.REEF.Network.Elastic.Failures.Default; +using Org.Apache.REEF.Network.Elastic.Task.Default; namespace Org.Apache.REEF.Network.Examples.Elastic { /// /// Example implementation of broadcasting using the elastic group communication service. /// - public abstract class ElasticBroadcastDriverWithFailures : DefaultElasticDriver + public sealed class ElasticBroadcastDriverWithFailures + : DefaultElasticDriver + where TSlave : DefaultElasticTask { - protected ElasticBroadcastDriverWithFailures( + [Inject] + private ElasticBroadcastDriverWithFailures( string stageName, int numEvaluators, IElasticContext context) : base(context) @@ -78,7 +82,7 @@ protected ElasticBroadcastDriverWithFailures( TaskSetManager.Build(); } - protected virtual Func MasterTaskConfiguration + private Func MasterTaskConfiguration { get { @@ -90,13 +94,13 @@ protected virtual Func MasterTaskConfiguration } } - protected virtual Func SlaveTaskConfiguration + private Func SlaveTaskConfiguration { get { return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) + .Set(TaskConfiguration.Task, GenericType.Class) .Build()) .Build(); } diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs index d9bd5de6d1..9e9b07d2c2 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieAfterBroadcast.cs @@ -29,7 +29,7 @@ namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class 
BroadcastSlaveTaskDieAfterBroadcast : DefaultElasticTask { - private static readonly Logger LOGGER = Logger.GetLogger( + private static readonly Logger Log = Logger.GetLogger( typeof(BroadcastSlaveTaskDieAfterBroadcast)); private readonly string _taskId; @@ -55,7 +55,7 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - LOGGER.Log(Level.Info, "Slave has received {0}", rec); + Log.Log(Level.Info, "Slave has received {0}", rec); if (Utils.GetTaskNum(_taskId) == 2) { diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs index 8c8c9dd68d..8a5e4245be 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeBroadcast.cs @@ -29,7 +29,7 @@ namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class BroadcastSlaveTaskDieBeforeBroadcast : DefaultElasticTask { - private static readonly Logger LOGGER = Logger.GetLogger( + private static readonly Logger Log = Logger.GetLogger( typeof(BroadcastSlaveTaskDieBeforeBroadcast)); private readonly string _taskId; @@ -60,7 +60,7 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - LOGGER.Log(Level.Info, "Slave has received {0}", rec); + Log.Log(Level.Info, "Slave has received {0}", rec); break; default: diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs index 15cdc82134..aeeda3eb68 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs +++ 
b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieBeforeWorkflow.cs @@ -29,7 +29,7 @@ namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class BroadcastSlaveTaskDieBeforeWorkflow : DefaultElasticTask { - private static readonly Logger LOGGER = Logger.GetLogger( + private static readonly Logger Log = Logger.GetLogger( typeof(BroadcastSlaveTaskDieBeforeWorkflow)); private readonly string _taskId; @@ -59,7 +59,7 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - LOGGER.Log(Level.Info, "Slave has received {0}", rec); + Log.Log(Level.Info, "Slave has received {0}", rec); break; default: diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs index 224e875133..7f30b6fdb5 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieEvaluatorBeforeWorkflow.cs @@ -29,7 +29,7 @@ namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class BroadcastSlaveTaskDieEvaluatorBeforeWorkflow : DefaultElasticTask { - private static readonly Logger LOGGER = Logger.GetLogger( + private static readonly Logger Log = Logger.GetLogger( typeof(BroadcastSlaveTaskDieEvaluatorBeforeWorkflow)); private readonly string _taskId; @@ -60,7 +60,7 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - LOGGER.Log(Level.Info, "Slave has received {0}", rec); + Log.Log(Level.Info, "Slave has received {0}", rec); break; default: diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs 
b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs index f26155fa21..323ebf4359 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieInConstructor.cs @@ -29,7 +29,7 @@ namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class BroadcastSlaveTaskDieInConstructor : DefaultElasticTask { - private static readonly Logger LOGGER = Logger.GetLogger( + private static readonly Logger Log = Logger.GetLogger( typeof(BroadcastSlaveTaskDieInConstructor)); [Inject] @@ -55,7 +55,7 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - LOGGER.Log(Level.Info, "Slave has received {0}", rec); + Log.Log(Level.Info, "Slave has received {0}", rec); break; default: diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs index 4cdcc8d8f9..4f36b54ea1 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultiple.cs @@ -27,7 +27,7 @@ namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class BroadcastSlaveTaskDieMultiple : DefaultElasticTask { - private static readonly Logger LOGGER = Logger.GetLogger( + private static readonly Logger Log = Logger.GetLogger( typeof(BroadcastSlaveTaskDieMultiple)); private const int _failProb = 70; @@ -62,7 +62,7 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - LOGGER.Log(Level.Info, "Slave has received {0}", rec); + Log.Log(Level.Info, "Slave has received {0}", rec); break; diff --git 
a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultipleEvaluators.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultipleEvaluators.cs index 9dcb065a58..1032e2670a 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultipleEvaluators.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/BroadcastSlaveTaskDieMultipleEvaluators.cs @@ -27,7 +27,7 @@ namespace Org.Apache.REEF.Network.Examples.Elastic { public sealed class BroadcastSlaveTaskDieMultipleEvaluators : DefaultElasticTask { - private static readonly Logger LOGGER = Logger.GetLogger( + private static readonly Logger Log = Logger.GetLogger( typeof(BroadcastSlaveTaskDieMultipleEvaluators)); private const int _failProb = 50; @@ -62,7 +62,7 @@ protected override void Execute(byte[] memento, Workflow workflow) var rec = receiver.Receive(); - LOGGER.Log(Level.Info, "Slave has received {0}", rec); + Log.Log(Level.Info, "Slave has received {0}", rec); if (_rand.Next(100) < _failProb) { diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow.cs deleted file mode 100644 index 56fac012d6..0000000000 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow.cs +++ /dev/null @@ -1,58 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using Org.Apache.REEF.Common.Tasks; -using Org.Apache.REEF.Network.Elastic.Driver; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Tang.Implementations.Tang; -using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Tang.Util; -using Org.Apache.REEF.Utilities.Logging; -using System; -using static Org.Apache.REEF.Network.Elastic.Config.ElasticServiceConfigurationOptions; - -namespace Org.Apache.REEF.Network.Examples.Elastic -{ - /// - /// Example implementation of broadcasting using the elastic group communication service. - /// - public sealed class ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow : - ElasticBroadcastDriverWithFailures - { - [Inject] - private ElasticBroadcastDriverWithFailEvaluatorBeforeWorkflow( - [Parameter(typeof(DefaultStageName))] string stageName, - [Parameter(typeof(NumEvaluators))] int numEvaluators, - IElasticContext context) : base(stageName, numEvaluators, context) - { - } - - protected override Func SlaveTaskConfiguration - { - get - { - return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( - Context.GetTaskConfigurationModule(taskId) - .Set( - TaskConfiguration.Task, - GenericType.Class) - .Build()) - .Build(); - } - } - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureAfterBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureAfterBroadcast.cs deleted file mode 100644 index 4099072f5d..0000000000 --- 
a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureAfterBroadcast.cs +++ /dev/null @@ -1,57 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using Org.Apache.REEF.Common.Tasks; -using Org.Apache.REEF.Network.Elastic.Driver; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Tang.Implementations.Tang; -using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Tang.Util; -using System; -using static Org.Apache.REEF.Network.Elastic.Config.ElasticServiceConfigurationOptions; - -namespace Org.Apache.REEF.Network.Examples.Elastic -{ - /// - /// Example implementation of broadcasting using the elastic group communication service. 
- /// - public sealed class ElasticBroadcastDriverWithFailureAfterBroadcast : - ElasticBroadcastDriverWithFailures - { - [Inject] - private ElasticBroadcastDriverWithFailureAfterBroadcast( - [Parameter(typeof(DefaultStageName))] string stageName, - [Parameter(typeof(NumEvaluators))] int numEvaluators, - IElasticContext context) : base(stageName, numEvaluators, context) - { - } - - protected override Func SlaveTaskConfiguration - { - get - { - return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( - Context.GetTaskConfigurationModule(taskId) - .Set( - TaskConfiguration.Task, - GenericType.Class) - .Build()) - .Build(); - } - } - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeBroadcast.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeBroadcast.cs deleted file mode 100644 index abce50edc9..0000000000 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeBroadcast.cs +++ /dev/null @@ -1,57 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -using System; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Tang.Implementations.Tang; -using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Tang.Util; -using Org.Apache.REEF.Network.Elastic.Driver; -using Org.Apache.REEF.Common.Tasks; -using static Org.Apache.REEF.Network.Elastic.Config.ElasticServiceConfigurationOptions; - -namespace Org.Apache.REEF.Network.Examples.Elastic -{ - /// - /// Example implementation of broadcasting using the elastic group communication service. - /// - public sealed class ElasticBroadcastDriverWithFailureBeforeBroadcast : - ElasticBroadcastDriverWithFailures - { - [Inject] - private ElasticBroadcastDriverWithFailureBeforeBroadcast( - [Parameter(typeof(DefaultStageName))] string stageName, - [Parameter(typeof(NumEvaluators))] int numEvaluators, - IElasticContext context) : base(stageName, numEvaluators, context) - { - } - - protected override Func SlaveTaskConfiguration - { - get - { - return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( - Context.GetTaskConfigurationModule(taskId) - .Set( - TaskConfiguration.Task, - GenericType.Class) - .Build()) - .Build(); - } - } - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeWorkflow.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeWorkflow.cs deleted file mode 100644 index 3a54057013..0000000000 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureBeforeWorkflow.cs +++ /dev/null @@ -1,57 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using System; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Tang.Implementations.Tang; -using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Tang.Util; -using Org.Apache.REEF.Network.Elastic.Driver; -using Org.Apache.REEF.Common.Tasks; -using static Org.Apache.REEF.Network.Elastic.Config.ElasticServiceConfigurationOptions; - -namespace Org.Apache.REEF.Network.Examples.Elastic -{ - /// - /// Example implementation of broadcasting using the elastic group communication service. 
- /// - public sealed class ElasticBroadcastDriverWithFailureBeforeWorkflow : - ElasticBroadcastDriverWithFailures - { - [Inject] - private ElasticBroadcastDriverWithFailureBeforeWorkflow( - [Parameter(typeof(DefaultStageName))] string stageName, - [Parameter(typeof(NumEvaluators))] int numEvaluators, - IElasticContext context) : base(stageName, numEvaluators, context) - { - } - - protected override Func SlaveTaskConfiguration - { - get - { - return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( - Context.GetTaskConfigurationModule(taskId) - .Set( - TaskConfiguration.Task, - GenericType.Class) - .Build()) - .Build(); - } - } - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureInConstructor.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureInConstructor.cs deleted file mode 100644 index 9f4cadf585..0000000000 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailureInConstructor.cs +++ /dev/null @@ -1,57 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -using System; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Tang.Implementations.Tang; -using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Tang.Util; -using Org.Apache.REEF.Network.Elastic.Driver; -using Org.Apache.REEF.Common.Tasks; -using static Org.Apache.REEF.Network.Elastic.Config.ElasticServiceConfigurationOptions; - -namespace Org.Apache.REEF.Network.Examples.Elastic -{ - /// - /// Example implementation of broadcasting using the elastic group communication service. - /// - public sealed class ElasticBroadcastDriverWithFailureInConstructor : - ElasticBroadcastDriverWithFailures - { - [Inject] - private ElasticBroadcastDriverWithFailureInConstructor( - [Parameter(typeof(DefaultStageName))] string stageName, - [Parameter(typeof(NumEvaluators))] int numEvaluators, - IElasticContext context) : base(stageName, numEvaluators, context) - { - } - - protected override Func SlaveTaskConfiguration - { - get - { - return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( - Context.GetTaskConfigurationModule(taskId) - .Set( - TaskConfiguration.Task, - GenericType.Class) - .Build()) - .Build(); - } - } - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithMultipleFailures.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithMultipleFailures.cs deleted file mode 100644 index 887fa913cf..0000000000 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithMultipleFailures.cs +++ /dev/null @@ -1,55 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using System; -using Org.Apache.REEF.Tang.Annotations; -using Org.Apache.REEF.Tang.Implementations.Tang; -using Org.Apache.REEF.Tang.Interface; -using Org.Apache.REEF.Tang.Util; -using Org.Apache.REEF.Network.Elastic.Driver; -using Org.Apache.REEF.Common.Tasks; -using static Org.Apache.REEF.Network.Elastic.Config.ElasticServiceConfigurationOptions; - -namespace Org.Apache.REEF.Network.Examples.Elastic -{ - /// - /// Example implementation of broadcasting using the elastic group communication service. 
- /// - public sealed class ElasticBroadcastDriverWithMultipleFailures : - ElasticBroadcastDriverWithFailures - { - [Inject] - private ElasticBroadcastDriverWithMultipleFailures( - [Parameter(typeof(DefaultStageName))] string stageName, - [Parameter(typeof(NumEvaluators))] int numEvaluators, - IElasticContext context) : base(stageName, numEvaluators, context) - { - } - - protected override Func SlaveTaskConfiguration - { - get - { - return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( - Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) - .Build()) - .Build(); - } - } - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/GroupCommunication/ScatterReduceDriverAndTasks/ScatterReduceDriver.cs b/lang/cs/Org.Apache.REEF.Network.Examples/GroupCommunication/ScatterReduceDriverAndTasks/ScatterReduceDriver.cs index db82defd30..430e56503d 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/GroupCommunication/ScatterReduceDriverAndTasks/ScatterReduceDriver.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/GroupCommunication/ScatterReduceDriverAndTasks/ScatterReduceDriver.cs @@ -37,10 +37,10 @@ namespace Org.Apache.REEF.Network.Examples.GroupCommunication.ScatterReduceDriverAndTasks { - public class ScatterReduceDriver : - IObserver, - IObserver, - IObserver, + public class ScatterReduceDriver : + IObserver, + IObserver, + IObserver, IObserver { private static readonly Logger LOGGER = Logger.GetLogger(typeof(ScatterReduceDriver)); @@ -79,12 +79,12 @@ public ScatterReduceDriver( .AddScatter( GroupTestConstants.ScatterOperatorName, GroupTestConstants.MasterTaskId, - TopologyTypes.Tree, + TopologyTypes.Tree, dataConverterConfig) .AddReduce( GroupTestConstants.ReduceOperatorName, GroupTestConstants.MasterTaskId, - TopologyTypes.Tree, + TopologyTypes.Tree, reduceFunctionConfig, dataConverterConfig) @@ -142,7 +142,7 @@ public void OnNext(IDriverStarted value) .SetMegabytes(512) .SetCores(2) 
.SetRackName("WonderlandRack") - .SetEvaluatorBatchId("BroadcastEvaluator").Build(); + .SetEvaluatorBatchId("BroadcastEvaluator").Build(); _evaluatorRequestor.Submit(request); } @@ -168,4 +168,4 @@ public int Reduce(IEnumerable elements) } } } -} +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs index e8cf7c518c..e916ab1013 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs @@ -47,7 +47,7 @@ namespace Org.Apache.REEF.Network.Elastic.Driver.Default [Unstable("0.16", "API may change")] internal sealed class DefaultElasticStage : IElasticStage, IDefaultFailureEventResponse { - private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultElasticStage)); + private static readonly Logger Log = Logger.GetLogger(typeof(DefaultElasticStage)); private bool _finalized = false; private volatile bool _scheduled = false; @@ -200,7 +200,7 @@ public bool AddTask(string taskId) if (IsCompleted || (_scheduled && FailureState.FailureState == (int)DefaultFailureStates.Fail)) { - LOGGER.Log(Level.Warning, "Taskset " + (IsCompleted ? "completed." : "failed.")); + Log.Log(Level.Warning, "Taskset " + (IsCompleted ? "completed." 
: "failed.")); return false; } @@ -220,13 +220,13 @@ public bool AddTask(string taskId) { if (tooManyTasks) { - LOGGER.Log(Level.Warning, + Log.Log(Level.Warning, "Already added {0} tasks when total tasks request is {1}", _tasksAdded, _numTasks); } if (notAddingMaster) { - LOGGER.Log(Level.Warning, + Log.Log(Level.Warning, "Already added {0} over {1} but missing master task(s)", _tasksAdded, _numTasks); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs index 376f5851d2..bc0c4ded6c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs @@ -357,7 +357,7 @@ public Alarm GetAlarm(long time) #endregion Private classes - private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultElasticTaskSetManager)); + private static readonly Logger Log = Logger.GetLogger(typeof(DefaultElasticTaskSetManager)); private bool _finalized = false; private volatile bool _disposed = false; @@ -508,7 +508,7 @@ public bool TryGetNextTaskContextId(IAllocatedEvaluator evaluator, out string id if (_contextsAdded > _numTasks) { - LOGGER.Log(Level.Warning, "Trying to schedule too many contexts"); + Log.Log(Level.Warning, "Trying to schedule too many contexts"); identifier = string.Empty; return false; } @@ -517,7 +517,7 @@ public bool TryGetNextTaskContextId(IAllocatedEvaluator evaluator, out string id cinfo = new ContextInfo(id); _evaluatorToContextIdMapping.TryAdd(evaluator.Id, cinfo); - LOGGER.Log(Level.Info, "Evaluator {0} is scheduled on node {1}", + Log.Log(Level.Info, "Evaluator {0} is scheduled on node {1}", evaluator.Id, evaluator.GetEvaluatorDescriptor().NodeDescriptor.HostName); @@ -575,7 +575,7 @@ public void OnNewActiveContext(IActiveContext activeContext) if (Completed() || Failed()) { - 
LOGGER.Log(Level.Warning, "Adding tasks to already completed task set: ignoring."); + Log.Log(Level.Warning, "Adding tasks to already completed task set: ignoring."); activeContext.Dispose(); return; } @@ -588,7 +588,7 @@ public void OnNewActiveContext(IActiveContext activeContext) // actually scheduled at least once (_taskInfos[id].TaskStatus > TaskStatus.Init) if (_taskInfos[id] != null && _taskInfos[id].TaskStatus > TaskState.Init) { - LOGGER.Log(Level.Info, "{0} already part of task set: going to directly submit it.", taskId); + Log.Log(Level.Info, "{0} already part of task set: going to directly submit it.", taskId); lock (_taskInfos[id].Lock) { @@ -601,7 +601,7 @@ public void OnNewActiveContext(IActiveContext activeContext) { bool isMaster = IsMasterTaskContext(activeContext).Any(); - LOGGER.Log(Level.Info, "Task {0} to be scheduled on {1}", taskId, activeContext.EvaluatorId); + Log.Log(Level.Info, "Task {0} to be scheduled on {1}", taskId, activeContext.EvaluatorId); List partialTaskConfs = new List(); @@ -653,7 +653,7 @@ public void OnTaskRunning(IRunningTask task) if (Completed() || Failed()) { - LOGGER.Log(Level.Info, "Received running from task {0} but task set is completed " + Log.Log(Level.Info, "Received running from task {0} but task set is completed " + "or failed: ignoring.", task.Id); _taskInfos[id].Dispose(); @@ -661,7 +661,7 @@ public void OnTaskRunning(IRunningTask task) } if (!TaskStateUtils.IsRunnable(_taskInfos[id].TaskStatus)) { - LOGGER.Log(Level.Info, "Received running from task {0} which is not runnable: ignoring.", + Log.Log(Level.Info, "Received running from task {0} which is not runnable: ignoring.", task.Id); _taskInfos[id].Dispose(); @@ -733,7 +733,7 @@ public void OnTaskMessage(ITaskMessage message) } catch (IllegalStateException e) { - LOGGER.Log(Level.Error, e.Message, e); + Log.Log(Level.Error, e.Message, e); Fail(message.TaskId); } @@ -776,7 +776,7 @@ public void OnTimeout(Alarm alarm, ref List msgs, ref Lis if (isInit) { 
_hasProgress = false; - LOGGER.Log(Level.Info, "Timeout alarm for task set initialized"); + Log.Log(Level.Info, "Timeout alarm for task set initialized"); nextTimeouts.Add(new TaskSetTimeout(_parameters.Timeout, this)); foreach (var stage in _stages.Values) @@ -790,13 +790,13 @@ public void OnTimeout(Alarm alarm, ref List msgs, ref Lis { if (Completed() || Failed()) { - LOGGER.Log(Level.Warning, "Taskset made no progress in the last {0}ms. Forcing Disposal.", + Log.Log(Level.Warning, "Taskset made no progress in the last {0}ms. Forcing Disposal.", _parameters.Timeout); Dispose(); } else { - LOGGER.Log(Level.Error, "Taskset made no progress in the last {0}ms. Aborting.", + Log.Log(Level.Error, "Taskset made no progress in the last {0}ms. Aborting.", _parameters.Timeout); Fail(); return; @@ -836,7 +836,7 @@ public void OnTaskFailure(IFailedTask task, ref List failureEvent { if (IsTaskManagedBy(task.Id)) { - LOGGER.Log(Level.Info, "Received a failure from {0}", task.Id, task.AsError()); + Log.Log(Level.Info, "Received a failure from {0}", task.Id, task.AsError()); Interlocked.Decrement(ref _tasksRunning); _totFailedTasks++; @@ -845,7 +845,7 @@ public void OnTaskFailure(IFailedTask task, ref List failureEvent if (Completed() || Failed()) { - LOGGER.Log(Level.Info, "Received a failure from task {0} but the task set is completed or " + Log.Log(Level.Info, "Received a failure from task {0} but the task set is completed or " + "failed: ignoring the failure", task.Id, task.AsError()); lock (_taskInfos[id].Lock) @@ -876,7 +876,7 @@ public void OnTaskFailure(IFailedTask task, ref List failureEvent } catch (Exception e) { - LOGGER.Log(Level.Error, e.Message, e); + Log.Log(Level.Error, e.Message, e); Fail(task.Id); } @@ -898,7 +898,7 @@ public void OnTaskFailure(IFailedTask task, ref List failureEvent /// The failed evaluator public void OnEvaluatorFailure(IFailedEvaluator evaluator) { - LOGGER.Log(Level.Info, "Received a failure from {0}", evaluator.Id, 
evaluator.EvaluatorException); + Log.Log(Level.Info, "Received a failure from {0}", evaluator.Id, evaluator.EvaluatorException); _totFailedEvaluators++; @@ -938,7 +938,7 @@ public void OnEvaluatorFailure(IFailedEvaluator evaluator) if (cinfo.NumRetry > _parameters.NumEvaluatorFailures) { - LOGGER.Log(Level.Error, "Context {0} failed more than {1} times: Aborting", + Log.Log(Level.Error, "Context {0} failed more than {1} times: Aborting", cinfo.Id, _parameters.NumEvaluatorFailures); Fail(); @@ -1051,7 +1051,7 @@ public void OnStop(ref StopEvent stopEvent) /// public void OnFail() { - LOGGER.Log(Level.Info, "Task set failed"); + Log.Log(Level.Info, "Task set failed"); lock (_statusLock) { @@ -1150,7 +1150,7 @@ private void AddTask(string taskId, IActiveContext activeContext, List messages, int retry = 0) { if (Completed() || Failed()) { - LOGGER.Log(Level.Warning, "Task submit for a completed or failed Task Set: ignoring"); + Log.Log(Level.Warning, "Task submit for a completed or failed Task Set: ignoring"); _taskInfos[destination].DisposeTask(); return; @@ -1310,7 +1310,7 @@ private void SendToTasks(IList messages, int retry = 0) if (_taskInfos[destination].TaskStatus == TaskState.Submitted && retry < _parameters.Retry) { - LOGGER.Log(Level.Warning, msg + " Retry"); + Log.Log(Level.Warning, msg + " Retry"); System.Threading.Tasks.Task.Run(() => { Thread.Sleep(_parameters.WaitTime); @@ -1319,12 +1319,12 @@ private void SendToTasks(IList messages, int retry = 0) } else if (retry >= _parameters.Retry) { - LOGGER.Log(Level.Warning, msg + " Aborting"); + Log.Log(Level.Warning, msg + " Aborting"); Fail(returnMessage.Destination); } else { - LOGGER.Log(Level.Warning, msg + " Ignoring"); + Log.Log(Level.Warning, msg + " Ignoring"); } continue; @@ -1338,7 +1338,7 @@ private void SendToTasks(IList messages, int retry = 0) private void SpawnNewEvaluator(int id) { - LOGGER.Log(Level.Warning, "Spawning new evaluator for id {0}", id); + Log.Log(Level.Warning, "Spawning new 
evaluator for id {0}", id); var request = _evaluatorRequestor.NewBuilder() .SetNumber(1) @@ -1360,7 +1360,7 @@ private void Reschedule(RescheduleEvent rescheduleEvent) if (_taskInfos[id].NumRetry > _parameters.NumTaskFailures) { - LOGGER.Log(Level.Error, "Task {0} failed more than {1} times: aborting", + Log.Log(Level.Error, "Task {0} failed more than {1} times: aborting", rescheduleEvent.TaskId, _parameters.NumTaskFailures); Fail(rescheduleEvent.TaskId); @@ -1368,7 +1368,7 @@ private void Reschedule(RescheduleEvent rescheduleEvent) if (rescheduleEvent.Reschedule) { - LOGGER.Log(Level.Info, "Rescheduling task {0}", rescheduleEvent.TaskId); + Log.Log(Level.Info, "Rescheduling task {0}", rescheduleEvent.TaskId); _taskInfos[id].RescheduleConfigurations = rescheduleEvent.RescheduleTaskConfigurations; @@ -1390,7 +1390,7 @@ private void LogFinalStatistics() _totFailedTasks, _totFailedEvaluators); msg += _stages.Select(x => x.Value.LogFinalStatistics()).Aggregate((a, b) => a + "\n" + b); - LOGGER.Log(Level.Info, msg); + Log.Log(Level.Info, msg); } private bool Completed() @@ -1401,7 +1401,7 @@ private bool Completed() if (_completed) { - LOGGER.Log(Level.Info, "Task set completed"); + Log.Log(Level.Info, "Task set completed"); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs index f14e86bd90..a687cda909 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs @@ -38,7 +38,7 @@ namespace Org.Apache.REEF.Network.Elastic.Failures /// internal sealed class FailuresClock : IClock { - private static readonly Logger LOGGER = Logger.GetLogger(typeof(FailuresClock)); + private static readonly Logger Log = Logger.GetLogger(typeof(FailuresClock)); private static int numberOfInstantiations = 0; @@ -85,7 +85,7 @@ private FailuresClock( ++numberOfInstantiations; if (numberOfInstantiations > 1) 
{ - LOGGER.Log(Level.Warning, "Instantiated `RuntimeClock` instance number {0}", numberOfInstantiations); + Log.Log(Level.Warning, "Instantiated `RuntimeClock` instance number {0}", numberOfInstantiations); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs index 270c647ee6..448924a2ed 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs @@ -42,7 +42,7 @@ namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Default [Unstable("0.16", "API may change")] internal abstract class DefaultOneToN : ElasticOperatorWithDefaultDispatcher { - private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultOneToN<>)); + private static readonly Logger Log = Logger.GetLogger(typeof(DefaultOneToN<>)); private volatile bool _stop = false; @@ -103,7 +103,7 @@ protected override bool ReactOnTaskMessage( if (!Stage.IsCompleted && _failureMachine.State.FailureState < (int)DefaultFailureStates.Fail) { var taskId = message.TaskId; - LOGGER.Log(Level.Info, "{0} joins the topology for operator {1}", taskId, _id); + Log.Log(Level.Info, "{0} joins the topology for operator {1}", taskId, _id); _topology.AddTask(taskId, _failureMachine); } @@ -119,7 +119,7 @@ protected override bool ReactOnTaskMessage( return false; } - LOGGER.Log(Level.Info, "Received topology update request for {0} {1} from {2}", + Log.Log(Level.Info, "Received topology update request for {0} {1} from {2}", OperatorType.ToString(), _id, message.TaskId); _topology.TopologyUpdateResponse( @@ -136,7 +136,7 @@ protected override bool ReactOnTaskMessage( else { returnMessages.Clear(); - LOGGER.Log(Level.Info, "Operator {0} is in stopped: Waiting.", + Log.Log(Level.Info, "Operator {0} is in stopped: Waiting.", OperatorType.ToString()); } } @@ -162,7 +162,7 
@@ protected override bool ReactOnTaskMessage( /// public override void OnReconfigure(ref ReconfigureEvent reconfigureEvent) { - LOGGER.Log(Level.Info, "Going to reconfigure the {0} operator", OperatorType.ToString()); + Log.Log(Level.Info, "Going to reconfigure the {0} operator", OperatorType.ToString()); if (reconfigureEvent.FailedTask.IsPresent()) { @@ -198,7 +198,7 @@ public override void OnReschedule(ref RescheduleEvent rescheduleEvent) // Iterators manage the re-schuedling of tasks. If not iterator exists, setup the rescheduling. if (!WithinIteration) { - LOGGER.Log(Level.Info, "Going to reschedule task {0}", rescheduleEvent.TaskId); + Log.Log(Level.Info, "Going to reschedule task {0}", rescheduleEvent.TaskId); if (!rescheduleEvent.RescheduleTaskConfigurations.TryGetValue( Stage.StageName, diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs index 94ad9f3b9f..8a29079f0c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs @@ -38,7 +38,7 @@ namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Default [Unstable("0.16", "API may change")] internal abstract class ElasticOperatorWithDefaultDispatcher : ElasticOperator, IDefaultFailureEventResponse { - private static readonly Logger LOGGER = Logger.GetLogger(typeof(ElasticOperatorWithDefaultDispatcher)); + private static readonly Logger Log = Logger.GetLogger(typeof(ElasticOperatorWithDefaultDispatcher)); /// /// Base constructor for an abstract operator implementing the default failure logic. 
@@ -135,7 +135,7 @@ public override void OnTaskFailure(IFailedTask task, ref List fai break; default: - LOGGER.Log(Level.Info, "Failure from {0} requires no action", task.Id); + Log.Log(Level.Info, "Failure from {0} requires no action", task.Id); break; } @@ -264,7 +264,7 @@ protected override void LogOperatorState() string failureMachineState = $"Failure State: {(DefaultFailureStates)_failureMachine.State.FailureState}" + $"\nFailure(s) Reported: {_failureMachine.NumOfFailedDataPoints}/{_failureMachine.NumOfDataPoints}"; - LOGGER.Log(Level.Info, intro + topologyState + failureMachineState); + Log.Log(Level.Info, intro + topologyState + failureMachineState); } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs index 137cde4862..8038ad88a8 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs @@ -52,7 +52,7 @@ namespace Org.Apache.REEF.Network.Elastic.Operators.Logical [Unstable("0.16", "API may change")] public abstract class ElasticOperator : IFailureResponse, ITaskMessageResponse { - private static readonly Logger LOGGER = Logger.GetLogger(typeof(ElasticOperator)); + private static readonly Logger Log = Logger.GetLogger(typeof(ElasticOperator)); protected static readonly Dictionary CODECMAP = new Dictionary() { @@ -552,7 +552,7 @@ protected virtual void LogOperatorState() string failureMachineState = "Failure State: " + _failureMachine.State.FailureState + "\nFailure(s) Reported: " + _failureMachine.NumOfFailedDataPoints; - LOGGER.Log(Level.Info, intro + topologyState + failureMachineState); + Log.Log(Level.Info, intro + topologyState + failureMachineState); } /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs 
b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs index 52127a1927..7e9bdd5f8b 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs @@ -34,7 +34,7 @@ namespace Org.Apache.REEF.Network.Elastic.Operators.Physical.Default [Unstable("0.16", "API may change")] public abstract class DefaultOneToN { - private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultOneToN<>)); + private static readonly Logger Log = Logger.GetLogger(typeof(DefaultOneToN<>)); internal readonly OneToNTopology _topology; internal volatile PositionTracker _position; @@ -127,7 +127,7 @@ public T Receive() if (isIterative && typedDataMessage.Iteration < (int)IteratorReference.Current) { - LOGGER.Log(Level.Warning, "Received message for iteration {0} but I am already in iteration " + Log.Log(Level.Warning, "Received message for iteration {0} but I am already in iteration " + "{1}: ignoring.", typedDataMessage.Iteration, (int)IteratorReference.Current); } else @@ -167,7 +167,7 @@ public void ResetPosition() /// public void WaitForTaskRegistration(CancellationTokenSource cancellationSource) { - LOGGER.Log(Level.Info, "Waiting for task registration for {0} operator.", OperatorType.ToString()); + Log.Log(Level.Info, "Waiting for task registration for {0} operator.", OperatorType.ToString()); _topology.WaitForTaskRegistration(cancellationSource); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs index 60e42e842e..a1e3e81674 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs @@ -38,7 +38,7 @@ namespace Org.Apache.REEF.Network.Elastic.Task.Impl internal abstract class CommunicationLayer : IObserver>> { - private static readonly 
Logger LOGGER = Logger.GetLogger(typeof(CommunicationLayer)); + private static readonly Logger Log = Logger.GetLogger(typeof(CommunicationLayer)); private readonly int _timeout; private readonly int _retryRegistration; @@ -135,7 +135,7 @@ internal void Send( } if (_disposed) { - LOGGER.Log(Level.Warning, "Received send message request after disposing: Ignoring."); + Log.Log(Level.Warning, "Received send message request after disposing: Ignoring."); return; } @@ -183,31 +183,31 @@ public void WaitForTaskRegistration( { if (cancellationSource != null && cancellationSource.Token.IsCancellationRequested) { - LOGGER.Log(Level.Warning, "WaitForTaskRegistration is canceled in retryCount {0}.", i); + Log.Log(Level.Warning, "WaitForTaskRegistration is canceled in retryCount {0}.", i); throw new OperationCanceledException("WaitForTaskRegistration is canceled"); } - LOGGER.Log(Level.Info, "WaitForTaskRegistration, in retryCount {0}.", i); + Log.Log(Level.Info, "WaitForTaskRegistration, in retryCount {0}.", i); foreach (var identifier in identifiers) { var notFound = !foundList.Contains(identifier); if (notFound && removed.ContainsKey(identifier)) { foundList.Add(identifier); - LOGGER.Log(Level.Verbose, + Log.Log(Level.Verbose, "WaitForTaskRegistration, dependent id {0} was removed at loop {1}.", identifier, i); } else if (notFound && Lookup(identifier)) { foundList.Add(identifier); - LOGGER.Log(Level.Verbose, + Log.Log(Level.Verbose, "WaitForTaskRegistration, find a dependent id {0} at loop {1}.", identifier, i); } } if (foundList.Count >= identifiers.Count) { - LOGGER.Log(Level.Info, + Log.Log(Level.Info, "WaitForTaskRegistration, found all {0} dependent ids at loop {1}.", foundList.Count, i); return; } @@ -219,7 +219,7 @@ public void WaitForTaskRegistration( foundList.Count == 0 ? 
identifiers : identifiers.Where(e => !foundList.Contains(e)).ToList(); var msg = string.Join(",", leftovers); - LOGGER.Log(Level.Error, "Cannot find registered parent/children: {0}.", msg); + Log.Log(Level.Error, "Cannot find registered parent/children: {0}.", msg); throw new Exception("Failed to find parent/children nodes"); } @@ -274,7 +274,7 @@ public void Dispose() _disposed = true; - LOGGER.Log(Level.Info, "Communication layer disposed."); + Log.Log(Level.Info, "Communication layer disposed."); } } @@ -293,7 +293,7 @@ private bool Send(IIdentifier destId, ElasticGroupCommunicationMessage message) } catch (Exception e) { - LOGGER.Log(Level.Warning, "Unable to send message " + e.Message); + Log.Log(Level.Warning, "Unable to send message " + e.Message); connection.Dispose(); return false; } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs index 230a6d628c..d68a8bc9db 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs @@ -37,7 +37,7 @@ internal sealed class DefaultCommunicationLayer : CommunicationLayer, IDefaultTaskToDriverMessages { - private static readonly Logger LOGGER = Logger.GetLogger(typeof(DefaultCommunicationLayer)); + private static readonly Logger Log = Logger.GetLogger(typeof(DefaultCommunicationLayer)); /// /// Creates a new communication layer. @@ -71,7 +71,7 @@ public override void OnNext(IRemoteMessage /// Injectable constrcutor. 
@@ -64,7 +64,7 @@ public void JoinTopology(string taskId, string stageName, int operatorId) offset += sizeof(ushort); Buffer.BlockCopy(BitConverter.GetBytes((ushort)operatorId), 0, message, offset, sizeof(ushort)); - LOGGER.Log(Level.Info, "Operator {0} requesting to join the topology through heartbeat.", operatorId); + Log.Log(Level.Info, "Operator {0} requesting to join the topology through heartbeat.", operatorId); Send(taskId, message); } @@ -91,7 +91,7 @@ public void TopologyUpdateRequest(string taskId, string stageName, int operatorI offset += sizeof(ushort); Buffer.BlockCopy(BitConverter.GetBytes((ushort)operatorId), 0, message, offset, sizeof(ushort)); - LOGGER.Log(Level.Info, "Operator {0} requesting a topology update through heartbeat.", operatorId); + Log.Log(Level.Info, "Operator {0} requesting a topology update through heartbeat.", operatorId); Send(taskId, message); } @@ -111,7 +111,7 @@ public void StageComplete(string taskId, string stageName) Buffer.BlockCopy( BitConverter.GetBytes((ushort)TaskMessageType.CompleteStage), 0, message, offset, sizeof(ushort)); - LOGGER.Log(Level.Info, "Sending notification that the stage is completed."); + Log.Log(Level.Info, "Sending notification that the stage is completed."); Send(taskId, message); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs index 656830ce70..e4bc67c511 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs @@ -38,7 +38,7 @@ namespace Org.Apache.REEF.Network.Elastic.Task [Unstable("0.16", "API may change")] public sealed class Workflow : IEnumerator, IEnumerable { - private static readonly Logger LOGGER = Logger.GetLogger(typeof(Workflow)); + private static readonly Logger Log = Logger.GetLogger(typeof(Workflow)); private int _position = -1; private bool _failed = false; @@ -134,12 +134,12 @@ public void Throw(Exception e) { if 
(_cancellationSource.IsCancelled) { - LOGGER.Log(Level.Warning, + Log.Log(Level.Warning, "Workflow captured an exception while cancellation source was true.", e); } else { - LOGGER.Log(Level.Error, "Workflow captured an exception.", e); + Log.Log(Level.Error, "Workflow captured an exception.", e); _failed = true; throw new OperatorException( diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs index 6cd75193da..565d6477da 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs @@ -39,7 +39,7 @@ namespace Org.Apache.REEF.Network.Elastic.Topology.Logical.Impl [Unstable("0.16", "API may change")] public class FlatTopology : ITopology { - private static readonly Logger LOGGER = Logger.GetLogger(typeof(FlatTopology)); + private static readonly Logger Log = Logger.GetLogger(typeof(FlatTopology)); private string _rootTaskId = string.Empty; private int _rootId; @@ -322,7 +322,7 @@ public void TopologyUpdateResponse( if (_nodesWaitingToJoinTopology.Count > 0) { - LOGGER.Log(Level.Info, + Log.Log(Level.Info, "Tasks [{0}] are added to topology in iteration {1}", string.Join(",", _nodesWaitingToJoinTopology), _iteration); @@ -347,7 +347,7 @@ public void TopologyUpdateResponse( /// The new iteration number public void OnNewIteration(int iteration) { - LOGGER.Log(Level.Info, + Log.Log(Level.Info, "Flat Topology for Operator {0} in Iteration {1} is closed with {2} nodes", OperatorId, iteration - 1, @@ -402,7 +402,7 @@ public IList Reconfigure( var data = new FailureMessagePayload(update, StageName, OperatorId, -1); var returnMessage = new ElasticDriverMessageImpl(_rootTaskId, data); - LOGGER.Log(Level.Info, "Task {0} is removed from topology", taskId); + Log.Log(Level.Info, "Task {0} is removed from topology", taskId); messages.Add(returnMessage); 
_lostNodesToBeRemoved.Clear(); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs index 791d23261a..ffb00e4599 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs @@ -94,7 +94,7 @@ protected override void Send(CancellationTokenSource cancellationSource) // If we are here, we weren't able to receive a topology update on time. Retry. if (cancellationSource.IsCancellationRequested) { - LOGGER.Log(Level.Warning, "Received cancellation request: stop sending"); + Log.Log(Level.Warning, "Received cancellation request: stop sending"); return; } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs index 279fe4fa8f..50e2459c89 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs @@ -36,7 +36,7 @@ namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Default [Unstable("0.16", "API may change")] internal abstract class OneToNTopology : OperatorTopologyWithDefaultCommunication { - protected static readonly Logger LOGGER = Logger.GetLogger(typeof(OneToNTopology)); + protected static readonly Logger Log = Logger.GetLogger(typeof(OneToNTopology)); protected readonly ConcurrentDictionary _nodesToRemove = new ConcurrentDictionary(); @@ -191,7 +191,7 @@ public override void OnNext(DriverMessagePayload message) { foreach (var node in updates.Children) { - LOGGER.Log(Level.Info, "Removing task {0} from the topology.", node); + Log.Log(Level.Info, "Removing task {0} from the topology.", node); 
_nodesToRemove.TryAdd(node, new byte()); _commLayer.RemoveConnection(node); } @@ -228,7 +228,7 @@ public override void OnNext(DriverMessagePayload message) } else { - LOGGER.Log(Level.Warning, "Received a topology update message from driver " + Log.Log(Level.Warning, "Received a topology update message from driver " + "but sending queue is empty: ignoring."); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Utilities/Utils.cs b/lang/cs/Org.Apache.REEF.Network/Utilities/Utils.cs index 5ba1eb0a80..7c15a1a581 100644 --- a/lang/cs/Org.Apache.REEF.Network/Utilities/Utils.cs +++ b/lang/cs/Org.Apache.REEF.Network/Utilities/Utils.cs @@ -27,7 +27,7 @@ namespace Org.Apache.REEF.Network.Utilities { internal class Utils { - private static readonly Logger LOGGER = Logger.GetLogger(typeof(Utils)); + private static readonly Logger Log = Logger.GetLogger(typeof(Utils)); /// /// Returns the TaskIdentifier from the Configuration. @@ -44,7 +44,7 @@ public static string GetTaskId(IConfiguration taskConfiguration) } catch (InjectionException) { - LOGGER.Log(Level.Error, "Unable to find task identifier"); + Log.Log(Level.Error, "Unable to find task identifier"); throw; } } @@ -64,7 +64,7 @@ public static string GetContextId(IConfiguration contextConfiguration) } catch (InjectionException) { - LOGGER.Log(Level.Error, "Unable to find task identifier"); + Log.Log(Level.Error, "Unable to find task identifier"); throw; } } From 835e046955d11ddc28b2f269315844096803f9e0 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Thu, 31 Jan 2019 11:24:27 -0800 Subject: [PATCH 13/29] Moved a file into the proper directory --- .../{ => WithFailures}/ElasticBroadcastDriverWithFailures.cs | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename lang/cs/Org.Apache.REEF.Network.Examples/Elastic/{ => WithFailures}/ElasticBroadcastDriverWithFailures.cs (100%) diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs 
b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs similarity index 100% rename from lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriverWithFailures.cs rename to lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs From 30085b41a0f88a94f67d8f8d2c6c97363f0c2114 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Fri, 1 Feb 2019 11:46:41 -0800 Subject: [PATCH 14/29] Fixed typo. Fixed API for generic new threshold for the failre machine. --- .../ElasticBroadcastDriverWithFailures.cs | 18 +++++------------- .../Failures/Default/DefaultFailureState.cs | 5 +++++ .../Default/DefaultFailureStateMachine.cs | 6 +++--- .../Elastic/Failures/IFailureStateMachine.cs | 8 +++++--- 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs index 41dfdba1cc..2b32ea6841 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs @@ -45,20 +45,12 @@ private ElasticBroadcastDriverWithFailures( { IFailureStateMachine failureMachine = new DefaultFailureStateMachine(); - failureMachine.SetThreasholds(new Tuple[] + failureMachine.SetThresholds(new Tuple[] { - Tuple.Create( - new DefaultFailureState( - (int)DefaultFailureStates.ContinueAndReconfigure) as IFailureState, 0.01F), - Tuple.Create( - new DefaultFailureState( - (int)DefaultFailureStates.ContinueAndReschedule) as IFailureState, 0.40F), - Tuple.Create( - new DefaultFailureState( - (int)DefaultFailureStates.StopAndReschedule) as IFailureState, 0.60F), - Tuple.Create( - new DefaultFailureState( - (int)DefaultFailureStates.Fail) as IFailureState, 0.80F) + 
DefaultFailureState.Threshold(DefaultFailureStates.ContinueAndReconfigure, 0.01F), + DefaultFailureState.Threshold(DefaultFailureStates.ContinueAndReschedule, 0.40F), + DefaultFailureState.Threshold(DefaultFailureStates.StopAndReschedule, 0.60F), + DefaultFailureState.Threshold(DefaultFailureStates.Fail, 0.80F) }); IElasticStage stage = Context.CreateNewStage(stageName, numEvaluators, failureMachine); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureState.cs index 72f2d5987e..bdee88e901 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureState.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureState.cs @@ -60,5 +60,10 @@ public IFailureState Merge(IFailureState that) { return new DefaultFailureState(Math.Max(FailureState, that.FailureState)); } + + public static Tuple Threshold(DefaultFailureStates state, float weight) + { + return new Tuple(new DefaultFailureState((int)state), weight); + } } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs index 4dba3b5e4f..1c3bd6f3ee 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs @@ -207,7 +207,7 @@ public IFailureState Complete() /// /// The failure state we want to change /// A [0, 1] value specifying when the failure level is reached - public void SetThreashold(IFailureState level, float threshold) + public void SetThreshold(IFailureState level, float threshold) { if (!(level is DefaultFailureState)) { @@ -231,7 +231,7 @@ public void SetThreashold(IFailureState level, float threshold) /// A utility method for setting multiple threshold at once. 
/// /// Pairs of failure states with related new thresholds - public void SetThreasholds(Tuple[] weights) + public void SetThresholds(Tuple[] weights) { if (!weights.All(weight => weight.Item1 is DefaultFailureState)) { @@ -267,7 +267,7 @@ public IFailureStateMachine Clone(int initalPoints = 0, int initalState = (int)D foreach (DefaultFailureStates state in transitionWeights.Keys.OrderByDescending(x => x)) { - newMachine.SetThreashold(new DefaultFailureState((int)state), transitionWeights[state]); + newMachine.SetThreshold(new DefaultFailureState((int)state), transitionWeights[state]); } return newMachine; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs index f59d324bd0..c1c34f2fd9 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs @@ -53,17 +53,19 @@ public interface IFailureStateMachine /// /// Method used to set or update the current threshold connected with /// a target failure state. The assumption is that higher failure states - /// have higher thresholds. + /// have higher thresholds. If multiple threshould need to be changed, use + /// the SetThresholds method instead. /// /// The failure state we want to change /// A [0, 1] value specifying when the failure level is reached - void SetThreashold(IFailureState level, float threshold); + void SetThreshold(IFailureState level, float threshold); /// /// A utility method for setting multiple threshold at once. + /// This method is appropriate when multiple threshould needs to be setted at once. /// /// Pairs of failure states with related new thresholds - void SetThreasholds(Tuple[] weights); + void SetThresholds(Tuple[] weights); /// /// Add new data point(s) to the failure machine. 
From 0cfab73b324c066529e63da93895237acffffbbb Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Fri, 1 Feb 2019 16:15:04 -0800 Subject: [PATCH 15/29] Added the delegate for task configuration. Added params for better set of threshould in the failure machine --- .../Elastic/ElasticBroadcastDriver.cs | 31 +++++++--------- .../ElasticBroadcastDriverWithFailures.cs | 36 ++++++++----------- .../Driver/Default/DefaultElasticContext.cs | 8 ++--- .../Default/DefaultElasticTaskSetManager.cs | 8 ++--- .../Elastic/Driver/IElasticContext.cs | 19 +++++++--- .../Default/DefaultFailureStateMachine.cs | 2 +- .../Elastic/Failures/IFailureStateMachine.cs | 2 +- 7 files changed, 51 insertions(+), 55 deletions(-) diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs index dafcc72dbd..3b3806c3fb 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs @@ -24,7 +24,6 @@ using Org.Apache.REEF.Common.Tasks; using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; using Org.Apache.REEF.Network.Elastic.Driver.Default; -using Org.Apache.REEF.Network.Elastic.Task.Default; namespace Org.Apache.REEF.Network.Examples.Elastic { @@ -57,28 +56,22 @@ private ElasticBroadcastDriver(IElasticContext context) : base(context) TaskSetManager.Build(); } - private Func MasterTaskConfiguration + private IConfiguration MasterTaskConfiguration(string taskId) { - get - { - return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( - Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) - .Build()) - .Build(); - } + return TangFactory.GetTang().NewConfigurationBuilder( + Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) + .Build()) + .Build(); } - private Func SlaveTaskConfiguration + private 
IConfiguration SlaveTaskConfiguration(string taskId) { - get - { - return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( - Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) - .Build()) - .Build(); - } + return TangFactory.GetTang().NewConfigurationBuilder( + Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) + .Build()) + .Build(); } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs index 2b32ea6841..cc2c291045 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs @@ -45,13 +45,11 @@ private ElasticBroadcastDriverWithFailures( { IFailureStateMachine failureMachine = new DefaultFailureStateMachine(); - failureMachine.SetThresholds(new Tuple[] - { + failureMachine.SetThresholds( DefaultFailureState.Threshold(DefaultFailureStates.ContinueAndReconfigure, 0.01F), DefaultFailureState.Threshold(DefaultFailureStates.ContinueAndReschedule, 0.40F), DefaultFailureState.Threshold(DefaultFailureStates.StopAndReschedule, 0.60F), - DefaultFailureState.Threshold(DefaultFailureStates.Fail, 0.80F) - }); + DefaultFailureState.Threshold(DefaultFailureStates.Fail, 0.80F)); IElasticStage stage = Context.CreateNewStage(stageName, numEvaluators, failureMachine); @@ -74,28 +72,22 @@ private ElasticBroadcastDriverWithFailures( TaskSetManager.Build(); } - private Func MasterTaskConfiguration + private IConfiguration MasterTaskConfiguration(string taskId) { - get - { - return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( - Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) - .Build()) - 
.Build(); - } + return TangFactory.GetTang().NewConfigurationBuilder( + Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) + .Build()) + .Build(); } - private Func SlaveTaskConfiguration + private IConfiguration SlaveTaskConfiguration(string taskId) { - get - { - return (taskId) => TangFactory.GetTang().NewConfigurationBuilder( - Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) - .Build()) - .Build(); - } + return TangFactory.GetTang().NewConfigurationBuilder( + Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) + .Build()) + .Build(); } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs index 390db11eb2..263b363c0c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs @@ -229,8 +229,8 @@ public void Start() /// A new task set manager public IElasticTaskSetManager CreateNewTaskSetManager( - Func masterTaskConfiguration, - Func slaveTaskConfiguration = null) + TaskConfigurator masterTaskConfiguration, + TaskConfigurator slaveTaskConfiguration = null) { return CreateNewTaskSetManager(_numEvaluators, masterTaskConfiguration, slaveTaskConfiguration); } @@ -244,8 +244,8 @@ public IElasticTaskSetManager CreateNewTaskSetManager( /// A new task set manager public IElasticTaskSetManager CreateNewTaskSetManager( int numOfTasks, - Func masterTaskConfiguration, - Func slaveTaskConfiguration = null) + TaskConfigurator masterTaskConfiguration, + TaskConfigurator slaveTaskConfiguration = null) { return new DefaultElasticTaskSetManager( numOfTasks, diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs 
b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs index bc0c4ded6c..4dfa78ff4a 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs @@ -374,8 +374,8 @@ public Alarm GetAlarm(long time) private readonly int _numTasks; private readonly IEvaluatorRequestor _evaluatorRequestor; private readonly string _driverId; - private readonly Func _masterTaskConfiguration; - private readonly Func _slaveTaskConfiguration; + private readonly TaskConfigurator _masterTaskConfiguration; + private readonly TaskConfigurator _slaveTaskConfiguration; // Task info 0-indexed private readonly List _taskInfos; @@ -409,8 +409,8 @@ public DefaultElasticTaskSetManager( int numTasks, IEvaluatorRequestor evaluatorRequestor, string driverId, - Func masterTaskConfiguration, - Func slaveTaskConfiguration = null, + TaskConfigurator masterTaskConfiguration, + TaskConfigurator slaveTaskConfiguration = null, params IConfiguration[] confs) { _numTasks = numTasks; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticContext.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticContext.cs index 9ff4d288e0..8b96905cbf 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticContext.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticContext.cs @@ -21,11 +21,17 @@ using Org.Apache.REEF.Tang.Formats; using Org.Apache.REEF.Tang.Interface; using Org.Apache.REEF.Utilities.Attributes; -using System; using System.Collections.Generic; namespace Org.Apache.REEF.Network.Elastic.Driver { + /// + /// Delegate used to generate the task configuration for the input task. + /// + /// The identifier for the task + /// + public delegate IConfiguration TaskConfigurator(string taskId); + /// /// This is the entry point for enabling the Elastic Group Communication. 
/// The workflow is the following: @@ -68,7 +74,7 @@ public interface IElasticContext : IFailureResponse /// /// Generate the base configuration module for tasks. - /// This method is method can be used to generate configurations for the task set menager. + /// This method can be used to generate configurations for the task set menager. /// /// The id of the task the configuration is generate for /// The module with the service properly set up for the task @@ -86,7 +92,9 @@ public interface IElasticContext : IFailureResponse /// The configuration for the master task /// The configuration for the slave task /// A new task set manager - IElasticTaskSetManager CreateNewTaskSetManager(Func masterTaskConfiguration, Func slaveTaskConfiguration = null); + IElasticTaskSetManager CreateNewTaskSetManager( + TaskConfigurator masterTaskConfiguration, + TaskConfigurator slaveTaskConfiguration = null); /// /// Create a new task set manager. @@ -95,7 +103,10 @@ public interface IElasticContext : IFailureResponse /// The configuration for the master task /// The configuration for the slave task /// A new task set manager - IElasticTaskSetManager CreateNewTaskSetManager(int numOfTasks, Func masterTaskConfiguration, Func slaveTaskConfiguration = null); + IElasticTaskSetManager CreateNewTaskSetManager( + int numOfTasks, + TaskConfigurator masterTaskConfiguration, + TaskConfigurator slaveTaskConfiguration = null); /// /// Generate the elastic service configuration object. 
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs index 1c3bd6f3ee..8c6fec510c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs @@ -231,7 +231,7 @@ public void SetThreshold(IFailureState level, float threshold) /// A utility method for setting multiple threshold at once. /// /// Pairs of failure states with related new thresholds - public void SetThresholds(Tuple[] weights) + public void SetThresholds(params Tuple[] weights) { if (!weights.All(weight => weight.Item1 is DefaultFailureState)) { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs index c1c34f2fd9..ef3f7a3716 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/IFailureStateMachine.cs @@ -65,7 +65,7 @@ public interface IFailureStateMachine /// This method is appropriate when multiple threshould needs to be setted at once. /// /// Pairs of failure states with related new thresholds - void SetThresholds(Tuple[] weights); + void SetThresholds(params Tuple[] weights); /// /// Add new data point(s) to the failure machine. From c9259ac12383e7f8a99aa7eec792751581d2e313 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Sun, 3 Feb 2019 20:56:04 -0800 Subject: [PATCH 16/29] Improved the API. 
--- .../Elastic/ElasticBroadcastDriver.cs | 26 ++++------- .../ElasticBroadcastDriverWithFailures.cs | 26 ++++------- .../Default/DefaultElasticTaskSetManager.cs | 5 ++- .../Elastic/Driver/IElasticTaskSetManager.cs | 3 +- .../Default/DefaultFailureStateMachine.cs | 43 ++++++++++++------- 5 files changed, 51 insertions(+), 52 deletions(-) diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs index 3b3806c3fb..345feade2f 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/ElasticBroadcastDriver.cs @@ -45,32 +45,24 @@ private ElasticBroadcastDriver(IElasticContext context) : base(context) // Build the stage stage = stage.Build(); - // Create the task manager - TaskSetManager = Context.CreateNewTaskSetManager( - MasterTaskConfiguration, SlaveTaskConfiguration); - - // Register the stage to the task manager - TaskSetManager.AddStage(stage); - - // Build the task set manager - TaskSetManager.Build(); + // Create the task manager, register the stage to the task manager, build the task set manager + TaskSetManager = Context + .CreateNewTaskSetManager(MasterTaskConfiguration, SlaveTaskConfiguration) + .AddStage(stage) + .Build(); } private IConfiguration MasterTaskConfiguration(string taskId) { - return TangFactory.GetTang().NewConfigurationBuilder( - Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) - .Build()) + return Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) .Build(); } private IConfiguration SlaveTaskConfiguration(string taskId) { - return TangFactory.GetTang().NewConfigurationBuilder( - Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) - .Build()) + return Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, 
GenericType.Class) .Build(); } } diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs index cc2c291045..dc6321c62a 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs @@ -61,32 +61,24 @@ private ElasticBroadcastDriverWithFailures( // Build the stage stage = stage.Build(); - // Create the task manager - TaskSetManager = Context.CreateNewTaskSetManager( - MasterTaskConfiguration, SlaveTaskConfiguration); - - // Register the stage to the task manager - TaskSetManager.AddStage(stage); - - // Build the task set manager - TaskSetManager.Build(); + // Create the task manager, register the stage to the task manager, build the task set manager + TaskSetManager = Context + .CreateNewTaskSetManager(MasterTaskConfiguration, SlaveTaskConfiguration) + .AddStage(stage) + .Build(); } private IConfiguration MasterTaskConfiguration(string taskId) { - return TangFactory.GetTang().NewConfigurationBuilder( - Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) - .Build()) + return Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) .Build(); } private IConfiguration SlaveTaskConfiguration(string taskId) { - return TangFactory.GetTang().NewConfigurationBuilder( - Context.GetTaskConfigurationModule(taskId) - .Set(TaskConfiguration.Task, GenericType.Class) - .Build()) + return Context.GetTaskConfigurationModule(taskId) + .Set(TaskConfiguration.Task, GenericType.Class) .Build(); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs index 
4dfa78ff4a..eee468d189 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs @@ -458,7 +458,8 @@ public string StagesId /// Subscribe the current task set manager to a new stage. /// /// The stage to subscribe to - public void AddStage(IElasticStage stage) + /// The same finalized task set manager + public IElasticTaskSetManager AddStage(IElasticStage stage) { if (_finalized == true) { @@ -466,6 +467,8 @@ public void AddStage(IElasticStage stage) } _stages.Add(stage.StageName, stage); + + return this; } /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetManager.cs index db66cc9808..4559cc5184 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetManager.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetManager.cs @@ -44,7 +44,8 @@ public interface IElasticTaskSetManager : IFailureResponse, IDisposable /// Subscribe the current task set manager to a new stage. /// /// The stage to subscribe to - void AddStage(IElasticStage stage); + /// The task manager with the added stage + IElasticTaskSetManager AddStage(IElasticStage stage); /// /// Decides whether more contexts have to be added to this Task Manger or not. 
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs index 8c6fec510c..7d9d547812 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs @@ -36,9 +36,10 @@ namespace Org.Apache.REEF.Network.Elastic.Failures.Default [Unstable("0.16", "API may change")] public sealed class DefaultFailureStateMachine : IFailureStateMachine { - private readonly object _statusLock; + private readonly object _statusLock = new object(); - private readonly SortedDictionary transitionMapUp = new SortedDictionary() + private readonly SortedDictionary transitionMapUp = + new SortedDictionary() { { DefaultFailureStates.Continue, DefaultFailureStates.ContinueAndReconfigure }, { DefaultFailureStates.ContinueAndReconfigure, DefaultFailureStates.ContinueAndReschedule }, @@ -46,7 +47,8 @@ public sealed class DefaultFailureStateMachine : IFailureStateMachine { DefaultFailureStates.StopAndReschedule, DefaultFailureStates.Fail } }; - private readonly SortedDictionary transitionMapDown = new SortedDictionary() + private readonly SortedDictionary transitionMapDown = + new SortedDictionary() { { DefaultFailureStates.ContinueAndReconfigure, DefaultFailureStates.Continue }, { DefaultFailureStates.ContinueAndReschedule, DefaultFailureStates.ContinueAndReconfigure }, @@ -54,7 +56,8 @@ public sealed class DefaultFailureStateMachine : IFailureStateMachine { DefaultFailureStates.Fail, DefaultFailureStates.StopAndReschedule } }; - private readonly IDictionary transitionWeights = new Dictionary() + private readonly IDictionary transitionWeights = + new Dictionary() { { DefaultFailureStates.ContinueAndReconfigure, 0.01F }, { DefaultFailureStates.ContinueAndReschedule, 0.40F }, @@ -84,17 +87,18 @@ public DefaultFailureStateMachine() : this(0, 
DefaultFailureStates.Continue) } /// - /// Default failure stata machine starting with a given amount of data points and a given intial state. + /// Default failure stata machine starting with a given amount of data points and a given + /// intial state. /// /// The number of initial data points for the machine, 0 by default /// The initial state, continue by default - public DefaultFailureStateMachine(int initalPoints = 0, DefaultFailureStates initalState = DefaultFailureStates.Continue) + public DefaultFailureStateMachine( + int initalPoints = 0, + DefaultFailureStates initalState = DefaultFailureStates.Continue) { NumOfDataPoints = initalPoints; NumOfFailedDataPoints = initalPoints; State = new DefaultFailureState((int)initalState); - - _statusLock = new object(); } /// @@ -138,11 +142,13 @@ public IFailureState AddDataPoints(int points, bool isNew) { NumOfFailedDataPoints -= points; } - if (State.FailureState > (int)DefaultFailureStates.Continue && State.FailureState <= (int)DefaultFailureStates.Fail) + if (State.FailureState > (int)DefaultFailureStates.Continue && + State.FailureState <= (int)DefaultFailureStates.Fail) { float currentRate = (float)NumOfFailedDataPoints / NumOfDataPoints; - while (State.FailureState > (int)DefaultFailureStates.Continue && currentRate < transitionWeights[(DefaultFailureStates)State.FailureState]) + while (State.FailureState > (int)DefaultFailureStates.Continue && + currentRate < transitionWeights[(DefaultFailureStates)State.FailureState]) { State.FailureState = (int)transitionMapDown[(DefaultFailureStates)State.FailureState]; } @@ -165,7 +171,8 @@ public IFailureState RemoveDataPoints(int points) float currentRate = (float)NumOfFailedDataPoints / NumOfDataPoints; - if (isFinalState.Contains(State.FailureState) && currentRate >= transitionWeights[DefaultFailureStates.StopAndReschedule]) + if (isFinalState.Contains(State.FailureState) && + currentRate >= transitionWeights[DefaultFailureStates.StopAndReschedule]) { throw new 
IllegalStateException("Received remove data point when state is complete: failing."); } @@ -193,7 +200,8 @@ public IFailureState Complete() } else { - throw new IllegalStateException($"Failure machine cannot move from state {State.FailureState} to Complete: failing."); + throw new IllegalStateException( + $"Failure machine cannot move from state {State.FailureState} to Complete: failing."); } } @@ -261,7 +269,9 @@ public void SetThresholds(params Tuple[] weights) /// How many data points are avaialble in the new state machine /// The state from which the new machine should start /// A new failure machine with the same settings - public IFailureStateMachine Clone(int initalPoints = 0, int initalState = (int)DefaultFailureStates.Continue) + public IFailureStateMachine Clone( + int initalPoints = 0, + int initalState = (int)DefaultFailureStates.Continue) { var newMachine = new DefaultFailureStateMachine(initalPoints, (DefaultFailureStates)initalState); @@ -274,8 +284,8 @@ public IFailureStateMachine Clone(int initalPoints = 0, int initalState = (int)D } /// - /// Check if the states and related thresholds and consistent: i.e., each state can move up or down to only - /// one other state. + /// Check if the states and related thresholds and consistent: i.e., each state can move + /// up or down to only one other state. 
/// private void CheckConsistency() { @@ -290,7 +300,8 @@ private void CheckConsistency() { if (nextWeight < prevWeight) { - throw new IllegalStateException($"State {transitionMapDown[state]} weight is bigger than state {state}."); + throw new IllegalStateException( + $"State {transitionMapDown[state]} weight is bigger than state {state}."); } prevWeight = nextWeight; From 90ff0ed786edbd95a2699bbff69238acde44bd66 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Thu, 7 Feb 2019 08:45:39 -0800 Subject: [PATCH 17/29] Few fixes addressing Sergiy's comments --- .../Physical/Default/DefaultOneToN.cs | 20 ++++--------- .../Elastic/Task/Workflow.cs | 28 ++++++------------- 2 files changed, 14 insertions(+), 34 deletions(-) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs index 7e9bdd5f8b..0d4dfe6b41 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs @@ -37,10 +37,10 @@ public abstract class DefaultOneToN private static readonly Logger Log = Logger.GetLogger(typeof(DefaultOneToN<>)); internal readonly OneToNTopology _topology; - internal volatile PositionTracker _position; + internal volatile PositionTracker _position = PositionTracker.Nil; private readonly bool _isLast; - private bool _cleanDisposal; + private bool _cleanDisposal = false; /// /// Creates a new one to N operator. 
@@ -54,13 +54,8 @@ internal DefaultOneToN(int id, bool isLast, OneToNTopology topology) OperatorId = id; _isLast = isLast; _topology = topology; - _position = PositionTracker.Nil; - _cleanDisposal = false; - OnTaskRescheduled = new Action(() => - { - _topology.JoinTopology(); - }); + OnTaskRescheduled = _topology.JoinTopology; } /// @@ -80,10 +75,7 @@ public string FailureInfo { get { - string iteration = IteratorReference == null ? "-1" : IteratorReference.Current.ToString(); - string position = ((int)_position).ToString() + ":"; - string isSending = _topology.IsSending ? "1" : "0"; - return iteration + ":" + position + ":" + isSending; + return $"{IteratorReference?.Current ?? -1:d}:{_position:d}:{_topology.IsSending:d}"; } } @@ -116,14 +108,12 @@ public T Receive() _position = PositionTracker.InReceive; var received = false; - DataMessage dataMessage = null; ITypedDataMessage typedDataMessage = null; var isIterative = IteratorReference != null; while (!received && !CancellationSource.IsCancellationRequested) { - dataMessage = _topology.Receive(CancellationSource) as DataMessage; - typedDataMessage = dataMessage as ITypedDataMessage; + typedDataMessage = (ITypedDataMessage)_topology.Receive(CancellationSource); if (isIterative && typedDataMessage.Iteration < (int)IteratorReference.Current) { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs index e4bc67c511..7085505a3a 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs @@ -45,7 +45,7 @@ public sealed class Workflow : IEnumerator, IEnumerable _iteratorsPosition = new List(); - private readonly object _lock = new object(); + private readonly object disposeLock = new object(); private readonly IList _operators = new List(); private readonly CancellationSource _cancellationSource; private readonly bool _isRescheduled; @@ -79,7 +79,7 @@ public bool MoveNext() 
// Check if we need to iterate if (_iteratorsPosition.Count > 0 && _position == _iteratorsPosition[0]) { - var iteratorOperator = _operators[_position] as IElasticIterator; + IElasticIterator iteratorOperator = _operators[_position] as IElasticIterator; if (iteratorOperator.MoveNext()) { @@ -152,14 +152,7 @@ public void Throw(Exception e) /// public void Reset() { - if (_iteratorsPosition.Count > 0) - { - _position = _iteratorsPosition[0]; - } - else - { - _position = 0; - } + _position = _iteratorsPosition.FirstOrDefault(); // default for int is 0 } /// @@ -169,7 +162,7 @@ public IElasticOperator Current { get { - return _position == -1 ? _operators[0] : _operators[_position]; + return _operators[_position < 0 ? 0 : _position]; } } @@ -183,19 +176,16 @@ object IEnumerator.Current /// public void Dispose() { - lock (_lock) + lock (disposeLock) { if (!_disposed) { - if (_operators != null) + // Clean dispose, check that the computation is completed + if (!_failed) { - // Clean dispose, check that the computation is completed - if (!_failed) + foreach (var op in _operators) { - foreach (var op in _operators) - { - op?.WaitCompletionBeforeDisposing(); - } + op?.WaitCompletionBeforeDisposing(); } } From 48dcc06d74fa8639195bc3bb7a29d68abfe24abc Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Thu, 7 Feb 2019 10:42:20 -0800 Subject: [PATCH 18/29] Another round of fixes. 
--- .../Elastic/BroadcastMasterTask.cs | 5 +- .../Elastic/Task/CancellationSource.cs | 11 ++-- .../Elastic/Task/CommunicationLayer.cs | 51 +++++++------------ .../Default/DefaultBroadcastTopology.cs | 7 +-- ...peratorTopologyWithDefaultCommunication.cs | 25 ++++----- 5 files changed, 39 insertions(+), 60 deletions(-) diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs index 4bdedbf04f..c9969eb85a 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs @@ -35,14 +35,15 @@ private BroadcastMasterTask(CancellationSource source, IElasticContext context) { } + private readonly Random _rand = new Random(); + protected override void Execute(byte[] memento, Workflow workflow) { - var rand = new Random(); int number = 0; foreach (var op in workflow) { - number = rand.Next(); + number = _rand.Next(); switch (op.OperatorType) { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CancellationSource.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CancellationSource.cs index 9a238c31af..4654a3a18d 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CancellationSource.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CancellationSource.cs @@ -17,6 +17,7 @@ using Org.Apache.REEF.Tang.Annotations; using Org.Apache.REEF.Utilities.Attributes; +using System; using System.Threading; namespace Org.Apache.REEF.Network.Elastic.Task @@ -27,18 +28,17 @@ namespace Org.Apache.REEF.Network.Elastic.Task /// to inject the same source through the elastic communication services. /// [Unstable("0.16", "API may change")] - public sealed class CancellationSource + public sealed class CancellationSource : IDisposable { [Inject] private CancellationSource() { - Source = new CancellationTokenSource(); } /// /// The wrapped cancellation source. 
/// - public CancellationTokenSource Source { get; private set; } + public readonly CancellationTokenSource Source = new CancellationTokenSource(); /// /// Whether the operation is cancelled. @@ -56,5 +56,10 @@ public void Cancel() { Source.Cancel(); } + + public void Dispose() + { + Source.Dispose(); + } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs index a1e3e81674..9d7456f8a3 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs @@ -36,7 +36,8 @@ namespace Org.Apache.REEF.Network.Elastic.Task.Impl /// [Unstable("0.16", "API may change")] internal abstract class CommunicationLayer : - IObserver>> + IObserver>>, + IDisposable { private static readonly Logger Log = Logger.GetLogger(typeof(CommunicationLayer)); @@ -48,7 +49,7 @@ internal abstract class CommunicationLayer : protected readonly DefaultTaskToDriverMessageDispatcher _taskToDriverDispatcher; private readonly ElasticDriverMessageHandler _driverMessagesHandler; private readonly IIdentifierFactory _idFactory; - private IDisposable _communicationObserver; + private readonly IDisposable _communicationObserver; private readonly ConcurrentDictionary _driverMessageObservers; protected bool _disposed = false; @@ -90,12 +91,10 @@ public void RegisterOperatorTopologyForTask(IOperatorTopologyWithCommunication o { var id = NodeObserverIdentifier.FromObserver(operatorObserver); - if (_groupMessageObservers.ContainsKey(id)) + if (_groupMessageObservers.TryAdd(id, operatorObserver)) { throw new IllegalStateException($"Topology for id {id} already added among listeners."); } - - _groupMessageObservers.TryAdd(id, operatorObserver); } /// @@ -106,12 +105,10 @@ internal void RegisterOperatorTopologyForDriver(DriverAwareOperatorTopology oper { var id = 
NodeObserverIdentifier.FromObserver(operatorObserver); - if (_driverMessageObservers.ContainsKey(id)) + if (!_driverMessageObservers.TryAdd(id, operatorObserver)) { throw new IllegalStateException($"Topology for id {id} already added among driver listeners."); } - - _driverMessageObservers.TryAdd(id, operatorObserver); } /// @@ -140,17 +137,14 @@ internal void Send( } IIdentifier destId = _idFactory.Create(destination); - int retry = 0; - while (!Send(destId, message)) + for (int retry = 0; !Send(destId, message); retry++) { if (retry > _retrySending) { throw new IllegalStateException($"Unable to send message after retying {retry} times."); } Thread.Sleep(_timeout); - - retry++; } } @@ -168,16 +162,11 @@ internal void Send( /// The token to cancel the operation /// Nodes that got removed during task registration public void WaitForTaskRegistration( - IList identifiers, + ICollection identifiers, CancellationTokenSource cancellationSource, - ConcurrentDictionary removed = null) + IDictionary removed = null) { - if (removed == null) - { - removed = new ConcurrentDictionary(); - } - - IList foundList = new List(); + ISet foundSet = new HashSet(); for (var i = 0; i < _retryRegistration; i++) { @@ -190,34 +179,32 @@ public void WaitForTaskRegistration( Log.Log(Level.Info, "WaitForTaskRegistration, in retryCount {0}.", i); foreach (var identifier in identifiers) { - var notFound = !foundList.Contains(identifier); - if (notFound && removed.ContainsKey(identifier)) + var notFound = !foundSet.Contains(identifier); + if (notFound && removed != null ? 
removed.ContainsKey(identifier) : false) { - foundList.Add(identifier); + foundSet.Add(identifier); Log.Log(Level.Verbose, "WaitForTaskRegistration, dependent id {0} was removed at loop {1}.", identifier, i); } else if (notFound && Lookup(identifier)) { - foundList.Add(identifier); + foundSet.Add(identifier); Log.Log(Level.Verbose, "WaitForTaskRegistration, find a dependent id {0} at loop {1}.", identifier, i); } } - if (foundList.Count >= identifiers.Count) + if (foundSet.Count >= identifiers.Count) { Log.Log(Level.Info, - "WaitForTaskRegistration, found all {0} dependent ids at loop {1}.", foundList.Count, i); + "WaitForTaskRegistration, found all {0} dependent ids at loop {1}.", foundSet.Count, i); return; } Thread.Sleep(_sleepTime); } - ICollection leftovers = - foundList.Count == 0 ? identifiers : identifiers.Where(e => !foundList.Contains(e)).ToList(); - var msg = string.Join(",", leftovers); + var msg = string.Join(",", identifiers.Except(foundSet)); Log.Log(Level.Error, "Cannot find registered parent/children: {0}.", msg); throw new Exception("Failed to find parent/children nodes"); @@ -230,11 +217,7 @@ public void WaitForTaskRegistration( /// public bool Lookup(string identifier) { - if (_disposed || _networkService == null) - { - return false; - } - return _networkService.NamingClient.Lookup(identifier) != null; + return !_disposed && _networkService?.NamingClient.Lookup(identifier) != null; } /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs index ffb00e4599..d8b7614fdd 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs @@ -79,14 +79,11 @@ public override DataMessage GetDataMessage(int iteration, T[] data) /// The source in case the task is 
cancelled protected override void Send(CancellationTokenSource cancellationSource) { - ElasticGroupCommunicationMessage message; int retry = 0; // Check if we have a message to send - if (_sendQueue.TryPeek(out message)) + if (_sendQueue.TryPeek(out ElasticGroupCommunicationMessage message)) { - var dm = message as DataMessage; - // Broadcast topology require the driver to send topology updates to the root node // in order to have the most update topology at each boradcast round. while (!_topologyUpdateReceived.WaitOne(_timeout)) @@ -102,7 +99,7 @@ protected override void Send(CancellationTokenSource cancellationSource) if (retry > _retry) { - throw new OperatorException($"Iteration {dm.Iteration}: Failed to send message to the next node in the ring after {_retry} try.", OperatorId); + throw new OperatorException($"Iteration {((DataMessage)message).Iteration}: Failed to send message to the next node in the ring after {_retry} try.", OperatorId); } TopologyUpdateRequest(); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs index 8f184c8d7c..83d80eb8c1 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs @@ -36,7 +36,7 @@ internal abstract class OperatorTopologyWithDefaultCommunication : DriverAwareOperatorTopology, IOperatorTopologyWithCommunication { - protected bool _initialized; + protected bool _initialized = false; protected DefaultCommunicationLayer _commLayer; @@ -44,10 +44,10 @@ internal abstract class OperatorTopologyWithDefaultCommunication : protected readonly int _timeout; protected readonly int _retry; - protected ConcurrentQueue _sendQueue; - protected BlockingCollection _messageQueue; - 
protected readonly ConcurrentDictionary _children; - protected readonly CancellationTokenSource _cancellationSignal; + protected readonly ConcurrentQueue _sendQueue = new ConcurrentQueue(); + protected BlockingCollection _messageQueue = new BlockingCollection(); + protected readonly ConcurrentDictionary _children = new ConcurrentDictionary(); + protected readonly CancellationTokenSource _cancellationSignal = new CancellationTokenSource(); /// /// Constructor for a communicating topology. @@ -70,15 +70,8 @@ public OperatorTopologyWithDefaultCommunication( int timeout, int disposeTimeout) : base(stageName, taskId, rootTaskId, operatorId) { - _initialized = false; _commLayer = commLayer; - - _children = new ConcurrentDictionary(); - _messageQueue = new BlockingCollection(); - _sendQueue = new ConcurrentQueue(); - - _cancellationSignal = new CancellationTokenSource(); - + _retry = retry; _timeout = timeout; _disposeTimeout = disposeTimeout; @@ -109,12 +102,12 @@ public void TopologyUpdateRequest() /// public override void WaitCompletionBeforeDisposing() { - var elapsedTime = 0; - while (_sendQueue.Count > 0 && elapsedTime < _disposeTimeout) + var tsEnd = DateTime.Now.AddMilliseconds(_disposeTimeout); + while (_sendQueue.Count > 0 && DateTime.Now < tsEnd) { // The topology is still trying to send messages, wait. Thread.Sleep(100); - elapsedTime += 100; + } } From 7acd6ec0721efaaac2fe01a64120d6dab4e9292d Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Mon, 11 Feb 2019 16:29:25 -0800 Subject: [PATCH 19/29] Addressing Sergiy's new comments. 
--- .../Elastic/BroadcastMasterTask.cs | 5 +- .../Physical/Default/DefaultBroadcast.cs | 4 +- .../Elastic/Task/CommunicationLayer.cs | 6 +-- .../Task/ElasticDriverMessageHandler.cs | 4 +- .../Elastic/Task/NodeObserverIdentifier.cs | 27 ++++------- .../Elastic/Task/Workflow.cs | 22 ++++----- .../Default/DefaultBroadcastTopology.cs | 47 ++++++++++++++----- .../Physical/Default/OneToNTopology.cs | 43 ++++++++++------- ...peratorTopologyWithDefaultCommunication.cs | 4 +- 9 files changed, 87 insertions(+), 75 deletions(-) diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs index c9969eb85a..4710fca41a 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/BroadcastMasterTask.cs @@ -39,16 +39,13 @@ private BroadcastMasterTask(CancellationSource source, IElasticContext context) protected override void Execute(byte[] memento, Workflow workflow) { - int number = 0; - foreach (var op in workflow) { - number = _rand.Next(); - switch (op.OperatorType) { case OperatorType.Broadcast: var sender = workflow.Current as IElasticBroadcast; + int number = _rand.Next(); sender.Send(number); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultBroadcast.cs index f5facde679..f0e2ca22a1 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultBroadcast.cs @@ -60,8 +60,8 @@ public void Send(T data) _position = PositionTracker.InSend; - int iteration = IteratorReference == null ? 0 : (int)IteratorReference.Current; - var message = _topology.GetDataMessage(iteration, new[] { data }); + int iteration = (int)(IteratorReference?.Current ?? 
0); + var message = _topology.GetDataMessage(iteration, data); _topology.Send(message, CancellationSource); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs index 9d7456f8a3..76f177d929 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs @@ -142,7 +142,7 @@ internal void Send( { if (retry > _retrySending) { - throw new IllegalStateException($"Unable to send message after retying {retry} times."); + throw new IllegalStateException($"Unable to send message after retrying {retry} times."); } Thread.Sleep(_timeout); } @@ -180,7 +180,7 @@ public void WaitForTaskRegistration( foreach (var identifier in identifiers) { var notFound = !foundSet.Contains(identifier); - if (notFound && removed != null ? removed.ContainsKey(identifier) : false) + if (notFound && removed != null && removed.ContainsKey(identifier)) { foundSet.Add(identifier); Log.Log(Level.Verbose, @@ -276,7 +276,7 @@ private bool Send(IIdentifier destId, ElasticGroupCommunicationMessage message) } catch (Exception e) { - Log.Log(Level.Warning, "Unable to send message " + e.Message); + Log.Log(Level.Warning, "Unable to send message to " + destId, e.Message); connection.Dispose(); return false; } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs index bfa533a7aa..a17ed7b181 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs @@ -39,13 +39,13 @@ internal sealed class ElasticDriverMessageHandler : IDriverMessageHandler [Inject] private ElasticDriverMessageHandler() { - DriverMessageObservers = new ConcurrentDictionary(); } /// /// Observers of incoming messages from the driver. 
/// - internal ConcurrentDictionary DriverMessageObservers { get; set; } + internal readonly ConcurrentDictionary DriverMessageObservers = + new ConcurrentDictionary(); /// /// Handle an incoming message. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs index e9cd505933..6e4432f988 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs @@ -29,10 +29,7 @@ namespace Org.Apache.REEF.Network.Elastic.Task.Impl [Unstable("0.16", "API may change")] internal sealed class NodeObserverIdentifier { - private readonly string _stageName; - private readonly int _operatorId; - - /// + /// /// Creates an identifier from an operator topology with communication. /// public static NodeObserverIdentifier FromObserver(IOperatorTopologyWithCommunication observer) @@ -63,25 +60,19 @@ public static NodeObserverIdentifier FromMessage(ElasticGroupCommunicationMessag /// The identifier of the operator private NodeObserverIdentifier(string stageName, int operatorId) { - _stageName = stageName; - _operatorId = operatorId; + StageName = stageName; + OperatorId = operatorId; } /// /// The stage name. /// - public string StageName - { - get { return _stageName; } - } + public string StageName { get; } /// /// The operator name. /// - public int OperatorId - { - get { return _operatorId; } - } + public int OperatorId { get; } /// /// Overrides . Simply compares equivalence of instance fields. 
@@ -107,8 +98,8 @@ public override bool Equals(object obj) public override int GetHashCode() { int hash = 17; - hash = (hash * 31) + _stageName.GetHashCode(); - return (hash * 31) + _operatorId.GetHashCode(); + hash = (hash * 31) + StageName.GetHashCode(); + return (hash * 31) + OperatorId.GetHashCode(); } /// @@ -116,8 +107,8 @@ public override int GetHashCode() /// private bool Equals(NodeObserverIdentifier other) { - return _stageName.Equals(other.StageName) && - _operatorId.Equals(other.OperatorId); + return StageName.Equals(other.StageName) && + OperatorId.Equals(other.OperatorId); } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs index 7085505a3a..461a99da49 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs @@ -88,16 +88,14 @@ public bool MoveNext() return true; } - else + + if (_iteratorsPosition.Count > 1) { - if (_iteratorsPosition.Count > 1) - { - _iteratorsPosition.RemoveAt(0); - _position = _iteratorsPosition[0] - 1; - } - - return false; + _iteratorsPosition.RemoveAt(0); + _position = _iteratorsPosition[0] - 1; } + + return false; } // In case we have one or zero iterators @@ -109,12 +107,10 @@ public bool MoveNext() { return false; } - else - { - _position = _iteratorsPosition[0] - 1; - return MoveNext(); - } + _position = _iteratorsPosition[0] - 1; + + return MoveNext(); } if (_isRescheduled) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs index d8b7614fdd..f8130a1aac 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/DefaultBroadcastTopology.cs @@ -27,6 +27,7 @@ using 
Org.Apache.REEF.Utilities.Attributes; using Org.Apache.REEF.Network.Elastic.Task; using Org.Apache.REEF.Network.Elastic.Failures; +using System; namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Default { @@ -61,16 +62,31 @@ private DefaultBroadcastTopology( { } - public override DataMessage GetDataMessage(int iteration, T[] data) + /// + /// Creates a DataMessage out of some input data. + /// + /// The iteration number of this message + /// The data to communicate + /// A properly configured DataMessage + public override DataMessage GetDataMessage(int iteration, T data) { if (_piggybackTopologyUpdates) { - return new DataMessageWithTopology(StageName, OperatorId, iteration, data[0]); - } - else - { - return new DataMessage(StageName, OperatorId, iteration, data[0]); + return new DataMessageWithTopology(StageName, OperatorId, iteration, data); } + + return new DataMessage(StageName, OperatorId, iteration, data); + } + + /// + /// Creates a DataMessage out of some input data. + /// + /// The iteration number of this message + /// The data to communicate + /// A properly configured DataMessage + public override DataMessage GetDataMessage(int iteration, params T[] data) + { + throw new NotImplementedException("Broadcast is allowed to send only one piece of data at a time"); } /// @@ -99,14 +115,18 @@ protected override void Send(CancellationTokenSource cancellationSource) if (retry > _retry) { - throw new OperatorException($"Iteration {((DataMessage)message).Iteration}: Failed to send message to the next node in the ring after {_retry} try.", OperatorId); + throw new OperatorException( + $"Iteration {((DataMessage)message).Iteration}: " + + $"Failed to send message to the next node in the ring after {_retry} try." + , OperatorId); } TopologyUpdateRequest(); } - // Get the actual message to send. Note that altough message sending is asynchronous, broadcast rounds should not overlap. - _sendQueue.TryDequeue(out message); + // Get the actual message to send. 
Note that altough message sending is asynchronous, + // broadcast rounds should not overlap. + var canSend = _sendQueue.TryDequeue(out message); if (TaskId == RootTaskId) { @@ -114,10 +134,13 @@ protected override void Send(CancellationTokenSource cancellationSource) _topologyUpdateReceived.Reset(); } - // Deliver the message to the commonication layer. - foreach (var node in _children.Where(x => !_nodesToRemove.TryGetValue(x.Value, out byte val))) + if (canSend) { - _commLayer.Send(node.Value, message, cancellationSource); + // Deliver the message to the communication layer. + foreach (var destination in _children.Values.Except(_nodesToRemove.Keys)) + { + _commLayer.Send(destination, message, cancellationSource); + } } } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs index 50e2459c89..1483447764 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs @@ -78,7 +78,7 @@ public OneToNTopology( timeout, disposeTimeout) { - _topologyUpdateReceived = new ManualResetEvent(RootTaskId == taskId ? false : true); + _topologyUpdateReceived = new ManualResetEvent(RootTaskId != taskId); _commLayer.RegisterOperatorTopologyForTask(this); _commLayer.RegisterOperatorTopologyForDriver(this); @@ -118,7 +118,21 @@ public void WaitCompletionBeforeDisposing(CancellationTokenSource cancellationSo } } - public abstract DataMessage GetDataMessage(int iteration, T[] data); + /// + /// Creates a DataMessage out of some input data. + /// + /// The iteration number of this message + /// The data to communicate + /// A properly configured DataMessage + public abstract DataMessage GetDataMessage(int iteration, T data); + + /// + /// Creates a DataMessage out of some input data. 
+ /// + /// The iteration number of this message + /// The data to communicate + /// A properly configured DataMessage + public abstract DataMessage GetDataMessage(int iteration, params T[] data); /// /// Initializes the communication group. @@ -192,7 +206,7 @@ public override void OnNext(DriverMessagePayload message) foreach (var node in updates.Children) { Log.Log(Level.Info, "Removing task {0} from the topology.", node); - _nodesToRemove.TryAdd(node, new byte()); + _nodesToRemove.TryAdd(node, 0); _commLayer.RemoveConnection(node); } } @@ -242,27 +256,20 @@ public override void OnNext(DriverMessagePayload message) private void UpdateTopology(ref List updates) { - TopologyUpdate toRemove = null; - foreach (var update in updates) + var update = updates.Find(elem => elem.Node == TaskId); + + foreach (var child in update.Children) { - if (update.Node == TaskId) + if (!_nodesToRemove.TryRemove(child, out byte value)) { - toRemove = update; - foreach (var child in update.Children) - { - if (!_nodesToRemove.TryRemove(child, out byte value)) - { - var id = Utils.GetTaskNum(child); - _children.TryAdd(id, child); - } - } - break; + var id = Utils.GetTaskNum(child); + _children.TryAdd(id, child); } } - if (toRemove != null) + if (update != null) { - updates.Remove(toRemove); + updates.Remove(update); } } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs index 83d80eb8c1..4458786ab4 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs @@ -45,7 +45,7 @@ internal abstract class OperatorTopologyWithDefaultCommunication : protected readonly int _retry; protected readonly ConcurrentQueue _sendQueue = new 
ConcurrentQueue(); - protected BlockingCollection _messageQueue = new BlockingCollection(); + protected readonly BlockingCollection _messageQueue = new BlockingCollection(); protected readonly ConcurrentDictionary _children = new ConcurrentDictionary(); protected readonly CancellationTokenSource _cancellationSignal = new CancellationTokenSource(); @@ -194,8 +194,6 @@ public virtual void OnNext(NsMessage message) { throw new IllegalStateException("Trying to add messages to a closed non-empty queue."); } - - _messageQueue = new BlockingCollection(); } _messageQueue.Add(message.Data); From 6a91d96661ee0e788e7d39d970a195af33d4dc0a Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Tue, 12 Feb 2019 08:35:44 -0800 Subject: [PATCH 20/29] Fixed stupid error. --- .../Topology/Physical/Default/OneToNTopology.cs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs index 1483447764..e3ed8ec115 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs @@ -258,17 +258,17 @@ private void UpdateTopology(ref List updates) { var update = updates.Find(elem => elem.Node == TaskId); - foreach (var child in update.Children) + if (update != null) { - if (!_nodesToRemove.TryRemove(child, out byte value)) + foreach (var child in update.Children) { - var id = Utils.GetTaskNum(child); - _children.TryAdd(id, child); + if (!_nodesToRemove.TryRemove(child, out byte value)) + { + var id = Utils.GetTaskNum(child); + _children.TryAdd(id, child); + } } - } - if (update != null) - { updates.Remove(update); } } From 55aedfaf2549da8d52b0843a3961f830b788fd95 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Wed, 13 Feb 2019 20:53:16 -0800 Subject: [PATCH 21/29] added 
further fixes following Sergiy's suggestions --- .../Elastic/Comm/ITaskMessageResponse.cs | 3 +- .../Comm/Impl/ElasticDriverMessageImpl.cs | 15 +++++---- .../Comm/Impl/TopologyMessagePayload.cs | 31 ++++++++++++------- .../Elastic/Comm/Impl/TopologyUpdate.cs | 9 ++++-- .../Elastic/Comm/Impl/UpdateMessagePayload.cs | 1 - .../Physical/Default/DefaultOneToN.cs | 4 +-- .../Elastic/Task/Workflow.cs | 2 +- ...peratorTopologyWithDefaultCommunication.cs | 9 ++++-- 8 files changed, 45 insertions(+), 29 deletions(-) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs index f59572d228..94d4fdfda3 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs @@ -32,7 +32,8 @@ public interface ITaskMessageResponse /// /// The task message for the operator /// A list of messages containing the instructions for the task - /// If the message cannot be handled correctly or generate an incorrent state + /// If the message cannot be handled correctly or + /// generate an incorrect state void OnTaskMessage(ITaskMessage message, ref List returnMessages); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs index a5ca4e3d85..89592d1d89 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs @@ -59,16 +59,15 @@ public ElasticDriverMessageImpl( /// The serialized message public byte[] Serialize() { - List buffer = new List(); + List buffer = new List(); - byte[] destinationBytes = ByteUtilities.StringToByteArrays(Destination); + var destinationBytes = ByteUtilities.StringToByteArrays(Destination); + 
buffer.AddRange(BitConverter.GetBytes(destinationBytes.Length)); + buffer.AddRange(destinationBytes); + buffer.AddRange(BitConverter.GetBytes((short)Message.PayloadType)); + buffer.AddRange(Message.Serialize()); - buffer.Add(BitConverter.GetBytes(destinationBytes.Length)); - buffer.Add(destinationBytes); - buffer.Add(BitConverter.GetBytes((short)Message.PayloadType)); - buffer.Add(Message.Serialize()); - - return buffer.SelectMany(i => i).ToArray(); + return buffer.ToArray(); } /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs index 198ad3c366..d27f50e71f 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs @@ -35,11 +35,15 @@ internal class TopologyMessagePayload : DriverMessagePayload /// Create a driver message payload containing topology updates. /// /// The topology updates - /// Whether the updates are additions to the current topology state or nodes removal /// The stage context for the message /// The id of the operator receiving the topology update /// The iteration in which the update takes effect - public TopologyMessagePayload(DriverMessagePayloadType type, List updates, string stageName, int operatorId, int iteration) + public TopologyMessagePayload( + DriverMessagePayloadType type, + List updates, + string stageName, + int operatorId, + int iteration) : base(stageName, operatorId, iteration) { PayloadType = type; @@ -52,13 +56,7 @@ public TopologyMessagePayload(DriverMessagePayloadType type, ListAn object containing the shallow copy of the message. 
public override object Clone() { - var updatesClone = new List(); - - foreach (var update in TopologyUpdates) - { - var clone = new TopologyUpdate(update.Node, update.Children, update.Root); - updatesClone.Add(update); - } + var updatesClone = TopologyUpdates.Select(up => (TopologyUpdate)up.Clone()).ToList(); return TopologyMessageBuilder(PayloadType, updatesClone, StageName, OperatorId, Iteration); } @@ -101,7 +99,13 @@ internal override byte[] Serialize() byte[] stageBytes = ByteUtilities.StringToByteArrays(StageName); int offset = 0; var totalLengthUpdates = TopologyUpdates.Sum(x => x.Size); - byte[] buffer = new byte[sizeof(int) + totalLengthUpdates + sizeof(int) + stageBytes.Length + sizeof(bool) + sizeof(int) + sizeof(int)]; + byte[] buffer = new byte[sizeof(int) + + totalLengthUpdates + + sizeof(int) + + stageBytes.Length + + sizeof(bool) + + sizeof(int) + + sizeof(int)]; Buffer.BlockCopy(BitConverter.GetBytes(totalLengthUpdates), 0, buffer, offset, sizeof(int)); offset += sizeof(int); @@ -119,7 +123,12 @@ internal override byte[] Serialize() return buffer; } - private static DriverMessagePayload TopologyMessageBuilder(DriverMessagePayloadType type, List updates, string stageName, int operatorId, int iteration) + private static DriverMessagePayload TopologyMessageBuilder( + DriverMessagePayloadType type, + List updates, + string stageName, + int operatorId, + int iteration) { switch (type) { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyUpdate.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyUpdate.cs index 05b81bfa0c..0b0b4b9b86 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyUpdate.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyUpdate.cs @@ -27,7 +27,7 @@ namespace Org.Apache.REEF.Network.Elastic.Comm.Impl /// Class defining the updates of the topology for a running task. 
/// [Unstable("0.16", "API may change")] - internal sealed class TopologyUpdate + internal sealed class TopologyUpdate : ICloneable { /// /// Create an update for a node containing both the list of children and the root node. @@ -102,7 +102,7 @@ public int Size /// The memory space where to copy the serialized update /// Where to start writing in the buffer /// The updates to serialize - internal static void Serialize(byte[] buffer, ref int offset, List updates) + internal static void Serialize(byte[] buffer, ref int offset, IEnumerable updates) { byte[] tmpBuffer; @@ -192,5 +192,10 @@ internal static List Deserialize(byte[] data, int totLength, int return result; } + + public object Clone() + { + return new TopologyUpdate(Node, Children, Root); + } } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs index beac689901..910a133d3d 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs @@ -31,7 +31,6 @@ internal sealed class UpdateMessagePayload : TopologyMessagePayload /// Create a driver message payload containing topology updates /// /// The topology updates - /// Whether the updates are additions to the current topology state or nodes removal /// The stage context for the message /// The id of the operator receiving the topology update /// The iteration in which the update takes effect diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs index 0d4dfe6b41..13ca31e862 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs @@ -32,7 +32,7 @@ namespace 
Org.Apache.REEF.Network.Elastic.Operators.Physical.Default /// /// The type of message being sent. [Unstable("0.16", "API may change")] - public abstract class DefaultOneToN + public abstract class DefaultOneToN : IDisposable { private static readonly Logger Log = Logger.GetLogger(typeof(DefaultOneToN<>)); @@ -157,7 +157,7 @@ public void ResetPosition() /// public void WaitForTaskRegistration(CancellationTokenSource cancellationSource) { - Log.Log(Level.Info, "Waiting for task registration for {0} operator.", OperatorType.ToString()); + Log.Log(Level.Info, "Waiting for task registration for {0} operator.", OperatorType); _topology.WaitForTaskRegistration(cancellationSource); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs index 461a99da49..02c1a821f9 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs @@ -208,7 +208,7 @@ internal void Add(IElasticOperator op) if (_iteratorsPosition.Count > 0) { var iterPos = _iteratorsPosition.Last(); - var iterator = _operators[iterPos] as IElasticIterator; + var iterator = (IElasticIterator)_operators[iterPos]; op.IteratorReference = iterator; iterator.RegisterActionOnTaskRescheduled(op.OnTaskRescheduled); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs index 4458786ab4..8a0ad7b3f4 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs @@ -44,8 +44,10 @@ internal abstract class OperatorTopologyWithDefaultCommunication : protected readonly int _timeout; protected readonly int _retry; - protected readonly 
ConcurrentQueue _sendQueue = new ConcurrentQueue(); - protected readonly BlockingCollection _messageQueue = new BlockingCollection(); + protected readonly ConcurrentQueue _sendQueue = + new ConcurrentQueue(); + protected readonly BlockingCollection _messageQueue = + new BlockingCollection(); protected readonly ConcurrentDictionary _children = new ConcurrentDictionary(); protected readonly CancellationTokenSource _cancellationSignal = new CancellationTokenSource(); @@ -132,7 +134,8 @@ public virtual void WaitForTaskRegistration(CancellationTokenSource cancellation } catch (Exception e) { - throw new OperationCanceledException("Failed to find parent/children nodes in operator topology for node: " + TaskId, e); + throw new OperationCanceledException( + "Failed to find parent/children nodes in operator topology for node: " + TaskId, e); } _initialized = true; From edb7fd05d6fd336a3616fa59f95a1c79d41845c0 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Tue, 19 Feb 2019 10:41:49 -0800 Subject: [PATCH 22/29] Fixed later issues. 
--- .../Driver/Default/DefaultElasticContext.cs | 21 +- .../Driver/Default/DefaultElasticDriver.cs | 6 +- .../Driver/Default/DefaultElasticStage.cs | 31 +-- .../Default/DefaultElasticTaskSetManager.cs | 255 +++++++----------- .../DefaultElasticTaskSetManagerParameters.cs | 2 +- .../Elastic/Driver/Default/TaskState.cs | 71 +++++ .../Elastic/Driver/IElasticTaskSetManager.cs | 22 +- .../Elastic/Failures/FailuresClock.cs | 32 +-- .../Logical/Default/DefaultBroadcast.cs | 13 +- .../Operators/Logical/Default/DefaultEmpty.cs | 5 +- .../Operators/Logical/ElasticOperator.cs | 99 +++---- .../Elastic/Topology/Logical/ITopology.cs | 4 +- .../Topology/Logical/Impl/EmptyTopology.cs | 6 +- .../Topology/Logical/Impl/FlatTopology.cs | 14 +- 14 files changed, 262 insertions(+), 319 deletions(-) create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/TaskState.cs diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs index 263b363c0c..63a3df4157 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs @@ -181,12 +181,10 @@ public void RemoveElasticStage(string stageName) { lock (_subsLock) { - if (!_stages.ContainsKey(stageName)) + if (!_stages.Remove(stageName)) { throw new ArgumentException($"Stage {stageName} is not registered with the context."); } - - _stages.Remove(stageName); } } @@ -268,18 +266,11 @@ public IConfiguration GetElasticServiceConfiguration() .Build(); return TangFactory.GetTang().NewConfigurationBuilder(contextConfig) - .BindNamedParameter( - GenericType.Class, - _nameServerAddr) - .BindNamedParameter( - GenericType.Class, - _nameServerPort.ToString(CultureInfo.InvariantCulture)) - .BindImplementation(GenericType.Class, - GenericType.Class) - .BindNamedParameter(GenericType.Class, - 
_startingPort.ToString(CultureInfo.InvariantCulture)) - .BindNamedParameter(GenericType.Class, - _portRange.ToString(CultureInfo.InvariantCulture)) + .BindStringNamedParam(_nameServerAddr) + .BindIntNamedParam("" + _nameServerPort) + .BindImplementation() + .BindIntNamedParam("" + _startingPort) + .BindIntNamedParam("" + _portRange) .Build(); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticDriver.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticDriver.cs index d7bc0e48b3..8253bb1460 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticDriver.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticDriver.cs @@ -90,7 +90,7 @@ public void OnNext(ICompletedTask value) { TaskSetManager.OnTaskCompleted(value); - if (TaskSetManager.IsCompleted()) + if (TaskSetManager.IsCompleted) { TaskSetManager.Dispose(); } @@ -100,7 +100,7 @@ public void OnNext(IFailedEvaluator failedEvaluator) { TaskSetManager.OnEvaluatorFailure(failedEvaluator); - if (TaskSetManager.IsCompleted()) + if (TaskSetManager.IsCompleted) { TaskSetManager.Dispose(); } @@ -110,7 +110,7 @@ public void OnNext(IFailedTask failedTask) { TaskSetManager.OnTaskFailure(failedTask); - if (TaskSetManager.IsCompleted()) + if (TaskSetManager.IsCompleted) { TaskSetManager.Dispose(); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs index e916ab1013..ce650d1f6f 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs @@ -173,8 +173,8 @@ public IElasticStage Build() if (_datasetConfiguration.Value.Length + adjust < _numTasks) { throw new IllegalStateException( - $"Dataset is smaller than the number of tasks: " - + "re-submit with {_datasetConfiguration.Value.Length + adjust} 
tasks"); + "Dataset is smaller than the number of tasks: " + + $"re-submit with {_datasetConfiguration.Value.Length + adjust} tasks"); } } @@ -193,14 +193,14 @@ public IElasticStage Build() /// True if the task is correctly added to the stages public bool AddTask(string taskId) { - if (taskId == string.Empty) + if (string.IsNullOrEmpty(taskId)) { throw new ArgumentException($"{nameof(taskId)} cannot be empty."); } if (IsCompleted || (_scheduled && FailureState.FailureState == (int)DefaultFailureStates.Fail)) { - Log.Log(Level.Warning, "Taskset " + (IsCompleted ? "completed." : "failed.")); + Log.Log(Level.Warning, "Taskset {0}." ,IsCompleted ? "completed." : "failed."); return false; } @@ -233,16 +233,12 @@ public bool AddTask(string taskId) return false; } - if (!PipelineRoot.AddTask(taskId)) + if (PipelineRoot.AddTask(taskId)) { - return true; + _tasksAdded++; + _missingMasterTasks.Remove(taskId); + _failureMachine.AddDataPoints(1, false); } - - _tasksAdded++; - - _missingMasterTasks.Remove(taskId); - - _failureMachine.AddDataPoints(1, false); } return true; @@ -285,7 +281,7 @@ public bool IsMasterTaskContext(IActiveContext activeContext) } int id = Utils.GetContextNum(activeContext); - return _masterTasks.Select(Utils.GetTaskNum).Any(x => x == id); + return _masterTasks.Any(task => Utils.GetTaskNum(task) == id); } /// @@ -300,16 +296,11 @@ public IConfiguration GetTaskConfiguration(int taskId) ICsConfigurationBuilder confBuilder = TangFactory.GetTang().NewConfigurationBuilder(); IList serializedOperatorsConfs = new List(); - confBuilder.BindNamedParameter( - GenericType.Class, - StageName); - PipelineRoot.GetTaskConfiguration(ref serializedOperatorsConfs, taskId); return confBuilder - .BindList( - GenericType.Class, - serializedOperatorsConfs) + .BindStringNamedParam(StageName) + .BindList(serializedOperatorsConfs) .Build(); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs 
b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs index eee468d189..5310535e9f 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs @@ -69,26 +69,6 @@ public ContextInfo(int id) public int NumRetry { get; set; } } - /// - /// Definition of the the different states in which a task can be. - /// - private enum TaskState - { - Init = 1, - - Queued = 2, - - Submitted = 3, - - Recovering = 4, - - Running = 5, - - Failed = 6, - - Completed = 7 - } - #endregion Private structs #region Private classes @@ -121,10 +101,7 @@ public TaskInfo( ActiveContext = context; EvaluatorId = evaluatorId; Stages = stages; - NumRetry = 1; TaskStatus = status; - RescheduleConfigurations = new Dictionary>(); - Lock = new object(); } /// @@ -158,7 +135,8 @@ public bool IsActiveContextDisposed /// /// Configurations when the task will be rescheduled after a failure. /// - public Dictionary> RescheduleConfigurations { get; set; } + public Dictionary> RescheduleConfigurations = + new Dictionary>(); /// /// Reference to the remote running task. @@ -173,12 +151,7 @@ public bool IsActiveContextDisposed /// /// How many times the task have been scheduled. /// - public int NumRetry { get; set; } - - /// - ///An object used as lock for the task info. - /// - public object Lock { get; private set; } + public int NumRetry = 1; /// /// Save the reference to the remote running task. 
@@ -232,10 +205,7 @@ public void DisposeTask() { if (!_isTaskDisposed) { - if (TaskRunner != null) - { - TaskRunner.Dispose(); - } + TaskRunner?.Dispose(); _isTaskDisposed = true; } @@ -248,10 +218,7 @@ public void DisposeActiveContext() { if (!_isActiveContextDisposed) { - if (ActiveContext != null) - { - ActiveContext.Dispose(); - } + ActiveContext?.Dispose(); _isActiveContextDisposed = true; } @@ -273,42 +240,6 @@ public void Dispose() } } - /// - /// Utility class used to recognize particular task states. - /// - private static class TaskStateUtils - { - private static List recoverable = new List() - { - TaskState.Failed, TaskState.Queued - }; - - private static List notRunnable = new List() - { - TaskState.Failed, TaskState.Completed - }; - - /// - /// Whether a task is recoverable or not. - /// - /// The current state of the task - /// True if the task is recoverable - public static bool IsRecoverable(TaskState state) - { - return recoverable.Contains(state); - } - - /// - /// Whether a task can be run or not. - /// - /// The current state of the task - /// True if the task can be run - public static bool IsRunnable(TaskState state) - { - return !notRunnable.Contains(state); - } - } - /// /// Represent an event triggered by some timeout registered by the task set. /// @@ -378,7 +309,7 @@ public Alarm GetAlarm(long time) private readonly TaskConfigurator _slaveTaskConfiguration; // Task info 0-indexed - private readonly List _taskInfos; + private readonly TaskInfo[] _taskInfos; private readonly Dictionary _stages = new Dictionary(); private readonly ConcurrentQueue _queuedTasks = new ConcurrentQueue(); @@ -419,16 +350,11 @@ public DefaultElasticTaskSetManager( _masterTaskConfiguration = masterTaskConfiguration; _slaveTaskConfiguration = slaveTaskConfiguration ?? 
masterTaskConfiguration; - _taskInfos = new List(numTasks); - - for (int i = 0; i < numTasks; i++) - { - _taskInfos.Add(null); - } + _taskInfos = new TaskInfo[numTasks]; var injector = TangFactory.GetTang().NewInjector(confs); Type parametersType = typeof(DefaultElasticTaskSetManagerParameters); - _parameters = injector.GetInstance(parametersType) as DefaultElasticTaskSetManagerParameters; + _parameters = injector.GetInstance(); // Set up the timeout List msgs = null; @@ -445,12 +371,36 @@ public string StagesId { get { - if (_finalized != true) + if (!_finalized) { throw new IllegalStateException("Task set have to be built before getting its stages"); } - return _stages.Keys.Aggregate((current, next) => current + "+" + next); + return string.Join("+", _stages.Keys); + } + } + + /// + /// Decides whether more contexts have to be added to this Task Manger or not. + /// + /// True if the number of added contexts is less than the available slots + + public bool HasMoreContextToAdd + { + get + { + return _contextsAdded < _numTasks; + } + } + + /// + /// Whether this task set is done. + /// + public bool IsCompleted + { + get + { + return Completed() && _tasksRunning == 0; } } @@ -461,9 +411,9 @@ public string StagesId /// The same finalized task set manager public IElasticTaskSetManager AddStage(IElasticStage stage) { - if (_finalized == true) + if (_finalized) { - throw new IllegalStateException("Cannot add stage to an already built task set manager"); + throw new IllegalStateException("Cannot add stage to an already built task set manager."); } _stages.Add(stage.StageName, stage); @@ -471,16 +421,6 @@ public IElasticTaskSetManager AddStage(IElasticStage stage) return this; } - /// - /// Decides whether more contexts have to be added to this Task Manger or not. 
- /// - /// True if the number of added contexts is less than the available slots - - public bool HasMoreContextToAdd() - { - return _contextsAdded < _numTasks; - } - /// /// Method used to generate unique context ids. /// @@ -571,7 +511,7 @@ public IConfiguration GetCodecConfiguration() /// The new active context public void OnNewActiveContext(IActiveContext activeContext) { - if (_finalized != true) + if (!_finalized) { throw new IllegalStateException("Task set have to be finalized before adding tasks."); } @@ -584,18 +524,20 @@ public void OnNewActiveContext(IActiveContext activeContext) } _hasProgress = true; + + var taskId = Utils.BuildTaskId(StagesId, Utils.GetContextNum(activeContext)); var id = Utils.GetContextNum(activeContext) - 1; - var taskId = Utils.BuildTaskId(StagesId, id + 1); + var taskInfo = _taskInfos[id]; // We reschedule the task only if the context was active (_taskInfos[id] != null) and the task was // actually scheduled at least once (_taskInfos[id].TaskStatus > TaskStatus.Init) - if (_taskInfos[id] != null && _taskInfos[id].TaskStatus > TaskState.Init) + if (taskInfo?.TaskStatus > TaskState.Init) { Log.Log(Level.Info, "{0} already part of task set: going to directly submit it.", taskId); - lock (_taskInfos[id].Lock) + lock (taskInfo) { - _taskInfos[id].UpdateRuntime(activeContext, activeContext.EvaluatorId); + taskInfo.UpdateRuntime(activeContext, activeContext.EvaluatorId); } SubmitTask(id); @@ -628,7 +570,7 @@ public void OnNewActiveContext(IActiveContext activeContext) /// The same finalized task set manager public IElasticTaskSetManager Build() { - if (_finalized == true) + if (_finalized) { throw new IllegalStateException("Task set manager cannot be built more than once"); } @@ -647,33 +589,33 @@ public void OnTaskRunning(IRunningTask task) { if (IsTaskManagedBy(task.Id)) { - var id = Utils.GetTaskNum(task.Id) - 1; + var taskInfo = _taskInfos[Utils.GetTaskNum(task.Id) - 1]; _hasProgress = true; - lock (_taskInfos[id].Lock) + lock 
(taskInfo) { - _taskInfos[id].SetTaskRunner(task); + taskInfo.SetTaskRunner(task); if (Completed() || Failed()) { Log.Log(Level.Info, "Received running from task {0} but task set is completed " + "or failed: ignoring.", task.Id); - _taskInfos[id].Dispose(); + taskInfo.Dispose(); return; } - if (!TaskStateUtils.IsRunnable(_taskInfos[id].TaskStatus)) + if (!taskInfo.TaskStatus.IsRunnable()) { Log.Log(Level.Info, "Received running from task {0} which is not runnable: ignoring.", task.Id); - _taskInfos[id].Dispose(); + taskInfo.Dispose(); return; } - if (_taskInfos[id].TaskStatus != TaskState.Running) + if (taskInfo.TaskStatus != TaskState.Running) { - if (_taskInfos[id].TaskStatus == TaskState.Recovering) + if (taskInfo.TaskStatus == TaskState.Recovering) { foreach (var stage in _stages) { @@ -681,7 +623,7 @@ public void OnTaskRunning(IRunningTask task) } } - _taskInfos[id].SetTaskStatus(TaskState.Running); + taskInfo.SetTaskStatus(TaskState.Running); Interlocked.Increment(ref _tasksRunning); } } @@ -701,7 +643,7 @@ public void OnTaskCompleted(ICompletedTask taskInfo) var id = Utils.GetTaskNum(taskInfo.Id) - 1; _hasProgress = true; - lock (_taskInfos[id].Lock) + lock (_taskInfos[id]) { _taskInfos[id].SetTaskStatus(TaskState.Completed); } @@ -744,14 +686,6 @@ public void OnTaskMessage(ITaskMessage message) } } - /// - /// Whether this task set is done. - /// - public bool IsCompleted() - { - return Completed() && _tasksRunning == 0; - } - #region Failure Response /// @@ -851,7 +785,7 @@ public void OnTaskFailure(IFailedTask task, ref List failureEvent Log.Log(Level.Info, "Received a failure from task {0} but the task set is completed or " + "failed: ignoring the failure", task.Id, task.AsError()); - lock (_taskInfos[id].Lock) + lock (_taskInfos[id]) { _taskInfos[id].SetTaskStatus(TaskState.Failed); } @@ -863,7 +797,7 @@ public void OnTaskFailure(IFailedTask task, ref List failureEvent failureEvents = failureEvents ?? 
new List(); - lock (_taskInfos[id].Lock) + lock (_taskInfos[id]) { if (_taskInfos[id].TaskStatus < TaskState.Failed) { @@ -910,7 +844,7 @@ public void OnEvaluatorFailure(IFailedEvaluator evaluator) var failedTask = evaluator.FailedTask.Value; var id = Utils.GetTaskNum(failedTask.Id) - 1; - lock (_taskInfos[id].Lock) + lock (_taskInfos[id]) { _taskInfos[id].DropRuntime(); } @@ -930,7 +864,7 @@ public void OnEvaluatorFailure(IFailedEvaluator evaluator) if (_taskInfos[id] != null) { - lock (_taskInfos[id].Lock) + lock (_taskInfos[id]) { _taskInfos[id].DropRuntime(); _taskInfos[id].SetTaskStatus(TaskState.Failed); @@ -1077,7 +1011,7 @@ public void Dispose() { if (info != null) { - lock (info.Lock) + lock (info) { info.Dispose(); } @@ -1211,26 +1145,27 @@ private void SubmitTasks() private void SubmitTask(int id) { + var taskInfo = _taskInfos[id]; if (Completed() || Failed()) { - Log.Log(Level.Warning, "Task submit for a completed or failed Task Set: ignoring"); - _taskInfos[id].DisposeTask(); + Log.Log(Level.Warning, "Task submit for a completed or failed Task Set: ignoring."); + taskInfo.DisposeTask(); return; } - lock (_taskInfos[id].Lock) + lock (taskInfo) { // Check that the task was not already submitted. This may happen for instance if // _scheduled is set to true and a new active context message is received. 
- if (_taskInfos[id].TaskStatus == TaskState.Submitted) + if (taskInfo.TaskStatus == TaskState.Submitted) { return; } - var stages = _taskInfos[id].Stages; + var stages = taskInfo.Stages; ICsConfigurationBuilder confBuilder = TangFactory.GetTang().NewConfigurationBuilder(); - var rescheduleConfs = _taskInfos[id].RescheduleConfigurations; + var rescheduleConfs = taskInfo.RescheduleConfigurations; foreach (var stage in stages) { @@ -1253,17 +1188,17 @@ private void SubmitTask(int id) _driverId) .Build(); - IConfiguration mergedTaskConf = Configurations.Merge(_taskInfos[id].TaskConfiguration, baseConf); + IConfiguration mergedTaskConf = Configurations.Merge(taskInfo.TaskConfiguration, baseConf); - if (_taskInfos[id].IsActiveContextDisposed) + if (taskInfo.IsActiveContextDisposed) { Log.Log(Level.Warning, "Task submit for {0} with a non-active context: spawning a new evaluator.", id + 1); - if (_taskInfos[id].TaskStatus == TaskState.Failed) + if (taskInfo.TaskStatus == TaskState.Failed) { _queuedTasks.Enqueue(id + 1); - _taskInfos[id].SetTaskStatus(TaskState.Queued); + taskInfo.SetTaskStatus(TaskState.Queued); SpawnNewEvaluator(id); } @@ -1271,16 +1206,11 @@ private void SubmitTask(int id) return; } - _taskInfos[id].ActiveContext.SubmitTask(mergedTaskConf); - - if (TaskStateUtils.IsRecoverable(_taskInfos[id].TaskStatus)) - { - _taskInfos[id].SetTaskStatus(TaskState.Recovering); - } - else - { - _taskInfos[id].SetTaskStatus(TaskState.Submitted); - } + taskInfo.ActiveContext.SubmitTask(mergedTaskConf); + taskInfo.SetTaskStatus( + taskInfo.TaskStatus.IsRecoverable() ? + TaskState.Recovering : + TaskState.Submitted); } } @@ -1291,27 +1221,23 @@ private void SendToTasks(IList messages, int retry = 0) if (returnMessage != null) { var destination = Utils.GetTaskNum(returnMessage.Destination) - 1; + var taskInfo = _taskInfos[destination] ?? 
throw new ArgumentNullException("Task Info"); - if (_taskInfos[destination] == null) - { - throw new ArgumentNullException("Task Info"); - } - lock (_taskInfos[destination].Lock) + lock (taskInfo) { if (Completed() || Failed()) { - Log.Log(Level.Warning, "Task submit for a completed or failed Task Set: ignoring"); - _taskInfos[destination].DisposeTask(); + Log.Log(Level.Warning, "Task submit for a completed or failed Task Set: ignoring."); + taskInfo.DisposeTask(); return; } - if (_taskInfos[destination].TaskStatus != TaskState.Running || - _taskInfos[destination].TaskRunner == null) + if (taskInfo.TaskStatus != TaskState.Running || + taskInfo.TaskRunner == null) { - var msg = string.Format("Cannot send message to {0}:", destination + 1); - msg += ": Task Status is " + _taskInfos[destination].TaskStatus; + var msg = $"Cannot send message to {destination + 1}: Task Status is {taskInfo.TaskStatus}:"; - if (_taskInfos[destination].TaskStatus == TaskState.Submitted && retry < _parameters.Retry) + if (taskInfo.TaskStatus == TaskState.Submitted && retry < _parameters.Retry) { Log.Log(Level.Warning, msg + " Retry"); System.Threading.Tasks.Task.Run(() => @@ -1333,7 +1259,7 @@ private void SendToTasks(IList messages, int retry = 0) continue; } - _taskInfos[destination].TaskRunner.Send(returnMessage.Serialize()); + taskInfo.TaskRunner.Send(returnMessage.Serialize()); } } } @@ -1357,7 +1283,7 @@ private void Reschedule(RescheduleEvent rescheduleEvent) { var id = Utils.GetTaskNum(rescheduleEvent.TaskId) - 1; - lock (_taskInfos[id].Lock) + lock (_taskInfos[id]) { _taskInfos[id].NumRetry++; @@ -1389,22 +1315,21 @@ private void Fail(string taskId = "") private void LogFinalStatistics() { - var msg = string.Format("Total Failed Tasks: {0}\nTotal Failed Evaluators: {1}", + Log.Log(Level.Info, "Total Failed Tasks: {0}\nTotal Failed Evaluators: {1}\n{2}", _totFailedTasks, - _totFailedEvaluators); - msg += _stages.Select(x => x.Value.LogFinalStatistics()).Aggregate((a, b) => a + "\n" 
+ b); - Log.Log(Level.Info, msg); + _totFailedEvaluators, + string.Join("\n", _stages.Select(x => x.Value.LogFinalStatistics()))); } private bool Completed() { if (!_completed) { - _completed = _stages.Select(stage => stage.Value.IsCompleted).Aggregate((com1, com2) => com1 && com2); + _completed = _stages.Values.All(stage => stage.IsCompleted); if (_completed) { - Log.Log(Level.Info, "Task set completed"); + Log.Log(Level.Info, "Task set completed."); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs index 1447a5cfd1..934b5a113e 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs @@ -53,7 +53,7 @@ private DefaultElasticTaskSetManagerParameters( NewEvaluatorNumCores = numCores; NewEvaluatorMemorySize = memorySize; - System.Threading.Tasks.Task.Factory.StartNew(() => Clock.Run(), TaskCreationOptions.LongRunning); + System.Threading.Tasks.Task.Factory.StartNew(Clock.Run, TaskCreationOptions.LongRunning); } /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/TaskState.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/TaskState.cs new file mode 100644 index 0000000000..0e828337c3 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/TaskState.cs @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System.Linq; + +namespace Org.Apache.REEF.Network.Elastic.Driver.Default +{ + /// + /// Definition of the the different states in which a task can be. + /// + internal enum TaskState + { + Init = 1, + + Queued = 2, + + Submitted = 3, + + Recovering = 4, + + Running = 5, + + Failed = 6, + + Completed = 7 + } + + /// + /// Utility class used to recognize particular task states. + /// + internal static class TaskStateUtils + { + private static readonly TaskState[] Recoverable = { TaskState.Failed, TaskState.Queued }; + + private static readonly TaskState[] NotRunnable = { TaskState.Failed, TaskState.Completed }; + + /// + /// Whether a task is recoverable or not. + /// + /// The current state of the task + /// True if the task is recoverable + public static bool IsRecoverable(this TaskState state) + { + return Recoverable.Contains(state); + } + + /// + /// Whether a task can be run or not. 
+ /// + /// The current state of the task + /// True if the task can be run + public static bool IsRunnable(this TaskState state) + { + return !NotRunnable.Contains(state); + } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetManager.cs index 4559cc5184..7fa4fdad16 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetManager.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/IElasticTaskSetManager.cs @@ -40,6 +40,17 @@ public interface IElasticTaskSetManager : IFailureResponse, IDisposable /// string StagesId { get; } + /// + /// Decides whether more contexts have to be added to this Task Manger or not. + /// + /// True if the number of added contexts is less than the available slots + bool HasMoreContextToAdd { get; } + + /// + /// Whether this task set manger is done. + /// + bool IsCompleted { get; } + /// /// Subscribe the current task set manager to a new stage. /// @@ -47,12 +58,6 @@ public interface IElasticTaskSetManager : IFailureResponse, IDisposable /// The task manager with the added stage IElasticTaskSetManager AddStage(IElasticStage stage); - /// - /// Decides whether more contexts have to be added to this Task Manger or not. - /// - /// True if the number of added contexts is less than the available slots - bool HasMoreContextToAdd(); - /// /// Method used to generate unique context ids. /// @@ -132,11 +137,6 @@ public interface IElasticTaskSetManager : IFailureResponse, IDisposable /// The context identifier bool IsEvaluatorManagedBy(string id); - /// - /// Whether this task set manger is done. - /// - bool IsCompleted(); - /// /// Used to react on a task failure. 
/// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs index a687cda909..42ca8b6f21 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs @@ -43,8 +43,8 @@ internal sealed class FailuresClock : IClock private static int numberOfInstantiations = 0; private readonly ITimer _timer; - private readonly PubSubSubject /// The configuration builder the binding will be added to - protected override void PhysicalOperatorConfiguration(ref ICsConfigurationBuilder confBuilder) + /// The physcal operator configurations + protected override IConfiguration PhysicalOperatorConfiguration() { + return TangFactory.GetTang().NewConfigurationBuilder().Build(); } /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs index 8038ad88a8..96b003677c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs @@ -304,13 +304,10 @@ public virtual ElasticOperator BuildState() /// /// Generate the data serializer configuration for the target operator. 
/// - /// The conf builder where to attach the codec configuration - internal virtual void GetCodecConfiguration(ref IConfiguration confBuilder) + /// The conf builder where to attach the codec configuration + internal virtual void GetCodecConfiguration(ref IConfiguration conf) { - if (_next != null) - { - _next.GetCodecConfiguration(ref confBuilder); - } + _next?.GetCodecConfiguration(ref conf); } /// @@ -319,12 +316,7 @@ internal virtual void GetCodecConfiguration(ref IConfiguration confBuilder) /// True if this is the last iterator public virtual bool CheckIfLastIterator() { - if (_next == null) - { - return true; - } - - return _next.CheckIfLastIterator(); + return _next?.CheckIfLastIterator() ?? true; } /// @@ -388,10 +380,7 @@ internal void GetTaskConfiguration(ref IList serializedOperatorsConfs, i { GetOperatorConfiguration(ref serializedOperatorsConfs, taskId); - if (_next != null) - { - _next.GetTaskConfiguration(ref serializedOperatorsConfs, taskId); - } + _next?.GetTaskConfiguration(ref serializedOperatorsConfs, taskId); } else { @@ -405,14 +394,7 @@ internal void GetTaskConfiguration(ref IList serializedOperatorsConfs, i /// True if the operator is ready to be scheduled internal bool CanBeScheduled() { - bool canBeScheduled = _topology.CanBeScheduled(); - - if (canBeScheduled && _next != null) - { - return _next.CanBeScheduled(); - } - - return canBeScheduled; + return _topology.CanBeScheduled() && (_next?.CanBeScheduled() ?? 
true); } /// @@ -421,17 +403,14 @@ internal bool CanBeScheduled() /// The id of the master tasks of the current and successive operators internal virtual void GatherMasterIds(ref HashSet masterTasks) { - if (_operatorFinalized != true) + if (!_operatorFinalized) { throw new IllegalStateException("Operator need to be build before gathering information."); } masterTasks.Add(Utils.BuildTaskId(Stage.StageName, MasterId)); - if (_next != null) - { - _next.GatherMasterIds(ref masterTasks); - } + _next?.GatherMasterIds(ref masterTasks); } /// @@ -440,29 +419,24 @@ internal virtual void GatherMasterIds(ref HashSet masterTasks) /// internal virtual string LogFinalStatistics() { - var str = LogInternalStatistics(); - - if (_next != null) - { - str += _next.LogFinalStatistics(); - } - - return str; + return LogInternalStatistics() + _next?.LogFinalStatistics(); } /// /// Appends the message type to the configuration. /// /// The type of the messages the operator is configured to accept - /// The configuration builder the message type will be added to - protected void SetMessageType(Type operatorType, ref ICsConfigurationBuilder confBuilder) + /// The conf builder with added the message type + protected IConfiguration SetMessageType(Type operatorType) { if (operatorType.IsGenericType) { var genericTypes = operatorType.GenericTypeArguments; var msgType = genericTypes[0]; - confBuilder.BindNamedParameter( - GenericType.Class, msgType.AssemblyQualifiedName); + + return TangFactory.GetTang().NewConfigurationBuilder() + .BindStringNamedParam(msgType.AssemblyQualifiedName) + .Build(); } else { @@ -492,32 +466,19 @@ protected void OnNewIteration(int iteration) /// The task id of the task that belongs to this operator protected virtual void GetOperatorConfiguration(ref IList serializedOperatorsConfs, int taskId) { - ICsConfigurationBuilder operatorBuilder = TangFactory.GetTang().NewConfigurationBuilder(); - - _topology.GetTaskConfiguration(ref operatorBuilder, taskId); - - 
PhysicalOperatorConfiguration(ref operatorBuilder); - - if (!Stage.IsIterative && _next == null) - { - operatorBuilder.BindNamedParameter( - GenericType.Class, - true.ToString(CultureInfo.InvariantCulture)); - } - - IConfiguration operatorConf = operatorBuilder - .BindNamedParameter( - GenericType.Class, - _id.ToString(CultureInfo.InvariantCulture)) - .BindNamedParameter( - GenericType.Class, - ((int)_checkpointLevel).ToString(CultureInfo.InvariantCulture)) + var operatorBuilderWithTaskConf = _topology.GetTaskConfiguration(taskId); + var operatorBuilderWithTaskAndPhysicalConf = PhysicalOperatorConfiguration(); + IConfiguration operatorConf = TangFactory.GetTang().NewConfigurationBuilder() + .BindNamedParam("" + (!Stage.IsIterative && _next == null)) + .BindIntNamedParam("" + _id) + .BindIntNamedParam("" + (int)_checkpointLevel) .Build(); - foreach (var conf in _configurations) - { - operatorConf = Configurations.Merge(operatorConf, conf); - } + operatorConf = Configurations.Merge( + operatorConf, + operatorBuilderWithTaskConf, + operatorBuilderWithTaskAndPhysicalConf, + Configurations.Merge(_configurations)); Stage.Context.SerializeOperatorConfiguration(ref serializedOperatorsConfs, operatorConf); } @@ -566,8 +527,8 @@ protected virtual string LogInternalStatistics() /// /// Binding from logical to physical operator. 
/// - /// The configuration builder the binding will be added to - protected abstract void PhysicalOperatorConfiguration(ref ICsConfigurationBuilder builder); + /// The physcal operator configuration + protected abstract IConfiguration PhysicalOperatorConfiguration(); private ITopology GetTopology(TopologyType topologyType) { @@ -581,8 +542,8 @@ private ITopology GetTopology(TopologyType topologyType) default: throw new ArgumentException( - nameof(topologyType), - $"Topology type {topologyType} not supported by {OperatorType.ToString()}."); + nameof(topologyType), + $"Topology type {topologyType} not supported by {OperatorType.ToString()}."); } return topology; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs index bcbd249b1a..d6506e94ab 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs @@ -77,9 +77,9 @@ public interface ITopology /// Adds the topology configuration for the input task to the input builder. /// Must be called only after all tasks have been added to the topology, i.e., after build. /// - /// The configuration builder the configuration will be appended to /// The task id of the task that belongs to this Topology - void GetTaskConfiguration(ref ICsConfigurationBuilder builder, int taskId); + /// The task configuration + IConfiguration GetTaskConfiguration(int taskId); /// /// Utility method for logging the topology state. 
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs index ae20ddfd25..9d5caeca59 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs @@ -23,6 +23,7 @@ using Org.Apache.REEF.Network.Elastic.Failures; using Org.Apache.REEF.Utilities; using Org.Apache.REEF.Utilities.Attributes; +using Org.Apache.REEF.Tang.Implementations.Tang; namespace Org.Apache.REEF.Network.Elastic.Topology.Logical.Impl { @@ -116,10 +117,11 @@ public ITopology Build() /// Adds the topology configuration for the input task to the input builder. /// This method does nothig. /// - /// The configuration builder the configuration will be appended to /// The task id of the task that belongs to this Topology - public void GetTaskConfiguration(ref ICsConfigurationBuilder confBuilder, int taskId) + /// The task configuration + public IConfiguration GetTaskConfiguration(int taskId) { + return TangFactory.GetTang().NewConfigurationBuilder().Build(); } /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs index 565d6477da..00b94ec541 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs @@ -29,6 +29,7 @@ using Org.Apache.REEF.Network.Elastic.Comm.Impl; using Org.Apache.REEF.Utilities.Attributes; using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; +using Org.Apache.REEF.Tang.Implementations.Tang; namespace Org.Apache.REEF.Network.Elastic.Topology.Logical.Impl { @@ -263,15 +264,17 @@ public string LogTopologyState() /// Adds the topology configuration for the input task to the input builder. 
/// Must be called only after all tasks have been added to the topology, i.e., after build. /// - /// The configuration builder the configuration will be appended to /// The task id of the task that belongs to this Topology - public void GetTaskConfiguration(ref ICsConfigurationBuilder confBuilder, int taskId) + /// The task configuration + public IConfiguration GetTaskConfiguration(int taskId) { if (!_finalized) { throw new IllegalStateException("Cannot get task configuration from a not finalized topology."); } + var confBuilder = TangFactory.GetTang().NewConfigurationBuilder(); + if (taskId == _rootId) { var root = _nodes[_rootId]; @@ -283,9 +286,12 @@ public void GetTaskConfiguration(ref ICsConfigurationBuilder confBuilder, int ta tId.TaskId.ToString(CultureInfo.InvariantCulture)); } } - confBuilder.BindNamedParameter( + + return confBuilder + .BindNamedParameter( GenericType.Class, - _rootId.ToString(CultureInfo.InvariantCulture)); + _rootId.ToString(CultureInfo.InvariantCulture)) + .Build(); } /// From 301ea516b196053295c60e95bd4b69f02bc2bcea Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Thu, 21 Feb 2019 09:47:05 -0800 Subject: [PATCH 23/29] Done with another pass. 
--- .../Default/DefaultElasticTaskSetManager.cs | 30 +++++++++++++------ .../Elastic/Failures/FailuresClock.cs | 10 +++++-- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs index 5310535e9f..246c9e9fb2 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs @@ -159,6 +159,11 @@ public bool IsActiveContextDisposed /// The reference to the remote running task public void SetTaskRunner(IRunningTask taskRunner) { + if (_isDisposed) + { + throw new IllegalStateException("Cannot set task runner for a disposed task."); + } + TaskRunner = taskRunner; _isTaskDisposed = false; } @@ -179,9 +184,13 @@ public void SetTaskStatus(TaskState status) /// The id of the evaluator public void UpdateRuntime(IActiveContext newActiveContext, string evaluatorId) { + if (_isDisposed) + { + throw new IllegalStateException("Cannot update runtime for a disposed task."); + } if (!_isActiveContextDisposed) { - throw new IllegalStateException("Updating Task with not disposed active context"); + throw new IllegalStateException("Updating Task with not disposed active context."); } ActiveContext = newActiveContext; @@ -205,9 +214,9 @@ public void DisposeTask() { if (!_isTaskDisposed) { - TaskRunner?.Dispose(); - _isTaskDisposed = true; + + TaskRunner?.Dispose(); } } @@ -218,9 +227,9 @@ public void DisposeActiveContext() { if (!_isActiveContextDisposed) { - ActiveContext?.Dispose(); - _isActiveContextDisposed = true; + + ActiveContext?.Dispose(); } } @@ -231,11 +240,11 @@ public void Dispose() { if (!_isDisposed) { + _isDisposed = true; + DisposeTask(); DisposeActiveContext(); - - _isDisposed = true; } } } @@ -1230,7 +1239,7 @@ private void SendToTasks(IList messages, int retry 
= 0) Log.Log(Level.Warning, "Task submit for a completed or failed Task Set: ignoring."); taskInfo.DisposeTask(); - return; + continue; } if (taskInfo.TaskStatus != TaskState.Running || taskInfo.TaskRunner == null) @@ -1315,10 +1324,13 @@ private void Fail(string taskId = "") private void LogFinalStatistics() { - Log.Log(Level.Info, "Total Failed Tasks: {0}\nTotal Failed Evaluators: {1}\n{2}", + if (Log.IsLoggable(Level.Info)) + { + Log.Log(Level.Info, "Total Failed Tasks: {0}\nTotal Failed Evaluators: {1}\n{2}", _totFailedTasks, _totFailedEvaluators, string.Join("\n", _stages.Select(x => x.Value.LogFinalStatistics()))); + } } private bool Completed() diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs index 42ca8b6f21..99766d0e0d 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/FailuresClock.cs @@ -52,7 +52,7 @@ internal sealed class FailuresClock : IClock private readonly IInjectionFuture>> _runtimeStopHandler; private readonly IInjectionFuture>> _idleHandler; - private bool _disposed; + private volatile bool _disposed; /// /// Create a new failure clock with injectable IObservers. @@ -145,12 +145,18 @@ public bool IsIdle() /// public void Dispose() { + if (_disposed) + { + return; + } + + _disposed = true; + lock (_schedule) { _schedule.Clear(); _schedule.Add(new StopTime(_timer.CurrentTime)); Monitor.PulseAll(_schedule); - _disposed = true; } } From 739f9d449bfa9466e30ecbcab13e060b466d3c67 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Fri, 22 Feb 2019 11:37:56 -0800 Subject: [PATCH 24/29] Another round done. 
--- .../Driver/Default/DefaultElasticContext.cs | 15 +++++------ .../Default/DefaultElasticTaskSetManager.cs | 23 +++++++--------- .../Default/DefaultFailureStateMachine.cs | 26 +++++++++---------- 3 files changed, 28 insertions(+), 36 deletions(-) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs index 63a3df4157..8b4ac0614c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticContext.cs @@ -117,18 +117,15 @@ public IElasticStage DefaultStage() { lock (_subsLock) { - IElasticStage defaultStage; - _stages.TryGetValue(_defaultStageName, out defaultStage); - - if (defaultStage == null) + if (_stages.TryGetValue(_defaultStageName, out IElasticStage defaultStage)) { - CreateNewStage( - _defaultStageName, - _numEvaluators, - _defaultFailureMachine.Clone(_numEvaluators, (int)DefaultFailureStates.Fail)); + return defaultStage; } - return _stages[_defaultStageName]; + return CreateNewStage( + _defaultStageName, + _numEvaluators, + _defaultFailureMachine.Clone(_numEvaluators, (int)DefaultFailureStates.Fail)); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs index 246c9e9fb2..d37a620975 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs @@ -557,16 +557,10 @@ public void OnNewActiveContext(IActiveContext activeContext) Log.Log(Level.Info, "Task {0} to be scheduled on {1}", taskId, activeContext.EvaluatorId); - List partialTaskConfs = new List(); - - if (isMaster) + List partialTaskConfs = new List { - partialTaskConfs.Add(_masterTaskConfiguration(taskId)); - } 
- else - { - partialTaskConfs.Add(_slaveTaskConfiguration(taskId)); - } + isMaster ? _masterTaskConfiguration(taskId) : _slaveTaskConfiguration(taskId) + }; AddTask(taskId, activeContext, partialTaskConfs); } @@ -658,7 +652,7 @@ public void OnTaskCompleted(ICompletedTask taskInfo) } if (Completed()) { - foreach (var info in _taskInfos.Where(info => info != null && info.TaskStatus < TaskState.Failed)) + foreach (var info in _taskInfos.Where(info =>info?.TaskStatus < TaskState.Failed)) { info.DisposeTask(); } @@ -870,13 +864,14 @@ public void OnEvaluatorFailure(IFailedEvaluator evaluator) if (_evaluatorToContextIdMapping.TryRemove(evaluator.Id, out ContextInfo cinfo)) { int id = cinfo.Id - 1; + var taskInfo = _taskInfos[id]; - if (_taskInfos[id] != null) + if (taskInfo != null) { - lock (_taskInfos[id]) + lock (taskInfo) { - _taskInfos[id].DropRuntime(); - _taskInfos[id].SetTaskStatus(TaskState.Failed); + taskInfo.DropRuntime(); + taskInfo.SetTaskStatus(TaskState.Failed); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs index 7d9d547812..b853c93a02 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs @@ -56,8 +56,8 @@ public sealed class DefaultFailureStateMachine : IFailureStateMachine { DefaultFailureStates.Fail, DefaultFailureStates.StopAndReschedule } }; - private readonly IDictionary transitionWeights = - new Dictionary() + private readonly SortedDictionary transitionWeights = + new SortedDictionary() { { DefaultFailureStates.ContinueAndReconfigure, 0.01F }, { DefaultFailureStates.ContinueAndReschedule, 0.40F }, @@ -241,14 +241,16 @@ public void SetThreshold(IFailureState level, float threshold) /// Pairs of failure states with related new thresholds public void SetThresholds(params 
Tuple[] weights) { - if (!weights.All(weight => weight.Item1 is DefaultFailureState)) + foreach (var weight in weights) { - throw new ArgumentException("Input is not of type DefaultFailureStateMachine,"); - } - - if (weights.Any(weight => weight.Item1.FailureState == (int)DefaultFailureStates.Continue)) - { - throw new ArgumentException("Cannot change the threshold for Continue state."); + if (!(weight.Item1 is DefaultFailureState)) + { + throw new ArgumentException("Input is not of type DefaultFailureStateMachine."); + } + if (weight.Item1.FailureState == (int)DefaultFailureStates.Continue) + { + throw new ArgumentException("Cannot change the threshold for Continue state."); + } } lock (_statusLock) @@ -275,7 +277,7 @@ public IFailureStateMachine Clone( { var newMachine = new DefaultFailureStateMachine(initalPoints, (DefaultFailureStates)initalState); - foreach (DefaultFailureStates state in transitionWeights.Keys.OrderByDescending(x => x)) + foreach (DefaultFailureStates state in transitionWeights.Keys) { newMachine.SetThreshold(new DefaultFailureState((int)state), transitionWeights[state]); } @@ -294,9 +296,8 @@ private void CheckConsistency() var state = DefaultFailureStates.ContinueAndReconfigure; float prevWeight = transitionWeights[state]; state = transitionMapUp[state]; - float nextWeight = transitionWeights[state]; - while (nextWeight >= 0) + while (transitionWeights.TryGetValue(state, out float nextWeight)) { if (nextWeight < prevWeight) { @@ -312,7 +313,6 @@ private void CheckConsistency() } state = transitionMapUp[state]; - transitionWeights.TryGetValue(state, out nextWeight); } } } From bc80651ec02a4e41ac676ff4b3958cd47c5a7df2 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Tue, 26 Feb 2019 15:52:50 -0800 Subject: [PATCH 25/29] Done with another round plus few minor bug fixes. 
--- .../ElasticBroadcastClient.cs | 2 +- .../ElasticBroadcastDriverWithFailures.cs | 5 +- .../Impl/ElasticGroupCommunicationMessage.cs | 13 +- .../Driver/Default/DefaultElasticStage.cs | 6 +- .../Default/DefaultElasticTaskSetManager.cs | 2 +- .../Default/DefaultFailureStateMachine.cs | 42 +++--- .../Failures/Default/DefaultFailureStates.cs | 56 ++++++++ .../Elastic/Failures/Default/FailEvent.cs | 3 +- .../Failures/Default/ReconfigureEvent.cs | 24 +--- .../Logical/Default/DefaultBroadcast.cs | 6 +- .../Operators/Logical/Default/DefaultEmpty.cs | 10 +- .../Logical/Default/DefaultOneToN.cs | 39 ++---- ...> ElasticOperatorWithDefaultDispatcher.cs} | 51 +++---- .../Operators/Logical/ElasticOperator.cs | 124 +++++++----------- .../Elastic/Operators/OperatorType.cs | 18 --- .../Physical/Default/DefaultOneToN.cs | 11 +- .../Operators/Physical/IElasticIterator.cs | 15 +-- .../Operators/Physical/IElasticOperator.cs | 9 +- .../Elastic/Operators/Physical/IReceiver.cs | 2 +- .../Operators/Physical/IReschedulable.cs | 34 +++++ .../Elastic/Operators/Physical/ISender.cs | 2 +- .../Elastic/Task/CommunicationLayer.cs | 18 +-- .../Task/Default/DefaultCommunicationLayer.cs | 6 +- .../DefaultTaskToDriverMessageDispatcher.cs | 3 +- .../Default/IDefaultTaskToDrivermessages.cs | 3 + .../Task/ElasticDriverMessageHandler.cs | 8 +- .../Elastic/Task/IIdentifiable.cs | 33 +++++ .../Elastic/Task/INodeIdentifier.cs | 38 ++++++ .../Elastic/Task/NodeIdentifier.cs | 51 +++++++ .../Elastic/Task/NodeObserverIdentifier.cs | 114 ---------------- .../Task/TaskToDriverMessageDispatcher.cs | 16 +-- .../Elastic/Task/Workflow.cs | 6 +- .../IOperatorTopologyWithCommunication.cs | 11 +- .../Topology/Physical/OperatorTopology.cs | 9 +- 34 files changed, 387 insertions(+), 403 deletions(-) rename lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/{ElastiOperatorWithDefaultDispatcher.cs => ElasticOperatorWithDefaultDispatcher.cs} (86%) create mode 100644 
lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IReschedulable.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/IIdentifiable.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/INodeIdentifier.cs create mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeIdentifier.cs delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs diff --git a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs index e8f98fdf88..960227e855 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples.Client/ElasticBroadcastClient.cs @@ -52,7 +52,7 @@ public ElasticBroadcastClient( int portRange, string jobIdentifier) { - string driverId = GenericType.Class.ToString(); + string driverId = typeof(T).Name; JobIdentifier = jobIdentifier; IConfiguration driverConfig = TangFactory.GetTang() diff --git a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs index dc6321c62a..521457137d 100644 --- a/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs +++ b/lang/cs/Org.Apache.REEF.Network.Examples/Elastic/WithFailures/ElasticBroadcastDriverWithFailures.cs @@ -27,6 +27,7 @@ using Org.Apache.REEF.Network.Elastic.Failures; using Org.Apache.REEF.Network.Elastic.Failures.Default; using Org.Apache.REEF.Network.Elastic.Task.Default; +using Org.Apache.REEF.Network.Elastic.Config; namespace Org.Apache.REEF.Network.Examples.Elastic { @@ -39,8 +40,8 @@ public sealed class ElasticBroadcastDriverWithFailures { [Inject] private ElasticBroadcastDriverWithFailures( - string stageName, - int numEvaluators, + 
[Parameter(typeof(ElasticServiceConfigurationOptions.DefaultStageName))] string stageName, + [Parameter(typeof(ElasticServiceConfigurationOptions.NumEvaluators))] int numEvaluators, IElasticContext context) : base(context) { IFailureStateMachine failureMachine = new DefaultFailureStateMachine(); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs index e439e8569f..718041d300 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +using Org.Apache.REEF.Network.Elastic.Task; using Org.Apache.REEF.Utilities.Attributes; using System; @@ -24,7 +25,7 @@ namespace Org.Apache.REEF.Network.Elastic.Comm.Impl /// Message sent by Group Communication operators. /// [Unstable("0.16", "API may change")] - public abstract class ElasticGroupCommunicationMessage : ICloneable + public abstract class ElasticGroupCommunicationMessage : ICloneable, INodeIdentifier, IIdentifiable { /// /// Create a new elastic group communication message. @@ -37,6 +38,7 @@ protected ElasticGroupCommunicationMessage( { StageName = stageName; OperatorId = operatorId; + NodeId = new NodeIdentifier(StageName, OperatorId); } /// @@ -46,11 +48,16 @@ protected ElasticGroupCommunicationMessage( /// /// Returns the stage. - internal string StageName { get; private set; } + public string StageName { get; private set; } /// /// Returns the operator id. /// - internal int OperatorId { get; private set; } + public int OperatorId { get; private set; } + + /// + /// The identifier for the node. 
+ /// + public NodeIdentifier NodeId { get; private set; } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs index ce650d1f6f..ab3d6ffbd8 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs @@ -119,7 +119,7 @@ internal DefaultElasticStage( /// public bool IsCompleted { - get { return FailureState.FailureState == (int)DefaultFailureStates.Complete; } + get { return FailureState.FailureState.IsComplete(); } } /// @@ -198,7 +198,7 @@ public bool AddTask(string taskId) throw new ArgumentException($"{nameof(taskId)} cannot be empty."); } - if (IsCompleted || (_scheduled && FailureState.FailureState == (int)DefaultFailureStates.Fail)) + if (IsCompleted || (_scheduled && FailureState.FailureState.IsFail())) { Log.Log(Level.Warning, "Taskset {0}." ,IsCompleted ? "completed." 
: "failed."); return false; @@ -348,7 +348,7 @@ public void Complete() /// The final statistics for the computation public string LogFinalStatistics() { - if (IsCompleted || FailureState.FailureState == (int)DefaultFailureStates.Fail) + if (IsCompleted || FailureState.FailureState.IsFail()) { return PipelineRoot.LogFinalStatistics(); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs index d37a620975..e7e6127972 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs @@ -1345,7 +1345,7 @@ private bool Completed() private bool Failed() { - return _failureStatus.FailureState == (int)DefaultFailureStates.Fail; + return _failureStatus.FailureState.IsFail(); } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs index b853c93a02..a904322bb2 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStateMachine.cs @@ -38,8 +38,8 @@ public sealed class DefaultFailureStateMachine : IFailureStateMachine { private readonly object _statusLock = new object(); - private readonly SortedDictionary transitionMapUp = - new SortedDictionary() + private readonly static SortedDictionary TransitionMapUp = + new SortedDictionary { { DefaultFailureStates.Continue, DefaultFailureStates.ContinueAndReconfigure }, { DefaultFailureStates.ContinueAndReconfigure, DefaultFailureStates.ContinueAndReschedule }, @@ -47,8 +47,8 @@ public sealed class DefaultFailureStateMachine : IFailureStateMachine { DefaultFailureStates.StopAndReschedule, 
DefaultFailureStates.Fail } }; - private readonly SortedDictionary transitionMapDown = - new SortedDictionary() + private readonly static SortedDictionary TransitionMapDown = + new SortedDictionary { { DefaultFailureStates.ContinueAndReconfigure, DefaultFailureStates.Continue }, { DefaultFailureStates.ContinueAndReschedule, DefaultFailureStates.ContinueAndReconfigure }, @@ -57,7 +57,7 @@ public sealed class DefaultFailureStateMachine : IFailureStateMachine }; private readonly SortedDictionary transitionWeights = - new SortedDictionary() + new SortedDictionary { { DefaultFailureStates.ContinueAndReconfigure, 0.01F }, { DefaultFailureStates.ContinueAndReschedule, 0.40F }, @@ -65,7 +65,7 @@ public sealed class DefaultFailureStateMachine : IFailureStateMachine { DefaultFailureStates.Fail, 0.80F } }; - private static List canMoveToComplete = new List() + private readonly static int[] CanMoveToComplete = new int[] { (int)DefaultFailureStates.Continue, (int)DefaultFailureStates.ContinueAndReconfigure, @@ -73,7 +73,7 @@ public sealed class DefaultFailureStateMachine : IFailureStateMachine (int)DefaultFailureStates.Complete }; - private static List isFinalState = new List() + private readonly static int[] IsFinalState = new int[] { (int)DefaultFailureStates.Complete }; @@ -129,7 +129,7 @@ public IFailureState AddDataPoints(int points, bool isNew) { lock (_statusLock) { - if (isFinalState.Contains(State.FailureState)) + if (IsFinalState.Contains(State.FailureState)) { return State; } @@ -150,7 +150,7 @@ public IFailureState AddDataPoints(int points, bool isNew) while (State.FailureState > (int)DefaultFailureStates.Continue && currentRate < transitionWeights[(DefaultFailureStates)State.FailureState]) { - State.FailureState = (int)transitionMapDown[(DefaultFailureStates)State.FailureState]; + State.FailureState = (int)TransitionMapDown[(DefaultFailureStates)State.FailureState]; } } @@ -171,16 +171,16 @@ public IFailureState RemoveDataPoints(int points) float currentRate = 
(float)NumOfFailedDataPoints / NumOfDataPoints; - if (isFinalState.Contains(State.FailureState) && + if (IsFinalState.Contains(State.FailureState) && currentRate >= transitionWeights[DefaultFailureStates.StopAndReschedule]) { throw new IllegalStateException("Received remove data point when state is complete: failing."); } while (State.FailureState < (int)DefaultFailureStates.Fail && - currentRate > transitionWeights[transitionMapUp[(DefaultFailureStates)State.FailureState]]) + currentRate > transitionWeights[TransitionMapUp[(DefaultFailureStates)State.FailureState]]) { - State.FailureState = (int)transitionMapUp[(DefaultFailureStates)State.FailureState]; + State.FailureState = (int)TransitionMapUp[(DefaultFailureStates)State.FailureState]; } return State; @@ -194,7 +194,7 @@ public IFailureState Complete() { lock (_statusLock) { - if (canMoveToComplete.Contains(State.FailureState)) + if (CanMoveToComplete.Contains(State.FailureState)) { State.FailureState = (int)DefaultFailureStates.Complete; } @@ -219,12 +219,12 @@ public void SetThreshold(IFailureState level, float threshold) { if (!(level is DefaultFailureState)) { - throw new ArgumentException(level.GetType() + " is not DefaultFailureStateMachine"); + throw new ArgumentException(level.GetType() + " is not DefaultFailureState."); } - if (level.FailureState == (int)DefaultFailureStates.Continue) + if (level.FailureState.IsContinue()) { - throw new ArgumentException("Cannot change the threshold for Continue state"); + throw new ArgumentException("Cannot change the threshold for Continue state."); } lock (_statusLock) @@ -245,9 +245,9 @@ public void SetThresholds(params Tuple[] weights) { if (!(weight.Item1 is DefaultFailureState)) { - throw new ArgumentException("Input is not of type DefaultFailureStateMachine."); + throw new ArgumentException("Input is not of type DefaultFailureState."); } - if (weight.Item1.FailureState == (int)DefaultFailureStates.Continue) + if (weight.Item1.FailureState.IsContinue()) { throw 
new ArgumentException("Cannot change the threshold for Continue state."); } @@ -295,14 +295,14 @@ private void CheckConsistency() { var state = DefaultFailureStates.ContinueAndReconfigure; float prevWeight = transitionWeights[state]; - state = transitionMapUp[state]; + state = TransitionMapUp[state]; while (transitionWeights.TryGetValue(state, out float nextWeight)) { if (nextWeight < prevWeight) { throw new IllegalStateException( - $"State {transitionMapDown[state]} weight is bigger than state {state}."); + $"State {TransitionMapDown[state]} weight is bigger than state {state}."); } prevWeight = nextWeight; @@ -312,7 +312,7 @@ private void CheckConsistency() return; } - state = transitionMapUp[state]; + state = TransitionMapUp[state]; } } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStates.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStates.cs index 9d6d1fa0cb..e68e00b207 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStates.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/DefaultFailureStates.cs @@ -40,4 +40,60 @@ public enum DefaultFailureStates : int } + + /// + /// Extension methods for default failure states. + /// + public static class DefaultFailureStateExtensions + { + /// + /// Whether the current failure state is Continue. + /// + /// The current failure state + /// True if is Continue + public static bool IsContinue(this int state) + { + return state == (int)DefaultFailureStates.Continue; + } + + /// + /// Whether the current failure state is ContinueAndReconfigure. + /// + /// The current failure state + /// True if is ContinueAndReconfigure + public static bool IsContinueAndReconfigure(this int state) + { + return state == (int)DefaultFailureStates.ContinueAndReconfigure; + } + + /// + /// Whether the current failure state is ContinueAndReschedule. 
+ /// + /// The current failure state + /// True if is ContinueAndReschedule + public static bool IsContinueAndReschedule(this int state) + { + return state == (int)DefaultFailureStates.ContinueAndReschedule; + } + + /// + /// Whether the current failure state is Fail. + /// + /// The current failure state + /// True if is Fail + public static bool IsFail(this int state) + { + return state == (int)DefaultFailureStates.Fail; + } + + /// + /// Whether the current failure state is Complete. + /// + /// The current failure state + /// True if is Complete + public static bool IsComplete(this int state) + { + return state == (int)DefaultFailureStates.Complete; + } + } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/FailEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/FailEvent.cs index caf2265860..370384d3b9 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/FailEvent.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/FailEvent.cs @@ -34,7 +34,6 @@ internal class FailEvent : IFailureEvent public FailEvent(string taskId) { TaskId = taskId; - FailureResponse = new List(); } /// @@ -62,6 +61,6 @@ public int OperatorId /// Messages implementing the response from the driver to the tasks /// to reconfigure the compution. /// - public List FailureResponse { get; private set; } + public List FailureResponse { get; } = new List(); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/ReconfigureEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/ReconfigureEvent.cs index 4ef242eb91..c5193cd5b3 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/ReconfigureEvent.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/ReconfigureEvent.cs @@ -33,22 +33,12 @@ public class ReconfigureEvent : IFailureEvent /// Constructor for a reconfigure event. 
/// /// The failed task - /// The operator identifier in which the event was detected - public ReconfigureEvent(IFailedTask failedTask, int opertorId) + /// The operator identifier in which the event was detected + public ReconfigureEvent(IFailedTask failedTask, int operatorId) { - if (failedTask != null) - { - FailedTask = Optional.Of(failedTask); - TaskId = failedTask.Id; - } - else - { - FailedTask = Optional.Empty(); - } - - OperatorId = opertorId; - FailureResponse = new List(); - Iteration = Optional.Empty(); + FailedTask = Optional.OfNullable(failedTask); + TaskId = failedTask?.Id; + OperatorId = operatorId; } /// @@ -67,7 +57,7 @@ public virtual int FailureEvent /// /// The iteration in which the failure is rised. /// - public Optional Iteration { get; set; } + public Optional Iteration { get; set; } = Optional.Empty(); /// /// The identifier of the task triggering the event. @@ -82,6 +72,6 @@ public virtual int FailureEvent /// /// The response message generated to react to the failure event. 
/// - public List FailureResponse { get; protected set; } + public List FailureResponse { get; protected set; } = new List(); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultBroadcast.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultBroadcast.cs index 0660648189..b525121cd8 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultBroadcast.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultBroadcast.cs @@ -66,7 +66,7 @@ public DefaultBroadcast( /// The conf builder where to attach the codec configuration internal override void GetCodecConfiguration(ref IConfiguration conf) { - if (CODECMAP.TryGetValue(typeof(T), out IConfiguration codecConf)) + if (CodecMap.TryGetValue(typeof(T), out IConfiguration codecConf)) { conf = Configurations.Merge(conf, codecConf); base.GetCodecConfiguration(ref conf); @@ -85,9 +85,9 @@ internal override void GetCodecConfiguration(ref IConfiguration conf) protected override IConfiguration PhysicalOperatorConfiguration() { var physicalOperatorConf = TangFactory.GetTang().NewConfigurationBuilder() - .BindImplementation(GenericType>.Class, GenericType>.Class) + .BindImplementation, Physical.Default.DefaultBroadcast>() .Build(); - var messageconf = SetMessageType(typeof(Physical.Default.DefaultBroadcast)); + var messageconf = SetMessageType(); return Configurations.Merge(physicalOperatorConf, messageconf); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultEmpty.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultEmpty.cs index cedaf6b027..50c9d36319 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultEmpty.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultEmpty.cs @@ -55,10 +55,7 @@ public DefaultEmpty(IElasticStage stage, IFailureStateMachine failureMachine) : /// If the task failure 
cannot be properly handled public override void OnTaskFailure(IFailedTask task, ref List failureEvents) { - if (_next != null) - { - _next.OnTaskFailure(task, ref failureEvents); - } + _next?.OnTaskFailure(task, ref failureEvents); } /// @@ -98,10 +95,7 @@ internal override void GatherMasterIds(ref HashSet masterTasks) throw new IllegalStateException("Operator need to be build before finalizing the stage"); } - if (_next != null) - { - _next.GatherMasterIds(ref masterTasks); - } + _next?.GatherMasterIds(ref masterTasks); } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs index 448924a2ed..e30ac6811c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs @@ -120,7 +120,7 @@ protected override bool ReactOnTaskMessage( } Log.Log(Level.Info, "Received topology update request for {0} {1} from {2}", - OperatorType.ToString(), _id, message.TaskId); + OperatorType, _id, message.TaskId); _topology.TopologyUpdateResponse( message.TaskId, @@ -136,8 +136,7 @@ protected override bool ReactOnTaskMessage( else { returnMessages.Clear(); - Log.Log(Level.Info, "Operator {0} is in stopped: Waiting.", - OperatorType.ToString()); + Log.Log(Level.Info, "Operator {0} is in stopped: Waiting.", OperatorType); } } @@ -162,30 +161,17 @@ protected override bool ReactOnTaskMessage( /// public override void OnReconfigure(ref ReconfigureEvent reconfigureEvent) { - Log.Log(Level.Info, "Going to reconfigure the {0} operator", OperatorType.ToString()); + Log.Log(Level.Info, "Going to reconfigure the {0} operator", OperatorType); if (reconfigureEvent.FailedTask.IsPresent()) { - if (reconfigureEvent.FailedTask.Value.AsError() is OperatorException) - { - var info = Optional.Of( - 
((OperatorException)reconfigureEvent.FailedTask.Value.AsError()).AdditionalInfo); - var msg = _topology.Reconfigure( - reconfigureEvent.FailedTask.Value.Id, - info, - reconfigureEvent.Iteration); + var error = reconfigureEvent.FailedTask.Value.AsError() as OperatorException; - reconfigureEvent.FailureResponse.AddRange(msg); - } - else - { - var msg = _topology.Reconfigure( + reconfigureEvent.FailureResponse.AddRange( + _topology.Reconfigure( reconfigureEvent.FailedTask.Value.Id, - Optional.Empty(), - reconfigureEvent.Iteration); - - reconfigureEvent.FailureResponse.AddRange(msg); - } + Optional.OfNullable(error?.AdditionalInfo), + reconfigureEvent.Iteration)); } } @@ -208,9 +194,7 @@ public override void OnReschedule(ref RescheduleEvent rescheduleEvent) rescheduleEvent.RescheduleTaskConfigurations.Add(Stage.StageName, confs); } confs.Add(TangFactory.GetTang().NewConfigurationBuilder() - .BindNamedParameter( - GenericType.Class, - true.ToString(CultureInfo.InvariantCulture)) + .BindNamedParam("true") .Build()); } @@ -225,10 +209,7 @@ public override void OnReschedule(ref RescheduleEvent rescheduleEvent) /// public override void OnStop(ref StopEvent stopEvent) { - if (!_stop) - { - _stop = true; - } + _stop = true; var rescheduleEvent = stopEvent as RescheduleEvent; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElasticOperatorWithDefaultDispatcher.cs similarity index 86% rename from lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs rename to lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElasticOperatorWithDefaultDispatcher.cs index 8a29079f0c..1dbd287b81 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElastiOperatorWithDefaultDispatcher.cs +++ 
b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/ElasticOperatorWithDefaultDispatcher.cs @@ -89,13 +89,7 @@ public override ElasticOperator Broadcast( /// If the task failure cannot be properly handled public override void OnTaskFailure(IFailedTask task, ref List failureEvents) { - var failedOperatorId = _id; - - if (task.AsError() is OperatorException) - { - var opException = task.AsError() as OperatorException; - failedOperatorId = opException.OperatorId; - } + var failedOperatorId = (task.AsError() as OperatorException)?.OperatorId ?? _id; if (WithinIteration || failedOperatorId <= _id) { @@ -142,9 +136,9 @@ public override void OnTaskFailure(IFailedTask task, ref List fai LogOperatorState(); } - if (PropagateFailureDownstream() && _next != null) + if (PropagateFailureDownstream()) { - _next.OnTaskFailure(task, ref failureEvents); + _next?.OnTaskFailure(task, ref failureEvents); } } @@ -159,10 +153,7 @@ public override void OnTimeout( ref List msgs, ref List nextTimeouts) { - if (_next != null) - { - _next.OnTimeout(alarm, ref msgs, ref nextTimeouts); - } + _next?.OnTimeout(alarm, ref msgs, ref nextTimeouts); } /// @@ -199,9 +190,9 @@ public override void EventDispatcher(ref IFailureEvent @event) } } - if (_next != null && (@event.OperatorId == -1 || @event.OperatorId > _id)) + if (@event.OperatorId == -1 || @event.OperatorId > _id) { - _next.EventDispatcher(ref @event); + _next?.EventDispatcher(ref @event); } } @@ -242,16 +233,9 @@ public virtual void OnFail() /// True if the failure has to be sent downstream protected override bool PropagateFailureDownstream() { - switch (_failureMachine.State.FailureState) - { - case (int)DefaultFailureStates.Continue: - case (int)DefaultFailureStates.ContinueAndReconfigure: - case (int)DefaultFailureStates.ContinueAndReschedule: - return true; - - default: - return false; - } + return _failureMachine.State.FailureState.IsContinue() || + _failureMachine.State.FailureState.IsContinueAndReconfigure() || + 
_failureMachine.State.FailureState.IsContinueAndReschedule(); } /// @@ -259,12 +243,17 @@ protected override bool PropagateFailureDownstream() /// protected override void LogOperatorState() { - string intro = $"State for Operator {OperatorType.ToString()} in Stage {Stage.StageName}:\n"; - string topologyState = $"Topology:\n{_topology.LogTopologyState()}\n"; - string failureMachineState = $"Failure State: {(DefaultFailureStates)_failureMachine.State.FailureState}" + - $"\nFailure(s) Reported: {_failureMachine.NumOfFailedDataPoints}/{_failureMachine.NumOfDataPoints}"; - - Log.Log(Level.Info, intro + topologyState + failureMachineState); + if (Log.IsLoggable(Level.Info)) + { + Log.Log(Level.Info, + "State for Operator {0} in Stage {1}:\n" + + "Topology:\n{2}\n" + + "Failure State: {3}\n" + + "Failure(s) Reported: {4}/{5}", + OperatorType, Stage.StageName, _topology.LogTopologyState(), + (DefaultFailureStates)_failureMachine.State.FailureState, + _failureMachine.NumOfFailedDataPoints, _failureMachine.NumOfDataPoints); + } } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs index 96b003677c..fa1dcff3c9 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs @@ -36,6 +36,8 @@ using Org.Apache.REEF.Network.Elastic.Failures.Enum; using Org.Apache.REEF.Network.Elastic.Topology.Logical.Enum; using Org.Apache.REEF.Wake.StreamingCodec.CommonStreamingCodecs; +using Org.Apache.REEF.Wake.StreamingCodec; +using System.Linq; namespace Org.Apache.REEF.Network.Elastic.Operators.Logical { @@ -54,29 +56,27 @@ public abstract class ElasticOperator : IFailureResponse, ITaskMessageResponse { private static readonly Logger Log = Logger.GetLogger(typeof(ElasticOperator)); - protected static readonly Dictionary CODECMAP = new 
Dictionary() + private static KeyValuePair Codec() + where TCodec : IStreamingCodec { - { - typeof(int), StreamingCodecConfiguration.Conf - .Set(StreamingCodecConfiguration.Codec, GenericType.Class) - .Build() - }, - { - typeof(int[]), StreamingCodecConfiguration.Conf - .Set(StreamingCodecConfiguration.Codec, GenericType.Class) - .Build() - }, - { - typeof(float), StreamingCodecConfiguration.Conf - .Set(StreamingCodecConfiguration.Codec, GenericType.Class) - .Build() - }, - { - typeof(float[]), StreamingCodecConfiguration.Conf - .Set(StreamingCodecConfiguration.Codec, GenericType.Class) - .Build() - } - }; + return new KeyValuePair( + typeof(TType), StreamingCodecConfiguration.Conf + .Set(StreamingCodecConfiguration.Codec, GenericType.Class) + .Build()); + } + + private static Dictionary AsDictionary( + params KeyValuePair[] values) + { + return values.ToDictionary(kv => kv.Key, kv => kv.Value); + } + + protected static readonly Dictionary CodecMap = AsDictionary( + Codec(), + Codec(), + Codec(), + Codec() + ); // For the moment we consider only linear sequences (pipelines) of operators (no branching for e.g., joins) protected ElasticOperator _next = null; @@ -146,12 +146,8 @@ public IElasticStage Stage { if (_stage == null) { - if (_prev == null) - { - throw new IllegalStateException("The reference to the parent stage is lost."); - } - _stage = _prev.Stage; + _stage = _prev?.Stage ?? 
throw new IllegalStateException("The reference to the parent stage is lost."); return _prev.Stage; } @@ -211,11 +207,9 @@ public ElasticOperator Broadcast( /// generate an incorrent state public void OnTaskMessage(ITaskMessage message, ref List returnMessages) { - var hasReacted = ReactOnTaskMessage(message, ref returnMessages); - - if (!hasReacted && _next != null) + if (!ReactOnTaskMessage(message, ref returnMessages)) { - _next.OnTaskMessage(message, ref returnMessages); + _next?.OnTaskMessage(message, ref returnMessages); } } @@ -234,19 +228,11 @@ public virtual bool AddTask(string taskId) throw new IllegalStateException("Operator needs to be finalized before adding tasks."); } - if (!_operatorStateFinalized) - { - // If state is finalized tasks can join the topology only explicitly. - newTask = _topology.AddTask(taskId, _failureMachine); - } - - if (_next != null) - { - // A task is new if it got added by at least one operator - return _next.AddTask(taskId) || newTask; - } + // If state is finalized, tasks can join the topology only explicitly (so skip the add and report "not new"). + newTask = !_operatorStateFinalized && _topology.AddTask(taskId, _failureMachine); - return newTask; + // A task is new if it got added by at least one operator; a missing downstream operator adds nothing. + return (_next?.AddTask(taskId) ?? false) || newTask; } /// @@ -260,10 +246,7 @@ public virtual ElasticOperator Build() throw new IllegalStateException("Operator cannot be built more than once."); } - if (_prev != null) - { - _prev.Build(); - } + _prev?.Build(); _operatorFinalized = true; @@ -287,10 +270,7 @@ public virtual ElasticOperator BuildState() throw new IllegalStateException("Operator need to be build before finalizing its state."); } - if (_next != null) - { - _next.BuildState(); - } + _next?.BuildState(); _topology.Build(); @@ -425,23 +405,12 @@ internal virtual string LogFinalStatistics() /// /// Appends the message type to the configuration.
/// - /// The type of the messages the operator is configured to accept /// The conf builder with added the message type - protected IConfiguration SetMessageType(Type operatorType) + protected IConfiguration SetMessageType() { - if (operatorType.IsGenericType) - { - var genericTypes = operatorType.GenericTypeArguments; - var msgType = genericTypes[0]; - - return TangFactory.GetTang().NewConfigurationBuilder() - .BindStringNamedParam(msgType.AssemblyQualifiedName) - .Build(); - } - else - { - throw new IllegalStateException("Expecting a generic type for the message."); - } + return TangFactory.GetTang().NewConfigurationBuilder() + .BindStringNamedParam(typeof(TMsg).AssemblyQualifiedName) + .Build(); } /// @@ -452,10 +421,7 @@ protected void OnNewIteration(int iteration) { _topology.OnNewIteration(iteration); - if (_next != null) - { - _next.OnNewIteration(iteration); - } + _next?.OnNewIteration(iteration); } /// @@ -508,12 +474,16 @@ protected virtual bool ReactOnTaskMessage(ITaskMessage message, ref List protected virtual void LogOperatorState() { - string intro = $"State for Operator {OperatorType.ToString()} in Stage {Stage.StageName}:\n"; - string topologyState = $"Topology:\n{_topology.LogTopologyState()}"; - string failureMachineState = "Failure State: " + _failureMachine.State.FailureState + - "\nFailure(s) Reported: " + _failureMachine.NumOfFailedDataPoints; - - Log.Log(Level.Info, intro + topologyState + failureMachineState); + if (Log.IsLoggable(Level.Info)) + { + Log.Log(Level.Info, + "State for Operator {0} in Stage {1}:\n" + + "Topology:\n{2}" + + "Failure State: {3}\n" + + "Failure(s) Reported: {4}", + OperatorType, Stage.StageName, _topology.LogTopologyState(), + _failureMachine.State.FailureState, _failureMachine.NumOfFailedDataPoints); + } } /// @@ -543,7 +513,7 @@ private ITopology GetTopology(TopologyType topologyType) default: throw new ArgumentException( nameof(topologyType), - $"Topology type {topologyType} not supported by 
{OperatorType.ToString()}."); + $"Topology type {topologyType} not supported by {OperatorType}."); } return topology; diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/OperatorType.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/OperatorType.cs index 4d039137f7..2899b5d76e 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/OperatorType.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/OperatorType.cs @@ -34,22 +34,4 @@ public enum OperatorType : int Scatter = 5, Gather = 6 } - - public static class OperatorTypeToString - { - public static string ToString(this OperatorType type) - { - switch (type) - { - case OperatorType.Empty: return "Empty"; - case OperatorType.Broadcast: return "Broadcast"; - case OperatorType.Reduce: return "Reduce"; - case OperatorType.AggregationRing: return "AggregationRing"; - case OperatorType.Iterate: return "Iterate"; - case OperatorType.Scatter: return "Scatter"; - case OperatorType.Gather: return "Gather"; - default: throw new ArgumentException($"Operator type {type} not found"); - } - } - } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs index 13ca31e862..84d672f853 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs @@ -32,7 +32,7 @@ namespace Org.Apache.REEF.Network.Elastic.Operators.Physical.Default /// /// The type of message being sent. 
[Unstable("0.16", "API may change")] - public abstract class DefaultOneToN : IDisposable + public abstract class DefaultOneToN : IDisposable, IReschedulable { private static readonly Logger Log = Logger.GetLogger(typeof(DefaultOneToN<>)); @@ -54,8 +54,6 @@ internal DefaultOneToN(int id, bool isLast, OneToNTopology topology) OperatorId = id; _isLast = isLast; _topology = topology; - - OnTaskRescheduled = _topology.JoinTopology; } /// @@ -92,7 +90,10 @@ public string FailureInfo /// /// Action to execute when a task is re-scheduled. /// - public Action OnTaskRescheduled { get; private set; } + public Action OnTaskRescheduled() + { + return _topology.JoinTopology; + } /// /// The set of messages checkpointed in memory. @@ -145,7 +146,7 @@ public T Receive() /// Reset the internal position tracker. This should be called /// every time a new iteration start in the workflow. /// - public void ResetPosition() + public void Reset() { _position = PositionTracker.Nil; } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticIterator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticIterator.cs index a6764954d3..09a315974e 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticIterator.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticIterator.cs @@ -17,6 +17,7 @@ using Org.Apache.REEF.Utilities.Attributes; using System; +using System.Collections; namespace Org.Apache.REEF.Network.Elastic.Operators.Physical { @@ -24,20 +25,8 @@ namespace Org.Apache.REEF.Network.Elastic.Operators.Physical /// Group communication operator used to for iterations. /// [Unstable("0.16", "API may change")] - public interface IElasticIterator : IElasticOperator + public interface IElasticIterator : IElasticOperator, IEnumerator { - /// - /// Move to the next iteration. - /// - /// True if the next iteration exists - bool MoveNext(); - - /// - /// The current iteration. 
- /// - /// An object representing the current iteration - object Current { get; } - /// /// Synchronize the current iteration with the input one. /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticOperator.cs index 58d6cfc97a..55f06c0a3a 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticOperator.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IElasticOperator.cs @@ -26,7 +26,7 @@ namespace Org.Apache.REEF.Network.Elastic.Operators.Physical /// Base class for task-side, physical, group communication operators. /// [Unstable("0.16", "API may change")] - public interface IElasticOperator : IWaitForTaskRegistration, IDisposable + public interface IElasticOperator : IWaitForTaskRegistration, IReschedulable, IDisposable { /// /// The operator type. @@ -63,11 +63,6 @@ public interface IElasticOperator : IWaitForTaskRegistration, IDisposable /// Reset the internal position tracker. This should be called /// every time a new iteration start in the workflow. /// - void ResetPosition(); - - /// - /// Action to execute when a task is re-scheduled. - /// - Action OnTaskRescheduled { get; } + void Reset(); } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IReceiver.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IReceiver.cs index e83c3c36ec..5dc4c18856 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IReceiver.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IReceiver.cs @@ -24,7 +24,7 @@ namespace Org.Apache.REEF.Network.Elastic.Operators.Physical /// /// The type of data being receive. [Unstable("0.16", "API may change")] - public interface IReceiver + public interface IReceiver { /// /// Receive a message from a sender task. 
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IReschedulable.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IReschedulable.cs new file mode 100644 index 0000000000..30440984c8 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/IReschedulable.cs @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; +using System; + +namespace Org.Apache.REEF.Network.Elastic.Operators.Physical +{ + /// + /// Interface used when an action needs to be triggered after a rescheduling event. + /// + [Unstable("0.16", "API may change")] + public interface IReschedulable + { + /// + /// Action to execute when a task is re-scheduled.
+ /// + Action OnTaskRescheduled(); + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/ISender.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/ISender.cs index 008bb46967..b341ca6fef 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/ISender.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/ISender.cs @@ -24,7 +24,7 @@ namespace Org.Apache.REEF.Network.Elastic.Operators.Physical /// /// The data type of the message [Unstable("0.16", "API may change")] - public interface ISender + public interface ISender { /// /// Send the data to all child receivers. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs index 76f177d929..f055aea18b 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs @@ -50,12 +50,12 @@ internal abstract class CommunicationLayer : private readonly ElasticDriverMessageHandler _driverMessagesHandler; private readonly IIdentifierFactory _idFactory; private readonly IDisposable _communicationObserver; - private readonly ConcurrentDictionary _driverMessageObservers; + private readonly ConcurrentDictionary _driverMessageObservers; protected bool _disposed = false; - protected readonly ConcurrentDictionary _groupMessageObservers = - new ConcurrentDictionary(); + protected readonly ConcurrentDictionary _groupMessageObservers = + new ConcurrentDictionary(); /// /// Creates a new communication layer. 
@@ -89,11 +89,9 @@ protected CommunicationLayer( /// The observer of the communicating topology operator public void RegisterOperatorTopologyForTask(IOperatorTopologyWithCommunication operatorObserver) { - var id = NodeObserverIdentifier.FromObserver(operatorObserver); - - if (_groupMessageObservers.TryAdd(id, operatorObserver)) + if (!_groupMessageObservers.TryAdd(operatorObserver.NodeId, operatorObserver)) { - throw new IllegalStateException($"Topology for id {id} already added among listeners."); + throw new IllegalStateException($"Topology for id {operatorObserver.NodeId} already added among listeners."); } } @@ -103,11 +101,9 @@ public void RegisterOperatorTopologyForTask(IOperatorTopologyWithCommunication o /// The observer of the driver aware topology internal void RegisterOperatorTopologyForDriver(DriverAwareOperatorTopology operatorObserver) { - var id = NodeObserverIdentifier.FromObserver(operatorObserver); - - if (!_driverMessageObservers.TryAdd(id, operatorObserver)) + if (!_driverMessageObservers.TryAdd(operatorObserver.NodeId, operatorObserver)) { - throw new IllegalStateException($"Topology for id {id} already added among driver listeners."); + throw new IllegalStateException($"Topology for id {operatorObserver.NodeId} already added among driver listeners."); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs index d68a8bc9db..111f901764 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs @@ -79,11 +79,7 @@ public override void OnNext(IRemoteMessage /// [Inject] - private DefaultTaskToDriverMessageDispatcher(IInjector injector) : base(injector) + private DefaultTaskToDriverMessageDispatcher(IHeartBeatManager heartBeatManager) : base(heartBeatManager) { } diff --git 
a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/IDefaultTaskToDrivermessages.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/IDefaultTaskToDrivermessages.cs index c3dc804b66..6c1d67b862 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/IDefaultTaskToDrivermessages.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/IDefaultTaskToDrivermessages.cs @@ -30,6 +30,7 @@ internal interface IDefaultTaskToDriverMessages /// group communication topology. /// /// The current task + /// The name of the stage /// The identifier of the operator ready to join the topology void JoinTopology(string taskId, string stageName, int operatorId); @@ -37,6 +38,7 @@ internal interface IDefaultTaskToDriverMessages /// Send a notification to the driver for an update on topology state. /// /// The current task id + /// The name of the stage /// The operator requiring the topology update void TopologyUpdateRequest(string taskId, string stageName, int operatorId); @@ -44,6 +46,7 @@ internal interface IDefaultTaskToDriverMessages /// Signal the driver that the current stage is completed. /// /// The current task identifier + /// The name of the stage void StageComplete(string taskId, string stageName); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs index a17ed7b181..cfb2635b81 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs @@ -44,8 +44,8 @@ private ElasticDriverMessageHandler() /// /// Observers of incoming messages from the driver. /// - internal readonly ConcurrentDictionary DriverMessageObservers = - new ConcurrentDictionary(); + internal readonly ConcurrentDictionary DriverMessageObservers = + new ConcurrentDictionary(); /// /// Handle an incoming message. 
@@ -60,10 +60,8 @@ public void Handle(IDriverMessage message) } var edm = ElasticDriverMessageImpl.From(message.Message.Value); - var id = NodeObserverIdentifier.FromMessage(edm.Message); - DriverAwareOperatorTopology operatorObserver; - if (!DriverMessageObservers.TryGetValue(id, out operatorObserver)) + if (!DriverMessageObservers.TryGetValue(edm.Message.NodeId, out DriverAwareOperatorTopology operatorObserver)) { throw new KeyNotFoundException("Unable to find registered operator topology for stage " + edm.Message.StageName + " operator " + edm.Message.OperatorId); diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IIdentifiable.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IIdentifiable.cs new file mode 100644 index 0000000000..bd0dc981e7 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/IIdentifiable.cs @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Task +{ + /// + /// Interface used to return an identifiable object from a class. + /// + [Unstable("0.16", "API may change")] + public interface IIdentifiable + { + /// + /// The identifier for the node.
+ /// + NodeIdentifier NodeId { get; } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/INodeIdentifier.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/INodeIdentifier.cs new file mode 100644 index 0000000000..79c03a20a3 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/INodeIdentifier.cs @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Task +{ + /// + /// Interface used by the elastic group communication framework to manage node identifiers. + /// + [Unstable("0.16", "API may change")] + public interface INodeIdentifier + { + /// + /// The stage name. + /// + string StageName { get; } + + /// + /// The operator name. + /// + int OperatorId { get; } + } +} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeIdentifier.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeIdentifier.cs new file mode 100644 index 0000000000..8e7504fb89 --- /dev/null +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeIdentifier.cs @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using Org.Apache.REEF.Utilities.Attributes; + +namespace Org.Apache.REEF.Network.Elastic.Task +{ + /// + /// An identifier for a given node in the group communication topology. + /// A node is uniquely identifiable by a combination of its + /// , and . + /// + [Unstable("0.16", "API may change")] + public struct NodeIdentifier : INodeIdentifier + { + /// + /// The stage name. + /// + public string StageName { get; private set; } + + /// + /// The operator name. + /// + public int OperatorId { get; private set; } + + /// + /// Constructor. + /// + /// + /// + public NodeIdentifier(string stageName, int operatorId) + { + StageName = stageName; + OperatorId = operatorId; + } + } +} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs deleted file mode 100644 index 6e4432f988..0000000000 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeObserverIdentifier.cs +++ /dev/null @@ -1,114 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -using Org.Apache.REEF.Network.Elastic.Comm.Impl; -using Org.Apache.REEF.Network.Elastic.Topology.Physical; -using Org.Apache.REEF.Utilities.Attributes; - -namespace Org.Apache.REEF.Network.Elastic.Task.Impl -{ - /// - /// An identifier for a given node in the group communication topology. - /// A node is uniquely identifiable by a combination of its - /// , and . - /// - [Unstable("0.16", "API may change")] - internal sealed class NodeObserverIdentifier - { - /// - /// Creates an identifier from an operator topology with communication. - /// - public static NodeObserverIdentifier FromObserver(IOperatorTopologyWithCommunication observer) - { - return new NodeObserverIdentifier(observer.StageName, observer.OperatorId); - } - - /// - /// Creates an from a driver aware topology. - /// - public static NodeObserverIdentifier FromObserver(DriverAwareOperatorTopology observer) - { - return new NodeObserverIdentifier(observer.StageName, observer.OperatorId); - } - - /// - /// Creates an identifier from a group communication message. - /// - public static NodeObserverIdentifier FromMessage(ElasticGroupCommunicationMessage message) - { - return new NodeObserverIdentifier(message.StageName, message.OperatorId); - } - - /// - /// Basic constructor. 
- /// - /// The name of the stage - /// The identifier of the operator - private NodeObserverIdentifier(string stageName, int operatorId) - { - StageName = stageName; - OperatorId = operatorId; - } - - /// - /// The stage name. - /// - public string StageName { get; } - - /// - /// The operator name. - /// - public int OperatorId { get; } - - /// - /// Overrides . Simply compares equivalence of instance fields. - /// - public override bool Equals(object obj) - { - if (ReferenceEquals(null, obj)) - { - return false; - } - - if (ReferenceEquals(this, obj)) - { - return true; - } - - return obj is NodeObserverIdentifier && Equals((NodeObserverIdentifier)obj); - } - - /// - /// Overrides . Generates hashcode based on the instance fields. - /// - public override int GetHashCode() - { - int hash = 17; - hash = (hash * 31) + StageName.GetHashCode(); - return (hash * 31) + OperatorId.GetHashCode(); - } - - /// - /// Compare equality of instance fields. - /// - private bool Equals(NodeObserverIdentifier other) - { - return StageName.Equals(other.StageName) && - OperatorId.Equals(other.OperatorId); - } - } -} \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/TaskToDriverMessageDispatcher.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/TaskToDriverMessageDispatcher.cs index ad92fcc610..5e46874fa5 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/TaskToDriverMessageDispatcher.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/TaskToDriverMessageDispatcher.cs @@ -19,6 +19,7 @@ using Org.Apache.REEF.Common.Protobuf.ReefProtocol; using Org.Apache.REEF.Tang.Interface; using Org.Apache.REEF.Utilities.Attributes; +using static Org.Apache.REEF.Common.Protobuf.ReefProtocol.TaskStatusProto; namespace Org.Apache.REEF.Network.Elastic.Task.Impl { @@ -35,9 +36,9 @@ internal abstract class TaskToDriverMessageDispatcher /// Constrcutor. 
/// /// Reference to the heartbeat manager - protected TaskToDriverMessageDispatcher(IInjector subInjector) + protected TaskToDriverMessageDispatcher(IHeartBeatManager heartBeatManager) { - _heartBeatManager = subInjector.GetInstance(); + _heartBeatManager = heartBeatManager; } /// @@ -50,17 +51,10 @@ protected void Send(string taskId, byte[] message) TaskStatusProto taskStatusProto = new TaskStatusProto() { task_id = taskId, - context_id = Utils.GetContextIdFromTaskId(taskId) + context_id = Utils.GetContextIdFromTaskId(taskId), + task_message = { new TaskMessageProto { source_id = taskId, message = message } } }; - TaskStatusProto.TaskMessageProto taskMessageProto = new TaskStatusProto.TaskMessageProto() - { - source_id = taskId, - message = message, - }; - - taskStatusProto.task_message.Add(taskMessageProto); - Heartbeat(taskStatusProto); } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs index 02c1a821f9..b0cf4d5f40 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Workflow.cs @@ -115,7 +115,7 @@ public bool MoveNext() if (_isRescheduled) { - Current.OnTaskRescheduled.Invoke(); + Current.OnTaskRescheduled().Invoke(); } return true; @@ -211,7 +211,7 @@ internal void Add(IElasticOperator op) var iterator = (IElasticIterator)_operators[iterPos]; op.IteratorReference = iterator; - iterator.RegisterActionOnTaskRescheduled(op.OnTaskRescheduled); + iterator.RegisterActionOnTaskRescheduled(op.OnTaskRescheduled()); } if (op.OperatorType == OperatorType.Iterate) @@ -240,7 +240,7 @@ private void ResetOperatorPositions() { for (int pos = _position; pos < _operators.Count; pos++) { - _operators[pos].ResetPosition(); + _operators[pos].Reset(); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs 
b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs index 5a56df14f4..c1a50cbe36 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs @@ -29,17 +29,10 @@ namespace Org.Apache.REEF.Network.Elastic.Topology.Physical [Unstable("0.16", "API may change")] internal interface IOperatorTopologyWithCommunication : IWaitForTaskRegistration, + INodeIdentifier, + IIdentifiable, IDisposable, IObserver> { - /// - /// The stage name context in which the topology is running. - /// - string StageName { get; } - - /// - /// The identifier of the operator in which the topology is running. - /// - int OperatorId { get; } } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs index 58fcfc37e0..7d299cb964 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +using Org.Apache.REEF.Network.Elastic.Task; using Org.Apache.REEF.Utilities.Attributes; namespace Org.Apache.REEF.Network.Elastic.Topology.Physical @@ -24,7 +25,7 @@ namespace Org.Apache.REEF.Network.Elastic.Topology.Physical /// not generic but directly related to the operators using them to communicate data. /// [Unstable("0.16", "API may change")] - public abstract class OperatorTopology + public abstract class OperatorTopology : INodeIdentifier, IIdentifiable { /// /// Constructor for an operator topology. 
@@ -39,6 +40,7 @@ public OperatorTopology(string stageName, string taskId, string rootTaskId, int TaskId = taskId; RootTaskId = rootTaskId; OperatorId = operatorId; + NodeId = new NodeIdentifier(StageName, OperatorId); } /// @@ -51,6 +53,11 @@ public OperatorTopology(string stageName, string taskId, string rootTaskId, int /// public int OperatorId { get; private set; } + /// + /// The idenfier for the node. + /// + public NodeIdentifier NodeId { get; private set;} + /// /// The identifier of the task in which the topology is running. /// From 10634d0e8016aab5a89475f8321b25bea6ccde54 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Tue, 26 Feb 2019 17:55:26 -0800 Subject: [PATCH 26/29] Better node identifier stuff --- .../Impl/ElasticGroupCommunicationMessage.cs | 8 +---- .../Elastic/Task/CommunicationLayer.cs | 8 ++--- .../Task/Default/DefaultCommunicationLayer.cs | 5 ++- .../Task/ElasticDriverMessageHandler.cs | 5 ++- .../Elastic/Task/IIdentifiable.cs | 33 ------------------- .../Elastic/Task/NodeIdentifier.cs | 13 ++++++++ .../IOperatorTopologyWithCommunication.cs | 1 - .../Topology/Physical/OperatorTopology.cs | 8 +---- 8 files changed, 23 insertions(+), 58 deletions(-) delete mode 100644 lang/cs/Org.Apache.REEF.Network/Elastic/Task/IIdentifiable.cs diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs index 718041d300..1407ae8763 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs @@ -25,7 +25,7 @@ namespace Org.Apache.REEF.Network.Elastic.Comm.Impl /// Message sent by Group Communication operators. 
/// [Unstable("0.16", "API may change")] - public abstract class ElasticGroupCommunicationMessage : ICloneable, INodeIdentifier, IIdentifiable + public abstract class ElasticGroupCommunicationMessage : ICloneable, INodeIdentifier { /// /// Create a new elastic group communication message. @@ -38,7 +38,6 @@ protected ElasticGroupCommunicationMessage( { StageName = stageName; OperatorId = operatorId; - NodeId = new NodeIdentifier(StageName, OperatorId); } /// @@ -54,10 +53,5 @@ protected ElasticGroupCommunicationMessage( /// Returns the operator id. /// public int OperatorId { get; private set; } - - /// - /// The identifier for the node. - /// - public NodeIdentifier NodeId { get; private set; } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs index f055aea18b..bfadbf946b 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs @@ -89,9 +89,9 @@ protected CommunicationLayer( /// The observer of the communicating topology operator public void RegisterOperatorTopologyForTask(IOperatorTopologyWithCommunication operatorObserver) { - if (!_groupMessageObservers.TryAdd(operatorObserver.NodeId, operatorObserver)) + if (!_groupMessageObservers.TryAdd(operatorObserver.NodeId(), operatorObserver)) { - throw new IllegalStateException($"Topology for id {operatorObserver.NodeId} already added among listeners."); + throw new IllegalStateException($"Topology for id {operatorObserver.NodeId()} already added among listeners."); } } @@ -101,9 +101,9 @@ public void RegisterOperatorTopologyForTask(IOperatorTopologyWithCommunication o /// The observer of the driver aware topology internal void RegisterOperatorTopologyForDriver(DriverAwareOperatorTopology operatorObserver) { - if (!_driverMessageObservers.TryAdd(operatorObserver.NodeId, operatorObserver)) + if 
(!_driverMessageObservers.TryAdd(operatorObserver.NodeId(), operatorObserver)) { - throw new IllegalStateException($"Topology for id {operatorObserver.NodeId} already added among driver listeners."); + throw new IllegalStateException($"Topology for id {operatorObserver.NodeId()} already added among driver listeners."); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs index 111f901764..0b8fbb25ae 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs @@ -79,10 +79,9 @@ public override void OnNext(IRemoteMessage - /// Interface used to return an identfiable object from a class. - /// - [Unstable("0.16", "API may change")] - public interface IIdentifiable - { - /// - /// The identifier for the node. - /// - NodeIdentifier NodeId { get; } - } -} diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeIdentifier.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeIdentifier.cs index 8e7504fb89..42f517d88c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeIdentifier.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeIdentifier.cs @@ -47,5 +47,18 @@ public NodeIdentifier(string stageName, int operatorId) StageName = stageName; OperatorId = operatorId; } + + public override string ToString() + { + return $"{StageName}-{OperatorId}"; + } + } + + public static class NodeIdentifierExtensions + { + public static NodeIdentifier NodeId(this INodeIdentifier id) + { + return new NodeIdentifier(id.StageName, id.OperatorId); + } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs index c1a50cbe36..314976d761 100644 
--- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/IOperatorTopologyWithCommunication.cs @@ -30,7 +30,6 @@ namespace Org.Apache.REEF.Network.Elastic.Topology.Physical internal interface IOperatorTopologyWithCommunication : IWaitForTaskRegistration, INodeIdentifier, - IIdentifiable, IDisposable, IObserver> { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs index 7d299cb964..5713d0d7b1 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs @@ -25,7 +25,7 @@ namespace Org.Apache.REEF.Network.Elastic.Topology.Physical /// not generic but directly related to the operators using them to communicate data. /// [Unstable("0.16", "API may change")] - public abstract class OperatorTopology : INodeIdentifier, IIdentifiable + public abstract class OperatorTopology : INodeIdentifier { /// /// Constructor for an operator topology. @@ -40,7 +40,6 @@ public OperatorTopology(string stageName, string taskId, string rootTaskId, int TaskId = taskId; RootTaskId = rootTaskId; OperatorId = operatorId; - NodeId = new NodeIdentifier(StageName, OperatorId); } /// @@ -53,11 +52,6 @@ public OperatorTopology(string stageName, string taskId, string rootTaskId, int /// public int OperatorId { get; private set; } - /// - /// The idenfier for the node. - /// - public NodeIdentifier NodeId { get; private set;} - /// /// The identifier of the task in which the topology is running. 
/// From 4739662a8bbcca4fb10a3e4491097c07e1fb47ca Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Mon, 4 Mar 2019 11:28:34 -0800 Subject: [PATCH 27/29] Done with another round --- .../Elastic/Task/CommunicationLayer.cs | 22 +++--- .../Task/Default/DefaultCommunicationLayer.cs | 7 +- .../Task/Default/DefaultElasticContext.cs | 40 ++++------- .../Task/Default/DefaultElasticStage.cs | 26 +++---- .../Task/ElasticDriverMessageHandler.cs | 8 +-- .../Elastic/Topology/Logical/Impl/DataNode.cs | 29 ++------ .../Topology/Logical/Impl/FlatTopology.cs | 70 ++++++++----------- .../Physical/Default/OneToNTopology.cs | 7 +- ...peratorTopologyWithDefaultCommunication.cs | 28 +++----- 9 files changed, 89 insertions(+), 148 deletions(-) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs index bfadbf946b..1ef2a3d2f5 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/CommunicationLayer.cs @@ -158,42 +158,42 @@ internal void Send( /// The token to cancel the operation /// Nodes that got removed during task registration public void WaitForTaskRegistration( - ICollection identifiers, + IEnumerable identifiers, CancellationTokenSource cancellationSource, IDictionary removed = null) { ISet foundSet = new HashSet(); + var count = identifiers.Count(); for (var i = 0; i < _retryRegistration; i++) { - if (cancellationSource != null && cancellationSource.Token.IsCancellationRequested) + if ((cancellationSource?.Token.IsCancellationRequested) ?? 
false) { Log.Log(Level.Warning, "WaitForTaskRegistration is canceled in retryCount {0}.", i); throw new OperationCanceledException("WaitForTaskRegistration is canceled"); } - Log.Log(Level.Info, "WaitForTaskRegistration, in retryCount {0}.", i); - foreach (var identifier in identifiers) + Log.Log(Level.Info, "In retryCount {0}.", i); + foreach (var identifier in identifiers.Except(foundSet)) { - var notFound = !foundSet.Contains(identifier); - if (notFound && removed != null && removed.ContainsKey(identifier)) + if (removed?.ContainsKey(identifier) ?? false) { foundSet.Add(identifier); Log.Log(Level.Verbose, - "WaitForTaskRegistration, dependent id {0} was removed at loop {1}.", identifier, i); + "Dependent id {0} was removed at loop {1}.", identifier, i); } - else if (notFound && Lookup(identifier)) + else if (Lookup(identifier)) { foundSet.Add(identifier); Log.Log(Level.Verbose, - "WaitForTaskRegistration, find a dependent id {0} at loop {1}.", identifier, i); + "Find a dependent id {0} at loop {1}.", identifier, i); } } - if (foundSet.Count >= identifiers.Count) + if (foundSet.Count >= count) { Log.Log(Level.Info, - "WaitForTaskRegistration, found all {0} dependent ids at loop {1}.", foundSet.Count, i); + "Found all {0} dependent ids at loop {1}.", foundSet.Count, i); return; } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs index 0b8fbb25ae..f9b2635941 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultCommunicationLayer.cs @@ -76,12 +76,11 @@ public override void OnNext(IRemoteMessage _stages; + private readonly Dictionary _stages = new Dictionary(); private readonly string _taskId; private readonly INetworkService _networkService; - private readonly object _lock; - private bool _disposed; + private readonly object 
_disposeLock = new object(); + private bool _disposed = false; /// /// Creates a new elastic context and registers the task id with the Name Server. @@ -64,22 +65,13 @@ public DefaultElasticContext( ElasticDriverMessageHandler driverMessageHandler, IInjector injector) { - _stages = new Dictionary(); _networkService = networkService; _taskId = taskId; - _disposed = false; - _lock = new object(); - - foreach (string serializedGroupConfig in stageConfigs) - { - IConfiguration stageConfig = configSerializer.FromString(serializedGroupConfig); - IInjector subInjector = injector.ForkInjector(stageConfig); - - var stageClient = subInjector.GetInstance(); - - _stages[stageClient.StageName] = stageClient; - } + _stages = stageConfigs + .Select(config => + injector.ForkInjector(configSerializer.FromString(config)).GetInstance()) + .ToDictionary(stage => stage.StageName, stage => stage); _networkService.Register(new StringIdentifier(_taskId)); } @@ -99,20 +91,16 @@ public void WaitForTaskRegistration(CancellationTokenSource cancellationSource = /// /// Gets the stage object for the given stage name. 
/// - /// The name of the stage + /// The name of the stage /// The task-side stage object - public IElasticStage GetStage(string stagepName) + public IElasticStage GetStage(string stageName) { - if (string.IsNullOrEmpty(stagepName)) - { - throw new ArgumentNullException("stagepName"); - } - if (!_stages.ContainsKey(stagepName)) + if (_stages.TryGetValue(stageName, out IElasticStage stage)) { - throw new ArgumentException("No stage with name: " + stagepName); + return stage; } - return _stages[stagepName]; + throw new ArgumentException($"No stage with name: {stageName}."); } /// @@ -120,7 +108,7 @@ public IElasticStage GetStage(string stagepName) /// public void Dispose() { - lock (_lock) + lock (_disposeLock) { if (!_disposed) { diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs index 568827788d..724232ff65 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs @@ -40,8 +40,8 @@ internal sealed class DefaultElasticStage : IElasticStage private readonly CancellationSource _cancellationSource; - private readonly object _lock; - private bool _disposed; + private readonly object _disposeLock = new object(); + private bool _disposed = false; /// /// Injectable constructor. 
@@ -59,25 +59,20 @@ private DefaultElasticStage( { StageName = stageName; Workflow = workflow; - + _cancellationSource = cancellationSource; - _disposed = false; - _lock = new object(); foreach (string operatorConfigStr in operatorConfigs) { IConfiguration operatorConfig = configSerializer.FromString(operatorConfigStr); - IInjector operatorInjector = injector.ForkInjector(operatorConfig); - string msgType = operatorInjector.GetNamedInstance( - GenericType.Class); - + string msgType = operatorInjector.GetNamedInstance(); Type groupCommOperatorGenericInterface = typeof(IElasticTypedOperator<>); Type groupCommOperatorInterface = groupCommOperatorGenericInterface.MakeGenericType(Type.GetType(msgType)); var operatorObj = operatorInjector.GetInstance(groupCommOperatorInterface); - Workflow.Add(operatorObj as IElasticOperator); + Workflow.Add((IElasticOperator)operatorObj); } } @@ -104,7 +99,7 @@ public void WaitForTaskRegistration(CancellationTokenSource cancellationSource = } catch (OperationCanceledException e) { - Log.Log(Level.Error, "Stage {0} failed during registration.", StageName); + Log.Log(Level.Error, "Stage " + StageName + " failed during registration.", e); throw e; } } @@ -114,14 +109,11 @@ public void WaitForTaskRegistration(CancellationTokenSource cancellationSource = /// public void Dispose() { - lock (_lock) + lock (_disposeLock) { if (!_disposed) { - if (Workflow != null) - { - Workflow.Dispose(); - } + Workflow?.Dispose(); _disposed = true; } @@ -137,7 +129,7 @@ public void Cancel() { _cancellationSource.Cancel(); - Log.Log(Level.Info, "Received request to close stage ", StageName); + Log.Log(Level.Info, "Received request to close stage {0}", StageName); } } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs index 594ae335b5..99977b78a9 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs +++ 
b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/ElasticDriverMessageHandler.cs @@ -59,14 +59,14 @@ public void Handle(IDriverMessage message) throw new IllegalStateException("Received message with no payload."); } - var edm = ElasticDriverMessageImpl.From(message.Message.Value); + var edm = ElasticDriverMessageImpl.From(message.Message.Value).Message; - if (!DriverMessageObservers.TryGetValue(edm.Message.NodeId(), out DriverAwareOperatorTopology operatorObserver)) + if (!DriverMessageObservers.TryGetValue(edm.NodeId(), out DriverAwareOperatorTopology operatorObserver)) { - throw new KeyNotFoundException("Unable to find registered operator topology for " + edm.Message.NodeId()); + throw new KeyNotFoundException("Unable to find registered operator topology for " + edm.NodeId()); } - operatorObserver.OnNext(edm.Message); + operatorObserver.OnNext(edm); } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/DataNode.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/DataNode.cs index 29336dd057..e0eae3ad94 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/DataNode.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/DataNode.cs @@ -28,13 +28,9 @@ namespace Org.Apache.REEF.Network.Elastic.Topology.Logical.Impl [Unstable("0.16", "API may change")] internal sealed class DataNode { - private readonly int _taskId; private readonly bool _isRoot; private readonly List _children; - private DataNode _parent; - private DataNodeState _state; - /// /// Construct a node using a given task id. /// @@ -44,9 +40,9 @@ public DataNode( int taskId, bool isRoot) { - _taskId = taskId; + TaskId = taskId; _isRoot = isRoot; - _state = DataNodeState.Reachable; + FailState = DataNodeState.Reachable; _children = new List(); } @@ -54,36 +50,25 @@ public DataNode( /// /// The current state for the node. 
/// - public DataNodeState FailState - { - get { return _state; } - set { _state = value; } - } + public DataNodeState FailState { get; set; } /// /// The parent of the target node. /// - public DataNode Parent - { - get { return _parent; } - set { _parent = value; } - } + public DataNode Parent { get; set; } /// /// Add a node to the list of children nodes of the current one. /// - public void AddChild(DataNode child) + public void AddChild(IEnumerable child) { - _children.Add(child); + _children.AddRange(child); } /// /// The task id represented by the data node. /// - public int TaskId - { - get { return _taskId; } - } + public int TaskId { get; } /// /// Return how many children the current node has. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs index 00b94ec541..c0c752d818 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs @@ -49,7 +49,7 @@ public class FlatTopology : ITopology private bool _finalized = false; private readonly bool _sorted; - private readonly Dictionary _nodes = new Dictionary(); + private readonly IDictionary _nodes; private readonly HashSet _lostNodesToBeRemoved = new HashSet(); private HashSet _nodesWaitingToJoinTopologyNextIteration = new HashSet(); private HashSet _nodesWaitingToJoinTopology = new HashSet(); @@ -70,6 +70,15 @@ public FlatTopology(int rootId, bool sorted = false) _rootId = rootId; _sorted = sorted; OperatorId = -1; + + if (_sorted) + { + _nodes = new SortedDictionary(); + } + else + { + _nodes = new Dictionary(); + } } /// @@ -107,21 +116,21 @@ public bool AddTask(string taskId, IFailureStateMachine failureMachine) lock (_lock) { - if (_nodes.ContainsKey(id)) + if (_nodes.TryGetValue(id, out DataNode node)) { - if (_nodes[id].FailState != DataNodeState.Reachable) + if 
(node.FailState != DataNodeState.Reachable) { // This is node already added to the topology and which probably failed. _nodesWaitingToJoinTopologyNextIteration.Add(taskId); - _nodes[id].FailState = DataNodeState.Unreachable; + node.FailState = DataNodeState.Unreachable; return false; } throw new ArgumentException("Task already added to the topology."); } - DataNode node = new DataNode(id, false); - _nodes[id] = node; + DataNode dnode = new DataNode(id, false); + _nodes[id] = dnode; if (_finalized) { @@ -281,17 +290,13 @@ public IConfiguration GetTaskConfiguration(int taskId) foreach (var tId in root.Children) { - confBuilder.BindSetEntry( - GenericType.Class, - tId.TaskId.ToString(CultureInfo.InvariantCulture)); + confBuilder.BindSetEntry("" + tId.TaskId); } } return confBuilder - .BindNamedParameter( - GenericType.Class, - _rootId.ToString(CultureInfo.InvariantCulture)) - .Build(); + .BindNamedParam("" + _rootId) + .Build(); } /// @@ -328,10 +333,13 @@ public void TopologyUpdateResponse( if (_nodesWaitingToJoinTopology.Count > 0) { - Log.Log(Level.Info, - "Tasks [{0}] are added to topology in iteration {1}", - string.Join(",", _nodesWaitingToJoinTopology), - _iteration); + if (Log.IsLoggable(Level.Info)) + { + Log.Log(Level.Info, + "Tasks [{0}] are added to topology in iteration {1}", + string.Join(",", _nodesWaitingToJoinTopology), + _iteration); + } _availableDataPoints += _nodesWaitingToJoinTopology.Count; failureStateMachine.Value.AddDataPoints(_nodesWaitingToJoinTopology.Count, false); @@ -388,18 +396,8 @@ public IList Reconfigure( List messages = new List(); lock (_lock) - { - int iter; - - if (info.IsPresent()) - { - iter = int.Parse(info.Value.Split(':')[0]); - } - else - { - iter = iteration.Value; - } - + { + int iter = info.IsPresent() ? 
int.Parse(info.Value.Split(':')[0]) : iteration.Value; var children = _lostNodesToBeRemoved.ToList(); var update = new List() { @@ -430,19 +428,7 @@ public string LogFinalStatistics() private void BuildTopology() { - IEnumerator iter = - _sorted ? - _nodes.OrderBy(kv => kv.Key).Select(kv => kv.Value).GetEnumerator() : - _nodes.Values.GetEnumerator(); - var root = _nodes[_rootId]; - - while (iter.MoveNext()) - { - if (iter.Current.TaskId != _rootId) - { - root.AddChild(iter.Current); - } - } + _nodes[_rootId].AddChild(_nodes.Values.Where(n => n.TaskId != _rootId)); } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs index e3ed8ec115..f13e195cba 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs @@ -143,7 +143,7 @@ public override void WaitForTaskRegistration(CancellationTokenSource cancellatio { try { - _commLayer.WaitForTaskRegistration(_children.Values.ToList(), cancellationSource, _nodesToRemove); + _commLayer.WaitForTaskRegistration(_children.Values, cancellationSource, _nodesToRemove); } catch (Exception e) { @@ -231,10 +231,9 @@ public override void OnNext(DriverMessagePayload message) foreach (var taskId in _nodesToRemove.Keys) { - var id = Utils.GetTaskNum(taskId); - _nodesToRemove.TryRemove(taskId, out byte val); - _children.TryRemove(id, out string str); + _children.TryRemove(Utils.GetTaskNum(taskId), out string str); } + _nodesToRemove.Clear(); } // Unblock this broadcast round. 
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs index 8a0ad7b3f4..41d87cd311 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs @@ -109,7 +109,6 @@ public override void WaitCompletionBeforeDisposing() { // The topology is still trying to send messages, wait. Thread.Sleep(100); - } } @@ -130,7 +129,7 @@ public virtual void WaitForTaskRegistration(CancellationTokenSource cancellation { try { - _commLayer.WaitForTaskRegistration(_children.Values.ToList(), cancellationSource); + _commLayer.WaitForTaskRegistration(_children.Values, cancellationSource); } catch (Exception e) { @@ -151,23 +150,20 @@ public virtual void WaitForTaskRegistration(CancellationTokenSource cancellation /// public virtual ElasticGroupCommunicationMessage Receive(CancellationTokenSource cancellationSource) { - ElasticGroupCommunicationMessage message; - int retry = 1; - - while (!_messageQueue.TryTake(out message, _timeout, cancellationSource.Token)) + for (int retry = 0; retry < _retry; ++retry) { - if (cancellationSource.IsCancellationRequested) + if (_messageQueue.TryTake(out ElasticGroupCommunicationMessage message, _timeout, cancellationSource.Token)) { - throw new OperationCanceledException("Received cancellation request: stop receiving."); + return message; } - if (retry++ > _retry) + if (cancellationSource.IsCancellationRequested) { - throw new Exception($"Failed to receive message after {_retry} try."); + throw new OperationCanceledException("Received cancellation request: stop receiving."); } } - return message; + throw new TimeoutException($"Failed to receive message after {_retry} try."); } /// @@ -191,12 +187,9 @@ 
public virtual void Send(ElasticGroupCommunicationMessage message, CancellationT /// The message that need to be devlivered to the operator public virtual void OnNext(NsMessage message) { - if (_messageQueue.IsAddingCompleted) + if (_messageQueue.IsAddingCompleted && _messageQueue.Count > 0) { - if (_messageQueue.Count > 0) - { - throw new IllegalStateException("Trying to add messages to a closed non-empty queue."); - } + throw new IllegalStateException("Trying to add messages to a closed non-empty queue."); } _messageQueue.Add(message.Data); @@ -249,8 +242,7 @@ public virtual void Dispose() /// The singal in case the task is cancelled protected virtual void Send(CancellationTokenSource cancellationSource) { - ElasticGroupCommunicationMessage message; - while (_sendQueue.TryDequeue(out message) && !cancellationSource.IsCancellationRequested) + while (_sendQueue.TryDequeue(out ElasticGroupCommunicationMessage message) && !cancellationSource.IsCancellationRequested) { foreach (var child in _children.Values) { From 08e2f90419780a36d541f1a73762c62b2530a238 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Mon, 25 Mar 2019 15:37:25 -0700 Subject: [PATCH 28/29] Adressed new Sergiy's comments. 
--- .../Topology/Logical/Impl/FlatTopology.cs | 35 ++++++++----------- .../Org.Apache.REEF.Network/Elastic/Utils.cs | 2 +- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs index c0c752d818..035662b239 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs @@ -18,8 +18,6 @@ using System; using Org.Apache.REEF.Tang.Interface; using System.Collections.Generic; -using Org.Apache.REEF.Tang.Util; -using System.Globalization; using Org.Apache.REEF.Tang.Exceptions; using Org.Apache.REEF.Utilities.Logging; using System.Linq; @@ -42,20 +40,20 @@ public class FlatTopology : ITopology { private static readonly Logger Log = Logger.GetLogger(typeof(FlatTopology)); - private string _rootTaskId = string.Empty; - private int _rootId; - private string _taskStage = string.Empty; + private string _rootTaskId = null; + private readonly int _rootId; + private string _taskStage = null; private volatile int _iteration = 1; private bool _finalized = false; - private readonly bool _sorted; private readonly IDictionary _nodes; + private DataNode _root; // This is just for caching private readonly HashSet _lostNodesToBeRemoved = new HashSet(); private HashSet _nodesWaitingToJoinTopologyNextIteration = new HashSet(); private HashSet _nodesWaitingToJoinTopology = new HashSet(); private volatile int _availableDataPoints = 0; - private int _totNumberofNodes; + private int _totalDataPoints = 0; private readonly object _lock = new object(); @@ -68,10 +66,9 @@ public class FlatTopology : ITopology public FlatTopology(int rootId, bool sorted = false) { _rootId = rootId; - _sorted = sorted; OperatorId = -1; - if (_sorted) + if (sorted) { _nodes = new SortedDictionary(); } @@ -137,7 +134,7 @@ public bool 
AddTask(string taskId, IFailureStateMachine failureMachine) // New node but elastically added. It should be gracefully added to the topology. _nodesWaitingToJoinTopologyNextIteration.Add(taskId); _nodes[id].FailState = DataNodeState.Unreachable; - _nodes[_rootId].Children.Add(_nodes[id]); + _root.Children.Add(_nodes[id]); failureMachine.AddDataPoints(1, true); failureMachine.RemoveDataPoints(1); return false; @@ -206,7 +203,7 @@ public int RemoveTask(string taskId) /// True if the topology is ready to be scheduled public bool CanBeScheduled() { - return _nodes.ContainsKey(_rootId); + return _root != null; } /// @@ -222,7 +219,7 @@ public ITopology Build() throw new IllegalStateException("Topology cannot be built more than once"); } - if (!_nodes.ContainsKey(_rootId)) + if (_root == null) { throw new IllegalStateException("Topology cannot be built because the root node is missing"); } @@ -252,8 +249,7 @@ public ITopology Build() /// public string LogTopologyState() { - var root = _nodes[_rootId]; - var children = root.Children.GetEnumerator(); + var children = _root.Children.GetEnumerator(); string output = _rootId + "\n"; while (children.MoveNext()) { @@ -286,9 +282,7 @@ public IConfiguration GetTaskConfiguration(int taskId) if (taskId == _rootId) { - var root = _nodes[_rootId]; - - foreach (var tId in root.Children) + foreach (var tId in _root.Children) { confBuilder.BindSetEntry("" + tId.TaskId); } @@ -367,7 +361,7 @@ public void OnNewIteration(int iteration) iteration - 1, _availableDataPoints); _iteration = iteration; - _totNumberofNodes += _availableDataPoints; + _totalDataPoints += _availableDataPoints; lock (_lock) { @@ -423,12 +417,13 @@ public string LogFinalStatistics() return string.Format( "\nAverage number of nodes in the topology of Operator {0}: {1}", OperatorId, - _iteration >= 2 ? (float)_totNumberofNodes / (_iteration - 1) : _availableDataPoints); + _iteration >= 2 ? 
(float)_totalDataPoints / (_iteration - 1) : _availableDataPoints); } private void BuildTopology() { - _nodes[_rootId].AddChild(_nodes.Values.Where(n => n.TaskId != _rootId)); + _root = _nodes[_rootId]; + _root.AddChild(_nodes.Values.Where(n => n.TaskId != _rootId)); } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs index a2fdf504c2..c9659f3e2d 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Utils.cs @@ -119,7 +119,7 @@ public static string GetContextIdFromTaskId(string taskId) /// An id merging the three fields private static string BuildIdentifier(string first, string second, int third) { - return string.Format(CultureInfo.InvariantCulture, "{0}-{1}-{2}", first, second, third); + return $"{first}-{second}-{third}"; } /// From eb187cb9262be51599e987e66de3d61afef74c3f Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Thu, 4 Apr 2019 16:39:10 -0700 Subject: [PATCH 29/29] Other round of comments. 
--- .../Elastic/Comm/ITaskMessageResponse.cs | 3 +- .../Comm/Impl/ElasticDriverMessageImpl.cs | 4 +- .../Impl/ElasticGroupCommunicationMessage.cs | 4 +- .../Comm/Impl/FailureMessagePayload.cs | 2 +- .../Comm/Impl/TopologyMessagePayload.cs | 8 +-- .../Elastic/Comm/Impl/TopologyUpdate.cs | 10 ++-- .../Elastic/Comm/Impl/UpdateMessagePayload.cs | 2 +- .../Driver/Default/DefaultElasticStage.cs | 7 ++- .../Default/DefaultElasticTaskSetManager.cs | 2 +- .../DefaultElasticTaskSetManagerParameters.cs | 20 +++---- .../Elastic/Failures/Default/FailEvent.cs | 2 +- .../Failures/Default/ReconfigureEvent.cs | 2 +- .../Logical/Default/DefaultOneToN.cs | 16 ++--- .../Operators/Logical/ElasticOperator.cs | 11 ++-- .../Physical/Default/DefaultOneToN.cs | 2 +- .../Task/Default/DefaultElasticStage.cs | 4 +- .../Elastic/Task/NodeIdentifier.cs | 4 +- .../Elastic/Topology/Logical/ITopology.cs | 5 +- .../Elastic/Topology/Logical/Impl/DataNode.cs | 20 ++----- .../Topology/Logical/Impl/EmptyTopology.cs | 28 +++------ .../Topology/Logical/Impl/FlatTopology.cs | 58 ++++++------------- .../Physical/Default/OneToNTopology.cs | 4 +- ...peratorTopologyWithDefaultCommunication.cs | 6 +- .../Physical/DriverAwareOperatorTopology.cs | 2 +- .../Topology/Physical/OperatorTopology.cs | 8 +-- 25 files changed, 98 insertions(+), 136 deletions(-) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs index 94d4fdfda3..1f667a667a 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/ITaskMessageResponse.cs @@ -31,9 +31,8 @@ public interface ITaskMessageResponse /// Method triggered when a task to driver message is received. 
/// /// The task message for the operator - /// A list of messages containing the instructions for the task /// If the message cannot be handled correctly or /// generate an incorrect state - void OnTaskMessage(ITaskMessage message, ref List returnMessages); + IEnumerable OnTaskMessage(ITaskMessage message); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs index 89592d1d89..899b2aac9c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticDriverMessageImpl.cs @@ -46,12 +46,12 @@ public ElasticDriverMessageImpl( /// /// The destination task of the message. - public string Destination { get; private set; } + public string Destination { get; } /// /// Operator and event specific payload of the message. /// - public DriverMessagePayload Message { get; private set; } + public DriverMessagePayload Message { get; } /// /// Utility method to serialize the message for communication over the network. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs index 1407ae8763..700fcf591d 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/ElasticGroupCommunicationMessage.cs @@ -47,11 +47,11 @@ protected ElasticGroupCommunicationMessage( /// /// Returns the stage. - public string StageName { get; private set; } + public string StageName { get; } /// /// Returns the operator id. 
/// - public int OperatorId { get; private set; } + public int OperatorId { get; } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/FailureMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/FailureMessagePayload.cs index da2bd35b6a..24b92c69f3 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/FailureMessagePayload.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/FailureMessagePayload.cs @@ -35,7 +35,7 @@ internal sealed class FailureMessagePayload : TopologyMessagePayload /// The stage context for the message /// The id of the operator receiving the topology update /// The iteration in which the update takes effect - public FailureMessagePayload(List updates, string stageName, int operatorId, int iteration) + public FailureMessagePayload(IEnumerable updates, string stageName, int operatorId, int iteration) : base(DriverMessagePayloadType.Failure, updates, stageName, operatorId, iteration) { } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs index d27f50e71f..d70151ea4d 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyMessagePayload.cs @@ -39,15 +39,15 @@ internal class TopologyMessagePayload : DriverMessagePayload /// The id of the operator receiving the topology update /// The iteration in which the update takes effect public TopologyMessagePayload( - DriverMessagePayloadType type, - List updates, + DriverMessagePayloadType type, + IEnumerable updates, string stageName, int operatorId, int iteration) : base(stageName, operatorId, iteration) { PayloadType = type; - TopologyUpdates = updates; + TopologyUpdates = updates.ToList(); } /// @@ -64,7 +64,7 @@ public override object Clone() /// /// The updates for the topology. 
/// - internal List TopologyUpdates { get; private set; } + internal List TopologyUpdates { get; } /// /// Creates a topology message payload out of memory buffer. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyUpdate.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyUpdate.cs index 0b0b4b9b86..8fcc02c025 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyUpdate.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/TopologyUpdate.cs @@ -35,10 +35,10 @@ internal sealed class TopologyUpdate : ICloneable /// The node receiving the update /// The update to the children of the node /// The update for the root of the node - public TopologyUpdate(string node, List children, string root) + public TopologyUpdate(string node, IEnumerable children, string root) { Node = node; - Children = children; + Children = children.ToList(); Root = root; } @@ -47,7 +47,7 @@ public TopologyUpdate(string node, List children, string root) /// /// The node receiving the update /// The update to the children of the node - public TopologyUpdate(string node, List children) : this(node, children, string.Empty) + public TopologyUpdate(string node, IEnumerable children) : this(node, children, string.Empty) { } @@ -63,7 +63,7 @@ public TopologyUpdate(string node, List children) : this(node, children, /// /// The node receiving the update. /// - public string Node { get; private set; } + public string Node { get; } /// /// The updates for the children. @@ -73,7 +73,7 @@ public TopologyUpdate(string node, List children) : this(node, children, /// /// The updates for the root. /// - public string Root { get; private set; } + public string Root { get; } /// /// The total memory size for the update (used for serialization). 
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs index 910a133d3d..adc295df62 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Comm/Impl/UpdateMessagePayload.cs @@ -34,7 +34,7 @@ internal sealed class UpdateMessagePayload : TopologyMessagePayload /// The stage context for the message /// The id of the operator receiving the topology update /// The iteration in which the update takes effect - public UpdateMessagePayload(List updates, string stageName, int operatorId, int iteration) + public UpdateMessagePayload(IEnumerable updates, string stageName, int operatorId, int iteration) : base(DriverMessagePayloadType.Update, updates, stageName, operatorId, iteration) { } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs index ab3d6ffbd8..9ce7ba7450 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticStage.cs @@ -363,10 +363,9 @@ public string LogFinalStatistics() /// Method triggered when a task to driver message is received. 
/// /// The task message for the operator - /// A list of messages containing the instructions for the task /// If the message cannot be handled correctly or generate /// an incorrent state - public void OnTaskMessage(ITaskMessage message, ref List returnMessages) + public IEnumerable OnTaskMessage(ITaskMessage message) { int offset = 0; var length = BitConverter.ToUInt16(message.Message, offset); @@ -377,8 +376,10 @@ public void OnTaskMessage(ITaskMessage message, ref List if (stageName == StageName) { // Messages have to be propagated down to the operators - PipelineRoot.OnTaskMessage(message, ref returnMessages); + return PipelineRoot.OnTaskMessage(message); } + + return new IElasticDriverMessage[] { }; } #region Failure Response diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs index e7e6127972..f124104f19 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManager.cs @@ -676,7 +676,7 @@ public void OnTaskMessage(ITaskMessage message) { foreach (var stage in _stages.Values) { - stage.OnTaskMessage(message, ref returnMessages); + returnMessages.AddRange(stage.OnTaskMessage(message)); } } catch (IllegalStateException e) diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs index 934b5a113e..ec54652c41 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Driver/Default/DefaultElasticTaskSetManagerParameters.cs @@ -59,51 +59,51 @@ private DefaultElasticTaskSetManagerParameters( /// /// The clock for scheduling alarms. 
/// - public FailuresClock Clock { get; private set; } + public FailuresClock Clock { get; } /// /// Timeout after which computation is considered inactive. /// - public int Timeout { get; private set; } + public int Timeout { get; } /// /// How many times a message communication can be retried. /// - public int Retry { get; private set; } + public int Retry { get; } /// /// How much time to wait between messages retry. /// - public int WaitTime { get; private set; } + public int WaitTime { get; } /// /// Supported number of task failures. /// - public int NumTaskFailures { get; private set; } + public int NumTaskFailures { get; } /// /// Supported number of evaluator failures. /// - public int NumEvaluatorFailures { get; private set; } + public int NumEvaluatorFailures { get; } /// /// The rack name when spawning new evaluators. /// - public string NewEvaluatorRackName { get; private set; } + public string NewEvaluatorRackName { get; } /// /// The batch id when spawning new evaluators. /// - public string NewEvaluatorBatchId { get; private set; } + public string NewEvaluatorBatchId { get; } /// /// Number of cores for new evaluators. /// - public int NewEvaluatorNumCores { get; private set; } + public int NewEvaluatorNumCores { get; } /// /// Memory size for new evaluators. /// - public int NewEvaluatorMemorySize { get; private set; } + public int NewEvaluatorMemorySize { get; } } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/FailEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/FailEvent.cs index 370384d3b9..9d7763f006 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/FailEvent.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/FailEvent.cs @@ -47,7 +47,7 @@ public int FailureEvent /// /// The identifier of the task triggering the event. /// - public string TaskId { get; private set; } + public string TaskId { get; } /// /// The opeartor id in which the failure is rised. 
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/ReconfigureEvent.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/ReconfigureEvent.cs index c5193cd5b3..3c9a2579df 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/ReconfigureEvent.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Failures/Default/ReconfigureEvent.cs @@ -57,7 +57,7 @@ public virtual int FailureEvent /// /// The iteration in which the failure is rised. /// - public Optional Iteration { get; set; } = Optional.Empty(); + public int? Iteration { get; set; } = null; /// /// The identifier of the task triggering the event. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs index e30ac6811c..87d6beef5c 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/Default/DefaultOneToN.cs @@ -30,8 +30,7 @@ using Org.Apache.REEF.Network.Elastic.Failures.Default; using Org.Apache.REEF.Tang.Implementations.Tang; using Org.Apache.REEF.Network.Elastic.Config; -using System.Globalization; -using Org.Apache.REEF.Tang.Util; +using System.Linq; namespace Org.Apache.REEF.Network.Elastic.Operators.Logical.Default { @@ -82,13 +81,15 @@ public DefaultOneToN( /// True if the operator has reacted to the task message protected override bool ReactOnTaskMessage( ITaskMessage message, - ref List returnMessages) + out IEnumerable returnMessages) { var offset = BitConverter.ToUInt16(message.Message, 0); offset += sizeof(ushort); var msgReceived = (TaskMessageType)BitConverter.ToUInt16(message.Message, offset); offset += sizeof(ushort); + returnMessages = new List(); + switch (msgReceived) { case TaskMessageType.JoinTopology: @@ -122,10 +123,9 @@ protected override bool ReactOnTaskMessage( Log.Log(Level.Info, "Received topology update request for {0} 
{1} from {2}", OperatorType, _id, message.TaskId); - _topology.TopologyUpdateResponse( + ((List)returnMessages).AddRange(_topology.TopologyUpdateResponse( message.TaskId, - ref returnMessages, - Optional.Of(_failureMachine)); + Optional.Of(_failureMachine))); if (_stop) { @@ -135,7 +135,7 @@ protected override bool ReactOnTaskMessage( } else { - returnMessages.Clear(); + ((List)returnMessages).Clear(); // Remove all messages. Log.Log(Level.Info, "Operator {0} is in stopped: Waiting.", OperatorType); } } @@ -170,7 +170,7 @@ public override void OnReconfigure(ref ReconfigureEvent reconfigureEvent) reconfigureEvent.FailureResponse.AddRange( _topology.Reconfigure( reconfigureEvent.FailedTask.Value.Id, - Optional.OfNullable(error?.AdditionalInfo), + error?.AdditionalInfo, reconfigureEvent.Iteration)); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs index fa1dcff3c9..865b3aba71 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Logical/ElasticOperator.cs @@ -205,12 +205,14 @@ public ElasticOperator Broadcast( /// True if the message was managed correctly, false otherwise /// If the message cannot be handled correctly or /// generate an incorrent state - public void OnTaskMessage(ITaskMessage message, ref List returnMessages) + public IEnumerable OnTaskMessage(ITaskMessage message) { - if (!ReactOnTaskMessage(message, ref returnMessages)) + if (!ReactOnTaskMessage(message, out IEnumerable returnMessages)) { - _next?.OnTaskMessage(message, ref returnMessages); + return returnMessages.Concat(_next?.OnTaskMessage(message) ?? new IElasticDriverMessage[] { }); } + + return returnMessages; } /// @@ -464,8 +466,9 @@ protected virtual bool PropagateFailureDownstream() /// Incoming message from a task /// Zero or more reply messages for the task /// True if the operator has reacted to the task
message - protected virtual bool ReactOnTaskMessage(ITaskMessage message, ref List returnMessages) + protected virtual bool ReactOnTaskMessage(ITaskMessage message, out IEnumerable returnMessages) { + returnMessages = new IElasticDriverMessage[] { }; return false; } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs index 84d672f853..4c5782a2fc 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Operators/Physical/Default/DefaultOneToN.cs @@ -59,7 +59,7 @@ internal DefaultOneToN(int id, bool isLast, OneToNTopology topology) /// /// The operator identifier. /// - public int OperatorId { get; private set; } + public int OperatorId { get; } /// /// The operator type. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs index 724232ff65..5cce608067 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/Default/DefaultElasticStage.cs @@ -79,12 +79,12 @@ private DefaultElasticStage( /// /// The stage name. /// - public string StageName { get; private set; } + public string StageName { get; } /// /// The workflow of the stage. /// - public Workflow Workflow { get; private set; } + public Workflow Workflow { get; } /// /// Initializes the communication group. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeIdentifier.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeIdentifier.cs index 42f517d88c..33ed8f1d42 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeIdentifier.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Task/NodeIdentifier.cs @@ -30,12 +30,12 @@ public struct NodeIdentifier : INodeIdentifier /// /// The stage name. 
/// - public string StageName { get; private set; } + public string StageName { get; } /// /// The operator name. /// - public int OperatorId { get; private set; } + public int OperatorId { get; } /// /// Constructor. diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs index d6506e94ab..ba74942063 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/ITopology.cs @@ -93,9 +93,8 @@ public interface ITopology /// with the driver's one. /// /// The identifier of the task asking for the update - /// A list of message containing the topology update /// An optional failure machine to log updates - void TopologyUpdateResponse(string taskId, ref List returnMessages, Optional failureStateMachine); + IEnumerable TopologyUpdateResponse(string taskId, Optional failureStateMachine); /// /// Action to trigger when the operator recdeives a notification that a new iteration is started. @@ -110,7 +109,7 @@ public interface ITopology /// Some additional topology-specific information /// The optional iteration number in which the event occurred /// One or more messages for reconfiguring the tasks - IList Reconfigure(string taskId, Optional info, Optional iteration); + IEnumerable Reconfigure(string taskId,string info = null, int? iteration = null); /// /// Log the final statistics of the operator. 
diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/DataNode.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/DataNode.cs index e0eae3ad94..060b8a34e9 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/DataNode.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/DataNode.cs @@ -29,7 +29,6 @@ namespace Org.Apache.REEF.Network.Elastic.Topology.Logical.Impl internal sealed class DataNode { private readonly bool _isRoot; - private readonly List _children; /// /// Construct a node using a given task id. @@ -42,15 +41,12 @@ public DataNode( { TaskId = taskId; _isRoot = isRoot; - FailState = DataNodeState.Reachable; - - _children = new List(); } /// /// The current state for the node. /// - public DataNodeState FailState { get; set; } + public DataNodeState FailState { get; set; } = DataNodeState.Reachable; /// /// The parent of the target node. @@ -60,9 +56,9 @@ public DataNode( /// /// Add a node to the list of children nodes of the current one. /// - public void AddChild(IEnumerable child) + public void AddChildren(IEnumerable child) { - _children.AddRange(child); + Children.AddRange(child); } /// @@ -73,17 +69,11 @@ public void AddChild(IEnumerable child) /// /// Return how many children the current node has. /// - public int NumberOfChildren - { - get { return _children.Count; } - } + public int NumberOfChildren => Children.Count; /// /// Return the list of children fro the current node. 
/// - public IList Children - { - get { return _children; } - } + public List Children { get; } = new List(); } } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs index 9d5caeca59..2547daa0c1 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/EmptyTopology.cs @@ -19,7 +19,6 @@ using Org.Apache.REEF.Tang.Exceptions; using System.Collections.Generic; using Org.Apache.REEF.Network.Elastic.Comm; -using System; using Org.Apache.REEF.Network.Elastic.Failures; using Org.Apache.REEF.Utilities; using Org.Apache.REEF.Utilities.Attributes; @@ -32,23 +31,14 @@ namespace Org.Apache.REEF.Network.Elastic.Topology.Logical.Impl /// Used as a placeholder when no topology is required. /// [Unstable("0.16", "API may change")] - class EmptyTopology : ITopology + internal class EmptyTopology : ITopology { - private bool _finalized; - - /// - /// Constructor for the empty topology. - /// - public EmptyTopology() - { - _finalized = false; - OperatorId = -1; - } + private bool _finalized = false; /// /// The identifier of the operator using the topology. /// - public int OperatorId { get; set; } + public int OperatorId { get; set; } = -1; /// /// The stage of the operator using the topology. 
@@ -93,7 +83,7 @@ public bool CanBeScheduled() /// The same finalized topology public ITopology Build() { - if (_finalized == true) + if (_finalized) { throw new IllegalStateException("Topology cannot be built more than once"); } @@ -103,7 +93,7 @@ public ITopology Build() throw new IllegalStateException("Topology cannot be built because not linked to any operator"); } - if (StageName == string.Empty) + if (StageName == null) { throw new IllegalStateException("Topology cannot be built because not linked to any stage"); } @@ -138,10 +128,10 @@ public string LogTopologyState() /// This method is triggered when a node detects a change in the topology and asks the driver for an update. /// /// The identifier of the task asking for the update - /// A list of message containing the topology update /// An optional failure machine to log updates - public void TopologyUpdateResponse(string taskId, ref List returnMessages, Optional failureStateMachine) + public IEnumerable TopologyUpdateResponse(string taskId, Optional failureStateMachine) { + return new IElasticDriverMessage[] { }; } /// @@ -160,9 +150,9 @@ public void OnNewIteration(int iteration) /// Some additional topology-specific information /// The optional iteration number in which the event occurred /// An empty list of messages - public IList Reconfigure(string taskId, Optional info, Optional iteration) + public IEnumerable Reconfigure(string taskId, string info = null, int? 
iteration = null) { - return new List(); + return new IElasticDriverMessage[] { }; } /// diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs index 035662b239..8f0bd0decb 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Logical/Impl/FlatTopology.cs @@ -133,8 +133,8 @@ public bool AddTask(string taskId, IFailureStateMachine failureMachine) { // New node but elastically added. It should be gracefully added to the topology. _nodesWaitingToJoinTopologyNextIteration.Add(taskId); - _nodes[id].FailState = DataNodeState.Unreachable; - _root.Children.Add(_nodes[id]); + dnode.FailState = DataNodeState.Unreachable; + _root.Children.Add(dnode); failureMachine.AddDataPoints(1, true); failureMachine.RemoveDataPoints(1); return false; @@ -174,12 +174,11 @@ public int RemoveTask(string taskId) lock (_lock) { - if (!_nodes.ContainsKey(id)) + if (!_nodes.TryGetValue(id, out DataNode node)) { throw new ArgumentException("Task is not part of this topology"); } - DataNode node = _nodes[id]; var prevState = node.FailState; node.FailState = DataNodeState.Lost; _nodesWaitingToJoinTopologyNextIteration.Remove(taskId); @@ -214,7 +213,7 @@ public bool CanBeScheduled() /// The same finalized topology public ITopology Build() { - if (_finalized == true) + if (_finalized) { throw new IllegalStateException("Topology cannot be built more than once"); } @@ -249,20 +248,8 @@ public ITopology Build() /// public string LogTopologyState() { - var children = _root.Children.GetEnumerator(); - string output = _rootId + "\n"; - while (children.MoveNext()) - { - var rep = "X"; - if (children.Current.FailState == DataNodeState.Reachable) - { - rep = children.Current.TaskId.ToString(); - } - - output += rep + " "; - } - - return output; + return _rootId + "\n" + string.Join(" ", 
_root.Children.Select(node => + node.FailState == DataNodeState.Reachable ? "" + node.TaskId : "X")); } /// @@ -298,11 +285,9 @@ public IConfiguration GetTaskConfiguration(int taskId) /// with the driver's one. /// /// The identifier of the task asking for the update - /// A list of message containing the topology update /// An optional failure machine to log updates - public void TopologyUpdateResponse( + public IEnumerable TopologyUpdateResponse( string taskId, - ref List returnMessages, Optional failureStateMachine) { if (taskId != _rootTaskId) @@ -317,14 +302,10 @@ public void TopologyUpdateResponse( lock (_lock) { - var list = _nodesWaitingToJoinTopology.ToList(); - var update = new TopologyUpdate(_rootTaskId, list); - var data = new UpdateMessagePayload( - new List() { update }, StageName, OperatorId, _iteration); + var update = new TopologyUpdate(_rootTaskId, _nodesWaitingToJoinTopology); + var data = new UpdateMessagePayload( new[]{ update }, StageName, OperatorId, _iteration); var returnMessage = new ElasticDriverMessageImpl(_rootTaskId, data); - returnMessages.Add(returnMessage); - if (_nodesWaitingToJoinTopology.Count > 0) { if (Log.IsLoggable(Level.Info)) @@ -346,6 +327,8 @@ public void TopologyUpdateResponse( _nodesWaitingToJoinTopology.Clear(); } + + return new[] { returnMessage }; } } @@ -377,10 +360,10 @@ public void OnNewIteration(int iteration) /// Some additional topology-specific information /// The optional iteration number in which the event occurred /// One or more messages for reconfiguring the Tasks - public IList Reconfigure( + public IEnumerable Reconfigure( string taskId, - Optional info, - Optional iteration) + string info = null, + int? iteration = null) { if (taskId == _rootTaskId) { @@ -390,14 +373,9 @@ public IList Reconfigure( List messages = new List(); lock (_lock) - { - int iter = info.IsPresent() ? 
int.Parse(info.Value.Split(':')[0]) : iteration.Value; - var children = _lostNodesToBeRemoved.ToList(); - var update = new List() - { - new TopologyUpdate(_rootTaskId, children) - }; - var data = new FailureMessagePayload(update, StageName, OperatorId, -1); + { + var update = new TopologyUpdate(_rootTaskId, _lostNodesToBeRemoved); + var data = new FailureMessagePayload(new[] { update }, StageName, OperatorId, -1); var returnMessage = new ElasticDriverMessageImpl(_rootTaskId, data); Log.Log(Level.Info, "Task {0} is removed from topology", taskId); @@ -423,7 +401,7 @@ public string LogFinalStatistics() private void BuildTopology() { _root = _nodes[_rootId]; - _root.AddChild(_nodes.Values.Where(n => n.TaskId != _rootId)); + _root.AddChildren(_nodes.Values.Where(n => n.TaskId != _rootId)); } } } \ No newline at end of file diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs index f13e195cba..a878925626 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OneToNTopology.cs @@ -58,12 +58,12 @@ internal abstract class OneToNTopology : OperatorTopologyWithDefaultCommunicatio /// Maximum wait time for topology disposal /// Layer responsible for communication /// Layer responsible for saving and retrieving checkpoints - public OneToNTopology( + protected OneToNTopology( string stageName, string taskId, string rootTaskId, int operatorId, - ISet children, + IEnumerable children, bool piggyback, int retry, int timeout, diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs index 41d87cd311..78115f71c6 100644 --- 
a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/Default/OperatorTopologyWithDefaultCommunication.cs @@ -19,11 +19,11 @@ using System; using System.Collections.Concurrent; using System.Threading; -using System.Linq; using Org.Apache.REEF.Network.NetworkService; using Org.Apache.REEF.Tang.Exceptions; using Org.Apache.REEF.Network.Elastic.Comm.Impl; using Org.Apache.REEF.Utilities.Attributes; +using System.Collections.Generic; namespace Org.Apache.REEF.Network.Elastic.Topology.Physical.Default { @@ -46,8 +46,10 @@ internal abstract class OperatorTopologyWithDefaultCommunication : protected readonly ConcurrentQueue _sendQueue = new ConcurrentQueue(); + protected readonly BlockingCollection _messageQueue = new BlockingCollection(); + protected readonly ConcurrentDictionary _children = new ConcurrentDictionary(); protected readonly CancellationTokenSource _cancellationSignal = new CancellationTokenSource(); @@ -62,7 +64,7 @@ internal abstract class OperatorTopologyWithDefaultCommunication : /// After how long the topology waits for an event /// Maximum wait time for topology disposal /// Class responsible for communication - public OperatorTopologyWithDefaultCommunication( + protected OperatorTopologyWithDefaultCommunication( string stageName, string taskId, string rootTaskId, diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/DriverAwareOperatorTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/DriverAwareOperatorTopology.cs index 38dc9f9e87..2923b5d4e3 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/DriverAwareOperatorTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/DriverAwareOperatorTopology.cs @@ -34,7 +34,7 @@ public abstract class DriverAwareOperatorTopology : OperatorTopology, IObserver< /// The identifier of the task the topology is running on /// 
The identifier of the root note in the topology /// The identifier of the operator for this topology - public DriverAwareOperatorTopology(string stageName, string taskId, string rootTaskId, int operatorId) + protected DriverAwareOperatorTopology(string stageName, string taskId, string rootTaskId, int operatorId) : base(stageName, taskId, rootTaskId, operatorId) { } diff --git a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs index 5713d0d7b1..8e9bbc7ee5 100644 --- a/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs +++ b/lang/cs/Org.Apache.REEF.Network/Elastic/Topology/Physical/OperatorTopology.cs @@ -34,7 +34,7 @@ public abstract class OperatorTopology : INodeIdentifier /// The identifier of the task the topology is running on /// The identifier of the root note in the topology /// The identifier of the operator for this topology - public OperatorTopology(string stageName, string taskId, string rootTaskId, int operatorId) + protected OperatorTopology(string stageName, string taskId, string rootTaskId, int operatorId) { StageName = stageName; TaskId = taskId; @@ -45,17 +45,17 @@ public OperatorTopology(string stageName, string taskId, string rootTaskId, int /// /// The stage name context in which the topology is running. /// - public string StageName { get; private set; } + public string StageName { get; } /// /// The identifier of the operator in which the topology is running. /// - public int OperatorId { get; private set; } + public int OperatorId { get; } /// /// The identifier of the task in which the topology is running. /// - protected string TaskId { get; private set; } + protected string TaskId { get; } /// /// The task identifier of the root node of the topology.