From fe92118797497702c314cda1a10a41a9d8387c23 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 23 Feb 2026 13:26:06 -0800 Subject: [PATCH 1/3] moved placement of session.UpdatedRuntimeState call to be before we commit any outbound messages --- .../AzureStorageOrchestrationService.cs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/DurableTask.AzureStorage/AzureStorageOrchestrationService.cs b/src/DurableTask.AzureStorage/AzureStorageOrchestrationService.cs index 38cb7fac3..060905bbb 100644 --- a/src/DurableTask.AzureStorage/AzureStorageOrchestrationService.cs +++ b/src/DurableTask.AzureStorage/AzureStorageOrchestrationService.cs @@ -1207,6 +1207,9 @@ public async Task CompleteTaskOrchestrationWorkItemAsync( continuedAsNewMessage, orchestrationState))); + // update the runtime state and execution id stored in the session + session.UpdateRuntimeState(runtimeState); + // First, add new messages into the queue. If a failure happens after this, duplicate messages will // be written after the retry, but the results of those messages are expected to be de-dup'd later. // This provider needs to ensure that response messages are not processed until the history a few @@ -1231,8 +1234,6 @@ await this.CommitOutboundQueueMessages( try { await this.trackingStore.UpdateStateAsync(runtimeState, workItem.OrchestrationRuntimeState, instanceId, executionId, session.ETags, session.TrackingStoreContext); - // update the runtime state and execution id stored in the session - session.UpdateRuntimeState(runtimeState); // if we deferred some messages, and the execution id of this instance has changed, redeliver them if (session.DeferredMessages.Count > 0 From 20846e8bd0c3732f9ec69f3909b6ba2a30106e45 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Tue, 24 Feb 2026 09:56:26 -0800 Subject: [PATCH 2/3] removed the unnecessary changes from the orchestration service, due to initial misdiagnosis, and changed the IsOutOfOrder logic instead --- .../AzureStorageOrchestrationService.cs | 5 +- .../Messaging/OrchestrationSession.cs | 51 ++++++++----------- 2 files changed, 22 insertions(+), 34 deletions(-) diff --git a/src/DurableTask.AzureStorage/AzureStorageOrchestrationService.cs b/src/DurableTask.AzureStorage/AzureStorageOrchestrationService.cs index 060905bbb..38cb7fac3 100644 --- a/src/DurableTask.AzureStorage/AzureStorageOrchestrationService.cs +++ b/src/DurableTask.AzureStorage/AzureStorageOrchestrationService.cs @@ -1207,9 +1207,6 @@ public async Task CompleteTaskOrchestrationWorkItemAsync( continuedAsNewMessage, orchestrationState))); - // update the runtime state and execution id stored in the session - session.UpdateRuntimeState(runtimeState); - // First, add new messages into the queue. If a failure happens after this, duplicate messages will // be written after the retry, but the results of those messages are expected to be de-dup'd later. // This provider needs to ensure that response messages are not processed until the history a few @@ -1234,6 +1231,8 @@ await this.CommitOutboundQueueMessages( try { await this.trackingStore.UpdateStateAsync(runtimeState, workItem.OrchestrationRuntimeState, instanceId, executionId, session.ETags, session.TrackingStoreContext); + // update the runtime state and execution id stored in the session + session.UpdateRuntimeState(runtimeState); // if we deferred some messages, and the execution id of this instance has changed, redeliver them if (session.DeferredMessages.Count > 0 diff --git a/src/DurableTask.AzureStorage/Messaging/OrchestrationSession.cs b/src/DurableTask.AzureStorage/Messaging/OrchestrationSession.cs index 1b2e4a20e..5f1c0c1e6 100644 --- a/src/DurableTask.AzureStorage/Messaging/OrchestrationSession.cs +++ b/src/DurableTask.AzureStorage/Messaging/OrchestrationSession.cs @@ -150,37 +150,29 @@ public bool IsOutOfOrderMessage(MessageData message) return false; } - if (this.IsNonexistantInstance() && message.OriginalQueueMessage.DequeueCount > 5) - { - // The first five times a message for a nonexistant instance is dequeued, give the message the benefit - // of the doubt and assume that the instance hasn't had its history table populated yet. After the - // fifth execution, ~30 seconds have passed and the most likely scenario is that this is a zombie event. - // This means the history table for the message's orchestration no longer exists, either due to an explicit - // PurgeHistory request or due to a ContinueAsNew call cleaning the old execution's history. - return false; - } - - if (this.LastCheckpointTime > message.TaskMessage.Event.Timestamp) - { - // LastCheckpointTime represents the time at which the most recent history checkpoint completed. - // The checkpoint is written to the history table only *after* all queue messages are sent. - // A message is out of order when its timestamp *preceeds* the most recent checkpoint timestamp. - // In this case, we see that the checkpoint came *after* the message, so there is no out-of-order - // concern. Note that this logic only applies for messages sent by orchestrations to themselves. - // The next check considers the other cases (activities, sub-orchestrations, etc.). - // Orchestration checkpoint time information was added only after v1.6.4. - return false; - } - + // The first five times a message for a nonexistant instance is dequeued, give the message the benefit + // of the doubt and assume that the instance hasn't had its history table populated yet. After the + // fifth execution, ~30 seconds have passed and the most likely scenario is that this is a zombie event. + // This means the history table for the message's orchestration no longer exists, either due to an explicit + // PurgeHistory request or due to a ContinueAsNew call cleaning the old execution's history. + bool nonExistentInstance = this.IsNonexistantInstance() && message.OriginalQueueMessage.DequeueCount <= 5; + + // LastCheckpointTime represents the time at which the most recent history checkpoint completed. + // The checkpoint is written to the history table only *after* all queue messages are sent. + // A message is out of order when its timestamp *preceeds* the most recent checkpoint timestamp. + // In this case, we see that the checkpoint came *after* the message, so there is no out-of-order + // concern. Note that this logic only applies for messages sent by orchestrations to themselves. + // The next check considers the other cases (activities, sub-orchestrations, etc.). + // Orchestration checkpoint time information was added only after v1.6.4. + bool isStaleCheckpoint = this.LastCheckpointTime <= message.TaskMessage.Event.Timestamp; + + bool triggeringTaskDoesNotExist = true; if (Utils.TryGetTaskScheduledId(message.TaskMessage.Event, out int taskScheduledId)) { // This message is a response to a task. Search the history to make sure that we've recorded the fact that // this task was scheduled. HistoryEvent mostRecentTaskEvent = this.RuntimeState.Events.LastOrDefault(e => e.EventId == taskScheduledId); - if (mostRecentTaskEvent != null) - { - return false; - } + triggeringTaskDoesNotExist = mostRecentTaskEvent == null; } if (message.TaskMessage.Event.EventType == EventType.EventRaised) @@ -190,15 +182,12 @@ public bool IsOutOfOrderMessage(MessageData message) if (requestId != null) { HistoryEvent mostRecentTaskEvent = this.RuntimeState.Events.FirstOrDefault(e => e.EventType == EventType.EventSent && FindRequestId(((EventSentEvent)e).Input)?.ToString() == requestId); - if (mostRecentTaskEvent != null) - { - return false; - } + triggeringTaskDoesNotExist = mostRecentTaskEvent == null; } } // The message is out of order and cannot be handled by the current session. - return true; + return nonExistentInstance || isStaleCheckpoint || triggeringTaskDoesNotExist; } Guid? FindRequestId(string input) From 3909fe7e7d6e149407c2c517bb8fb18efdb1ad4e Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Tue, 24 Feb 2026 10:24:25 -0800 Subject: [PATCH 3/3] fixed the endlessly abandoning nonexistent instances bug --- .../Messaging/OrchestrationSession.cs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/DurableTask.AzureStorage/Messaging/OrchestrationSession.cs b/src/DurableTask.AzureStorage/Messaging/OrchestrationSession.cs index 5f1c0c1e6..3472b7e0e 100644 --- a/src/DurableTask.AzureStorage/Messaging/OrchestrationSession.cs +++ b/src/DurableTask.AzureStorage/Messaging/OrchestrationSession.cs @@ -157,6 +157,14 @@ public bool IsOutOfOrderMessage(MessageData message) // PurgeHistory request or due to a ContinueAsNew call cleaning the old execution's history. bool nonExistentInstance = this.IsNonexistantInstance() && message.OriginalQueueMessage.DequeueCount <= 5; + // If the instance does not exist, even if the message has ben dequeued > 5 times, the next check for trying + // to find the corresponding task scheduled message will always fail so we will endlessly abandon the message. + // To avoid this we return early here. + if (this.IsNonexistantInstance() && message.OriginalQueueMessage.DequeueCount > 5) + { + return false; + } + // LastCheckpointTime represents the time at which the most recent history checkpoint completed. // The checkpoint is written to the history table only *after* all queue messages are sent. // A message is out of order when its timestamp *preceeds* the most recent checkpoint timestamp.