-
Notifications
You must be signed in to change notification settings - Fork 498
Add durable execution Step + Wait end-to-end #2360
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: feature/durablefunction
Are you sure you want to change the base?
Changes from all commits
9e5113d
5cdab8d
4961809
f177507
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,13 @@ | ||
| namespace Amazon.Lambda.DurableExecution; | ||
|
|
||
| /// <summary> | ||
| /// Configuration for step execution. | ||
| /// </summary> | ||
| /// <remarks> | ||
| /// The type currently declares no members. It is a placeholder for the retry | ||
| /// options described in the TODO inside the class body. | ||
| /// </remarks> | ||
| public sealed class StepConfig | ||
| { | ||
| // TODO: Retry support is deferred to a follow-up PR. When added, this is | ||
| // where RetryStrategy and Semantics (AtLeastOncePerRetry / AtMostOncePerRetry) | ||
| // will live. The follow-up needs to use service-mediated retries (checkpoint | ||
| // a RETRY operation + suspend the Lambda) rather than an in-process Task.Delay | ||
| // loop, to avoid billing Lambda compute time during retry backoff. | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,147 @@ | ||
| using System.Diagnostics.CodeAnalysis; | ||
| using Amazon.Lambda.Core; | ||
| using Amazon.Lambda.DurableExecution.Internal; | ||
| using Microsoft.Extensions.Logging; | ||
| using Microsoft.Extensions.Logging.Abstractions; | ||
|
|
||
| namespace Amazon.Lambda.DurableExecution; | ||
|
|
||
| /// <summary> | ||
| /// Implementation of <see cref="IDurableContext"/>. Constructs and dispatches | ||
| /// per-operation classes (<see cref="StepOperation{T}"/>, <see cref="WaitOperation"/>); | ||
| /// the replay logic lives in those classes. | ||
| /// </summary> | ||
| internal sealed class DurableContext : IDurableContext | ||
| { | ||
| // Dependencies captured once at construction time. | ||
| private readonly CheckpointBatcher? _batcher; | ||
| private readonly string _durableExecutionArn; | ||
| private readonly OperationIdGenerator _idGenerator; | ||
| private readonly ExecutionState _state; | ||
| private readonly TerminationManager _terminationManager; | ||
|
|
||
| /// <summary> | ||
| /// Creates a context that stores the supplied collaborators for later use by | ||
| /// the step and wait entry points. | ||
| /// </summary> | ||
| /// <param name="state">Execution state handed to each operation.</param> | ||
| /// <param name="terminationManager">Termination manager handed to each operation.</param> | ||
| /// <param name="idGenerator">Source of operation ids.</param> | ||
| /// <param name="durableExecutionArn">ARN exposed through <see cref="ExecutionContext"/>.</param> | ||
| /// <param name="lambdaContext">Value exposed through <see cref="LambdaContext"/>.</param> | ||
| /// <param name="batcher">Optional checkpoint batcher; may be <c>null</c>.</param> | ||
| public DurableContext( | ||
| ExecutionState state, | ||
| TerminationManager terminationManager, | ||
| OperationIdGenerator idGenerator, | ||
| string durableExecutionArn, | ||
| ILambdaContext lambdaContext, | ||
| CheckpointBatcher? batcher = null) | ||
| { | ||
| LambdaContext = lambdaContext; | ||
| _durableExecutionArn = durableExecutionArn; | ||
| _idGenerator = idGenerator; | ||
| _batcher = batcher; | ||
| _terminationManager = terminationManager; | ||
| _state = state; | ||
| } | ||
|
|
||
| // Placeholder until the replay-safe logger lands in a follow-up PR (see the | ||
| // IDurableContext.Logger doc); every read yields the shared no-op logger. | ||
| public ILogger Logger { get; } = NullLogger.Instance; | ||
|
GarrettBeatty marked this conversation as resolved.
|
||
| // Backing field for ExecutionContext. Filled on first access and reused after | ||
| // that, so repeated reads return the same instance instead of allocating a new | ||
| // wrapper each time. If two threads race on the first read, each may build its | ||
| // own wrapper; both carry the same ARN, so the race is harmless. | ||
| private IExecutionContext? _executionContext; | ||
|
|
||
| /// <summary> | ||
| /// Execution metadata built from the durable execution ARN given to the constructor. | ||
| /// </summary> | ||
| public IExecutionContext ExecutionContext => | ||
| _executionContext ??= new DurableExecutionContext(_durableExecutionArn); | ||
|
|
||
| /// <summary> | ||
| /// The Lambda context instance given to the constructor. | ||
| /// </summary> | ||
| public ILambdaContext LambdaContext { get; } | ||
|
GarrettBeatty marked this conversation as resolved.
|
||
|
|
||
| /// <summary> | ||
| /// Runs a step using a <see cref="ReflectionJsonCheckpointSerializer{T}"/> as the | ||
| /// checkpoint serializer for <typeparamref name="T"/>. | ||
| /// </summary> | ||
| [RequiresUnreferencedCode("Reflection-based JSON for T. Use the ICheckpointSerializer<T> overload for AOT/trimmed deployments.")] | ||
| [RequiresDynamicCode("Reflection-based JSON for T. Use the ICheckpointSerializer<T> overload for AOT/trimmed deployments.")] | ||
| public Task<T> StepAsync<T>( | ||
| Func<IStepContext, Task<T>> func, | ||
| string? name = null, | ||
| StepConfig? config = null, | ||
| CancellationToken cancellationToken = default) | ||
| { | ||
| var reflectionSerializer = new ReflectionJsonCheckpointSerializer<T>(); | ||
| return RunStep(func, reflectionSerializer, name, config, cancellationToken); | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Runs a step that produces no value. | ||
| /// </summary> | ||
| public async Task StepAsync( | ||
| Func<IStepContext, Task> func, | ||
| string? name = null, | ||
| StepConfig? config = null, | ||
| CancellationToken cancellationToken = default) | ||
| { | ||
| // A void step has no meaningful payload, so the user delegate is adapted to | ||
| // return null and paired with the null-only serializer, which does not touch | ||
| // reflection. | ||
| async Task<object?> RunAndDiscardAsync(IStepContext stepContext) | ||
| { | ||
| await func(stepContext); | ||
| return null; | ||
| } | ||
|
|
||
| await RunStep<object?>( | ||
| RunAndDiscardAsync, | ||
| NullCheckpointSerializer.Instance, | ||
| name, | ||
| config, | ||
| cancellationToken); | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Runs a step using the caller-supplied checkpoint serializer. | ||
| /// </summary> | ||
| public Task<T> StepAsync<T>( | ||
| Func<IStepContext, Task<T>> func, | ||
| ICheckpointSerializer<T> serializer, | ||
| string? name = null, | ||
| StepConfig? config = null, | ||
| CancellationToken cancellationToken = default) | ||
| { | ||
| return RunStep(func, serializer, name, config, cancellationToken); | ||
| } | ||
|
|
||
|
|
||
| /// <summary> | ||
| /// Shared implementation behind the <c>StepAsync</c> overloads: draws the next | ||
| /// operation id, builds a <see cref="StepOperation{T}"/>, and returns the task | ||
| /// from its <c>ExecuteAsync</c>. | ||
| /// </summary> | ||
| private Task<T> RunStep<T>( | ||
| Func<IStepContext, Task<T>> func, | ||
| ICheckpointSerializer<T> serializer, | ||
| string? name, | ||
| StepConfig? config, | ||
| CancellationToken cancellationToken) | ||
| { | ||
| // The id is requested first, as the leading constructor argument. | ||
| var stepOperation = new StepOperation<T>( | ||
| _idGenerator.NextId(), | ||
| name, | ||
| func, | ||
| config, | ||
| serializer, | ||
| Logger, | ||
| _state, | ||
| _terminationManager, | ||
| _durableExecutionArn, | ||
| _batcher); | ||
|
|
||
| return stepOperation.ExecuteAsync(cancellationToken); | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Validates <paramref name="duration"/>, converts it to whole seconds, and | ||
| /// dispatches a <see cref="WaitOperation"/> for it. | ||
| /// </summary> | ||
| /// <param name="duration">Wait length; must be between 1 second and 31,622,400 seconds inclusive.</param> | ||
| /// <param name="name">Optional name passed through to the operation.</param> | ||
| /// <param name="cancellationToken">Checked once before the operation is created, then passed to <c>ExecuteAsync</c>.</param> | ||
| /// <returns>The task returned by <c>WaitOperation.ExecuteAsync</c>.</returns> | ||
| /// <exception cref="ArgumentOutOfRangeException"><paramref name="duration"/> is below 1 second or above 31,622,400 seconds.</exception> | ||
| /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> is already cancelled when the method is called.</exception> | ||
| /// <remarks> | ||
| /// The method is not <c>async</c>, so the exceptions above are thrown directly to | ||
| /// the caller rather than being stored in the returned task. | ||
| /// </remarks> | ||
| public Task WaitAsync( | ||
| TimeSpan duration, | ||
| string? name = null, | ||
| CancellationToken cancellationToken = default) | ||
| { | ||
| // Service timer granularity is 1 second; sub-second waits would round to 0. | ||
| // WaitOptions.WaitSeconds is integer in [1, 31_622_400] (1 second to ~1 year). | ||
| if (duration < TimeSpan.FromSeconds(1)) | ||
| throw new ArgumentOutOfRangeException(nameof(duration), duration, "Wait duration must be at least 1 second."); | ||
|
|
||
| if (duration > TimeSpan.FromSeconds(31_622_400)) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we be validating this on our end? |
||
| throw new ArgumentOutOfRangeException(nameof(duration), duration, "Wait duration must be at most 31,622,400 seconds (~1 year)."); | ||
|
|
||
| cancellationToken.ThrowIfCancellationRequested(); | ||
|
|
||
| // The id is drawn only after validation and the cancellation check, so a | ||
| // rejected call does not consume an operation id. | ||
| var operationId = _idGenerator.NextId(); | ||
| // Fractional seconds round up. The range check above already guarantees a | ||
| // value of at least 1, so Math.Max only acts as a defensive floor. | ||
| var waitSeconds = (int)Math.Max(1, Math.Ceiling(duration.TotalSeconds)); | ||
|
GarrettBeatty marked this conversation as resolved.
|
||
| var op = new WaitOperation( | ||
| operationId, name, waitSeconds, | ||
| _state, _terminationManager, _durableExecutionArn, _batcher); | ||
| return op.ExecuteAsync(cancellationToken); | ||
| } | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Serializer for the void <c>StepAsync</c> overloads, whose steps have no | ||
| /// payload worth persisting. It is trim-safe: serialization always yields the | ||
| /// literal <c>"null"</c>, and deserialization ignores its input and yields | ||
| /// <c>null</c>. | ||
| /// </summary> | ||
| internal sealed class NullCheckpointSerializer : ICheckpointSerializer<object?> | ||
| { | ||
| /// <summary>Shared instance; the type holds no state.</summary> | ||
| public static NullCheckpointSerializer Instance { get; } = new NullCheckpointSerializer(); | ||
|
|
||
| /// <summary>Ignores <paramref name="value"/> and returns the text <c>"null"</c>.</summary> | ||
| public string Serialize(object? value, SerializationContext context) | ||
| { | ||
| return "null"; | ||
| } | ||
|
|
||
| /// <summary>Ignores <paramref name="data"/> and returns <c>null</c>.</summary> | ||
| public object? Deserialize(string data, SerializationContext context) | ||
| { | ||
| return null; | ||
| } | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Immutable <see cref="IExecutionContext"/> that carries only the durable | ||
| /// execution ARN. | ||
| /// </summary> | ||
| internal sealed class DurableExecutionContext : IExecutionContext | ||
| { | ||
| /// <summary>The ARN supplied at construction.</summary> | ||
| public string DurableExecutionArn { get; } | ||
|
|
||
| /// <summary>Stores <paramref name="durableExecutionArn"/> as given.</summary> | ||
| public DurableExecutionContext(string durableExecutionArn) | ||
| => DurableExecutionArn = durableExecutionArn; | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Immutable <see cref="IStepContext"/> holding the operation id, attempt | ||
| /// number, and logger passed to its constructor. | ||
| /// </summary> | ||
| internal sealed class StepContext : IStepContext | ||
| { | ||
| /// <summary>Stores the three values unchanged.</summary> | ||
| public StepContext(string operationId, int attemptNumber, ILogger logger) | ||
| { | ||
| (OperationId, AttemptNumber, Logger) = (operationId, attemptNumber, logger); | ||
| } | ||
|
|
||
| /// <summary>Operation id supplied at construction.</summary> | ||
| public string OperationId { get; } | ||
| /// <summary>Attempt number supplied at construction.</summary> | ||
| public int AttemptNumber { get; } | ||
| /// <summary>Logger supplied at construction.</summary> | ||
| public ILogger Logger { get; } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,119 @@ | ||
| using Amazon.Lambda.DurableExecution.Internal; | ||
|
|
||
| namespace Amazon.Lambda.DurableExecution; | ||
|
|
||
| /// <summary> | ||
| /// The result of running a durable execution handler. | ||
| /// </summary> | ||
| /// <typeparam name="TResult">Type of the value produced by the handler.</typeparam> | ||
| internal sealed class HandlerResult<TResult> | ||
| { | ||
| /// <summary>Outcome of the invocation; every object initializer must set it.</summary> | ||
| public required InvocationStatus Status { get; init; } | ||
| /// <summary>Value returned by the handler; stays at its default unless set.</summary> | ||
| public TResult? Result { get; init; } | ||
| /// <summary>Optional message accompanying the status.</summary> | ||
| public string? Message { get; init; } | ||
| /// <summary>Optional exception associated with the result.</summary> | ||
| public Exception? Exception { get; init; } | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Core orchestration engine for durable execution. Races user code against | ||
| /// a termination signal using Task.WhenAny. When user code completes, returns | ||
| /// SUCCEEDED/FAILED. When termination wins (wait, callback, invoke), returns PENDING. | ||
| /// </summary> | ||
| internal static class DurableExecutionHandler | ||
| { | ||
| /// <summary> | ||
| /// Runs the user's workflow function within the durable execution engine. | ||
| /// </summary> | ||
| /// <remarks> | ||
| /// <para> | ||
| /// Suspension flow — example: <c>await ctx.WaitAsync(TimeSpan.FromSeconds(5))</c>: | ||
| /// </para> | ||
| /// <code> | ||
| /// user code DurableContext TerminationMgr RunAsync | ||
| /// ───────── ────────────── ────────────── ──────── | ||
| /// WaitAsync(5s) ─────► queue WAIT START | ||
| /// checkpoint | ||
| /// Terminate() ──────► TerminationTask | ||
| /// completes | ||
| /// ◄────── new TCS().Task | ||
| /// (never completes) | ||
| /// await blocks | ||
| /// forever WhenAny: | ||
| /// ── termination wins | ||
| /// ── userTask abandoned | ||
| /// ── return Pending | ||
| /// </code> | ||
| /// <para> | ||
| /// Key insight: <c>WaitAsync</c> never returns a completed Task — it hands back | ||
| /// a TaskCompletionSource that is never resolved. The user's <c>await</c> blocks | ||
| /// indefinitely. The escape signal is <c>terminationManager.Terminate()</c>, | ||
| /// which <c>Task.WhenAny</c> picks up. We return Pending; the dangling user | ||
| /// Task is GC'd. The service flushes checkpoints, fires the wait timer, then | ||
| /// re-invokes Lambda — on replay, <c>WaitAsync</c> sees the matching SUCCEED | ||
| /// checkpoint and returns <c>Task.CompletedTask</c> normally. | ||
| /// </para> | ||
| /// <para> | ||
| /// The same pattern applies to retries (<c>RetryScheduled</c>), callbacks | ||
| /// (<c>CallbackPending</c>), and chained invokes (<c>InvokePending</c>). | ||
| /// </para> | ||
| /// </remarks> | ||
| /// <typeparam name="TResult">The workflow return type.</typeparam> | ||
| /// <param name="executionState">Hydrated execution state from prior invocations.</param> | ||
| /// <param name="terminationManager">Manages the suspension signal.</param> | ||
| /// <param name="userHandler">The user's workflow function receiving a DurableContext.</param> | ||
| /// <returns>The handler result indicating SUCCEEDED, FAILED, or PENDING.</returns> | ||
| internal static async Task<HandlerResult<TResult>> RunAsync<TResult>( | ||
| ExecutionState executionState, | ||
| TerminationManager terminationManager, | ||
| Func<Task<TResult>> userHandler) | ||
| { | ||
| // Run user code on a threadpool thread so it executes independently of | ||
| // the termination signal. When TerminationManager fires (e.g., WaitAsync), | ||
| // we need the WhenAny race below to resolve immediately without waiting | ||
| // for the user task to reach an await point. | ||
| var userTask = Task.Run(userHandler); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the reason for this is imagine the user had If we called userHandler() directly instead of Task.Run(userHandler): var userTask = userHandler(); // ← starts running RIGHT HERE, synchronously The userHandler() invocation runs synchronously up to the first real await. If the user sleeps, blocks on sync I/O, or does any non-yielding work first, we don't even reach the await Task.WhenAny(...) line yet. The wrapper is stuck inside the user's call. |
||
|
|
||
| // Race: user code completing vs. termination signal (wait/callback/retry). | ||
| // If termination wins, we return PENDING and the abandoned userTask is never awaited. | ||
| var winner = await Task.WhenAny(userTask, terminationManager.TerminationTask); | ||
|
|
||
| if (winner == terminationManager.TerminationTask) | ||
| { | ||
| // The user task is abandoned from here on and nothing will await it. If it | ||
| // faults later, reading its Exception in a fault-only continuation marks | ||
| // the fault as observed; otherwise it would be reported through | ||
| // TaskScheduler.UnobservedTaskException when the task is finalized. | ||
| _ = userTask.ContinueWith( | ||
| static abandoned => { _ = abandoned.Exception; }, | ||
| CancellationToken.None, | ||
| TaskContinuationOptions.OnlyOnFaulted, | ||
| TaskScheduler.Default); | ||
|
|
||
| // Already completed, so this await only unwraps the termination result. | ||
| var terminationResult = await terminationManager.TerminationTask; | ||
|
|
||
| // A termination that carries an exception is reported as a failure. | ||
| if (terminationResult.Exception is not null) | ||
| { | ||
| return new HandlerResult<TResult> | ||
| { | ||
| Status = InvocationStatus.Failed, | ||
| Message = terminationResult.Exception.Message, | ||
| Exception = terminationResult.Exception | ||
| }; | ||
| } | ||
|
|
||
| // Otherwise the invocation is suspended and reported as pending. | ||
| return new HandlerResult<TResult> | ||
| { | ||
| Status = InvocationStatus.Pending, | ||
| Message = terminationResult.Message | ||
| }; | ||
| } | ||
|
|
||
| // The user task won the race: awaiting it yields its result or rethrows its | ||
| // exception, which is converted into a Failed result below. | ||
| try | ||
| { | ||
| var result = await userTask; | ||
| return new HandlerResult<TResult> | ||
| { | ||
| Status = InvocationStatus.Succeeded, | ||
| Result = result | ||
| }; | ||
| } | ||
| catch (Exception ex) | ||
| { | ||
| return new HandlerResult<TResult> | ||
| { | ||
| Status = InvocationStatus.Failed, | ||
| Message = ex.Message, | ||
| Exception = ex | ||
| }; | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would rather not follow the pattern of putting classes in folders that don't match their namespace. Eventually somebody will add a file to the folder and the IDE will default its namespace to match the folder.
For this file I'm indifferent as to whether you move it to the parent folder or update the using statement, especially since the file isn't really defined yet, so I'm not sure how commonly you expect users to use it.