Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
namespace Amazon.Lambda.DurableExecution;

/// <summary>
/// Determines whether a failed step should be retried and with what delay.
/// </summary>
public interface IRetryStrategy
{
/// <summary>
/// Evaluates whether the given exception warrants a retry.
/// </summary>
/// <param name="exception">The exception that caused the step to fail.</param>
/// <param name="attemptNumber">The 1-based attempt number that just failed.</param>
/// <returns>A decision indicating whether to retry and the delay before the next attempt.</returns>
RetryDecision ShouldRetry(Exception exception, int attemptNumber);
}

/// <summary>
/// The outcome of a retry evaluation.
/// </summary>
public readonly struct RetryDecision
{
/// <summary>Whether the step should be retried.</summary>
public bool ShouldRetry { get; }

/// <summary>The delay before the next retry attempt.</summary>
public TimeSpan Delay { get; }

private RetryDecision(bool shouldRetry, TimeSpan delay)
{
ShouldRetry = shouldRetry;
Delay = delay;
}

/// <summary>Indicates the step should not be retried.</summary>
public static RetryDecision DoNotRetry() => new(false, TimeSpan.Zero);

/// <summary>Indicates the step should be retried after the specified delay.</summary>
public static RetryDecision RetryAfter(TimeSpan delay) => new(true, delay);
}
185 changes: 185 additions & 0 deletions Libraries/src/Amazon.Lambda.DurableExecution/Config/RetryStrategy.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
using System.Text.RegularExpressions;

namespace Amazon.Lambda.DurableExecution;

/// <summary>
/// Jitter strategy for exponential backoff to prevent thundering-herd scenarios.
/// </summary>
public enum JitterStrategy
{
/// <summary>No randomization — delay is exactly the calculated backoff value.</summary>
None,
/// <summary>Random delay between 0 and the calculated backoff value (recommended).</summary>
Full,
/// <summary>Random delay between 50% and 100% of the calculated backoff value.</summary>
Half
}

/// <summary>
/// Controls whether a step re-executes if the Lambda is re-invoked mid-attempt.
/// </summary>
public enum StepSemantics
{
/// <summary>
/// Default. The step may re-execute if the Lambda is re-invoked during execution.
/// Use for idempotent operations.
/// </summary>
AtLeastOncePerRetry,

/// <summary>
/// The step executes at most once per retry attempt. A START checkpoint is written
/// before execution; on replay with an existing START, the SDK skips re-execution
/// and proceeds to the retry handler.
/// </summary>
AtMostOncePerRetry
}

/// <summary>
/// Factory methods for common retry strategies.
/// </summary>
public static class RetryStrategy
{
/// <summary>6 attempts, 2x backoff, 5s initial delay, 60s max, Full jitter.</summary>
public static IRetryStrategy Default { get; } = Exponential(
maxAttempts: 6,
initialDelay: TimeSpan.FromSeconds(5),
maxDelay: TimeSpan.FromSeconds(60),
backoffRate: 2.0,
jitter: JitterStrategy.Full);

/// <summary>3 attempts, 2x backoff, 1s initial delay, 5s max, Half jitter.</summary>
public static IRetryStrategy Transient { get; } = Exponential(
maxAttempts: 3,
initialDelay: TimeSpan.FromSeconds(1),
maxDelay: TimeSpan.FromSeconds(5),
backoffRate: 2.0,
jitter: JitterStrategy.Half);

/// <summary>No retry — 1 attempt only.</summary>
public static IRetryStrategy None { get; } = Exponential(maxAttempts: 1);

/// <summary>
/// Creates an exponential backoff retry strategy.
/// </summary>
public static IRetryStrategy Exponential(
int maxAttempts = 3,
TimeSpan? initialDelay = null,
TimeSpan? maxDelay = null,
double backoffRate = 2.0,
JitterStrategy jitter = JitterStrategy.Full,
Type[]? retryableExceptions = null,
string[]? retryableMessagePatterns = null)
{
return new ExponentialRetryStrategy(
maxAttempts,
initialDelay ?? TimeSpan.FromSeconds(5),
maxDelay ?? TimeSpan.FromSeconds(300),
backoffRate,
jitter,
retryableExceptions,
retryableMessagePatterns);
}

/// <summary>
/// Creates a retry strategy from a delegate.
/// </summary>
public static IRetryStrategy FromDelegate(Func<Exception, int, RetryDecision> strategy)
=> new DelegateRetryStrategy(strategy);
}

internal sealed class ExponentialRetryStrategy : IRetryStrategy
{
private readonly int _maxAttempts;
private readonly TimeSpan _initialDelay;
private readonly TimeSpan _maxDelay;
private readonly double _backoffRate;
private readonly JitterStrategy _jitter;
private readonly Type[]? _retryableExceptions;
private readonly Regex[]? _retryableMessagePatterns;

[ThreadStatic]
private static Random? t_random;
private static Random Random => t_random ??= new Random();

public ExponentialRetryStrategy(
int maxAttempts,
TimeSpan initialDelay,
TimeSpan maxDelay,
double backoffRate,
JitterStrategy jitter,
Type[]? retryableExceptions,
string[]? retryableMessagePatterns)
{
_maxAttempts = maxAttempts;
_initialDelay = initialDelay;
_maxDelay = maxDelay;
_backoffRate = backoffRate;
_jitter = jitter;
_retryableExceptions = retryableExceptions;
_retryableMessagePatterns = retryableMessagePatterns?
.Select(p => new Regex(p, RegexOptions.Compiled))
.ToArray();
}

public RetryDecision ShouldRetry(Exception exception, int attemptNumber)
{
if (attemptNumber >= _maxAttempts)
return RetryDecision.DoNotRetry();

if (!IsRetryable(exception))
return RetryDecision.DoNotRetry();

var delay = CalculateDelay(attemptNumber);
return RetryDecision.RetryAfter(delay);
}

private bool IsRetryable(Exception exception)
{
if (_retryableExceptions == null && _retryableMessagePatterns == null)
return true;

if (_retryableExceptions != null)
{
var exType = exception.GetType();
if (_retryableExceptions.Any(t => t.IsAssignableFrom(exType)))
return true;
}

if (_retryableMessagePatterns != null)
{
var message = exception.Message;
if (_retryableMessagePatterns.Any(p => p.IsMatch(message)))
return true;
}

return false;
}

internal TimeSpan CalculateDelay(int attemptNumber)
{
var baseDelay = _initialDelay.TotalSeconds * Math.Pow(_backoffRate, attemptNumber - 1);
var cappedDelay = Math.Min(baseDelay, _maxDelay.TotalSeconds);

var finalDelay = _jitter switch
{
JitterStrategy.Full => Random.NextDouble() * cappedDelay,
JitterStrategy.Half => cappedDelay * (0.5 + 0.5 * Random.NextDouble()),
_ => cappedDelay
};

return TimeSpan.FromSeconds(Math.Max(1, Math.Ceiling(finalDelay)));
}
}

internal sealed class DelegateRetryStrategy : IRetryStrategy
{
private readonly Func<Exception, int, RetryDecision> _strategy;

public DelegateRetryStrategy(Func<Exception, int, RetryDecision> strategy)
{
_strategy = strategy;
}

public RetryDecision ShouldRetry(Exception exception, int attemptNumber)
=> _strategy(exception, attemptNumber);
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,14 @@ namespace Amazon.Lambda.DurableExecution;
/// </summary>
public sealed class StepConfig
{
// TODO: Retry support is deferred to a follow-up PR. When added, this is
// where RetryStrategy and Semantics (AtLeastOncePerRetry / AtMostOncePerRetry)
// will live. The follow-up needs to use service-mediated retries (checkpoint
// a RETRY operation + suspend the Lambda) rather than an in-process Task.Delay
// loop, to avoid billing Lambda compute time during retry backoff.
/// <summary>
/// Retry strategy for failed steps. When null (default), failures are not retried.
/// </summary>
public IRetryStrategy? RetryStrategy { get; set; }

/// <summary>
/// Controls whether a step may re-execute if the Lambda is re-invoked mid-attempt.
/// Default is <see cref="StepSemantics.AtLeastOncePerRetry"/>.
/// </summary>
public StepSemantics Semantics { get; set; } = StepSemantics.AtLeastOncePerRetry;
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@ namespace Amazon.Lambda.DurableExecution.Internal;
/// call awaits the flush of its containing batch (sync semantics).
/// </summary>
/// <remarks>
/// TODO: when Map / Parallel / ChildContext / WaitForCondition land — or when
/// AtLeastOncePerRetry step START gets a non-blocking variant — they will need
/// a fire-and-forget overload like
/// <c>Task EnqueueAsync(SdkOperationUpdate update, bool sync)</c> where
/// <c>sync=false</c> returns as soon as the item is queued. Java's
/// <c>sendOperationUpdate</c> vs <c>sendOperationUpdateAsync</c> is the model.
/// Today every call site is sync, so the API stays minimal.
/// Fire-and-forget semantics are achieved by simply not awaiting the returned
/// Task — matching Java/Python/JS SDKs which use the same one-method pattern.
/// Errors still surface deterministically via <c>_terminalError</c>: the next
/// sync <see cref="EnqueueAsync"/> or <see cref="DrainAsync"/> rethrows.
/// Callers using fire-and-forget should observe the discarded Task's exception
/// (see <c>StepOperation.FireAndForget</c>) so it doesn't trip the runtime's
/// <c>UnobservedTaskException</c> event.
/// </remarks>
internal sealed class CheckpointBatcher : IAsyncDisposable
{
Expand Down
Loading
Loading