From c73c9c06e96092c08028053db89bc644171f330a Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Thu, 12 Mar 2026 13:11:08 -0400 Subject: [PATCH 01/42] quick edit to readme (#37) --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9e496b5..97f06f4 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ A comprehensive skill for building Temporal applications. ### As a Claude Code Plugin -1. Run `/plugin marketplace add temporalio/agent-skills` +1. Run `/plugin marketplace add temporalio/agent-skills#dev` 2. Run `/plugin` to open the plugin manager 3. Select **Marketplaces** 4. Choose `temporal-marketplace` from the list @@ -16,11 +16,11 @@ A comprehensive skill for building Temporal applications. ### Via `npx skills` - supports all major coding agents -1. `npx skills add temporalio/skill-temporal-developer` +1. `npx skills add https://github.com/temporalio/skill-temporal-developer/tree/dev` 2. Follow prompts ### Via manually cloning the skill repo: 1. `mkdir -p ~/.claude/skills && git clone https://github.com/temporalio/skill-temporal-developer ~/.claude/skills/temporal-developer` -Appropriately adjust the installation directory based on your coding agent. \ No newline at end of file +Appropriately adjust the installation directory based on your coding agent. From c013b87f8704b7cff4fe9571114c048e35a783a4 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Tue, 17 Mar 2026 18:41:25 -0400 Subject: [PATCH 02/42] Fix saga compensations to run under cancellation protection (#43) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a workflow is cancelled mid-saga, compensations must run in a cancellation-protected scope, otherwise they are immediately cancelled before they can execute. 
- Python: wrap compensation loop in asyncio.shield() so it runs even when the workflow receives a CancelledError - TypeScript: wrap compensation loop in CancellationScope.nonCancellable() so it runs even when the root scope is cancelled (per official docs: "Cleanup logic must be in a nonCancellable scope") - TypeScript: also fix compensation registration order — register BEFORE calling the activity (was already correct in Python) Co-authored-by: Claude Sonnet 4.6 (1M context) --- references/python/patterns.md | 13 ++++++++----- references/typescript/patterns.md | 21 ++++++++++++--------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/references/python/patterns.md b/references/python/patterns.md index 762977b..00cd12a 100644 --- a/references/python/patterns.md +++ b/references/python/patterns.md @@ -243,11 +243,14 @@ class MyWorkflow: except Exception as e: workflow.logger.error(f"Order failed: {e}, running compensations") - for compensate in reversed(compensations): - try: - await compensate() - except Exception as comp_err: - workflow.logger.error(f"Compensation failed: {comp_err}") + # asyncio.shield ensures compensations run even if the workflow is cancelled. + async def run_compensations(): + for compensate in reversed(compensations): + try: + await compensate() + except Exception as comp_err: + workflow.logger.error(f"Compensation failed: {comp_err}") + await asyncio.shield(asyncio.ensure_future(run_compensations())) raise ``` diff --git a/references/typescript/patterns.md b/references/typescript/patterns.md index 878f9f0..4b07947 100644 --- a/references/typescript/patterns.md +++ b/references/typescript/patterns.md @@ -224,7 +224,7 @@ export async function longRunningWorkflow(state: State): Promise { **Important:** Compensation activities should be idempotent. 
```typescript -import { log } from '@temporalio/workflow'; +import { CancellationScope, log } from '@temporalio/workflow'; export async function sagaWorkflow(order: Order): Promise<string> { const compensations: Array<() => Promise<void>> = []; @@ -233,22 +233,25 @@ export async function sagaWorkflow(order: Order): Promise<string> { // IMPORTANT: Save compensation BEFORE calling the activity // If activity fails after completing but before returning, // compensation must still be registered - await reserveInventory(order); compensations.push(() => releaseInventory(order)); + await reserveInventory(order); - await chargePayment(order); compensations.push(() => refundPayment(order)); + await chargePayment(order); await shipOrder(order); return 'Order completed'; } catch (err) { - for (const compensate of compensations.reverse()) { - try { - await compensate(); - } catch (compErr) { - log.warn('Compensation failed', { error: compErr }); + // nonCancellable ensures compensations run even if the workflow is cancelled + await CancellationScope.nonCancellable(async () => { + for (const compensate of compensations.reverse()) { + try { + await compensate(); + } catch (compErr) { + log.warn('Compensation failed', { error: compErr }); + } } - } + }); throw err; } } From 291f4e5aaa22567eaf3508196cd0b8c667b7a87f Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Wed, 18 Mar 2026 13:54:09 -0400 Subject: [PATCH 03/42] Update readme for public preview (#45) --- README.md | 20 +++++++++++++++++--- SKILL.md | 2 +- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 97f06f4..d27a3b4 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,16 @@ # Temporal Development Skill -A comprehensive skill for building Temporal applications. +A comprehensive skill for developers to use when building Temporal applications. + +> [!WARNING] +> This Skill is currently in Public Preview, and will continue to evolve and improve.
+> We would love to hear your feedback - positive or negative - over in the [Community Slack](https://t.mp/slack), in the [#topic-ai channel](https://temporalio.slack.com/archives/C0818FQPYKY) ## Installation ### As a Claude Code Plugin -1. Run `/plugin marketplace add temporalio/agent-skills#dev` +1. Run `/plugin marketplace add temporalio/agent-skills` 2. Run `/plugin` to open the plugin manager 3. Select **Marketplaces** 4. Choose `temporal-marketplace` from the list @@ -16,7 +20,7 @@ A comprehensive skill for building Temporal applications. ### Via `npx skills` - supports all major coding agents -1. `npx skills add https://github.com/temporalio/skill-temporal-developer/tree/dev` +1. `npx skills add https://github.com/temporalio/skill-temporal-developer` 2. Follow prompts ### Via manually cloning the skill repo: @@ -24,3 +28,13 @@ A comprehensive skill for building Temporal applications. 1. `mkdir -p ~/.claude/skills && git clone https://github.com/temporalio/skill-temporal-developer ~/.claude/skills/temporal-developer` Appropriately adjust the installation directory based on your coding agent. + +## Currently Supported Temporal SDK Languages + +- [x] Python ✅ +- [x] TypeScript ✅ +- [x] Go ✅ +- [ ] Java 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/42)) +- [ ] .NET 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/39)) +- [ ] Ruby 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/41)) +- [ ] PHP 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/40)) diff --git a/SKILL.md b/SKILL.md index 6c36f07..c2775a2 100644 --- a/SKILL.md +++ b/SKILL.md @@ -122,7 +122,7 @@ Once you've downloaded the file, extract the downloaded archive and add the temp When you have **first** loaded this skill, output this message to the user: -"Thank you for trying out the prerelease of Temporal's development skill!
We would love to hear your feedback - positive or negative - over in the [Community Slack](https://t.mp/slack), in the [#topic-ai channel](https://temporalio.slack.com/archives/C0818FQPYKY)." +"Thank you for trying out the public preview of the Temporal development skill! We would love to hear your feedback - positive or negative - over in the [Community Slack](https://t.mp/slack), in the [#topic-ai channel](https://temporalio.slack.com/archives/C0818FQPYKY)." Do not output this message multiple times in the same conversation. From 21d1d41dcc9085042128c55504b27b65c10f5174 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Wed, 18 Mar 2026 15:30:59 -0400 Subject: [PATCH 04/42] a few more readme tweaks (#46) --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d27a3b4..6ba88db 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Temporal Development Skill -A comprehensive skill for developers to use when building Temporal applications. +A comprehensive skill for developers to use when building [Temporal](https://temporal.io/) applications. > [!WARNING] > This Skill is currently in Public Preview, and will continue to evolve and improve. @@ -10,6 +10,8 @@ A comprehensive skill for developers to use when building Temporal applications. ### As a Claude Code Plugin +This skill is housed within a [Claude Code plugin](https://github.com/temporalio/agent-skills), which provides a simple way to install and receive future updates to the skill. + 1. Run `/plugin marketplace add temporalio/agent-skills` 2. Run `/plugin` to open the plugin manager 3. 
Select **Marketplaces** From 6cd40f2291647b7e88d60a831a772f8a3bf353ed Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Wed, 18 Mar 2026 16:00:30 -0400 Subject: [PATCH 05/42] Add MIT License to the project (#47) --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7092ef5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Temporal Technologies Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From e16c9b808aefd8259c7e9cfc62088cf4bd0bb645 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Wed, 18 Mar 2026 21:18:45 -0400 Subject: [PATCH 06/42] Add Go (supersedes other PR) (#38) * progress on go * Go translation workflow completed. 
* missed a few spots * Manual edits * Address feedback * Add gotcha about anonymous local activities * Sample code for payload converter * clarify sdk protection mechanisms --- SKILL.md | 5 +- references/core/determinism.md | 4 +- references/core/patterns.md | 2 + references/go/advanced-features.md | 187 +++++++++ references/go/data-handling.md | 262 ++++++++++++ references/go/determinism-protection.md | 98 +++++ references/go/determinism.md | 52 +++ references/go/error-handling.md | 184 ++++++++ references/go/go.md | 242 +++++++++++ references/go/gotchas.md | 290 +++++++++++++ references/go/observability.md | 153 +++++++ references/go/patterns.md | 536 ++++++++++++++++++++++++ references/go/testing.md | 238 +++++++++++ references/go/versioning.md | 232 ++++++++++ references/python/patterns.md | 2 + references/typescript/patterns.md | 2 + 16 files changed, 2486 insertions(+), 3 deletions(-) create mode 100644 references/go/advanced-features.md create mode 100644 references/go/data-handling.md create mode 100644 references/go/determinism-protection.md create mode 100644 references/go/determinism.md create mode 100644 references/go/error-handling.md create mode 100644 references/go/go.md create mode 100644 references/go/gotchas.md create mode 100644 references/go/observability.md create mode 100644 references/go/patterns.md create mode 100644 references/go/testing.md create mode 100644 references/go/versioning.md diff --git a/SKILL.md b/SKILL.md index c2775a2..6d9c888 100644 --- a/SKILL.md +++ b/SKILL.md @@ -1,6 +1,6 @@ --- name: temporal-developer -description: This skill should be used when the user asks to "create a Temporal workflow", "write a Temporal activity", "debug stuck workflow", "fix non-determinism error", "Temporal Python", "Temporal TypeScript", "workflow replay", "activity timeout", "signal workflow", "query workflow", "worker not starting", "activity keeps retrying", "Temporal heartbeat", "continue-as-new", "child workflow", "saga pattern", "workflow 
versioning", "durable execution", "reliable distributed systems", or mentions Temporal SDK development. +description: This skill should be used when the user asks to "create a Temporal workflow", "write a Temporal activity", "debug stuck workflow", "fix non-determinism error", "Temporal Python", "Temporal TypeScript", "Temporal Go", "Temporal Golang", "workflow replay", "activity timeout", "signal workflow", "query workflow", "worker not starting", "activity keeps retrying", "Temporal heartbeat", "continue-as-new", "child workflow", "saga pattern", "workflow versioning", "durable execution", "reliable distributed systems", or mentions Temporal SDK development. version: 1.0.0 --- @@ -8,7 +8,7 @@ version: 1.0.0 ## Overview -Temporal is a durable execution platform that makes workflows survive failures automatically. This skill provides guidance for building Temporal applications in Python and TypeScript. +Temporal is a durable execution platform that makes workflows survive failures automatically. This skill provides guidance for building Temporal applications in Python, TypeScript, and Go. ## Core Architecture @@ -92,6 +92,7 @@ Once you've downloaded the file, extract the downloaded archive and add the temp 1. First, read the getting started guide for the language you are working in: - Python -> read `references/python/python.md` - TypeScript -> read `references/typescript/typescript.md` + - Go -> read `references/go/go.md` 2. Second, read appropriate `core` and language-specific references for the task at hand. diff --git a/references/core/determinism.md b/references/core/determinism.md index bf4f1ec..af824d2 100644 --- a/references/core/determinism.md +++ b/references/core/determinism.md @@ -78,9 +78,11 @@ For a few simple cases, like timestamps, random values, UUIDs, etc. 
the Temporal ## SDK Protection Mechanisms Each Temporal SDK language provides a protection mechanism to make it easier to catch non-determinism errors earlier in development: -- Python: The Python SDK runs workflows in a sandbox that intercepts and aborts non-deterministic calls at runtime. +- Python: The Python SDK runs workflows in a sandbox that intercepts and aborts non-deterministic calls early at runtime. - TypeScript: The TypeScript SDK runs workflows in an isolated V8 sandbox, intercepting many common sources of non-determinism and replacing them automatically with deterministic variants. +- Go: The Go SDK has no runtime sandbox. Therefore, non-determinism bugs will never be immediately apparent, and are usually only observable during replay. The optional `workflowcheck` static analysis tool can be used to check for many sources of non-determinism at compile time. +Regardless of which SDK you are using, it is your responsibility to ensure that workflow code does not contain sources of non-determinism. Use SDK-specific tools as well as replay tests for doing so. ## Detecting Non-Determinism diff --git a/references/core/patterns.md b/references/core/patterns.md index 93f774d..566e6f8 100644 --- a/references/core/patterns.md +++ b/references/core/patterns.md @@ -76,6 +76,7 @@ Client Workflow - Synchronous - caller waits for completion - Can mutate state AND return values - Supports validators to reject invalid updates before they even get persisted into history +- **Validators must NOT mutate workflow state or block** (no activities, sleeps, or commands) — they are read-only, similar to query handlers - Recorded in history **Example Flow**: @@ -424,6 +425,7 @@ Activity calls heartbeat() - Less visibility in Temporal UI (no separate task) - Must complete on the same worker - Not suitable for long-running operations +- **Risk with consecutive local activities:** Local activity completions are only persisted when the current Workflow Task completes.
Calling multiple local activities in a row (with nothing in between to yield the Workflow Task) increases the risk of losing work if the worker crashes mid-sequence. If you need a chain of operations with durable checkpoints between each step, use regular activities instead. ## Choosing Between Patterns diff --git a/references/go/advanced-features.md b/references/go/advanced-features.md new file mode 100644 index 0000000..55e4e57 --- /dev/null +++ b/references/go/advanced-features.md @@ -0,0 +1,187 @@ +# Go SDK Advanced Features + +## Schedules + +Create recurring workflow executions using the Schedule API. + +```go +scheduleHandle, err := c.ScheduleClient().Create(ctx, client.ScheduleOptions{ + ID: "daily-report", + Spec: client.ScheduleSpec{ + CronExpressions: []string{"0 9 * * *"}, + }, + Action: &client.ScheduleWorkflowAction{ + ID: "daily-report-workflow", + Workflow: DailyReportWorkflow, + TaskQueue: "reports", + }, +}) +``` + +Using intervals instead of cron: + +```go +scheduleHandle, err := c.ScheduleClient().Create(ctx, client.ScheduleOptions{ + ID: "hourly-sync", + Spec: client.ScheduleSpec{ + Intervals: []client.ScheduleIntervalSpec{ + {Every: time.Hour}, + }, + }, + Action: &client.ScheduleWorkflowAction{ + ID: "hourly-sync-workflow", + Workflow: SyncWorkflow, + TaskQueue: "sync", + }, +}) +``` + +Manage schedules: + +```go +handle := c.ScheduleClient().GetHandle(ctx, "daily-report") + +// Pause / unpause +handle.Pause(ctx, client.SchedulePauseOptions{Note: "Maintenance window"}) +handle.Unpause(ctx, client.ScheduleUnpauseOptions{Note: "Maintenance complete"}) + +// Trigger immediately +handle.Trigger(ctx, client.ScheduleTriggerOptions{}) + +// Describe +desc, err := handle.Describe(ctx) + +// Delete +handle.Delete(ctx) +``` + +## Async Activity Completion + +For activities that complete asynchronously (e.g., human tasks, external callbacks). 
+If you configure a `HeartbeatTimeout` on this activity, the external completer is responsible for sending heartbeats via `client.RecordActivityHeartbeat` using the task token. +If you do NOT set a `HeartbeatTimeout`, no heartbeats are required. + +**Note:** If the external system that completes the asynchronous action can reliably be trusted to do the task and Signal back with the result, and it doesn't need to Heartbeat or receive Cancellation, then consider using **signals** instead. + +**Step 1: Return `activity.ErrResultPending` from the activity.** + +```go +func RequestApproval(ctx context.Context, requestID string) (string, error) { + activityInfo := activity.GetInfo(ctx) + taskToken := activityInfo.TaskToken + + // Store taskToken externally (e.g., database) for later completion + err := storeTaskToken(requestID, taskToken) + if err != nil { + return "", err + } + + // Signal that this activity will be completed externally + return "", activity.ErrResultPending +} +``` + +**Step 2: Complete from another process using the task token.** + +```go +temporalClient, err := client.Dial(client.Options{}) + +// Complete the activity +err = temporalClient.CompleteActivity(ctx, taskToken, "approved", nil) + +// Or fail it +err = temporalClient.CompleteActivity(ctx, taskToken, nil, errors.New("rejected")) +``` + +Or complete by ID (no task token needed): + +```go +err = temporalClient.CompleteActivityByID(ctx, namespace, workflowID, runID, activityID, "approved", nil) +``` + +## Worker Tuning + +Configure `worker.Options` for production workloads: + +```go +w := worker.New(c, "my-task-queue", worker.Options{ + // Max concurrent activity executions (default: 1000) + MaxConcurrentActivityExecutionSize: 500, + + // Max concurrent workflow task executions (default: 1000) + MaxConcurrentWorkflowTaskExecutionSize: 500, + + // Max concurrent activity task pollers (default: 2) + MaxConcurrentActivityTaskPollers: 4, + + // Max concurrent workflow task pollers (default: 2) + MaxConcurrentWorkflowTaskPollers: 4, + +
// Graceful shutdown timeout (default: 0) + WorkerStopTimeout: 30 * time.Second, +}) +``` + +Scale pollers based on task queue throughput. If you observe high schedule-to-start latency, increase the number of pollers or add more workers. + +## Sessions + +Go-specific feature for routing multiple activities to the same worker. All activities using the session context execute on the same worker host. + +**Enable on the worker:** + +```go +w := worker.New(c, "fileprocessing", worker.Options{ + EnableSessionWorker: true, + MaxConcurrentSessionExecutionSize: 100, // default: 1000 +}) +``` + +**Use in a workflow:** + +```go +func FileProcessingWorkflow(ctx workflow.Context, file FileParam) error { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: time.Minute, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + sessionCtx, err := workflow.CreateSession(ctx, &workflow.SessionOptions{ + CreationTimeout: time.Minute, + ExecutionTimeout: 10 * time.Minute, + }) + if err != nil { + return err + } + defer workflow.CompleteSession(sessionCtx) + + // All three activities run on the same worker + var downloadResult string + err = workflow.ExecuteActivity(sessionCtx, DownloadFile, file.URL).Get(sessionCtx, &downloadResult) + if err != nil { + return err + } + + var processResult string + err = workflow.ExecuteActivity(sessionCtx, ProcessFile, downloadResult).Get(sessionCtx, &processResult) + if err != nil { + return err + } + + err = workflow.ExecuteActivity(sessionCtx, UploadFile, processResult).Get(sessionCtx, nil) + return err +} +``` + +Key points: +- `workflow.ErrSessionFailed` is returned if the worker hosting the session dies +- `CompleteSession` releases resources -- always call it (use `defer`) +- Use case: file processing (download, process, upload on same host), GPU workloads, or any pipeline needing local state +- `MaxConcurrentSessionExecutionSize` on `worker.Options` limits how many sessions a single worker can handle + +**Limitations:** +- Sessions do not 
survive worker process restarts — if the worker dies, the session fails and activities must be retried from the workflow level +- There is no server-side support for sessions — the Go SDK implements them entirely client-side using internal task queue routing +- Session concurrency limiting is per-process, not per-host — only one worker process per host if you rely on this + +**Relationship to worker-specific task queues:** Sessions are essentially a convenience API over the "worker-specific task queue" pattern, where each worker creates a unique task queue and routes activities to it. For simple cases where you don't need separate activities (e.g., download + process + upload can be one unit), consider using a single long-running activity with heartbeating instead. diff --git a/references/go/data-handling.md b/references/go/data-handling.md new file mode 100644 index 0000000..e887e7b --- /dev/null +++ b/references/go/data-handling.md @@ -0,0 +1,262 @@ +# Go SDK Data Handling + +## Overview + +The Go SDK uses the `converter.DataConverter` interface to serialize/deserialize workflow inputs, outputs, and activity parameters. The default converter converts values to JSON. + +## Default Data Converter + +The default `CompositeDataConverter` applies converters in order until one returns a non-nil Payload: + +1. `converter.NewNilPayloadConverter()` -- nil values +2. `converter.NewByteSlicePayloadConverter()` -- `[]byte` +3. `converter.NewProtoJSONPayloadConverter()` -- Protobuf messages as JSON +4. `converter.NewProtoPayloadConverter()` -- Protobuf messages as binary +5. `converter.NewJSONPayloadConverter()` -- anything JSON-serializable + +Structs must have exported fields to be serialized. + +## Custom Data Converter + +In most cases you don't implement the full `DataConverter` interface directly. Instead, implement a **`PayloadConverter`** for your specific type and insert it into a `CompositeDataConverter`. 
The `PayloadConverter` interface has four methods: + +```go +type PayloadConverter interface { + ToPayload(value interface{}) (*commonpb.Payload, error) // return nil if this type isn't handled + FromPayload(payload *commonpb.Payload, valuePtr interface{}) error + ToString(payload *commonpb.Payload) string + Encoding() string // e.g. "binary/msgpack" +} +``` + +**Example — custom msgpack PayloadConverter:** + +```go +import ( + "encoding/json" + "fmt" + + commonpb "go.temporal.io/api/common/v1" + "go.temporal.io/sdk/converter" + "github.com/vmihailenco/msgpack/v5" +) + +const encodingMsgpack = "binary/msgpack" + +type MsgpackPayloadConverter struct{} + +func (c *MsgpackPayloadConverter) Encoding() string { + return encodingMsgpack +} + +func (c *MsgpackPayloadConverter) ToPayload(value interface{}) (*commonpb.Payload, error) { + if value == nil { + return nil, nil + } + data, err := msgpack.Marshal(value) + if err != nil { + return nil, fmt.Errorf("msgpack marshal: %w", err) + } + return &commonpb.Payload{ + Metadata: map[string][]byte{ + converter.MetadataEncoding: []byte(encodingMsgpack), + }, + Data: data, + }, nil +} + +func (c *MsgpackPayloadConverter) FromPayload(payload *commonpb.Payload, valuePtr interface{}) error { + if string(payload.GetMetadata()[converter.MetadataEncoding]) != encodingMsgpack { + return fmt.Errorf("unsupported encoding") + } + return msgpack.Unmarshal(payload.Data, valuePtr) +} + +func (c *MsgpackPayloadConverter) ToString(payload *commonpb.Payload) string { + // Decode to a generic value for human-readable display + var v interface{} + if err := msgpack.Unmarshal(payload.Data, &v); err != nil { + return fmt.Sprintf("<msgpack decode error: %v>", err) + } + b, _ := json.Marshal(v) + return string(b) +} +``` + +**Register in a CompositeDataConverter and pass to the client:** + +```go +dataConverter := converter.NewCompositeDataConverter( + converter.NewNilPayloadConverter(), + converter.NewByteSlicePayloadConverter(), + &MsgpackPayloadConverter{}, // handles your type; falls
through to JSON for everything else + converter.NewJSONPayloadConverter(), +) + +c, err := client.Dial(client.Options{ + DataConverter: dataConverter, +}) +``` + +**Per-activity/child-workflow override** — use a different converter for specific calls: + +```go +actCtx := workflow.WithDataConverter(ctx, mySpecialConverter) +workflow.ExecuteActivity(actCtx, SensitiveActivity, input) +``` + +**Note:** If your converter makes remote calls (e.g., to a KMS for encryption), wrap it with `workflow.DataConverterWithoutDeadlockDetection` to avoid deadlock detection timeouts in workflow code. + +## Composition of Payload Converters + +Use `converter.NewCompositeDataConverter` to chain type-specific converters. The first converter that can handle the type wins. + +```go +dataConverter := converter.NewCompositeDataConverter( + converter.NewNilPayloadConverter(), + converter.NewByteSlicePayloadConverter(), + converter.NewProtoJSONPayloadConverter(), + converter.NewProtoPayloadConverter(), + YourCustomPayloadConverter(), + converter.NewJSONPayloadConverter(), +) +``` + +## Protobuf Support + +Binary protobuf: +```go +converter.NewProtoPayloadConverter() +``` + +JSON protobuf: +```go +converter.NewProtoJSONPayloadConverter() +``` + +Both are included in the default data converter. SDK v1.26.0 (March 2024) migrated from gogo/protobuf to google/protobuf. If you need backward compatibility with older payloads encoded with gogo, use the `LegacyTemporalProtoCompat` option. + +## Payload Encryption + +Implement the `converter.PayloadCodec` interface (`Encode` and `Decode`) and wrap the default data converter: + +```go +// Codec implements converter.PayloadCodec for encryption. 
+type Codec struct{} + +func (Codec) Encode(payloads []*commonpb.Payload) ([]*commonpb.Payload, error) { + result := make([]*commonpb.Payload, len(payloads)) + for i, p := range payloads { + origBytes, err := p.Marshal() + if err != nil { + return payloads, err + } + encrypted := encrypt(origBytes) // your encryption logic + result[i] = &commonpb.Payload{ + Metadata: map[string][]byte{converter.MetadataEncoding: []byte("binary/encrypted")}, + Data: encrypted, + } + } + return result, nil +} + +func (Codec) Decode(payloads []*commonpb.Payload) ([]*commonpb.Payload, error) { + result := make([]*commonpb.Payload, len(payloads)) + for i, p := range payloads { + if string(p.Metadata[converter.MetadataEncoding]) != "binary/encrypted" { + result[i] = p + continue + } + decrypted := decrypt(p.Data) // your decryption logic + result[i] = &commonpb.Payload{} + err := result[i].Unmarshal(decrypted) + if err != nil { + return payloads, err + } + } + return result, nil +} +``` + +Wrap with `CodecDataConverter` and pass to client: + +```go +var DataConverter = converter.NewCodecDataConverter( + converter.GetDefaultDataConverter(), + &Codec{}, +) + +c, err := client.Dial(client.Options{ + DataConverter: DataConverter, +}) +``` + +## Search Attributes + +Set at workflow start: + +```go +handle, err := c.ExecuteWorkflow(ctx, client.StartWorkflowOptions{ + ID: "order-123", + TaskQueue: "orders", + SearchAttributes: map[string]interface{}{ + "OrderStatus": "pending", + "CustomerId": "cust-456", + }, +}, OrderWorkflow, input) +``` + +Upsert from within a workflow: + +```go +err := workflow.UpsertSearchAttributes(ctx, map[string]interface{}{ + "OrderStatus": "completed", +}) +``` + +Typed search attributes (v1.26.0+, preferred): + +```go +var OrderStatusKey = temporal.NewSearchAttributeKeyKeyword("OrderStatus") + +err := workflow.UpsertTypedSearchAttributes(ctx, OrderStatusKey.ValueSet("completed")) +``` + +Query workflows by search attributes: + +```go +resp, err := 
c.ListWorkflow(ctx, &workflowservice.ListWorkflowExecutionsRequest{ + Query: `OrderStatus = "pending" AND CustomerId = "cust-456"`, +}) +``` + +## Workflow Memo + +Set in start options: + +```go +handle, err := c.ExecuteWorkflow(ctx, client.StartWorkflowOptions{ + ID: "order-123", + TaskQueue: "orders", + Memo: map[string]interface{}{ + "customerName": "Alice", + "notes": "Priority customer", + }, +}, OrderWorkflow, input) +``` + +Read memo from workflow info. Upsert memo (Go SDK only): + +```go +err := workflow.UpsertMemo(ctx, map[string]interface{}{ + "notes": "Updated notes", +}) +``` + +## Best Practices + +1. Use structs with exported fields for inputs and outputs +2. Prefer JSON for readability during development, protobuf for performance in production +3. Keep payloads small -- see `references/core/gotchas.md` for limits +4. Use `PayloadCodec` for encryption; never store sensitive data unencrypted +5. Configure the same data converter on both client and worker diff --git a/references/go/determinism-protection.md b/references/go/determinism-protection.md new file mode 100644 index 0000000..4a6f5f4 --- /dev/null +++ b/references/go/determinism-protection.md @@ -0,0 +1,98 @@ +# Go Workflow Determinism Protection + +## Overview + +The Go SDK has no runtime sandbox. Determinism is enforced by **developer convention** and **optional static analysis**. Unlike the Python and TypeScript SDKs, the Go SDK will not intercept or replace non-deterministic calls at runtime. The Go SDK does perform a limited runtime command-ordering check, but catching non-deterministic code before deployment requires the `workflowcheck` tool and testing, in particular replay tests (see `references/go/testing.md`). + +## workflowcheck Static Analysis + +### Install + +```bash +go install go.temporal.io/sdk/contrib/tools/workflowcheck@latest +``` + +### Run + +```bash +workflowcheck ./... +``` + +No output means all registered workflows are deterministic.
Non-deterministic code produces hierarchical output showing the call chain to the offending code. + +Use `-show-pos` for exact file positions: + +```bash +workflowcheck -show-pos ./... +``` + +### What It Detects + +**Non-deterministic functions/variables:** +- `time.Now` -- obtaining current time +- `time.Sleep` -- sleeping +- `crypto/rand.Reader` -- crypto random reader +- `math/rand.globalRand` -- global pseudorandom +- `os.Stdin`, `os.Stdout`, `os.Stderr` -- standard I/O streams + +**Non-deterministic Go constructs:** +- Starting a goroutine (`go func()`) +- Sending to a channel +- Receiving from a channel +- Iterating over a channel via `range` +- Iterating over a map via `range` + +### Limitations + +`workflowcheck` cannot catch everything. It does **not** detect: +- Global variable mutation +- Non-determinism via reflection +- Runtime-conditional non-determinism + +### Suppressing False Positives + +Add `//workflowcheck:ignore` on or directly above the offending line: + +```go +now := time.Now() //workflowcheck:ignore +``` + +For broader suppression, use a YAML config file: + +```yaml +# workflowcheck.config.yaml +decls: + path/to/package.MyDeterministicFunc: false +``` + +```bash +workflowcheck -config workflowcheck.config.yaml ./... +``` + +## Determinism Rules + +**You must:** +- Use `workflow.Go(ctx, func(ctx workflow.Context) { ... 
})` instead of `go` +- Use `workflow.NewChannel(ctx)` instead of `chan` +- Use `workflow.NewSelector(ctx)` instead of `select` +- Use `workflow.Sleep(ctx, duration)` instead of `time.Sleep()` +- Use `workflow.Now(ctx)` instead of `time.Now()` +- Use `workflow.GetLogger(ctx)` instead of `fmt.Println` / `log.Println` +- Sort map keys before iterating, or use `workflow.SideEffect` / an activity + +**You must not:** +- Start native goroutines +- Use native channels or `select` +- Call `time.Now()` or `time.Sleep()` +- Use `math/rand` global functions or `crypto/rand.Reader` +- Access `os.Stdin`, `os.Stdout`, or `os.Stderr` +- Mutate global variables +- Make network calls, file I/O, or database queries (use activities) + +## Best Practices + +1. **Run `workflowcheck` in CI / pre-commit** -- catch non-deterministic code before it reaches production +2. **Keep workflow code thin** -- workflows should orchestrate; delegate all I/O and non-deterministic work to activities +3. **Use struct methods for activities** -- keeps imports clean and avoids pulling non-deterministic dependencies into workflow files +4. **Separate workflow and activity files** -- reduces the surface area that `workflowcheck` needs to analyze and keeps concerns isolated +5. **Test with replay** after any workflow code change to verify backward compatibility diff --git a/references/go/determinism.md b/references/go/determinism.md new file mode 100644 index 0000000..0cff905 --- /dev/null +++ b/references/go/determinism.md @@ -0,0 +1,52 @@ +# Go SDK Determinism + +## Overview + +The Go SDK has NO runtime sandbox (unlike Python/TypeScript). Workflows must be deterministic for replay, and determinism is enforced entirely by developer convention and optional static analysis via the `workflowcheck` tool (see `references/go/determinism-protection.md`). + +## Why Determinism Matters: History Replay + +Temporal provides durable execution through **History Replay**. 
When a Worker restores workflow state, it re-executes workflow code from the beginning. This requires the code to be **deterministic**. See `references/core/determinism.md` for a deep explanation. + +## Forbidden Operations + +Do not use any of the following in workflow code: + +- **Native goroutines** (`go func()`) -- use `workflow.Go()` instead +- **Native channels** (`chan`, send, receive, `range` over channel) -- use `workflow.Channel` instead +- **Native `select`** -- use `workflow.Selector` instead +- **`time.Now()`** -- use `workflow.Now(ctx)` instead +- **`time.Sleep()`** -- use `workflow.Sleep(ctx, duration)` instead +- **`math/rand` global** (e.g., `rand.Intn()`) -- use `workflow.SideEffect` instead +- **`crypto/rand.Reader`** -- use an activity instead +- **`os.Stdin` / `os.Stdout` / `os.Stderr`** -- use `workflow.GetLogger(ctx)` for logging +- **Map range iteration** (`for k, v := range myMap`) -- sort keys first, then iterate +- **Mutating global variables** -- use local state or `workflow.SideEffect` +- **Anonymous functions as local activities** -- the name is derived from the function and will be non-deterministic across replays; always use named functions for local activities + +## Safe Builtin Alternatives + +| Instead of | Use | +|---|---| +| `go func() { ... }()` | `workflow.Go(ctx, func(ctx workflow.Context) { ... })` | +| `chan T` | `workflow.NewChannel(ctx)` / `workflow.NewBufferedChannel(ctx, size)` | +| `select { ... }` | `workflow.NewSelector(ctx)` | +| `time.Now()` | `workflow.Now(ctx)` | +| `time.Sleep(d)` | `workflow.Sleep(ctx, d)` | +| `rand.Intn(100)` | `workflow.SideEffect(ctx, func(ctx workflow.Context) interface{} { return rand.Intn(100) })` | +| `uuid.New()` | `workflow.SideEffect` or pass as activity result | +| `log.Println(...)` | `workflow.GetLogger(ctx).Info(...)` | + +## Testing Replay Compatibility + +Use `worker.WorkflowReplayer` to verify code changes are compatible with existing histories. 
See the Workflow Replay Testing section of `references/go/testing.md` + +## Best Practices + +1. Run `workflowcheck ./...` in CI to catch non-deterministic code early +2. Always use `workflow.*` APIs instead of native Go concurrency and time primitives +3. Move all I/O operations (network, filesystem, database) into activities +4. Sort map keys before iterating if you must iterate over a map in workflow code +5. Use `workflow.GetLogger(ctx)` instead of `fmt.Println` or `log.Println` for replay-safe logging +6. Keep workflow code focused on orchestration; delegate non-deterministic work to activities +7. Test with replay after making changes to workflow definitions diff --git a/references/go/error-handling.md b/references/go/error-handling.md new file mode 100644 index 0000000..92a856b --- /dev/null +++ b/references/go/error-handling.md @@ -0,0 +1,184 @@ +# Go SDK Error Handling + +## Overview + +The Go SDK uses error return values (not exceptions). All Temporal errors implement the `error` interface. Activity errors returned to workflows are wrapped in `*temporal.ActivityError`; use `errors.As` to unwrap them. + +## Application Errors + +```go +import "go.temporal.io/sdk/temporal" + +func ValidateOrder(ctx context.Context, order Order) error { + if !order.IsValid() { + return temporal.NewApplicationError( + "Invalid order", + "ValidationError", + ) + } + return nil +} +``` + +`temporal.NewApplicationError(message, errType, details...)` creates a retryable `*temporal.ApplicationError`. Use `NewApplicationErrorWithCause` to include a wrapped cause. 
+ +## Non-Retryable Errors + +```go +func ChargeCard(ctx context.Context, input ChargeCardInput) (string, error) { + if !isValidCard(input.CardNumber) { + return "", temporal.NewNonRetryableApplicationError( + "Permanent failure - invalid credit card", + "PaymentError", + nil, // cause + ) + } + return processPayment(input.CardNumber, input.Amount) +} +``` + +`temporal.NewNonRetryableApplicationError(message, errType, cause, details...)` is always non-retryable regardless of RetryPolicy. You can also mark error types as non-retryable in the RetryPolicy instead: + +```go +RetryPolicy: &temporal.RetryPolicy{ + NonRetryableErrorTypes: []string{"PaymentError", "ValidationError"}, +}, +``` + +## Handling Activity Errors in Workflows + +```go +import ( + "errors" + + "go.temporal.io/sdk/temporal" + "go.temporal.io/sdk/workflow" +) + +func MyWorkflow(ctx workflow.Context) (string, error) { + var result string + err := workflow.ExecuteActivity(ctx, RiskyActivity).Get(ctx, &result) + if err != nil { + var applicationErr *temporal.ApplicationError + if errors.As(err, &applicationErr) { + switch applicationErr.Type() { + case "ValidationError": + // handle validation error + case "PaymentError": + // handle payment error + default: + // handle unknown error type + } + } + + var timeoutErr *temporal.TimeoutError + if errors.As(err, &timeoutErr) { + switch timeoutErr.TimeoutType() { + case enumspb.TIMEOUT_TYPE_START_TO_CLOSE: + // handle start-to-close timeout + case enumspb.TIMEOUT_TYPE_HEARTBEAT: + // handle heartbeat timeout + } + } + + var canceledErr *temporal.CanceledError + if errors.As(err, &canceledErr) { + // handle cancellation + } + + var panicErr *temporal.PanicError + if errors.As(err, &panicErr) { + // panicErr.Error() and panicErr.StackTrace() + } + + return "", err + } + return result, nil +} +``` + +## Retry Configuration + +```go +import ( + "time" + + "go.temporal.io/sdk/temporal" + "go.temporal.io/sdk/workflow" +) + +func MyWorkflow(ctx workflow.Context) 
error { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 10 * time.Minute, + RetryPolicy: &temporal.RetryPolicy{ + InitialInterval: time.Second, + BackoffCoefficient: 2.0, + MaximumInterval: time.Minute, + MaximumAttempts: 5, + NonRetryableErrorTypes: []string{"ValidationError", "PaymentError"}, + }, + } + ctx = workflow.WithActivityOptions(ctx, ao) + return workflow.ExecuteActivity(ctx, MyActivity).Get(ctx, nil) +} +``` + +Only set options such as `MaximumInterval`, `MaximumAttempts`, etc. if you have a domain-specific reason to. If not, prefer to leave them at their defaults. + +## Timeout Configuration + +```go +ao := workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, // Single attempt max duration + ScheduleToCloseTimeout: 30 * time.Minute, // Total time including retries + ScheduleToStartTimeout: 10 * time.Minute, // Time waiting in task queue + HeartbeatTimeout: 2 * time.Minute, // Between heartbeats +} +ctx = workflow.WithActivityOptions(ctx, ao) +``` + +- **StartToCloseTimeout**: Max time for a single Activity Task Execution. Prefer this over ScheduleToCloseTimeout. +- **ScheduleToCloseTimeout**: Total time including retries. +- **ScheduleToStartTimeout**: Time an Activity Task can wait in the Task Queue before a Worker picks it up. Rarely needed. +- **HeartbeatTimeout**: Max time between heartbeats. Required for long-running activities to detect failures. + +Either `StartToCloseTimeout` or `ScheduleToCloseTimeout` must be set. + +## Workflow Failure + +Returning any error from a workflow function fails the execution. Return `nil` for success. + +**Important Go-specific behavior:** In the Go SDK, returning any error from a workflow fails the workflow execution by default — there is no automatic retry. This differs from other SDKs (Python, TypeScript) where non-`ApplicationError` exceptions cause the workflow task to retry indefinitely. 
In Go, if you want workflow-level retries, you must explicitly set a `RetryPolicy` on the `StartWorkflowOptions`. + +```go +func MyWorkflow(ctx workflow.Context) (string, error) { + if someCondition { + return "", temporal.NewApplicationError( + "Cannot process order", + "BusinessError", + ) + } + return "success", nil +} +``` + +To prevent workflow retry, return a non-retryable error: + +```go +return "", temporal.NewNonRetryableApplicationError( + "Unrecoverable failure", + "FatalError", + nil, +) +``` + +**Note:** If an activity returns a non-retryable error, the workflow receives an `*temporal.ActivityError` wrapping it. To fail the workflow without retry, wrap it in a new `NewNonRetryableApplicationError`. + +## Best Practices + +1. Use specific error types for different failure modes +2. Mark permanent failures as non-retryable +3. Set appropriate timeouts; prefer `StartToCloseTimeout` over `ScheduleToCloseTimeout` +4. Let Temporal handle retries via RetryPolicy rather than implementing retry logic yourself +5. Use `errors.As` to unwrap and inspect specific error types +6. Design activities to be idempotent for safe retries (see `references/core/patterns.md`) diff --git a/references/go/go.md b/references/go/go.md new file mode 100644 index 0000000..cc87a6a --- /dev/null +++ b/references/go/go.md @@ -0,0 +1,242 @@ +# Temporal Go SDK Reference + +## Overview + +The Temporal Go SDK (`go.temporal.io/sdk`) provides a strongly-typed, idiomatic Go approach to building durable workflows. Workflows are regular exported Go functions. The Go SDK does not have an automatic sandbox -- determinism is the developer's responsibility, aided by the `workflowcheck` static analysis tool. 
+ +## Quick Start + +**Add Dependency:** In your Go module, add the Temporal SDK: +```bash +go get go.temporal.io/sdk +``` + +**workflows/greeting.go** - Workflow definition: +```go +package workflows + +import ( + "time" + + "go.temporal.io/sdk/workflow" +) + +func GreetingWorkflow(ctx workflow.Context, name string) (string, error) { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: time.Minute, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + var result string + err := workflow.ExecuteActivity(ctx, "Greet", name).Get(ctx, &result) + if err != nil { + return "", err + } + return result, nil +} +``` + +**activities/greet.go** - Activity definition: +```go +package activities + +import ( + "context" + "fmt" +) + +type Activities struct{} + +func (a *Activities) Greet(ctx context.Context, name string) (string, error) { + return fmt.Sprintf("Hello, %s!", name), nil +} +``` + +**worker/main.go** - Worker setup: +```go +package main + +import ( + "log" + + "yourmodule/activities" + "yourmodule/workflows" + + "go.temporal.io/sdk/client" + "go.temporal.io/sdk/worker" +) + +func main() { + c, err := client.Dial(client.Options{}) + if err != nil { + log.Fatalln("Unable to create client", err) + } + defer c.Close() + + w := worker.New(c, "my-task-queue", worker.Options{}) + + w.RegisterWorkflow(workflows.GreetingWorkflow) + w.RegisterActivity(&activities.Activities{}) + + err = w.Run(worker.InterruptCh()) + if err != nil { + log.Fatalln("Unable to start worker", err) + } +} +``` + +**Start the dev server:** Start `temporal server start-dev` in the background. + +**Start the worker:** Run `go run worker/main.go` in the background. 
+ +**starter/main.go** - Start a workflow execution: +```go +package main + +import ( + "context" + "fmt" + "log" + + "yourmodule/workflows" + + "github.com/google/uuid" + "go.temporal.io/sdk/client" +) + +func main() { + c, err := client.Dial(client.Options{}) + if err != nil { + log.Fatalln("Unable to create client", err) + } + defer c.Close() + + options := client.StartWorkflowOptions{ + ID: uuid.NewString(), + TaskQueue: "my-task-queue", + } + + we, err := c.ExecuteWorkflow(context.Background(), options, workflows.GreetingWorkflow, "my name") + if err != nil { + log.Fatalln("Unable to execute workflow", err) + } + + var result string + err = we.Get(context.Background(), &result) + if err != nil { + log.Fatalln("Unable to get workflow result", err) + } + + fmt.Println("Result:", result) +} +``` + +**Run the workflow:** Run `go run starter/main.go`. Should output: `Result: Hello, my name!`. + +## Key Concepts + +### Workflow Definition +- Exported function with `workflow.Context` as the first parameter +- Returns `(ResultType, error)` or just `error` +- Signature: `func MyWorkflow(ctx workflow.Context, input MyInput) (MyOutput, error)` +- Use `workflow.SetQueryHandler()`, `workflow.SetUpdateHandler()` for handlers +- Register with `w.RegisterWorkflow(MyWorkflow)` + +### Activity Definition +- Regular function or struct methods with `context.Context` as the first parameter +- Struct methods are preferred for dependency injection +- Signature: `func (a *Activities) MyActivity(ctx context.Context, input string) (string, error)` +- Register struct with `w.RegisterActivity(&Activities{})` (registers all exported methods) + +### Worker Setup +- Create client with `client.Dial(client.Options{})` +- Create worker with `worker.New(c, "task-queue", worker.Options{})` +- Register workflows and activities +- Run with `w.Run(worker.InterruptCh())` + +### Determinism + +**Workflow code must be deterministic!** The Go SDK has no sandbox -- determinism is enforced by convention 
and tooling. + +Use Temporal replacements instead of native Go constructs: +- `workflow.Go()` instead of `go` (goroutines) +- `workflow.Channel` instead of `chan` +- `workflow.Selector` instead of `select` +- `workflow.Sleep()` instead of `time.Sleep()` +- `workflow.Now()` instead of `time.Now()` +- `workflow.GetLogger()` instead of `log` / `fmt.Println` for replay-safe logging + +Use the **`workflowcheck`** static analysis tool to catch non-deterministic code: +```bash +go install go.temporal.io/sdk/contrib/tools/workflowcheck@latest +workflowcheck ./... +``` + +Read `references/core/determinism.md` and `references/go/determinism.md` to understand more. + +## File Organization Best Practice + +**Use separate packages for workflows, activities, and worker.** Activities as struct methods enable dependency injection at the worker level. + +``` +myapp/ +├── workflows/ +│ └── greeting.go # Only Workflow functions +├── activities/ +│ └── greet.go # Activity struct and methods +├── worker/ +│ └── main.go # Worker setup, imports both +└── starter/ + └── main.go # Client code to start workflows +``` + +**Activities as struct methods for dependency injection:** +```go +// activities/greet.go +type Activities struct { + HTTPClient *http.Client + DB *sql.DB +} + +func (a *Activities) FetchData(ctx context.Context, url string) (string, error) { + // Use a.HTTPClient, a.DB, etc. +} +``` + +```go +// worker/main.go - inject dependencies at worker startup +activities := &activities.Activities{ + HTTPClient: http.DefaultClient, + DB: db, +} +w.RegisterActivity(activities) +``` + +## Common Pitfalls + +1. **Using native goroutines/channels/select** - Use `workflow.Go()`, `workflow.Channel`, `workflow.Selector` +2. **Using `time.Sleep` or `time.Now`** - Use `workflow.Sleep()` and `workflow.Now()` +3. **Iterating over maps with `range`** - Map iteration order is non-deterministic; sort keys first +4. 
**Forgetting to register workflows/activities** - Worker will fail tasks for unregistered types +5. **Registering activity functions instead of struct** - Use `w.RegisterActivity(&Activities{})` not `w.RegisterActivity(a.MyMethod)` +6. **Forgetting to heartbeat** - Long-running activities need `activity.RecordHeartbeat(ctx, details)` +7. **Using `fmt.Println` in workflows** - Use `workflow.GetLogger(ctx)` for replay-safe logging +8. **Not setting Activity timeouts** - `StartToCloseTimeout` or `ScheduleToCloseTimeout` is required in `ActivityOptions` + +## Writing Tests + +See `references/go/testing.md` for info on writing tests. + +## Additional Resources + +### Reference Files +- **`references/go/patterns.md`** - Signals, queries, child workflows, saga pattern, etc. +- **`references/go/determinism.md`** - Determinism rules, workflowcheck tool, safe alternatives +- **`references/go/gotchas.md`** - Go-specific mistakes and anti-patterns +- **`references/go/error-handling.md`** - ApplicationError, retry policies, non-retryable errors +- **`references/go/observability.md`** - Logging, metrics, tracing, Search Attributes +- **`references/go/testing.md`** - TestWorkflowEnvironment, time-skipping, activity mocking +- **`references/go/advanced-features.md`** - Schedules, worker tuning, and more +- **`references/go/data-handling.md`** - Data converters, payload codecs, encryption +- **`references/go/versioning.md`** - Patching API (`workflow.GetVersion`), Worker Versioning +- **`references/go/determinism-protection.md`** - Information on the **`workflowcheck`** tool to help statically check for determinism issues. diff --git a/references/go/gotchas.md b/references/go/gotchas.md new file mode 100644 index 0000000..4b7ddf3 --- /dev/null +++ b/references/go/gotchas.md @@ -0,0 +1,290 @@ +# Go Gotchas + +Go-specific mistakes and anti-patterns. See also [Common Gotchas](references/core/gotchas.md) for language-agnostic concepts.
+ +## Goroutines and Concurrency + +### Using Native Go Concurrency Primitives + +**The Problem**: Native `go`, `chan`, and `select` are non-deterministic and will cause replay failures. + +```go +// BAD - Native goroutine +func MyWorkflow(ctx workflow.Context) error { + go func() { // Non-deterministic! + // do work + }() + return nil +} + +// GOOD - Use workflow.Go +func MyWorkflow(ctx workflow.Context) error { + workflow.Go(ctx, func(gCtx workflow.Context) { + // do work + }) + return nil +} +``` + +```go +// BAD - Native channel +func MyWorkflow(ctx workflow.Context) error { + ch := make(chan string) // Non-deterministic! + return nil +} + +// GOOD - Use workflow.Channel +func MyWorkflow(ctx workflow.Context) error { + ch := workflow.NewChannel(ctx) + return nil +} +``` + +```go +// BAD - Native select +select { +case val := <-ch1: + // handle +case val := <-ch2: + // handle +} + +// GOOD - Use workflow.Selector +selector := workflow.NewSelector(ctx) +selector.AddReceive(ch1, func(c workflow.ReceiveChannel, more bool) { + var val string + c.Receive(ctx, &val) + // handle +}) +selector.AddReceive(ch2, func(c workflow.ReceiveChannel, more bool) { + var val string + c.Receive(ctx, &val) + // handle +}) +selector.Select(ctx) +``` + +## Non-Deterministic Operations + +### Map Iteration + +```go +// BAD - Map range order is randomized +for k, v := range myMap { + // Non-deterministic order! 
+} + +// GOOD - Sort keys first +keys := make([]string, 0, len(myMap)) +for k := range myMap { + keys = append(keys, k) +} +sort.Strings(keys) +for _, k := range keys { + v := myMap[k] + // Deterministic order +} +``` + +### Time and Randomness + +```go +// BAD +t := time.Now() // System clock, non-deterministic +time.Sleep(time.Second) // Not replay-safe +r := rand.Intn(100) // Non-deterministic + +// GOOD +t := workflow.Now(ctx) // Deterministic +workflow.Sleep(ctx, time.Second) // Durable timer +encoded := workflow.SideEffect(ctx, func(ctx workflow.Context) interface{} { + return rand.Intn(100) +}) +var r int +encoded.Get(&r) +``` + +Use the `workflowcheck` static analysis tool to catch non-deterministic calls. For false positives, add `//workflowcheck:ignore` on the offending line or on the line directly above it. + +### Anonymous Functions as Local Activities + +**The Problem**: The Go SDK derives the local activity name from the function. Anonymous functions get a non-deterministic name that can change across builds, causing replay failures. + +```go +// BAD - anonymous function: name is non-deterministic +workflow.ExecuteLocalActivity(ctx, func(ctx context.Context) (string, error) { + return "result", nil +}) + +// GOOD - named function: stable, deterministic name +func QuickLookup(ctx context.Context) (string, error) { + return "result", nil +} + +workflow.ExecuteLocalActivity(ctx, QuickLookup) +``` + +Always use named functions for local activities (and regular activities). + +## Wrong Retry Classification + +**Example:** Transient network errors should be retried. Authentication errors should not be. +See `references/go/error-handling.md` for detailed guidance on error classification and retry policies.
+ +## Heartbeating + +### Forgetting to Heartbeat Long Activities + +```go +// BAD - No heartbeat, can't detect stuck activities or receive cancellation +func ProcessLargeFile(ctx context.Context, path string) error { + for _, chunk := range readChunks(path) { + process(chunk) // Takes hours, no heartbeat + } + return nil +} + +// GOOD - Regular heartbeats with progress +func ProcessLargeFile(ctx context.Context, path string) error { + for i, chunk := range readChunks(path) { + activity.RecordHeartbeat(ctx, fmt.Sprintf("Processing chunk %d", i)) + process(chunk) + } + return nil +} +``` + +### Heartbeat Timeout Too Short + +```go +// BAD - Heartbeat timeout shorter than processing time +ao := workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Minute, + HeartbeatTimeout: 10 * time.Second, // Too short! +} + +// GOOD - Heartbeat timeout allows for processing variance +ao := workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Minute, + HeartbeatTimeout: 2 * time.Minute, +} +``` + +Set heartbeat timeout as high as acceptable for your use case -- each heartbeat counts as an action. + +## Cancellation + +### Not Handling Workflow Cancellation + +```go +// BAD - Cleanup doesn't run on cancellation +func BadWorkflow(ctx workflow.Context) error { + _ = workflow.ExecuteActivity(ctx, AcquireResource).Get(ctx, nil) + _ = workflow.ExecuteActivity(ctx, DoWork).Get(ctx, nil) + _ = workflow.ExecuteActivity(ctx, ReleaseResource).Get(ctx, nil) // Never runs if cancelled! 
+ return nil +} + +// GOOD - Use defer with NewDisconnectedContext for cleanup +func GoodWorkflow(ctx workflow.Context) error { + defer func() { + if !errors.Is(ctx.Err(), workflow.ErrCanceled) { + return + } + newCtx, _ := workflow.NewDisconnectedContext(ctx) + _ = workflow.ExecuteActivity(newCtx, ReleaseResource).Get(newCtx, nil) + }() + + err := workflow.ExecuteActivity(ctx, AcquireResource).Get(ctx, nil) + if err != nil { + return err + } + return workflow.ExecuteActivity(ctx, DoWork).Get(ctx, nil) +} +``` + +### Not Handling Activity Cancellation + +Activities must **opt in** to receive cancellation. This requires: +1. **Heartbeating** - Cancellation is delivered via heartbeat +2. **Checking ctx.Done()** - Detect when cancellation arrives + +```go +// BAD - Activity ignores cancellation +func LongActivity(ctx context.Context) error { + doExpensiveWork() // Runs to completion even if cancelled + return nil +} + +// GOOD - Heartbeat and check ctx.Done() +func LongActivity(ctx context.Context) error { + for i, item := range items { + select { + case <-ctx.Done(): + cleanup() + return ctx.Err() + default: + activity.RecordHeartbeat(ctx, fmt.Sprintf("Processing item %d", i)) + process(item) + } + } + return nil +} +``` + +## Testing + +### Not Testing Failures + +It is important to make sure workflows work as expected under failure paths in addition to happy paths. Please see `references/go/testing.md` for more info. + +### Not Testing Replay + +Replay tests help you test that you do not have hidden sources of non-determinism bugs in your workflow code, and should be considered in addition to standard testing. Please see `references/go/testing.md` for more info. + +## Timers and Sleep + +### Using time.Sleep Instead of workflow.Sleep + +```go +// BAD: time.Sleep is not deterministic during replay +func BadWorkflow(ctx workflow.Context) error { + time.Sleep(60 * time.Second) // Non-deterministic! 
+ return nil +} + +// GOOD: Use workflow.Sleep for deterministic timers +func GoodWorkflow(ctx workflow.Context) error { + workflow.Sleep(ctx, 60*time.Second) // Deterministic + return nil +} +``` + +### Using time.After Instead of workflow.NewTimer + +```go +// BAD: time.After is not replay-safe +func BadWorkflow(ctx workflow.Context) error { + <-time.After(5 * time.Minute) // Non-deterministic! + return nil +} + +// GOOD: Use workflow.NewTimer for durable timers +func GoodWorkflow(ctx workflow.Context) error { + timer := workflow.NewTimer(ctx, 5*time.Minute) + _ = timer.Get(ctx, nil) // Deterministic, durable + return nil +} +``` + +### Using time.Now() Instead of workflow.Now() + +```go +// BAD: time.Now() differs between execution and replay +deadline := time.Now().Add(24 * time.Hour) + +// GOOD: workflow.Now() is replay-safe +deadline := workflow.Now(ctx).Add(24 * time.Hour) +``` + +**Why this matters:** `time.Now()`, `time.Sleep()`, and `time.After()` use the system clock, which differs between original execution and replay. The `workflow.*` equivalents create durable, deterministic entries in the event history. diff --git a/references/go/observability.md b/references/go/observability.md new file mode 100644 index 0000000..ba55140 --- /dev/null +++ b/references/go/observability.md @@ -0,0 +1,153 @@ +# Go SDK Observability + +## Overview + +The Go SDK provides replay-safe logging via `workflow.GetLogger`, metrics via the Tally library with Prometheus export, and tracing via OpenTelemetry, OpenTracing, or Datadog. + +## Logging / Replay-Aware Logging + +### Workflow Logging + +Use `workflow.GetLogger(ctx)` for replay-safe logging. This logger automatically suppresses duplicate messages during replay. 
+ +```go +func MyWorkflow(ctx workflow.Context, input string) (string, error) { + logger := workflow.GetLogger(ctx) + logger.Info("Workflow started", "input", input) + + var result string + err := workflow.ExecuteActivity(ctx, MyActivity, input).Get(ctx, &result) + if err != nil { + logger.Error("Activity failed", "error", err) + return "", err + } + + logger.Info("Workflow completed", "result", result) + return result, nil +} +``` + +The workflow logger automatically: +- Suppresses duplicate logs during replay +- Includes workflow context (workflow ID, run ID, etc.) + +### Activity Logging + +Use `activity.GetLogger(ctx)` for context-aware activity logging: + +```go +func MyActivity(ctx context.Context, input string) (string, error) { + logger := activity.GetLogger(ctx) + logger.Info("Processing input", "input", input) + // ... + return "done", nil +} +``` + +Activity logger includes: +- Activity ID, type, and task queue +- Workflow ID and run ID +- Attempt number (for retries) + +### Adding Persistent Fields + +Use `log.With` to create a logger with key-value pairs included in every entry: + +```go +logger := log.With(workflow.GetLogger(ctx), "orderId", orderId, "customerId", customerId) +logger.Info("Processing order") // includes orderId and customerId +``` + +## Customizing the Logger + +Set a custom logger via `client.Options{Logger: myLogger}`. Implement the `log.Logger` interface (Debug, Info, Warn, Error methods). + +### Using slog (Go 1.21+) + +```go +import ( + "log/slog" + "os" + + tlog "go.temporal.io/sdk/log" +) + +slogHandler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelDebug}) +logger := tlog.NewStructuredLogger(slog.New(slogHandler)) + +c, err := client.Dial(client.Options{ + Logger: logger, +}) +``` + +### Using Third-Party Loggers (Logrus, Zap, etc.) 
+ +Use the [logur](https://github.com/logur/logur) adapter package: + +```go +import ( + "github.com/sirupsen/logrus" + logrusadapter "logur.dev/adapter/logrus" + "logur.dev/logur" +) + +logger := logur.LoggerToKV(logrusadapter.New(logrus.New())) +c, err := client.Dial(client.Options{ + Logger: logger, +}) +``` + +## Metrics + +Use the Tally library (`go.temporal.io/sdk/contrib/tally`) with Prometheus: + +```go +import ( + sdktally "go.temporal.io/sdk/contrib/tally" + "github.com/uber-go/tally/v4" + "github.com/uber-go/tally/v4/prometheus" +) + +func newPrometheusScope(c prometheus.Configuration) tally.Scope { + reporter, err := c.NewReporter( + prometheus.ConfigurationOptions{}, + ) + if err != nil { + log.Fatalln("error creating prometheus reporter", err) + } + scopeOpts := tally.ScopeOptions{ + CacheReporter: reporter, + Separator: "_", + SanitizeOptions: &sdktally.PrometheusSanitizeOptions, + } + scope, _ := tally.NewRootScope(scopeOpts, time.Second) + scope = sdktally.NewPrometheusNamingScope(scope) + return scope +} + +c, err := client.Dial(client.Options{ + MetricsHandler: sdktally.NewMetricsHandler(newPrometheusScope(prometheus.Configuration{ + ListenAddress: "0.0.0.0:9090", + TimerType: "histogram", + })), +}) +``` + +Key SDK metrics: +- `temporal_workflow_task_execution_latency` -- Workflow task processing time +- `temporal_activity_execution_latency` -- Activity execution time +- `temporal_workflow_task_replay_latency` -- Replay duration +- `temporal_request` -- Client requests to server +- `temporal_activity_schedule_to_start_latency` -- Time from scheduling to start + +## Search Attributes (Visibility) + +See the Search Attributes section of `references/go/data-handling.md` + +## Best Practices + +1. Always use `workflow.GetLogger(ctx)` in workflows -- never `fmt.Println` or `log.Println` (they produce duplicates on replay) +2. Use `activity.GetLogger(ctx)` in activities for structured context +3. Set up Prometheus metrics in production +4. 
Use search attributes for operational visibility and debugging +5. Use `workflow.IsReplaying(ctx)` only for custom side-effect-free logging -- the built-in logger handles replay suppression automatically diff --git a/references/go/patterns.md b/references/go/patterns.md new file mode 100644 index 0000000..732083f --- /dev/null +++ b/references/go/patterns.md @@ -0,0 +1,536 @@ +# Go SDK Patterns + +## Signals + +In Go, signals are received via channels, not handler functions. + +```go +func OrderWorkflow(ctx workflow.Context) (string, error) { + approved := false + var items []string + + approveCh := workflow.GetSignalChannel(ctx, "approve") + addItemCh := workflow.GetSignalChannel(ctx, "add-item") + + // Listen for signals in a goroutine so workflow can proceed + workflow.Go(ctx, func(ctx workflow.Context) { + for { + selector := workflow.NewSelector(ctx) + selector.AddReceive(approveCh, func(c workflow.ReceiveChannel, more bool) { + c.Receive(ctx, &approved) + }) + selector.AddReceive(addItemCh, func(c workflow.ReceiveChannel, more bool) { + var item string + c.Receive(ctx, &item) + items = append(items, item) + }) + selector.Select(ctx) + } + }) + + // Wait for approval + workflow.Await(ctx, func() bool { return approved }) + return fmt.Sprintf("Processed %d items", len(items)), nil +} +``` + +### Blocking receive from a single channel + +When waiting on a single signal, no Selector is needed: + +```go +var approveInput ApproveInput +workflow.GetSignalChannel(ctx, "approve").Receive(ctx, &approveInput) +``` + +## Queries + +**Important:** Queries must NOT modify workflow state. Query handlers run outside workflow context -- do not call `workflow.Go()`, `workflow.NewChannel()`, or any blocking workflow functions. 
+ +```go +func StatusWorkflow(ctx workflow.Context) error { + currentState := "started" + progress := 0 + + err := workflow.SetQueryHandler(ctx, "get-status", func() (string, error) { + return currentState, nil + }) + if err != nil { + return err + } + + err = workflow.SetQueryHandler(ctx, "get-progress", func() (int, error) { + return progress, nil + }) + if err != nil { + return err + } + + // Workflow logic updates currentState and progress as it runs + currentState = "running" + for i := 0; i < 100; i++ { + progress = i + err := workflow.ExecuteActivity( + workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: time.Minute, + }), + ProcessItem, i, + ).Get(ctx, nil) + if err != nil { + currentState = "failed" + return err + } + } + currentState = "done" + return nil +} +``` + +## Updates + +```go +func OrderWorkflow(ctx workflow.Context) (int, error) { + var items []string + + err := workflow.SetUpdateHandlerWithOptions( + ctx, + "add-item", + func(ctx workflow.Context, item string) (int, error) { + // Handler can mutate workflow state and return a value + items = append(items, item) + return len(items), nil + }, + workflow.UpdateHandlerOptions{ + Validator: func(ctx workflow.Context, item string) error { + if item == "" { + return fmt.Errorf("item cannot be empty") + } + if len(items) >= 100 { + return fmt.Errorf("order is full") + } + return nil + }, + }, + ) + if err != nil { + return 0, err + } + + // Block until cancelled + _ = ctx.Done().Receive(ctx, nil) + return len(items), nil +} +``` + +**Important:** Validators must NOT mutate workflow state or do anything blocking (no activities, sleeps, or other commands). They are read-only, similar to query handlers. Return an error to reject the update; return `nil` to accept. 
+ +## Child Workflows + +```go +func ParentWorkflow(ctx workflow.Context, orders []Order) ([]string, error) { + cwo := workflow.ChildWorkflowOptions{ + WorkflowExecutionTimeout: 30 * time.Minute, + } + ctx = workflow.WithChildOptions(ctx, cwo) + + var results []string + for _, order := range orders { + var result string + err := workflow.ExecuteChildWorkflow(ctx, ProcessOrderWorkflow, order).Get(ctx, &result) + if err != nil { + return nil, err + } + results = append(results, result) + } + return results, nil +} +``` + +### Child Workflow Options + +```go +import enumspb "go.temporal.io/api/enums/v1" + +cwo := workflow.ChildWorkflowOptions{ + WorkflowID: fmt.Sprintf("child-%s", workflow.GetInfo(ctx).WorkflowExecution.ID), + + // ParentClosePolicy - what happens to child when parent closes + // PARENT_CLOSE_POLICY_TERMINATE (default), PARENT_CLOSE_POLICY_ABANDON, PARENT_CLOSE_POLICY_REQUEST_CANCEL + ParentClosePolicy: enumspb.PARENT_CLOSE_POLICY_ABANDON, + + WorkflowExecutionTimeout: 10 * time.Minute, + WorkflowTaskTimeout: time.Minute, +} +ctx = workflow.WithChildOptions(ctx, cwo) + +future := workflow.ExecuteChildWorkflow(ctx, ChildWorkflow, input) + +// Wait for child to start (important for ABANDON policy) +if err := future.GetChildWorkflowExecution().Get(ctx, nil); err != nil { + return err +} +``` + +## Handles to External Workflows + +```go +func CoordinatorWorkflow(ctx workflow.Context, targetWorkflowID string) error { + // Signal an external workflow + err := workflow.SignalExternalWorkflow(ctx, targetWorkflowID, "", "data-ready", payload).Get(ctx, nil) + if err != nil { + return err + } + + // Cancel an external workflow + err = workflow.RequestCancelExternalWorkflow(ctx, targetWorkflowID, "").Get(ctx, nil) + return err +} +``` + +## Parallel Execution + +Use `workflow.Go` to launch parallel work and `workflow.Selector` to collect results. 
+ +```go +func ParallelWorkflow(ctx workflow.Context, items []string) ([]string, error) { + actCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, + }) + + // Launch activities in parallel + futures := make([]workflow.Future, len(items)) + for i, item := range items { + futures[i] = workflow.ExecuteActivity(actCtx, ProcessItem, item) + } + + // Collect all results + results := make([]string, len(items)) + for i, future := range futures { + if err := future.Get(ctx, &results[i]); err != nil { + return nil, err + } + } + return results, nil +} +``` + +### Using workflow.Go for background goroutines + +```go +ch := workflow.NewChannel(ctx) + +workflow.Go(ctx, func(ctx workflow.Context) { + // Background work + var result string + _ = workflow.ExecuteActivity(actCtx, SomeActivity).Get(ctx, &result) + ch.Send(ctx, result) +}) + +var result string +ch.Receive(ctx, &result) +``` + +## Selector Pattern + +`workflow.Selector` replaces Go's native `select` -- required for deterministic workflow execution. Use it to wait on multiple channels, futures, and timers simultaneously. 
+ +```go +func ApprovalWorkflow(ctx workflow.Context) (string, error) { + actCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, + }) + + var outcome string + signalCh := workflow.GetSignalChannel(ctx, "approve") + actFuture := workflow.ExecuteActivity(actCtx, AutoReviewActivity) + + // Cancel timer if signal or activity wins + timerCtx, cancelTimer := workflow.WithCancel(ctx) + timer := workflow.NewTimer(timerCtx, 24*time.Hour) + + selector := workflow.NewSelector(ctx) + + // Branch 1: Signal received + selector.AddReceive(signalCh, func(c workflow.ReceiveChannel, more bool) { + var approved bool + c.Receive(ctx, &approved) + cancelTimer() + if approved { + outcome = "approved-by-signal" + } else { + outcome = "rejected-by-signal" + } + }) + + // Branch 2: Activity completed + selector.AddFuture(actFuture, func(f workflow.Future) { + var result string + _ = f.Get(ctx, &result) + cancelTimer() + outcome = result + }) + + // Branch 3: Timeout + selector.AddFuture(timer, func(f workflow.Future) { + if err := f.Get(ctx, nil); err == nil { + outcome = "timed-out" + } + // If timer was cancelled, err is CanceledError -- ignore + }) + + selector.Select(ctx) // Blocks until one branch fires + return outcome, nil +} +``` + +Key points: +- `AddReceive(channel, callback)` -- fires when a channel has a message (must consume with `c.Receive`) +- `AddFuture(future, callback)` -- fires when a future resolves (once per Selector) +- `AddDefault(callback)` -- fires immediately if nothing else is ready +- `Select(ctx)` -- blocks until one branch fires; call multiple times to process multiple events + +## Continue-as-New + +```go +func LongRunningWorkflow(ctx workflow.Context, state WorkflowState) (string, error) { + for { + state = processBatch(ctx, state) + + if state.IsComplete { + return "done", nil + } + + // Check if history is getting large + if workflow.GetInfo(ctx).GetContinueAsNewSuggested() { + return "", 
workflow.NewContinueAsNewError(ctx, LongRunningWorkflow, state) + } + } +} +``` + +Drain signals before continue-as-new to avoid signal loss: + +```go +for { + var signalVal string + ok := signalChan.ReceiveAsync(&signalVal) + if !ok { + break + } + // process signal +} +return "", workflow.NewContinueAsNewError(ctx, LongRunningWorkflow, state) +``` + +## Cancellation Handling + +Use `ctx.Done()` to detect cancellation and `workflow.NewDisconnectedContext` for cleanup that must run even after cancellation. + +```go +func MyWorkflow(ctx workflow.Context) error { + actCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: time.Hour, + }) + + err := workflow.ExecuteActivity(actCtx, LongRunningActivity).Get(ctx, nil) + if err != nil && temporal.IsCanceledError(ctx.Err()) { + // Workflow was cancelled -- run cleanup with a disconnected context + workflow.GetLogger(ctx).Info("Workflow cancelled, running cleanup") + disconnectedCtx, _ := workflow.NewDisconnectedContext(ctx) + disconnectedCtx = workflow.WithActivityOptions(disconnectedCtx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, + }) + _ = workflow.ExecuteActivity(disconnectedCtx, CleanupActivity).Get(disconnectedCtx, nil) + return err // Return CanceledError + } + return err +} +``` + +## Saga Pattern (Compensations) + +**Important:** Compensation activities should be idempotent -- they may be retried (as with ALL activities). + +Use `workflow.NewDisconnectedContext` when running compensations so they execute even if the workflow is cancelled. + +```go +func OrderWorkflow(ctx workflow.Context, order Order) (string, error) { + actCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, + }) + + var compensations []func(ctx workflow.Context) error + + // Helper to run all compensations in reverse, using a disconnected context + // so compensations run even if the workflow is cancelled. 
+ runCompensations := func() { + disconnectedCtx, _ := workflow.NewDisconnectedContext(ctx) + compCtx := workflow.WithActivityOptions(disconnectedCtx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, + }) + for i := len(compensations) - 1; i >= 0; i-- { + if err := compensations[i](compCtx); err != nil { + workflow.GetLogger(ctx).Error("Compensation failed", "error", err) + } + } + } + + // Register compensation BEFORE running the activity. + // If the activity completes the effect but fails on return, + // we still need the compensation. + compensations = append(compensations, func(ctx workflow.Context) error { + return workflow.ExecuteActivity(ctx, ReleaseInventoryIfReserved, order).Get(ctx, nil) + }) + if err := workflow.ExecuteActivity(actCtx, ReserveInventory, order).Get(ctx, nil); err != nil { + runCompensations() + return "", err + } + + compensations = append(compensations, func(ctx workflow.Context) error { + return workflow.ExecuteActivity(ctx, RefundPaymentIfCharged, order).Get(ctx, nil) + }) + if err := workflow.ExecuteActivity(actCtx, ChargePayment, order).Get(ctx, nil); err != nil { + runCompensations() + return "", err + } + + if err := workflow.ExecuteActivity(actCtx, ShipOrder, order).Get(ctx, nil); err != nil { + runCompensations() + return "", err + } + + return "Order completed", nil +} +``` + +## Wait Condition with Timeout + +```go +func ApprovalWorkflow(ctx workflow.Context) (string, error) { + approved := false + + // Set up signal handler + workflow.Go(ctx, func(ctx workflow.Context) { + workflow.GetSignalChannel(ctx, "approve").Receive(ctx, &approved) + }) + + // Wait with 24-hour timeout -- returns (conditionMet, error) + conditionMet, err := workflow.AwaitWithTimeout(ctx, 24*time.Hour, func() bool { + return approved + }) + if err != nil { + return "", err + } + + if conditionMet { + return "approved", nil + } + return "auto-rejected due to timeout", nil +} +``` + +Without timeout: + +```go +err := workflow.Await(ctx, 
func() bool { return ready }) +``` + +## Waiting for All Handlers to Finish + +Signal and update handlers may run activities asynchronously. Use `workflow.Await` with `workflow.AllHandlersFinished` before completing or continuing-as-new to prevent the workflow from closing while handlers are still running. + +```go +func MyWorkflow(ctx workflow.Context) (string, error) { + // ... register handlers, main workflow logic ... + + // Before exiting, wait for all handlers to finish + err := workflow.Await(ctx, func() bool { + return workflow.AllHandlersFinished(ctx) + }) + if err != nil { + return "", err + } + return "done", nil +} +``` + +## Activity Heartbeat Details + +### WHY: +- **Support activity cancellation** -- Cancellations are delivered via heartbeat; activities that don't heartbeat won't know they've been cancelled +- **Resume progress after worker failure** -- Heartbeat details persist across retries + +### WHEN: +- **Cancellable activities** -- Any activity that should respond to cancellation +- **Long-running activities** -- Track progress for resumability +- **Checkpointing** -- Save progress periodically + +```go +func ProcessLargeFile(ctx context.Context, filePath string) (string, error) { + // Recover from previous attempt + startIdx := 0 + if activity.HasHeartbeatDetails(ctx) { + if err := activity.GetHeartbeatDetails(ctx, &startIdx); err == nil { + startIdx++ // Resume from next item + } + } + + lines := readFileLines(filePath) + + for i := startIdx; i < len(lines); i++ { + processLine(lines[i]) + + // Heartbeat with progress -- if cancelled, ctx will be cancelled + activity.RecordHeartbeat(ctx, i) + + if ctx.Err() != nil { + // Activity was cancelled + cleanup() + return "", ctx.Err() + } + } + + return "completed", nil +} +``` + +## Timers + +```go +func TimerWorkflow(ctx workflow.Context) (string, error) { + // Simple sleep + err := workflow.Sleep(ctx, time.Hour) + if err != nil { + return "", err + } + + // Timer as a Future -- for use with 
Selector + timerCtx, cancelTimer := workflow.WithCancel(ctx) + timer := workflow.NewTimer(timerCtx, 30*time.Minute) + + // Cancel the timer when no longer needed + cancelTimer() + + return "Timer fired", nil +} +``` + +## Local Activities + +**Purpose**: Reduce latency for short, lightweight operations by skipping the task queue. ONLY use these when necessary for performance. Do NOT use these by default, as they are not durable and distributed. + +```go +func MyWorkflow(ctx workflow.Context) (string, error) { + lao := workflow.LocalActivityOptions{ + StartToCloseTimeout: 5 * time.Second, + } + ctx = workflow.WithLocalActivityOptions(ctx, lao) + + var result string + err := workflow.ExecuteLocalActivity(ctx, QuickLookup, "key").Get(ctx, &result) + if err != nil { + return "", err + } + return result, nil +} +``` diff --git a/references/go/testing.md b/references/go/testing.md new file mode 100644 index 0000000..ab74bbd --- /dev/null +++ b/references/go/testing.md @@ -0,0 +1,238 @@ +# Go SDK Testing + +## Overview + +The Go SDK provides the `testsuite` package for testing Workflows and Activities. It uses the [testify](https://github.com/stretchr/testify) library for assertions (`assert`/`require`) and mocking (`mock`). The test environment supports automatic time-skipping for Workflows with timers. + +## Test Environment Setup + +Two approaches: struct-based, embedding `testsuite.WorkflowTestSuite` in a testify `suite.Suite`, or function-based, calling `NewTestWorkflowEnvironment()` on a standalone `testsuite.WorkflowTestSuite`. 
+ +**Approach 1: Struct-based (testify suite)** + +```go +package sample + +import ( + "testing" + + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/suite" + + "go.temporal.io/sdk/testsuite" +) + +type UnitTestSuite struct { + suite.Suite + testsuite.WorkflowTestSuite + + env *testsuite.TestWorkflowEnvironment +} + +func (s *UnitTestSuite) SetupTest() { + s.env = s.NewTestWorkflowEnvironment() +} + +func (s *UnitTestSuite) AfterTest(suiteName, testName string) { + s.env.AssertExpectations(s.T()) +} + +func (s *UnitTestSuite) Test_MyWorkflow_Success() { + s.env.ExecuteWorkflow(MyWorkflow, "input") + + s.True(s.env.IsWorkflowCompleted()) + s.NoError(s.env.GetWorkflowError()) +} + +func TestUnitTestSuite(t *testing.T) { + suite.Run(t, new(UnitTestSuite)) +} +``` + +**Approach 2: Function-based** + +```go +package sample + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "go.temporal.io/sdk/testsuite" +) + +func Test_MyWorkflow(t *testing.T) { + testSuite := &testsuite.WorkflowTestSuite{} + env := testSuite.NewTestWorkflowEnvironment() + env.RegisterActivity(MyActivity) + + env.ExecuteWorkflow(MyWorkflow, "input") + assert.True(t, env.IsWorkflowCompleted()) + assert.NoError(t, env.GetWorkflowError()) + + var result string + assert.NoError(t, env.GetWorkflowResult(&result)) + assert.Equal(t, "expected", result) +} +``` + +You must register all Activity Definitions used by the Workflow with `env.RegisterActivity(ActivityFunc)`. The Workflow itself does not need to be registered. + +## Activity Mocking + +Mock activities with `env.OnActivity()` to test Workflow logic in isolation. 
+ +**Return mock values:** + +```go +env.OnActivity(MyActivity, mock.Anything, mock.Anything).Return("mock_result", nil) +``` + +**Return a function replacement** (for parameter validation or custom logic): + +```go +env.OnActivity(MyActivity, mock.Anything, mock.Anything).Return( + func(ctx context.Context, input string) (string, error) { + // Custom logic, assertions, etc. + return "computed_result", nil + }, +) +``` + +**Match specific arguments:** + +```go +env.OnActivity(MyActivity, mock.Anything, "specific_input").Return("result", nil) +``` + +When using mocks, you do not need to call `env.RegisterActivity()` for that Activity. The mock signature must match the original Activity function signature. + +## Testing Signals and Queries + +Use `RegisterDelayedCallback` to send Signals during Workflow execution. Use `QueryWorkflow` to test query handlers. + +```go +func (s *UnitTestSuite) Test_SignalsAndQueries() { + // Register a delayed callback to send a signal after 5 seconds + s.env.RegisterDelayedCallback(func() { + s.env.SignalWorkflow("approve", SignalData{Approved: true}) + }, time.Second*5) + + s.env.ExecuteWorkflow(ApprovalWorkflow, input) + + s.True(s.env.IsWorkflowCompleted()) + s.NoError(s.env.GetWorkflowError()) +} +``` + +**Query a running Workflow** (must be called inside `RegisterDelayedCallback` or after `ExecuteWorkflow`): + +```go +s.env.RegisterDelayedCallback(func() { + res, err := s.env.QueryWorkflow("getProgress") + s.NoError(err) + + var progress int + err = res.Get(&progress) + s.NoError(err) + s.Equal(50, progress) +}, time.Second*10+time.Millisecond) +``` + +`QueryWorkflow` returns a `converter.EncodedValue`. Use `.Get(&result)` to decode the value. + +For "Signal-With-Start" testing, set the delay to `0`. 
+ +## Testing Failure Cases + +```go +func (s *UnitTestSuite) Test_WorkflowFailure() { + // Mock activity to return an error + s.env.OnActivity(MyActivity, mock.Anything, mock.Anything).Return( + "", errors.New("activity failed")) + + s.env.ExecuteWorkflow(MyWorkflow, "input") + + s.True(s.env.IsWorkflowCompleted()) + + err := s.env.GetWorkflowError() + s.Error(err) + + var applicationErr *temporal.ApplicationError + s.True(errors.As(err, &applicationErr)) + s.Equal("activity failed", applicationErr.Error()) +} +``` + +`env.GetWorkflowError()` returns the Workflow error. Use `errors.As(err, &applicationErr)` to check the error type. Mock activities returning errors to test Workflow error-handling paths. + +## Replay Testing + +Use `worker.NewWorkflowReplayer()` to verify that code changes do not break determinism. Load history from a JSON file exported via the Temporal CLI or Web UI. + +```go +package sample + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "go.temporal.io/sdk/worker" +) + +func Test_ReplayFromFile(t *testing.T) { + replayer := worker.NewWorkflowReplayer() + replayer.RegisterWorkflow(MyWorkflow) + + err := replayer.ReplayWorkflowHistoryFromJSONFile(nil, "my_workflow_history.json") + assert.NoError(t, err) +} +``` + +Export history via CLI (substitute the Workflow ID you want to export): `temporal workflow show --workflow-id my-workflow-id --output json > my_workflow_history.json` + +**Replay from a programmatically fetched history:** + +```go +func Test_ReplayFromServer(t *testing.T) { + // Fetch history from the server + hist, err := GetWorkflowHistory(ctx, client, workflowID, runID) + assert.NoError(t, err) + + replayer := worker.NewWorkflowReplayer() + replayer.RegisterWorkflow(MyWorkflow) + + err = replayer.ReplayWorkflowHistory(nil, hist) + assert.NoError(t, err) +} +``` + +## Activity Testing + +Test Activities in isolation using `TestActivityEnvironment`. No Worker or Workflow needed. 
+ +```go +func Test_MyActivity(t *testing.T) { + testSuite := &testsuite.WorkflowTestSuite{} + env := testSuite.NewTestActivityEnvironment() + env.RegisterActivity(MyActivity) + + val, err := env.ExecuteActivity(MyActivity, "input") + assert.NoError(t, err) + + var result string + assert.NoError(t, val.Get(&result)) + assert.Equal(t, "expected_output", result) +} +``` + +`ExecuteActivity` returns `(converter.EncodedValue, error)`. Use `val.Get(&result)` to extract the typed result. The Activity executes synchronously in the calling goroutine. + +## Best Practices + +1. Register all Activities used by the Workflow with `env.RegisterActivity()`, unless you mock them with `env.OnActivity()` +2. Use mocks to isolate Workflow logic from Activity implementations +3. Test failure paths by mocking Activities that return errors +4. Use replay testing before deploying Workflow code changes to catch non-determinism errors +5. Use unique task queues per test when running integration tests +6. Call `env.AssertExpectations(s.T())` in `AfterTest` to verify all mocks were called diff --git a/references/go/versioning.md b/references/go/versioning.md new file mode 100644 index 0000000..b6b6c27 --- /dev/null +++ b/references/go/versioning.md @@ -0,0 +1,232 @@ +# Go SDK Versioning + +For conceptual overview and guidance on choosing an approach, see `references/core/versioning.md`. + +## GetVersion API + +`workflow.GetVersion` safely performs backwards-incompatible changes to Workflow Definitions. It returns the version to branch on, recording the result as a marker in the Event History. 
+ +```go +v := workflow.GetVersion(ctx, "changeID", workflow.DefaultVersion, maxSupported) +``` + +- `changeID`: unique string identifying the change +- `minSupported`: oldest version still supported (`workflow.DefaultVersion` is `-1`) +- `maxSupported`: current/newest version +- Returns `maxSupported` for new executions; returns the recorded version on replay + +### Three-Step Lifecycle + +**Step 1: Add GetVersion with both code paths** + +Original code calls `ActivityA`. You want to replace it with `ActivityC`: + +```go +v := workflow.GetVersion(ctx, "Step1", workflow.DefaultVersion, 1) +if v == workflow.DefaultVersion { + // Old code path (for replay of existing workflows) + err = workflow.ExecuteActivity(ctx, ActivityA, data).Get(ctx, &result1) +} else { + // New code path + err = workflow.ExecuteActivity(ctx, ActivityC, data).Get(ctx, &result1) +} +``` + +For new executions, `GetVersion` returns `1` and records a marker. For replay of pre-change workflows (no marker), it returns `DefaultVersion` (`-1`). + +**Step 2: Remove old branch (increase minSupported)** + +After all `DefaultVersion` Workflow Executions have completed: + +```go +v := workflow.GetVersion(ctx, "Step1", 1, 1) +// Only the new code path remains +err = workflow.ExecuteActivity(ctx, ActivityC, data).Get(ctx, &result1) +``` + +Keep the `GetVersion` call even with a single branch. This ensures: +1. If an older execution replays on this code, it fails fast instead of proceeding incorrectly +2. 
If you need further changes, you just bump `maxSupported` + +**Step 3: Further changes (bump maxSupported)** + +Later, replace `ActivityC` with `ActivityD`: + +```go +v := workflow.GetVersion(ctx, "Step1", 1, 2) +if v == 1 { + err = workflow.ExecuteActivity(ctx, ActivityC, data).Get(ctx, &result1) +} else { + err = workflow.ExecuteActivity(ctx, ActivityD, data).Get(ctx, &result1) +} +``` + +After all version-1 executions complete, collapse again: + +```go +_ = workflow.GetVersion(ctx, "Step1", 2, 2) +err = workflow.ExecuteActivity(ctx, ActivityD, data).Get(ctx, &result1) +``` + +### Using GetVersion in Loops + +The return value for a given `changeID` is immutable once recorded. In loops, append the iteration number to the `changeID`: + +```go +for i := 0; i < 10; i++ { + v := workflow.GetVersion(ctx, fmt.Sprintf("myChange-%d", i), workflow.DefaultVersion, 1) + if v == workflow.DefaultVersion { + // old path + } else { + // new path + } +} +``` + +## Workflow Type Versioning + +Create a new Workflow Type for incompatible changes: + +```go +// Original +func MyWorkflow(ctx workflow.Context, input Input) (string, error) { + // v1 implementation +} + +// New version +func MyWorkflowV2(ctx workflow.Context, input Input) (string, error) { + // v2 implementation +} +``` + +Register both with the Worker: + +```go +w := worker.New(c, "my-task-queue", worker.Options{}) +w.RegisterWorkflow(MyWorkflow) +w.RegisterWorkflow(MyWorkflowV2) +``` + +Route new executions to the new type. Old workflows continue on the old type. Check for open executions before removing the old type: + +```bash +temporal workflow list --query 'WorkflowType = "MyWorkflow" AND ExecutionStatus = "Running"' +``` + +## Worker Versioning + +Worker Versioning manages versions at the deployment level, allowing multiple Worker versions to run simultaneously. + +### Key Concepts + +**Worker Deployment**: A logical service grouping similar Workers together (e.g., "loan-processor"). 
All versions of your code live under this umbrella. + +**Worker Deployment Version**: A specific snapshot of your code identified by a deployment name and Build ID (e.g., "loan-processor:v1.0" or "loan-processor:abc123"). + +### Configuring Workers for Versioning + +```go +w := worker.New(c, "my-task-queue", worker.Options{ + DeploymentOptions: worker.DeploymentOptions{ + UseVersioning: true, + Version: worker.WorkerDeploymentVersion{ + DeploymentName: "my-service", + BuildId: "v1.0.0", // or git commit hash + }, + DefaultVersioningBehavior: workflow.VersioningBehaviorPinned, + }, +}) +``` + +**Configuration fields:** +- `UseVersioning`: enables Worker Versioning +- `Version`: identifies the Worker Deployment Version (deployment name + build ID) +- `DefaultVersioningBehavior`: `VersioningBehaviorPinned` or `VersioningBehaviorAutoUpgrade` +- Build ID: typically a git commit hash, version number, or timestamp + +### PINNED vs AUTO_UPGRADE Behaviors + +**PINNED Behavior** + +Workflows stay locked to their original Worker version. + +**When to use PINNED:** +- Short-running workflows (minutes to hours) +- Consistency is critical (e.g., financial transactions) +- You want to eliminate version compatibility complexity +- Building new applications and want simplest development experience + +**AUTO_UPGRADE Behavior** + +Workflows can move to newer versions. + +**When to use AUTO_UPGRADE:** +- Long-running workflows (weeks or months) +- Workflows need to benefit from bug fixes during execution +- Migrating from traditional rolling deployments +- You are already using GetVersion for version transitions + +**Important:** AUTO_UPGRADE workflows still need GetVersion to handle version transitions safely since they can move between Worker versions. 
+ +### Worker Configuration with Default Behavior + +```go +// For short-running workflows, prefer PINNED +w := worker.New(c, "orders-task-queue", worker.Options{ + DeploymentOptions: worker.DeploymentOptions{ + UseVersioning: true, + Version: worker.WorkerDeploymentVersion{ + DeploymentName: "order-service", + BuildId: os.Getenv("BUILD_ID"), + }, + DefaultVersioningBehavior: workflow.VersioningBehaviorPinned, + }, +}) +``` + +### Deployment Strategies + +**Blue-Green Deployments** + +Maintain two environments and switch traffic between them: +1. Deploy new code to idle environment +2. Run tests and validation +3. Switch traffic to new environment +4. Keep old environment for instant rollback + +**Rainbow Deployments** + +Multiple versions run simultaneously: +- New workflows use latest version +- Existing workflows complete on their original version +- Add new versions alongside existing ones +- Gradually sunset old versions as workflows complete + +This works well with Kubernetes where you manage multiple ReplicaSets running different Worker versions. + +Deploy a new version, then set it as current: + +```bash +temporal worker deployment set-current-version \ + --deployment-name my-service \ + --build-id v2.0.0 +``` + +### Querying Workflows by Worker Version + +```bash +# Find workflows on a specific Worker version +temporal workflow list --query \ + 'TemporalWorkerDeploymentVersion = "my-service:v1.0.0" AND ExecutionStatus = "Running"' +``` + +## Best Practices + +1. **Keep GetVersion calls** even when only a single branch remains -- it guards against stale replays and simplifies future changes +2. **Use `TemporalChangeVersion` search attribute** to find Workflows that recorded a given version of a change (values have the form `changeID-version`, e.g. `Step1-1`): + ```bash + temporal workflow list --query \ + 'WorkflowType = "MyWorkflow" AND ExecutionStatus = "Running" AND TemporalChangeVersion = "Step1-1"' + ``` +3. **Test with replay** before removing old branches to verify determinism is preserved +4. 
**Prefer Worker Versioning** for large-scale deployments to avoid accumulating patching branches diff --git a/references/python/patterns.md b/references/python/patterns.md index 00cd12a..6843985 100644 --- a/references/python/patterns.md +++ b/references/python/patterns.md @@ -106,6 +106,8 @@ class OrderWorkflow: raise ValueError("Order is full") ``` +**Important:** Validators must NOT mutate workflow state or do anything blocking (no activities, sleeps, or other commands). They are read-only, similar to query handlers. Raise an exception to reject the update; return `None` to accept. + ## Child Workflows ```python diff --git a/references/typescript/patterns.md b/references/typescript/patterns.md index 4b07947..3d59e23 100644 --- a/references/typescript/patterns.md +++ b/references/typescript/patterns.md @@ -132,6 +132,8 @@ export async function orderWorkflow(): Promise { } ``` +**Important:** Validators must NOT mutate workflow state or do anything blocking (no activities, sleeps, or other commands). They are read-only, similar to query handlers. Throw an error to reject the update; return normally to accept. + ## Child Workflows ```typescript From b9a0728b15496fdb1d00f95cd27055e38188cf78 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Thu, 19 Mar 2026 14:54:44 -0400 Subject: [PATCH 07/42] Setup CODEOWNERS to AI SDK team (#48) --- .github/CODEOWNERS | 1 + 1 file changed, 1 insertion(+) create mode 100644 .github/CODEOWNERS diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..872c89b --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @temporalio/ai-sdk From 29e46009c05dd213b1c8cfda51cc170d5a62c4cc Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Thu, 19 Mar 2026 16:33:47 -0400 Subject: [PATCH 08/42] Align version number in SKILL.md and plugin.json. 
(#49) --- SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SKILL.md b/SKILL.md index 6d9c888..1874d20 100644 --- a/SKILL.md +++ b/SKILL.md @@ -1,7 +1,7 @@ --- name: temporal-developer description: This skill should be used when the user asks to "create a Temporal workflow", "write a Temporal activity", "debug stuck workflow", "fix non-determinism error", "Temporal Python", "Temporal TypeScript", "Temporal Go", "Temporal Golang", "workflow replay", "activity timeout", "signal workflow", "query workflow", "worker not starting", "activity keeps retrying", "Temporal heartbeat", "continue-as-new", "child workflow", "saga pattern", "workflow versioning", "durable execution", "reliable distributed systems", or mentions Temporal SDK development. -version: 1.0.0 +version: 0.1.0 --- # Skill: temporal-developer From b5719bc1434d4d5a9bc6b3f2822da9d9142aff22 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Thu, 19 Mar 2026 17:36:15 -0400 Subject: [PATCH 09/42] PR Tracking Initial Release (#4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add initial skill for testing, which is simply Steve's skill (#1) * Add initial skill for testing, which is simply Steve's skill * Rename skill to 'temporal-dev' and update version Updated skill name and version for Temporal Python. * Use claude to merge Steve's, Max's, and Mason's skills. (#2) * Use claude to merge Steve's, Max's, and Mason's skills. 
Did a review pass using claude's skill devlopment skills * Add missing things from Steve * trigger tweaks * Add in common gotchas from Johann * add simple feedback mechanism (#3) * Change skill name to kebab-case, for compatibility with Amp and Cline (#7) * Clean up references/core/ai-integration.md * Clean up references/core/common-gotchas.md * Clean up references/core/common-gotchas.md * Clean up references/core/determinism.md * Clean up references/core/determinism.md * Update error-reference.md * Update interactive-workflows.md * Clean up patterns.md * Cut shell scripts * Edit troubleshooting.md * remove interceptors for now * remove dynamic workflows * clarify on heartbeating of async activity completions, and prompt it a bit in relation to signals * Improve references/python/advanced-features.md * Use explicit namespace in connect * remove duplicated content from determinism.md, clean up * Improve references/python/data-handling.md * Prefer start_to_close_timeout * don't explicitely provide defaults for retry policies * error-handling.md cleanup * move idempotency patterns to patterns.md * remove multi-param activities * small edits * Unify sandbox stuff into one file * local activities aren't experimental * Clean up references/python/sync-vs-async.md * Cleanup observability.md, remove duplicated search attributes * Cut otel for now * cut a lot of duplicate stuff from python gotchas, address comments * de-duplicate content * Lots of improvements to testing * cleanup to top level of skill (like CLI install instructions), and to top-level of python * Improve patterns.md * clean up ai-patterns.md * Update readme with installation instructions * remove ts directory * De-couple core from python and TypeScript as much as possible * Remove TypeScript hints * add prompting for feedback at startup - wait for ethan on slack channel * shorten url * Update slack channel * Automated pass over on python cleanup & deduplication * Remove multi-patching from Python, since its 
obvious, dont waste tokens on it. (#34) * Add TypeScript (#31) Adds initial support for TypeScript to the skill --------- Co-authored-by: James Watkins-Harvey Co-authored-by: Chris Olszewski * Fix typos and reference links (#36) * Fix typos and reference links * 2 more typo fixes * quick edit to readme (#37) * Fix saga compensations to run under cancellation protection (#43) When a workflow is cancelled mid-saga, compensations must run in a cancellation-protected scope, otherwise they are immediately cancelled before they can execute. - Python: wrap compensation loop in asyncio.shield() so it runs even when the workflow receives a CancelledError - TypeScript: wrap compensation loop in CancellationScope.nonCancellable() so it runs even when the root scope is cancelled (per official docs: "Cleanup logic must be in a nonCancellable scope") - TypeScript: also fix compensation registration order — register BEFORE calling the activity (was already correct in Python) Co-authored-by: Claude Sonnet 4.6 (1M context) * Update readme for public preview (#45) * a few more readme tweaks (#46) * Add MIT License to the project (#47) * Add Go (supersedes other PR) (#38) * progress on go * Go translation workflow completed. * missed a few spots * Manual edits * Address feedback * Add gotcha about anonymous local activities * Sample code for payload converter * clarify sdk protection mechanisms * Setup CODEOWNERS to AI SDK team (#48) * Align version number in SKILL.md and plugin.json. 
(#49) --------- Co-authored-by: James Watkins-Harvey Co-authored-by: Chris Olszewski Co-authored-by: Claude Sonnet 4.6 (1M context) --- .github/CODEOWNERS | 1 + LICENSE | 21 + README.md | 43 +- SKILL.md | 132 +++++ references/core/ai-patterns.md | 166 ++++++ references/core/determinism.md | 118 ++++ references/core/dev-management.md | 26 + references/core/error-reference.md | 32 ++ references/core/gotchas.md | 196 +++++++ references/core/interactive-workflows.md | 49 ++ references/core/patterns.md | 443 +++++++++++++++ references/core/troubleshooting.md | 323 +++++++++++ references/core/versioning.md | 174 ++++++ references/go/advanced-features.md | 187 ++++++ references/go/data-handling.md | 262 +++++++++ references/go/determinism-protection.md | 98 ++++ references/go/determinism.md | 52 ++ references/go/error-handling.md | 184 ++++++ references/go/go.md | 242 ++++++++ references/go/gotchas.md | 290 ++++++++++ references/go/observability.md | 153 +++++ references/go/patterns.md | 536 ++++++++++++++++++ references/go/testing.md | 238 ++++++++ references/go/versioning.md | 232 ++++++++ references/python/advanced-features.md | 166 ++++++ references/python/ai-patterns.md | 334 +++++++++++ references/python/data-handling.md | 230 ++++++++ references/python/determinism-protection.md | 233 ++++++++ references/python/determinism.md | 51 ++ references/python/error-handling.md | 138 +++++ references/python/gotchas.md | 280 +++++++++ references/python/observability.md | 105 ++++ references/python/patterns.md | 395 +++++++++++++ references/python/python.md | 175 ++++++ references/python/sync-vs-async.md | 231 ++++++++ references/python/testing.md | 165 ++++++ references/python/versioning.md | 314 ++++++++++ references/typescript/advanced-features.md | 150 +++++ references/typescript/data-handling.md | 253 +++++++++ .../typescript/determinism-protection.md | 56 ++ references/typescript/determinism.md | 51 ++ references/typescript/error-handling.md | 119 ++++ 
references/typescript/gotchas.md | 312 ++++++++++ references/typescript/observability.md | 109 ++++ references/typescript/patterns.md | 417 ++++++++++++++ references/typescript/testing.md | 222 ++++++++ references/typescript/typescript.md | 172 ++++++ references/typescript/versioning.md | 211 +++++++ 48 files changed, 9086 insertions(+), 1 deletion(-) create mode 100644 .github/CODEOWNERS create mode 100644 LICENSE create mode 100644 SKILL.md create mode 100644 references/core/ai-patterns.md create mode 100644 references/core/determinism.md create mode 100644 references/core/dev-management.md create mode 100644 references/core/error-reference.md create mode 100644 references/core/gotchas.md create mode 100644 references/core/interactive-workflows.md create mode 100644 references/core/patterns.md create mode 100644 references/core/troubleshooting.md create mode 100644 references/core/versioning.md create mode 100644 references/go/advanced-features.md create mode 100644 references/go/data-handling.md create mode 100644 references/go/determinism-protection.md create mode 100644 references/go/determinism.md create mode 100644 references/go/error-handling.md create mode 100644 references/go/go.md create mode 100644 references/go/gotchas.md create mode 100644 references/go/observability.md create mode 100644 references/go/patterns.md create mode 100644 references/go/testing.md create mode 100644 references/go/versioning.md create mode 100644 references/python/advanced-features.md create mode 100644 references/python/ai-patterns.md create mode 100644 references/python/data-handling.md create mode 100644 references/python/determinism-protection.md create mode 100644 references/python/determinism.md create mode 100644 references/python/error-handling.md create mode 100644 references/python/gotchas.md create mode 100644 references/python/observability.md create mode 100644 references/python/patterns.md create mode 100644 references/python/python.md create mode 100644 
references/python/sync-vs-async.md create mode 100644 references/python/testing.md create mode 100644 references/python/versioning.md create mode 100644 references/typescript/advanced-features.md create mode 100644 references/typescript/data-handling.md create mode 100644 references/typescript/determinism-protection.md create mode 100644 references/typescript/determinism.md create mode 100644 references/typescript/error-handling.md create mode 100644 references/typescript/gotchas.md create mode 100644 references/typescript/observability.md create mode 100644 references/typescript/patterns.md create mode 100644 references/typescript/testing.md create mode 100644 references/typescript/typescript.md create mode 100644 references/typescript/versioning.md diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..872c89b --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @temporalio/ai-sdk diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7092ef5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Temporal Technologies Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 922884a..6ba88db 100644 --- a/README.md +++ b/README.md @@ -1 +1,42 @@ -# skill-temporal-dev +# Temporal Development Skill + +A comprehensive skill for developers to use when building [Temporal](https://temporal.io/) applications. + +> [!WARNING] +> This Skill is currently in Public Preview, and will continue to evolve and improve. +> We would love to hear your feedback - positive or negative - over in the [Community Slack](https://t.mp/slack), in the [#topic-ai channel](https://temporalio.slack.com/archives/C0818FQPYKY) + +## Installation + +### As a Claude Code Plugin + +This skill is housed within a [Claude Code plugin](https://github.com/temporalio/agent-skills), which provides a simple way to install and receive future updates to the skill. + +1. Run `/plugin marketplace add temporalio/agent-skills` +2. Run `/plugin` to open the plugin manager +3. Select **Marketplaces** +4. Choose `temporal-marketplace` from the list +5. Select **Enable auto-update** or **Disable auto-update** +6. Run `/plugin install temporal-developer@temporalio-agent-skills` +7. Restart Claude Code + +### Via `npx skills` - supports all major coding agents + +1. `npx skills add https://github.com/temporalio/skill-temporal-developer` +2. Follow prompts + +### Via manually cloning the skill repo: + +1. `mkdir -p ~/.claude/skills && git clone https://github.com/temporalio/skill-temporal-developer ~/.claude/skills/temporal-developer` + +Appropriately adjust the installation directory based on your coding agent.
+ +## Currently Supported Temporal SDK Languages + +- [x] Python ✅ +- [x] TypeScript ✅ +- [x] Go ✅ +- [ ] Java 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/42)) +- [ ] .NET 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/39)) +- [ ] Ruby 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/41)) +- [ ] PHP 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/40)) diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..1874d20 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,132 @@ +--- +name: temporal-developer +description: This skill should be used when the user asks to "create a Temporal workflow", "write a Temporal activity", "debug stuck workflow", "fix non-determinism error", "Temporal Python", "Temporal TypeScript", "Temporal Go", "Temporal Golang", "workflow replay", "activity timeout", "signal workflow", "query workflow", "worker not starting", "activity keeps retrying", "Temporal heartbeat", "continue-as-new", "child workflow", "saga pattern", "workflow versioning", "durable execution", "reliable distributed systems", or mentions Temporal SDK development. +version: 0.1.0 +--- + +# Skill: temporal-developer + +## Overview + +Temporal is a durable execution platform that makes workflows survive failures automatically. This skill provides guidance for building Temporal applications in Python, TypeScript, and Go.
+ +## Core Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Temporal Cluster │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌────────────────┐ │ +│ │ Event History │ │ Task Queues │ │ Visibility │ │ +│ │ (Durable Log) │ │ (Work Router) │ │ (Search) │ │ +│ └─────────────────┘ └─────────────────┘ └────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ + ▲ + │ Poll / Complete + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Worker │ +│ ┌─────────────────────────┐ ┌──────────────────────────────┐ │ +│ │ Workflow Definitions │ │ Activity Implementations │ │ +│ │ (Deterministic) │ │ (Non-deterministic OK) │ │ +│ └─────────────────────────┘ └──────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +**Components:** +- **Workflows** - Durable, deterministic functions that orchestrate activities +- **Activities** - Non-deterministic operations (API calls, I/O) that can fail and retry +- **Workers** - Long-running processes that poll task queues and execute code +- **Task Queues** - Named queues connecting clients to workers + +## History Replay: Why Determinism Matters + +Temporal achieves durability through **history replay**: + +1. **Initial Execution** - Worker runs workflow, generates Commands, stored as Events in history +2. **Recovery** - On restart/failure, Worker re-executes workflow from beginning +3. **Matching** - SDK compares generated Commands against stored Events +4. 
**Restoration** - Uses stored Activity results instead of re-executing + +**If Commands don't match Events = Non-determinism Error = Workflow blocked** + +| Workflow Code | Command | Event | +|--------------|---------|-------| +| Execute activity | `ScheduleActivityTask` | `ActivityTaskScheduled` | +| Sleep/timer | `StartTimer` | `TimerStarted` | +| Child workflow | `StartChildWorkflowExecution` | `ChildWorkflowExecutionStarted` | + +See `references/core/determinism.md` for detailed explanation. + +## Getting Started + +### Ensure Temporal CLI is installed + +Check if `temporal` CLI is installed. If not, follow these instructions: + +#### macOS + +``` +brew install temporal +``` + +#### Linux + +Check your machine's architecture and download the appropriate archive: + +- [Linux amd64](https://temporal.download/cli/archive/latest?platform=linux&arch=amd64) +- [Linux arm64](https://temporal.download/cli/archive/latest?platform=linux&arch=arm64) + +Once you've downloaded the file, extract the downloaded archive and add the temporal binary to your PATH by copying it to a directory like /usr/local/bin + +#### Windows + +Check your machine's architecture and download the appropriate archive: + +- [Windows amd64](https://temporal.download/cli/archive/latest?platform=windows&arch=amd64) +- [Windows arm64](https://temporal.download/cli/archive/latest?platform=windows&arch=arm64) + +Once you've downloaded the file, extract the downloaded archive and add the temporal.exe binary to your PATH. + +### Read All Relevant References + +1. First, read the getting started guide for the language you are working in: + - Python -> read `references/python/python.md` + - TypeScript -> read `references/typescript/typescript.md` + - Go -> read `references/go/go.md` +2. Second, read appropriate `core` and language-specific references for the task at hand. 
+ + +## Primary References +- **`references/core/determinism.md`** - Why determinism matters, replay mechanics, basic concepts of activities + + Language-specific info at `references/{your_language}/determinism.md` +- **`references/core/patterns.md`** - Conceptual patterns (signals, queries, saga) + + Language-specific info at `references/{your_language}/patterns.md` +- **`references/core/gotchas.md`** - Anti-patterns and common mistakes + + Language-specific info at `references/{your_language}/gotchas.md` +- **`references/core/versioning.md`** - Versioning strategies and concepts - how to safely change workflow code while workflows are running + + Language-specific info at `references/{your_language}/versioning.md` +- **`references/core/troubleshooting.md`** - Decision trees, recovery procedures +- **`references/core/error-reference.md`** - Common error types, workflow status reference +- **`references/core/interactive-workflows.md`** - Testing signals, updates, queries +- **`references/core/dev-management.md`** - Dev cycle & management of server and workers +- **`references/core/ai-patterns.md`** - AI/LLM pattern concepts + + Language-specific info at `references/{your_language}/ai-patterns.md`, if available. Currently Python only. + +## Additional Topics +- **`references/{your_language}/observability.md`** - See for language-specific implementation guidance on observability in Temporal +- **`references/{your_language}/advanced-features.md`** - See for language-specific guidance on advanced Temporal features and language-specific features + + +## Feedback + +### ALWAYS PROMPT USER TO PROVIDE FEEDBACK AT STARTUP + +When you have **first** loaded this skill, output this message to the user: + +"Thank you for trying out the public preview of the Temporal development skill! We would love to hear your feedback - positive or negative - over in the [Community Slack](https://t.mp/slack), in the [#topic-ai channel](https://temporalio.slack.com/archives/C0818FQPYKY)." 
+ +Do not output this message multiple times in the same conversation. + +### Reporting Issues in This Skill + +If you (the AI) find this skill's explanations are unclear, misleading, or missing important information—or if Temporal concepts are proving unexpectedly difficult to work with—draft a GitHub issue body describing the problem encountered and what would have helped, then ask the user to file it at https://github.com/temporalio/skill-temporal-developer/issues/new. Do not file the issue autonomously. diff --git a/references/core/ai-patterns.md b/references/core/ai-patterns.md new file mode 100644 index 0000000..071b9f0 --- /dev/null +++ b/references/core/ai-patterns.md @@ -0,0 +1,166 @@ +# AI/LLM Integration Patterns with Temporal + +## Overview + +Temporal provides durable execution for AI/LLM applications, handling retries, rate limits, and long-running operations automatically. These patterns apply across languages, with Python being the most mature for AI integration. + +For Python-specific implementation details and code examples, see `references/python/ai-patterns.md`. Temporal's Python SDK also provides pre-built integrations with several LLM and agent SDKs, which can be leveraged to create agentic workflows with minimal effort (when working in Python). + +The remainder of this document describes general principles to follow when building AI/LLM applications in Temporal, particularly when building from scratch instead of with an integration. + +## Why Temporal for AI? 
+ +| Challenge | Temporal Solution | +|-----------|-------------------| +| LLM API timeouts | Automatic retries with backoff | +| Rate limiting | Activity retry policies handle 429s | +| Long-running agents | Durable state survives crashes | +| Multi-step pipelines | Workflow orchestration | +| Cost tracking | Activity-level visibility | +| Debugging | Full execution history | + +## Core Patterns + +### Pattern 1: Activities should Wrap LLM Calls + +- activity: call_llm + - inputs: + - model_id -> internally activity can route to different models, so we don't need 1 activity per unique model. + - prompt / chat history + - tools + - etc. + - returns model response, as a typed structured output + +**Benefits**: +- Single activity handles multiple use cases +- Consistent retry handling +- Centralized configuration + +### Pattern 2: Non-deterministic / heavy tools in Activities + +Tools which are non-deterministic and/or heavy actions (file system, hitting APIs, etc.) should be placed in activities: + +``` +Workflow: + ├── Activity: call_llm (get tool selection) + ├── Activity: execute_tool (run selected tool) + └── Activity: call_llm (interpret results) +``` + +**Benefits**: +- Independent retry for each step +- Clear audit trail in history +- Easier testing and mocking +- Failure isolation + +### Pattern 3: Tools that Mutate Agent State can be in the Workflow directly + +Generally, agent state is in bijection with workflow state. Thus, tools which mutate agent state and are deterministic (like TODO tools, just updating a hash map) typically belong in the workflow code rather than an activity. + +``` +Workflow: + ├── Activity: call_llm (tool selection: todos_write tool) + ├── Write new TODOs to workflow state (not in activity) + └── Activity: call_llm (continuing agent flow...) +``` + +### Pattern 4: Centralized Retry Management + +Disable retries in LLM client libraries, let Temporal handle retries. 
+ +- LLM Client Config: + - max_retries = 0 ← Disable client retries at the LLM client level + +Use either the default activity retry policy, or customize it as needed for the situation. + +**Why**: +- Temporal retries are durable (survive crashes) +- Single retry configuration point +- Better visibility into retry attempts +- Consistent backoff behavior + + +### Pattern 5: Multi-Agent Orchestration + +Complex pipelines with multiple specialized agents: + +``` +Deep Research Example: + │ + ├── Planning Agent (Activity) + │ └── Output: subtopics to research + │ + ├── Query Generation Agent (Activity) + │ └── Output: search queries per subtopic + │ + ├── Parallel Web Search (Multiple Activities) + │ └── Output: search results (resilient to partial failures) + │ + └── Synthesis Agent (Activity) + └── Output: final report +``` + +**Key Pattern**: Use parallel execution with `return_exceptions=True` to continue with partial results when some searches fail. + +## Approximate Timeout Recommendations + +| Operation Type | Recommended Timeout | +|----------------|---------------------| +| Simple LLM calls (GPT-4, Claude-3) | 30 seconds | +| Reasoning models (o1, o3, extended thinking) | 300 seconds (5 min) | +| Web searches | 300 seconds (5 min) | +| Simple tool execution | 30-60 seconds | +| Image generation | 120 seconds | +| Document processing | 60-120 seconds | + +**Rationale**: +- Reasoning models need time for complex computation +- Web searches may hit rate limits requiring backoff +- Fast timeouts catch stuck operations +- Longer timeouts prevent premature failures for expensive operations + +## Rate Limit Handling + +### From HTTP Headers + +Parse rate limit info from API responses: + +- Response Headers: + - Retry-After: 30 + - X-RateLimit-Remaining: 0 + +- Activity: + - If rate limited: + - Raise retryable error with a next retry delay + - Temporal handles the delay + +## Error Handling + +### Retryable Errors +- Rate limits (429) +- Timeouts +- Temporary server 
errors (500, 502, 503) +- Network errors + +### Non-Retryable Errors +- Invalid API key (401) +- Invalid input/prompt +- Content policy violations +- Model not found + +## Best Practices + +1. **Disable client retries** - Let Temporal handle all retries +2. **Set appropriate timeouts** - Based on operation type +3. **Separate activities** - One per logical operation +4. **Use structured outputs** - For type safety and validation +5. **Handle partial failures** - Continue with available results +6. **Monitor costs** - Track LLM calls at activity level +7. **Test with mocks** - Mock LLM responses in tests + +## Observability + +See `references/{your_language}/observability.md` for the language you are working in for documentation on implementing observability in Temporal. It is generally recommended to add observability for: +- Token usage, via activity logging +- Anything else that helps track LLM usage and debug agentic flows, in moderation. + diff --git a/references/core/determinism.md b/references/core/determinism.md new file mode 100644 index 0000000..af824d2 --- /dev/null +++ b/references/core/determinism.md @@ -0,0 +1,118 @@ +# Determinism in Temporal Workflows + +This document provides a conceptual-level overview of determinism in Temporal. Additional language-specific determinism information is available at `references/{your_language}/determinism.md`. + +## Overview + +Temporal workflows must be deterministic because of **history replay** - the mechanism that enables durable execution. + +## Why Determinism Matters + +### The Replay Mechanism + +When a Worker needs to restore workflow state (after crash, cache eviction, or continuing after a long timer), it **re-executes the workflow code from the beginning**. But instead of re-running external actions, it uses results stored in the Event History.
+ +``` +Initial Execution: + Code runs → Generates Commands → Server stores as Events + +Replay (Recovery): + Code runs again → Generates Commands → SDK compares to Events + If match: Use stored results, continue + If mismatch: NondeterminismError! +``` + +### Commands and Events + +Every workflow operation generates a Command that becomes an Event, here are some examples: + +| Workflow Code | Command Generated | Event Stored | +|--------------|-------------------|--------------| +| Execute activity | `ScheduleActivityTask` | `ActivityTaskScheduled` | +| Sleep/timer | `StartTimer` | `TimerStarted` | +| Child workflow | `StartChildWorkflowExecution` | `ChildWorkflowExecutionStarted` | +| Complete workflow | `CompleteWorkflowExecution` | `WorkflowExecutionCompleted` | + +### Non-Determinism Example + +``` +First Run (11:59 AM): + if datetime.now().hour < 12: → True + execute_activity(morning_task) → Command: ScheduleActivityTask("morning_task") + +Replay (12:01 PM): + if datetime.now().hour < 12: → False + execute_activity(afternoon_task) → Command: ScheduleActivityTask("afternoon_task") + +Result: Commands don't match history → NondeterminismError +``` + +## Sources of Non-Determinism + +### Time-Based Operations +- `datetime.now()`, `time.time()`, `Date.now()` +- Different value on each execution + +### Random Values +- `random.random()`, `Math.random()`, `uuid.uuid4()` +- Different value on each execution + +### External State +- Reading files, environment variables, databases, networking / HTTP calls +- State may change between executions + +### Non-Deterministic Iteration +- Map/dict iteration order (in some languages) +- Set iteration order + +### Threading/Concurrency +- Race conditions produce different outcomes +- Non-deterministic ordering + +## **Central Concept**: Place Non-Determinism within Activities + +In Temporal, activities are the primary mechanism for making non-deterministic code durable and persisted in workflow history. 
Generally speaking, you should place sources of non-determinism in activities, which provides durability and recording of results, as well as automated retries and more. See `references/{your_language}/{your_language}.md` for the language you are working in for how to do this in practice. + +For a few simple cases, like timestamps, random values, UUIDs, etc., the Temporal SDK in your language may provide durable variants that are simple to use. See `references/{your_language}/determinism.md` for the language you are working in for more info. + +## SDK Protection Mechanisms +Each Temporal SDK language provides a protection mechanism to make it easier to catch non-determinism errors earlier in development: + +- Python: The Python SDK runs workflows in a sandbox that intercepts and aborts non-deterministic calls early at runtime. +- TypeScript: The TypeScript SDK runs workflows in an isolated V8 sandbox, intercepting many common sources of non-determinism and replacing them automatically with deterministic variants. +- Go: The Go SDK has no runtime sandbox. Therefore, non-determinism bugs will never be immediately apparent, and are usually only observable during replay. The optional `workflowcheck` static analysis tool can be used to check for many sources of non-determinism at compile time. + +Regardless of which SDK you are using, it is your responsibility to ensure that workflow code does not contain sources of non-determinism. Use SDK-specific tools as well as replay tests for doing so. + +## Detecting Non-Determinism + +### During Execution +- `NondeterminismError` raised when Commands don't match Events +- Workflow becomes blocked until code is fixed + +### Testing with Replay + +Replay tests verify that workflows follow identical code paths when re-run, by attempting to replay recorded executions. See the replay testing section of `references/{your_language}/testing.md` for information on how to write these tests.
+ +## Recovery from Non-Determinism + +### Accidental Change +If you accidentally introduced non-determinism: +1. Revert code to match what's in history +2. Restart worker +3. Workflow auto-recovers + +### Intentional Change +If you need to change workflow logic: +1. Use the **Patching API** to support both old and new code paths +2. Or terminate old workflows and start new ones with updated code + +See `versioning.md` for patching details. + +## Best Practices + +1. **Use SDK-provided alternatives** for time, random, UUID +2. **Move I/O to activities** - workflows should only orchestrate +3. **Test with replay** before deploying workflow changes +4. **Use patching** for intentional changes to running workflows +5. **Keep workflows focused** - complex logic increases non-determinism risk diff --git a/references/core/dev-management.md b/references/core/dev-management.md new file mode 100644 index 0000000..01faed0 --- /dev/null +++ b/references/core/dev-management.md @@ -0,0 +1,26 @@ +# Development Server and Worker Management + +## Server Management + +Before starting workers or workflows, you MUST start a local dev server, using the Temporal CLI: + +```bash +temporal server start-dev # Start this in the background. +``` + +It is perfectly OK for this process to be shared across multiple projects / left running as you develop your Temporal code. + +## Worker Management Details + +### Starting Workers + +How you start a worker is project-dependent, but generally Temporal code should have a program entrypoint which starts a worker. If your project doesn't, you should define it. + +When you need a new worker, you should start it in the background (and preferably have it log somewhere you can check), and then remember its PID so you can kill / clean it up later. + +**Best practice**: As far as local development goes, run only ONE worker instance with the latest code. Don't keep stale workers (running old code) around.
+ + +### Cleanup + +**Always kill workers when done.** Don't leave workers running. diff --git a/references/core/error-reference.md b/references/core/error-reference.md new file mode 100644 index 0000000..a0f905b --- /dev/null +++ b/references/core/error-reference.md @@ -0,0 +1,32 @@ +# Common Error Types Reference + +| Error Type | Error identifier (if any) | Where to Find | What Happened | Recovery | Link to additional info (if any) +|------------|---------------|---------------|---------------|----------|----------| +| **Non-determinism** | TMPRL1100 | `WorkflowTaskFailed` in history | Replay doesn't match history | Analyze error first. **If accidental**: fix code to match history → restart worker. **If intentional v2 change**: terminate → start fresh workflow. | https://github.com/temporalio/rules/blob/main/rules/TMPRL1100.md | +| **Deadlock** | TMPRL1101 | `WorkflowTaskFailed` in history, worker logs | Workflow blocked too long (deadlock detected) | Remove blocking operations from workflow code (no I/O, no sleep, no threading locks). Use Temporal primitives instead. | https://github.com/temporalio/rules/blob/main/rules/TMPRL1101.md | +| **Unfinished handlers** | TMPRL1102 | `WorkflowTaskFailed` in history | Workflow completed while update/signal handlers still running | Ensure all handlers complete before workflow finishes. Use `workflow.wait_condition()` to wait for handler completion. | https://github.com/temporalio/rules/blob/main/rules/TMPRL1102.md | +| **Payload overflow** | TMPRL1103 | `WorkflowTaskFailed` or `ActivityTaskFailed` in history | Payload size limit exceeded (default 2MB) | Reduce payload size. Use external storage (S3, database) for large data and pass references instead. 
| https://github.com/temporalio/rules/blob/main/rules/TMPRL1103.md | +| **Workflow code bug** | | `WorkflowTaskFailed` in history | Bug in workflow logic | Fix code → Restart worker → Workflow auto-resumes | | +| **Missing workflow** | | Worker logs | Workflow not registered | Add to worker.py → Restart worker | | +| **Missing activity** | | Worker logs | Activity not registered | Add to worker.py → Restart worker | | +| **Activity bug** | | `ActivityTaskFailed` in history | Bug in activity code | Fix code → Restart worker → Auto-retries | | +| **Activity retries** | | `ActivityTaskFailed` (count >2) | Repeated failures | Fix code → Restart worker → Auto-retries | | +| **Sandbox violation** | | Worker logs | Bad imports in workflow | Fix workflow.py imports → Restart worker | | +| **Task queue mismatch** | | Workflow never starts | Different queues in starter/worker | Align task queue names | | +| **Timeout** | | Status = TIMED_OUT | Operation too slow | Increase timeout config | | + +## Workflow Status Reference + +| Status | Meaning | Action | +|--------|---------|--------| +| `RUNNING` | Workflow in progress | Wait, or check if stalled | +| `COMPLETED` | Successfully finished | Get result, verify correctness | +| `FAILED` | Error during execution | Analyze error | +| `CANCELED` | Explicitly canceled | Review reason | +| `TERMINATED` | Force-stopped | Review reason | +| `TIMED_OUT` | Exceeded timeout | Increase timeout | + +## See Also + +- [Common Gotchas](gotchas.md) - Anti-patterns that cause these errors +- [Troubleshooting](troubleshooting.md) - Decision trees for diagnosing issues diff --git a/references/core/gotchas.md b/references/core/gotchas.md new file mode 100644 index 0000000..55b6ddb --- /dev/null +++ b/references/core/gotchas.md @@ -0,0 +1,196 @@ +# Common Temporal Gotchas + +Common mistakes and anti-patterns in Temporal development. Learning from these saves significant debugging time. 
+ +This document provides a general overview of conceptual-level gotchas in Temporal. The exact form that these take and symptoms can vary by SDK language. See `references/{your_language}/gotchas.md` for language-specific info on common mistakes. + +## Non-Idempotent Activities + +**The Problem**: Activities may execute more than once due to retries or Worker failures. If an activity calls an external service without an idempotency key, you may charge a customer twice, send duplicate emails, or create duplicate records. + +**Symptoms**: +- Duplicate side effects (double charges, duplicate notifications) +- Data inconsistencies after retries + +**The Fix**: Always use idempotency keys when calling external services. Use the workflow ID, activity ID, or a domain-specific identifier (like order ID) as the key. + +**Note:** Local Activities skip the task queue for lower latency, but they're still subject to retries. The same idempotency rules apply. + +## Side Effects & Non-Determinism in Workflow Code + +**The Problem**: Code in workflow functions runs on first execution AND on every replay. Any side effect (logging, notifications, metrics, etc.) will happen multiple times and non-deterministic code (IO, current time, random numbers, threading, etc.) won't replay correctly. + +**Symptoms**: +- Non-determinism errors +- Sandbox violations, depending on SDK language +- Duplicate log entries +- Multiple notifications for the same event +- Inflated metrics + +**The Fix**: +- Use Temporal replay-aware managed side effects for common, non-business logic cases: + - Temporal workflow logging + - Temporal date time + - Temporal UUID generation + - Temporal random number generation +- Put all other side effects in Activities + +See `references/core/determinism.md` for more info. + +## Multiple Workers with Different Code + +**The Problem**: If Worker A runs part of a workflow with code v1, then Worker B (with code v2) picks it up, replay may produce different Commands. 
+ +**Symptoms**: +- Non-determinism errors after deploying new code +- Errors mentioning "command mismatch" or "unexpected command" + +**The Fix**: +- Use Worker Versioning for production deployments +- Use patching APIs +- During development: kill old workers before starting new ones +- Ensure all workers run identical code + +**Note:** Workflows started with old code continue running after you change the code, which can then induce the above issues. During development (NOT production), you may want to terminate stale workflows (`temporal workflow terminate --workflow-id `). + +See `references/core/versioning.md` for more info. + +## Failing Activities Too Quickly + +**The Problem**: Using aggressive activity retry policies that give up too easily. + +**Symptoms**: +- Workflows failing on transient errors +- Unnecessary workflow failures during brief outages + +**The Fix**: Use appropriate activity retry policies. Let Temporal handle transient failures with exponential backoff. Reserve `maximum_attempts=1` for truly non-retryable operations. + +## Query Handler & Update Validator Mistakes + +### Modifying State in Queries & Update Validators + +**The Problem**: Queries and update validators are read-only. Modifying state causes non-determinism on replay, and must strictly be avoided. + +**Symptoms**: +- State inconsistencies after workflow replay +- Non-determinism errors + +**The Fix**: Queries and update validators must only read state. Use Updates for operations that need to modify state AND return a result. + +### Blocking in Queries & Update Validators + +**The Problem**: Queries and update validators must return immediately. They cannot await activities, child workflows, timers, or conditions. + +**Symptoms**: +- Query / update validator timeouts +- Deadlocks + +**The Fix**: Queries and update validators must only look at current state. Use Signals or Updates to trigger async operations. + +### Query vs Signal vs Update + +| Operation | Modifies State?
| Returns Result? | Can Block? | Use For | +|-----------|-----------------|-----------------|------------|---------| +| **Query** | No | Yes | No | Read current state | +| **Signal** | Yes | No | Yes | Fire-and-forget mutations | +| **Update** | Yes | Yes | Yes | Mutations needing results | + +**Key rule**: Query to peek, Signal to push, Update to pop. + +## File Organization Issues + +Each SDK has specific requirements for how workflow and activity code should be organized. Mixing them incorrectly causes sandbox issues, bundling problems, or performance degradation. + +See language-specific gotchas for details. + +## Testing Mistakes + +### Only Testing Happy Paths + +**The Problem**: Not testing what happens when things go wrong. + +**Questions to answer**: +- What happens when an Activity exhausts all retries? +- What happens when a workflow is cancelled mid-execution? +- What happens during a Worker restart? + +**The Fix**: Test failure scenarios explicitly. Mock activities to fail, test cancellation handling, use replay testing. + +### Not Testing Replay Compatibility + +**The Problem**: Changing workflow code without verifying existing workflows can still replay. + +**Symptoms**: +- Non-determinism errors after deployment +- Stuck workflows that can't make progress + +**The Fix**: Use replay testing against saved histories from production or staging. + +## Error Handling Mistakes + +### Swallowing Errors + +**The Problem**: Catching errors without proper handling hides failures. + +**Symptoms**: +- Silent failures +- Workflows completing "successfully" despite errors +- Difficult debugging + +**The Fix**: Log errors and make deliberate decisions. Either re-raise, use a fallback, or explicitly document why ignoring is safe. + +### Wrong Retry Classification + +**The Problem**: Marking transient errors as non-retryable, or permanent errors as retryable. 
+ +**Symptoms**: +- Workflows failing on temporary network issues (if marked non-retryable) +- Infinite retries on invalid input (if marked retryable) + +**The Fix**: +- **Retryable**: Network errors, timeouts, rate limits, temporary unavailability +- **Non-retryable**: Invalid input, authentication failures, business rule violations, resource not found + +## Cancellation Handling + +### Not Handling Workflow Cancellation + +**The Problem**: When a workflow is cancelled, cleanup code after the cancellation point doesn't run unless explicitly protected. + +**Symptoms**: +- Resources not released after cancellation +- Incomplete compensation/rollback +- Leaked state + +**The Fix**: Use language-specific cancellation scopes or try/finally blocks to ensure cleanup runs even on cancellation. See language-specific gotchas for implementation details. + +### Not Handling Activity Cancellation + +**The Problem**: Activities must opt in to receive cancellation. Without proper handling, a cancelled activity continues running to completion, wasting resources. + +**Requirements for activity cancellation**: +1. **Heartbeating** - Cancellation is delivered via heartbeat. Activities that don't heartbeat won't know they've been cancelled. +2. **Checking for cancellation** - Activity must explicitly check for cancellation or await a cancellation signal. + +**Symptoms**: +- Cancelled activities running to completion +- Wasted compute on work that will be discarded +- Delayed workflow cancellation + +**The Fix**: Heartbeat regularly and check for cancellation. See language-specific gotchas for implementation patterns. + +## Payload Size Limits + +**The Problem**: Temporal has built-in limits on payload sizes. Exceeding them causes workflows to fail. 
+ +**Limits**: +- Max 2MB per individual payload +- Max 4MB per gRPC message +- Max 50MB for complete workflow history (aim for <10MB in practice) + +**Symptoms**: +- Payload too large errors +- gRPC message size exceeded errors +- Workflow history growing unboundedly + +**The Fix**: Store large data externally (S3/GCS) and pass references, use compression codecs, or chunk data across multiple activities. See the Large Data Handling pattern in `references/core/patterns.md`. diff --git a/references/core/interactive-workflows.md b/references/core/interactive-workflows.md new file mode 100644 index 0000000..3b02028 --- /dev/null +++ b/references/core/interactive-workflows.md @@ -0,0 +1,49 @@ +# Interactive Workflows + +Interactive workflows are workflows that use Temporal features such as signals or updates to pause and wait for external input. When testing and debugging these types of workflows you can send them input via the Temporal CLI. + +## Signals + +Fire-and-forget messages to a workflow. + +```bash +# Send signal to workflow +temporal workflow signal \ + --workflow-id \ + --name "signal_name" \ + --input '{"key": "value"}' +``` + +## Updates + +Request-response style interaction (returns a value). + +```bash +# Send update to workflow +temporal workflow update execute \ + --workflow-id \ + --name "update_name" \ + --input '{"approved": true}' +``` + +## Queries + +Read-only inspection of workflow state. + +```bash +# Query workflow state (read-only) +temporal workflow query \ + --workflow-id \ + --name "get_status" +``` + +## Typical Steps for Testing Interactive Workflows + +```bash +# 1. Start worker (command is project dependent) +# 2. Start workflow (command is project dependent) This code should output the workflow ID, if not, modify it to. +temporal workflow signal --workflow-id --name "signal_name" --input '{"key": "value"}' # 3. Send it interactive events, e.g. a signal. +# 4. Wait for workflow to complete (use Temporal CLI to check status) +# 5. 
Read workflow result, using the Temporal CLI +# 6. Cleanup the worker process if needed. +``` diff --git a/references/core/patterns.md b/references/core/patterns.md new file mode 100644 index 0000000..566e6f8 --- /dev/null +++ b/references/core/patterns.md @@ -0,0 +1,443 @@ +# Temporal Workflow Patterns + +## Overview + +Common patterns for building robust Temporal workflows. +See the language-specific references for the language you are working in: +- `references/{language}/{language}.md` for the root level documentation for that language +- `references/{language}/patterns.md` for language-specific example code of the patterns in this file. + +## Signals + +**Purpose**: Send data to a running workflow asynchronously (fire-and-forget). + +**When to Use**: +- Human approval workflows +- Adding items to a workflow's queue +- Notifying workflow of external events +- Live configuration updates + +**Characteristics**: +- Asynchronous - sender doesn't wait for response +- Can mutate workflow state +- Durable - signals are persisted in history +- Can be sent before workflow starts (signal-with-start) + +**Example Flow**: +``` +Client Workflow + │ │ + │──── signal(approve) ────▶│ + │ │ (updates state) + │ │ + │◀──── (no response) ──────│ +``` + +**Note:** A related but distinct pattern to signals is async activity completion. This is an advanced feature, which you may consider if the external system that would deliver the signal is unreliable and might fail to Signal, or +you want the external process to Heartbeat or receive Cancellation. If this may be the case, look at language-specific advanced features for your SDK language (`references/{your_language}/advanced-features.md`). + +## Queries + +**Purpose**: Read workflow state synchronously without modifying it. 
+ +**When to Use**: +- Building dashboards showing workflow progress +- Health checks and monitoring +- Debugging workflow state +- Exposing current status to external systems + +**Characteristics**: +- Synchronous - caller waits for response +- Read-only - must not modify state +- Not recorded in history +- Executes on the worker, not persisted +- Can run even on completed workflows + +**Example Flow**: +``` +Client Workflow + │ │ + │──── query(status) ──────▶│ + │ │ (reads state) + │◀──── "processing" ───────│ +``` + +## Updates + +**Purpose**: Modify workflow state and receive a response synchronously. + +**When to Use**: +- Operations that need confirmation (add item, return count) +- Validation before accepting changes +- Replace signal+query combinations +- Request-response patterns within workflow + +**Characteristics**: +- Synchronous - caller waits for completion +- Can mutate state AND return values +- Supports validators to reject invalid updates before they even get persisted into history +- **Validators must NOT mutate workflow state or block** (no activities, sleeps, or commands) — they are read-only, similar to query handlers +- Recorded in history + +**Example Flow**: +``` +Client Workflow + │ │ + │──── update(addItem) ────▶│ + │ │ (validates, modifies state) + │◀──── {count: 5} ─────────│ +``` + +## Child Workflows + +**When to Use**: +- Prevent history from growing too large +- Isolate failure domains (child can fail without failing parent) +- Different retry policies for different parts + +**Characteristics**: +- Own history (doesn't bloat parent) +- Independent lifecycle options (ParentClosePolicy) +- Can be cancelled independently +- Results returned to parent + +**Parent Close Policies**: +- `TERMINATE` - Child terminated when parent closes (default) +- `ABANDON` - Child continues running independently +- `REQUEST_CANCEL` - Cancellation requested but not forced + +**Note:** Do not need to use child workflows simply for breaking complex logic 
down into smaller pieces. Standard programming abstractions within a workflow can already be used for that. + +## Continue-as-New + +**Purpose**: Prevent unbounded history growth by "restarting" with fresh history. + +**When to Use**: +- Long-running workflows (entity workflows, subscriptions) +- Workflows with many iterations +- When history approaches 10,000+ events +- Periodic cleanup of accumulated state + +**How It Works**: +``` +Workflow (history: 10,000 events) + │ + │ continueAsNew(currentState) + ▼ +New Workflow Execution (history: 0 events) + │ (same workflow ID, fresh history) + │ (receives currentState as input) +``` + +**Best Practice**: Check `historyLength` or `continueAsNewSuggested` periodically. + +## Saga Pattern + +**Purpose**: Distributed transactions with compensation for failures. + +**When to Use**: +- Multi-step operations that span services +- Operations requiring rollback on failure +- Financial transactions, order processing +- Booking systems with multiple reservations + +**How It Works**: +``` +Step 1: Reserve inventory + └─ Compensation: Release inventory + +Step 2: Charge payment + └─ Compensation: Refund payment + +Step 3: Ship order + └─ Compensation: Cancel shipment + +On failure at step 3: + Execute: Refund payment (step 2 compensation) + Execute: Release inventory (step 1 compensation) +``` + +**Implementation Pattern**: +1. Track compensation actions as you complete each step +2. On failure, execute compensations in reverse order +3. Handle compensation failures gracefully (log, alert, manual intervention) + +## Parallel Execution + +**Purpose**: Run multiple independent operations concurrently. + +**When to Use**: +- Processing multiple items that don't depend on each other +- Calling multiple APIs simultaneously +- Fan-out/fan-in patterns +- Reducing total workflow duration + +**Patterns**: +- `Promise` / `asyncio` - Use traditional concurrency helpers (e.g. 
wait for all, wait for first, etc) +- Partial failure handling - Continue with successful results + +## Entity Workflow Pattern + +**Purpose**: Model long-lived entities as workflows that handle events. + +**When to Use**: +- Subscription management +- User sessions +- Shopping carts +- Any stateful entity receiving events over time + +**How It Works**: +``` +Entity Workflow (user-123) + │ + ├── Receives signal: AddItem + │ └── Updates state + │ + ├── Receives signal: UpdateQuantity + │ └── Updates state + │ + ├── Receives query: GetCart + │ └── Returns current state + │ + └── continueAsNew when history grows +``` + +## Timer Patterns + +**Purpose**: Durable delays that survive worker restarts. + +**Use Cases**: +- Scheduled reminders +- Timeout handling +- Delayed actions +- Polling with intervals + +**Characteristics**: +- Timers are durable (persisted in history) +- Can be cancelled + +## Polling Patterns + +### Frequent Polling + +**Purpose**: Frequently (once per second or faster) repeatedly check external state until a condition is met. + +**Implementation**: + +``` +# Inside Activity (polling_activity): +while not condition_met: + result = await call_external_api() + if result.done: + break + activity.heartbeat("Invoking activity") + await sleep(poll_interval) + + +# In workflow code: +workflow.execute_activity( + polling_activity, + PollingActivityInput(...), + start_to_close_timeout=timedelta(seconds=60), + heartbeat_timeout=timedelta(seconds=2), +) +``` + +To ensure that polling_activity is restarted in a timely manner, we make sure that it heartbeats on every iteration. Note that heartbeating only works if we set the heartbeat_timeout to a shorter value than the Activity's start_to_close_timeout. + +**Advantage:** Because the polling loop is inside the activity, this does not pollute the workflow history. + +### Infrequent Polling + +**Purpose**: Infrequently (once per minute or slower) repeatedly poll an external service.
+ +**Implementation**: + +Define an Activity which fails (raises an exception) exactly when polling is not completed. + +The polling loop is accomplished via activity retries, by setting the following Retry options: +- backoff_coefficient: to 1 +- initial_interval: to the polling interval (e.g. 60 seconds) + +This will enable the Activity to be retried exactly on the set interval. + +**Advantage:** Individual Activity retries are not recorded in Workflow History, so this approach can poll for a very long time without affecting the history size. + +## Idempotency Patterns + +**Purpose**: Ensure activities can be safely retried and replayed without causing duplicate side effects. + +**Why It Matters**: Temporal may re-execute activities during retries (on failure) or replay (on worker restart). Without idempotency, this can cause duplicate charges, duplicate emails, duplicate database entries, etc. + +### Using Idempotency Keys + +Pass a unique identifier to external services so they can detect and deduplicate repeated requests: + +``` +Activity: charge_payment(order_id, amount) + │ + └── Call payment API with: + amount: $100 + idempotency_key: "order-{order_id}" + │ + └── Payment provider deduplicates based on key + (second call with same key returns original result) +``` + +**Good idempotency key sources**: +- Workflow ID (unique per workflow execution) +- Business identifier (order ID, transaction ID) +- Workflow ID + activity name + attempt number + +### Check-Before-Act Pattern + +Query the external system's state before making changes: + +``` +Activity: send_welcome_email(user_id) + │ + ├── Check: Has welcome email been sent for user_id? + │ │ + │ ├── YES: Return early (already done) + │ │ + │ └── NO: Send email, mark as sent +``` + +### Designing Idempotent Activities + +1. **Use unique identifiers** as idempotency keys with external APIs +2. **Check before acting**: Query current state before making changes +3.
**Make operations repeatable**: Ensure calling twice produces the same result +4. **Record outcomes**: Store transaction IDs or results for verification +5. **Leverage external system features**: Many APIs (Stripe, AWS, etc.) have built-in idempotency key support + +### Tracking State in Workflows + +For complex multi-step operations, track completion status in workflow state: + +``` +Workflow State: + payment_completed: false + shipment_created: false + +Run: + if not payment_completed: + charge_payment(...) + payment_completed = true + + if not shipment_created: + create_shipment(...) + shipment_created = true +``` + +This ensures that on replay, already-completed steps are skipped. + +## Large Data Handling + +**Purpose**: Handle data that exceeds Temporal's payload limits without polluting workflow history. + +**Limits** (see `references/core/gotchas.md` for details): +- Max 2MB per individual payload +- Max 4MB per gRPC message +- Max 50MB for workflow history (aim for <10MB) + +**Key Principle**: Large data should never flow through workflow history. Activities read and write large data directly, passing only small references through the workflow. + +**Wrong Approach**: +``` +Workflow + │ + ├── downloadFromStorage(ref) ──▶ returns large data (enters history) + │ + ├── processData(largeData) ────▶ large data as argument (enters history AGAIN) + │ + └── uploadToStorage(result) ───▶ large data as argument (enters history AGAIN) +``` + +This defeats the purpose—large data enters workflow history multiple times. + +**Correct Approach**: +``` +Workflow + │ + └── processLargeData(inputRef) ──▶ returns outputRef (small string) + │ + └── Activity internally: + download(inputRef) → process → upload → return outputRef +``` + +The workflow only handles references (small strings). The activity does all large data operations internally. + +**Implementation Pattern**: +1. Accept a reference (URL, S3 key, database ID) as activity input +2. 
Download/fetch the large data inside the activity +3. Process the data inside the activity +4. Upload/store the result inside the activity +5. Return only a reference to the result + +**Other Strategies**: +- **Compression**: Use a PayloadCodec to compress data automatically +- **Chunking**: Split large collections across multiple activities, each handling a subset + +## Activity Heartbeating + +**Purpose**: Enable cancellation delivery and progress tracking for long-running activities. + +**Why Heartbeat**: +1. **Support activity cancellation** - Cancellations are delivered to activities via heartbeat. Activities that don't heartbeat won't know they've been cancelled. +2. **Resume progress after failure** - Heartbeat details persist across retries, allowing activities to resume where they left off. +3. **Detect stuck activities** - If an activity stops heartbeating, Temporal can time it out and retry. + +**How Cancellation Works**: +``` +Workflow requests activity cancellation + │ + ▼ +Temporal Service marks activity for cancellation + │ + ▼ +Activity calls heartbeat() + │ + ├── Not cancelled: heartbeat succeeds, continues + │ + └── Cancelled: heartbeat raises exception + Activity can catch this to perform cleanup +``` + +**Key Point**: If an activity never heartbeats, it will run to completion even if cancelled—it has no way to learn about the cancellation. + +## Local Activities + +**Purpose**: Reduce latency for short, lightweight operations by skipping the task queue. ONLY use these when necessary for performance. Do NOT use these by default, as they are not durable and distributed. 
+ +**When to Use**: +- Short operations completing in milliseconds/seconds +- High-frequency calls where task queue overhead is significant +- Low-latency requirements where you can't afford task queue round-trip + +**Characteristics**: +- Executes on the same worker that runs the workflow +- No task queue round-trip (lower latency) +- Still recorded in history +- Should complete quickly (default timeout is short) + +**Trade-offs**: +- Less visibility in Temporal UI (no separate task) +- Must complete on the same worker +- Not suitable for long-running operations +- **Risk with consecutive local activities:** Local activity completions are only persisted when the current Workflow Task completes. Calling multiple local activities in a row (with nothing in between to yield the Workflow Task) increases the risk of losing work if the worker crashes mid-sequence. If you need a chain of operations with durable checkpoints between each step, use regular activities instead. + +## Choosing Between Patterns + +| Need | Pattern | +|------|---------| +| Send data, don't need response | Signal | +| Read state, no modification | Query | +| Modify state, need response | Update | +| Break down large workflow | Child Workflow | +| Prevent history growth | Continue-as-New | +| Rollback on failure | Saga | +| Process items concurrently | Parallel Execution | +| Long-lived stateful entity | Entity Workflow | +| Safe retries/replays | Idempotency | +| Low-latency short operations | Local Activities | diff --git a/references/core/troubleshooting.md b/references/core/troubleshooting.md new file mode 100644 index 0000000..e4ef2cb --- /dev/null +++ b/references/core/troubleshooting.md @@ -0,0 +1,323 @@ +# Temporal Troubleshooting Guide + +## Workflow Diagnosis Decision Tree + +``` +Workflow not behaving as expected? +│ +├─▶ What is the workflow status? 
+│ │ +│ ├─▶ RUNNING (but no progress) +│ │ └─▶ Go to: "Workflow Stuck" section +│ │ +│ ├─▶ FAILED +│ │ └─▶ Go to: "Workflow Failed" section +│ │ +│ ├─▶ TIMED_OUT +│ │ └─▶ Go to: "Timeout Issues" section +│ │ +│ └─▶ COMPLETED (but wrong result) +│ └─▶ Go to: "Wrong Result" section +``` + +## Workflow Stuck (RUNNING but No Progress) + +### Decision Tree + +``` +Workflow stuck in RUNNING? +│ +├─▶ Is a worker running? +│ │ +│ ├─▶ NO: Start a worker +│ │ └─▶ See references/core/dev-management.md +│ │ +│ └─▶ YES: Is it on the correct task queue? +│ │ +│ ├─▶ NO: Start worker with correct task queue +│ │ +│ └─▶ YES: Check for non-determinism +│ │ +│ ├─▶ NondeterminismError in logs? +│ │ └─▶ Go to: "Non-Determinism" section +│ │ +│ ├─▶ Check history for task failures +│ │ └─▶ Run: `temporal workflow show --workflow-id ` +│ │ │ +│ │ ├─▶ WorkflowTaskFailed event? +│ │ │ └─▶ Check error type in event details +│ │ │ └─▶ Go to relevant section in error-reference.md +│ │ │ +│ │ └─▶ ActivityTaskFailed event? +│ │ └─▶ Go to: "Activity Keeps Retrying" section +│ │ +│ └─▶ No errors in logs or history? +│ └─▶ Check if workflow is waiting for signal/timer +``` + +### Common Causes + +1. **No worker running** + - See references/core/dev-management.md + +2. **Worker on wrong task queue** + - Check: Worker logs for task queue name + - Fix: Start worker with matching task queue + +3. **Worker has stale code** + - Check: Worker startup time vs code changes + - Fix: Restart worker with updated code + +4. **Workflow waiting for signal** + - Check: Workflow history for pending signals + - Fix: Send expected signal or check signal sender + +5. **Activity stuck/timing out** + - Check: Activity retry attempts in history + - Fix: Investigate activity failure, increase timeout + +## Non-Determinism Errors + +### Decision Tree + +``` +NondeterminismError? +│ +├─▶ Was code intentionally changed? +│ │ +│ ├─▶ YES: Do you need to support in-flight workflows? 
+│ │ │ +│ │ ├─▶ YES (production): Use patching API +│ │ │ └─▶ See: references/core/versioning.md +│ │ │ +│ │ └─▶ NO (local dev/testing): Terminate or reset workflow +│ │ └─▶ `temporal workflow terminate --workflow-id ` +│ │ └─▶ Then start fresh with new code +│ │ +│ └─▶ NO: Accidental change +│ │ +│ ├─▶ Can you identify the change? +│ │ │ +│ │ ├─▶ YES: Revert and restart worker. Note, this doesn't always work if workflow has progressed past the change (may induce other code paths), so may need to reset workflow. +│ │ │ +│ │ └─▶ NO: Compare current code to expected history +│ │ └─▶ Check: Activity names, order, parameters +``` + +### Common Causes + +1. **Changed call order** + ``` + # Before # After (BREAKS) + await activity_a await activity_b + await activity_b await activity_a + ``` + +2. **Changed call name** + ``` + # Before # After (BREAKS) + await process_order(...) await handle_order(...) + ``` + +3. **Added/removed call** + - Adding new activity mid-workflow + - Removing activity that was previously called + +4. **Using non-deterministic code** + - `datetime.now()` in workflow (use `workflow.now()`) + - `random.random()` in workflow (use `workflow.random()`) + +### Recovery + +**Accidental Change:** +1. Identify the change +2. Revert code to match history +3. Restart worker +4. Workflow automatically recovers + +**Intentional Change:** +1. Use patching API for gradual migration +2. Or terminate old workflows, start new ones + +## Workflow Failed + +### Decision Tree + +``` +Workflow status = FAILED? +│ +├─▶ Check workflow error message +│ │ +│ ├─▶ Application error (your code) +│ │ └─▶ Fix the bug, start new workflow +│ │ +│ ├─▶ NondeterminismError +│ │ └─▶ Go to: "Non-Determinism" section +│ │ +│ └─▶ Timeout error +│ └─▶ Go to: "Timeout Issues" section +``` + +### Common Causes + +1. **Unhandled exception in workflow** + - Check error message and stack trace + - Fix bug in workflow code + +2. 
**Activity exhausted retries** + - All retry attempts failed + - Check activity logs for root cause + +3. **Non-retryable error thrown** + - Error marked as non-retryable + - Intentional failure, check business logic + +## Timeout Issues + +### Timeout Types + +| Timeout | Scope | What It Limits | +|---------|-------|----------------| +| `WorkflowExecutionTimeout` | Entire workflow | Total time including retries and continue-as-new | +| `WorkflowRunTimeout` | Single run | Time for one run (before continue-as-new) | +| `ScheduleToCloseTimeout` | Activity | Total time including retries | +| `StartToCloseTimeout` | Activity | Single attempt time | +| `HeartbeatTimeout` | Activity | Time between heartbeats | + +### Diagnosis + +``` +Timeout error? +│ +├─▶ Which timeout? +│ │ +│ ├─▶ Workflow timeout +│ │ └─▶ Increase timeout or optimize workflow. Better yet, consider removing the workflow timeout, as it is generally discouraged unless *necessary* for your use case. +│ │ +│ ├─▶ ScheduleToCloseTimeout +│ │ └─▶ Activity taking too long overall (including retries) +│ │ +│ ├─▶ StartToCloseTimeout +│ │ └─▶ Single activity attempt too slow +│ │ +│ └─▶ HeartbeatTimeout +│ └─▶ Activity not heartbeating frequently enough +│ └─▶ Add heartbeat() calls in long activities +``` + +### Fixes + +1. **Increase timeout** if operation legitimately takes longer +2. **Add heartbeats** to long-running activities +3. **Optimize activity** to complete faster +4. **Break into smaller activities** for better granularity + +## Activity Keeps Retrying + +### Decision Tree + +``` +Activity retrying repeatedly? +│ +├─▶ Check activity error +│ │ +│ ├─▶ Transient error (network, timeout) +│ │ └─▶ Expected behavior, will eventually succeed +│ │ +│ ├─▶ Permanent error (bug, invalid input) +│ │ └─▶ Fix the bug or mark as non-retryable +│ │ +│ └─▶ Resource exhausted +│ └─▶ Add backoff, check rate limits +``` + +### Common Causes + +1.
**Bug in activity code** + - Fix the bug + - Consider marking certain errors as non-retryable + +2. **External service down** + - Retries are working as intended + - Monitor service recovery + +3. **Invalid input** + - Validate inputs before activity + - Return non-retryable error for bad input + +## Wrong Result (Completed but Incorrect) + +### Diagnosis + +1. **Check workflow history** for unexpected activity results +2. **Verify activity implementations** produce correct output +3. **Check for race conditions** in parallel execution +4. **Verify signal handling** if signals are involved + +### Common Causes + +1. **Activity bug** - Wrong logic in activity +2. **Stale data** - Activity using outdated information +3. **Signal ordering** - Signals processed in unexpected order +4. **Parallel execution** - Race condition in concurrent operations + +## Worker Issues + +### Worker Not Starting + +``` +Worker won't start? +│ +├─▶ Connection error +│ └─▶ Check Temporal server is running +│ └─▶ `temporal server start-dev` (start in background, see references/core/dev-management.md) +│ +├─▶ Registration error +│ └─▶ Check workflow/activity definitions are valid +│ +└─▶ Other errors (imports, etc.) + └─▶ Debug those errors as usual. +``` + +### Worker Crashing + +1. **Out of memory** - Reduce concurrent tasks, check for leaks +2. **Unhandled exception** - Add error handling +3. 
**Dependency issue** - Check package versions + +## Useful Commands + +```bash +# Start a local Temporal dev server (if one is not already running) +temporal server start-dev + +# List workflows +temporal workflow list + +# Describe specific workflow +temporal workflow describe --workflow-id YOUR_WORKFLOW_ID + +# Show workflow history +temporal workflow show --workflow-id YOUR_WORKFLOW_ID + +# Terminate stuck workflow +temporal workflow terminate --workflow-id YOUR_WORKFLOW_ID + +# Reset workflow to specific point +temporal workflow reset --workflow-id YOUR_WORKFLOW_ID --event-id EVENT_ID +``` + +## Quick Reference: Status → Action + +| Status | First Check | Common Fix | +|--------|-------------|------------| +| RUNNING (stuck) | Worker running? | Start/restart worker | +| FAILED | Error message | Fix bug, handle error | +| TIMED_OUT | Which timeout? | Increase timeout or optimize | +| TERMINATED | Who terminated? | Check audit log | +| CANCELED | Cancellation source | Expected or investigate | + +## See Also + +- [Common Gotchas](gotchas.md) - Anti-patterns that cause these issues +- [Error Reference](error-reference.md) - Quick error type lookup diff --git a/references/core/versioning.md b/references/core/versioning.md new file mode 100644 index 0000000..226bb83 --- /dev/null +++ b/references/core/versioning.md @@ -0,0 +1,174 @@ +# Workflow Versioning Concepts + +This document provides core conceptual explanations of workflow versioning in Temporal. For language-specific implementation details see `references/{your_language}/versioning.md`, for the language you are working in. + +## Overview + +Workflow versioning allows safe deployment of code changes without breaking running workflows. Three approaches are available: + +1. **Patching API** - Code-level version branching +2. **Workflow Type Versioning** - New workflow types for incompatible changes +3. 
**Worker Versioning** - Deployment-level control with Build IDs + +## Why Versioning is Needed + +When workers restart after deployment, they resume open workflows through history replay. 
If updated code produces different Commands than the original code, it causes non-determinism errors. + +``` +Original Code (recorded in history): + await activity_a() + await activity_b() + +Updated Code (during replay): + await activity_a() + await activity_c() ← Different! NondeterminismError +``` + +## Approach 1: Patching API + +### Concept + +The patching API lets you branch code based on whether a workflow was started before or after a code change. + +``` +if patched("my-change"): + // New code path (for new and replaying new workflows) +else: + // Old code path (for replaying old workflows) +``` + +### Three-Phase Lifecycle + +**Phase 1: Patch In** +- Add both old and new code paths +- New workflows take new path, old workflows take old path + +**Phase 2: Deprecate** +- After all old workflows complete, remove old code +- Keep deprecation marker for history compatibility + +**Phase 3: Remove** +- After all deprecated workflows complete +- Remove patch entirely, only new code remains + +### When to Use + +- Adding, removing, or reordering activities/child workflows +- Changing which activity/child workflow is called +- Any change that alters the Command sequence + +### When NOT to Use + +- Changing activity implementations (activities aren't replayed) +- Changing arguments passed to activities or child workflows +- Changing retry policies +- Changing timer durations +- Adding new signal/query/update handlers (additive changes are safe) +- Bug fixes that don't change Command sequence + +Unnecessary patching adds complexity and can make workflow code unmanageable. + +## Approach 2: Workflow Type Versioning + +### Concept + +Create a new workflow type (e.g., `OrderWorkflowV2`) instead of patching. + +``` +// Old: OrderWorkflow +// New: OrderWorkflowV2 (completely new implementation) +``` + +### When to Use + +- Major incompatible changes +- Complete rewrites +- When patching would be too complex +- When you want clean separation + +### Process + +1. 
Create new workflow type with new name +2. Register both with worker +3. Start new workflows with new type +4. Wait for old workflows to complete +5. Remove old workflow type + +## Approach 3: Worker Versioning + +### Concept + +Manage versions at deployment level using Build IDs. Multiple worker versions can run simultaneously. + +``` +Worker v1.0 (Build ID: abc123) + └── Handles workflows started on this version + +Worker v2.0 (Build ID: def456) + └── Handles new workflows + └── Can also handle upgraded old workflows +``` + +### Key Concepts + +**Worker Deployment**: Logical service grouping (e.g., "order-service") + +**Build ID**: Specific code version (e.g., git commit hash) + +**Versioning Behaviors**: +- `PINNED` - Workflows stay on original worker version +- `AUTO_UPGRADE` - Workflows can move to newer versions + +### When to Use PINNED + +- Short-running workflows (minutes to hours) +- Consistency is critical +- Want simplest development experience +- Building new applications + +### When to Use AUTO_UPGRADE + +- Long-running workflows (weeks or months) +- Workflows need bug fixes during execution +- Still requires patching for version transitions + +## Choosing an Approach + +| Scenario | Recommended Approach | +|----------|---------------------| +| Small change, few running workflows | Patching API | +| Major rewrite | Workflow Type Versioning | +| Many short workflows, frequent deploys | Worker Versioning (PINNED) | +| Long-running workflows needing updates | Worker Versioning (AUTO_UPGRADE) + Patching | +| Quick fix, can wait for completion | Wait for workflows to complete | + +## Best Practices + +1. **Check for open executions** before removing old code +2. **Use descriptive patch IDs** (e.g., "add-fraud-check" not "patch-1") +3. **Deploy incrementally**: patch → deprecate → remove +4. **Test replay compatibility** before deploying changes +5. 
**Monitor old workflow counts** during migration + +## Finding Workflows by Version + +```bash +# Find workflows with specific patch +temporal workflow list --query \ + 'WorkflowType = "OrderWorkflow" AND TemporalChangeVersion = "add-fraud-check"' + +# Find pre-patch workflows +temporal workflow list --query \ + 'WorkflowType = "OrderWorkflow" AND TemporalChangeVersion IS NULL' + +# Find workflows on specific worker version +temporal workflow list --query \ + 'TemporalWorkerDeploymentVersion = "my-service:v1.0.0"' +``` + +## Common Mistakes + +1. **Removing old code too early** - Breaks replaying workflows +2. **Not testing with replay** - Replay tests catch incompatibilities before production +3. **Patching non-Command changes** - Unnecessary complexity +4. **Forgetting to deprecate** - Accumulates dead code diff --git a/references/go/advanced-features.md b/references/go/advanced-features.md new file mode 100644 index 0000000..55e4e57 --- /dev/null +++ b/references/go/advanced-features.md @@ -0,0 +1,187 @@ +# Go SDK Advanced Features + +## Schedules + +Create recurring workflow executions using the Schedule API. 
+ +```go +scheduleHandle, err := c.ScheduleClient().Create(ctx, client.ScheduleOptions{ + ID: "daily-report", + Spec: client.ScheduleSpec{ + CronExpressions: []string{"0 9 * * *"}, + }, + Action: &client.ScheduleWorkflowAction{ + ID: "daily-report-workflow", + Workflow: DailyReportWorkflow, + TaskQueue: "reports", + }, +}) +``` + +Using intervals instead of cron: + +```go +scheduleHandle, err := c.ScheduleClient().Create(ctx, client.ScheduleOptions{ + ID: "hourly-sync", + Spec: client.ScheduleSpec{ + Intervals: []client.ScheduleIntervalSpec{ + {Every: time.Hour}, + }, + }, + Action: &client.ScheduleWorkflowAction{ + ID: "hourly-sync-workflow", + Workflow: SyncWorkflow, + TaskQueue: "sync", + }, +}) +``` + +Manage schedules: + +```go +handle := c.ScheduleClient().GetHandle(ctx, "daily-report") + +// Pause / unpause +handle.Pause(ctx, client.SchedulePauseOptions{Note: "Maintenance window"}) +handle.Unpause(ctx, client.ScheduleUnpauseOptions{Note: "Maintenance complete"}) + +// Trigger immediately +handle.Trigger(ctx, client.ScheduleTriggerOptions{}) + +// Describe +desc, err := handle.Describe(ctx) + +// Delete +handle.Delete(ctx) +``` + +## Async Activity Completion + +For activities that complete asynchronously (e.g., human tasks, external callbacks). +If you configure a `HeartbeatTimeout` on this activity, the external completer is responsible for sending heartbeats using the task token (e.g., `client.RecordActivityHeartbeat`). +If you do NOT set a `HeartbeatTimeout`, no heartbeats are required. + +**Note:** If the external system that completes the asynchronous action can reliably be trusted to do the task and Signal back with the result, and it doesn't need to Heartbeat or receive Cancellation, then consider using **signals** instead. 
+ +**Step 1: Return `activity.ErrResultPending` from the activity.** + +```go +func RequestApproval(ctx context.Context, requestID string) (string, error) { + activityInfo := activity.GetInfo(ctx) + taskToken := activityInfo.TaskToken + + // Store taskToken externally (e.g., database) for later completion + err := storeTaskToken(requestID, taskToken) + if err != nil { + return "", err + } + + // Signal that this activity will be completed externally + return "", activity.ErrResultPending +} +``` + +**Step 2: Complete from another process using the task token.** + +```go +temporalClient, err := client.Dial(client.Options{}) + +// Complete the activity +err = temporalClient.CompleteActivity(ctx, taskToken, "approved", nil) + +// Or fail it +err = temporalClient.CompleteActivity(ctx, taskToken, nil, errors.New("rejected")) +``` + +Or complete by ID (no task token needed): + +```go +err = temporalClient.CompleteActivityByID(ctx, namespace, workflowID, runID, activityID, "approved", nil) +``` + +## Worker Tuning + +Configure `worker.Options` for production workloads: + +```go +w := worker.New(c, "my-task-queue", worker.Options{ + // Max concurrent activity executions (default: 1000) + MaxConcurrentActivityExecutionSize: 500, + + // Max concurrent workflow task executions (default: 1000) + MaxConcurrentWorkflowTaskExecutionSize: 500, + + // Max concurrent activity task pollers (default: 2) + MaxConcurrentActivityTaskPollers: 4, + + // Max concurrent workflow task pollers (default: 2) + MaxConcurrentWorkflowTaskPollers: 4, + + // Graceful shutdown timeout (default: 0) + WorkerStopTimeout: 30 * time.Second, +}) +``` + +Scale pollers based on task queue throughput. If you observe high schedule-to-start latency, increase the number of pollers or add more workers. + +## Sessions + +Go-specific feature for routing multiple activities to the same worker. All activities using the session context execute on the same worker host. 
+ +**Enable on the worker:** + +```go +w := worker.New(c, "fileprocessing", worker.Options{ + EnableSessionWorker: true, + MaxConcurrentSessionExecutionSize: 100, // default: 1000 +}) +``` + +**Use in a workflow:** + +```go +func FileProcessingWorkflow(ctx workflow.Context, file FileParam) error { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: time.Minute, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + sessionCtx, err := workflow.CreateSession(ctx, &workflow.SessionOptions{ + CreationTimeout: time.Minute, + ExecutionTimeout: 10 * time.Minute, + }) + if err != nil { + return err + } + defer workflow.CompleteSession(sessionCtx) + + // All three activities run on the same worker + var downloadResult string + err = workflow.ExecuteActivity(sessionCtx, DownloadFile, file.URL).Get(sessionCtx, &downloadResult) + if err != nil { + return err + } + + var processResult string + err = workflow.ExecuteActivity(sessionCtx, ProcessFile, downloadResult).Get(sessionCtx, &processResult) + if err != nil { + return err + } + + err = workflow.ExecuteActivity(sessionCtx, UploadFile, processResult).Get(sessionCtx, nil) + return err +} +``` + +Key points: +- `workflow.ErrSessionFailed` is returned if the worker hosting the session dies +- `CompleteSession` releases resources -- always call it (use `defer`) +- Use case: file processing (download, process, upload on same host), GPU workloads, or any pipeline needing local state +- `MaxConcurrentSessionExecutionSize` on `worker.Options` limits how many sessions a single worker can handle + +**Limitations:** +- Sessions do not survive worker process restarts — if the worker dies, the session fails and activities must be retried from the workflow level +- There is no server-side support for sessions — the Go SDK implements them entirely client-side using internal task queue routing +- Session concurrency limiting is per-process, not per-host — only one worker process per host if you rely on this + +**Relationship to 
worker-specific task queues:** Sessions are essentially a convenience API over the "worker-specific task queue" pattern, where each worker creates a unique task queue and routes activities to it. For simple cases where you don't need separate activities (e.g., download + process + upload can be one unit), consider using a single long-running activity with heartbeating instead. diff --git a/references/go/data-handling.md b/references/go/data-handling.md new file mode 100644 index 0000000..e887e7b --- /dev/null +++ b/references/go/data-handling.md @@ -0,0 +1,262 @@ +# Go SDK Data Handling + +## Overview + +The Go SDK uses the `converter.DataConverter` interface to serialize/deserialize workflow inputs, outputs, and activity parameters. The default converter converts values to JSON. + +## Default Data Converter + +The default `CompositeDataConverter` applies converters in order until one returns a non-nil Payload: + +1. `converter.NewNilPayloadConverter()` -- nil values +2. `converter.NewByteSlicePayloadConverter()` -- `[]byte` +3. `converter.NewProtoJSONPayloadConverter()` -- Protobuf messages as JSON +4. `converter.NewProtoPayloadConverter()` -- Protobuf messages as binary +5. `converter.NewJSONPayloadConverter()` -- anything JSON-serializable + +Structs must have exported fields to be serialized. + +## Custom Data Converter + +In most cases you don't implement the full `DataConverter` interface directly. Instead, implement a **`PayloadConverter`** for your specific type and insert it into a `CompositeDataConverter`. The `PayloadConverter` interface has four methods: + +```go +type PayloadConverter interface { + ToPayload(value interface{}) (*commonpb.Payload, error) // return nil if this type isn't handled + FromPayload(payload *commonpb.Payload, valuePtr interface{}) error + ToString(payload *commonpb.Payload) string + Encoding() string // e.g. 
"binary/msgpack" +} +``` + +**Example — custom msgpack PayloadConverter:** + +```go +import ( + "encoding/json" + "fmt" + + commonpb "go.temporal.io/api/common/v1" + "go.temporal.io/sdk/converter" + "github.com/vmihailenco/msgpack/v5" +) + +const encodingMsgpack = "binary/msgpack" + +type MsgpackPayloadConverter struct{} + +func (c *MsgpackPayloadConverter) Encoding() string { + return encodingMsgpack +} + +func (c *MsgpackPayloadConverter) ToPayload(value interface{}) (*commonpb.Payload, error) { + if value == nil { + return nil, nil + } + data, err := msgpack.Marshal(value) + if err != nil { + return nil, fmt.Errorf("msgpack marshal: %w", err) + } + return &commonpb.Payload{ + Metadata: map[string][]byte{ + converter.MetadataEncoding: []byte(encodingMsgpack), + }, + Data: data, + }, nil +} + +func (c *MsgpackPayloadConverter) FromPayload(payload *commonpb.Payload, valuePtr interface{}) error { + if string(payload.GetMetadata()[converter.MetadataEncoding]) != encodingMsgpack { + return fmt.Errorf("unsupported encoding") + } + return msgpack.Unmarshal(payload.Data, valuePtr) +} + +func (c *MsgpackPayloadConverter) ToString(payload *commonpb.Payload) string { + // Decode to a map for human-readable display + var v interface{} + if err := msgpack.Unmarshal(payload.Data, &v); err != nil { + return fmt.Sprintf("msgpack decode error: %v", err) + } + b, _ := json.Marshal(v) + return string(b) +} +``` + +**Register in a CompositeDataConverter and pass to the client:** + +```go +dataConverter := converter.NewCompositeDataConverter( + converter.NewNilPayloadConverter(), + converter.NewByteSlicePayloadConverter(), + &MsgpackPayloadConverter{}, // placed before JSON; as written it accepts every non-nil value -- return nil, nil from ToPayload for types that should fall through to JSON + converter.NewJSONPayloadConverter(), +) + +c, err := client.Dial(client.Options{ + DataConverter: dataConverter, +}) +``` + +**Per-activity/child-workflow override** — use a different converter 
for specific calls: + +```go +actCtx := workflow.WithDataConverter(ctx, mySpecialConverter) 
+workflow.ExecuteActivity(actCtx, SensitiveActivity, input) +``` + +**Note:** If your converter makes remote calls (e.g., to a KMS for encryption), wrap it with `workflow.DataConverterWithoutDeadlockDetection` to avoid deadlock detection timeouts in workflow code. + +## Composition of Payload Converters + +Use `converter.NewCompositeDataConverter` to chain type-specific converters. The first converter that can handle the type wins. + +```go +dataConverter := converter.NewCompositeDataConverter( + converter.NewNilPayloadConverter(), + converter.NewByteSlicePayloadConverter(), + converter.NewProtoJSONPayloadConverter(), + converter.NewProtoPayloadConverter(), + YourCustomPayloadConverter(), + converter.NewJSONPayloadConverter(), +) +``` + +## Protobuf Support + +Binary protobuf: +```go +converter.NewProtoPayloadConverter() +``` + +JSON protobuf: +```go +converter.NewProtoJSONPayloadConverter() +``` + +Both are included in the default data converter. SDK v1.26.0 (March 2024) migrated from gogo/protobuf to google/protobuf. If you need backward compatibility with older payloads encoded with gogo, use the `LegacyTemporalProtoCompat` option. + +## Payload Encryption + +Implement the `converter.PayloadCodec` interface (`Encode` and `Decode`) and wrap the default data converter: + +```go +// Codec implements converter.PayloadCodec for encryption. 
+type Codec struct{} + +func (Codec) Encode(payloads []*commonpb.Payload) ([]*commonpb.Payload, error) { + result := make([]*commonpb.Payload, len(payloads)) + for i, p := range payloads { + origBytes, err := p.Marshal() + if err != nil { + return payloads, err + } + encrypted := encrypt(origBytes) // your encryption logic + result[i] = &commonpb.Payload{ + Metadata: map[string][]byte{converter.MetadataEncoding: []byte("binary/encrypted")}, + Data: encrypted, + } + } + return result, nil +} + +func (Codec) Decode(payloads []*commonpb.Payload) ([]*commonpb.Payload, error) { + result := make([]*commonpb.Payload, len(payloads)) + for i, p := range payloads { + if string(p.Metadata[converter.MetadataEncoding]) != "binary/encrypted" { + result[i] = p + continue + } + decrypted := decrypt(p.Data) // your decryption logic + result[i] = &commonpb.Payload{} + err := result[i].Unmarshal(decrypted) + if err != nil { + return payloads, err + } + } + return result, nil +} +``` + +Wrap with `CodecDataConverter` and pass to client: + +```go +var DataConverter = converter.NewCodecDataConverter( + converter.GetDefaultDataConverter(), + &Codec{}, +) + +c, err := client.Dial(client.Options{ + DataConverter: DataConverter, +}) +``` + +## Search Attributes + +Set at workflow start: + +```go +handle, err := c.ExecuteWorkflow(ctx, client.StartWorkflowOptions{ + ID: "order-123", + TaskQueue: "orders", + SearchAttributes: map[string]interface{}{ + "OrderStatus": "pending", + "CustomerId": "cust-456", + }, +}, OrderWorkflow, input) +``` + +Upsert from within a workflow: + +```go +err := workflow.UpsertSearchAttributes(ctx, map[string]interface{}{ + "OrderStatus": "completed", +}) +``` + +Typed search attributes (v1.26.0+, preferred): + +```go +var OrderStatusKey = temporal.NewSearchAttributeKeyKeyword("OrderStatus") + +err := workflow.UpsertTypedSearchAttributes(ctx, OrderStatusKey.ValueSet("completed")) +``` + +Query workflows by search attributes: + +```go +resp, err := 
c.ListWorkflow(ctx, &workflowservice.ListWorkflowExecutionsRequest{ + Query: `OrderStatus = "pending" AND CustomerId = "cust-456"`, +}) +``` + +## Workflow Memo + +Set in start options: + +```go +handle, err := c.ExecuteWorkflow(ctx, client.StartWorkflowOptions{ + ID: "order-123", + TaskQueue: "orders", + Memo: map[string]interface{}{ + "customerName": "Alice", + "notes": "Priority customer", + }, +}, OrderWorkflow, input) +``` + +Read memo from workflow info. Upsert memo (Go SDK only): + +```go +err := workflow.UpsertMemo(ctx, map[string]interface{}{ + "notes": "Updated notes", +}) +``` + +## Best Practices + +1. Use structs with exported fields for inputs and outputs +2. Prefer JSON for readability during development, protobuf for performance in production +3. Keep payloads small -- see `references/core/gotchas.md` for limits +4. Use `PayloadCodec` for encryption; never store sensitive data unencrypted +5. Configure the same data converter on both client and worker diff --git a/references/go/determinism-protection.md b/references/go/determinism-protection.md new file mode 100644 index 0000000..4a6f5f4 --- /dev/null +++ b/references/go/determinism-protection.md @@ -0,0 +1,98 @@ +# Go Workflow Determinism Protection + +## Overview + +The Go SDK has no runtime sandbox. Determinism is enforced by **developer convention** and **optional static analysis**. Unlike the Python and TypeScript SDKs, the Go SDK will not intercept or replace non-deterministic calls at runtime. The Go SDK does perform a limited runtime command-ordering check, but catching non-deterministic code before deployment requires the `workflowcheck` tool and testing, in particular replay tests (see `references/go/testing.md`). + +## workflowcheck Static Analysis + +### Install + +```bash +go install go.temporal.io/sdk/contrib/tools/workflowcheck@latest +``` + +### Run + +```bash +workflowcheck ./... +``` + +No output means all registered workflows are deterministic. 
Non-deterministic code produces hierarchical output showing the call chain to the offending code. + +Use `-show-pos` for exact file positions: + +```bash +workflowcheck -show-pos ./... +``` + +### What It Detects + +**Non-deterministic functions/variables:** +- `time.Now` -- obtaining current time +- `time.Sleep` -- sleeping +- `crypto/rand.Reader` -- crypto random reader +- `math/rand.globalRand` -- global pseudorandom +- `os.Stdin`, `os.Stdout`, `os.Stderr` -- standard I/O streams + +**Non-deterministic Go constructs:** +- Starting a goroutine (`go func()`) +- Sending to a channel +- Receiving from a channel +- Iterating over a channel via `range` +- Iterating over a map via `range` + +### Limitations + +`workflowcheck` cannot catch everything. It does **not** detect: +- Global variable mutation +- Non-determinism via reflection +- Runtime-conditional non-determinism + +### Suppressing False Positives + +Add `//workflowcheck:ignore` on or directly above the offending line: + +```go +now := time.Now() //workflowcheck:ignore +``` + +For broader suppression, use a YAML config file: + +```yaml +# workflowcheck.config.yaml +decls: + path/to/package.MyDeterministicFunc: false +``` + +```bash +workflowcheck -config workflowcheck.config.yaml ./... +``` + +## Determinism Rules + +**You must:** +- Use `workflow.Go(ctx, func(ctx workflow.Context) { ... 
})` instead of `go` +- Use `workflow.NewChannel(ctx)` instead of `chan` +- Use `workflow.NewSelector(ctx)` instead of `select` +- Use `workflow.Sleep(ctx, duration)` instead of `time.Sleep()` +- Use `workflow.Now(ctx)` instead of `time.Now()` +- Use `workflow.GetLogger(ctx)` instead of `fmt.Println` / `log.Println` +- Sort map keys before iterating, or use `workflow.SideEffect` / an activity + +**You must not:** +- Start native goroutines +- Use native channels or `select` +- Call `time.Now()` or `time.Sleep()` +- Use `math/rand` global functions or `crypto/rand.Reader` +- Access `os.Stdin`, `os.Stdout`, or `os.Stderr` +- Mutate global variables +- Make network calls, file I/O, or database queries (use activities) + +## Best Practices + +1. **Run `workflowcheck` in CI / pre-commit** -- catch non-deterministic code before it reaches production +2. **Keep workflow code thin** -- workflows should orchestrate; delegate all I/O and non-deterministic work to activities +3. **Use struct methods for activities** -- keeps imports clean and avoids pulling non-deterministic dependencies into workflow files +4. **Separate workflow and activity files** -- reduces the surface area that `workflowcheck` needs to analyze and keeps concerns isolated +5. **Test with replay** after any workflow code change to verify backward compatibility diff --git a/references/go/determinism.md b/references/go/determinism.md new file mode 100644 index 0000000..0cff905 --- /dev/null +++ b/references/go/determinism.md @@ -0,0 +1,52 @@ +# Go SDK Determinism + +## Overview + +The Go SDK has NO runtime sandbox (unlike Python/TypeScript). Workflows must be deterministic for replay, and determinism is enforced entirely by developer convention and optional static analysis via the `workflowcheck` tool (see `references/go/determinism-protection.md`). + +## Why Determinism Matters: History Replay + +Temporal provides durable execution through **History Replay**. 
When a Worker restores workflow state, it re-executes workflow code from the beginning. This requires the code to be **deterministic**. See `references/core/determinism.md` for a deep explanation. + +## Forbidden Operations + +Do not use any of the following in workflow code: + +- **Native goroutines** (`go func()`) -- use `workflow.Go()` instead +- **Native channels** (`chan`, send, receive, `range` over channel) -- use `workflow.Channel` instead +- **Native `select`** -- use `workflow.Selector` instead +- **`time.Now()`** -- use `workflow.Now(ctx)` instead +- **`time.Sleep()`** -- use `workflow.Sleep(ctx, duration)` instead +- **`math/rand` global** (e.g., `rand.Intn()`) -- use `workflow.SideEffect` instead +- **`crypto/rand.Reader`** -- use an activity instead +- **`os.Stdin` / `os.Stdout` / `os.Stderr`** -- use `workflow.GetLogger(ctx)` for logging +- **Map range iteration** (`for k, v := range myMap`) -- sort keys first, then iterate +- **Mutating global variables** -- use local state or `workflow.SideEffect` +- **Anonymous functions as local activities** -- the name is derived from the function and will be non-deterministic across replays; always use named functions for local activities + +## Safe Builtin Alternatives + +| Instead of | Use | +|---|---| +| `go func() { ... }()` | `workflow.Go(ctx, func(ctx workflow.Context) { ... })` | +| `chan T` | `workflow.NewChannel(ctx)` / `workflow.NewBufferedChannel(ctx, size)` | +| `select { ... }` | `workflow.NewSelector(ctx)` | +| `time.Now()` | `workflow.Now(ctx)` | +| `time.Sleep(d)` | `workflow.Sleep(ctx, d)` | +| `rand.Intn(100)` | `workflow.SideEffect(ctx, func(ctx workflow.Context) interface{} { return rand.Intn(100) })` | +| `uuid.New()` | `workflow.SideEffect` or pass as activity result | +| `log.Println(...)` | `workflow.GetLogger(ctx).Info(...)` | + +## Testing Replay Compatibility + +Use `worker.WorkflowReplayer` to verify code changes are compatible with existing histories. 
See the Workflow Replay Testing section of `references/go/testing.md` + +## Best Practices + +1. Run `workflowcheck ./...` in CI to catch non-deterministic code early +2. Always use `workflow.*` APIs instead of native Go concurrency and time primitives +3. Move all I/O operations (network, filesystem, database) into activities +4. Sort map keys before iterating if you must iterate over a map in workflow code +5. Use `workflow.GetLogger(ctx)` instead of `fmt.Println` or `log.Println` for replay-safe logging +6. Keep workflow code focused on orchestration; delegate non-deterministic work to activities +7. Test with replay after making changes to workflow definitions diff --git a/references/go/error-handling.md b/references/go/error-handling.md new file mode 100644 index 0000000..92a856b --- /dev/null +++ b/references/go/error-handling.md @@ -0,0 +1,184 @@ +# Go SDK Error Handling + +## Overview + +The Go SDK uses error return values (not exceptions). All Temporal errors implement the `error` interface. Activity errors returned to workflows are wrapped in `*temporal.ActivityError`; use `errors.As` to unwrap them. + +## Application Errors + +```go +import "go.temporal.io/sdk/temporal" + +func ValidateOrder(ctx context.Context, order Order) error { + if !order.IsValid() { + return temporal.NewApplicationError( + "Invalid order", + "ValidationError", + ) + } + return nil +} +``` + +`temporal.NewApplicationError(message, errType, details...)` creates a retryable `*temporal.ApplicationError`. Use `NewApplicationErrorWithCause` to include a wrapped cause. 
+ +## Non-Retryable Errors + +```go +func ChargeCard(ctx context.Context, input ChargeCardInput) (string, error) { + if !isValidCard(input.CardNumber) { + return "", temporal.NewNonRetryableApplicationError( + "Permanent failure - invalid credit card", + "PaymentError", + nil, // cause + ) + } + return processPayment(input.CardNumber, input.Amount) +} +``` + +`temporal.NewNonRetryableApplicationError(message, errType, cause, details...)` is always non-retryable regardless of RetryPolicy. You can also mark error types as non-retryable in the RetryPolicy instead: + +```go +RetryPolicy: &temporal.RetryPolicy{ + NonRetryableErrorTypes: []string{"PaymentError", "ValidationError"}, +}, +``` + +## Handling Activity Errors in Workflows + +```go +import ( + "errors" + + enumspb "go.temporal.io/api/enums/v1" + "go.temporal.io/sdk/temporal" + "go.temporal.io/sdk/workflow" +) + +func MyWorkflow(ctx workflow.Context) (string, error) { + var result string + err := workflow.ExecuteActivity(ctx, RiskyActivity).Get(ctx, &result) + if err != nil { + var applicationErr *temporal.ApplicationError + if errors.As(err, &applicationErr) { + switch applicationErr.Type() { + case "ValidationError": + // handle validation error + case "PaymentError": + // handle payment error + default: + // handle unknown error type + } + } + + var timeoutErr *temporal.TimeoutError + if errors.As(err, &timeoutErr) { + switch timeoutErr.TimeoutType() { + case enumspb.TIMEOUT_TYPE_START_TO_CLOSE: + // handle start-to-close timeout + case enumspb.TIMEOUT_TYPE_HEARTBEAT: + // handle heartbeat timeout + } + } + + var canceledErr *temporal.CanceledError + if errors.As(err, &canceledErr) { + // handle cancellation + } + + var panicErr *temporal.PanicError + if errors.As(err, &panicErr) { + // panicErr.Error() and panicErr.StackTrace() + } + + return "", err + } + return result, nil +} +``` + +## Retry Configuration + +```go +import ( + "time" + + "go.temporal.io/sdk/temporal" + "go.temporal.io/sdk/workflow" +) + +func 
MyWorkflow(ctx workflow.Context) 
error { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 10 * time.Minute, + RetryPolicy: &temporal.RetryPolicy{ + InitialInterval: time.Second, + BackoffCoefficient: 2.0, + MaximumInterval: time.Minute, + MaximumAttempts: 5, + NonRetryableErrorTypes: []string{"ValidationError", "PaymentError"}, + }, + } + ctx = workflow.WithActivityOptions(ctx, ao) + return workflow.ExecuteActivity(ctx, MyActivity).Get(ctx, nil) +} +``` + +Only set options such as `MaximumInterval`, `MaximumAttempts`, etc. if you have a domain-specific reason to. If not, prefer to leave them at their defaults. + +## Timeout Configuration + +```go +ao := workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, // Single attempt max duration + ScheduleToCloseTimeout: 30 * time.Minute, // Total time including retries + ScheduleToStartTimeout: 10 * time.Minute, // Time waiting in task queue + HeartbeatTimeout: 2 * time.Minute, // Between heartbeats +} +ctx = workflow.WithActivityOptions(ctx, ao) +``` + +- **StartToCloseTimeout**: Max time for a single Activity Task Execution. Prefer this over ScheduleToCloseTimeout. +- **ScheduleToCloseTimeout**: Total time including retries. +- **ScheduleToStartTimeout**: Time an Activity Task can wait in the Task Queue before a Worker picks it up. Rarely needed. +- **HeartbeatTimeout**: Max time between heartbeats. Required for long-running activities to detect failures. + +Either `StartToCloseTimeout` or `ScheduleToCloseTimeout` must be set. + +## Workflow Failure + +Returning any error from a workflow function fails the execution. Return `nil` for success. + +**Important Go-specific behavior:** In the Go SDK, returning any error from a workflow fails the workflow execution by default — there is no automatic retry. This differs from other SDKs (Python, TypeScript) where non-`ApplicationError` exceptions cause the workflow task to retry indefinitely. 
In Go, if you want workflow-level retries, you must explicitly set a `RetryPolicy` on the `StartWorkflowOptions`. + +```go +func MyWorkflow(ctx workflow.Context) (string, error) { + if someCondition { + return "", temporal.NewApplicationError( + "Cannot process order", + "BusinessError", + ) + } + return "success", nil +} +``` + +To prevent workflow retry, return a non-retryable error: + +```go +return "", temporal.NewNonRetryableApplicationError( + "Unrecoverable failure", + "FatalError", + nil, +) +``` + +**Note:** If an activity returns a non-retryable error, the workflow receives an `*temporal.ActivityError` wrapping it. To fail the workflow without retry, wrap it in a new `NewNonRetryableApplicationError`. + +## Best Practices + +1. Use specific error types for different failure modes +2. Mark permanent failures as non-retryable +3. Set appropriate timeouts; prefer `StartToCloseTimeout` over `ScheduleToCloseTimeout` +4. Let Temporal handle retries via RetryPolicy rather than implementing retry logic yourself +5. Use `errors.As` to unwrap and inspect specific error types +6. Design activities to be idempotent for safe retries (see `references/core/patterns.md`) diff --git a/references/go/go.md b/references/go/go.md new file mode 100644 index 0000000..cc87a6a --- /dev/null +++ b/references/go/go.md @@ -0,0 +1,242 @@ +# Temporal Go SDK Reference + +## Overview + +The Temporal Go SDK (`go.temporal.io/sdk`) provides a strongly-typed, idiomatic Go approach to building durable workflows. Workflows are regular exported Go functions. The Go SDK does not have an automatic sandbox -- determinism is the developer's responsibility, aided by the `workflowcheck` static analysis tool. 
+ +## Quick Start + +**Add Dependency:** In your Go module, add the Temporal SDK: +```bash +go get go.temporal.io/sdk +``` + +**workflows/greeting.go** - Workflow definition: +```go +package workflows + +import ( + "time" + + "go.temporal.io/sdk/workflow" +) + +func GreetingWorkflow(ctx workflow.Context, name string) (string, error) { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: time.Minute, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + var result string + err := workflow.ExecuteActivity(ctx, "Greet", name).Get(ctx, &result) + if err != nil { + return "", err + } + return result, nil +} +``` + +**activities/greet.go** - Activity definition: +```go +package activities + +import ( + "context" + "fmt" +) + +type Activities struct{} + +func (a *Activities) Greet(ctx context.Context, name string) (string, error) { + return fmt.Sprintf("Hello, %s!", name), nil +} +``` + +**worker/main.go** - Worker setup: +```go +package main + +import ( + "log" + + "yourmodule/activities" + "yourmodule/workflows" + + "go.temporal.io/sdk/client" + "go.temporal.io/sdk/worker" +) + +func main() { + c, err := client.Dial(client.Options{}) + if err != nil { + log.Fatalln("Unable to create client", err) + } + defer c.Close() + + w := worker.New(c, "my-task-queue", worker.Options{}) + + w.RegisterWorkflow(workflows.GreetingWorkflow) + w.RegisterActivity(&activities.Activities{}) + + err = w.Run(worker.InterruptCh()) + if err != nil { + log.Fatalln("Unable to start worker", err) + } +} +``` + +**Start the dev server:** Start `temporal server start-dev` in the background. + +**Start the worker:** Run `go run worker/main.go` in the background. 
+ +**starter/main.go** - Start a workflow execution: +```go +package main + +import ( + "context" + "fmt" + "log" + + "yourmodule/workflows" + + "github.com/google/uuid" + "go.temporal.io/sdk/client" +) + +func main() { + c, err := client.Dial(client.Options{}) + if err != nil { + log.Fatalln("Unable to create client", err) + } + defer c.Close() + + options := client.StartWorkflowOptions{ + ID: uuid.NewString(), + TaskQueue: "my-task-queue", + } + + we, err := c.ExecuteWorkflow(context.Background(), options, workflows.GreetingWorkflow, "my name") + if err != nil { + log.Fatalln("Unable to execute workflow", err) + } + + var result string + err = we.Get(context.Background(), &result) + if err != nil { + log.Fatalln("Unable to get workflow result", err) + } + + fmt.Println("Result:", result) +} +``` + +**Run the workflow:** Run `go run starter/main.go`. Should output: `Result: Hello, my name!`. + +## Key Concepts + +### Workflow Definition +- Exported function with `workflow.Context` as the first parameter +- Returns `(ResultType, error)` or just `error` +- Signature: `func MyWorkflow(ctx workflow.Context, input MyInput) (MyOutput, error)` +- Use `workflow.SetQueryHandler()`, `workflow.SetUpdateHandler()` for handlers +- Register with `w.RegisterWorkflow(MyWorkflow)` + +### Activity Definition +- Regular function or struct methods with `context.Context` as the first parameter +- Struct methods are preferred for dependency injection +- Signature: `func (a *Activities) MyActivity(ctx context.Context, input string) (string, error)` +- Register struct with `w.RegisterActivity(&Activities{})` (registers all exported methods) + +### Worker Setup +- Create client with `client.Dial(client.Options{})` +- Create worker with `worker.New(c, "task-queue", worker.Options{})` +- Register workflows and activities +- Run with `w.Run(worker.InterruptCh())` + +### Determinism + +**Workflow code must be deterministic!** The Go SDK has no sandbox -- determinism is enforced by convention 
and tooling. + +Use Temporal replacements instead of native Go constructs: +- `workflow.Go()` instead of `go` (goroutines) +- `workflow.Channel` instead of `chan` +- `workflow.Selector` instead of `select` +- `workflow.Sleep()` instead of `time.Sleep()` +- `workflow.Now()` instead of `time.Now()` +- `workflow.GetLogger()` instead of `log` / `fmt.Println` for replay-safe logging + +Use the **`workflowcheck`** static analysis tool to catch non-deterministic code: +```bash +go install go.temporal.io/sdk/contrib/tools/workflowcheck@latest +workflowcheck ./... +``` + +Read `references/core/determinism.md` and `references/go/determinism.md` to understand more. + +## File Organization Best Practice + +**Use separate packages for workflows, activities, and worker.** Activities as struct methods enable dependency injection at the worker level. + +``` +myapp/ +├── workflows/ +│ └── greeting.go # Only Workflow functions +├── activities/ +│ └── greet.go # Activity struct and methods +├── worker/ +│ └── main.go # Worker setup, imports both +└── starter/ + └── main.go # Client code to start workflows +``` + +**Activities as struct methods for dependency injection:** +```go +// activities/greet.go +type Activities struct { + HTTPClient *http.Client + DB *sql.DB +} + +func (a *Activities) FetchData(ctx context.Context, url string) (string, error) { + // Use a.HTTPClient, a.DB, etc. +} +``` + +```go +// worker/main.go - inject dependencies at worker startup +activities := &activities.Activities{ + HTTPClient: http.DefaultClient, + DB: db, +} +w.RegisterActivity(activities) +``` + +## Common Pitfalls + +1. **Using native goroutines/channels/select** - Use `workflow.Go()`, `workflow.Channel`, `workflow.Selector` +2. **Using `time.Sleep` or `time.Now`** - Use `workflow.Sleep()` and `workflow.Now()` +3. **Iterating over maps with `range`** - Map iteration order is non-deterministic; sort keys first +4. 
**Forgetting to register workflows/activities** - Worker will fail tasks for unregistered types +5. **Registering activity functions instead of struct** - Use `w.RegisterActivity(&Activities{})` not `w.RegisterActivity(a.MyMethod)` +6. **Forgetting to heartbeat** - Long-running activities need `activity.RecordHeartbeat(ctx, details)` +7. **Using `fmt.Println` in workflows** - Use `workflow.GetLogger(ctx)` for replay-safe logging +8. **Not setting Activity timeouts** - `StartToCloseTimeout` or `ScheduleToCloseTimeout` is required in `ActivityOptions` + +## Writing Tests + +See `references/go/testing.md` for info on writing tests. + +## Additional Resources + +### Reference Files +- **`references/go/patterns.md`** - Signals, queries, child workflows, saga pattern, etc. +- **`references/go/determinism.md`** - Determinism rules, workflowcheck tool, safe alternatives +- **`references/go/gotchas.md`** - Go-specific mistakes and anti-patterns +- **`references/go/error-handling.md`** - ApplicationError, retry policies, non-retryable errors +- **`references/go/observability.md`** - Logging, metrics, tracing, Search Attributes +- **`references/go/testing.md`** - TestWorkflowEnvironment, time-skipping, activity mocking +- **`references/go/advanced-features.md`** - Schedules, worker tuning, and more +- **`references/go/data-handling.md`** - Data converters, payload codecs, encryption +- **`references/go/versioning.md`** - Patching API (`workflow.GetVersion`), Worker Versioning +- **`references/go/determinism-protection.md`** - Information on the **`workflowcheck`** tool to help statically check for determinism issues. diff --git a/references/go/gotchas.md b/references/go/gotchas.md new file mode 100644 index 0000000..4b7ddf3 --- /dev/null +++ b/references/go/gotchas.md @@ -0,0 +1,290 @@ +# Go Gotchas + +Go-specific mistakes and anti-patterns. See also [Common Gotchas](references/core/gotchas.md) for language-agnostic concepts. 
+ +## Goroutines and Concurrency + +### Using Native Go Concurrency Primitives + +**The Problem**: Native `go`, `chan`, and `select` are non-deterministic and will cause replay failures. + +```go +// BAD - Native goroutine +func MyWorkflow(ctx workflow.Context) error { + go func() { // Non-deterministic! + // do work + }() + return nil +} + +// GOOD - Use workflow.Go +func MyWorkflow(ctx workflow.Context) error { + workflow.Go(ctx, func(gCtx workflow.Context) { + // do work + }) + return nil +} +``` + +```go +// BAD - Native channel +func MyWorkflow(ctx workflow.Context) error { + ch := make(chan string) // Non-deterministic! + return nil +} + +// GOOD - Use workflow.Channel +func MyWorkflow(ctx workflow.Context) error { + ch := workflow.NewChannel(ctx) + return nil +} +``` + +```go +// BAD - Native select +select { +case val := <-ch1: + // handle +case val := <-ch2: + // handle +} + +// GOOD - Use workflow.Selector +selector := workflow.NewSelector(ctx) +selector.AddReceive(ch1, func(c workflow.ReceiveChannel, more bool) { + var val string + c.Receive(ctx, &val) + // handle +}) +selector.AddReceive(ch2, func(c workflow.ReceiveChannel, more bool) { + var val string + c.Receive(ctx, &val) + // handle +}) +selector.Select(ctx) +``` + +## Non-Deterministic Operations + +### Map Iteration + +```go +// BAD - Map range order is randomized +for k, v := range myMap { + // Non-deterministic order! 
+} + +// GOOD - Sort keys first +keys := make([]string, 0, len(myMap)) +for k := range myMap { + keys = append(keys, k) +} +sort.Strings(keys) +for _, k := range keys { + v := myMap[k] + // Deterministic order +} +``` + +### Time and Randomness + +```go +// BAD +t := time.Now() // System clock, non-deterministic +time.Sleep(time.Second) // Not replay-safe +r := rand.Intn(100) // Non-deterministic + +// GOOD +t := workflow.Now(ctx) // Deterministic +workflow.Sleep(ctx, time.Second) // Durable timer +encoded := workflow.SideEffect(ctx, func(ctx workflow.Context) interface{} { + return rand.Intn(100) +}) +var r int +encoded.Get(&r) +``` + +Use the `workflowcheck` static analysis tool to catch non-deterministic calls. For false positives, annotate with `//workflowcheck:ignore` on the line above. + +### Anonymous Functions as Local Activities + +**The Problem**: The Go SDK derives the local activity name from the function. Anonymous functions get a non-deterministic name that can change across builds, causing replay failures. + +```go +// BAD - anonymous function: name is non-deterministic +workflow.ExecuteLocalActivity(ctx, func(ctx context.Context) (string, error) { + return "result", nil +}) + +// GOOD - named function: stable, deterministic name +func QuickLookup(ctx context.Context) (string, error) { + return "result", nil +} + +workflow.ExecuteLocalActivity(ctx, QuickLookup) +``` + +Always use named functions for local activities (and regular activities). + +## Wrong Retry Classification + +**Example:** Transient network errors should be retried. Authentication errors should not be. +See `references/go/error-handling.md` for detailed guidance on error classification and retry policies. 
+ +## Heartbeating + +### Forgetting to Heartbeat Long Activities + +```go +// BAD - No heartbeat, can't detect stuck activities or receive cancellation +func ProcessLargeFile(ctx context.Context, path string) error { + for _, chunk := range readChunks(path) { + process(chunk) // Takes hours, no heartbeat + } + return nil +} + +// GOOD - Regular heartbeats with progress +func ProcessLargeFile(ctx context.Context, path string) error { + for i, chunk := range readChunks(path) { + activity.RecordHeartbeat(ctx, fmt.Sprintf("Processing chunk %d", i)) + process(chunk) + } + return nil +} +``` + +### Heartbeat Timeout Too Short + +```go +// BAD - Heartbeat timeout shorter than processing time +ao := workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Minute, + HeartbeatTimeout: 10 * time.Second, // Too short! +} + +// GOOD - Heartbeat timeout allows for processing variance +ao := workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Minute, + HeartbeatTimeout: 2 * time.Minute, +} +``` + +Set heartbeat timeout as high as acceptable for your use case -- each heartbeat counts as an action. + +## Cancellation + +### Not Handling Workflow Cancellation + +```go +// BAD - Cleanup doesn't run on cancellation +func BadWorkflow(ctx workflow.Context) error { + _ = workflow.ExecuteActivity(ctx, AcquireResource).Get(ctx, nil) + _ = workflow.ExecuteActivity(ctx, DoWork).Get(ctx, nil) + _ = workflow.ExecuteActivity(ctx, ReleaseResource).Get(ctx, nil) // Never runs if cancelled! 
+ return nil +} + +// GOOD - Use defer with NewDisconnectedContext for cleanup +func GoodWorkflow(ctx workflow.Context) error { + defer func() { + if !errors.Is(ctx.Err(), workflow.ErrCanceled) { + return + } + newCtx, _ := workflow.NewDisconnectedContext(ctx) + _ = workflow.ExecuteActivity(newCtx, ReleaseResource).Get(newCtx, nil) + }() + + err := workflow.ExecuteActivity(ctx, AcquireResource).Get(ctx, nil) + if err != nil { + return err + } + return workflow.ExecuteActivity(ctx, DoWork).Get(ctx, nil) +} +``` + +### Not Handling Activity Cancellation + +Activities must **opt in** to receive cancellation. This requires: +1. **Heartbeating** - Cancellation is delivered via heartbeat +2. **Checking ctx.Done()** - Detect when cancellation arrives + +```go +// BAD - Activity ignores cancellation +func LongActivity(ctx context.Context) error { + doExpensiveWork() // Runs to completion even if cancelled + return nil +} + +// GOOD - Heartbeat and check ctx.Done() +func LongActivity(ctx context.Context) error { + for i, item := range items { + select { + case <-ctx.Done(): + cleanup() + return ctx.Err() + default: + activity.RecordHeartbeat(ctx, fmt.Sprintf("Processing item %d", i)) + process(item) + } + } + return nil +} +``` + +## Testing + +### Not Testing Failures + +It is important to make sure workflows work as expected under failure paths in addition to happy paths. Please see `references/go/testing.md` for more info. + +### Not Testing Replay + +Replay tests help you test that you do not have hidden sources of non-determinism bugs in your workflow code, and should be considered in addition to standard testing. Please see `references/go/testing.md` for more info. + +## Timers and Sleep + +### Using time.Sleep Instead of workflow.Sleep + +```go +// BAD: time.Sleep is not deterministic during replay +func BadWorkflow(ctx workflow.Context) error { + time.Sleep(60 * time.Second) // Non-deterministic! 
+ return nil +} + +// GOOD: Use workflow.Sleep for deterministic timers +func GoodWorkflow(ctx workflow.Context) error { + workflow.Sleep(ctx, 60*time.Second) // Deterministic + return nil +} +``` + +### Using time.After Instead of workflow.NewTimer + +```go +// BAD: time.After is not replay-safe +func BadWorkflow(ctx workflow.Context) error { + <-time.After(5 * time.Minute) // Non-deterministic! + return nil +} + +// GOOD: Use workflow.NewTimer for durable timers +func GoodWorkflow(ctx workflow.Context) error { + timer := workflow.NewTimer(ctx, 5*time.Minute) + _ = timer.Get(ctx, nil) // Deterministic, durable + return nil +} +``` + +### Using time.Now() Instead of workflow.Now() + +```go +// BAD: time.Now() differs between execution and replay +deadline := time.Now().Add(24 * time.Hour) + +// GOOD: workflow.Now() is replay-safe +deadline := workflow.Now(ctx).Add(24 * time.Hour) +``` + +**Why this matters:** `time.Now()`, `time.Sleep()`, and `time.After()` use the system clock, which differs between original execution and replay. The `workflow.*` equivalents create durable, deterministic entries in the event history. diff --git a/references/go/observability.md b/references/go/observability.md new file mode 100644 index 0000000..ba55140 --- /dev/null +++ b/references/go/observability.md @@ -0,0 +1,153 @@ +# Go SDK Observability + +## Overview + +The Go SDK provides replay-safe logging via `workflow.GetLogger`, metrics via the Tally library with Prometheus export, and tracing via OpenTelemetry, OpenTracing, or Datadog. + +## Logging / Replay-Aware Logging + +### Workflow Logging + +Use `workflow.GetLogger(ctx)` for replay-safe logging. This logger automatically suppresses duplicate messages during replay. 
+ +```go +func MyWorkflow(ctx workflow.Context, input string) (string, error) { + logger := workflow.GetLogger(ctx) + logger.Info("Workflow started", "input", input) + + var result string + err := workflow.ExecuteActivity(ctx, MyActivity, input).Get(ctx, &result) + if err != nil { + logger.Error("Activity failed", "error", err) + return "", err + } + + logger.Info("Workflow completed", "result", result) + return result, nil +} +``` + +The workflow logger automatically: +- Suppresses duplicate logs during replay +- Includes workflow context (workflow ID, run ID, etc.) + +### Activity Logging + +Use `activity.GetLogger(ctx)` for context-aware activity logging: + +```go +func MyActivity(ctx context.Context, input string) (string, error) { + logger := activity.GetLogger(ctx) + logger.Info("Processing input", "input", input) + // ... + return "done", nil +} +``` + +Activity logger includes: +- Activity ID, type, and task queue +- Workflow ID and run ID +- Attempt number (for retries) + +### Adding Persistent Fields + +Use `log.With` to create a logger with key-value pairs included in every entry: + +```go +logger := log.With(workflow.GetLogger(ctx), "orderId", orderId, "customerId", customerId) +logger.Info("Processing order") // includes orderId and customerId +``` + +## Customizing the Logger + +Set a custom logger via `client.Options{Logger: myLogger}`. Implement the `log.Logger` interface (Debug, Info, Warn, Error methods). + +### Using slog (Go 1.21+) + +```go +import ( + "log/slog" + "os" + + tlog "go.temporal.io/sdk/log" +) + +slogHandler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelDebug}) +logger := tlog.NewStructuredLogger(slog.New(slogHandler)) + +c, err := client.Dial(client.Options{ + Logger: logger, +}) +``` + +### Using Third-Party Loggers (Logrus, Zap, etc.) 
+ +Use the [logur](https://github.com/logur/logur) adapter package: + +```go +import ( + "github.com/sirupsen/logrus" + logrusadapter "logur.dev/adapter/logrus" + "logur.dev/logur" +) + +logger := logur.LoggerToKV(logrusadapter.New(logrus.New())) +c, err := client.Dial(client.Options{ + Logger: logger, +}) +``` + +## Metrics + +Use the Tally library (`go.temporal.io/sdk/contrib/tally`) with Prometheus: + +```go +import ( + sdktally "go.temporal.io/sdk/contrib/tally" + "github.com/uber-go/tally/v4" + "github.com/uber-go/tally/v4/prometheus" +) + +func newPrometheusScope(c prometheus.Configuration) tally.Scope { + reporter, err := c.NewReporter( + prometheus.ConfigurationOptions{}, + ) + if err != nil { + log.Fatalln("error creating prometheus reporter", err) + } + scopeOpts := tally.ScopeOptions{ + CacheReporter: reporter, + Separator: "_", + SanitizeOptions: &sdktally.PrometheusSanitizeOptions, + } + scope, _ := tally.NewRootScope(scopeOpts, time.Second) + scope = sdktally.NewPrometheusNamingScope(scope) + return scope +} + +c, err := client.Dial(client.Options{ + MetricsHandler: sdktally.NewMetricsHandler(newPrometheusScope(prometheus.Configuration{ + ListenAddress: "0.0.0.0:9090", + TimerType: "histogram", + })), +}) +``` + +Key SDK metrics: +- `temporal_workflow_task_execution_latency` -- Workflow task processing time +- `temporal_activity_execution_latency` -- Activity execution time +- `temporal_workflow_task_replay_latency` -- Replay duration +- `temporal_request` -- Client requests to server +- `temporal_activity_schedule_to_start_latency` -- Time from scheduling to start + +## Search Attributes (Visibility) + +See the Search Attributes section of `references/go/data-handling.md` + +## Best Practices + +1. Always use `workflow.GetLogger(ctx)` in workflows -- never `fmt.Println` or `log.Println` (they produce duplicates on replay) +2. Use `activity.GetLogger(ctx)` in activities for structured context +3. Set up Prometheus metrics in production +4. 
Use search attributes for operational visibility and debugging +5. Use `workflow.IsReplaying(ctx)` only for custom side-effect-free logging -- the built-in logger handles replay suppression automatically diff --git a/references/go/patterns.md b/references/go/patterns.md new file mode 100644 index 0000000..732083f --- /dev/null +++ b/references/go/patterns.md @@ -0,0 +1,536 @@ +# Go SDK Patterns + +## Signals + +In Go, signals are received via channels, not handler functions. + +```go +func OrderWorkflow(ctx workflow.Context) (string, error) { + approved := false + var items []string + + approveCh := workflow.GetSignalChannel(ctx, "approve") + addItemCh := workflow.GetSignalChannel(ctx, "add-item") + + // Listen for signals in a goroutine so workflow can proceed + workflow.Go(ctx, func(ctx workflow.Context) { + for { + selector := workflow.NewSelector(ctx) + selector.AddReceive(approveCh, func(c workflow.ReceiveChannel, more bool) { + c.Receive(ctx, &approved) + }) + selector.AddReceive(addItemCh, func(c workflow.ReceiveChannel, more bool) { + var item string + c.Receive(ctx, &item) + items = append(items, item) + }) + selector.Select(ctx) + } + }) + + // Wait for approval + workflow.Await(ctx, func() bool { return approved }) + return fmt.Sprintf("Processed %d items", len(items)), nil +} +``` + +### Blocking receive from a single channel + +When waiting on a single signal, no Selector is needed: + +```go +var approveInput ApproveInput +workflow.GetSignalChannel(ctx, "approve").Receive(ctx, &approveInput) +``` + +## Queries + +**Important:** Queries must NOT modify workflow state. Query handlers run outside workflow context -- do not call `workflow.Go()`, `workflow.NewChannel()`, or any blocking workflow functions. 
+ +```go +func StatusWorkflow(ctx workflow.Context) error { + currentState := "started" + progress := 0 + + err := workflow.SetQueryHandler(ctx, "get-status", func() (string, error) { + return currentState, nil + }) + if err != nil { + return err + } + + err = workflow.SetQueryHandler(ctx, "get-progress", func() (int, error) { + return progress, nil + }) + if err != nil { + return err + } + + // Workflow logic updates currentState and progress as it runs + currentState = "running" + for i := 0; i < 100; i++ { + progress = i + err := workflow.ExecuteActivity( + workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: time.Minute, + }), + ProcessItem, i, + ).Get(ctx, nil) + if err != nil { + currentState = "failed" + return err + } + } + currentState = "done" + return nil +} +``` + +## Updates + +```go +func OrderWorkflow(ctx workflow.Context) (int, error) { + var items []string + + err := workflow.SetUpdateHandlerWithOptions( + ctx, + "add-item", + func(ctx workflow.Context, item string) (int, error) { + // Handler can mutate workflow state and return a value + items = append(items, item) + return len(items), nil + }, + workflow.UpdateHandlerOptions{ + Validator: func(ctx workflow.Context, item string) error { + if item == "" { + return fmt.Errorf("item cannot be empty") + } + if len(items) >= 100 { + return fmt.Errorf("order is full") + } + return nil + }, + }, + ) + if err != nil { + return 0, err + } + + // Block until cancelled + _ = ctx.Done().Receive(ctx, nil) + return len(items), nil +} +``` + +**Important:** Validators must NOT mutate workflow state or do anything blocking (no activities, sleeps, or other commands). They are read-only, similar to query handlers. Return an error to reject the update; return `nil` to accept. 
+ +## Child Workflows + +```go +func ParentWorkflow(ctx workflow.Context, orders []Order) ([]string, error) { + cwo := workflow.ChildWorkflowOptions{ + WorkflowExecutionTimeout: 30 * time.Minute, + } + ctx = workflow.WithChildOptions(ctx, cwo) + + var results []string + for _, order := range orders { + var result string + err := workflow.ExecuteChildWorkflow(ctx, ProcessOrderWorkflow, order).Get(ctx, &result) + if err != nil { + return nil, err + } + results = append(results, result) + } + return results, nil +} +``` + +### Child Workflow Options + +```go +import enumspb "go.temporal.io/api/enums/v1" + +cwo := workflow.ChildWorkflowOptions{ + WorkflowID: fmt.Sprintf("child-%s", workflow.GetInfo(ctx).WorkflowExecution.ID), + + // ParentClosePolicy - what happens to child when parent closes + // PARENT_CLOSE_POLICY_TERMINATE (default), PARENT_CLOSE_POLICY_ABANDON, PARENT_CLOSE_POLICY_REQUEST_CANCEL + ParentClosePolicy: enumspb.PARENT_CLOSE_POLICY_ABANDON, + + WorkflowExecutionTimeout: 10 * time.Minute, + WorkflowTaskTimeout: time.Minute, +} +ctx = workflow.WithChildOptions(ctx, cwo) + +future := workflow.ExecuteChildWorkflow(ctx, ChildWorkflow, input) + +// Wait for child to start (important for ABANDON policy) +if err := future.GetChildWorkflowExecution().Get(ctx, nil); err != nil { + return err +} +``` + +## Handles to External Workflows + +```go +func CoordinatorWorkflow(ctx workflow.Context, targetWorkflowID string) error { + // Signal an external workflow + err := workflow.SignalExternalWorkflow(ctx, targetWorkflowID, "", "data-ready", payload).Get(ctx, nil) + if err != nil { + return err + } + + // Cancel an external workflow + err = workflow.RequestCancelExternalWorkflow(ctx, targetWorkflowID, "").Get(ctx, nil) + return err +} +``` + +## Parallel Execution + +Start multiple activities without immediately calling `Get` so they run in parallel, then call `Get` on each returned `workflow.Future` to collect the results. Use `workflow.Go` for background goroutines and `workflow.Selector` to react to whichever operation completes first (both covered in later sections). 
+ +```go +func ParallelWorkflow(ctx workflow.Context, items []string) ([]string, error) { + actCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, + }) + + // Launch activities in parallel + futures := make([]workflow.Future, len(items)) + for i, item := range items { + futures[i] = workflow.ExecuteActivity(actCtx, ProcessItem, item) + } + + // Collect all results + results := make([]string, len(items)) + for i, future := range futures { + if err := future.Get(ctx, &results[i]); err != nil { + return nil, err + } + } + return results, nil +} +``` + +### Using workflow.Go for background goroutines + +```go +ch := workflow.NewChannel(ctx) + +workflow.Go(ctx, func(ctx workflow.Context) { + // Background work + var result string + _ = workflow.ExecuteActivity(actCtx, SomeActivity).Get(ctx, &result) + ch.Send(ctx, result) +}) + +var result string +ch.Receive(ctx, &result) +``` + +## Selector Pattern + +`workflow.Selector` replaces Go's native `select` -- required for deterministic workflow execution. Use it to wait on multiple channels, futures, and timers simultaneously. 
+ +```go +func ApprovalWorkflow(ctx workflow.Context) (string, error) { + actCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, + }) + + var outcome string + signalCh := workflow.GetSignalChannel(ctx, "approve") + actFuture := workflow.ExecuteActivity(actCtx, AutoReviewActivity) + + // Cancel timer if signal or activity wins + timerCtx, cancelTimer := workflow.WithCancel(ctx) + timer := workflow.NewTimer(timerCtx, 24*time.Hour) + + selector := workflow.NewSelector(ctx) + + // Branch 1: Signal received + selector.AddReceive(signalCh, func(c workflow.ReceiveChannel, more bool) { + var approved bool + c.Receive(ctx, &approved) + cancelTimer() + if approved { + outcome = "approved-by-signal" + } else { + outcome = "rejected-by-signal" + } + }) + + // Branch 2: Activity completed + selector.AddFuture(actFuture, func(f workflow.Future) { + var result string + _ = f.Get(ctx, &result) + cancelTimer() + outcome = result + }) + + // Branch 3: Timeout + selector.AddFuture(timer, func(f workflow.Future) { + if err := f.Get(ctx, nil); err == nil { + outcome = "timed-out" + } + // If timer was cancelled, err is CanceledError -- ignore + }) + + selector.Select(ctx) // Blocks until one branch fires + return outcome, nil +} +``` + +Key points: +- `AddReceive(channel, callback)` -- fires when a channel has a message (must consume with `c.Receive`) +- `AddFuture(future, callback)` -- fires when a future resolves (once per Selector) +- `AddDefault(callback)` -- fires immediately if nothing else is ready +- `Select(ctx)` -- blocks until one branch fires; call multiple times to process multiple events + +## Continue-as-New + +```go +func LongRunningWorkflow(ctx workflow.Context, state WorkflowState) (string, error) { + for { + state = processBatch(ctx, state) + + if state.IsComplete { + return "done", nil + } + + // Check if history is getting large + if workflow.GetInfo(ctx).GetContinueAsNewSuggested() { + return "", 
workflow.NewContinueAsNewError(ctx, LongRunningWorkflow, state) + } + } +} +``` + +Drain signals before continue-as-new to avoid signal loss: + +```go +for { + var signalVal string + ok := signalChan.ReceiveAsync(&signalVal) + if !ok { + break + } + // process signal +} +return "", workflow.NewContinueAsNewError(ctx, LongRunningWorkflow, state) +``` + +## Cancellation Handling + +Use `ctx.Done()` to detect cancellation and `workflow.NewDisconnectedContext` for cleanup that must run even after cancellation. + +```go +func MyWorkflow(ctx workflow.Context) error { + actCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: time.Hour, + }) + + err := workflow.ExecuteActivity(actCtx, LongRunningActivity).Get(ctx, nil) + if err != nil && temporal.IsCanceledError(ctx.Err()) { + // Workflow was cancelled -- run cleanup with a disconnected context + workflow.GetLogger(ctx).Info("Workflow cancelled, running cleanup") + disconnectedCtx, _ := workflow.NewDisconnectedContext(ctx) + disconnectedCtx = workflow.WithActivityOptions(disconnectedCtx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, + }) + _ = workflow.ExecuteActivity(disconnectedCtx, CleanupActivity).Get(disconnectedCtx, nil) + return err // Return CanceledError + } + return err +} +``` + +## Saga Pattern (Compensations) + +**Important:** Compensation activities should be idempotent -- they may be retried (as with ALL activities). + +Use `workflow.NewDisconnectedContext` when running compensations so they execute even if the workflow is cancelled. + +```go +func OrderWorkflow(ctx workflow.Context, order Order) (string, error) { + actCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, + }) + + var compensations []func(ctx workflow.Context) error + + // Helper to run all compensations in reverse, using a disconnected context + // so compensations run even if the workflow is cancelled. 
+ runCompensations := func() { + disconnectedCtx, _ := workflow.NewDisconnectedContext(ctx) + compCtx := workflow.WithActivityOptions(disconnectedCtx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, + }) + for i := len(compensations) - 1; i >= 0; i-- { + if err := compensations[i](compCtx); err != nil { + workflow.GetLogger(ctx).Error("Compensation failed", "error", err) + } + } + } + + // Register compensation BEFORE running the activity. + // If the activity completes the effect but fails on return, + // we still need the compensation. + compensations = append(compensations, func(ctx workflow.Context) error { + return workflow.ExecuteActivity(ctx, ReleaseInventoryIfReserved, order).Get(ctx, nil) + }) + if err := workflow.ExecuteActivity(actCtx, ReserveInventory, order).Get(ctx, nil); err != nil { + runCompensations() + return "", err + } + + compensations = append(compensations, func(ctx workflow.Context) error { + return workflow.ExecuteActivity(ctx, RefundPaymentIfCharged, order).Get(ctx, nil) + }) + if err := workflow.ExecuteActivity(actCtx, ChargePayment, order).Get(ctx, nil); err != nil { + runCompensations() + return "", err + } + + if err := workflow.ExecuteActivity(actCtx, ShipOrder, order).Get(ctx, nil); err != nil { + runCompensations() + return "", err + } + + return "Order completed", nil +} +``` + +## Wait Condition with Timeout + +```go +func ApprovalWorkflow(ctx workflow.Context) (string, error) { + approved := false + + // Set up signal handler + workflow.Go(ctx, func(ctx workflow.Context) { + workflow.GetSignalChannel(ctx, "approve").Receive(ctx, &approved) + }) + + // Wait with 24-hour timeout -- returns (conditionMet, error) + conditionMet, err := workflow.AwaitWithTimeout(ctx, 24*time.Hour, func() bool { + return approved + }) + if err != nil { + return "", err + } + + if conditionMet { + return "approved", nil + } + return "auto-rejected due to timeout", nil +} +``` + +Without timeout: + +```go +err := workflow.Await(ctx, 
func() bool { return ready }) +``` + +## Waiting for All Handlers to Finish + +Signal and update handlers may run activities asynchronously. Use `workflow.Await` with `workflow.AllHandlersFinished` before completing or continuing-as-new to prevent the workflow from closing while handlers are still running. + +```go +func MyWorkflow(ctx workflow.Context) (string, error) { + // ... register handlers, main workflow logic ... + + // Before exiting, wait for all handlers to finish + err := workflow.Await(ctx, func() bool { + return workflow.AllHandlersFinished(ctx) + }) + if err != nil { + return "", err + } + return "done", nil +} +``` + +## Activity Heartbeat Details + +### WHY: +- **Support activity cancellation** -- Cancellations are delivered via heartbeat; activities that don't heartbeat won't know they've been cancelled +- **Resume progress after worker failure** -- Heartbeat details persist across retries + +### WHEN: +- **Cancellable activities** -- Any activity that should respond to cancellation +- **Long-running activities** -- Track progress for resumability +- **Checkpointing** -- Save progress periodically + +```go +func ProcessLargeFile(ctx context.Context, filePath string) (string, error) { + // Recover from previous attempt + startIdx := 0 + if activity.HasHeartbeatDetails(ctx) { + if err := activity.GetHeartbeatDetails(ctx, &startIdx); err == nil { + startIdx++ // Resume from next item + } + } + + lines := readFileLines(filePath) + + for i := startIdx; i < len(lines); i++ { + processLine(lines[i]) + + // Heartbeat with progress -- if cancelled, ctx will be cancelled + activity.RecordHeartbeat(ctx, i) + + if ctx.Err() != nil { + // Activity was cancelled + cleanup() + return "", ctx.Err() + } + } + + return "completed", nil +} +``` + +## Timers + +```go +func TimerWorkflow(ctx workflow.Context) (string, error) { + // Simple sleep + err := workflow.Sleep(ctx, time.Hour) + if err != nil { + return "", err + } + + // Timer as a Future -- for use with 
Selector + timerCtx, cancelTimer := workflow.WithCancel(ctx) + timer := workflow.NewTimer(timerCtx, 30*time.Minute) + + // Cancel the timer when no longer needed + cancelTimer() + + return "Timer fired", nil +} +``` + +## Local Activities + +**Purpose**: Reduce latency for short, lightweight operations by skipping the task queue. ONLY use these when necessary for performance. Do NOT use these by default, as they are not durable and distributed. + +```go +func MyWorkflow(ctx workflow.Context) (string, error) { + lao := workflow.LocalActivityOptions{ + StartToCloseTimeout: 5 * time.Second, + } + ctx = workflow.WithLocalActivityOptions(ctx, lao) + + var result string + err := workflow.ExecuteLocalActivity(ctx, QuickLookup, "key").Get(ctx, &result) + if err != nil { + return "", err + } + return result, nil +} +``` diff --git a/references/go/testing.md b/references/go/testing.md new file mode 100644 index 0000000..ab74bbd --- /dev/null +++ b/references/go/testing.md @@ -0,0 +1,238 @@ +# Go SDK Testing + +## Overview + +The Go SDK provides the `testsuite` package for testing Workflows and Activities. It uses the [testify](https://github.com/stretchr/testify) library for assertions (`assert`/`require`) and mocking (`mock`). The test environment supports automatic time-skipping for Workflows with timers. + +## Test Environment Setup + +Two approaches: struct-based with `suite.Suite` or function-based with `testsuite.NewTestWorkflowEnvironment()`. 
+ +**Approach 1: Struct-based (testify suite)** + +```go +package sample + +import ( + "testing" + + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/suite" + + "go.temporal.io/sdk/testsuite" +) + +type UnitTestSuite struct { + suite.Suite + testsuite.WorkflowTestSuite + + env *testsuite.TestWorkflowEnvironment +} + +func (s *UnitTestSuite) SetupTest() { + s.env = s.NewTestWorkflowEnvironment() +} + +func (s *UnitTestSuite) AfterTest(suiteName, testName string) { + s.env.AssertExpectations(s.T()) +} + +func (s *UnitTestSuite) Test_MyWorkflow_Success() { + s.env.ExecuteWorkflow(MyWorkflow, "input") + + s.True(s.env.IsWorkflowCompleted()) + s.NoError(s.env.GetWorkflowError()) +} + +func TestUnitTestSuite(t *testing.T) { + suite.Run(t, new(UnitTestSuite)) +} +``` + +**Approach 2: Function-based** + +```go +package sample + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "go.temporal.io/sdk/testsuite" +) + +func Test_MyWorkflow(t *testing.T) { + testSuite := &testsuite.WorkflowTestSuite{} + env := testSuite.NewTestWorkflowEnvironment() + env.RegisterActivity(MyActivity) + + env.ExecuteWorkflow(MyWorkflow, "input") + assert.True(t, env.IsWorkflowCompleted()) + assert.NoError(t, env.GetWorkflowError()) + + var result string + assert.NoError(t, env.GetWorkflowResult(&result)) + assert.Equal(t, "expected", result) +} +``` + +You must register all Activity Definitions used by the Workflow with `env.RegisterActivity(ActivityFunc)`. The Workflow itself does not need to be registered. + +## Activity Mocking + +Mock activities with `env.OnActivity()` to test Workflow logic in isolation. 
+ +**Return mock values:** + +```go +env.OnActivity(MyActivity, mock.Anything, mock.Anything).Return("mock_result", nil) +``` + +**Return a function replacement** (for parameter validation or custom logic): + +```go +env.OnActivity(MyActivity, mock.Anything, mock.Anything).Return( + func(ctx context.Context, input string) (string, error) { + // Custom logic, assertions, etc. + return "computed_result", nil + }, +) +``` + +**Match specific arguments:** + +```go +env.OnActivity(MyActivity, mock.Anything, "specific_input").Return("result", nil) +``` + +When using mocks, you do not need to call `env.RegisterActivity()` for that Activity. The mock signature must match the original Activity function signature. + +## Testing Signals and Queries + +Use `RegisterDelayedCallback` to send Signals during Workflow execution. Use `QueryWorkflow` to test query handlers. + +```go +func (s *UnitTestSuite) Test_SignalsAndQueries() { + // Register a delayed callback to send a signal after 5 seconds + s.env.RegisterDelayedCallback(func() { + s.env.SignalWorkflow("approve", SignalData{Approved: true}) + }, time.Second*5) + + s.env.ExecuteWorkflow(ApprovalWorkflow, input) + + s.True(s.env.IsWorkflowCompleted()) + s.NoError(s.env.GetWorkflowError()) +} +``` + +**Query a running Workflow** (must be called inside `RegisterDelayedCallback` or after `ExecuteWorkflow`): + +```go +s.env.RegisterDelayedCallback(func() { + res, err := s.env.QueryWorkflow("getProgress") + s.NoError(err) + + var progress int + err = res.Get(&progress) + s.NoError(err) + s.Equal(50, progress) +}, time.Second*10+time.Millisecond) +``` + +`QueryWorkflow` returns a `converter.EncodedValue`. Use `.Get(&result)` to decode the value. + +For "Signal-With-Start" testing, set the delay to `0`. 
+ +## Testing Failure Cases + +```go +func (s *UnitTestSuite) Test_WorkflowFailure() { + // Mock activity to return an error + s.env.OnActivity(MyActivity, mock.Anything, mock.Anything).Return( + "", errors.New("activity failed")) + + s.env.ExecuteWorkflow(MyWorkflow, "input") + + s.True(s.env.IsWorkflowCompleted()) + + err := s.env.GetWorkflowError() + s.Error(err) + + var applicationErr *temporal.ApplicationError + s.True(errors.As(err, &applicationErr)) + s.Equal("activity failed", applicationErr.Error()) +} +``` + +`env.GetWorkflowError()` returns the Workflow error. Use `errors.As(err, &applicationErr)` to check the error type. Mock activities returning errors to test Workflow error-handling paths. + +## Replay Testing + +Use `worker.NewWorkflowReplayer()` to verify that code changes do not break determinism. Load history from a JSON file exported via the Temporal CLI or Web UI. + +```go +package sample + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "go.temporal.io/sdk/worker" +) + +func Test_ReplayFromFile(t *testing.T) { + replayer := worker.NewWorkflowReplayer() + replayer.RegisterWorkflow(MyWorkflow) + + err := replayer.ReplayWorkflowHistoryFromJSONFile(nil, "my_workflow_history.json") + assert.NoError(t, err) +} +``` + +Export history via CLI: `temporal workflow show --workflow-id --output json > history.json` + +**Replay from a programmatically fetched history:** + +```go +func Test_ReplayFromServer(t *testing.T) { + // Fetch history from the server + hist, err := GetWorkflowHistory(ctx, client, workflowID, runID) + assert.NoError(t, err) + + replayer := worker.NewWorkflowReplayer() + replayer.RegisterWorkflow(MyWorkflow) + + err = replayer.ReplayWorkflowHistory(nil, hist) + assert.NoError(t, err) +} +``` + +## Activity Testing + +Test Activities in isolation using `TestActivityEnvironment`. No Worker or Workflow needed. 
+ +```go +func Test_MyActivity(t *testing.T) { + testSuite := &testsuite.WorkflowTestSuite{} + env := testSuite.NewTestActivityEnvironment() + env.RegisterActivity(MyActivity) + + val, err := env.ExecuteActivity(MyActivity, "input") + assert.NoError(t, err) + + var result string + assert.NoError(t, val.Get(&result)) + assert.Equal(t, "expected_output", result) +} +``` + +`ExecuteActivity` returns `(converter.EncodedValue, error)`. Use `val.Get(&result)` to extract the typed result. The Activity executes synchronously in the calling goroutine. + +## Best Practices + +1. Register all Activities used by the Workflow with `env.RegisterActivity()`, unless you mock them with `env.OnActivity()` +2. Use mocks to isolate Workflow logic from Activity implementations +3. Test failure paths by mocking Activities that return errors +4. Use replay testing before deploying Workflow code changes to catch non-determinism errors +5. Use unique task queues per test when running integration tests +6. Call `env.AssertExpectations(s.T())` in `AfterTest` to verify all mocks were called diff --git a/references/go/versioning.md b/references/go/versioning.md new file mode 100644 index 0000000..b6b6c27 --- /dev/null +++ b/references/go/versioning.md @@ -0,0 +1,232 @@ +# Go SDK Versioning + +For conceptual overview and guidance on choosing an approach, see `references/core/versioning.md`. + +## GetVersion API + +`workflow.GetVersion` safely performs backwards-incompatible changes to Workflow Definitions. It returns the version to branch on, recording the result as a marker in the Event History. 
+ +```go +v := workflow.GetVersion(ctx, "changeID", workflow.DefaultVersion, maxSupported) +``` + +- `changeID`: unique string identifying the change +- `minSupported`: oldest version still supported (`workflow.DefaultVersion` is `-1`) +- `maxSupported`: current/newest version +- Returns `maxSupported` for new executions; returns the recorded version on replay + +### Three-Step Lifecycle + +**Step 1: Add GetVersion with both code paths** + +Original code calls `ActivityA`. You want to replace it with `ActivityC`: + +```go +v := workflow.GetVersion(ctx, "Step1", workflow.DefaultVersion, 1) +if v == workflow.DefaultVersion { + // Old code path (for replay of existing workflows) + err = workflow.ExecuteActivity(ctx, ActivityA, data).Get(ctx, &result1) +} else { + // New code path + err = workflow.ExecuteActivity(ctx, ActivityC, data).Get(ctx, &result1) +} +``` + +For new executions, `GetVersion` returns `1` and records a marker. For replay of pre-change workflows (no marker), it returns `DefaultVersion` (`-1`). + +**Step 2: Remove old branch (increase minSupported)** + +After all `DefaultVersion` Workflow Executions have completed: + +```go +v := workflow.GetVersion(ctx, "Step1", 1, 1) +// Only the new code path remains +err = workflow.ExecuteActivity(ctx, ActivityC, data).Get(ctx, &result1) +``` + +Keep the `GetVersion` call even with a single branch. This ensures: +1. If an older execution replays on this code, it fails fast instead of proceeding incorrectly +2. 
If you need further changes, you just bump `maxSupported` + +**Step 3: Further changes (bump maxSupported)** + +Later, replace `ActivityC` with `ActivityD`: + +```go +v := workflow.GetVersion(ctx, "Step1", 1, 2) +if v == 1 { + err = workflow.ExecuteActivity(ctx, ActivityC, data).Get(ctx, &result1) +} else { + err = workflow.ExecuteActivity(ctx, ActivityD, data).Get(ctx, &result1) +} +``` + +After all version-1 executions complete, collapse again: + +```go +_ = workflow.GetVersion(ctx, "Step1", 2, 2) +err = workflow.ExecuteActivity(ctx, ActivityD, data).Get(ctx, &result1) +``` + +### Using GetVersion in Loops + +The return value for a given `changeID` is immutable once recorded. In loops, append the iteration number to the `changeID`: + +```go +for i := 0; i < 10; i++ { + v := workflow.GetVersion(ctx, fmt.Sprintf("myChange-%d", i), workflow.DefaultVersion, 1) + if v == workflow.DefaultVersion { + // old path + } else { + // new path + } +} +``` + +## Workflow Type Versioning + +Create a new Workflow Type for incompatible changes: + +```go +// Original +func MyWorkflow(ctx workflow.Context, input Input) (string, error) { + // v1 implementation +} + +// New version +func MyWorkflowV2(ctx workflow.Context, input Input) (string, error) { + // v2 implementation +} +``` + +Register both with the Worker: + +```go +w := worker.New(c, "my-task-queue", worker.Options{}) +w.RegisterWorkflow(MyWorkflow) +w.RegisterWorkflow(MyWorkflowV2) +``` + +Route new executions to the new type. Old workflows continue on the old type. Check for open executions before removing the old type: + +```bash +temporal workflow list --query 'WorkflowType = "MyWorkflow" AND ExecutionStatus = "Running"' +``` + +## Worker Versioning + +Worker Versioning manages versions at the deployment level, allowing multiple Worker versions to run simultaneously. + +### Key Concepts + +**Worker Deployment**: A logical service grouping similar Workers together (e.g., "loan-processor"). 
All versions of your code live under this umbrella. + +**Worker Deployment Version**: A specific snapshot of your code identified by a deployment name and Build ID (e.g., "loan-processor:v1.0" or "loan-processor:abc123"). + +### Configuring Workers for Versioning + +```go +w := worker.New(c, "my-task-queue", worker.Options{ + DeploymentOptions: worker.DeploymentOptions{ + UseVersioning: true, + Version: worker.WorkerDeploymentVersion{ + DeploymentName: "my-service", + BuildId: "v1.0.0", // or git commit hash + }, + DefaultVersioningBehavior: workflow.VersioningBehaviorPinned, + }, +}) +``` + +**Configuration fields:** +- `UseVersioning`: enables Worker Versioning +- `Version`: identifies the Worker Deployment Version (deployment name + build ID) +- `DefaultVersioningBehavior`: `VersioningBehaviorPinned` or `VersioningBehaviorAutoUpgrade` +- Build ID: typically a git commit hash, version number, or timestamp + +### PINNED vs AUTO_UPGRADE Behaviors + +**PINNED Behavior** + +Workflows stay locked to their original Worker version. + +**When to use PINNED:** +- Short-running workflows (minutes to hours) +- Consistency is critical (e.g., financial transactions) +- You want to eliminate version compatibility complexity +- Building new applications and want simplest development experience + +**AUTO_UPGRADE Behavior** + +Workflows can move to newer versions. + +**When to use AUTO_UPGRADE:** +- Long-running workflows (weeks or months) +- Workflows need to benefit from bug fixes during execution +- Migrating from traditional rolling deployments +- You are already using GetVersion for version transitions + +**Important:** AUTO_UPGRADE workflows still need GetVersion to handle version transitions safely since they can move between Worker versions. 
+ +### Worker Configuration with Default Behavior + +```go +// For short-running workflows, prefer PINNED +w := worker.New(c, "orders-task-queue", worker.Options{ + DeploymentOptions: worker.DeploymentOptions{ + UseVersioning: true, + Version: worker.WorkerDeploymentVersion{ + DeploymentName: "order-service", + BuildId: os.Getenv("BUILD_ID"), + }, + DefaultVersioningBehavior: workflow.VersioningBehaviorPinned, + }, +}) +``` + +### Deployment Strategies + +**Blue-Green Deployments** + +Maintain two environments and switch traffic between them: +1. Deploy new code to idle environment +2. Run tests and validation +3. Switch traffic to new environment +4. Keep old environment for instant rollback + +**Rainbow Deployments** + +Multiple versions run simultaneously: +- New workflows use latest version +- Existing workflows complete on their original version +- Add new versions alongside existing ones +- Gradually sunset old versions as workflows complete + +This works well with Kubernetes where you manage multiple ReplicaSets running different Worker versions. + +Deploy a new version, then set it as current: + +```bash +temporal worker deployment set-current-version \ + --deployment-name my-service \ + --build-id v2.0.0 +``` + +### Querying Workflows by Worker Version + +```bash +# Find workflows on a specific Worker version +temporal workflow list --query \ + 'TemporalWorkerDeploymentVersion = "my-service:v1.0.0" AND ExecutionStatus = "Running"' +``` + +## Best Practices + +1. **Keep GetVersion calls** even when only a single branch remains -- it guards against stale replays and simplifies future changes +2. **Use `TemporalChangeVersion` search attribute** to find Workflows running on old versions: + ```bash + temporal workflow list --query \ + 'WorkflowType = "MyWorkflow" AND ExecutionStatus = "Running" AND TemporalChangeVersion = "Step1"' + ``` +3. **Test with replay** before removing old branches to verify determinism is preserved +4. 
**Prefer Worker Versioning** for large-scale deployments to avoid accumulating patching branches diff --git a/references/python/advanced-features.md b/references/python/advanced-features.md new file mode 100644 index 0000000..e0d3297 --- /dev/null +++ b/references/python/advanced-features.md @@ -0,0 +1,166 @@ +# Python SDK Advanced Features + +## Schedules + +Create recurring workflow executions. + +```python +from temporalio.client import ( + Schedule, + ScheduleActionStartWorkflow, + ScheduleSpec, + ScheduleIntervalSpec, +) + +# Create a schedule +schedule_id = "daily-report" +await client.create_schedule( + schedule_id, + Schedule( + action=ScheduleActionStartWorkflow( + DailyReportWorkflow.run, + id="daily-report", + task_queue="reports", + ), + spec=ScheduleSpec( + intervals=[ScheduleIntervalSpec(every=timedelta(days=1))], + ), + ), +) + +# Manage schedules +schedule = client.get_schedule_handle(schedule_id) +await schedule.pause("Maintenance window") +await schedule.unpause() +await schedule.trigger() # Run immediately +await schedule.delete() +``` + +## Async Activity Completion + +For activities that complete asynchronously (e.g., human tasks, external callbacks). +If you configure a heartbeat_timeout on this activity, the external completer is responsible for sending heartbeats via the async handle. +If you do NOT set a heartbeat_timeout, no heartbeats are required. + +**Note:** If the external system that completes the asynchronous action can reliably be trusted to do the task and Signal back with the result, and it doesn't need to Heartbeat or receive Cancellation, then consider using **signals** instead. 
+ +```python +from temporalio import activity +from temporalio.client import Client + +@activity.defn +async def request_approval(request_id: str) -> None: + # Get task token for async completion + task_token = activity.info().task_token + + # Store task token for later completion (e.g., in database) + await store_task_token(request_id, task_token) + + # Mark this activity as waiting for external completion + activity.raise_complete_async() + +# Later, complete the activity from another process +async def complete_approval(request_id: str, approved: bool): + client = await Client.connect("localhost:7233", namespace="default") + task_token = await get_task_token(request_id) + + handle = client.get_async_activity_handle(task_token=task_token) + + # Optional: if a heartbeat_timeout was set, you can periodically: + # await handle.heartbeat(progress_details) + + if approved: + await handle.complete("approved") + else: + # You can also fail or report cancellation via the handle + await handle.fail(ApplicationError("Rejected")) +``` + +## Sandbox Customization + +The Python SDK runs workflows in a sandbox to help you ensure determinism. You can customize sandbox restrictions when needed. See `references/python/determinism-protection.md` + +## Gevent Compatibility Warning + +**The Python SDK is NOT compatible with gevent.** Gevent's monkey patching modifies Python's asyncio event loop in ways that break the SDK's deterministic execution model. + +If your application uses gevent: +- You cannot run Temporal workers in the same process +- Consider running workers in a separate process without gevent +- Use a message queue or HTTP API to communicate between gevent and Temporal processes + +## Worker Tuning + +Configure worker performance settings. 
```python +from concurrent.futures import ThreadPoolExecutor + +worker = Worker( + client, + task_queue="my-queue", + workflows=[MyWorkflow], + activities=[my_activity], + # Workflow task concurrency + max_concurrent_workflow_tasks=100, + # Activity task concurrency + max_concurrent_activities=100, + # Executor for sync activities + activity_executor=ThreadPoolExecutor(max_workers=50), + # Graceful shutdown timeout + graceful_shutdown_timeout=timedelta(seconds=30), +) +``` + +## Workflow Init Decorator + +Use `@workflow.init` to run initialization code when a workflow is first created. + +**Purpose:** Execute setup code before any signal/update handler runs or `run` is invoked. + +```python +@workflow.defn +class MyWorkflow: + @workflow.init + def __init__(self, initial_value: str) -> None: + # Runs before any signal/update handler or run(); like all workflow code, it also re-executes on replay + self._value = initial_value + self._items: list[str] = [] + + @workflow.run + async def run(self) -> str: + # self._value and self._items are already initialized + return self._value +``` + +## Workflow Failure Exception Types + +Control which exceptions cause workflow task failures vs workflow failures. + +- Special case: if you include temporalio.workflow.NondeterminismError (or a superclass), non-determinism errors will fail the workflow instead of leaving it in a retrying state +- **Tip for testing:** Set to `[Exception]` in tests so any unhandled exception fails the workflow immediately rather than retrying the workflow task forever. This surfaces bugs faster. 
+ +### Per-Workflow Configuration + +```python +@workflow.defn( + # These exception types will fail the workflow execution (not just the task) + failure_exception_types=[ValueError, CustomBusinessError] +) +class MyWorkflow: + @workflow.run + async def run(self) -> str: + raise ValueError("This fails the workflow, not just the task") +``` + +### Worker-Level Configuration + +```python +worker = Worker( + client, + task_queue="my-queue", + workflows=[MyWorkflow], + workflow_failure_exception_types=[ValueError, CustomBusinessError], +) +``` + diff --git a/references/python/ai-patterns.md b/references/python/ai-patterns.md new file mode 100644 index 0000000..a07e30a --- /dev/null +++ b/references/python/ai-patterns.md @@ -0,0 +1,334 @@ +# Python AI/LLM Integration Patterns + +## Overview + +This document provides Python-specific implementation details for integrating LLMs with Temporal. For conceptual patterns, see `references/core/ai-integration.md`. + +## Pydantic Data Converter Setup + +**Required** for handling complex types like OpenAI response objects: + +```python +from temporalio.client import Client +from temporalio.contrib.pydantic import pydantic_data_converter + +client = await Client.connect( + "localhost:7233", + namespace="default", + data_converter=pydantic_data_converter, +) +``` + +## OpenAI Client Configuration + +**Critical**: Disable client retries, let Temporal handle them: + +```python +from openai import AsyncOpenAI + +openai_client = AsyncOpenAI( + api_key=os.getenv("OPENAI_API_KEY"), + max_retries=0, # CRITICAL: Disable client retries + timeout=30.0, +) +``` + +## LiteLLM Configuration + +For multi-model support: + +```python +import litellm + +litellm.num_retries = 0 # Disable LiteLLM retries +``` + +## Generic LLM Activity + +Flexible, reusable activity for LLM calls: + +```python +import openai +from temporalio import activity +from temporalio.exceptions import ApplicationError +from pydantic import BaseModel +from typing import Optional, 
Any + +class LLMRequest(BaseModel): + model: str + system_prompt: str + user_input: str + tools: Optional[list] = None + response_format: Optional[type] = None + temperature: float = 0.7 + +class LLMResponse(BaseModel): + content: str + tool_calls: Optional[list] = None + usage: dict + +@activity.defn +async def call_llm(request: LLMRequest) -> LLMResponse: + """Generic LLM activity supporting multiple use cases.""" + try: + # As an example, calling OpenAI. This could be any chat API you wish though... + response = await openai_client.chat.completions.create( + model=request.model, + messages=[ + {"role": "system", "content": request.system_prompt}, + {"role": "user", "content": request.user_input}, + ], + tools=request.tools, + temperature=request.temperature, + ) + return LLMResponse( + content=response.choices[0].message.content or "", + tool_calls=response.choices[0].message.tool_calls, + usage=response.usage.model_dump(), + ) + + # Some example error cases to handle. These are not necessarily exhaustive, and depend on the API you are actually calling! + except openai.AuthenticationError as e: + # Invalid API key - permanent failure, don't retry + raise ApplicationError( + f"Invalid API key: {e}", + type="AuthenticationError", + non_retryable=True, + ) + + except openai.RateLimitError as e: + # Rate limited - transient, let Temporal retry with backoff + raise ApplicationError( + f"Rate limited: {e}", + type="RateLimitError", + next_retry_delay=... # parse this from headers + ) + + except openai.APIStatusError as e: + if e.status_code >= 500: + # Server error - transient, retry + raise ApplicationError( + f"OpenAI server error ({e.status_code}): {e}", + type="ServerError", + ) + else: + # Other client errors (400, etc.) 
- likely permanent + raise ApplicationError( + f"OpenAI client error ({e.status_code}): {e}", + type="ClientError", + non_retryable=True, + ) + + except openai.APIConnectionError as e: + # Network error - transient, retry + raise ApplicationError( + f"Connection error: {e}", + type="ConnectionError", + ) +``` + +## Activity Retry Policy + +Configure retries at the workflow level: + +```python +from datetime import timedelta +from temporalio import workflow +from temporalio.common import RetryPolicy + +with workflow.unsafe.imports_passed_through(): + from activities.llm import call_llm, LLMRequest + +@workflow.defn +class LLMWorkflow: + @workflow.run + async def run(self, prompt: str) -> str: + # Note that because call_llm classifies different types of exceptions as retryable / non-retryable, + # we automatically get correct retry behavior just by calling it. + response = await workflow.execute_activity( + call_llm, + LLMRequest( + model="gpt-4", + system_prompt="You are a helpful assistant.", + user_input=prompt, + ), + start_to_close_timeout=timedelta(seconds=30), + ) + return response.content +``` + +## Tool-Calling Agent Workflow + +```python +from temporalio import workflow +from datetime import timedelta +from pydantic import BaseModel + +with workflow.unsafe.imports_passed_through(): + from activities.llm import call_llm, LLMRequest, LLMResponse + from activities.tools import execute_tool + from models.tools import ToolDefinition + +class AgentWorkflowInput(BaseModel): + user_request: str + tools: list[ToolDefinition] + +@workflow.defn +class AgentWorkflow: + @workflow.run + async def run(self, input: AgentWorkflowInput) -> str: + messages = [] + current_input = input.user_request + + while True: + # Phase 1: Get LLM response with tools + response = await workflow.execute_activity( + call_llm, + LLMRequest( + model="gpt-4", + system_prompt="You are a helpful agent with tools.", + user_input=current_input, + tools=[t.to_openai_format() for t in input.tools], + 
), + start_to_close_timeout=timedelta(seconds=30), + ) + + # Check if LLM wants to use a tool + if not response.tool_calls: + return response.content + + # Phase 2: Execute tools + for tool_call in response.tool_calls: + tool_result = await workflow.execute_activity( + execute_tool, + tool_call, + start_to_close_timeout=timedelta(seconds=60), + ) + messages.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "content": tool_result, + }) + + # Phase 3: Continue conversation with tool results + current_input = f"Tool results: {messages}" +``` + +## Structured Outputs + +Using Pydantic for validated responses: + +```python +from pydantic import BaseModel +from temporalio import activity + +class AnalysisResult(BaseModel): + sentiment: str + confidence: float + key_topics: list[str] + summary: str + +@activity.defn +async def analyze_text(text: str) -> AnalysisResult: + response = await openai_client.beta.chat.completions.parse( + model="gpt-4o", + messages=[ + {"role": "system", "content": "Analyze the following text."}, + {"role": "user", "content": text}, + ], + response_format=AnalysisResult, + ) + return response.choices[0].message.parsed +``` + +## Multi-Agent Pipeline (Deep Research) + +```python +from temporalio import workflow +from datetime import timedelta +import asyncio + +with workflow.unsafe.imports_passed_through(): + from activities.research import ( + generate_subtopics, + generate_search_queries, + search_web, + synthesize_report, + ) + +@workflow.defn +class DeepResearchWorkflow: + @workflow.run + async def run(self, topic: str) -> str: + # Phase 1: Planning + subtopics = await workflow.execute_activity( + generate_subtopics, + topic, + start_to_close_timeout=timedelta(seconds=60), + ) + + # Phase 2: Query Generation + queries = await workflow.execute_activity( + generate_search_queries, + subtopics, + start_to_close_timeout=timedelta(seconds=60), + ) + + # Phase 3: Parallel Web Search (resilient to partial failures) + search_tasks = [ + 
workflow.execute_activity( + search_web, + query, + start_to_close_timeout=timedelta(seconds=300), + schedule_to_close_timeout=timedelta(seconds=900), # We set a schedule to close timeout, so that if one search task repeatedly fails, then it won't hang up all the rest, in the below gather step. + ) + for query in queries + ] + + # Continue with partial results on failure + results = await asyncio.gather(*search_tasks, return_exceptions=True) + successful_results = [r for r in results if not isinstance(r, Exception)] + + # Phase 4: Synthesis + report = await workflow.execute_activity( + synthesize_report, + {"topic": topic, "research": successful_results}, + start_to_close_timeout=timedelta(seconds=300), + ) + + return report +``` + +## OpenAI Agents SDK Integration + +If using the OpenAI Agents SDK to create an agent, use Temporal's OpenAI contrib module to create a Temporal-aware durable agent: + +```python +from temporalio import workflow +from temporalio.contrib.openai import create_workflow_agent +from agents import Agent, Runner + +@workflow.defn +class DurableAgentWorkflow: + @workflow.run + async def run(self, task: str) -> str: + # Create a Temporal-aware agent + agent = create_workflow_agent( + model="gpt-4", + tools=[search_tool, calculator_tool], + ) + # Run it. Under the hood, the agent automatically dispatches to activities for LLM calls, etc. + result = await agent.run(task) + return result.output +``` + +## Best Practices + +1. **Always use Pydantic data converter** for complex types +2. **Disable retries in LLM clients** (max_retries=0) +3. **Set appropriate timeouts** per operation type +4. **Use structured outputs** for type safety +5. **Handle partial failures** in parallel operations +6. **Mock activities in tests** for fast, deterministic testing +7. **Log token usage** for cost tracking +8.
**Version prompts** in code for reproducibility diff --git a/references/python/data-handling.md b/references/python/data-handling.md new file mode 100644 index 0000000..662101e --- /dev/null +++ b/references/python/data-handling.md @@ -0,0 +1,230 @@ +# Python SDK Data Handling + +## Overview + +The Python SDK uses data converters to serialize/deserialize workflow inputs, outputs, and activity parameters. + +## Default Data Converter + +The default converter handles: +- `None` +- `bytes` (as binary) +- Protobuf messages +- JSON-serializable types (dict, list, str, int, float, bool) + +## Pydantic Integration + +Use Pydantic models for validated, typed data. + +In your workflow definition, just use input and result types that subclass `pydantic.BaseModel`: + +```python +from pydantic import BaseModel + +class OrderInput(BaseModel): + order_id: str + items: list[str] + total: float + customer_email: str + +class OrderResult(BaseModel): + order_id: str + status: str + tracking_number: str | None = None + +@workflow.defn +class OrderWorkflow: + @workflow.run + async def run(self, input: OrderInput) -> OrderResult: + # Pydantic validation happens automatically + return OrderResult( + order_id=input.order_id, + status="completed", + tracking_number="TRK123", + ) +``` + +And when you configure the client, pass the `pydantic_data_converter`: + +```python +from temporalio.contrib.pydantic import pydantic_data_converter +# Configure client with Pydantic support +client = await Client.connect( + "localhost:7233", + namespace="default", + data_converter=pydantic_data_converter, +) +``` + +## Custom Data Conversion + +Usually the easiest way to do this is via implementing an EncodingPayloadConverter and CompositePayloadConverter. See: +- https://raw.githubusercontent.com/temporalio/samples-python/refs/heads/main/custom_converter/shared.py +- https://raw.githubusercontent.com/temporalio/samples-python/refs/heads/main/custom_converter/starter.py + +for an extended example. 
+ +## Payload Encryption + +Encrypt sensitive workflow data. + +```python +from temporalio.converter import PayloadCodec +from temporalio.api.common.v1 import Payload +from cryptography.fernet import Fernet +from typing import Sequence + +class EncryptionCodec(PayloadCodec): + def __init__(self, key: bytes): + self._fernet = Fernet(key) + + async def encode(self, payloads: Sequence[Payload]) -> list[Payload]: + return [ + Payload( + metadata={"encoding": b"binary/encrypted"}, + # Since encryption uses C extensions that give up the GIL, we can avoid blocking the async event loop here. + data=await asyncio.to_thread(self._fernet.encrypt, p.SerializeToString()), + ) + for p in payloads + ] + + async def decode(self, payloads: Sequence[Payload]) -> list[Payload]: + result = [] + for p in payloads: + if p.metadata.get("encoding") == b"binary/encrypted": + decrypted = await asyncio.to_thread(self._fernet.decrypt, p.data) + decoded = Payload() + decoded.ParseFromString(decrypted) + result.append(decoded) + else: + result.append(p) + return result + +# Apply encryption codec +client = await Client.connect( + "localhost:7233", + namespace="default", + data_converter=DataConverter( + payload_codec=EncryptionCodec(encryption_key), + ), +) +``` + +## Search Attributes + +Custom searchable fields for workflow visibility. 
These can be created at workflow start: + +```python +from temporalio.common import ( + SearchAttributeKey, + SearchAttributePair, + TypedSearchAttributes, +) +from datetime import datetime +from datetime import timezone + +ORDER_ID = SearchAttributeKey.for_keyword("OrderId") +ORDER_STATUS = SearchAttributeKey.for_keyword("OrderStatus") +ORDER_TOTAL = SearchAttributeKey.for_float("OrderTotal") +CREATED_AT = SearchAttributeKey.for_datetime("CreatedAt") + +# At workflow start +handle = await client.start_workflow( + OrderWorkflow.run, + order, + id=f"order-{order.id}", + task_queue="orders", + search_attributes=TypedSearchAttributes([ + SearchAttributePair(ORDER_ID, order.id), + SearchAttributePair(ORDER_STATUS, "pending"), + SearchAttributePair(ORDER_TOTAL, order.total), + SearchAttributePair(CREATED_AT, datetime.now(timezone.utc)), + ]), +) +``` + +Or upserted during workflow execution: + +```python +from temporalio import workflow +from temporalio.common import SearchAttributeKey, SearchAttributePair, TypedSearchAttributes + +ORDER_STATUS = SearchAttributeKey.for_keyword("OrderStatus") + +@workflow.defn +class OrderWorkflow: + @workflow.run + async def run(self, order: Order) -> str: + # ... process order ... + + # Update search attribute + workflow.upsert_search_attributes(TypedSearchAttributes([ + SearchAttributePair(ORDER_STATUS, "completed"), + ])) + return "done" +``` + +### Querying Workflows by Search Attributes + +```python +# List workflows using search attributes +async for workflow in client.list_workflows( + 'OrderStatus = "processing" OR OrderStatus = "pending"' +): + print(f"Workflow {workflow.id} is still processing") +``` + +## Workflow Memo + +Store arbitrary metadata with workflows (not searchable). 
+ +```python +# Set memo at workflow start +await client.execute_workflow( + OrderWorkflow.run, + order, + id=f"order-{order.id}", + task_queue="orders", + memo={ + "customer_name": order.customer_name, + "notes": "Priority customer", + }, +) +``` + +```python +# Read memo from workflow +@workflow.defn +class OrderWorkflow: + @workflow.run + async def run(self, order: Order) -> str: + notes: str = workflow.memo_value("notes", type_hint=str) + ... +``` + +## Deterministic APIs for Values + +Use these APIs within workflows for deterministic random values and UUIDs: + +```python +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self) -> str: + # Deterministic UUID (same on replay) + unique_id = workflow.uuid4() + + # Deterministic random (same on replay) + rng = workflow.random() + value = rng.randint(1, 100) + + return str(unique_id) +``` + +## Best Practices + +1. Use Pydantic for input/output validation +2. Keep payloads small—see `references/core/gotchas.md` for limits +3. Encrypt sensitive data with PayloadCodec +4. Use dataclasses for simple data structures +5. Use `workflow.uuid4()` and `workflow.random()` for deterministic values diff --git a/references/python/determinism-protection.md b/references/python/determinism-protection.md new file mode 100644 index 0000000..1376ced --- /dev/null +++ b/references/python/determinism-protection.md @@ -0,0 +1,233 @@ +# Python Workflow Sandbox + +## Overview + +The Python SDK runs workflows in a sandbox that provides automatic protection against non-deterministic operations. This is unique to the Python SDK. 
+ +## How the Sandbox Works + +The sandbox: +- Isolates global state via `exec` compilation +- Restricts non-deterministic library calls via proxy objects +- Passes through standard library with restrictions +- Reloads workflow files on each execution + +## Forbidden Operations + +These operations will fail in the sandbox: + +- **Direct I/O**: Network calls, file reads/writes +- **Threading**: `threading` module operations +- **Subprocess**: `subprocess` calls +- **Global state**: Modifying mutable global variables +- **Blocking sleep**: `time.sleep()` (use `workflow.sleep(timedelta(...))`) + +## Pass-Through Pattern + +Third-party libraries that aren't sandbox-aware need explicit pass-through: + +```python +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + import pydantic + from my_module import my_dataclass +``` + +**When to use pass-through:** +- Data classes and models (Pydantic, dataclasses) +- Serialization libraries +- Type definitions +- Any library that doesn't do I/O or non-deterministic operations +- Performance, as many non-passthrough imports can be slower + +**Note:** The imports, even when using `imports_passed_through`, should all be at the top of the file. Runtime imports are an anti-pattern. 
+ +## Importing Activities + +Activities should be imported through pass-through since they're defined outside the sandbox: + +```python +# workflows/order.py +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities.payment import process_payment + from activities.shipping import ship_order + +@workflow.defn +class OrderWorkflow: + @workflow.run + async def run(self, order_id: str) -> str: + await workflow.execute_activity( + process_payment, + order_id, + start_to_close_timeout=timedelta(minutes=5), + ) + return await workflow.execute_activity( + ship_order, + order_id, + start_to_close_timeout=timedelta(minutes=10), + ) +``` + +## Disabling the Sandbox + +```python +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self) -> str: + with workflow.unsafe.sandbox_unrestricted(): + # Unrestricted code block + pass + return "result" +``` + +- Per‑block escape hatch from runtime restrictions; imports unchanged. +- Use when: You need to call something the sandbox would normally block (e.g., a restricted stdlib call) in a very small, controlled section. +- **IMPORTANT:** Use it sparingly; you lose determinism checks inside the block +- Genuinely non-deterministic code still *MUST* go into activities. + +## Customizing Invalid Module Members + +`invalid_module_members` includes modules that cannot be accessed. + +Checks are compared against the fully qualified path to the item. 
+ +```python +import dataclasses +from temporalio.worker import Worker +from temporalio.worker.workflow_sandbox import ( + SandboxedWorkflowRunner, + SandboxMatcher, + SandboxRestrictions, +) + +# Example 1: Remove a restriction on datetime.date.today(): +restrictions = dataclasses.replace( + SandboxRestrictions.default, + invalid_module_members=SandboxRestrictions.invalid_module_members_default.with_child_unrestricted( + "datetime", "date", "today", + ), +) + +# Example 2: Restrict the datetime.date class from being used +restrictions = dataclasses.replace( + SandboxRestrictions.default, + invalid_module_members=SandboxRestrictions.invalid_module_members_default | SandboxMatcher( + children={"datetime": SandboxMatcher(use={"date"})}, + ), +) + +worker = Worker( + ..., + workflow_runner=SandboxedWorkflowRunner(restrictions=restrictions), +) +``` + +## Import Notification Policy + +Control warnings/errors for sandbox import issues. Recommended for catching potential problems: + +```python +from temporalio import workflow +from temporalio.worker.workflow_sandbox import SandboxedWorkflowRunner, SandboxRestrictions + +restrictions = SandboxRestrictions.default.with_import_notification_policy( + workflow.SandboxImportNotificationPolicy.WARN_ON_DYNAMIC_IMPORT + | workflow.SandboxImportNotificationPolicy.WARN_ON_UNINTENTIONAL_PASSTHROUGH +) + +worker = Worker( + ..., + workflow_runner=SandboxedWorkflowRunner(restrictions=restrictions), +) +``` + +- `WARN_ON_DYNAMIC_IMPORT` (default) - warns on imports after initial workflow load +- `WARN_ON_UNINTENTIONAL_PASSTHROUGH` - warns when modules are imported into sandbox without explicit passthrough (not default, but highly recommended for catching missing passthroughs) +- `RAISE_ON_UNINTENTIONAL_PASSTHROUGH` - raise instead of warn + +Override per-import with the context manager: + +```python +with workflow.unsafe.sandbox_import_notification_policy( + workflow.SandboxImportNotificationPolicy.SILENT +): + import pydantic # No 
warning for this import +``` + +## Disable Lazy sys.modules Passthrough + +By default, passthrough modules are lazily added to the sandbox's `sys.modules` when accessed. To require explicit imports: + +```python +import dataclasses +from temporalio.worker.workflow_sandbox import SandboxedWorkflowRunner, SandboxRestrictions + +restrictions = dataclasses.replace( + SandboxRestrictions.default, + disable_lazy_sys_module_passthrough=True, +) + +worker = Worker( + ..., + workflow_runner=SandboxedWorkflowRunner(restrictions=restrictions), +) +``` + +When `True`, passthrough modules must be explicitly imported to appear in the sandbox's `sys.modules`. + +## File Organization + +**Critical**: Keep workflow definitions in separate files from activity definitions. + +The sandbox reloads workflow definition files on every execution. Minimizing file contents improves Worker performance. + +``` +my_temporal_app/ +├── workflows/ +│ └── order.py # Only workflow classes +├── activities/ +│ └── payment.py # Only activity functions +├── models/ +│ └── order.py # Shared data models +├── worker.py # Worker setup, imports both +└── starter.py # Client code +``` + +## Common Issues + +### Import Errors + +``` +Error: Cannot import 'pydantic' in sandbox +``` + +**Fix**: Use pass-through: + +```python +with workflow.unsafe.imports_passed_through(): + import pydantic +``` + +### Non-Determinism from Libraries + +Some libraries do internal caching or use current time: + +```python +# May cause non-determinism +import some_library +result = some_library.cached_operation() # Cache changes between replays +``` + +**Fix**: Move to activity or use pass-through with caution. + +## Best Practices + +1. **Separate workflow and activity files** for performance +2. **Use pass-through explicitly** for third-party libraries +3. **Keep workflow files small** to minimize reload time +4. **Move I/O to activities** always +5. 
**Test with replay** to catch sandbox issues early diff --git a/references/python/determinism.md b/references/python/determinism.md new file mode 100644 index 0000000..7276360 --- /dev/null +++ b/references/python/determinism.md @@ -0,0 +1,51 @@ +# Python SDK Determinism + +## Overview + +The Python SDK runs workflows in a sandbox that provides automatic protection against many non-deterministic operations. + +## Why Determinism Matters: History Replay + +Temporal provides durable execution through **History Replay**. When a Worker needs to restore workflow state (after a crash, cache eviction, or to continue after a long timer), it re-executes the workflow code from the beginning, which requires the workflow code to be **deterministic**. + +## Forbidden Operations + +- Direct I/O (network, filesystem) +- Threading operations +- `subprocess` calls +- Global mutable state modification +- `time.sleep()` (use `workflow.sleep(timedelta(...))`) +- and so on + +## Safe Builtin Alternatives to Common Non Deterministic Things + +| Forbidden | Safe Alternative | +|-----------|------------------| +| `datetime.now()` | `workflow.now()` | +| `datetime.utcnow()` | `workflow.now()` | +| `random.random()` | `rng = workflow.new_random() ; rng.randint(1, 100)` | +| `uuid.uuid4()` | `workflow.uuid4()` | +| `time.time()` | `workflow.now().timestamp()` | + +## Testing Replay Compatibility + +Use the `Replayer` class to verify your code changes are compatible with existing histories. See the Workflow Replay Testing section of `references/python/testing.md`. + +## Sandbox Behavior + +The sandbox: +- Isolates global state via `exec` compilation +- Restricts non-deterministic library calls via proxy objects +- Passes through standard library with restrictions + +See more info at `references/python/determinism-protection.md` + +## Best Practices + +1. Use `workflow.now()` for all time operations +2. Use `workflow.random()` for random values +3. 
Use `workflow.uuid4()` for unique identifiers +4. Pass through third-party libraries explicitly +5. Test with replay to catch non-determinism +6. Keep workflows focused on orchestration, delegate I/O to activities +7. Use `workflow.logger` instead of print() for replay-safe logging diff --git a/references/python/error-handling.md b/references/python/error-handling.md new file mode 100644 index 0000000..19460cb --- /dev/null +++ b/references/python/error-handling.md @@ -0,0 +1,138 @@ +# Python SDK Error Handling + +## Overview + +The Python SDK uses `ApplicationError` for application-specific errors and provides comprehensive retry policy configuration. Generally, the following information about errors and retryability applies across activities, child workflows and Nexus operations. + +## Application Errors + +```python +from temporalio import activity +from temporalio.exceptions import ApplicationError + +@activity.defn +async def validate_order(order: Order) -> None: + if not order.is_valid(): + raise ApplicationError( + "Invalid order", + type="ValidationError", + ) +``` + +## Non-Retryable Errors + +```python +from dataclasses import dataclass +from temporalio import activity +from temporalio.exceptions import ApplicationError + +@dataclass +class ChargeCardInput: + card_number: str + amount: float + +@activity.defn +async def charge_card(input: ChargeCardInput) -> str: + if not is_valid_card(input.card_number): + raise ApplicationError( + "Permanent failure - invalid credit card", + type="PaymentError", + non_retryable=True, # Will not retry activity + ) + return await process_payment(input.card_number, input.amount) +``` + +## Handling Activity Errors + +```python +from datetime import timedelta +from temporalio import workflow +from temporalio.exceptions import ActivityError, ApplicationError + +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self) -> str: + try: + return await workflow.execute_activity( + risky_activity, + 
start_to_close_timeout=timedelta(minutes=5), + ) + except ActivityError as e: + workflow.logger.error(f"Activity failed: {e}") + # Handle or re-raise + raise ApplicationError("Workflow failed due to activity error") +``` + +## Retry Policy Configuration + +```python +from datetime import timedelta +from temporalio import workflow +from temporalio.common import RetryPolicy + +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self) -> str: + result = await workflow.execute_activity( + my_activity, + start_to_close_timeout=timedelta(minutes=10), + retry_policy=RetryPolicy( + maximum_interval=timedelta(minutes=1), + maximum_attempts=5, + non_retryable_error_types=["ValidationError", "PaymentError"], + ), + ) + return result +``` + +Only set options such as maximum_interval, maximum_attempts etc. if you have a domain-specific reason to. +If not, prefer to leave them at their defaults. + +## Timeout Configuration + +```python +from datetime import timedelta +from temporalio import workflow + +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self) -> str: + return await workflow.execute_activity( + my_activity, + start_to_close_timeout=timedelta(minutes=5), # Single attempt + schedule_to_close_timeout=timedelta(minutes=30), # Including retries + heartbeat_timeout=timedelta(minutes=2), # Between heartbeats + ) +``` + +## Workflow Failure + +```python +from temporalio import workflow +from temporalio.exceptions import ApplicationError + +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self) -> str: + if some_condition: + raise ApplicationError( + "Cannot process order", + type="BusinessError", + ) + return "success" +``` + +**Note:** Do not use `non_retryable=` with `ApplicationError` inside a workflow (as opposed to an activity). + +## Best Practices + +1. Use specific error types for different failure modes +2. Mark permanent failures as non-retryable +3. Configure appropriate retry policies +4.
Log errors before re-raising +5. Use `ActivityError` to catch activity failures in workflows +6. Design code to be idempotent for safe retries (see more at `references/core/patterns.md`) diff --git a/references/python/gotchas.md b/references/python/gotchas.md new file mode 100644 index 0000000..95ebe8a --- /dev/null +++ b/references/python/gotchas.md @@ -0,0 +1,280 @@ +# Python Gotchas + +Python-specific mistakes and anti-patterns. See also [Common Gotchas](references/core/gotchas.md) for language-agnostic concepts. + +## File Organization + +### Importing Activities into Workflow Files + +**The Problem**: The Python sandbox reloads workflow files on every task. Importing heavy activity modules slows down workers. + +```python +# BAD - activities.py gets reloaded constantly +# workflows.py +from activities import my_activity + +@workflow.defn +class MyWorkflow: + pass + +# GOOD - Pass-through import +# workflows.py +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities import my_activity + +@workflow.defn +class MyWorkflow: + pass +``` + +`references/python/determinism-protection.md` contains more info about the Python sandbox. + +### Mixing Workflows and Activities + +```python +# BAD - Everything in one file +# app.py +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self): + await workflow.execute_activity(my_activity, ...) + +@activity.defn +async def my_activity(): + # Heavy imports, I/O, etc. + pass + +# GOOD - Separate files +# workflows.py +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self): + await workflow.execute_activity(my_activity, ...) + +# activities.py +@activity.defn +async def my_activity(): + pass +``` + +## Async vs Sync Activities + +The Temporal Python SDK supports both async and sync activities. See `references/python/sync-vs-async.md` to understand which to choose. Below are important anti-patterns for both async and sync activities.
+ +### Blocking in Async Activities + +```python +# BAD - Blocks the event loop +@activity.defn +async def process_file(path: str) -> str: + with open(path) as f: # Blocking I/O in async! + return f.read() + +# GOOD Option 1 - Use sync activity with executor +@activity.defn +def process_file(path: str) -> str: + with open(path) as f: + return f.read() + +# Register with executor in worker +Worker( + client, + task_queue="my-queue", + activities=[process_file], + activity_executor=ThreadPoolExecutor(max_workers=10), +) + +# GOOD Option 2 - Use async I/O +@activity.defn +async def process_file(path: str) -> str: + async with aiofiles.open(path) as f: + return await f.read() +``` + +### Missing Executor for Sync Activities + +```python +# BAD - Sync activity REQUIRES executor +@activity.defn +def slow_computation(data: str) -> str: + return heavy_cpu_work(data) + +Worker( + client, + task_queue="my-queue", + activities=[slow_computation], + # Missing activity_executor! --> THIS IMMEDIATELY RAISES AN EXCEPTION! +) + +# GOOD - Provide executor +Worker( + client, + task_queue="my-queue", + activities=[slow_computation], + activity_executor=ThreadPoolExecutor(max_workers=10), +) +``` + +## Wrong Retry Classification + +**Example:** Transient network errors should be retried. Authentication errors should not be. +See `references/python/error-handling.md` to understand how to classify errors.
+ +## Heartbeating + +### Forgetting to Heartbeat Long Activities + +```python +# BAD - No heartbeat, can't detect stuck activities +@activity.defn +async def process_large_file(path: str): + async for chunk in read_chunks(path): + process(chunk) # Takes hours, no heartbeat + +# GOOD - Regular heartbeats with progress +@activity.defn +async def process_large_file(path: str): + async for i, chunk in enumerate(read_chunks(path)): + activity.heartbeat(f"Processing chunk {i}") + process(chunk) +``` + +### Heartbeat Timeout Too Short + +```python +# BAD - Heartbeat timeout shorter than processing time +await workflow.execute_activity( + process_chunk, + start_to_close_timeout=timedelta(minutes=30), + heartbeat_timeout=timedelta(seconds=10), # Too short! +) + +# GOOD - Heartbeat timeout allows for processing variance +await workflow.execute_activity( + process_chunk, + start_to_close_timeout=timedelta(minutes=30), + heartbeat_timeout=timedelta(minutes=2), +) +``` + +Set heartbeat timeout as high as acceptable for your use case — each heartbeat counts as an action. + +## Cancellation + +### Not Handling Workflow Cancellation + +```python +# BAD - Cleanup doesn't run on cancellation +@workflow.defn +class BadWorkflow: + @workflow.run + async def run(self) -> None: + await workflow.execute_activity( + acquire_resource, + start_to_close_timeout=timedelta(minutes=5), + ) + await workflow.execute_activity( + do_work, + start_to_close_timeout=timedelta(minutes=5), + ) + await workflow.execute_activity( + release_resource, # Never runs if cancelled! 
+ start_to_close_timeout=timedelta(minutes=5), + ) + +# GOOD - Use try/finally for cleanup +@workflow.defn +class GoodWorkflow: + @workflow.run + async def run(self) -> None: + await workflow.execute_activity( + acquire_resource, + start_to_close_timeout=timedelta(minutes=5), + ) + try: + await workflow.execute_activity( + do_work, + start_to_close_timeout=timedelta(minutes=5), + ) + finally: + # Runs even on cancellation + await workflow.execute_activity( + release_resource, + start_to_close_timeout=timedelta(minutes=5), + ) +``` + +### Not Handling Activity Cancellation + +Activities must **opt in** to receive cancellation. This requires: +1. **Heartbeating** - Cancellation is delivered via heartbeat +2. **Catching the cancellation exception** - Exception is raised when heartbeat detects cancellation + +**Cancellation exceptions:** +- Async activities: `asyncio.CancelledError` +- Sync threaded activities: `temporalio.exceptions.CancelledError` + +```python +# BAD - Activity ignores cancellation +@activity.defn +async def long_activity() -> None: + await do_expensive_work() # Runs to completion even if cancelled +``` + +```python +# GOOD - Heartbeat and catch cancellation +@activity.defn +async def long_activity() -> None: + try: + for item in items: + activity.heartbeat() + await process(item) + except asyncio.CancelledError: + await cleanup() + raise +``` + +## Testing + +### Not Testing Failures + +It is important to make sure workflows work as expected under failure paths in addition to happy paths. Please see `references/python/testing.md` for more info. + +### Not Testing Replay + +Replay tests help you test that you do not have hidden sources of non-determinism bugs in your workflow code, and should be considered in addition to standard testing. Please see `references/python/testing.md` for more info. 
+ +## Timers and Sleep + +### Using asyncio.sleep + +```python +# BAD: asyncio.sleep is not deterministic during replay +import asyncio + +@workflow.defn +class BadWorkflow: + @workflow.run + async def run(self) -> None: + await asyncio.sleep(60) # Non-deterministic! +``` + +```python +# GOOD: Use workflow.sleep for deterministic timers +from temporalio import workflow +from datetime import timedelta + +@workflow.defn +class GoodWorkflow: + @workflow.run + async def run(self) -> None: + await workflow.sleep(timedelta(seconds=60)) # Deterministic + # Or with a number of seconds: + await workflow.sleep(60) +``` + +**Why this matters:** `asyncio.sleep` uses the system clock, which differs between original execution and replay. `workflow.sleep` creates a durable timer in the event history, ensuring consistent behavior during replay. diff --git a/references/python/observability.md b/references/python/observability.md new file mode 100644 index 0000000..26296c3 --- /dev/null +++ b/references/python/observability.md @@ -0,0 +1,105 @@ +# Python SDK Observability + +## Overview + +The Python SDK provides comprehensive observability through logging, metrics, tracing, and visibility (Search Attributes). + +## Logging + +### Workflow Logging (Replay-Safe) + +Use `workflow.logger` for replay-safe logging that avoids duplicate messages: + +```python +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self, name: str) -> str: + workflow.logger.info("Workflow started", extra={"name": name}) + + result = await workflow.execute_activity( + my_activity, + start_to_close_timeout=timedelta(minutes=5), + ) + + workflow.logger.info("Activity completed", extra={"result": result}) + return result +``` + +The workflow logger automatically: +- Suppresses duplicate logs during replay +- Includes workflow context (workflow ID, run ID, etc.)
+ +### Activity Logging + +Use `activity.logger` for context-aware activity logging: + +```python +@activity.defn +async def process_order(order_id: str) -> str: + activity.logger.info(f"Processing order {order_id}") + + # Perform work... + + activity.logger.info("Order processed successfully") + return "completed" +``` + +Activity logger includes: +- Activity ID, type, and task queue +- Workflow ID and run ID +- Attempt number (for retries) + +### Customizing Logger Configuration + +```python +import logging + +# Applies to temporalio.workflow.logger and temporalio.activity.logger, as Temporal inherits the default logger +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) +``` + +## Metrics + +### Enabling SDK Metrics + +```python +from temporalio.client import Client +from temporalio.runtime import Runtime, TelemetryConfig, PrometheusConfig + +# Create a custom runtime +runtime = Runtime( + telemetry=TelemetryConfig( + metrics=PrometheusConfig(bind_address="0.0.0.0:9000") + ) +) + +# Set it as the global default BEFORE any Client/Worker is created +# Do this only ONCE. +Runtime.set_default(runtime, error_if_already_set=True) +# error_if_already_set can be False if you want to overwrite an existing default without raising. + +# ...elsewhere, client = ... as usual +``` + +### Key SDK Metrics + +- `temporal_request` - Client requests to server +- `temporal_workflow_task_execution_latency` - Workflow task processing time +- `temporal_activity_execution_latency` - Activity execution time +- `temporal_workflow_task_replay_latency` - Replay duration + + +## Search Attributes (Visibility) + +See the Search Attributes section of `references/python/data-handling.md` + +## Best Practices + +1. Use `workflow.logger` in workflows, `activity.logger` in activities +2. Don't use print() in workflows - it will produce duplicate output on replay +3. Configure metrics for production monitoring +4. 
Use Search Attributes for business-level visibility diff --git a/references/python/patterns.md b/references/python/patterns.md new file mode 100644 index 0000000..6843985 --- /dev/null +++ b/references/python/patterns.md @@ -0,0 +1,395 @@ +# Python SDK Patterns + +## Signals + +```python +@workflow.defn +class OrderWorkflow: + def __init__(self): + self._approved = False + self._items = [] + + @workflow.signal + async def approve(self) -> None: + self._approved = True + + @workflow.signal + async def add_item(self, item: str) -> None: + self._items.append(item) + + @workflow.run + async def run(self) -> str: + # Wait for approval + await workflow.wait_condition(lambda: self._approved) + return f"Processed {len(self._items)} items" +``` + +### Dynamic Signal Handlers + +For handling signals with names not known at compile time. Use cases for this pattern are rare — most workflows should use statically defined signal handlers. + +```python +@workflow.defn +class DynamicSignalWorkflow: + def __init__(self): + self._signals: dict[str, list[Any]] = {} + + @workflow.signal(dynamic=True) + async def handle_signal(self, name: str, args: Sequence[RawValue]) -> None: + if name not in self._signals: + self._signals[name] = [] + self._signals[name].append(workflow.payload_converter().from_payload(args[0])) +``` + +## Queries + +**Important:** Queries must NOT modify workflow state or have side effects. 
+ +```python +@workflow.defn +class StatusWorkflow: + def __init__(self): + self._status = "pending" + self._progress = 0 + + @workflow.query + def get_status(self) -> str: + return self._status + + @workflow.query + def get_progress(self) -> int: + return self._progress + + @workflow.run + async def run(self) -> str: + self._status = "running" + for i in range(100): + self._progress = i + await workflow.execute_activity( + process_item, i, + start_to_close_timeout=timedelta(minutes=1) + ) + self._status = "completed" + return "done" +``` + +### Dynamic Query Handlers + +For handling queries with names not known at compile time. Use cases for this pattern are rare — most workflows should use statically defined query handlers. + +```python +@workflow.query(dynamic=True) +def handle_query(self, name: str, args: Sequence[RawValue]) -> Any: + if name == "get_field": + field_name = workflow.payload_converter().from_payload(args[0]) + return getattr(self, f"_{field_name}", None) +``` + +## Updates + +```python +@workflow.defn +class OrderWorkflow: + def __init__(self): + self._items: list[str] = [] + + @workflow.update + async def add_item(self, item: str) -> int: + self._items.append(item) + return len(self._items) # Returns new count to caller + + @add_item.validator + def validate_add_item(self, item: str) -> None: + if not item: + raise ValueError("Item cannot be empty") + if len(self._items) >= 100: + raise ValueError("Order is full") +``` + +**Important:** Validators must NOT mutate workflow state or do anything blocking (no activities, sleeps, or other commands). They are read-only, similar to query handlers. Raise an exception to reject the update; return `None` to accept. 
+ +## Child Workflows + +```python +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self, orders: list[Order]) -> list[str]: + results = [] + for order in orders: + result = await workflow.execute_child_workflow( + ProcessOrderWorkflow.run, + order, + id=f"order-{order.id}", + # Control what happens to child when parent completes + parent_close_policy=workflow.ParentClosePolicy.ABANDON, + ) + results.append(result) + return results +``` + +## Handles to External Workflows + +```python +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self, target_workflow_id: str) -> None: + # Get handle to external workflow + handle = workflow.get_external_workflow_handle(target_workflow_id) + + # Signal the external workflow + await handle.signal(TargetWorkflow.data_ready, data_payload) + + # Or cancel it + await handle.cancel() +``` + +## Parallel Execution + +```python +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self, items: list[str]) -> list[str]: + # Execute activities in parallel + tasks = [ + workflow.execute_activity( + process_item, item, + start_to_close_timeout=timedelta(minutes=5) + ) + for item in items + ] + return await asyncio.gather(*tasks) +``` + +### Deterministic Alternatives to asyncio + +Generally, asyncio is OK to use in Temporal workflows. But some asyncio calls are non-deterministic.
Use Temporal's deterministic alternatives for safer concurrent operations: + +```python +# workflow.wait() - like asyncio.wait() +done, pending = await workflow.wait( + futures, + return_when=workflow.WaitConditionResult.FIRST_COMPLETED +) + +# workflow.as_completed() - like asyncio.as_completed() +async for future in workflow.as_completed(futures): + result = await future + # Process each result as it completes +``` + +## Continue-as-New + +```python +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self, state: WorkflowState) -> str: + while True: + state = await process_batch(state) + + if state.is_complete: + return "done" + + # Continue with fresh history before hitting limits + if workflow.info().is_continue_as_new_suggested(): + workflow.continue_as_new(args=[state]) +``` + +## Saga Pattern (Compensations) + +**Important:** Compensation activities should be idempotent - they may be retried (as with ALL activities). + +```python +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self, order: Order) -> str: + compensations: list[Callable[[], Awaitable[None]]] = [] + + try: + # Note - we save the compensation before running the activity, + # because the following could happen: + # 1. reserve_inventory starts running + # 2. it does successfully reserve inventory + # 3. but then fails for some other reason (timeout, reporting metrics, etc.) + # 4. in that case, the activity would have failed, but we still did the effect of reserving inventory + # So, we need to make sure we have a compensation already on the stack to handle that. + # This means the compensation needs to handle both the cases of reserved or unreserved inventory. 
+ compensations.append(lambda: workflow.execute_activity( + release_inventory_if_reserved, order, + start_to_close_timeout=timedelta(minutes=5) + )) + await workflow.execute_activity( + reserve_inventory, order, + start_to_close_timeout=timedelta(minutes=5) + ) + + compensations.append(lambda: workflow.execute_activity( + refund_payment_if_charged, order, + start_to_close_timeout=timedelta(minutes=5) + )) + await workflow.execute_activity( + charge_payment, order, + start_to_close_timeout=timedelta(minutes=5) + ) + + await workflow.execute_activity( + ship_order, order, + start_to_close_timeout=timedelta(minutes=5) + ) + + return "Order completed" + + except Exception as e: + workflow.logger.error(f"Order failed: {e}, running compensations") + # asyncio.shield ensures compensations run even if the workflow is cancelled. + async def run_compensations(): + for compensate in reversed(compensations): + try: + await compensate() + except Exception as comp_err: + workflow.logger.error(f"Compensation failed: {comp_err}") + await asyncio.shield(asyncio.ensure_future(run_compensations())) + raise +``` + +## Cancellation Handling - leverages standard asyncio cancellation + +```python +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self) -> str: + try: + await workflow.execute_activity( + long_running_activity, + start_to_close_timeout=timedelta(hours=1), + ) + return "completed" + except asyncio.CancelledError: + # Workflow was cancelled - perform cleanup + workflow.logger.info("Workflow cancelled, running cleanup") + # Cleanup activities still run even after cancellation + await workflow.execute_activity( + cleanup_activity, + start_to_close_timeout=timedelta(minutes=5), + ) + raise # Re-raise to mark workflow as cancelled +``` + +## Wait Condition with Timeout + +```python +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self) -> str: + self._approved = False + + # Wait for approval with 24-hour timeout + try: + await 
workflow.wait_condition( + lambda: self._approved, + timeout=timedelta(hours=24) + ) + return "approved" + except asyncio.TimeoutError: + return "auto-rejected due to timeout" +``` + +## Waiting for All Handlers to Finish + +Signal and update handlers should generally be non-async (avoid running activities from them). Otherwise, the workflow may complete before handlers finish their execution. However, making handlers non-async sometimes requires workarounds that add complexity. + +When async handlers are necessary, use `wait_condition(all_handlers_finished)` at the end of your workflow (or before continue-as-new) to prevent completion until all pending handlers complete. + +```python +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self) -> str: + # ... main workflow logic ... + + # Before exiting, wait for all handlers to finish + await workflow.wait_condition(workflow.all_handlers_finished) + return "done" +``` + +## Activity Heartbeat Details + +### WHY: +- **Support activity cancellation** - Cancellations are delivered via heartbeat; activities that don't heartbeat won't know they've been cancelled +- **Resume progress after worker failure** - Heartbeat details persist across retries + +**Cancellation exceptions:** +- Async activities: `asyncio.CancelledError` +- Sync threaded activities: `temporalio.exceptions.CancelledError` + +### WHEN: +- **Cancellable activities** - Any activity that should respond to cancellation +- **Long-running activities** - Track progress for resumability +- **Checkpointing** - Save progress periodically + +```python +from temporalio.exceptions import CancelledError + +@activity.defn +def process_large_file(file_path: str) -> str: + # Get heartbeat details from previous attempt (if any) + heartbeat_details = activity.info().heartbeat_details + start_line = heartbeat_details[0] if heartbeat_details else 0 + + try: + with open(file_path) as f: + for i, line in enumerate(f): + if i < start_line: + continue # Skip 
already processed lines + + process_line(line) + + # Heartbeat with progress + # If cancelled, heartbeat() raises CancelledError + activity.heartbeat(i + 1) + + return "completed" + except CancelledError: + # Perform cleanup on cancellation + cleanup() + raise +``` + +## Timers + +```python +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self) -> str: + await workflow.sleep(timedelta(hours=1)) + + return "Timer fired" +``` + +## Local Activities + +**Purpose**: Reduce latency for short, lightweight operations by skipping the task queue. ONLY use these when necessary for performance. Do NOT use these by default, as they are not durable and distributed. + +```python +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self) -> str: + result = await workflow.execute_local_activity( + quick_lookup, + "key", + start_to_close_timeout=timedelta(seconds=5), + ) + return result +``` + +## Using Pydantic Models + +See `references/python/data-handling.md`. diff --git a/references/python/python.md b/references/python/python.md new file mode 100644 index 0000000..130b1eb --- /dev/null +++ b/references/python/python.md @@ -0,0 +1,175 @@ +# Temporal Python SDK Reference + +## Overview + +The Temporal Python SDK (`temporalio`) provides a fully async, type-safe approach to building durable workflows. Python 3.9+ required. Workflows run in a sandbox by default for determinism protection. + +## Quick Demo of Temporal + +**Add Dependency on Temporal:** In the package management system of the Python project you are working on, add a dependency on `temporalio`. + +**activities/greet.py** - Activity definitions (separate file for performance): +```python +from temporalio import activity + +@activity.defn +def greet(name: str) -> str: + return f"Hello, {name}!" 
+``` + +**workflows/greeting.py** - Workflow definition (import activities through sandbox): +```python +from datetime import timedelta +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities.greet import greet + +@workflow.defn +class GreetingWorkflow: + @workflow.run + async def run(self, name: str) -> str: + return await workflow.execute_activity( + greet, name, start_to_close_timeout=timedelta(seconds=30) + ) +``` + +**worker.py** - Worker setup (imports activity and workflow, runs indefinitely and processes tasks): +```python +import asyncio +import concurrent.futures +from temporalio.client import Client +from temporalio.worker import Worker + +# Import the activity and workflow from our other files +from activities.greet import greet +from workflows.greeting import GreetingWorkflow + +async def main(): + # Create client connected to server at the given address + # This is the default port for `temporal server start-dev` + client = await Client.connect("localhost:7233") + + # Run the worker + with concurrent.futures.ThreadPoolExecutor(max_workers=100) as activity_executor: + worker = Worker( + client, + task_queue="my-task-queue", + workflows=[GreetingWorkflow], + activities=[greet], + activity_executor=activity_executor, + ) + await worker.run() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Start the dev server:** Start `temporal server start-dev` in the background. 
+ +**Start the worker:** Start `python worker.py` in the background (appropriately adjust command for your project, like `uv run python worker.py`) + +**starter.py** - Start a workflow execution: +```python +import asyncio +from temporalio.client import Client +import uuid + +# Import the workflow from the previous code +from workflows.greeting import GreetingWorkflow + +async def main(): + # Create client connected to server at the given address + client = await Client.connect("localhost:7233") + + # Execute a workflow + result = await client.execute_workflow(GreetingWorkflow.run, "my name", id=str(uuid.uuid4()), task_queue="my-task-queue") + + print(f"Result: {result}") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Run the workflow:** Run `python starter.py` (or uv run, etc.). Should output: `Result: Hello, my name!`. + + +## Key Concepts + +### Workflow Definition +- Use `@workflow.defn` decorator on class +- Use `@workflow.run` on the entry point method +- Must be async (`async def`) +- Use `@workflow.signal`, `@workflow.query`, `@workflow.update` for handlers + +### Activity Definition +- Use `@activity.defn` decorator +- Can be sync or async functions +- **Default to sync activities** - safer and easier to debug +- Sync activities need `activity_executor` (ThreadPoolExecutor) +- Async activities require async-safe libraries throughout (e.g., `aiohttp` not `requests`) + +See `references/python/sync-vs-async.md` for detailed guidance on choosing between sync and async. + +### Worker Setup +- Connect client, create Worker with workflows and activities +- Run the worker +- Activities can specify custom executor + +### Determinism + +**Workflow code must be deterministic!** All sources of non-determinism should either use Temporal-provided actions or (primarily) be defined in Activities. Read `references/core/determinism.md` and `references/python/determinism.md` to understand more.
+ +## File Organization Best Practice + +**Keep Workflow definitions in separate files from Activity definitions.** The Python SDK sandbox reloads Workflow definition files on every execution for determinism protection. Minimizing file contents improves Worker performance. + +``` +my_temporal_app/ +├── workflows/ +│ └── greeting.py # Only Workflow classes +├── activities/ +│ └── translate.py # Only Activity functions/classes +├── worker.py # Worker setup, imports both +└── starter.py # Client code to start workflows +``` + +**In the Workflow file, import Activities through the sandbox:** +```python +# workflows/greeting.py +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities.translate import TranslateActivities +``` + +## Common Pitfalls + +1. **Non-deterministic code in workflows** - Use activities for all non-deterministic and/or fallible code +2. **Blocking in async activities** - Use sync activities or async-safe libraries only +3. **Missing executor for sync activities** - Add `activity_executor=ThreadPoolExecutor()` +4. **Forgetting to heartbeat** - Long activities need `activity.heartbeat()` +5. **Using gevent** - Incompatible with SDK +6. **Using `print()` in workflows** - Use `workflow.logger` instead for replay-safe logging +7. **Mixing Workflows and Activities in same file** - Causes unnecessary reloads, hurts performance, bad structure +8. **Forgetting to wait on activity calls** - `workflow.execute_activity()` is async; you must eventually await it (directly or via `asyncio.gather()` for parallel execution) + +## Writing Tests + +See `references/python/testing.md` for info on writing tests. + +## Additional Resources + +### Reference Files +- **`references/python/patterns.md`** - Signals, queries, child workflows, saga pattern, etc. 
+- **`references/python/determinism.md`** - Sandbox behavior, safe alternatives, pass-through pattern, history replay +- **`references/python/gotchas.md`** - Python-specific mistakes and anti-patterns +- **`references/python/error-handling.md`** - ApplicationError, retry policies, non-retryable errors, idempotency +- **`references/python/observability.md`** - Logging, metrics, tracing, Search Attributes +- **`references/python/testing.md`** - WorkflowEnvironment, time-skipping, activity mocking +- **`references/python/sync-vs-async.md`** - Sync vs async activities, event loop blocking, executor configuration +- **`references/python/advanced-features.md`** - Schedules, worker tuning, and more +- **`references/python/data-handling.md`** - Data converters, Pydantic, payload encryption +- **`references/python/versioning.md`** - Patching API, workflow type versioning, Worker Versioning +- **`references/python/determinism-protection.md`** - Python sandbox specifics, forbidden operations, pass-through imports +- **`references/python/ai-patterns.md`** - LLM integration, Pydantic data converter, AI workflow patterns diff --git a/references/python/sync-vs-async.md b/references/python/sync-vs-async.md new file mode 100644 index 0000000..7875582 --- /dev/null +++ b/references/python/sync-vs-async.md @@ -0,0 +1,231 @@ +# Python SDK: Sync vs Async Activities + +## Overview + +The Temporal Python SDK supports multiple ways of implementing Activities: + +- **Asynchronous** using `asyncio` +- **Synchronous multithreaded** using `concurrent.futures.ThreadPoolExecutor` +- **Synchronous multiprocess** using `concurrent.futures.ProcessPoolExecutor` + +Choosing the correct approach is critical—incorrect usage can cause sporadic failures and difficult-to-diagnose bugs. + +## Recommendation: Default to Synchronous + +Activities should be synchronous by default. Use async only when certain the code doesn't block the event loop. 
+ +## The Event Loop Problem + +The Python async event loop runs in a single thread. When any task runs, no other tasks can execute until an `await` is reached. If code makes a blocking call (file I/O, synchronous HTTP, etc.), the entire event loop freezes. + +**Consequences of blocking the event loop:** +- Worker cannot communicate with Temporal Server +- Workflow progress blocks across the worker +- Potential deadlocks and unpredictable behavior +- Difficult-to-diagnose bugs + +## How the SDK Handles Each Type + +### Synchronous Activities + +- Run in the `activity_executor`, which you must provide +- Protected from accidentally blocking the global event loop +- Multiple activities run in parallel via OS thread scheduling +- Thread pool provides preemptive switching between tasks + +```python +from concurrent.futures import ThreadPoolExecutor +from temporalio.worker import Worker + +with ThreadPoolExecutor(max_workers=100) as executor: + worker = Worker( + client, + task_queue="my-queue", + workflows=[MyWorkflow], + activities=[my_sync_activity], + activity_executor=executor, + ) + await worker.run() +``` + +### Asynchronous Activities + +- Share the default asyncio event loop with the Temporal worker +- Any blocking call freezes the entire loop +- Require async-safe libraries throughout + +```python +@activity.defn +async def my_async_activity(name: str) -> str: + # Must use async-safe libraries only + async with aiohttp.ClientSession() as session: + async with session.get(f"http://api.example.com/{name}") as response: + return await response.text() +``` + +## HTTP Libraries: A Critical Choice + +| Library | Type | Safe in Async Activity? 
| +|---------|------|------------------------| +| `requests` | Blocking | No - blocks event loop | +| `urllib3` | Blocking | No - blocks event loop | +| `aiohttp` | Async | Yes | +| `httpx` | Both | Yes (use async mode) | + +**Example: Wrong way (blocks event loop)** +```python +@activity.defn +async def bad_activity(url: str) -> str: + import requests + response = requests.get(url) # BLOCKS the event loop! + return response.text +``` + +**Example: Correct way (async-safe)** +```python +@activity.defn +async def good_activity(url: str) -> str: + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + return await response.text() +``` + +## Running Blocking Code in Async Activities + +If blocking code must run in an async activity, offload it to a thread: + +```python +import asyncio + +@activity.defn +async def activity_with_blocking_call() -> str: + # Run blocking code in a thread pool + loop = asyncio.get_event_loop() + result = await loop.run_in_executor(None, blocking_function) + return result + +# Or use asyncio.to_thread (Python 3.9+) +@activity.defn +async def activity_with_blocking_call_v2() -> str: + result = await asyncio.to_thread(blocking_function) + return result +``` + +## When to Use Async Activities + +Use async activities only when: + +1. All code paths are async-safe (no blocking calls) +2. Using async-native libraries (aiohttp, asyncpg, motor, etc.) +3. Performance benefits are needed for I/O-bound operations +4. The team understands async constraints + +## When to Use Sync Activities + +Use sync activities when: + +1. Making HTTP calls with `requests` or similar blocking libraries +2. Performing file I/O operations +3. Using database drivers that aren't async-native +4. Uncertain whether code is async-safe +5. Integrating with legacy or third-party synchronous code + +## Debugging Tip + +If experiencing sporadic bugs, hangs, or timeouts: + +1. Convert async activities to sync +2. Test thoroughly +3. 
If bugs disappear, the original async activity had blocking calls + +## Threading Considerations + +### Multi-Core Usage + +For CPU-bound work and multi-core usage: + +- Prefer multiple worker processes and/or threaded synchronous activities. +- Use ProcessPoolExecutor for synchronous activities only if you understand and accept the extra complexity and different cancellation semantics. + +### Separate Workers for Workflows vs Activities + +Some teams deploy: +- Workflow-only workers (CPU-bound, need deadlock detection) +- Activity-only workers (I/O-bound, may need more parallelism) + +This prevents resource contention and allows independent scaling. + +## Complete Example: Sync Activity with ThreadPoolExecutor + +```python +import urllib.parse +import requests +from concurrent.futures import ThreadPoolExecutor +from temporalio import activity +from temporalio.client import Client +from temporalio.worker import Worker + +@activity.defn +def greet_in_spanish(name: str) -> str: + """Synchronous activity using requests library.""" + url = f"http://localhost:9999/get-spanish-greeting?name={urllib.parse.quote(name)}" + response = requests.get(url) + return response.text + +async def main(): + client = await Client.connect("localhost:7233", namespace="default") + + with ThreadPoolExecutor(max_workers=100) as executor: + worker = Worker( + client, + task_queue="greeting-tasks", + workflows=[GreetingWorkflow], + activities=[greet_in_spanish], + activity_executor=executor, + ) + await worker.run() +``` + +## Complete Example: Async Activity with aiohttp + +```python +import aiohttp +import urllib.parse +from temporalio import activity +from temporalio.client import Client +from temporalio.worker import Worker + +class TranslateActivities: + def __init__(self, session: aiohttp.ClientSession): + self.session = session + + @activity.defn + async def greet_in_spanish(self, name: str) -> str: + """Async activity using aiohttp - safe for event loop.""" + url = 
f"http://localhost:9999/get-spanish-greeting?name={urllib.parse.quote(name)}" + async with self.session.get(url) as response: + return await response.text() + +async def main(): + client = await Client.connect("localhost:7233", namespace="default") + + async with aiohttp.ClientSession() as session: + activities = TranslateActivities(session) + worker = Worker( + client, + task_queue="greeting-tasks", + workflows=[GreetingWorkflow], + activities=[activities.greet_in_spanish], + ) + await worker.run() +``` + +## Summary + +| Aspect | Sync Activities | Async Activities | +|--------|-----------------|------------------| +| Default choice | Yes | Only when certain | +| Blocking calls | Safe (runs in thread pool) | Dangerous (blocks event loop) | +| HTTP library | `requests`, `httpx` | `aiohttp`, `httpx` (async) | +| Executor needed | Yes (`ThreadPoolExecutor`) | No | +| Debugging | Easier | Harder (timing issues) | diff --git a/references/python/testing.md b/references/python/testing.md new file mode 100644 index 0000000..63a0d14 --- /dev/null +++ b/references/python/testing.md @@ -0,0 +1,165 @@ +# Python SDK Testing + +## Overview + +You test Temporal Python Workflows using the Temporal testing package plus a normal Python test framework like pytest. The Temporal Python SDK provides `WorkflowEnvironment` for testing workflows in a local environment and `ActivityEnvironment` for isolated activity testing. + +## Workflow Test Environment + +The core pattern is: + +1. Start a test WorkflowEnvironment (`WorkflowEnvironment.start_local()`). +2. Start a Worker in that environment with your Workflow and Activities registered. +3. Use the environment’s client to execute the Workflow, using a fresh UUID for the task queue name and workflow ID. +4. Assert on the result or status. 
+ +`WorkflowEnvironment.start_local` configures a ready-to-go local environment for running and testing workflows: + +```python +import uuid +import pytest + +from temporalio.testing import WorkflowEnvironment +from temporalio.worker import Worker + +from activities import my_activity +from workflows import MyWorkflow + +@pytest.mark.asyncio +async def test_workflow(): + task_queue_name = str(uuid.uuid4()) + async with await WorkflowEnvironment.start_local() as env: + async with Worker( + env.client, + task_queue=task_queue_name, + workflows=[MyWorkflow], + activities=[my_activity], + ): + result = await env.client.execute_workflow( + MyWorkflow.run, + "input", + id=str(uuid.uuid4()), + task_queue=task_queue_name, + ) +``` + +Conveniently, the local `env` can be shared among tests, e.g. via a pytest fixture. + +If your workflows / tests involve long durations (such as using Temporal timers / sleeps), then you can use the time-skipping environment, via `WorkflowEnvironment.start_time_skipping()`. +Only use time-skipping if you must. It can *not* be shared among tests. + +## Mocking Activities + +```python +import uuid +import pytest + +from temporalio import activity +from temporalio.testing import WorkflowEnvironment +from temporalio.worker import Worker + +from workflows import MyWorkflow + +@activity.defn(name="compose_greeting") +async def compose_greeting_mocked(input: str) -> str: + return "mocked result" + +@pytest.mark.asyncio +async def test_with_mock(): + task_queue_name = str(uuid.uuid4()) + async with await WorkflowEnvironment.start_local() as env: + async with Worker( + env.client, + task_queue=task_queue_name, + workflows=[MyWorkflow], + activities=[compose_greeting_mocked], + ): + result = await env.client.execute_workflow(...) 
+``` + +## Testing Signals and Queries + +```python +@pytest.mark.asyncio +async def test_signals(): + async with await WorkflowEnvironment.start_local() as env: + async with Worker(...): + handle = await env.client.start_workflow(...) # same arguments as to execute_workflow + + # Send signal + await handle.signal(MyWorkflow.my_signal, "data") + + # Query state + status = await handle.query(MyWorkflow.get_status) + assert status == "expected" + + # Wait for completion + result = await handle.result() +``` + +## Testing Failure Cases + +The example below shows how to test failure cases: + +```python +# Test failure scenarios +@pytest.mark.asyncio +async def test_activity_failure_handling(): + async with await WorkflowEnvironment.start_local() as env: + # An example activity that always fails + @activity.defn + async def failing_activity() -> str: + raise ApplicationError("Simulated failure", non_retryable=True) + + async with Worker(...): + with pytest.raises(WorkflowFailureError): + await env.client.execute_workflow(...) +``` + +## Workflow Replay Testing + +```python +import json +import pytest +import uuid +from temporalio.client import WorkflowHistory +from temporalio.worker import Replayer + +from workflows import MyWorkflow + +@pytest.mark.asyncio +async def test_replay(): + with open("example-history.json", "r") as f: + history_json = json.load(f) + + replayer = Replayer(workflows=[MyWorkflow]) + + # From JSON file + await replayer.replay_workflow( + WorkflowHistory.from_json(str(uuid.uuid4()), history_json) + ) +``` + + +## Activity Testing + +```python +import pytest + +from temporalio.testing import ActivityEnvironment + +@pytest.mark.asyncio +async def test_activity(): + env = ActivityEnvironment() + result = await env.run(my_activity, "arg1", "arg2") + assert result == "expected" +``` + +## Best Practices + +1. Use the `WorkflowEnvironment.start_local` environment for most testing +2.
Use time-skipping environment for workflows with durable timers / durable sleeps. +3. Mock external dependencies in activities +4. Test replay compatibility, especially when changing workflow code +5. Test signal/query handlers explicitly +6. Use unique workflow IDs and task queues per test to avoid conflicts. Easiest is a `uuid.uuid4()` diff --git a/references/python/versioning.md b/references/python/versioning.md new file mode 100644 index 0000000..abd4445 --- /dev/null +++ b/references/python/versioning.md @@ -0,0 +1,314 @@ +# Python SDK Versioning + +For conceptual overview and guidance on choosing an approach, see `references/core/versioning.md`. + +## Patching API + +### The patched() Function + +The `patched()` function checks whether a Workflow should run new or old code: + +```python +from temporalio import workflow + +@workflow.defn +class ShippingWorkflow: + @workflow.run + async def run(self) -> None: + if workflow.patched("send-email-instead-of-fax"): + # New code path + await workflow.execute_activity( + send_email, + start_to_close_timeout=timedelta(minutes=5), + ) + else: + # Old code path (for replay of existing workflows) + await workflow.execute_activity( + send_fax, + start_to_close_timeout=timedelta(minutes=5), + ) +``` + +**How it works:** +- For new executions: `patched()` returns `True` and records a marker in the Workflow history +- For replay with the marker: `patched()` returns `True` (history includes this patch) +- For replay without the marker: `patched()` returns `False` (history predates this patch) + +**Python-specific behavior:** The `patched()` return value is memoized on first call. This means you cannot reliably use `patched()` in loops—it will return the same value every iteration. Workaround: append a sequence number to the patch ID for each iteration (e.g., `f"my-change-{i}"`). + +### Three-Step Patching Process + +Patching is a three-step process for safely deploying changes. 
+ +**Warning:** Failing to follow this process correctly will result in non-determinism errors for in-flight workflows. + +**Step 1: Patch in New Code** + +Add the patch with both old and new code paths: + +```python +@workflow.defn +class OrderWorkflow: + @workflow.run + async def run(self, order: Order) -> str: + if workflow.patched("add-fraud-check"): + # New: Run fraud check before payment + await workflow.execute_activity( + check_fraud, + order, + start_to_close_timeout=timedelta(minutes=2), + ) + + # Original payment logic runs for both paths + return await workflow.execute_activity( + process_payment, + order, + start_to_close_timeout=timedelta(minutes=5), + ) +``` + +**Step 2: Deprecate the Patch** + +Once all pre-patch Workflow Executions have completed, remove the old code and use `deprecate_patch()`: + +```python +@workflow.defn +class OrderWorkflow: + @workflow.run + async def run(self, order: Order) -> str: + workflow.deprecate_patch("add-fraud-check") + + # Only new code remains + await workflow.execute_activity( + check_fraud, + order, + start_to_close_timeout=timedelta(minutes=2), + ) + + return await workflow.execute_activity( + process_payment, + order, + start_to_close_timeout=timedelta(minutes=5), + ) +``` + +**Step 3: Remove the Patch** + +After all workflows with the deprecated patch marker have completed, remove the `deprecate_patch()` call entirely: + +```python +@workflow.defn +class OrderWorkflow: + @workflow.run + async def run(self, order: Order) -> str: + await workflow.execute_activity( + check_fraud, + order, + start_to_close_timeout=timedelta(minutes=2), + ) + + return await workflow.execute_activity( + process_payment, + order, + start_to_close_timeout=timedelta(minutes=5), + ) +``` + +### Query Filters for Finding Workflows by Version + +Use List Filters to find workflows with specific patch versions: + +```bash +# Find running workflows with a specific patch +temporal workflow list --query \ + 'WorkflowType = "OrderWorkflow" AND 
ExecutionStatus = "Running" AND TemporalChangeVersion = "add-fraud-check"' + +# Find running workflows without any patch (pre-patch versions) +temporal workflow list --query \ + 'WorkflowType = "OrderWorkflow" AND ExecutionStatus = "Running" AND TemporalChangeVersion IS NULL' +``` + +## Workflow Type Versioning + +For incompatible changes, create a new Workflow Type instead of using patches: + +```python +@workflow.defn(name="PizzaWorkflow") +class PizzaWorkflow: + @workflow.run + async def run(self, order: PizzaOrder) -> str: + # Original implementation + return await self._process_order_v1(order) + +@workflow.defn(name="PizzaWorkflowV2") +class PizzaWorkflowV2: + @workflow.run + async def run(self, order: PizzaOrder) -> str: + # New implementation with incompatible changes + return await self._process_order_v2(order) +``` + +Register both with the Worker: + +```python +worker = Worker( + client, + task_queue="pizza-task-queue", + workflows=[PizzaWorkflow, PizzaWorkflowV2], + activities=[make_pizza, deliver_pizza], +) +``` + +Update client code to start new workflows with the new type: + +```python +# Old workflows continue on PizzaWorkflow +# New workflows use PizzaWorkflowV2 +handle = await client.start_workflow( + PizzaWorkflowV2.run, + order, + id=f"pizza-{order.id}", + task_queue="pizza-task-queue", +) +``` + +Check for open executions before removing the old type: + +```bash +temporal workflow list --query 'WorkflowType = "PizzaWorkflow" AND ExecutionStatus = "Running"' +``` + +## Worker Versioning + +Worker Versioning manages versions at the deployment level, allowing multiple Worker versions to run simultaneously. + +### Key Concepts + +**Worker Deployment**: A logical service grouping similar Workers together (e.g., "loan-processor"). All versions of your code live under this umbrella. + +**Worker Deployment Version**: A specific snapshot of your code identified by a deployment name and Build ID (e.g., "loan-processor:v1.0" or "loan-processor:abc123"). 
+ +### Configuring Workers for Versioning + +```python +from temporalio.worker import Worker +from temporalio.worker.deployment_config import ( + WorkerDeploymentConfig, + WorkerDeploymentVersion, +) + +worker = Worker( + client, + task_queue="my-task-queue", + workflows=[MyWorkflow], + activities=[my_activity], + deployment_config=WorkerDeploymentConfig( + version=WorkerDeploymentVersion( + deployment_name="my-service", + build_id="v1.0.0", # or git commit hash + ), + use_worker_versioning=True, + ), +) +``` + +**Configuration parameters:** +- `use_worker_versioning`: Enables Worker Versioning +- `version`: Identifies the Worker Deployment Version (deployment name + build ID) +- Build ID: Typically a git commit hash, version number, or timestamp + +### PINNED vs AUTO_UPGRADE Behaviors + +**PINNED Behavior** + +Workflows stay locked to their original Worker version: + +```python +from temporalio.common import VersioningBehavior + +@workflow.defn(versioning_behavior=VersioningBehavior.PINNED) +class StableWorkflow: + @workflow.run + async def run(self) -> str: + # This workflow will always run on its assigned version + return await workflow.execute_activity( + process_order, + start_to_close_timeout=timedelta(minutes=5), + ) +``` + +**When to use PINNED:** +- Short-running workflows (minutes to hours) +- Consistency is critical (e.g., financial transactions) +- You want to eliminate version compatibility complexity +- Building new applications and want the simplest development experience + +**AUTO_UPGRADE Behavior** + +Workflows can move to newer Worker versions while they are running; declare this with `@workflow.defn(versioning_behavior=VersioningBehavior.AUTO_UPGRADE)`. + +**When to use AUTO_UPGRADE:** +- Long-running workflows (weeks or months) +- Workflows need to benefit from bug fixes during execution +- Migrating from traditional rolling deployments +- You are already using patching APIs for version transitions + +**Important:** AUTO_UPGRADE workflows still need patching to handle version transitions safely since they can move between Worker versions. 
+ +### Worker Configuration with Default Behavior + +```python +# For short-running workflows, prefer PINNED +worker = Worker( + client, + task_queue="orders-task-queue", + workflows=[OrderWorkflow], + activities=[process_order], + deployment_config=WorkerDeploymentConfig( + version=WorkerDeploymentVersion( + deployment_name="order-service", + build_id=os.environ["BUILD_ID"], + ), + use_worker_versioning=True, + # default_versioning_behavior=VersioningBehavior.PINNED, + ), +) +``` + +### Deployment Strategies + +**Blue-Green Deployments** + +Maintain two environments and switch traffic between them: +1. Deploy new code to idle environment +2. Run tests and validation +3. Switch traffic to new environment +4. Keep old environment for instant rollback + +**Rainbow Deployments** + +Multiple versions run simultaneously: +- New workflows use latest version +- Existing workflows complete on their original version +- Add new versions alongside existing ones +- Gradually sunset old versions as workflows complete + +This works well with Kubernetes where you manage multiple ReplicaSets running different Worker versions. + +### Querying Workflows by Worker Version + +```bash +# Find workflows on a specific Worker version +temporal workflow list --query \ + 'TemporalWorkerDeploymentVersion = "my-service:v1.0.0" AND ExecutionStatus = "Running"' +``` + +## Best Practices + +1. **Check for open executions** before removing old code paths +2. **Use descriptive patch IDs** that explain the change (e.g., "add-fraud-check" not "patch-1") +3. **Deploy patches incrementally**: patch, deprecate, remove +4. **Use PINNED for short workflows** to simplify version management +5. **Use AUTO_UPGRADE with patching** for long-running workflows that need updates +6. **Generate Build IDs from code** (git hash) to ensure changes produce new versions +7. 
**Avoid rolling deployments** for high-availability services with long-running workflows diff --git a/references/typescript/advanced-features.md b/references/typescript/advanced-features.md new file mode 100644 index 0000000..17b7e61 --- /dev/null +++ b/references/typescript/advanced-features.md @@ -0,0 +1,150 @@ +# TypeScript SDK Advanced Features + +## Schedules + +Create recurring workflow executions. + +```typescript +import { Client, ScheduleOverlapPolicy } from '@temporalio/client'; + +const client = new Client(); + +// Create a schedule +const schedule = await client.schedule.create({ + scheduleId: 'daily-report', + spec: { + intervals: [{ every: '1 day' }], + }, + action: { + type: 'startWorkflow', + workflowType: 'dailyReportWorkflow', + taskQueue: 'reports', + args: [], + }, + policies: { + overlap: ScheduleOverlapPolicy.SKIP, + }, +}); + +// Manage schedules +const handle = client.schedule.getHandle('daily-report'); +await handle.pause('Maintenance window'); +await handle.unpause(); +await handle.trigger(); // Run immediately +await handle.delete(); +``` + +## Async Activity Completion + +Complete an activity asynchronously from outside the activity function. Useful when the activity needs to wait for an external event. + +**In the activity - return the task token:** +```typescript +import { CompleteAsyncError, activityInfo } from '@temporalio/activity'; + +export async function doSomethingAsync(): Promise { + const taskToken: Uint8Array = activityInfo().taskToken; + setTimeout(() => doSomeWork(taskToken), 1000); + throw new CompleteAsyncError(); +} +``` + +**External completion (from another process, machine, etc.):** +```typescript +import { Client } from '@temporalio/client'; + +async function doSomeWork(taskToken: Uint8Array): Promise { + const client = new Client(); + // does some work... 
+ await client.activity.complete(taskToken, "Job's done!"); +} +``` + +**When to use:** +- Waiting for human approval +- Waiting for external webhook callback +- Long-polling external systems + +## Worker Tuning + +Configure worker capacity for production workloads: + +```typescript +import { Worker, NativeConnection } from '@temporalio/worker'; + +const worker = await Worker.create({ + connection: await NativeConnection.connect({ address: 'temporal:7233' }), + taskQueue: 'my-queue', + workflowBundle: { codePath: require.resolve('./workflow-bundle.js') }, // Pre-bundled for production + activities, + + // Workflow execution concurrency (default: 40) + maxConcurrentWorkflowTaskExecutions: 100, + + // Activity execution concurrency (default: 100) + maxConcurrentActivityTaskExecutions: 200, + + // Graceful shutdown timeout (default: 0) + shutdownGraceTime: '30 seconds', + + // Max cached workflows (memory vs latency tradeoff) + maxCachedWorkflows: 1000, +}); +``` + +**Key settings:** +- `maxConcurrentWorkflowTaskExecutions`: Max workflows running simultaneously (default: 40) +- `maxConcurrentActivityTaskExecutions`: Max activities running simultaneously (default: 100) +- `shutdownGraceTime`: Time to wait for in-progress work before forced shutdown +- `maxCachedWorkflows`: Number of workflows to keep in cache (reduces replay on cache hit) + +## Sinks + +Sinks allow workflows to emit events for side effects (logging, metrics). 
+ +```typescript +import { proxySinks, Sinks } from '@temporalio/workflow'; + +// Define sink interface +export interface LoggerSinks extends Sinks { + logger: { + info(message: string, attrs: Record<string, unknown>): void; + error(message: string, attrs: Record<string, unknown>): void; + }; +} + +// Use in workflow +const { logger } = proxySinks<LoggerSinks>(); + +export async function myWorkflow(input: string): Promise<string> { + logger.info('Workflow started', { input }); + + const result = await someActivity(input); + + logger.info('Workflow completed', { result }); + return result; +} + +// Implement sink in worker +const worker = await Worker.create({ + workflowsPath: require.resolve('./workflows'), // Use workflowBundle for production + activities, + taskQueue: 'my-queue', + sinks: { + logger: { + info: { + fn(workflowInfo, message, attrs) { + console.log(`[${workflowInfo.workflowId}] ${message}`, attrs); + }, + callDuringReplay: false, // Don't log during replay + }, + error: { + fn(workflowInfo, message, attrs) { + console.error(`[${workflowInfo.workflowId}] ${message}`, attrs); + }, + callDuringReplay: false, + }, + }, + }, +}); +``` diff --git a/references/typescript/data-handling.md b/references/typescript/data-handling.md new file mode 100644 index 0000000..bfd4925 --- /dev/null +++ b/references/typescript/data-handling.md @@ -0,0 +1,253 @@ +# TypeScript SDK Data Handling + +## Overview + +The TypeScript SDK uses data converters to serialize/deserialize workflow inputs, outputs, and activity parameters. + +## Default Data Converter + +The default converter handles: +- `undefined` and `null` +- `Uint8Array` (as binary) +- JSON-serializable types + +Note: Protobuf support is not enabled by default; it requires configuring a custom payload converter (`DefaultPayloadConverterWithProtobufs`). See the Protobuf Support section below. + +## Custom Data Converter + +Create custom converters for special serialization needs. 
+ +```typescript +// payload-converter.ts +import { + PayloadConverter, + Payload, + defaultPayloadConverter, +} from '@temporalio/common'; + +class CustomPayloadConverter implements PayloadConverter { + toPayload(value: T): Payload | undefined { + // Custom serialization logic + return defaultPayloadConverter.toPayload(value); + } + + fromPayload(payload: Payload): T { + // Custom deserialization logic + return defaultPayloadConverter.fromPayload(payload); + } +} + +export const payloadConverter = new CustomPayloadConverter(); +``` + +```typescript +// client.ts +import { Client } from '@temporalio/client'; + +const client = new Client({ + dataConverter: { + payloadConverterPath: require.resolve('./payload-converter'), + }, +}); +``` + +```typescript +// worker.ts +import { Worker } from '@temporalio/worker'; + +const worker = await Worker.create({ + dataConverter: { + payloadConverterPath: require.resolve('./payload-converter'), + }, + // ... +}); +``` + +## Composition of Payload Converters + +```typescript +import { CompositePayloadConverter } from '@temporalio/common'; + +// The order matters — converters are tried in sequence until one returns a non-null Payload +export const payloadConverter = new CompositePayloadConverter( + new PayloadConverterFoo(), + new PayloadConverterBar(), +); +``` + +## Protobuf Support + +Using Protocol Buffers for type-safe serialization. + +**Note:** JSON serialization (the default) is preferred for TypeScript applications—it's simpler and more performant. Use Protobuf only when interoperating with services that require it. + +```typescript +import { DefaultPayloadConverterWithProtobufs } from '@temporalio/common/lib/protobufs'; + +const dataConverter: DataConverter = { + payloadConverter: new DefaultPayloadConverterWithProtobufs({ + protobufRoot: myProtobufRoot, + }), +}; +``` + +## Payload Codec (Encryption) + +Encrypt sensitive workflow data. 
+ +```typescript +import { PayloadCodec, Payload } from '@temporalio/common'; + +class EncryptionCodec implements PayloadCodec { + private readonly encryptionKey: Uint8Array; + + constructor(key: Uint8Array) { + this.encryptionKey = key; + } + + async encode(payloads: Payload[]): Promise { + return Promise.all( + payloads.map(async (payload) => ({ + metadata: { + encoding: 'binary/encrypted', + }, + data: await this.encrypt(payload.data ?? new Uint8Array()), + })) + ); + } + + async decode(payloads: Payload[]): Promise { + return Promise.all( + payloads.map(async (payload) => { + if (payload.metadata?.encoding === 'binary/encrypted') { + return { + ...payload, + data: await this.decrypt(payload.data ?? new Uint8Array()), + }; + } + return payload; + }) + ); + } + + private async encrypt(data: Uint8Array): Promise { + // Implement encryption (e.g., using Web Crypto API) + return data; + } + + private async decrypt(data: Uint8Array): Promise { + // Implement decryption + return data; + } +} + +// Apply codec +const dataConverter: DataConverter = { + payloadCodecs: [new EncryptionCodec(encryptionKey)], +}; +``` + +## Search Attributes + +Custom searchable fields for workflow visibility. 
+ +### Setting Search Attributes at Start + +```typescript +import { Client } from '@temporalio/client'; + +const client = new Client(); + +await client.workflow.start('orderWorkflow', { + taskQueue: 'orders', + workflowId: `order-${orderId}`, + args: [order], + searchAttributes: { + OrderId: [orderId], + CustomerType: ['premium'], + OrderTotal: [99.99], + CreatedAt: [new Date()], + }, +}); +``` + +### Upserting Search Attributes from Workflow + +```typescript +import { upsertSearchAttributes, workflowInfo } from '@temporalio/workflow'; + +export async function orderWorkflow(order: Order): Promise { + // Update status as workflow progresses + upsertSearchAttributes({ + OrderStatus: ['processing'], + }); + + await processOrder(order); + + upsertSearchAttributes({ + OrderStatus: ['completed'], + }); + + return 'done'; +} +``` + +### Reading Search Attributes + +```typescript +import { workflowInfo } from '@temporalio/workflow'; + +export async function orderWorkflow(): Promise { + const info = workflowInfo(); + const searchAttrs = info.searchAttributes; + const orderId = searchAttrs?.OrderId?.[0]; + // ... +} +``` + +### Querying Workflows by Search Attributes + +```typescript +const client = new Client(); + +// List workflows using search attributes +for await (const workflow of client.workflow.list({ + query: 'OrderStatus = "processing" AND CustomerType = "premium"', +})) { + console.log(`Workflow ${workflow.workflowId} is still processing`); +} +``` + +## Workflow Memo + +Store arbitrary metadata with workflows (not searchable). 
+ +```typescript +// Set memo at workflow start +await client.workflow.start('orderWorkflow', { + taskQueue: 'orders', + workflowId: `order-${orderId}`, + args: [order], + memo: { + customerName: order.customerName, + notes: 'Priority customer', + }, +}); + +// Read memo from workflow +import { workflowInfo } from '@temporalio/workflow'; + +export async function orderWorkflow(): Promise { + const info = workflowInfo(); + const customerName = info.memo?.customerName; + // ... +} +``` + +## Best Practices + +1. Keep payloads small—see `references/core/gotchas.md` for limits +2. Use search attributes for business-level visibility and filtering +3. Encrypt sensitive data with PayloadCodec +4. Use memo for non-searchable metadata +5. Configure the same data converter on both client and worker diff --git a/references/typescript/determinism-protection.md b/references/typescript/determinism-protection.md new file mode 100644 index 0000000..54303ba --- /dev/null +++ b/references/typescript/determinism-protection.md @@ -0,0 +1,56 @@ +# TypeScript Workflow V8 Sandboxing + +## Overview + +The TypeScript SDK runs workflows in a V8 sandbox that provides automatic protection against non-deterministic operations, and replaces common non-deterministic function calls with deterministic variants. + +## Import Blocking + +The sandbox blocks imports of `fs`, `https` modules, and any Node/DOM APIs. Otherwise, workflow code can import any package as long as it does not reference Node.js or DOM APIs. 
+ +**Note**: If you must use a library that references a Node.js or DOM API and you are certain that those APIs are not used at runtime, add that module to the `ignoreModules` list: + +```ts +const worker = await Worker.create({ + workflowsPath: require.resolve('./workflows'), // bundlerOptions only apply with workflowsPath + activities: require('./activities'), + taskQueue: 'my-task-queue', + bundlerOptions: { + // These modules may be imported (directly or transitively), + // but will be excluded from the Workflow bundle. + ignoreModules: ['fs', 'http', 'crypto'], + }, +}); +``` + +**Important**: Excluded modules are completely unavailable at runtime. Any attempt to call functions from these modules will throw an error. Only exclude modules when you are certain the code paths using them will never execute during workflow execution. + +**Note**: Modules with the `node:` prefix (e.g., `node:fs`) require additional webpack configuration to ignore. You may need to configure the bundler's `externals` or use webpack `resolve.alias` to handle these imports. + +Use `ignoreModules` with *extreme caution*. + + +## Function Replacement + +Functions like `Math.random()`, `Date`, and `setTimeout()` are replaced by deterministic versions. + +Date-related functions return the timestamp at which the current workflow task was initially executed. That timestamp remains the same when the workflow task is replayed, and only advances when a durable operation occurs (like `sleep()`). For example: + +```ts +import { sleep } from '@temporalio/workflow'; + +// this prints the *exact* same timestamp repeatedly +for (let x = 0; x < 10; ++x) { + console.log(Date.now()); +} + +// this prints timestamps increasing roughly 1s each iteration +for (let x = 0; x < 10; ++x) { + await sleep('1 second'); + console.log(Date.now()); +} +``` + +Generally, this is the behavior you want. + +Additionally, `FinalizationRegistry` and `WeakRef` are removed because V8's garbage collector is not deterministic. 
diff --git a/references/typescript/determinism.md b/references/typescript/determinism.md new file mode 100644 index 0000000..47f8948 --- /dev/null +++ b/references/typescript/determinism.md @@ -0,0 +1,51 @@ +# TypeScript SDK Determinism + +## Overview + +The TypeScript SDK runs workflows in an isolated V8 sandbox that automatically provides determinism. + +## Why Determinism Matters + +Temporal provides durable execution through **History Replay**. When a Worker needs to restore workflow state (after a crash, cache eviction, or to continue after a long timer), it re-executes the workflow code from the beginning, which requires the workflow code to be **deterministic**. + +## Temporal's V8 Sandbox + +The Temporal TypeScript SDK executes all workflow code in a sandbox, which (among other things) replaces common non-deterministic functions with deterministic variants. As an example, consider the code below: + +```ts +export async function myWorkflow(): Promise { + await importData(); + + if (Math.random() > 0.5) { + await sleep('30 minutes'); + } + + return await sendReport(); +} +``` + +The Temporal workflow sandbox will use the same random seed when replaying a workflow, so the above code will **deterministically** generate pseudo-random numbers. For UUIDs, use `uuid4()` from `@temporalio/workflow`, which also uses the seeded PRNG. + +See `references/typescript/determinism-protection.md` for more information about the sandbox. + +## Forbidden Operations + +```typescript +// DO NOT do these in workflows: +import fs from 'fs'; // Node.js modules +fetch('https://...'); // Network I/O +``` + +Most non-determinism and side effects, such as the above, should be wrapped in Activities. + +## Testing Replay Compatibility + +Use `Worker.runReplayHistory()` to verify your code changes are compatible with existing histories. See the Workflow Replay Testing section of `references/typescript/testing.md`. + +## Best Practices + +1. 
Use type-only imports for activities in workflow files +2. Match all @temporalio package versions +3. Prefer `sleep()` from workflow package — `setTimeout` works but `sleep()` handles cancellation scopes more clearly +4. Keep workflows focused on orchestration +5. Test with replay to verify determinism diff --git a/references/typescript/error-handling.md b/references/typescript/error-handling.md new file mode 100644 index 0000000..7072fbd --- /dev/null +++ b/references/typescript/error-handling.md @@ -0,0 +1,119 @@ +# TypeScript SDK Error Handling + +## Overview + +The TypeScript SDK uses `ApplicationFailure` for application errors with support for non-retryable marking. + +## Application Failures + +```typescript +import { ApplicationFailure } from '@temporalio/workflow'; + +export async function myWorkflow(): Promise<void> { + throw ApplicationFailure.create({ + message: 'Invalid input', + type: 'ValidationError', + nonRetryable: true, + }); +} +``` + +## Activity Errors + +```typescript +import { ApplicationFailure } from '@temporalio/activity'; + +export async function validateActivity(input: string): Promise<void> { + if (!isValid(input)) { + throw ApplicationFailure.create({ + message: `Invalid input: ${input}`, + type: 'ValidationError', + nonRetryable: true, + }); + } +} +``` + +## Handling Errors in Workflows + +```typescript +import { proxyActivities, ActivityFailure, ApplicationFailure, log } from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { riskyActivity } = proxyActivities<typeof activities>({ + startToCloseTimeout: '5 minutes', +}); + +export async function workflowWithErrorHandling(): Promise<string> { + try { + return await riskyActivity(); + } catch (err) { + if (err instanceof ActivityFailure && err.cause instanceof ApplicationFailure) { + log.warn('Activity failed', { type: err.cause.type, message: err.cause.message }); + } + throw err; + } +} +``` + +## Retry Configuration + +```typescript +const { myActivity } = proxyActivities<typeof activities>({ + startToCloseTimeout: '10 minutes', + retry: { + initialInterval: '1s', + 
backoffCoefficient: 2, + maximumInterval: '1m', + maximumAttempts: 5, + nonRetryableErrorTypes: ['ValidationError', 'PaymentError'], + }, +}); +``` + +**Note:** Only set retry options if you have a domain-specific reason to. The defaults are suitable for most use cases. + +## Timeout Configuration + +```typescript +const { myActivity } = proxyActivities<typeof activities>({ + startToCloseTimeout: '5 minutes', // Single attempt + scheduleToCloseTimeout: '30 minutes', // Including retries + heartbeatTimeout: '30 seconds', // Between heartbeats +}); +``` + +## Workflow Failure + +Workflows can throw errors to indicate failure: + +```typescript +import { ApplicationFailure } from '@temporalio/workflow'; + +export async function myWorkflow(): Promise<string> { + if (someCondition) { + throw ApplicationFailure.create({ + message: 'Workflow failed due to invalid state', + type: 'InvalidStateError', + }); + } + return 'success'; +} +``` + +**Warning:** Do NOT use `nonRetryable: true` for workflow failures in most cases. Unlike activities, workflows are not retried by default; a workflow is retried only if the caller sets a retry policy when starting it. Use `nonRetryable` only for errors that are truly unrecoverable (e.g., invalid input that will never be valid). + +## Idempotency + +For idempotency patterns (using keys, making activities granular), see `references/core/patterns.md`. + +## Best Practices + +1. Use specific error types for different failure modes +2. Set `nonRetryable: true` for permanent failures in activities +3. Configure `nonRetryableErrorTypes` in the retry policy +4. Log errors before rethrowing +5. When catching activity failures in workflows, inspect the `cause` of the `ActivityFailure` for an `ApplicationFailure` +6. 
Use the appropriate `log` import for your context: + - In workflows: `import { log } from '@temporalio/workflow'` (replay-safe) + - In activities: `import { log } from '@temporalio/activity'` diff --git a/references/typescript/gotchas.md b/references/typescript/gotchas.md new file mode 100644 index 0000000..d234f74 --- /dev/null +++ b/references/typescript/gotchas.md @@ -0,0 +1,312 @@ +# TypeScript Gotchas + +TypeScript-specific mistakes and anti-patterns. See also [Common Gotchas](../core/gotchas.md) for language-agnostic concepts. + +## Activity Imports + +### Importing Implementations Instead of Types + +**The Problem**: Importing activity implementations brings Node.js code into the V8 workflow sandbox, causing bundling errors or runtime failures. + +```typescript +// BAD - Brings actual code into workflow sandbox +import * as activities from './activities'; + +const { greet } = proxyActivities({ + startToCloseTimeout: '1 minute', +}); + +// GOOD - Type-only import +import type * as activities from './activities'; + +const { greet } = proxyActivities({ + startToCloseTimeout: '1 minute', +}); +``` + +### Importing Node.js Modules in Workflows + +```typescript +// BAD - fs is not available in workflow sandbox +import * as fs from 'fs'; + +export async function myWorkflow(): Promise { + const data = fs.readFileSync('file.txt'); // Will fail! +} + +// GOOD - File I/O belongs in activities +export async function myWorkflow(): Promise { + const data = await activities.readFile('file.txt'); +} +``` + +## Bundling Issues + +### Using workflowsPath in Production + +`workflowsPath` runs the bundler at Worker startup, which is slow and not suitable for production. Use `workflowBundle` with pre-bundled code instead. + +```typescript +// OK for development/testing, BAD for production - bundles at startup +const worker = await Worker.create({ + workflowsPath: require.resolve('./workflows'), + // ... 
+}); + +// GOOD for production - use pre-bundled code +import { bundleWorkflowCode } from '@temporalio/worker'; + +// Build step (run once at build time) +const bundle = await bundleWorkflowCode({ + workflowsPath: require.resolve('./workflows'), +}); +await fs.promises.writeFile('./workflow-bundle.js', bundle.code); + +// Worker startup (fast, no bundling) +const worker = await Worker.create({ + workflowBundle: { + codePath: require.resolve('./workflow-bundle.js'), + }, + // ... +}); +``` + +### Missing Dependencies in Workflow Bundle + +```typescript +// If using external packages in workflows, ensure they're bundled + +// worker.ts +const worker = await Worker.create({ + workflowsPath: require.resolve('./workflows'), + bundlerOptions: { + // Exclude Node.js-only packages that cause bundling errors + // WARNING: Modules listed here will be completely unavailable + // at workflow runtime - any imports will fail + ignoreModules: ['some-node-only-package'], + }, +}); +``` + +### Package Version Mismatches + +All `@temporalio/*` packages must have the same version. This can be verified by running `npm ls` or the appropriate command for your package manager. + +### Package Version Constraints - Prod vs. Non-Prod + +For production apps, you should use ~ version constraints (bug fixes only) on Temporal packages. For non-production apps, you may use ^ constraints (the npm default) instead. + +## Wrong Retry Classification + +A common mistake is treating transient errors as permanent (or vice versa): + +- **Transient errors** (retry): network timeouts, temporary service unavailability, rate limits +- **Permanent errors** (don't retry): invalid input, authentication failure, resource not found + +```typescript +// BAD: Retrying a permanent error +throw ApplicationFailure.create({ message: 'User not found' }); +// This will retry indefinitely! 
+ +// GOOD: Mark permanent errors as non-retryable +throw ApplicationFailure.nonRetryable('User not found'); +``` + +For detailed guidance on error classification and retry policies, see `error-handling.md`. + +## Cancellation + +### Not Handling Workflow Cancellation + +```typescript +// BAD - Cleanup doesn't run on cancellation +export async function workflowWithCleanup(): Promise { + await activities.acquireResource(); + await activities.doWork(); + await activities.releaseResource(); // Never runs if cancelled! +} + +// GOOD - Use CancellationScope for cleanup +import { CancellationScope } from '@temporalio/workflow'; + +export async function workflowWithCleanup(): Promise { + await activities.acquireResource(); + try { + await activities.doWork(); + } finally { + // Run cleanup even on cancellation + await CancellationScope.nonCancellable(async () => { + await activities.releaseResource(); + }); + } +} +``` + +### Not Handling Activity Cancellation + +Activities must **opt in** to receive cancellation. This requires: +1. **Heartbeating** - Cancellation is delivered via heartbeat +2. 
**Checking for cancellation** - Either await `Context.current().cancelled` or use `cancellationSignal()` + +```typescript +// BAD - Activity ignores cancellation +export async function longActivity(): Promise { + await doExpensiveWork(); // Runs to completion even if cancelled +} +``` + +```typescript +// GOOD - Heartbeat in background and race work against cancellation promise +import { Context, CancelledFailure } from '@temporalio/activity'; + +export async function longActivity(): Promise { + // Heartbeat in background so cancellation can be delivered + let heartbeatEnabled = true; + (async () => { + while (heartbeatEnabled) { + await Context.current().sleep(5000); + Context.current().heartbeat(); + } + })().catch(() => {}); + + try { + await Promise.race([ + Context.current().cancelled, // Rejects with CancelledFailure + doExpensiveWork(), + ]); + } catch (err) { + if (err instanceof CancelledFailure) { + await cleanup(); + } + throw err; + } finally { + heartbeatEnabled = false; + } +} +``` + +```typescript +// GOOD - Use AbortSignal with libraries that support it +import fetch from 'node-fetch'; +import { cancellationSignal, heartbeat } from '@temporalio/activity'; +import type { AbortSignal as FetchAbortSignal } from 'node-fetch/externals'; + +export async function cancellableFetch(url: string): Promise { + const response = await fetch(url, { signal: cancellationSignal() as FetchAbortSignal }); + + const contentLength = parseInt(response.headers.get('Content-Length')!); + let bytesRead = 0; + const chunks: Buffer[] = []; + + for await (const chunk of response.body) { + if (!(chunk instanceof Buffer)) throw new TypeError('Expected Buffer'); + bytesRead += chunk.length; + chunks.push(chunk); + heartbeat(bytesRead / contentLength); // Heartbeat to keep cancellation delivery alive + } + return Buffer.concat(chunks); +} +``` + +**Note:** `Promise.race` doesn't stop the losing promise—it continues running. 
Use `cancellationSignal()` or explicitly abort sub-operations when cleanup requires stopping in-flight work. + +## Heartbeating + +### Forgetting to Heartbeat Long Activities + +```typescript +// BAD - No heartbeat, can't detect stuck activities +export async function processLargeFile(path: string): Promise { + for await (const chunk of readChunks(path)) { + await processChunk(chunk); // Takes hours, no heartbeat + } +} + +// GOOD - Regular heartbeats with progress +import { heartbeat } from '@temporalio/activity'; + +export async function processLargeFile(path: string): Promise { + let i = 0; + for await (const chunk of readChunks(path)) { + heartbeat(`Processing chunk ${i++}`); + await processChunk(chunk); + } +} +``` + +### Heartbeat Timeout Too Short + +```typescript +// BAD - Heartbeat timeout shorter than processing time +const { processChunk } = proxyActivities({ + startToCloseTimeout: '30 minutes', + heartbeatTimeout: '10 seconds', // Too short! +}); + +// GOOD - Heartbeat timeout allows for processing variance +const { processChunk } = proxyActivities({ + startToCloseTimeout: '30 minutes', + heartbeatTimeout: '2 minutes', +}); +``` + +Set heartbeat timeout as high as acceptable for your use case — each heartbeat counts as an action. 
+ +## Testing + +### Not Testing Failures + +```typescript +import { TestWorkflowEnvironment } from '@temporalio/testing'; +import { Worker } from '@temporalio/worker'; + +test('handles activity failure', async () => { + const env = await TestWorkflowEnvironment.createTimeSkipping(); + + const worker = await Worker.create({ + connection: env.nativeConnection, + taskQueue: 'test', + workflowsPath: require.resolve('./workflows'), + activities: { + // Activity that always fails + riskyOperation: async () => { + throw ApplicationFailure.nonRetryable('Simulated failure'); + }, + }, + }); + + await worker.runUntil(async () => { + await expect( + env.client.workflow.execute(riskyWorkflow, { + workflowId: 'test-failure', + taskQueue: 'test', + }) + ).rejects.toThrow('Simulated failure'); + }); + + await env.teardown(); +}); +``` + +### Not Testing Replay + +```typescript +import { Worker } from '@temporalio/worker'; +import * as fs from 'fs'; + +test('replay compatibility', async () => { + const history = JSON.parse(await fs.promises.readFile('./fixtures/workflow_history.json', 'utf8')); + + // Fails if current code is incompatible with history + await Worker.runReplayHistory( + { + workflowsPath: require.resolve('./workflows'), + }, + history, + ); +}); +``` + +## Timers and Sleep + +`setTimeout` works in workflows (the SDK mocks it), but `sleep()` from `@temporalio/workflow` is preferred because its interaction with cancellation scopes is more intuitive. See Timers in `references/typescript/patterns.md`. diff --git a/references/typescript/observability.md b/references/typescript/observability.md new file mode 100644 index 0000000..10244d7 --- /dev/null +++ b/references/typescript/observability.md @@ -0,0 +1,109 @@ +# TypeScript SDK Observability + +## Overview + +The TypeScript SDK provides replay-aware logging, metrics, and integrations for production observability. 
+ +## Replay-Aware Logging + +Temporal's logger automatically suppresses duplicate messages during replay, preventing log spam when workflows recover state. + +### Workflow Logging + +Workflows run in a sandboxed environment and cannot use regular Node.js loggers directly. Since SDK 1.8.0, the `@temporalio/workflow` package exports a `log` object that provides replay-aware logging. Internally, it uses Sinks to funnel messages to the Runtime's logger. + +```typescript +import { log } from '@temporalio/workflow'; + +export async function orderWorkflow(orderId: string): Promise { + log.info('Processing order', { orderId }); + + const result = await processPayment(orderId); + log.debug('Payment processed', { orderId, result }); + + return result; +} +``` + +**Log levels**: `log.debug()`, `log.info()`, `log.warn()`, `log.error()` + +The workflow logger automatically suppresses duplicate messages during replay and includes workflow context metadata (workflowId, runId, etc.) on every log entry. + +### Activity Logging + +```typescript +import { log } from '@temporalio/activity'; + +export async function processPayment(orderId: string): Promise { + log.info('Processing payment', { orderId }); + return 'payment-id-123'; +} +``` + +The activity logger adds contextual metadata (activity ID, type, namespace) and funnels messages to the runtime's logger for consistent collection. 
+ +## Customizing the Logger + +### Basic Configuration + +```typescript +import { DefaultLogger, Runtime } from '@temporalio/worker'; + +const logger = new DefaultLogger('DEBUG', ({ level, message }) => { + console.log(`Custom logger: ${level} - ${message}`); +}); +Runtime.install({ logger }); +``` + +### Winston Integration + +```typescript +import winston from 'winston'; +import { DefaultLogger, Runtime } from '@temporalio/worker'; + +const winstonLogger = winston.createLogger({ + level: 'debug', + format: winston.format.json(), + transports: [ + new winston.transports.File({ filename: 'temporal.log' }) + ], +}); + +const logger = new DefaultLogger('DEBUG', (entry) => { + winstonLogger.log({ + label: entry.meta?.activityId ? 'activity' : entry.meta?.workflowId ? 'workflow' : 'worker', + level: entry.level.toLowerCase(), + message: entry.message, + timestamp: Number(entry.timestampNanos / 1_000_000n), + ...entry.meta, + }); +}); + +Runtime.install({ logger }); +``` + +## Metrics + +### Prometheus Metrics + +```typescript +import { Runtime } from '@temporalio/worker'; + +Runtime.install({ + telemetryOptions: { + metrics: { + prometheus: { + bindAddress: '127.0.0.1:9091', + }, + }, + }, +}); +``` + +## Best Practices + +1. Use `log` from `@temporalio/workflow` for production observability. For temporary print debugging, `console.log()` is fine—it's direct and immediate, whereas `log` goes through sinks which may lose messages on workflow errors +2. Include correlation IDs (orderId, customerId) in log messages +3. Configure Winston or similar for production log aggregation +4. Monitor Prometheus metrics for worker health +5. 
Use Event History for debugging workflow issues diff --git a/references/typescript/patterns.md b/references/typescript/patterns.md new file mode 100644 index 0000000..3d59e23 --- /dev/null +++ b/references/typescript/patterns.md @@ -0,0 +1,417 @@ +# TypeScript SDK Patterns + +## Signals + +```typescript +import { defineSignal, setHandler, condition } from '@temporalio/workflow'; + +const approveSignal = defineSignal<[boolean]>('approve'); +const addItemSignal = defineSignal<[string]>('addItem'); + +export async function orderWorkflow(): Promise<string> { + let approved = false; + const items: string[] = []; + + setHandler(approveSignal, (value) => { + approved = value; + }); + + setHandler(addItemSignal, (item) => { + items.push(item); + }); + + await condition(() => approved); + return `Processed ${items.length} items`; +} +``` + +## Dynamic Signal Handlers + +For handling signals with names not known at compile time. Use cases for this pattern are rare — most workflows should use statically defined signal handlers. + +```typescript +import { setDefaultSignalHandler, condition } from '@temporalio/workflow'; + +export async function dynamicSignalWorkflow(): Promise<Record<string, unknown[]>> { + const signals: Record<string, unknown[]> = {}; + + setDefaultSignalHandler((signalName: string, ...args: unknown[]) => { + if (!signals[signalName]) { + signals[signalName] = []; + } + signals[signalName].push(args); + }); + + await condition(() => signals['done'] !== undefined); + return signals; +} +``` + +## Queries + +**Important:** Queries must NOT modify workflow state or have side effects.
+ +```typescript +import { defineQuery, setHandler } from '@temporalio/workflow'; + +const statusQuery = defineQuery('status'); +const progressQuery = defineQuery('progress'); + +export async function progressWorkflow(): Promise { + let status = 'running'; + let progress = 0; + + setHandler(statusQuery, () => status); + setHandler(progressQuery, () => progress); + + for (let i = 0; i < 100; i++) { + progress = i; + await doWork(); + } + status = 'completed'; +} +``` + +## Dynamic Query Handlers + +For handling queries with names not known at compile time. Use cases for this pattern are rare — most workflows should use statically defined query handlers. + +```typescript +import { setDefaultQueryHandler } from '@temporalio/workflow'; + +export async function dynamicQueryWorkflow(): Promise { + const state: Record = { + status: 'running', + progress: 0, + }; + + setDefaultQueryHandler((queryName: string) => { + return state[queryName]; + }); + + // ... workflow logic +} +``` + +## Updates + +```typescript +import { defineUpdate, setHandler, condition } from '@temporalio/workflow'; + +// Define the update - specify return type and argument types +export const addItemUpdate = defineUpdate('addItem'); +export const addItemValidatedUpdate = defineUpdate('addItemValidated'); + +export async function orderWorkflow(): Promise { + const items: string[] = []; + let completed = false; + + // Simple update handler - returns new item count + setHandler(addItemUpdate, (item: string) => { + items.push(item); + return items.length; + }); + + // Update handler with validator - rejects invalid input before execution + setHandler( + addItemValidatedUpdate, + (item: string) => { + items.push(item); + return items.length; + }, + { + validator: (item: string) => { + if (!item) throw new Error('Item cannot be empty'); + if (items.length >= 100) throw new Error('Order is full'); + }, + } + ); + + await condition(() => completed); + return `Order with ${items.length} items completed`; +} 
+``` + +**Important:** Validators must NOT mutate workflow state or do anything blocking (no activities, sleeps, or other commands). They are read-only, similar to query handlers. Throw an error to reject the update; return normally to accept. + +## Child Workflows + +```typescript +import { executeChild } from '@temporalio/workflow'; + +export async function parentWorkflow(orders: Order[]): Promise { + const results: string[] = []; + + for (const order of orders) { + const result = await executeChild(processOrderWorkflow, { + args: [order], + workflowId: `order-${order.id}`, + }); + results.push(result); + } + + return results; +} +``` + +### Child Workflow Options + +```typescript +import { executeChild, ParentClosePolicy, ChildWorkflowCancellationType } from '@temporalio/workflow'; + +const result = await executeChild(childWorkflow, { + args: [input], + workflowId: `child-${workflowInfo().workflowId}`, + + // ParentClosePolicy - what happens to child when parent closes + // TERMINATE (default), ABANDON, REQUEST_CANCEL + parentClosePolicy: ParentClosePolicy.TERMINATE, + + // ChildWorkflowCancellationType - how cancellation is handled + // WAIT_CANCELLATION_COMPLETED (default), WAIT_CANCELLATION_REQUESTED, TRY_CANCEL, ABANDON + cancellationType: ChildWorkflowCancellationType.WAIT_CANCELLATION_COMPLETED, +}); +``` + +## Handles to External Workflows + +```typescript +import { getExternalWorkflowHandle } from '@temporalio/workflow'; +import { mySignal } from './other-workflows'; + +export async function coordinatorWorkflow(targetWorkflowId: string): Promise { + const handle = getExternalWorkflowHandle(targetWorkflowId); + + // Signal the external workflow + await handle.signal(mySignal, { data: 'payload' }); + + // Or cancel it + await handle.cancel(); +} +``` + +## Parallel Execution + +```typescript +export async function parallelWorkflow(items: string[]): Promise { + return await Promise.all( + items.map((item) => processItem(item)) + ); +} +``` + +## 
Continue-as-New + +```typescript +import { continueAsNew, workflowInfo } from '@temporalio/workflow'; + +export async function longRunningWorkflow(state: State): Promise { + while (true) { + state = await processNextBatch(state); + + if (state.isComplete) { + return 'done'; + } + + const info = workflowInfo(); + if (info.continueAsNewSuggested || info.historyLength > 10000) { + await continueAsNew(state); + } + } +} +``` + +## Saga Pattern + +**Important:** Compensation activities should be idempotent. + +```typescript +import { CancellationScope, log } from '@temporalio/workflow'; + +export async function sagaWorkflow(order: Order): Promise { + const compensations: Array<() => Promise> = []; + + try { + // IMPORTANT: Save compensation BEFORE calling the activity + // If activity fails after completing but before returning, + // compensation must still be registered + compensations.push(() => releaseInventory(order)); + await reserveInventory(order); + + compensations.push(() => refundPayment(order)); + await chargePayment(order); + + await shipOrder(order); + return 'Order completed'; + } catch (err) { + // nonCancellable ensures compensations run even if the workflow is cancelled + await CancellationScope.nonCancellable(async () => { + for (const compensate of compensations.reverse()) { + try { + await compensate(); + } catch (compErr) { + log.warn('Compensation failed', { error: compErr }); + } + } + }); + throw err; + } +} +``` + +## Cancellation Scopes + +Cancellation scopes control how cancellation propagates to activities and child workflows. Use them for cleanup logic, timeouts, and manual cancellation. 
+ +```typescript +import { CancellationScope, sleep } from '@temporalio/workflow'; + +export async function scopedWorkflow(): Promise { + // Non-cancellable scope - runs even if workflow cancelled + await CancellationScope.nonCancellable(async () => { + await cleanupActivity(); + }); + + // Timeout scope + await CancellationScope.withTimeout('5 minutes', async () => { + await longRunningActivity(); + }); + + // Manual cancellation + const scope = new CancellationScope(); + const promise = scope.run(() => someActivity()); + scope.cancel(); +} +``` + +## Triggers (Promise-like Signals) + +**WHY**: Triggers provide a one-shot promise that resolves when a signal is received. Cleaner than condition() for single-value signals. + +**WHEN to use**: +- Waiting for a single response (approval, completion notification) +- Converting signal-based events into awaitable promises + +```typescript +import { Trigger } from '@temporalio/workflow'; + +export async function triggerWorkflow(): Promise { + const approvalTrigger = new Trigger(); + + setHandler(approveSignal, (approved) => { + approvalTrigger.resolve(approved); + }); + + const approved = await approvalTrigger; + return approved ? 'Approved' : 'Rejected'; +} +``` + +## Wait Condition with Timeout + +```typescript +import { condition, CancelledFailure } from '@temporalio/workflow'; + +export async function approvalWorkflow(): Promise { + let approved = false; + + setHandler(approveSignal, () => { + approved = true; + }); + + // Wait for approval with 24-hour timeout + const gotApproval = await condition(() => approved, '24 hours'); + + if (gotApproval) { + return 'approved'; + } else { + return 'auto-rejected due to timeout'; + } +} +``` + +## Waiting for All Handlers to Finish + +Signal and update handlers should generally be non-async (avoid running activities from them). Otherwise, the workflow may complete before handlers finish their execution. 
However, making handlers non-async sometimes requires workarounds that add complexity. + +When async handlers are necessary, use `condition(allHandlersFinished)` at the end of your workflow (or before continue-as-new) to prevent completion until all pending handlers complete. + +```typescript +import { condition, allHandlersFinished } from '@temporalio/workflow'; + +export async function handlerAwareWorkflow(): Promise { + // ... main workflow logic ... + + // Before exiting, wait for all handlers to finish + await condition(allHandlersFinished); + return 'done'; +} +``` + +## Activity Heartbeat Details + +### WHY: +- **Support activity cancellation** - Cancellations are delivered via heartbeat; activities that don't heartbeat won't know they've been cancelled +- **Resume progress after worker failure** - Heartbeat details persist across retries + +### WHEN: +- **Cancellable activities** - Any activity that should respond to cancellation +- **Long-running activities** - Track progress for resumability +- **Checkpointing** - Save progress periodically + +```typescript +import { heartbeat, activityInfo, CancelledFailure } from '@temporalio/activity'; + +export async function processLargeFile(filePath: string): Promise { + const info = activityInfo(); + // Get heartbeat details from previous attempt (if any) + const startLine: number = info.heartbeatDetails ?? 
0; + + const lines = await readFileLines(filePath); + + try { + for (let i = startLine; i < lines.length; i++) { + await processLine(lines[i]); + // Heartbeat with progress (heartbeating is also what lets the server deliver cancellation) + // Note: heartbeat() itself never throws; cancellation surfaces via Context.current().cancelled, sleep(), or cancellationSignal() + heartbeat(i + 1); + } + return 'completed'; + } catch (e) { + if (e instanceof CancelledFailure) { + // Perform cleanup on cancellation + await cleanup(); + } + throw e; + } +} +``` + +## Timers + +```typescript +import { sleep } from '@temporalio/workflow'; + +export async function timerWorkflow(): Promise { + await sleep('1 hour'); + return 'Timer fired'; +} +``` + +## Local Activities + +**Purpose**: Reduce latency for short, lightweight operations by skipping the task queue. ONLY use these when necessary for performance. Do NOT use these by default, as they are not durable and distributed. + +```typescript +import { proxyLocalActivities } from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { quickLookup } = proxyLocalActivities({ + startToCloseTimeout: '5 seconds', +}); + +export async function localActivityWorkflow(): Promise { + const result = await quickLookup('key'); + return result; +} +``` diff --git a/references/typescript/testing.md b/references/typescript/testing.md new file mode 100644 index 0000000..e945ed8 --- /dev/null +++ b/references/typescript/testing.md @@ -0,0 +1,222 @@ +# TypeScript SDK Testing + +## Overview + +The TypeScript SDK provides `TestWorkflowEnvironment` for testing workflows with time-skipping and activity mocking support. Use `createTimeSkipping()` for automatic time advancement when testing workflows with timers, or `createLocal()` for a full local server without time-skipping. + +**Note:** Prefer to use `createLocal()` for full-featured support. Only use `createTimeSkipping()` if you genuinely need time skipping for testing your workflow.
+ +## Test Environment Setup + +```typescript +import { TestWorkflowEnvironment } from '@temporalio/testing'; +import { Worker } from '@temporalio/worker'; + +describe('Workflow', () => { + let testEnv: TestWorkflowEnvironment; + + before(async () => { + testEnv = await TestWorkflowEnvironment.createLocal(); + }); + + after(async () => { + await testEnv?.teardown(); + }); + + it('runs workflow', async () => { + const { client, nativeConnection } = testEnv; + + const worker = await Worker.create({ + connection: nativeConnection, + taskQueue: 'test', + workflowsPath: require.resolve('./workflows'), + activities: require('./activities'), + }); + + await worker.runUntil(async () => { + const result = await client.workflow.execute(greetingWorkflow, { + taskQueue: 'test', + workflowId: 'test-workflow', + args: ['World'], + }); + expect(result).toEqual('Hello, World!'); + }); + }); +}); +``` + +## Activity Mocking + +```typescript +const worker = await Worker.create({ + connection: nativeConnection, + taskQueue: 'test', + workflowsPath: require.resolve('./workflows'), + activities: { + // Mock activity implementation + greet: async (name: string) => `Mocked: ${name}`, + }, +}); +``` + +## Testing Signals and Queries + +```typescript +import { defineQuery, defineSignal } from '@temporalio/workflow'; + +// Define query and signal (typically in a shared file) +const getStatusQuery = defineQuery('getStatus'); +const approveSignal = defineSignal('approve'); + +it('handles signals and queries', async () => { + await worker.runUntil(async () => { + const handle = await client.workflow.start(approvalWorkflow, { + taskQueue: 'test', + workflowId: 'approval-test', + }); + + // Query current state + const status = await handle.query(getStatusQuery); + expect(status).toEqual('pending'); + + // Send signal + await handle.signal(approveSignal); + + // Wait for completion + const result = await handle.result(); + expect(result).toEqual('Approved!'); + }); +}); +``` + +## Testing Failure 
Cases + +Test that workflows handle errors correctly: + +```typescript +import { TestWorkflowEnvironment } from '@temporalio/testing'; +import { Worker } from '@temporalio/worker'; +import { WorkflowFailedError } from '@temporalio/client'; +import assert from 'assert'; + +describe('Failure handling', () => { + let testEnv: TestWorkflowEnvironment; + + before(async () => { + testEnv = await TestWorkflowEnvironment.createLocal(); + }); + + after(async () => { + await testEnv?.teardown(); + }); + + it('handles activity failure', async () => { + const { client, nativeConnection } = testEnv; + + const worker = await Worker.create({ + connection: nativeConnection, + taskQueue: 'test', + workflowsPath: require.resolve('./workflows'), + activities: { + // Mock activity that always fails + myActivity: async () => { + throw new Error('Activity failed'); + }, + }, + }); + + await worker.runUntil(async () => { + try { + await client.workflow.execute(myWorkflow, { + workflowId: 'test-failure', + taskQueue: 'test', + }); + assert.fail('Expected workflow to fail'); + } catch (err) { + assert(err instanceof WorkflowFailedError); + } + }); + }); +}); +``` + +## Replay Testing + +```typescript +import { Worker } from '@temporalio/worker'; +import { Client, Connection } from '@temporalio/client'; +import fs from 'fs'; + +describe('Replay', () => { + it('replays workflow history from JSON file', async () => { + // Load history from a JSON file (exported from Web UI or Temporal CLI) + const filePath = './history_file.json'; + const history = JSON.parse(await fs.promises.readFile(filePath, 'utf8')); + + await Worker.runReplayHistory( + { + workflowsPath: require.resolve('./workflows'), + }, + history, + 'my-workflow-id' // Optional: provide workflowId if your workflow depends on it + ); + }); + + it('replays workflow history from server', async () => { + // Fetch history programmatically using the client + const connection = await Connection.connect({ address: 'localhost:7233' }); + 
const client = new Client({ connection, namespace: 'default' }); + const handle = client.workflow.getHandle('my-workflow-id'); + const history = await handle.fetchHistory(); + + await Worker.runReplayHistory( + { + workflowsPath: require.resolve('./workflows'), + }, + history, + 'my-workflow-id' + ); + }); +}); +``` + +## Activity Testing + +Test activities in isolation without running a workflow: + +```typescript +import { MockActivityEnvironment } from '@temporalio/testing'; +import { CancelledFailure } from '@temporalio/activity'; +import { myActivity } from './activities'; +import assert from 'assert'; + +describe('Activity tests', () => { + it('completes successfully', async () => { + const env = new MockActivityEnvironment(); + const result = await env.run(myActivity, 'input'); + assert.equal(result, 'expected output'); + }); + + it('handles cancellation', async () => { + const env = new MockActivityEnvironment(); + // Cancel the activity after a short delay + setTimeout(() => env.cancel(), 100); + try { + await env.run(longRunningActivity, 'input'); + assert.fail('Expected cancellation'); + } catch (err) { + assert(err instanceof CancelledFailure); + } + }); +}); +``` + +**Note:** `MockActivityEnvironment` provides `heartbeat()` and cancellation support for testing activity behavior. + +## Best Practices + +1. Use time-skipping for workflows with timers +2. Mock external dependencies in activities +3. Test replay compatibility when changing workflow code +4. Use unique workflow IDs per test +5. Clean up test environment after tests diff --git a/references/typescript/typescript.md b/references/typescript/typescript.md new file mode 100644 index 0000000..9918ee7 --- /dev/null +++ b/references/typescript/typescript.md @@ -0,0 +1,172 @@ +# Temporal TypeScript SDK Reference + +## Overview + +The Temporal TypeScript SDK provides a modern, Promise-based approach to building durable workflows.
Workflows are bundled and run in an isolated runtime with automatic replacements for determinism protection. + +**CRITICAL**: All `@temporalio/*` packages must have the same version number. + +## Understanding Replay + +Temporal workflows are durable through history replay. For details on how this works, see `references/core/determinism.md`. + +## Quick Start + +**Add Dependencies:** Install the Temporal SDK packages (use the package manager appropriate for your project): +```bash +npm install @temporalio/client @temporalio/worker @temporalio/workflow @temporalio/activity +``` + +Note: if you are working in production, it is strongly advised to use ~ version constraints, i.e. `npm install ... --save-prefix='~'` if using NPM. + +**activities.ts** - Activity definitions (separate file to distinguish workflow vs activity code): +```typescript +export async function greet(name: string): Promise { + return `Hello, ${name}!`; +} +``` + +**workflows.ts** - Workflow definition (use type-only imports for activities): +```typescript +import { proxyActivities } from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { greet } = proxyActivities({ + startToCloseTimeout: '1 minute', +}); + +export async function greetingWorkflow(name: string): Promise { + return await greet(name); +} +``` + +**worker.ts** - Worker setup (imports activities and workflows, runs indefinitely): +```typescript +import { Worker } from '@temporalio/worker'; +import * as activities from './activities'; + +async function run() { + const worker = await Worker.create({ + workflowsPath: require.resolve('./workflows'), // For production, use workflowBundle instead + activities, + taskQueue: 'greeting-queue', + }); + await worker.run(); +} + +run().catch(console.error); +``` + +**Start the dev server:** Start `temporal server start-dev` in the background. + +**Start the worker:** Run `npx ts-node worker.ts` in the background. 
+ +**client.ts** - Start a workflow execution: +```typescript +import { Client } from '@temporalio/client'; +import { greetingWorkflow } from './workflows'; +import { v4 as uuid } from 'uuid'; + +async function run() { + const client = new Client(); + + const result = await client.workflow.execute(greetingWorkflow, { + workflowId: uuid(), + taskQueue: 'greeting-queue', + args: ['my name'], + }); + + console.log(`Result: ${result}`); +} + +run().catch(console.error); +``` + +**Run the workflow:** Run `npx ts-node client.ts`. Should output: `Result: Hello, my name!`. + +## Key Concepts + +### Workflow Definition +- Async functions exported from workflow file +- Use `proxyActivities()` with type-only imports +- Use `defineSignal()`, `defineQuery()`, `defineUpdate()`, `setHandler()` for handlers + +### Activity Definition +- Regular async functions +- Can perform I/O, network calls, etc. +- Use `heartbeat()` for long operations + +### Worker Setup +- Use `Worker.create()` with `workflowsPath` (dev) or `workflowBundle` (production) - see `references/typescript/gotchas.md` +- Import activities directly (not via proxy) + +## File Organization Best Practice + +**Keep Workflow definitions in separate files from Activity definitions.** The TypeScript SDK bundles workflow files separately. Minimizing workflow file contents improves Worker startup time. 
+ +``` +my_temporal_app/ +├── workflows/ +│ └── greeting.ts # Only Workflow functions +├── activities/ +│ └── translate.ts # Only Activity functions +├── worker.ts # Worker setup, imports both +└── client.ts # Client code to start workflows +``` + +**In the Workflow file, use type-only imports for activities:** +```typescript +// workflows/greeting.ts +import { proxyActivities } from '@temporalio/workflow'; +import type * as activities from '../activities/translate'; + +const { translate } = proxyActivities({ + startToCloseTimeout: '1 minute', +}); +``` + +## Determinism Rules + +The TypeScript SDK runs workflows in an isolated V8 sandbox. + +**Automatic replacements:** +- `Math.random()` → deterministic seeded PRNG +- `Date.now()` → deterministic workflow time (advances only between Workflow Tasks; not the wall clock) +- `setTimeout` → deterministic timer + +**Safe to use:** +- `sleep()` from `@temporalio/workflow` +- `condition()` for waiting +- Standard JavaScript operations + +See `references/typescript/determinism.md` for detailed rules. + +## Common Pitfalls + +1. **Importing activities without `type`** - Use `import type * as activities` +2. **Version mismatch** - All @temporalio packages must match +3. **Direct I/O in workflows** - Use activities for external calls +4. **Missing `proxyActivities`** - Required to call activities from workflows +5. **Forgetting to bundle workflows** - Worker needs `workflowsPath` or `workflowBundle` +6. **Using workflowsPath in production** - Use `workflowBundle` for production (see `references/typescript/gotchas.md`) +7. **Forgetting to heartbeat** - Long-running activities need `heartbeat()` calls +8. **Logging in workflows** - For observability, use `import { log } from '@temporalio/workflow'` (routes through sinks). For temporary print debugging, `console.log()` is fine—it's direct and immediate, whereas `log` may lose messages on workflow errors. +9.
**Forgetting to wait on activity calls** - Activity calls return Promises; you must eventually await them (directly or via `Promise.all()` for parallel execution) + +## Writing Tests + +See `references/typescript/testing.md` for info on writing tests. + +## Additional Resources + +### Reference Files +- **`references/typescript/patterns.md`** - Signals, queries, child workflows, saga pattern, etc. +- **`references/typescript/determinism.md`** - Essentials of determinism in TypeScript +- **`references/typescript/gotchas.md`** - TypeScript-specific mistakes and anti-patterns +- **`references/typescript/error-handling.md`** - ApplicationFailure, retry policies, non-retryable errors +- **`references/typescript/observability.md`** - Logging, metrics, tracing +- **`references/typescript/testing.md`** - TestWorkflowEnvironment, time-skipping, activity mocking +- **`references/typescript/advanced-features.md`** - Schedules, worker tuning, and more +- **`references/typescript/data-handling.md`** - Data converters, payload encryption, etc. +- **`references/typescript/versioning.md`** - Patching API, workflow type versioning, Worker Versioning +- **`references/typescript/determinism-protection.md`** - V8 sandbox and bundling diff --git a/references/typescript/versioning.md b/references/typescript/versioning.md new file mode 100644 index 0000000..a9f57a2 --- /dev/null +++ b/references/typescript/versioning.md @@ -0,0 +1,211 @@ +# TypeScript SDK Versioning + +For conceptual overview and guidance on choosing an approach, see `references/core/versioning.md`. + +## Patching API + +The Patching API lets you change Workflow Definitions without causing non-deterministic behavior in running Workflows. 
+ +### The patched() Function + +The `patched()` function takes a `patchId` string and returns a boolean: + +```typescript +import { patched } from '@temporalio/workflow'; + +export async function myWorkflow(): Promise { + if (patched('my-change-id')) { + // New code path + await newImplementation(); + } else { + // Old code path (for replay of existing executions) + await oldImplementation(); + } +} +``` + +**How it works:** +- If the Workflow is running for the first time, `patched()` returns `true` and inserts a marker into the Event History +- During replay, if the history contains a marker with the same `patchId`, `patched()` returns `true` +- During replay, if no matching marker exists, `patched()` returns `false` + +**TypeScript-specific behavior:** Unlike Python/.NET/Ruby, `patched()` is not memoized when it returns `false`. This means you can use `patched()` in loops. However, if a single patch requires coordinated behavioral changes at different points in your workflow, you may need to manually memoize the result: + +```typescript +const useNewBehavior = patched('my-change'); +// Use useNewBehavior at multiple points in workflow +``` + +### Three-Step Patching Process + +Patching is a three-step process for safely deploying changes. + +**Warning:** Failing to follow this process correctly will result in non-determinism errors for in-flight workflows. 
+ +#### Step 1: Patch in New Code + +Add the patch alongside the old code: + +```typescript +import { patched } from '@temporalio/workflow'; + +// Original code sent fax notifications +export async function shippingConfirmation(): Promise { + if (patched('changedNotificationType')) { + await sendEmail(); // New code + } else { + await sendFax(); // Old code for replay + } + await sleep('1 day'); +} +``` + +#### Step 2: Deprecate the Patch + +Once all Workflows using the old code have completed, deprecate the patch: + +```typescript +import { deprecatePatch } from '@temporalio/workflow'; + +export async function shippingConfirmation(): Promise { + deprecatePatch('changedNotificationType'); + await sendEmail(); + await sleep('1 day'); +} +``` + +The `deprecatePatch()` function records a marker that does not fail replay when Workflow code does not emit it, allowing a transition period. + +#### Step 3: Remove the Patch + +After all Workflows using `deprecatePatch` have completed, remove it entirely: + +```typescript +export async function shippingConfirmation(): Promise { + await sendEmail(); + await sleep('1 day'); +} +``` + +### Query Filters for Versioned Workflows + +Use List Filters to find Workflows by version: + +``` +# Find running Workflows with a specific patch +WorkflowType = "shippingConfirmation" AND ExecutionStatus = "Running" AND TemporalChangeVersion = "changedNotificationType" + +# Find running Workflows without the patch (started before patching) +WorkflowType = "shippingConfirmation" AND ExecutionStatus = "Running" AND TemporalChangeVersion IS NULL +``` + +## Workflow Type Versioning + +An alternative to patching is creating new Workflow functions for incompatible changes: + +```typescript +// Original Workflow +export async function pizzaWorkflow(order: PizzaOrder): Promise { + // Original implementation +} + +// New version with incompatible changes +export async function pizzaWorkflowV2(order: PizzaOrder): Promise { + // Updated implementation +} 
+``` + +Register both Workflows with the Worker: + +```typescript +const worker = await Worker.create({ + workflowsPath: require.resolve('./workflows'), // Use workflowBundle for production + taskQueue: 'pizza-queue', +}); +``` + +Update client code to start new Workflows with the new type: + +```typescript +// Start new executions with V2 +await client.workflow.start(pizzaWorkflowV2, { + workflowId: 'order-123', + taskQueue: 'pizza-queue', + args: [order], +}); +``` + +Use List Filters to check for remaining V1 executions: + +``` +WorkflowType = "pizzaWorkflow" AND ExecutionStatus = "Running" +``` + +After all V1 executions complete, remove the old Workflow function. + +## Worker Versioning + +Worker Versioning allows multiple Worker versions to run simultaneously, routing Workflows to specific versions without code-level patching. Workflows are pinned to the Worker Deployment Version they started on. + +> **Note:** Worker Versioning is currently in Public Preview. The legacy Worker Versioning API (before 2025) will be removed from Temporal Server in March 2026. + +### Key Concepts + +- **Worker Deployment**: A logical name for your application (e.g., "order-service") +- **Worker Deployment Version**: A specific build identified by deployment name + Build ID +- **Workflow Pinning**: Workflows complete on the Worker Deployment Version they started on + +### Configuring Workers for Versioning + +```typescript +import { Worker, NativeConnection } from '@temporalio/worker'; + +const worker = await Worker.create({ + workflowsPath: require.resolve('./workflows'), // Use workflowBundle for production + taskQueue: 'my-queue', + connection: await NativeConnection.connect({ address: 'temporal:7233' }), + workerDeploymentOptions: { + useWorkerVersioning: true, + version: { + deploymentName: 'order-service', + buildId: '1.0.0', // Git hash, semver, build number, etc. 
+ }, + }, +}); +``` + +**Configuration options:** +- `useWorkerVersioning`: Enables Worker Versioning +- `version.deploymentName`: Logical name for your service (consistent across versions) +- `version.buildId`: Unique identifier for this build + +### Deployment Workflow + +1. Deploy new Worker version with a new `buildId` +2. Use the Temporal CLI to set the new version as current: + ```bash + temporal worker deployment set-current-version \ + --deployment-name order-service \ + --build-id 2.0.0 + ``` +3. New Workflows start on the new version +4. Existing Workflows continue on their original version until completion +5. Decommission old Workers once all their Workflows complete + +### When to Use Worker Versioning + +Worker Versioning is best suited for: +- **Short-running Workflows**: Old Workers only need to run briefly during deployment transitions +- **Frequent deployments**: Eliminates the need for code-level patching on every change +- **Blue-green deployments**: Run old and new versions simultaneously with traffic control + +For long-running Workflows, consider combining Worker Versioning with the Patching API, or use Continue-as-New to move Workflows to newer versions. + +## Best Practices + +1. Use descriptive `patchId` names that explain the change +2. Follow the three-step patching process completely before removing patches +3. Use List Filters to verify no running Workflows before removing version support +4. Keep Worker Deployment names consistent across all versions +5. Use unique, traceable Build IDs (git hashes, semver, timestamps) +6. 
Test version transitions with replay tests before deploying From 6137e38346042541bbf28a0171918c08bccd3f76 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Thu, 19 Mar 2026 18:15:14 -0400 Subject: [PATCH 10/42] hotfix: clean up skills.sh instruction (#51) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6ba88db..5f395b5 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ This skill is housed within a [Claude Code plugin](https://github.com/temporalio ### Via `npx skills` - supports all major coding agents -1. `npx skills add https://github.com/temporalio/skill-temporal-developer` +1. `npx skills add temporalio/skill-temporal-developer` 2. Follow prompts ### Via manually cloning the skill repo: From e4afb09941b5032524f7cc81115cadf3bb08980a Mon Sep 17 00:00:00 2001 From: Mason Egger Date: Wed, 25 Mar 2026 14:09:37 -0500 Subject: [PATCH 11/42] Add packaging workflow to release a public version and support Claude.ai uploads The skill works great as a Claude Code plugin where SKILL.md and references are auto-discovered, but users may want to upload it to Claude.ai projects instead. You currently do this by uploading a .zip file in the Claude.ai UI. This adds a GitHub Actions workflow that packages the skill and references into a ZIP on every push to main, and creates a GitHub Release when the version in SKILL.md increases. Users can grab the ZIP from the release and upload it directly to a Claude.ai project without needing to clone the repo. --- .github/workflows/package-skill.yml | 59 +++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 .github/workflows/package-skill.yml diff --git a/.github/workflows/package-skill.yml b/.github/workflows/package-skill.yml new file mode 100644 index 0000000..69c48f5 --- /dev/null +++ b/.github/workflows/package-skill.yml @@ -0,0 +1,59 @@ +# ABOUTME: GitHub Actions workflow that packages the skill for upload to Claude.ai.
+# ABOUTME: Creates a ZIP artifact on every push to main and a GitHub Release when the version in SKILL.md increases. + +name: Package Skill + +on: + push: + branches: [main] + workflow_dispatch: + +jobs: + package: + runs-on: ubuntu-latest + permissions: + contents: write + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Read version from SKILL.md + id: version + run: | + version=$(grep '^version:' SKILL.md | sed 's/version:[[:space:]]*//') + echo "version=$version" >> "$GITHUB_OUTPUT" + echo "tag=v$version" >> "$GITHUB_OUTPUT" + + - name: Check if tag exists + id: tag_check + run: | + if git rev-parse "refs/tags/${{ steps.version.outputs.tag }}" >/dev/null 2>&1; then + echo "exists=true" >> "$GITHUB_OUTPUT" + else + echo "exists=false" >> "$GITHUB_OUTPUT" + fi + + - name: Package skill + run: | + zip -r temporal-developer-skill.zip \ + SKILL.md \ + references/ \ + -x '*.DS_Store' + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: temporal-developer-skill + path: temporal-developer-skill.zip + + - name: Create release + if: steps.tag_check.outputs.exists == 'false' + uses: softprops/action-gh-release@v2 + with: + tag_name: ${{ steps.version.outputs.tag }} + name: ${{ steps.version.outputs.tag }} + files: temporal-developer-skill.zip + generate_release_notes: true From 68ebe14ff540a5b508aed897d0fdc2dc401cdb1f Mon Sep 17 00:00:00 2001 From: Jackson Lo <4205685+jacksonlo@users.noreply.github.com> Date: Tue, 31 Mar 2026 14:20:51 -0400 Subject: [PATCH 12/42] Fix typos and broken references across skill docs (#56) - Fix missing trailing pipe in error-reference.md table header - Fix wrong reference path in go.md (python -> go determinism-protection) - Add missing .md extension to testing reference in go/determinism-protection.md - Fix typos: "Activty" -> "Activity", "accomplised" -> "accomplished", "discourged" -> "discouraged" --- references/core/error-reference.md | 2 +- 
references/core/patterns.md | 4 ++-- references/core/troubleshooting.md | 2 +- references/go/determinism-protection.md | 2 +- references/go/go.md | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/references/core/error-reference.md b/references/core/error-reference.md index a0f905b..74570ae 100644 --- a/references/core/error-reference.md +++ b/references/core/error-reference.md @@ -1,6 +1,6 @@ # Common Error Types Reference -| Error Type | Error identifier (if any) | Where to Find | What Happened | Recovery | Link to additional info (if any) +| Error Type | Error identifier (if any) | Where to Find | What Happened | Recovery | Link to additional info (if any) | |------------|---------------|---------------|---------------|----------|----------| | **Non-determinism** | TMPRL1100 | `WorkflowTaskFailed` in history | Replay doesn't match history | Analyze error first. **If accidental**: fix code to match history → restart worker. **If intentional v2 change**: terminate → start fresh workflow. | https://github.com/temporalio/rules/blob/main/rules/TMPRL1100.md | | **Deadlock** | TMPRL1101 | `WorkflowTaskFailed` in history, worker logs | Workflow blocked too long (deadlock detected) | Remove blocking operations from workflow code (no I/O, no sleep, no threading locks). Use Temporal primitives instead. | https://github.com/temporalio/rules/blob/main/rules/TMPRL1101.md | diff --git a/references/core/patterns.md b/references/core/patterns.md index 566e6f8..2ab5b72 100644 --- a/references/core/patterns.md +++ b/references/core/patterns.md @@ -253,9 +253,9 @@ To ensure that polling_activity is restarted in a timely manner, we make sure th **Implementation**: -Define an Activty which fails (raises an exception) exactly when polling is not completed. +Define an Activity which fails (raises an exception) exactly when polling is not completed. 
-The polling loop is accomplised via activity retries, by setting the following Retry options: +The polling loop is accomplished via activity retries, by setting the following Retry options: - backoff_coefficient: to 1 - initial_interval: to the polling interval (e.g. 60 seconds) diff --git a/references/core/troubleshooting.md b/references/core/troubleshooting.md index e4ef2cb..952d4e2 100644 --- a/references/core/troubleshooting.md +++ b/references/core/troubleshooting.md @@ -192,7 +192,7 @@ Timeout error? ├─▶ Which timeout? │ │ │ ├─▶ Workflow timeout -│ │ └─▶ Increase timeout or optimize workflow. Better yet, consider removing the workflow timeout, as it is generally discourged unless *necessary* for your use case. +│ │ └─▶ Increase timeout or optimize workflow. Better yet, consider removing the workflow timeout, as it is generally discouraged unless *necessary* for your use case. │ │ │ ├─▶ ScheduleToCloseTimeout │ │ └─▶ Activity taking too long overall (including retries) diff --git a/references/go/determinism-protection.md b/references/go/determinism-protection.md index 4a6f5f4..cc8d8f5 100644 --- a/references/go/determinism-protection.md +++ b/references/go/determinism-protection.md @@ -2,7 +2,7 @@ ## Overview -The Go SDK has no runtime sandbox. Determinism is enforced by **developer convention** and **optional static analysis**. Unlike the Python and TypeScript SDKs, the Go SDK will not intercept or replace non-deterministic calls at runtime. The Go SDK does perform a limited runtime command-ordering check, but catching non-deterministic code before deployment requires the `workflowcheck` tool and testing, in particular replay tests (see `references/go/testing`). +The Go SDK has no runtime sandbox. Determinism is enforced by **developer convention** and **optional static analysis**. Unlike the Python and TypeScript SDKs, the Go SDK will not intercept or replace non-deterministic calls at runtime. 
The Go SDK does perform a limited runtime command-ordering check, but catching non-deterministic code before deployment requires the `workflowcheck` tool and testing, in particular replay tests (see `references/go/testing.md`). ## workflowcheck Static Analysis diff --git a/references/go/go.md b/references/go/go.md index cc87a6a..974ee7c 100644 --- a/references/go/go.md +++ b/references/go/go.md @@ -239,4 +239,4 @@ See `references/go/testing.md` for info on writing tests. - **`references/go/advanced-features.md`** - Schedules, worker tuning, and more - **`references/go/data-handling.md`** - Data converters, payload codecs, encryption - **`references/go/versioning.md`** - Patching API (`workflow.GetVersion`), Worker Versioning -- **`references/python/determinism-protection.md`** - Information on **`workflowcheck`** tool to help statically check for determinism issues. +- **`references/go/determinism-protection.md`** - Information on **`workflowcheck`** tool to help statically check for determinism issues. From b040ae10b4a9ca03fbb432af4bd22561fd43ce64 Mon Sep 17 00:00:00 2001 From: "Trevor J. Yao" <55645157+trevoryao@users.noreply.github.com> Date: Tue, 31 Mar 2026 14:58:57 -0400 Subject: [PATCH 13/42] Fix Python reference bugs: incorrect API name, syntax error, broken cross-ref, misleading comment (#55) --- references/python/ai-patterns.md | 2 +- references/python/determinism.md | 2 +- references/python/testing.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/references/python/ai-patterns.md b/references/python/ai-patterns.md index a07e30a..6a45272 100644 --- a/references/python/ai-patterns.md +++ b/references/python/ai-patterns.md @@ -2,7 +2,7 @@ ## Overview -This document provides Python-specific implementation details for integrating LLMs with Temporal. For conceptual patterns, see `references/core/ai-integration.md`. +This document provides Python-specific implementation details for integrating LLMs with Temporal. 
For conceptual patterns, see `references/core/ai-patterns.md`. ## Pydantic Data Converter Setup diff --git a/references/python/determinism.md b/references/python/determinism.md index 7276360..e925f7c 100644 --- a/references/python/determinism.md +++ b/references/python/determinism.md @@ -23,7 +23,7 @@ Temporal provides durable execution through **History Replay**. When a Worker ne |-----------|------------------| | `datetime.now()` | `workflow.now()` | | `datetime.utcnow()` | `workflow.now()` | -| `random.random()` | `rng = workflow.new_random() ; rng.randint(1, 100)` | +| `random.random()` | `rng = workflow.random() ; rng.randint(1, 100)` | | `uuid.uuid4()` | `workflow.uuid4()` | | `time.time()` | `workflow.now().timestamp()` | diff --git a/references/python/testing.md b/references/python/testing.md index 63a0d14..e4a7823 100644 --- a/references/python/testing.md +++ b/references/python/testing.md @@ -136,7 +136,7 @@ async def test_replay(): # From JSON file await replayer.replay_workflow( - WorkflowHistory.from_json(workflow_id=str(uuid.uuid4()), history_json) + WorkflowHistory.from_json(str(uuid.uuid4()), history_json) ) ``` From 57c08ef74b44378231b624594da1d276ddcf9e15 Mon Sep 17 00:00:00 2001 From: Brian Strauch Date: Tue, 31 Mar 2026 14:16:11 -0700 Subject: [PATCH 14/42] Add `@workflow.init` decorator to python.md Key Concepts (#57) * docs: add @workflow.init decorator to python.md Key Concepts Co-Authored-By: Claude Sonnet 4.6 * Apply suggestion from @brianstrauch * Update references/python/python.md --------- Co-authored-by: Claude Sonnet 4.6 --- references/python/python.md | 1 + 1 file changed, 1 insertion(+) diff --git a/references/python/python.md b/references/python/python.md index 130b1eb..2c56843 100644 --- a/references/python/python.md +++ b/references/python/python.md @@ -98,6 +98,7 @@ if __name__ == "__main__": ### Workflow Definition - Use `@workflow.defn` decorator on class +- Put any state initialization logic in the `__init__` of your workflow 
class to guarantee that it happens before signals/updates arrive. If your state initialization logic requires the workflow parameters, then add the `@workflow.init` decorator and parameters to your `__init__`. - Use `@workflow.run` on the entry point method - Must be async (`async def`) - Use `@workflow.signal`, `@workflow.query`, `@workflow.update` for handlers From 8369c659d060602d553c94ff9a2bb6360c69fa33 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Thu, 2 Apr 2026 10:22:27 -0400 Subject: [PATCH 15/42] Remove ASCII diagram, replace with prose. (#66) --- SKILL.md | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/SKILL.md b/SKILL.md index 1874d20..325709f 100644 --- a/SKILL.md +++ b/SKILL.md @@ -12,31 +12,18 @@ Temporal is a durable execution platform that makes workflows survive failures a ## Core Architecture -``` -┌─────────────────────────────────────────────────────────────────┐ -│ Temporal Cluster │ -│ ┌─────────────────┐ ┌─────────────────┐ ┌────────────────┐ │ -│ │ Event History │ │ Task Queues │ │ Visibility │ │ -│ │ (Durable Log) │ │ (Work Router) │ │ (Search) │ │ -│ └─────────────────┘ └─────────────────┘ └────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ - ▲ - │ Poll / Complete - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ Worker │ -│ ┌─────────────────────────┐ ┌──────────────────────────────┐ │ -│ │ Workflow Definitions │ │ Activity Implementations │ │ -│ │ (Deterministic) │ │ (Non-deterministic OK) │ │ -│ └─────────────────────────┘ └──────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ -``` +The **Temporal Cluster** is the central orchestration backend. It maintains three key subsystems: the **Event History** (a durable log of all workflow state), **Task Queues** (which route work to the right workers), and a **Visibility** store (for searching and listing workflows). 
There are three ways to run a Cluster: + +- **Temporal CLI dev server** — a local, single-process server started with `temporal server start-dev`. Suitable for development and testing only, not production. +- **Self-hosted** — you deploy and manage the Temporal server and its dependencies (e.g., database) in your own infrastructure for production use. +- **Temporal Cloud** — a fully managed production service operated by Temporal. No cluster infrastructure to manage. + +**Workers** are long-running processes that you run and manage. They poll Task Queues for work and execute your code. You might run a single Worker process on one machine during development, or run many Worker processes across a large fleet of machines in production. Each Worker hosts two types of code: + +- **Workflow Definitions** — durable, deterministic functions that orchestrate work. These must not have side effects. +- **Activity Implementations** — non-deterministic operations (API calls, file I/O, etc.) that can fail and be retried. -**Components:** -- **Workflows** - Durable, deterministic functions that orchestrate activities -- **Activities** - Non-deterministic operations (API calls, I/O) that can fail and retry -- **Workers** - Long-running processes that poll task queues and execute code -- **Task Queues** - Named queues connecting clients to workers +Workers communicate with the Cluster via a poll/complete loop: they poll a Task Queue for tasks, execute the corresponding Workflow or Activity code, and report results back. 
## History Replay: Why Determinism Matters From 9b97ceebd07ba362a063baaec36939545190af5d Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Thu, 2 Apr 2026 14:12:24 -0400 Subject: [PATCH 16/42] [fix] Add missing section to TS's observability (#65) --- references/typescript/observability.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/references/typescript/observability.md b/references/typescript/observability.md index 10244d7..211fbc6 100644 --- a/references/typescript/observability.md +++ b/references/typescript/observability.md @@ -100,6 +100,10 @@ Runtime.install({ }); ``` +## Search Attributes (Visibility) + +See the Search Attributes section of `references/typescript/data-handling.md` + ## Best Practices 1. Use `log` from `@temporalio/workflow` for production observability. For temporary print debugging, `console.log()` is fine—it's direct and immediate, whereas `log` goes through sinks which may lose messages on workflow errors From 0c8586b4c232696757ef02b5aed0a9fdc4ff6f32 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Thu, 2 Apr 2026 17:07:33 -0400 Subject: [PATCH 17/42] Add Java SDK support (#42) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add Java SDK reference files (11 files) Create complete Java reference documentation covering: - java.md: Entry point with quick start tutorial, key concepts - patterns.md: 17 patterns (signals, queries, updates, child workflows, saga, cancellation scopes, heartbeating, etc.) 
- determinism.md: Safe alternatives table, forbidden operations - determinism-protection.md: Convention-based enforcement (no sandbox) - error-handling.md: ApplicationFailure, retry/timeout config - gotchas.md: Non-deterministic operations, cancellation, heartbeating - testing.md: TestWorkflowEnvironment, Mockito mocking, replay testing - versioning.md: Workflow.getVersion(), worker versioning - data-handling.md: Jackson, PayloadConverter, encryption, search attributes - observability.md: SLF4J logging, Micrometer metrics - advanced-features.md: Schedules, async completion, worker tuning Co-Authored-By: Claude Opus 4.6 (1M context) * Fix Java alignment issues from self-review - Reduce gotchas.md Non-Deterministic Operations from ~94 lines to ~12 (reference determinism.md instead of duplicating) - Remove Workflow Failure Exception Types duplication from error-handling.md (keep only in advanced-features.md) - Expand versioning.md Worker Versioning with Key Concepts, PINNED vs AUTO_UPGRADE, Deployment Strategies subsections - Fix section names to match Python reference style: Activity Heartbeat Details, Handling Activity Errors, Retry Policy Configuration, Workflow Test Environment, Mocking Activities, Workflow Replay Testing - Reduce data-handling.md Payload Encryption verbosity - Reduce observability.md Logger Customization verbosity - Reduce testing.md to single approach per section - Rename determinism.md "Convention-Based Enforcement" to "SDK Protection" - Fix handler guidance in patterns.md to match Python Co-Authored-By: Claude Opus 4.6 (1M context) * Fix correctness issues in Java reference files - patterns.md: Fix Queries section — ActivityStub → typed interface (Workflow.newActivityStub returns the typed interface, not ActivityStub) - data-handling.md: Add missing ProtobufPayloadConverter to default converter chain (4th of 5 converters) Co-Authored-By: Claude Opus 4.6 (1M context) * Add Java to SKILL.md and core/determinism.md - SKILL.md: Add "Temporal Java" 
trigger phrase, update Overview to list Java, add Java entry to Getting Started references - core/determinism.md: Add Java entry to SDK Protection Mechanisms (no sandbox, convention-based, NonDeterministicException at replay) Co-Authored-By: Claude Opus 4.6 (1M context) * Apply manual editorial fixes to Java references - java.md: Remove "Understanding Replay" section (covered by Overview), simplify File Organization note (no sandbox rationale) - gotchas.md: Move Heartbeating before Cancellation, make Wrong Retry Classification brief with reference (not inline examples) - error-handling.md: Remove editorializing from Workflow Failure note - determinism-protection.md: Remove cross-language comparison paragraph (state Java's approach on its own terms) Co-Authored-By: Claude Opus 4.6 (1M context) * Add temporal-workflowcheck static analysis to Java determinism docs - determinism-protection.md: Add "Static Analysis with temporal-workflowcheck" section with Gradle/Maven setup, manual run, and suppression instructions. Beta warning included. - determinism.md: Update overview and SDK Protection to reference workflowcheck - core/determinism.md: Update Java entry in SDK Protection Mechanisms Co-Authored-By: Claude Opus 4.6 (1M context) * Integrate feedback from Go PR into Java patterns - Updates: Add validator note — validators must not mutate state or block (matches note added to Python, TypeScript, Go, and core) - Saga Pattern: Use Workflow.newDetachedCancellationScope() for compensations so they execute even if the workflow is cancelled (mirrors Go's workflow.NewDisconnectedContext pattern) Co-Authored-By: Claude Sonnet 4.6 (1M context) * docs: add @WorkflowInit description to java.md Key Concepts Co-Authored-By: Claude Sonnet 4.6 * mark java as supported * Apply suggestions from code review Co-authored-by: Brian Strauch * strongly recommend java 21+ * Softened stance on static checker and replay testing. 
* address python/typescript sandboxing comment --------- Co-authored-by: Claude Opus 4.6 (1M context) Co-authored-by: Brian Strauch Co-authored-by: Brian Strauch --- README.md | 2 +- SKILL.md | 5 +- references/core/determinism.md | 3 +- references/java/advanced-features.md | 167 +++++++ references/java/data-handling.md | 288 ++++++++++++ references/java/determinism-protection.md | 83 ++++ references/java/determinism.md | 55 +++ references/java/error-handling.md | 188 ++++++++ references/java/gotchas.md | 177 ++++++++ references/java/java.md | 249 +++++++++++ references/java/observability.md | 134 ++++++ references/java/patterns.md | 509 ++++++++++++++++++++++ references/java/testing.md | 184 ++++++++ references/java/versioning.md | 281 ++++++++++++ 14 files changed, 2321 insertions(+), 4 deletions(-) create mode 100644 references/java/advanced-features.md create mode 100644 references/java/data-handling.md create mode 100644 references/java/determinism-protection.md create mode 100644 references/java/determinism.md create mode 100644 references/java/error-handling.md create mode 100644 references/java/gotchas.md create mode 100644 references/java/java.md create mode 100644 references/java/observability.md create mode 100644 references/java/patterns.md create mode 100644 references/java/testing.md create mode 100644 references/java/versioning.md diff --git a/README.md b/README.md index 5f395b5..124b367 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Appropriately adjust the installation directory based on your coding agent. 
- [x] Python ✅ - [x] TypeScript ✅ - [x] Go ✅ -- [ ] Java 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/42)) +- [x] Java ✅ - [ ] .NET 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/39)) - [ ] Ruby 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/41)) - [ ] PHP 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/40)) diff --git a/SKILL.md b/SKILL.md index 325709f..38c2185 100644 --- a/SKILL.md +++ b/SKILL.md @@ -1,6 +1,6 @@ --- name: temporal-developer -description: This skill should be used when the user asks to "create a Temporal workflow", "write a Temporal activity", "debug stuck workflow", "fix non-determinism error", "Temporal Python", "Temporal TypeScript", "Temporal Go", "Temporal Golang", "workflow replay", "activity timeout", "signal workflow", "query workflow", "worker not starting", "activity keeps retrying", "Temporal heartbeat", "continue-as-new", "child workflow", "saga pattern", "workflow versioning", "durable execution", "reliable distributed systems", or mentions Temporal SDK development. +description: This skill should be used when the user asks to "create a Temporal workflow", "write a Temporal activity", "debug stuck workflow", "fix non-determinism error", "Temporal Python", "Temporal TypeScript", "Temporal Go", "Temporal Golang", "Temporal Java", "workflow replay", "activity timeout", "signal workflow", "query workflow", "worker not starting", "activity keeps retrying", "Temporal heartbeat", "continue-as-new", "child workflow", "saga pattern", "workflow versioning", "durable execution", "reliable distributed systems", or mentions Temporal SDK development. version: 0.1.0 --- @@ -8,7 +8,7 @@ version: 0.1.0 ## Overview -Temporal is a durable execution platform that makes workflows survive failures automatically. This skill provides guidance for building Temporal applications in Python, TypeScript, and Go. 
+Temporal is a durable execution platform that makes workflows survive failures automatically. This skill provides guidance for building Temporal applications in Python, TypeScript, Go, and Java. ## Core Architecture @@ -79,6 +79,7 @@ Once you've downloaded the file, extract the downloaded archive and add the temp 1. First, read the getting started guide for the language you are working in: - Python -> read `references/python/python.md` - TypeScript -> read `references/typescript/typescript.md` + - Java -> read `references/java/java.md` - Go -> read `references/go/go.md` 2. Second, read appropriate `core` and language-specific references for the task at hand. diff --git a/references/core/determinism.md b/references/core/determinism.md index af824d2..16f04db 100644 --- a/references/core/determinism.md +++ b/references/core/determinism.md @@ -76,10 +76,11 @@ In Temporal, activities are the primary mechanism for making non-deterministic c For a few simple cases, like timestamps, random values, UUIDs, etc. the Temporal SDK in your language may provide durable variants that are simple to use. See `references/{your_language}/determinism.md` for the language you are working in for more info. ## SDK Protection Mechanisms -Each Temporal SDK language provides a protection mechanism to make it easier to catch non-determinism errors earlier in development: +Each Temporal SDK language provides a different level of protection against non-determinism: - Python: The Python SDK runs workflows in a sandbox that intercepts and aborts non-deterministic calls early at runtime. - TypeScript: The TypeScript SDK runs workflows in an isolated V8 sandbox, intercepting many common sources of non-determinism and replacing them automatically with deterministic variants. +- Java: The Java SDK has no sandbox. 
Determinism is enforced by developer conventions — the SDK provides `Workflow.*` APIs as safe alternatives (e.g., `Workflow.sleep()` instead of `Thread.sleep()`), and non-determinism is only detected at replay time via `NonDeterministicException`. A static analysis tool (`temporal-workflowcheck`, beta) can catch violations at build time. Cooperative threading under a global lock eliminates the need for synchronization. - Go: The Go SDK has no runtime sandbox. Therefore, non-determinism bugs will never be immediately appararent, and are usually only observable during replay. The optional `workflowcheck` static analysis tool can be used to check for many sources of non-determinism at compile time. Regardless of which SDK you are using, it is your responsibility to ensure that workflow code does not contain sources of non-determinism. Use SDK-specific tools as well as replay tests for doing so. diff --git a/references/java/advanced-features.md b/references/java/advanced-features.md new file mode 100644 index 0000000..e897bb1 --- /dev/null +++ b/references/java/advanced-features.md @@ -0,0 +1,167 @@ +# Java SDK Advanced Features + +## Schedules + +Create recurring workflow executions. 
+ +```java +import io.temporal.client.schedules.*; + +ScheduleClient scheduleClient = ScheduleClient.newInstance(service); + +// Create a schedule +String scheduleId = "daily-report"; +ScheduleHandle handle = scheduleClient.createSchedule( + scheduleId, + Schedule.newBuilder() + .setAction( + ScheduleActionStartWorkflow.newBuilder() + .setWorkflowType(DailyReportWorkflow.class) + .setOptions( + WorkflowOptions.newBuilder() + .setWorkflowId("daily-report") + .setTaskQueue("reports") + .build() + ) + .build() + ) + .setSpec( + ScheduleSpec.newBuilder() + .setIntervals( + List.of(new ScheduleIntervalSpec(Duration.ofDays(1))) + ) + .build() + ) + .build(), + ScheduleOptions.newBuilder().build() +); + +// Manage schedules +ScheduleHandle scheduleHandle = scheduleClient.getHandle(scheduleId); +scheduleHandle.pause("Maintenance window"); +scheduleHandle.unpause(); +scheduleHandle.trigger(); // Run immediately +scheduleHandle.delete(); +``` + +## Async Activity Completion + +For activities that complete asynchronously (e.g., human tasks, external callbacks). +If you configure a heartbeat timeout on this activity, the external completer is responsible for sending heartbeats via the async handle. + +**Note:** If the external system can reliably Signal back with the result and doesn't need to Heartbeat or receive Cancellation, consider using **signals** instead. 
+ +```java +public class ApprovalActivitiesImpl implements ApprovalActivities { + @Override + public String requestApproval(String requestId) { + ActivityExecutionContext ctx = Activity.getExecutionContext(); + + // Get task token for async completion + byte[] taskToken = ctx.getTaskToken(); + + // Store task token for later completion (e.g., in database) + storeTaskToken(requestId, taskToken); + + // Mark this activity as waiting for external completion + ctx.doNotCompleteOnReturn(); + + return null; // Return value is ignored + } +} + +// Later, complete the activity from another process +public void completeApproval(String requestId, boolean approved) { + WorkflowServiceStubs service = WorkflowServiceStubs.newLocalServiceStubs(); + WorkflowClient client = WorkflowClient.newInstance(service); + + ActivityCompletionClient completionClient = client.newActivityCompletionClient(); + + byte[] taskToken = getTaskToken(requestId); + + if (approved) { + completionClient.complete(taskToken, "approved"); + } else { + completionClient.completeExceptionally( + taskToken, + new RuntimeException("Rejected") + ); + } +} +``` + +## Worker Tuning + +Configure worker performance settings. 
+ +```java +WorkerOptions workerOptions = WorkerOptions.newBuilder() + // Max concurrent workflow task executions (default: 200) + .setMaxConcurrentWorkflowTaskExecutionSize(200) + // Max concurrent activity executions (default: 200) + .setMaxConcurrentActivityExecutionSize(200) + // Max concurrent local activity executions (default: 200) + .setMaxConcurrentLocalActivityExecutionSize(200) + // Max workflow task pollers (default: 5) + .setMaxConcurrentWorkflowTaskPollers(5) + // Max activity task pollers (default: 5) + .setMaxConcurrentActivityTaskPollers(5) + .build(); + +WorkerFactory factory = WorkerFactory.newInstance(client); +Worker worker = factory.newWorker("my-queue", workerOptions); +worker.registerWorkflowImplementationTypes(MyWorkflowImpl.class); +worker.registerActivitiesImplementations(new MyActivitiesImpl()); +factory.start(); +``` + +## Workflow Failure Exception Types + +Control which exceptions cause workflow failures vs workflow task failures. + +By default, only `ApplicationFailure` (and its subclasses) fail the workflow execution. All other exceptions fail the **workflow task**, causing the task to retry indefinitely until the code is fixed or the workflow is terminated. + +### Per-Workflow Configuration + +Use `WorkflowImplementationOptions` to specify which exception types should fail the workflow: + +```java +Worker worker = factory.newWorker("my-queue"); +worker.registerWorkflowImplementationTypes( + WorkflowImplementationOptions.newBuilder() + .setFailWorkflowExceptionTypes( + IllegalArgumentException.class, + CustomBusinessException.class + ) + .build(), + MyWorkflowImpl.class +); +``` + +With this configuration, `IllegalArgumentException` and `CustomBusinessException` thrown from the workflow will fail the workflow execution instead of just the workflow task. 
+ +### Worker-Level Configuration + +Apply to all workflows registered on the worker: + +```java +WorkerFactoryOptions factoryOptions = WorkerFactoryOptions.newBuilder() + .setWorkflowHostLocalTaskQueueScheduleToStartTimeout(Duration.ofSeconds(10)) + .build(); +WorkerFactory factory = WorkerFactory.newInstance(client, factoryOptions); + +Worker worker = factory.newWorker("my-queue"); +// Register each workflow type with its own failure exception types +worker.registerWorkflowImplementationTypes( + WorkflowImplementationOptions.newBuilder() + .setFailWorkflowExceptionTypes( + IllegalArgumentException.class, + CustomBusinessException.class + ) + .build(), + MyWorkflowImpl.class, + AnotherWorkflowImpl.class +); +``` + +- **Tip for testing:** Set `setFailWorkflowExceptionTypes(Throwable.class)` so any unhandled exception fails the workflow immediately rather than retrying the workflow task forever. This surfaces bugs faster. diff --git a/references/java/data-handling.md b/references/java/data-handling.md new file mode 100644 index 0000000..2ef1891 --- /dev/null +++ b/references/java/data-handling.md @@ -0,0 +1,288 @@ +# Java SDK Data Handling + +## Overview + +The Java SDK uses data converters to serialize/deserialize workflow inputs, outputs, and activity parameters. The `DataConverter` interface controls how values are converted to and from Temporal `Payload` protobufs. + +## Default Data Converter + +`DefaultDataConverter` applies converters in order, using the first that accepts the value: + +1. `NullPayloadConverter` — `null` values +2. `ByteArrayPayloadConverter` — `byte[]` as raw binary +3. `ProtobufJsonPayloadConverter` — Protobuf `Message` instances as JSON +4. `ProtobufPayloadConverter` — Protobuf `Message` instances as binary +5. 
`JacksonJsonPayloadConverter` — Everything else via Jackson `ObjectMapper` + +## Jackson Integration + +Use `JacksonJsonPayloadConverter` with a custom `ObjectMapper` for advanced serialization (e.g., Java 8 time module, custom serializers): + +```java +ObjectMapper mapper = new ObjectMapper() + .registerModule(new JavaTimeModule()) + .disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS); + +DefaultDataConverter converter = DefaultDataConverter.newDefaultInstance() + .withPayloadConverterOverrides( + new JacksonJsonPayloadConverter(mapper) + ); + +WorkflowServiceStubs service = WorkflowServiceStubs.newLocalServiceStubs(); +WorkflowClient client = WorkflowClient.newInstance( + service, + WorkflowClientOptions.newBuilder() + .setDataConverter(converter) + .build() +); +``` + +## Custom Data Converter + +Implement `PayloadConverter` for custom serialization: + +```java +public class MyCustomPayloadConverter implements PayloadConverter { + @Override + public String getEncodingType() { + return "json/my-custom"; + } + + @Override + public Optional toData(Object value) throws DataConverterException { + // Return Optional.empty() if this converter doesn't handle the type + if (!(value instanceof MyCustomType)) { + return Optional.empty(); + } + // Serialize to Payload + byte[] data = serialize(value); + return Optional.of( + Payload.newBuilder() + .putMetadata("encoding", ByteString.copyFromUtf8(getEncodingType())) + .setData(ByteString.copyFrom(data)) + .build() + ); + } + + @Override + public T fromData(Payload content, Class valueClass, Type valueType) + throws DataConverterException { + // Deserialize from Payload + return deserialize(content.getData().toByteArray(), valueClass); + } +} +``` + +Override specific converters in the default chain: + +```java +DefaultDataConverter converter = DefaultDataConverter.newDefaultInstance() + .withPayloadConverterOverrides(new MyCustomPayloadConverter()); +``` + +## Composition of Payload Converters + +`DefaultDataConverter` 
holds a list of `PayloadConverter` instances tried in order. The first converter whose `toData()` returns a non-empty `Optional` wins. When using `withPayloadConverterOverrides()`, converters with matching encoding types replace existing ones. + +```java +DefaultDataConverter converter = DefaultDataConverter.newDefaultInstance() + .withPayloadConverterOverrides( + new MyCustomPayloadConverter(), // encoding: "json/my-custom" + new JacksonJsonPayloadConverter(mapper) // replaces default Jackson converter + ); +``` + +## Protobuf Support + +Protobuf messages are handled by `ProtobufJsonPayloadConverter` (enabled by default). It serializes `com.google.protobuf.Message` instances as JSON for human readability in the Temporal UI. + +```java +// Protobuf messages work out of the box as workflow/activity params +@WorkflowInterface +public interface MyWorkflow { + @WorkflowMethod + MyProtoResult run(MyProtoInput input); +} +``` + +For binary protobuf encoding instead of JSON, use `ProtobufPayloadConverter`: + +```java +DefaultDataConverter converter = DefaultDataConverter.newDefaultInstance() + .withPayloadConverterOverrides(new ProtobufPayloadConverter()); +``` + +## Payload Encryption + +Use `PayloadCodec` with `CodecDataConverter` to encrypt/compress payloads: + +```java +public class EncryptionCodec implements PayloadCodec { + private final SecretKey key; + + public EncryptionCodec(SecretKey key) { + this.key = key; + } + + @Override + public List encode(List payloads) { + return payloads.stream().map(payload -> { + // Encrypt payload.toByteArray() using your chosen algorithm (e.g., AES/GCM) + byte[] encrypted = encryptBytes(payload.toByteArray(), key); + return Payload.newBuilder() + .putMetadata("encoding", ByteString.copyFromUtf8("binary/encrypted")) + .setData(ByteString.copyFrom(encrypted)) + .build(); + }).collect(Collectors.toList()); + } + + @Override + public List decode(List payloads) { + return payloads.stream().map(payload -> { + String encoding = 
payload.getMetadataOrDefault( + "encoding", ByteString.EMPTY).toStringUtf8(); + if (!"binary/encrypted".equals(encoding)) return payload; + // Decrypt and reconstruct the original Payload + byte[] decrypted = decryptBytes(payload.getData().toByteArray(), key); + return Payload.parseFrom(decrypted); + }).collect(Collectors.toList()); + } +} +``` + +Apply the codec to the client: + +```java +CodecDataConverter codecDataConverter = new CodecDataConverter( + DefaultDataConverter.newDefaultInstance(), + Collections.singletonList(new EncryptionCodec(secretKey)) +); + +WorkflowClient client = WorkflowClient.newInstance( + service, + WorkflowClientOptions.newBuilder() + .setDataConverter(codecDataConverter) + .build() +); +``` + +## Search Attributes + +Custom searchable fields for workflow visibility. + +```java +import io.temporal.common.SearchAttributeKey; +import io.temporal.common.SearchAttributes; + +// Define typed search attribute keys +static final SearchAttributeKey ORDER_ID = + SearchAttributeKey.forKeyword("OrderId"); +static final SearchAttributeKey ORDER_STATUS = + SearchAttributeKey.forKeyword("OrderStatus"); +static final SearchAttributeKey ORDER_TOTAL = + SearchAttributeKey.forDouble("OrderTotal"); +static final SearchAttributeKey CREATED_AT = + SearchAttributeKey.forOffsetDateTime("CreatedAt"); + +// Set at workflow start +WorkflowOptions options = WorkflowOptions.newBuilder() + .setWorkflowId("order-" + orderId) + .setTaskQueue("orders") + .setTypedSearchAttributes( + SearchAttributes.newBuilder() + .set(ORDER_ID, orderId) + .set(ORDER_STATUS, "pending") + .set(ORDER_TOTAL, 99.99) + .set(CREATED_AT, OffsetDateTime.now()) + .build() + ) + .build(); +``` + +Upsert during workflow execution: + +```java +@WorkflowInterface +public interface OrderWorkflow { + @WorkflowMethod + String run(Order order); +} + +public class OrderWorkflowImpl implements OrderWorkflow { + static final SearchAttributeKey ORDER_STATUS = + SearchAttributeKey.forKeyword("OrderStatus"); 
+ + @Override + public String run(Order order) { + // ... process order ... + + Workflow.upsertTypedSearchAttributes( + ORDER_STATUS.valueSet("completed") + ); + return "done"; + } +} +``` + +### Querying Workflows by Search Attributes + +```java +ListWorkflowExecutionsRequest request = ListWorkflowExecutionsRequest.newBuilder() + .setNamespace("default") + .setQuery("OrderStatus = 'processing' OR OrderStatus = 'pending'") + .build(); +``` + +## Workflow Memo + +Store arbitrary metadata with workflows (not searchable). + +```java +// Set memo at workflow start +WorkflowOptions options = WorkflowOptions.newBuilder() + .setWorkflowId("order-" + orderId) + .setTaskQueue("orders") + .setMemo(Map.of( + "customer_name", order.getCustomerName(), + "notes", "Priority customer" + )) + .build(); +``` + +```java +// Read memo from workflow +@Override +public String run(Order order) { + String notes = Workflow.getMemo("notes", String.class); + // ... +} +``` + +## Deterministic APIs for Values + +Use these APIs within workflows for deterministic values: + +```java +@Override +public String run() { + // Deterministic UUID (same on replay) + String uniqueId = Workflow.randomUUID().toString(); + + // Deterministic random (same on replay) + Random rng = Workflow.newRandom(); + int value = rng.nextInt(100); + + // Deterministic current time (same on replay) + long now = Workflow.currentTimeMillis(); + + return uniqueId; +} +``` + +## Best Practices + +1. Use Jackson `ObjectMapper` customization for complex serialization needs +2. Keep payloads small — see `references/core/gotchas.md` for limits +3. Encrypt sensitive data with `PayloadCodec` and `CodecDataConverter` +4. Use POJOs or Protobuf messages for workflow/activity parameters +5. 
Use `Workflow.randomUUID()`, `Workflow.newRandom()`, and `Workflow.currentTimeMillis()` for deterministic values diff --git a/references/java/determinism-protection.md b/references/java/determinism-protection.md new file mode 100644 index 0000000..78c4446 --- /dev/null +++ b/references/java/determinism-protection.md @@ -0,0 +1,83 @@ +# Java Determinism Protection + +## Overview + +The Java SDK has **no sandbox** (only Python and TypeScript have sandboxing). Java relies on developer conventions and runtime replay detection to enforce determinism. A static analysis tool (`temporal-workflowcheck`) is available in beta. + +## Forbidden Operations + +```java +// BAD: Non-deterministic operations in workflow code +Thread.sleep(1000); +UUID id = UUID.randomUUID(); +double val = Math.random(); +long now = System.currentTimeMillis(); +new Thread(() -> doWork()).start(); +CompletableFuture.supplyAsync(() -> compute()); + +// GOOD: Deterministic Workflow.* alternatives +Workflow.sleep(Duration.ofSeconds(1)); +String id = Workflow.randomUUID().toString(); +int val = Workflow.newRandom().nextInt(); +long now = Workflow.currentTimeMillis(); +Promise promise = Async.procedure(() -> doWork()); +CompletablePromise promise = Workflow.newPromise(); +``` + +## Static Analysis with `temporal-workflowcheck` + +**Warning:** This tool is in beta. + +`temporal-workflowcheck` scans compiled bytecode to detect non-deterministic operations in workflow code. It catches threading, I/O, randomization, system time access, and non-final static field access — including transitive violations through call chains. + +### Setup (Gradle) + +Add the dependency as a compile-only check: + +```groovy +dependencies { + implementation 'io.temporal:temporal-sdk:1.+' + compileOnly 'io.temporal:temporal-workflowcheck:1.+' +} +``` + +See the [Gradle sample](https://github.com/temporalio/sdk-java/tree/master/temporal-workflowcheck/samples/gradle) for full task configuration. 
+ +### Setup (Maven) + +See the [Maven sample](https://github.com/temporalio/sdk-java/tree/master/temporal-workflowcheck/samples/maven) for POM configuration. + +### Running Manually + +Download the `-all.jar` from Maven Central (`io.temporal:temporal-workflowcheck`) and run: + +```bash +java -jar temporal-workflowcheck--all.jar check +``` + +### Suppressing False Positives + +Use the `@WorkflowCheck.SuppressWarnings` annotation on methods: + +```java +@WorkflowCheck.SuppressWarnings(invalidMembers = "currentTimeMillis") +public long getCurrentMillis() { + return System.currentTimeMillis(); +} +``` + +Or use a `.properties` configuration file with `--config ` for third-party library false positives. + +## Convention-Based Enforcement + +Java workflow code runs in a cooperative threading model where only one workflow thread executes at a time under a global lock. The SDK does not intercept or block non-deterministic calls. Instead, non-determinism is detected at **replay time**: if replayed code produces results that differ from the recorded history, the SDK throws a `NonDeterministicException`. + +Use both `temporal-workflowcheck` (static, pre-deploy) and `WorkflowReplayer` (replay testing) to catch non-determinism before production. + +## Best Practices + +1. Run `temporal-workflowcheck` in CI to catch non-deterministic code statically +2. Always use `Workflow.*` APIs instead of standard Java equivalents for time, randomness, UUIDs, sleeping, and threading +3. Test all workflow code changes with `WorkflowReplayer` against recorded histories +4. Keep workflows focused on orchestration logic; move all I/O and side effects into activities +5. 
Avoid mutable static state shared across workflow instances diff --git a/references/java/determinism.md b/references/java/determinism.md new file mode 100644 index 0000000..1981d00 --- /dev/null +++ b/references/java/determinism.md @@ -0,0 +1,55 @@ +# Java SDK Determinism + +## Overview + +The Java SDK has **no sandbox** (only Python and TypeScript have sandboxing). The Java SDK relies on developer conventions to enforce determinism. The SDK provides `Workflow.*` APIs as safe replacements for common non-deterministic operations. A static analysis tool (`temporal-workflowcheck`, beta) can catch violations at build time — see `references/java/determinism-protection.md`. + +## Why Determinism Matters: History Replay + +Temporal provides durable execution through **History Replay**. When a Worker needs to restore workflow state (after a crash, cache eviction, or to continue after a long timer), it re-executes the workflow code from the beginning, which requires the workflow code to be **deterministic**. + +## SDK Protection + +Java workflow code runs in a cooperative threading model where only one workflow thread executes at a time under a global lock. The SDK does not intercept or block non-deterministic calls at runtime. If you call a forbidden operation, it will silently succeed during the initial execution but cause a `NonDeterministicException` when the workflow is replayed. + +`temporal-workflowcheck` (static analysis, beta) and `WorkflowReplayer` (replay testing) can help uncover some violations, but they are not exhaustive — careful code review and adherence to the rules below remain essential. 
+ +## Forbidden Operations + +- `Thread.sleep()` — blocks the real thread, bypasses Temporal timers +- `new Thread()` or thread pools — breaks the cooperative threading model +- `synchronized` blocks and explicit locks — can deadlock with the workflow executor +- `UUID.randomUUID()` — non-deterministic across replays +- `Math.random()` or `new Random()` — non-deterministic across replays +- `System.currentTimeMillis()` or `Instant.now()` — non-deterministic across replays +- Direct I/O (network, filesystem, database) — side effects must run in activities +- Mutable global/static state — shared state breaks isolation between workflow instances +- `CompletableFuture` — bypasses the workflow scheduler; use `Promise` instead + +## Safe Builtin Alternatives + +| Forbidden | Safe Alternative | +|-----------|------------------| +| `Thread.sleep(millis)` | `Workflow.sleep(Duration.ofMillis(millis))` | +| `UUID.randomUUID()` | `Workflow.randomUUID()` | +| `Math.random()` | `Workflow.newRandom().nextInt()` | +| `System.currentTimeMillis()` | `Workflow.currentTimeMillis()` | +| `new Thread(runnable)` | `Async.function(func)` / `Async.procedure(proc)` | +| `CompletableFuture` | `Promise` / `CompletablePromise` | +| `BlockingQueue` | `WorkflowQueue` | +| `Future` | `Promise` | + +## Testing Replay Compatibility + +Use the `WorkflowReplayer` class to verify your code changes are compatible with existing histories. See the Workflow Replay Testing section of `references/java/testing.md`. + +## Best Practices + +1. Use `Workflow.currentTimeMillis()` for all time operations +2. Use `Workflow.newRandom()` for random values +3. Use `Workflow.randomUUID()` for unique identifiers +4. Use `Async.function()` / `Async.procedure()` instead of raw threads +5. Use `Promise` and `CompletablePromise` instead of `CompletableFuture` +6. Test with `WorkflowReplayer` to catch non-determinism +7. Keep workflows focused on orchestration, delegate I/O to activities +8. 
Use `Workflow.getLogger()` for replay-safe logging diff --git a/references/java/error-handling.md b/references/java/error-handling.md new file mode 100644 index 0000000..97d4cea --- /dev/null +++ b/references/java/error-handling.md @@ -0,0 +1,188 @@ +# Java SDK Error Handling + +## Overview + +The Java SDK uses `ApplicationFailure` for application-specific errors and `RetryOptions` for retry configuration. Generally, the following information about errors and retryability applies across activities, child workflows and Nexus operations. + +## Application Errors + +```java +import io.temporal.activity.ActivityInterface; +import io.temporal.activity.ActivityMethod; +import io.temporal.failure.ApplicationFailure; + +@ActivityInterface +public interface OrderActivities { + @ActivityMethod + void validateOrder(Order order); +} + +public class OrderActivitiesImpl implements OrderActivities { + @Override + public void validateOrder(Order order) { + if (!order.isValid()) { + throw ApplicationFailure.newFailure( + "Invalid order", + "ValidationError" + ); + } + } +} +``` + +Any exception that is not an `ApplicationFailure` is automatically converted to one, with the fully qualified class name as the type. For example, throwing `new NullPointerException("msg")` is equivalent to `ApplicationFailure.newFailure("msg", "java.lang.NullPointerException")`. 
+ +## Non-Retryable Errors + +```java +import io.temporal.failure.ApplicationFailure; + +public class PaymentActivitiesImpl implements PaymentActivities { + @Override + public String chargeCard(String cardNumber, double amount) { + if (!isValidCard(cardNumber)) { + throw ApplicationFailure.newNonRetryableFailure( + "Permanent failure - invalid credit card", + "PaymentError" + ); + } + return processPayment(cardNumber, amount); + } +} +``` + +You can also mark error types as non-retryable via `RetryOptions.setDoNotRetry()`: + +```java +RetryOptions retryOptions = RetryOptions.newBuilder() + .setDoNotRetry( + CreditCardProcessingException.class.getName(), + "ValidationError" + ) + .build(); +``` + +Use `newNonRetryableFailure()` when the **activity implementer** knows the error is permanent. Use `setDoNotRetry()` when the **caller** wants to control retryability. + +## Activity Errors + +Activity failures are always wrapped in `ActivityFailure`. The original exception becomes the `cause`: + +- `ActivityFailure` → `ApplicationFailure` (application error) +- `ActivityFailure` → `TimeoutFailure` (timeout) +- `ActivityFailure` → `CanceledFailure` (cancellation) + +## Handling Activity Errors + +```java +import io.temporal.failure.ActivityFailure; +import io.temporal.failure.ApplicationFailure; +import io.temporal.failure.TimeoutFailure; +import io.temporal.workflow.Workflow; + +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run() { + try { + return activities.riskyOperation(); + } catch (ActivityFailure af) { + if (af.getCause() instanceof ApplicationFailure) { + ApplicationFailure appFailure = (ApplicationFailure) af.getCause(); + String type = appFailure.getType(); + // Handle based on error type + } else if (af.getCause() instanceof TimeoutFailure) { + // Handle timeout + } + throw ApplicationFailure.newFailure( + "Workflow failed due to activity error", + "WorkflowError" + ); + } + } +} +``` + +## Retry Policy Configuration + +```java 
+import io.temporal.activity.ActivityOptions; +import io.temporal.common.RetryOptions; +import io.temporal.workflow.Workflow; + +import java.time.Duration; + +public class MyWorkflowImpl implements MyWorkflow { + + private final MyActivities activities = Workflow.newActivityStub( + MyActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(10)) + .setRetryOptions(RetryOptions.newBuilder() + .setMaximumInterval(Duration.ofMinutes(1)) + .setMaximumAttempts(5) + .setDoNotRetry("ValidationError", "PaymentError") + .build()) + .build() + ); + + @Override + public String run() { + return activities.myActivity(); + } +} +``` + +Only set options such as `maximumInterval`, `maximumAttempts` etc. if you have a domain-specific reason to. If not, prefer to leave them at their defaults. + +## Timeout Configuration + +```java +ActivityOptions options = ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(5)) // Single attempt + .setScheduleToCloseTimeout(Duration.ofMinutes(30)) // Including retries + .setHeartbeatTimeout(Duration.ofMinutes(2)) // Between heartbeats + .build(); +``` + +## Workflow Failure + +**IMPORTANT:** Only `ApplicationFailure` causes a workflow to fail. Any other exception thrown from workflow code causes the workflow task to retry indefinitely, not the workflow itself. + +```java +import io.temporal.failure.ApplicationFailure; + +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run() { + if (someCondition) { + throw ApplicationFailure.newFailure( + "Cannot process order", + "BusinessError" + ); + } + return "success"; + } +} +``` + +To allow other exception types to fail the workflow instead of causing infinite task retries, see `references/java/advanced-features.md` for configuring `setFailWorkflowExceptionTypes()`. 
+ +Use `Workflow.wrap()` to rethrow checked exceptions as unchecked: + +```java +try { + return someCall(); +} catch (Exception e) { + throw Workflow.wrap(e); +} +``` + +## Best Practices + +1. Use specific error types for different failure modes +2. Mark permanent failures as non-retryable +3. Configure appropriate retry policies +4. Log errors before rethrowing +5. Catch `ActivityFailure` (not `ApplicationFailure`) for activity failures in workflows +6. Design code to be idempotent for safe retries (see more at `references/core/patterns.md`) +7. Use `ApplicationFailure.newFailure()` to fail workflows — other exceptions cause infinite task retries diff --git a/references/java/gotchas.md b/references/java/gotchas.md new file mode 100644 index 0000000..567fb64 --- /dev/null +++ b/references/java/gotchas.md @@ -0,0 +1,177 @@ +# Java Gotchas + +Java-specific mistakes and anti-patterns. See also [Common Gotchas](../core/gotchas.md) for language-agnostic concepts. + +## Non-Deterministic Operations + +**Critical: The Java SDK has NO sandbox.** Unlike Python (which uses a sandbox) or TypeScript (which uses V8 isolation), the Java SDK relies entirely on developer conventions. Non-deterministic calls silently succeed during initial execution but cause `NonDeterministicException` on replay. + +Forbidden in workflow code — use the Temporal `Workflow.*` equivalents instead: +- `Thread.sleep` → `Workflow.sleep` +- `UUID.randomUUID` → `Workflow.randomUUID` +- `Math.random` → `Workflow.newRandom` +- `System.currentTimeMillis` → `Workflow.currentTimeMillis` +- `new Thread` → `Async.function` +- `synchronized` blocks → unnecessary (workflow code runs under a global lock) + +See `references/java/determinism.md` for the full table of forbidden operations, safe alternatives, and detailed examples. + +## Wrong Retry Classification + +**Example:** Transient network errors should be retried. Authentication errors should not be. 
+See `references/java/error-handling.md` to understand how to classify errors. + +## Heartbeating + +### Forgetting to Heartbeat Long Activities + +```java +// BAD - No heartbeat, can't detect stuck activities +@Override +public void processLargeFile(String path) { + for (String chunk : readChunks(path)) { + process(chunk); // Takes hours, no heartbeat + } +} + +// GOOD - Regular heartbeats with progress +@Override +public void processLargeFile(String path) { + int i = 0; + for (String chunk : readChunks(path)) { + Activity.getExecutionContext().heartbeat("Processing chunk " + i++); + process(chunk); + } +} +``` + +### Heartbeat Timeout Too Short + +```java +// BAD - Heartbeat timeout shorter than processing time +ActivityOptions options = ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(30)) + .setHeartbeatTimeout(Duration.ofSeconds(10)) // Too short! + .build(); + +// GOOD - Heartbeat timeout allows for processing variance +ActivityOptions options = ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(30)) + .setHeartbeatTimeout(Duration.ofMinutes(2)) + .build(); +``` + +Set heartbeat timeout as high as acceptable for your use case — each heartbeat counts as an action. + +## Cancellation + +### Not Handling Workflow Cancellation + +```java +// BAD - Cleanup doesn't run on cancellation +public class BadWorkflow implements MyWorkflow { + @Override + public void run() { + activities.acquireResource(); + activities.doWork(); + activities.releaseResource(); // Never runs if cancelled! 
+ } +} +``` + +```java +// GOOD - Use try/finally with a detached cancellation scope (Workflow.newDetachedCancellationScope) +import io.temporal.workflow.CancellationScope; +import io.temporal.workflow.Workflow; + +public class GoodWorkflow implements MyWorkflow { + @Override + public void run() { + activities.acquireResource(); + try { + activities.doWork(); + } finally { + CancellationScope scope = Workflow.newDetachedCancellationScope( + () -> activities.releaseResource() + ); + scope.run(); + } + } +} +``` + +### Not Handling Activity Cancellation + +Activities must **opt in** to receive cancellation. This requires: +1. **Heartbeating** - Cancellation is delivered via heartbeat +2. **Catching CanceledFailure** - Thrown when heartbeat detects cancellation + +```java +// BAD - Activity ignores cancellation +@Override +public void longActivity() { + doExpensiveWork(); // Runs to completion even if cancelled +} +``` + +```java +// GOOD - Heartbeat and catch cancellation +import io.temporal.activity.Activity; +import io.temporal.failure.CanceledFailure; + +@Override +public void longActivity() { + try { + for (int i = 0; i < items.size(); i++) { + Activity.getExecutionContext().heartbeat(i); + process(items.get(i)); + } + } catch (CanceledFailure e) { + cleanup(); + throw e; + } +} +``` + +## Testing + +### Not Testing Failures + +It is important to make sure workflows work as expected under failure paths in addition to happy paths. Please see `references/java/testing.md` for more info. + +### Not Testing Replay + +Replay tests help you test that you do not have hidden sources of non-determinism in your workflow code, and should be considered in addition to standard testing. This is especially critical in Java since there is no sandbox. Please see `references/java/testing.md` for more info. 
+ +## Timers and Sleep + +### Using Thread.sleep + +```java +// BAD - Thread.sleep is not deterministic during replay +public class BadWorkflow implements MyWorkflow { + @Override + public void run() { + try { + Thread.sleep(60000); // Non-deterministic! + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } +} +``` + +```java +// GOOD - Use Workflow.sleep for deterministic timers +import io.temporal.workflow.Workflow; +import java.time.Duration; + +public class GoodWorkflow implements MyWorkflow { + @Override + public void run() { + Workflow.sleep(Duration.ofSeconds(60)); // Deterministic + } +} +``` + +**Why this matters:** `Thread.sleep` uses the system clock, which differs between original execution and replay. `Workflow.sleep` creates a durable timer in the event history, ensuring consistent behavior during replay. Unlike Python and TypeScript, there is no sandbox to catch this — the call silently succeeds and only fails on replay. diff --git a/references/java/java.md b/references/java/java.md new file mode 100644 index 0000000..e18d723 --- /dev/null +++ b/references/java/java.md @@ -0,0 +1,249 @@ +# Temporal Java SDK Reference + +## Overview + +The Temporal Java SDK (`io.temporal:temporal-sdk`) uses an interface + implementation pattern for both Workflows and Activities. Java 8+ required; Java 21+ strongly recommended for virtual thread support. 
+ +## Quick Start + +**Add Dependencies:** + +Gradle: +```groovy +implementation 'io.temporal:temporal-sdk:1.+' +``` + +Maven: +```xml + + io.temporal + temporal-sdk + [1.0,) + +``` + +**GreetActivities.java** - Activity interface: +```java +package greetingapp; + +import io.temporal.activity.ActivityInterface; +import io.temporal.activity.ActivityMethod; + +@ActivityInterface +public interface GreetActivities { + + @ActivityMethod + String greet(String name); +} +``` + +**GreetActivitiesImpl.java** - Activity implementation: +```java +package greetingapp; + +public class GreetActivitiesImpl implements GreetActivities { + + @Override + public String greet(String name) { + return "Hello, " + name + "!"; + } +} +``` + +**GreetingWorkflow.java** - Workflow interface: +```java +package greetingapp; + +import io.temporal.workflow.WorkflowInterface; +import io.temporal.workflow.WorkflowMethod; + +@WorkflowInterface +public interface GreetingWorkflow { + + @WorkflowMethod + String greet(String name); +} +``` + +**GreetingWorkflowImpl.java** - Workflow implementation: +```java +package greetingapp; + +import io.temporal.activity.ActivityOptions; +import io.temporal.workflow.Workflow; + +import java.time.Duration; + +public class GreetingWorkflowImpl implements GreetingWorkflow { + + private final GreetActivities activities = Workflow.newActivityStub( + GreetActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(30)) + .build() + ); + + @Override + public String greet(String name) { + return activities.greet(name); + } +} +``` + +**GreetingWorker.java** - Worker setup: +```java +package greetingapp; + +import io.temporal.client.WorkflowClient; +import io.temporal.serviceclient.WorkflowServiceStubs; +import io.temporal.worker.Worker; +import io.temporal.worker.WorkerFactory; + +public class GreetingWorker { + + public static void main(String[] args) { + // Create gRPC stubs for local dev server (localhost:7233) + WorkflowServiceStubs 
service = WorkflowServiceStubs.newLocalServiceStubs(); + + // Create client + WorkflowClient client = WorkflowClient.newInstance(service); + + // Create factory and worker + WorkerFactory factory = WorkerFactory.newInstance(client); + Worker worker = factory.newWorker("greeting-queue"); + + // Register workflow and activity implementations + worker.registerWorkflowImplementationTypes(GreetingWorkflowImpl.class); + worker.registerActivitiesImplementations(new GreetActivitiesImpl()); + + // Start polling + factory.start(); + } +} +``` + +**Start the dev server:** Start `temporal server start-dev` in the background. + +**Start the worker:** Run `GreetingWorker.main()` (e.g., `./gradlew run` or `mvn compile exec:java -Dexec.mainClass="greetingapp.GreetingWorker"`). + +**Starter.java** - Start a workflow execution: +```java +package greetingapp; + +import io.temporal.client.WorkflowClient; +import io.temporal.client.WorkflowOptions; +import io.temporal.serviceclient.WorkflowServiceStubs; + +import java.util.UUID; + +public class Starter { + + public static void main(String[] args) { + WorkflowServiceStubs service = WorkflowServiceStubs.newLocalServiceStubs(); + WorkflowClient client = WorkflowClient.newInstance(service); + + GreetingWorkflow workflow = client.newWorkflowStub( + GreetingWorkflow.class, + WorkflowOptions.newBuilder() + .setWorkflowId(UUID.randomUUID().toString()) + .setTaskQueue("greeting-queue") + .build() + ); + + String result = workflow.greet("my name"); + System.out.println("Result: " + result); + } +} +``` + +**Run the workflow:** Run `Starter.main()`. Should output: `Result: Hello, my name!`. + +## Key Concepts + +### Workflow Definition +- Annotate interface with `@WorkflowInterface` +- Put any state initialization logic in the workflow constructor to guarantee that it happens before signals/updates arrive. 
If your state initialization logic requires the workflow parameters, then add the `@WorkflowInit` annotation and parameters to your constructor. +- Annotate entry point method with `@WorkflowMethod` (exactly one per interface) +- Use `@SignalMethod` for signal handlers +- Use `@QueryMethod` for query handlers +- Use `@UpdateMethod` for update handlers +- Implementation class implements the interface + +### Activity Definition +- Annotate interface with `@ActivityInterface` +- Optionally annotate methods with `@ActivityMethod` (for custom names) +- Implementation class can throw any exception +- Call from workflow via `Workflow.newActivityStub()` + +### Worker Setup +- `WorkflowServiceStubs` -- gRPC connection to Temporal Server +- `WorkflowClient` -- client used by worker to communicate with server +- `WorkerFactory` -- creates Worker instances +- `Worker` -- polls a single Task Queue, register workflows and activities on it +- Call `factory.start()` to begin polling + +## File Organization Best Practice + +**Keep Workflow and Activity definitions in separate files.** Separating them is good practice for clarity and maintainability. + +``` +greetingapp/ +├── GreetActivities.java # Activity interface +├── GreetActivitiesImpl.java # Activity implementation +├── GreetingWorkflow.java # Workflow interface +├── GreetingWorkflowImpl.java # Workflow implementation +├── GreetingWorker.java # Worker setup +└── Starter.java # Client code to start workflows +``` + +## Determinism Rules + +The Java SDK has **no sandbox**. The developer is fully responsible for writing deterministic workflow code. All non-deterministic operations must happen in Activities.
+ +**Do not use in workflow code:** +- `Thread` / `new Thread()` -- use `Workflow.newTimer()` or `Async.function()` +- `synchronized` / `Lock` -- workflow code is single-threaded +- `UUID.randomUUID()` -- use `Workflow.randomUUID()` +- `Math.random()` -- use `Workflow.newRandom()` +- `System.currentTimeMillis()` / `Instant.now()` -- use `Workflow.currentTimeMillis()` +- File I/O, network calls, database access -- use Activities +- `Thread.sleep()` -- use `Workflow.sleep()` +- Mutable static fields -- workflow instances must not share state + +**Use Workflow.* APIs instead:** +- `Workflow.sleep()` for timers +- `Workflow.currentTimeMillis()` for current time +- `Workflow.randomUUID()` for UUIDs +- `Workflow.newRandom()` for random numbers +- `Workflow.getLogger()` for replay-safe logging + +See `references/core/determinism.md` for detailed determinism rules. + +## Common Pitfalls + +1. **Non-deterministic code in workflows** - Use `Workflow.*` APIs instead of standard Java APIs; perform I/O in Activities +2. **Forgetting `@WorkflowInterface` or `@ActivityInterface`** - Annotations are required on interfaces for registration +3. **Multiple `@WorkflowMethod` on one interface** - Only one `@WorkflowMethod` is allowed per `@WorkflowInterface` +4. **Using `Thread.sleep()` in workflows** - Use `Workflow.sleep()` for deterministic timers +5. **Forgetting to heartbeat** - Long-running activities need `Activity.getExecutionContext().heartbeat()` +6. **Using `System.out.println()` in workflows** - Use `Workflow.getLogger()` for replay-safe logging +7. **Not registering activities as instances** - `registerActivitiesImplementations()` takes object instances (`new MyActivitiesImpl()`), not classes +8. **Blocking the workflow thread** - Never perform I/O or long computations in workflow code; use Activities +9. 
**Sharing mutable state between workflow instances** - Each workflow execution must be independent + +## Writing Tests + +See `references/java/testing.md` for info on writing tests. + +## Additional Resources + +### Reference Files +- **`references/java/patterns.md`** - Signals, queries, child workflows, saga pattern, etc. +- **`references/java/determinism.md`** - Determinism rules and safe alternatives for Java +- **`references/java/gotchas.md`** - Java-specific mistakes and anti-patterns +- **`references/java/error-handling.md`** - ApplicationFailure, retry policies, non-retryable errors +- **`references/java/observability.md`** - Logging, metrics, tracing, Search Attributes +- **`references/java/testing.md`** - TestWorkflowEnvironment, time-skipping, activity mocking +- **`references/java/advanced-features.md`** - Schedules, worker tuning, and more +- **`references/java/data-handling.md`** - Data converters, Jackson, payload encryption +- **`references/java/versioning.md`** - Patching API, workflow type versioning, Worker Versioning diff --git a/references/java/observability.md b/references/java/observability.md new file mode 100644 index 0000000..d7d9528 --- /dev/null +++ b/references/java/observability.md @@ -0,0 +1,134 @@ +# Java SDK Observability + +## Overview + +The Java SDK provides observability through replay-safe logging, Micrometer-based metrics, and visibility (Search Attributes). 
+ +## Logging + +### Workflow Logging (Replay-Safe) + +Use `Workflow.getLogger()` for replay-safe logging that suppresses duplicate messages during replay: + +```java +public class OrderWorkflowImpl implements OrderWorkflow { + private static final Logger logger = Workflow.getLogger(OrderWorkflowImpl.class); + + @Override + public String run(Order order) { + logger.info("Workflow started for order {}", order.getId()); + + String result = Workflow.newActivityStub(OrderActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(5)) + .build() + ).processOrder(order); + + logger.info("Activity completed with result {}", result); + return result; + } +} +``` + +The workflow logger automatically: +- Suppresses duplicate logs during replay +- Includes workflow context (workflow ID, run ID, etc.) +- Uses SLF4J under the hood + +### Activity Logging + +Use standard SLF4J loggers in activities. Activity context is available via `Activity.getExecutionContext()`: + +```java +public class OrderActivitiesImpl implements OrderActivities { + private static final Logger logger = + LoggerFactory.getLogger(OrderActivitiesImpl.class); + + @Override + public String processOrder(Order order) { + logger.info("Processing order {}", order.getId()); + + // Access activity context for metadata + ActivityExecutionContext ctx = Activity.getExecutionContext(); + logger.info("Activity ID: {}, attempt: {}", + ctx.getInfo().getActivityId(), + ctx.getInfo().getAttempt()); + + // Perform work... + logger.info("Order processed successfully"); + return "completed"; + } +} +``` + +## Customizing the Logger + +The Java SDK uses SLF4J. Configure your preferred backend: + +### Logback (logback.xml) + +```xml + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + + + +``` + +Log4j2 is also supported as an SLF4J backend with equivalent configuration. 
+ +## Metrics + +### Micrometer with Prometheus + +The Java SDK uses Micrometer for metrics collection. Configure with `MicrometerClientStatsReporter`: + +```java +import io.micrometer.prometheus.PrometheusConfig; +import io.micrometer.prometheus.PrometheusMeterRegistry; +import io.temporal.common.reporter.MicrometerClientStatsReporter; +import com.uber.m3.tally.RootScopeBuilder; +import com.uber.m3.tally.Scope; +import com.uber.m3.util.Duration; + +// Set up Prometheus registry +PrometheusMeterRegistry registry = new PrometheusMeterRegistry(PrometheusConfig.DEFAULT); + +// Create the Temporal metrics scope +Scope scope = new RootScopeBuilder() + .reporter(new MicrometerClientStatsReporter(registry)) + .reportEvery(Duration.ofSeconds(10)); + +// Apply to service stubs +WorkflowServiceStubs service = WorkflowServiceStubs.newServiceStubs( + WorkflowServiceStubsOptions.newBuilder() + .setMetricsScope(scope) + .build() +); + +// Expose Prometheus endpoint (e.g., via HTTP server) +// registry.scrape() returns the metrics in Prometheus format +``` + +### Key SDK Metrics + +- `temporal_request` — Client requests to server +- `temporal_workflow_task_execution_latency` — Workflow task processing time +- `temporal_activity_execution_latency` — Activity execution time +- `temporal_workflow_task_replay_latency` — Replay duration + +## Best Practices + +1. Use `Workflow.getLogger()` in workflows, standard SLF4J loggers in activities +2. Do not use `System.out.println()` in workflows — it produces duplicate output on replay +3. Configure Micrometer metrics for production monitoring +4. 
Use Search Attributes for business-level visibility — see `references/java/data-handling.md` diff --git a/references/java/patterns.md b/references/java/patterns.md new file mode 100644 index 0000000..ed2fb37 --- /dev/null +++ b/references/java/patterns.md @@ -0,0 +1,509 @@ +# Java SDK Patterns + +## Signals + +```java +@WorkflowInterface +public interface OrderWorkflow { + @WorkflowMethod + String run(); + + @SignalMethod + void approve(); + + @SignalMethod + void addItem(String item); +} + +public class OrderWorkflowImpl implements OrderWorkflow { + private boolean approved = false; + private final List<String> items = new ArrayList<>(); + + @Override + public void approve() { + this.approved = true; + } + + @Override + public void addItem(String item) { + this.items.add(item); + } + + @Override + public String run() { + Workflow.await(() -> this.approved); + return "Processed " + this.items.size() + " items"; + } +} +``` + +### Dynamic Signal Handlers + +For handling signals with names not known at compile time. Use cases for this pattern are rare — most workflows should use statically defined signal handlers. + +```java +public class DynamicSignalWorkflowImpl implements DynamicSignalWorkflow { + private final Map<String, List<String>> signals = new HashMap<>(); + + @Override + public String run() { + Workflow.registerListener( + (DynamicSignalHandler) (signalName, encodedArgs) -> { + signals.computeIfAbsent(signalName, k -> new ArrayList<>()) + .add(encodedArgs.get(0, String.class)); + }); + // ... workflow logic ... + } +} +``` + +## Queries + +**Important:** Queries must NOT modify workflow state or have side effects.
+ +```java +@WorkflowInterface +public interface StatusWorkflow { + @WorkflowMethod + String run(); + + @QueryMethod + String getStatus(); + + @QueryMethod + int getProgress(); +} + +public class StatusWorkflowImpl implements StatusWorkflow { + private String status = "pending"; + private int progress = 0; + + @Override + public String getStatus() { + return this.status; + } + + @Override + public int getProgress() { + return this.progress; + } + + @Override + public String run() { + MyActivities activities = Workflow.newActivityStub( + MyActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(1)) + .build()); + + this.status = "running"; + for (int i = 0; i < 100; i++) { + this.progress = i; + activities.processItem(i); + } + this.status = "completed"; + return "done"; + } +} +``` + +### Dynamic Query Handlers + +For handling queries with names not known at compile time. Use cases for this pattern are rare — most workflows should use statically defined query handlers. 
+ +```java +Workflow.registerListener( + (DynamicQueryHandler) (queryName, encodedArgs) -> { + if (queryName.equals("getField")) { + String fieldName = encodedArgs.get(0, String.class); + return fields.get(fieldName); + } + return null; + }); +``` + +## Updates + +```java +@WorkflowInterface +public interface OrderWorkflow { + @WorkflowMethod + String run(); + + @UpdateMethod + int addItem(String item); + + @UpdateValidatorMethod(updateName = "addItem") + void validateAddItem(String item); +} + +public class OrderWorkflowImpl implements OrderWorkflow { + private final List items = new ArrayList<>(); + + @Override + public int addItem(String item) { + this.items.add(item); + return this.items.size(); // Returns new count to caller + } + + @Override + public void validateAddItem(String item) { + if (item == null || item.isEmpty()) { + throw new IllegalArgumentException("Item cannot be empty"); + } + if (this.items.size() >= 100) { + throw new IllegalArgumentException("Order is full"); + } + } + + // ... run() ... +} +``` + +**Important:** Validators must NOT mutate workflow state or do anything blocking (no activities, sleeps, or other commands). They are read-only, similar to query handlers. Throw an exception to reject the update; return normally to accept. 
+ +## Child Workflows + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public List run(List orders) { + List results = new ArrayList<>(); + for (Order order : orders) { + ProcessOrderWorkflow child = Workflow.newChildWorkflowStub( + ProcessOrderWorkflow.class, + ChildWorkflowOptions.newBuilder() + .setWorkflowId("order-" + order.getId()) + .build()); + results.add(child.run(order)); + } + return results; + } +} +``` + +## Child Workflow Options + +```java +ChildWorkflowOptions options = ChildWorkflowOptions.newBuilder() + .setWorkflowId("child-workflow-id") + // Control what happens to child when parent closes + .setParentClosePolicy(ParentClosePolicy.PARENT_CLOSE_POLICY_ABANDON) + // Control what happens to child when parent is cancelled + .setCancellationType(ChildWorkflowCancellationType.WAIT_CANCELLATION_COMPLETED) + .setWorkflowExecutionTimeout(Duration.ofMinutes(10)) + .build(); + +ProcessOrderWorkflow child = Workflow.newChildWorkflowStub( + ProcessOrderWorkflow.class, options); +``` + +## Handles to External Workflows + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public void run(String targetWorkflowId) { + // Get handle to external workflow + TargetWorkflow external = Workflow.newExternalWorkflowStub( + TargetWorkflow.class, targetWorkflowId); + + // Signal the external workflow + external.dataReady(dataPayload); + + // Or cancel it using untyped stub + ExternalWorkflowStub untypedExternal = + Workflow.newUntypedExternalWorkflowStub(targetWorkflowId); + untypedExternal.cancel(); + } +} +``` + +## Parallel Execution + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public List run(List items) { + MyActivities activities = Workflow.newActivityStub( + MyActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(5)) + .build()); + + // Execute activities in parallel + List> promises = new ArrayList<>(); + for (String item : items) { + 
promises.add(Async.function(activities::processItem, item)); + } + + // Wait for all to complete + Promise.allOf(promises).get(); + + // Collect results + List results = new ArrayList<>(); + for (Promise promise : promises) { + results.add(promise.get()); + } + return results; + } +} +``` + +## Continue-as-New + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run(WorkflowState state) { + while (true) { + state = processBatch(state); + + if (state.isComplete()) { + return "done"; + } + + // Continue with fresh history before hitting limits + if (Workflow.getInfo().isContinueAsNewSuggested()) { + Workflow.continueAsNew(state); + } + } + } +} +``` + +## Saga Pattern (Compensations) + +**Important:** Compensation activities should be idempotent — they may be retried (as with ALL activities). + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run(Order order) { + MyActivities activities = Workflow.newActivityStub( + MyActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(5)) + .build()); + + List compensations = new ArrayList<>(); + + try { + // Note - we save the compensation BEFORE running the activity, + // because the following could happen: + // 1. reserveInventory starts running + // 2. it does successfully reserve inventory + // 3. but then fails for some other reason (timeout, reporting metrics, etc.) + // 4. in that case, the activity would have failed, but the effect still happened + // So, the compensation needs to handle both reserved and unreserved states. 
+ compensations.add(() -> activities.releaseInventoryIfReserved(order)); + activities.reserveInventory(order); + + compensations.add(() -> activities.refundPaymentIfCharged(order)); + activities.chargePayment(order); + + activities.shipOrder(order); + + return "Order completed"; + + } catch (Exception e) { + Workflow.getLogger(MyWorkflowImpl.class) + .error("Order failed, running compensations", e); + // Use a detached cancellation scope so compensations run even if + // the workflow itself was cancelled. + CancellationScope compensationScope = Workflow.newDetachedCancellationScope(() -> { + Collections.reverse(compensations); + for (Runnable compensate : compensations) { + try { + compensate.run(); + } catch (Exception compErr) { + Workflow.getLogger(MyWorkflowImpl.class) + .error("Compensation failed", compErr); + } + } + }); + compensationScope.run(); + throw Workflow.wrap(e); + } + } +} +``` + +## Cancellation Scopes + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run() { + try { + MyActivities activities = Workflow.newActivityStub( + MyActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofHours(1)) + .build()); + + activities.longRunningActivity(); + return "completed"; + + } catch (CanceledFailure e) { + // Workflow was cancelled - perform cleanup + Workflow.getLogger(MyWorkflowImpl.class) + .info("Workflow cancelled, running cleanup"); + + // Use a detached cancellation scope so cleanup activities still run + CancellationScope cleanupScope = Workflow.newDetachedCancellationScope( + () -> { + MyActivities activities = Workflow.newActivityStub( + MyActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(5)) + .build()); + activities.cleanupActivity(); + }); + cleanupScope.run(); + throw e; // Re-throw to mark workflow as cancelled + } + } +} +``` + +Timeout scope: + +```java +CancellationScope timeoutScope = Workflow.newCancellationScope( + () -> { + //
This scope will be cancelled after 30 minutes + activities.longRunningActivity(); + }); +timeoutScope.run(); +// Cancel after timeout +Workflow.newTimer(Duration.ofMinutes(30)).thenApply(r -> { + timeoutScope.cancel(); + return null; +}); +``` + +## Wait Condition with Timeout + +```java +public class MyWorkflowImpl implements MyWorkflow { + private boolean approved = false; + + @Override + public String run() { + // Wait for approval with 24-hour timeout + boolean received = Workflow.await(Duration.ofHours(24), () -> this.approved); + if (received) { + return "approved"; + } + return "auto-rejected due to timeout"; + } +} +``` + +## Waiting for All Handlers to Finish + +Signal and update handlers should generally be non-async (avoid running activities from them). Otherwise, the workflow may complete before handlers finish their execution. However, making handlers non-async sometimes requires workarounds that add complexity. + +When handlers do run async operations, call `Workflow.await(() -> Workflow.isEveryHandlerFinished())` at the end of your workflow (or before continue-as-new) to prevent completion until all pending handlers complete. + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run() { + // ... main workflow logic ... 
+ + // Before exiting, wait for all handlers to finish + Workflow.await(() -> Workflow.isEveryHandlerFinished()); + return "done"; + } +} +``` + +## Activity Heartbeat Details + +### WHY: +- **Support activity cancellation** — Cancellations are delivered via heartbeat; activities that don't heartbeat won't know they've been cancelled +- **Resume progress after worker failure** — Heartbeat details persist across retries + +### WHEN: +- **Cancellable activities** — Any activity that should respond to cancellation +- **Long-running activities** — Track progress for resumability +- **Checkpointing** — Save progress periodically + +```java +@ActivityInterface +public interface MyActivities { + @ActivityMethod + String processLargeFile(String filePath); +} + +public class MyActivitiesImpl implements MyActivities { + @Override + public String processLargeFile(String filePath) { + ActivityExecutionContext ctx = Activity.getExecutionContext(); + + // Get heartbeat details from previous attempt (if any) + Optional lastLine = ctx.getHeartbeatDetails(Integer.class); + int startLine = lastLine.orElse(0); + + try { + List lines = readFile(filePath); + for (int i = startLine; i < lines.size(); i++) { + processLine(lines.get(i)); + + // Heartbeat with progress + // If cancelled, heartbeat() throws CanceledFailure + ctx.heartbeat(i + 1); + } + return "completed"; + } catch (ActivityCompletionException e) { + // CanceledFailure extends ActivityCompletionException + cleanup(); + throw e; + } + } +} +``` + +Set `heartbeatTimeout` in `ActivityOptions` to enable heartbeat-based failure detection: + +```java +ActivityOptions options = ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofHours(1)) + .setHeartbeatTimeout(Duration.ofSeconds(30)) + .build(); +``` + +## Timers + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run() { + Workflow.sleep(Duration.ofHours(1)); + + return "Timer fired"; + } +} +``` + +## Local Activities + 
+**Purpose**: Reduce latency for short, lightweight operations by skipping the task queue. ONLY use these when necessary for performance. Do NOT use these by default, as they are not durable and distributed. + +```java +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run() { + MyActivities localActivities = Workflow.newLocalActivityStub( + MyActivities.class, + LocalActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(5)) + .build()); + + String result = localActivities.quickLookup("key"); + return result; + } +} +``` diff --git a/references/java/testing.md b/references/java/testing.md new file mode 100644 index 0000000..80ed9b2 --- /dev/null +++ b/references/java/testing.md @@ -0,0 +1,184 @@ +# Java SDK Testing + +## Overview + +You test Temporal Java Workflows using `TestWorkflowEnvironment` (manual setup) or `TestWorkflowExtension` (JUnit 5). Activity mocking uses Mockito. The SDK provides `WorkflowReplayer` for replay-based compatibility testing. 
+ +## Workflow Test Environment + +```java +import io.temporal.testing.TestWorkflowExtension; +import io.temporal.testing.TestWorkflowEnvironment; +import io.temporal.client.WorkflowClient; +import io.temporal.client.WorkflowOptions; +import io.temporal.worker.Worker; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class MyWorkflowTest { + + @RegisterExtension + public static final TestWorkflowExtension testWorkflowExtension = + TestWorkflowExtension.newBuilder() + .setWorkflowTypes(MyWorkflowImpl.class) + .setDoNotStart(true) + .build(); + + @Test + void testWorkflow(TestWorkflowEnvironment env, Worker worker, WorkflowClient client) { + worker.registerActivitiesImplementations(new MyActivitiesImpl()); + env.start(); + + MyWorkflow workflow = client.newWorkflowStub( + MyWorkflow.class, + WorkflowOptions.newBuilder() + .setTaskQueue(worker.getTaskQueue()) + .build()); + + String result = workflow.run("input"); + assertEquals("expected", result); + } +} +``` + +For manual lifecycle control (e.g., JUnit 4 or custom setups), use `TestWorkflowEnvironment` directly with `@BeforeEach`/`@AfterEach`. 
+ +## Mocking Activities + +```java +import static org.mockito.Mockito.*; + +@Test +void testWithMockedActivities( + TestWorkflowEnvironment env, + Worker worker, + WorkflowClient client) { + // withoutAnnotations() prevents Mockito from copying Temporal annotations + MyActivities activities = mock(MyActivities.class, withSettings().withoutAnnotations()); + when(activities.composeGreeting("Hello", "World")).thenReturn("mocked result"); + + worker.registerActivitiesImplementations(activities); + env.start(); + + MyWorkflow workflow = client.newWorkflowStub( + MyWorkflow.class, + WorkflowOptions.newBuilder() + .setTaskQueue(worker.getTaskQueue()) + .build()); + + String result = workflow.run("input"); + assertEquals("mocked result", result); + verify(activities).composeGreeting("Hello", "World"); +} +``` + +## Testing Signals and Queries + +```java +@Test +void testSignalsAndQueries( + TestWorkflowEnvironment env, + Worker worker, + WorkflowClient client) { + worker.registerActivitiesImplementations(new MyActivitiesImpl()); + env.start(); + + MyWorkflow workflow = client.newWorkflowStub( + MyWorkflow.class, + WorkflowOptions.newBuilder() + .setTaskQueue(worker.getTaskQueue()) + .build()); + + // Start workflow asynchronously + WorkflowClient.start(workflow::run, "input"); + + // Send signal + workflow.mySignal("data"); + + // Query state + String status = workflow.getStatus(); + assertEquals("expected", status); + + // Wait for completion + String result = WorkflowStub.fromTyped(workflow).getResult(String.class); +} +``` + +## Testing Failure Cases + +```java +import io.temporal.client.WorkflowException; + +@Test +void testActivityFailure( + TestWorkflowEnvironment env, + Worker worker, + WorkflowClient client) { + MyActivities activities = mock(MyActivities.class, withSettings().withoutAnnotations()); + when(activities.unreliableAction(anyString())) + .thenThrow(new RuntimeException("Simulated failure")); + + worker.registerActivitiesImplementations(activities); + 
env.start(); + + MyWorkflow workflow = client.newWorkflowStub( + MyWorkflow.class, + WorkflowOptions.newBuilder() + .setTaskQueue(worker.getTaskQueue()) + .build()); + + assertThrows(WorkflowException.class, () -> workflow.run("input")); +} +``` + +## Workflow Replay Testing + +```java +import io.temporal.testing.WorkflowReplayer; + +@Test +void testReplayFromHistory() throws Exception { + WorkflowReplayer.replayWorkflowExecutionFromResource( + "my-workflow-history.json", + MyWorkflowImpl.class); +} +``` + +Replay from a `WorkflowExecutionHistory` object: + +```java +import io.temporal.common.WorkflowExecutionHistory; + +@Test +void testReplayFromJsonString() throws Exception { + String historyJson = new String(Files.readAllBytes(Paths.get("history.json"))); + WorkflowReplayer.replayWorkflowExecution( + WorkflowExecutionHistory.fromJson(historyJson), + MyWorkflowImpl.class); +} +``` + +## Activity Testing + +Activity implementations are plain Java classes. Test them directly: + +```java +@Test +void testActivity() { + MyActivitiesImpl activities = new MyActivitiesImpl(); + String result = activities.composeGreeting("Hello", "World"); + assertEquals("Hello World", result); +} +``` + +For activities that use `Activity.getExecutionContext()` or heartbeating, use `TestActivityEnvironment` to provide the activity context. + +## Best Practices + +1. Use `TestWorkflowExtension` with JUnit 5 for concise test setup +2. Always use `withSettings().withoutAnnotations()` when mocking activity interfaces with Mockito +3. Mock external dependencies in activities, not in workflows +4. Test replay compatibility when changing workflow code (see `references/java/determinism.md`) +5. Test signal/query handlers explicitly +6.
Use unique task queues per test to avoid conflicts (handled automatically by `TestWorkflowExtension`) diff --git a/references/java/versioning.md b/references/java/versioning.md new file mode 100644 index 0000000..d1a9205 --- /dev/null +++ b/references/java/versioning.md @@ -0,0 +1,281 @@ +# Java SDK Versioning + +For conceptual overview and guidance on choosing an approach, see `references/core/versioning.md`. + +## Patching API + +### Workflow.getVersion() + +`Workflow.getVersion(String changeId, int minSupported, int maxSupported)` returns the version to use for a given change: + +```java +import io.temporal.workflow.Workflow; + +@WorkflowInterface +public interface ShippingWorkflow { + @WorkflowMethod + void run(); +} + +public class ShippingWorkflowImpl implements ShippingWorkflow { + @Override + public void run() { + int version = Workflow.getVersion( + "send-email-instead-of-fax", + Workflow.DEFAULT_VERSION, // minSupported (no change) + 1 // maxSupported (current version) + ); + + if (version == 1) { + // New code path + Workflow.newActivityStub(MyActivities.class, options).sendEmail(); + } else { + // Old code path (for replay of existing workflows) + Workflow.newActivityStub(MyActivities.class, options).sendFax(); + } + } +} +``` + +**How it works:** +- For new executions: returns `maxSupported` and records a marker in history +- For replay with the marker: returns the recorded version +- For replay without the marker: returns `DEFAULT_VERSION` (-1) + +### Three-Step Patching Process + +**Step 1: Patch in New Code** + +Add the version check with both old and new code paths: + +```java +public class OrderWorkflowImpl implements OrderWorkflow { + @Override + public String run(Order order) { + int version = Workflow.getVersion( + "add-fraud-check", + Workflow.DEFAULT_VERSION, + 1); + + if (version >= 1) { + activities.checkFraud(order); + } + + return activities.processPayment(order); + } +} +``` + +**Step 2: Remove Old Code Path** + +Once all pre-patch 
Workflow Executions have completed, remove the old branch and set `minSupported` to `1`: + +```java +public class OrderWorkflowImpl implements OrderWorkflow { + @Override + public String run(Order order) { + Workflow.getVersion("add-fraud-check", 1, 1); + + activities.checkFraud(order); + return activities.processPayment(order); + } +} +``` + +**Step 3: Remove the Patch** + +After all workflows with the patch marker have completed, remove the `getVersion` call entirely: + +```java +public class OrderWorkflowImpl implements OrderWorkflow { + @Override + public String run(Order order) { + activities.checkFraud(order); + return activities.processPayment(order); + } +} +``` + +### Recording TemporalChangeVersion Search Attribute + +Unlike the Python and TypeScript SDKs, the Java SDK does **not** automatically record the `TemporalChangeVersion` search attribute. You must manually upsert it: + +```java +import io.temporal.workflow.Workflow; +import io.temporal.common.SearchAttributeKey; +import java.util.List; + +public class OrderWorkflowImpl implements OrderWorkflow { + private static final SearchAttributeKey<List<String>> TEMPORAL_CHANGE_VERSION = + SearchAttributeKey.forKeywordList("TemporalChangeVersion"); + + @Override + public String run(Order order) { + int version = Workflow.getVersion("add-fraud-check", Workflow.DEFAULT_VERSION, 1); + + // Manually record for query filtering + Workflow.upsertTypedSearchAttributes( + TEMPORAL_CHANGE_VERSION.valueSet(List.of("add-fraud-check-1"))); + + if (version >= 1) { + activities.checkFraud(order); + } + return activities.processPayment(order); + } +} +``` + +Query with: + +```bash +temporal workflow list --query \ + 'TemporalChangeVersion = "add-fraud-check-1" AND ExecutionStatus = "Running"' +``` + +## Workflow Type Versioning + +For incompatible changes, create a new Workflow Type: + +```java +@WorkflowInterface +public interface PizzaWorkflow { + @WorkflowMethod + String run(PizzaOrder order); +} + +// Original implementation +public
class PizzaWorkflowImpl implements PizzaWorkflow { + @Override + public String run(PizzaOrder order) { + return processOrderV1(order); + } +} + +// New workflow type for incompatible changes +@WorkflowInterface +public interface PizzaWorkflowV2 { + @WorkflowMethod + String run(PizzaOrder order); +} + +public class PizzaWorkflowV2Impl implements PizzaWorkflowV2 { + @Override + public String run(PizzaOrder order) { + return processOrderV2(order); + } +} +``` + +Register both with the Worker: + +```java +worker.registerWorkflowImplementationTypes( + PizzaWorkflowImpl.class, + PizzaWorkflowV2Impl.class); +``` + +Start new workflows with the new type: + +```java +PizzaWorkflowV2 workflow = client.newWorkflowStub( + PizzaWorkflowV2.class, + WorkflowOptions.newBuilder() + .setTaskQueue("pizza-task-queue") + .build()); +workflow.run(order); +``` + +Check for open executions before removing the old type: + +```bash +temporal workflow list --query 'WorkflowType = "PizzaWorkflow" AND ExecutionStatus = "Running"' +``` + +## Worker Versioning + +Worker Versioning manages versions at the deployment level. Available since Java SDK v1.29. + +### Key Concepts + +- **Worker Deployment**: A logical group of Workers processing the same Task Queue, identified by a deployment name (e.g., `"order-service"`). +- **Worker Deployment Version**: A specific version within a deployment, identified by the combination of deployment name and Build ID (e.g., `"order-service:v1.0.0"`). Each version corresponds to a particular code revision. 
+ +### Configuring Workers + +```java +import io.temporal.worker.Worker; +import io.temporal.worker.WorkerFactory; +import io.temporal.worker.WorkerOptions; +import io.temporal.worker.WorkerDeploymentOptions; +import io.temporal.worker.WorkerDeploymentVersion; + +WorkerDeploymentVersion version = WorkerDeploymentVersion.newBuilder() + .setDeploymentName("order-service") + .setBuildId("v1.0.0") // or git commit hash + .build(); + +WorkerDeploymentOptions deploymentOptions = WorkerDeploymentOptions.newBuilder() + .setVersion(version) + .setUseWorkerVersioning(true) + .build(); + +WorkerFactory factory = WorkerFactory.newInstance(client); +Worker worker = factory.newWorker( + "my-task-queue", + WorkerOptions.newBuilder() + .setDeploymentOptions(deploymentOptions) + .build()); + +worker.registerWorkflowImplementationTypes(MyWorkflowImpl.class); +worker.registerActivitiesImplementations(new MyActivitiesImpl()); +factory.start(); +``` + +### PINNED vs AUTO_UPGRADE Behaviors + +Set the versioning behavior on the workflow definition: + +```java +import io.temporal.workflow.VersioningBehavior; +import io.temporal.workflow.Workflow; + +public class MyWorkflowImpl implements MyWorkflow { + @Override + public String run(String input) { + Workflow.setVersioningBehavior(VersioningBehavior.PINNED); + // ... workflow logic + } +} +``` + +**PINNED**: Workflow stays on the Worker version that started it. Use for short-running workflows or when consistency within a single execution is critical. New workflows start on the current version; existing ones stay put. + +**AUTO_UPGRADE**: Workflow moves to the latest Worker version on the next Workflow Task. Use for long-running workflows that need bug fixes or feature updates. Combine with `Workflow.getVersion()` patching to handle version transitions safely. + +### Deployment Strategies + +**Blue-Green**: Run two deployment versions simultaneously. Set the new version as the current deployment. 
PINNED workflows finish on the old version; new workflows start on the new version. Drain the old version once all its workflows complete. + +**Rainbow**: Run multiple versions concurrently for gradual rollouts. Each version handles its own workflows. Useful when you have many long-running PINNED workflows across several code revisions. + +### Querying Workflows by Worker Version + +```bash +# List workflows running on a specific version +temporal workflow list --query \ + 'TemporalWorkerDeploymentVersion = "order-service:v1.0.0" AND ExecutionStatus = "Running"' + +# Count workflows per version to monitor drain progress +temporal workflow count --query \ + 'TemporalWorkerDeploymentVersion = "order-service:v1.0.0" AND ExecutionStatus = "Running"' +``` + +## Best Practices + +1. **Check for open executions** before removing old code paths +2. **Use descriptive change IDs** that explain the change (e.g., `"add-fraud-check"` not `"patch-1"`) +3. **Deploy patches incrementally**: patch, remove old path, remove `getVersion` +4. **Manually upsert `TemporalChangeVersion`** search attribute when using `getVersion` if you need query filtering +5. **Use PINNED for short workflows** to simplify version management +6. **Use AUTO_UPGRADE with patching** for long-running workflows that need updates +7. **Generate Build IDs from code** (git hash) to ensure changes produce new versions From 7d5ae761fffd79fbcf0492d6b04b5d1ebbc8ae2c Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Thu, 2 Apr 2026 17:08:53 -0400 Subject: [PATCH 18/42] Reduce repetition in determinism sans sandboxing (#67) * Reduce repetition in determinism sans sandboxing. 
* fix merge --- references/go/determinism-protection.md | 2 +- references/go/go.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/references/go/determinism-protection.md b/references/go/determinism-protection.md index cc8d8f5..b37d94a 100644 --- a/references/go/determinism-protection.md +++ b/references/go/determinism-protection.md @@ -2,7 +2,7 @@ ## Overview -The Go SDK has no runtime sandbox. Determinism is enforced by **developer convention** and **optional static analysis**. Unlike the Python and TypeScript SDKs, the Go SDK will not intercept or replace non-deterministic calls at runtime. The Go SDK does perform a limited runtime command-ordering check, but catching non-deterministic code before deployment requires the `workflowcheck` tool and testing, in particular replay tests (see `references/go/testing.md`). +The Go SDK has no runtime sandbox (only Python and TypeScript have sandboxing). Determinism is enforced by **developer convention** and **optional static analysis**. The Go SDK will not intercept or replace non-deterministic calls at runtime. The Go SDK does perform a limited runtime command-ordering check, but catching non-deterministic code before deployment requires the `workflowcheck` tool and testing, in particular replay tests (see `references/go/testing.md`). ## workflowcheck Static Analysis diff --git a/references/go/go.md b/references/go/go.md index 974ee7c..827d35c 100644 --- a/references/go/go.md +++ b/references/go/go.md @@ -2,7 +2,7 @@ ## Overview -The Temporal Go SDK (`go.temporal.io/sdk`) provides a strongly-typed, idiomatic Go approach to building durable workflows. Workflows are regular exported Go functions. The Go SDK does not have an automatic sandbox -- determinism is the developer's responsibility, aided by the `workflowcheck` static analysis tool. +The Temporal Go SDK (`go.temporal.io/sdk`) provides a strongly-typed, idiomatic Go approach to building durable workflows. 
Workflows are regular exported Go functions. ## Quick Start From 25eb55398ca0a34b56fcdebe4dccda00e4884cbf Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Fri, 3 Apr 2026 05:11:50 -0400 Subject: [PATCH 19/42] Bump to 0.2.0 for Java release (#72) --- SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SKILL.md b/SKILL.md index 38c2185..9043177 100644 --- a/SKILL.md +++ b/SKILL.md @@ -1,7 +1,7 @@ --- name: temporal-developer description: This skill should be used when the user asks to "create a Temporal workflow", "write a Temporal activity", "debug stuck workflow", "fix non-determinism error", "Temporal Python", "Temporal TypeScript", "Temporal Go", "Temporal Golang", "Temporal Java", "workflow replay", "activity timeout", "signal workflow", "query workflow", "worker not starting", "activity keeps retrying", "Temporal heartbeat", "continue-as-new", "child workflow", "saga pattern", "workflow versioning", "durable execution", "reliable distributed systems", or mentions Temporal SDK development. 
-version: 0.1.0 +version: 0.2.0 --- # Skill: temporal-developer From 33747f0438a10316c2e8727a23fd1278d2c9f172 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Fri, 17 Apr 2026 10:11:47 -0400 Subject: [PATCH 20/42] Auto-formatting: ran `mdformat --extensions frontmatter --number .` (#81) * Auto-formatting: ran `mdformat --extensions frontmatter --number .` * manual tweaks * manual tweaks --- README.md | 4 +- SKILL.md | 22 +++++----- references/core/ai-patterns.md | 12 +++-- references/core/determinism.md | 13 +++++- references/core/dev-management.md | 1 - references/core/error-reference.md | 16 +++---- references/core/gotchas.md | 28 +++++++++--- references/core/patterns.md | 44 +++++++++++++++++-- references/core/troubleshooting.md | 14 +++--- references/core/versioning.md | 4 ++ references/go/advanced-features.md | 2 + references/go/data-handling.md | 2 + references/go/determinism-protection.md | 5 +++ references/go/go.md | 12 +++++ references/go/gotchas.md | 1 + references/go/observability.md | 3 ++ references/go/patterns.md | 3 ++ references/go/versioning.md | 6 +++ references/java/gotchas.md | 2 + references/java/java.md | 16 ++++++- references/java/observability.md | 1 + references/java/patterns.md | 2 + references/java/versioning.md | 1 + references/python/advanced-features.md | 2 +- references/python/data-handling.md | 2 + references/python/determinism-protection.md | 2 + references/python/determinism.md | 1 + references/python/gotchas.md | 2 + references/python/observability.md | 3 +- references/python/patterns.md | 3 ++ references/python/python.md | 10 ++++- references/python/sync-vs-async.md | 4 ++ references/python/testing.md | 1 - references/python/versioning.md | 6 +++ references/typescript/advanced-features.md | 4 ++ references/typescript/data-handling.md | 1 + .../typescript/determinism-protection.md | 1 - references/typescript/gotchas.md | 1 + references/typescript/patterns.md | 3 ++ references/typescript/typescript.md | 14 +++++- 
references/typescript/versioning.md | 3 ++ 41 files changed, 226 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index 124b367..8f2a0a5 100644 --- a/README.md +++ b/README.md @@ -12,12 +12,12 @@ A comprehensive skill for developers to use when building [Temporal](https://tem This skill is housed within a [Claude Code plugin](https://github.com/temporalio/agent-skills), which provides a simple way to install and receive future updates to the skill. -1. Run `/plugin marketplace add temporalio/agent-skills` +1. Run `/plugin marketplace add temporalio/agent-skills` 2. Run `/plugin` to open the plugin manager 3. Select **Marketplaces** 4. Choose `temporal-marketplace` from the list 5. Select **Enable auto-update** or **Disable auto-update** -6. run `/plugin install temporal-developer@temporalio-agent-skills` +6. run `/plugin install temporal-developer@temporalio-agent-skills` 7. Restart Claude Code ### Via `npx skills` - supports all major coding agents diff --git a/SKILL.md b/SKILL.md index 9043177..0ed18f0 100644 --- a/SKILL.md +++ b/SKILL.md @@ -77,34 +77,34 @@ Once you've downloaded the file, extract the downloaded archive and add the temp ### Read All Relevant References 1. First, read the getting started guide for the language you are working in: - - Python -> read `references/python/python.md` - - TypeScript -> read `references/typescript/typescript.md` - - Java -> read `references/java/java.md` - - Go -> read `references/go/go.md` + - Python -> read `references/python/python.md` + - TypeScript -> read `references/typescript/typescript.md` + - Java -> read `references/java/java.md` + - Go -> read `references/go/go.md` 2. Second, read appropriate `core` and language-specific references for the task at hand. 
- ## Primary References + - **`references/core/determinism.md`** - Why determinism matters, replay mechanics, basic concepts of activities - + Language-specific info at `references/{your_language}/determinism.md` + - Language-specific info at `references/{your_language}/determinism.md` - **`references/core/patterns.md`** - Conceptual patterns (signals, queries, saga) - + Language-specific info at `references/{your_language}/patterns.md` + - Language-specific info at `references/{your_language}/patterns.md` - **`references/core/gotchas.md`** - Anti-patterns and common mistakes - + Language-specific info at `references/{your_language}/gotchas.md` + - Language-specific info at `references/{your_language}/gotchas.md` - **`references/core/versioning.md`** - Versioning strategies and concepts - how to safely change workflow code while workflows are running - + Language-specific info at `references/{your_language}/versioning.md` + - Language-specific info at `references/{your_language}/versioning.md` - **`references/core/troubleshooting.md`** - Decision trees, recovery procedures - **`references/core/error-reference.md`** - Common error types, workflow status reference - **`references/core/interactive-workflows.md`** - Testing signals, updates, queries - **`references/core/dev-management.md`** - Dev cycle & management of server and workers - **`references/core/ai-patterns.md`** - AI/LLM pattern concepts - + Language-specific info at `references/{your_language}/ai-patterns.md`, if available. Currently Python only. + - Language-specific info at `references/{your_language}/ai-patterns.md`, if available. Currently Python only. 
## Additional Topics + - **`references/{your_language}/observability.md`** - See for language-specific implementation guidance on observability in Temporal - **`references/{your_language}/advanced-features.md`** - See for language-specific guidance on advanced Temporal features and language-specific features - ## Feedback ### ALWAYS PROMPT USER TO PROVIDE FEEDBACK AT STARTUP diff --git a/references/core/ai-patterns.md b/references/core/ai-patterns.md index 071b9f0..d680bec 100644 --- a/references/core/ai-patterns.md +++ b/references/core/ai-patterns.md @@ -32,6 +32,7 @@ The remainder of this document describes general principles to follow when build - returns model response, as a typed structured output **Benefits**: + - Single activity handles multiple use cases - Consistent retry handling - Centralized configuration @@ -48,6 +49,7 @@ Workflow: ``` **Benefits**: + - Independent retry for each step - Clear audit trail in history - Easier testing and mocking @@ -69,17 +71,17 @@ Workflow: Disable retries in LLM client libraries, let Temporal handle retries. - LLM Client Config: - - max_retries = 0 ← Disable client retries at the LLM client level + - max_retries = 0 ← Disable client retries at the LLM client level Use either the default activity retry policy, or customize it as needed for the situation. 
**Why**: + - Temporal retries are durable (survive crashes) - Single retry configuration point - Better visibility into retry attempts - Consistent backoff behavior - ### Pattern 5: Multi-Agent Orchestration Complex pipelines with multiple specialized agents: @@ -114,6 +116,7 @@ Deep Research Example: | Document processing | 60-120 seconds | **Rationale**: + - Reasoning models need time for complex computation - Web searches may hit rate limits requiring backoff - Fast timeouts catch stuck operations @@ -128,7 +131,6 @@ Parse rate limit info from API responses: - Response Headers: - Retry-After: 30 - X-RateLimit-Remaining: 0 - - Activity: - If rate limited: - Raise retryable error with a next retry delay @@ -137,12 +139,14 @@ Parse rate limit info from API responses: ## Error Handling ### Retryable Errors + - Rate limits (429) - Timeouts - Temporary server errors (500, 502, 503) - Network errors ### Non-Retryable Errors + - Invalid API key (401) - Invalid input/prompt - Content policy violations @@ -161,6 +165,6 @@ Parse rate limit info from API responses: ## Observability See `references/{your_language}/observability.md` for the language you are working in for documentation on implementing observability in Temporal. It is generally recommended to add observability for: + - Token usage, via activity logging - anything else to help track LLM usage and debug agentic flows, within moderation. 
- diff --git a/references/core/determinism.md b/references/core/determinism.md index 16f04db..952cca4 100644 --- a/references/core/determinism.md +++ b/references/core/determinism.md @@ -50,22 +50,27 @@ Result: Commands don't match history → NondeterminismError ## Sources of Non-Determinism ### Time-Based Operations + - `datetime.now()`, `time.time()`, `Date.now()` - Different value on each execution ### Random Values + - `random.random()`, `Math.random()`, `uuid.uuid4()` - Different value on each execution ### External State + - Reading files, environment variables, databases, networking / HTTP calls - State may change between executions ### Non-Deterministic Iteration + - Map/dict iteration order (in some languages) - Set iteration order ### Threading/Concurrency + - Race conditions produce different outcomes - Non-deterministic ordering @@ -76,9 +81,10 @@ In Temporal, activities are the primary mechanism for making non-deterministic c For a few simple cases, like timestamps, random values, UUIDs, etc. the Temporal SDK in your language may provide durable variants that are simple to use. See `references/{your_language}/determinism.md` for the language you are working in for more info. ## SDK Protection Mechanisms + Each Temporal SDK language provides a different level of protection against non-determinism: -- Python: The Python SDK runs workflows in a sandbox that intercepts and aborts non-deterministic calls early at runtime. +- Python: The Python SDK runs workflows in a sandbox that intercepts and aborts non-deterministic calls early at runtime. - TypeScript: The TypeScript SDK runs workflows in an isolated V8 sandbox, intercepting many common sources of non-determinism and replacing them automatically with deterministic variants. - Java: The Java SDK has no sandbox. 
Determinism is enforced by developer conventions — the SDK provides `Workflow.*` APIs as safe alternatives (e.g., `Workflow.sleep()` instead of `Thread.sleep()`), and non-determinism is only detected at replay time via `NonDeterministicException`. A static analysis tool (`temporal-workflowcheck`, beta) can catch violations at build time. Cooperative threading under a global lock eliminates the need for synchronization. - Go: The Go SDK has no runtime sandbox. Therefore, non-determinism bugs will never be immediately apparent, and are usually only observable during replay. The optional `workflowcheck` static analysis tool can be used to check for many sources of non-determinism at compile time. @@ -88,6 +94,7 @@ Regardless of which SDK you are using, it is your responsibility to ensure that ## Detecting Non-Determinism ### During Execution + - `NondeterminismError` raised when Commands don't match Events - Workflow becomes blocked until code is fixed @@ -98,13 +105,17 @@ Replay tests verify that workflows follow identical code paths when re-run, by a ## Recovery from Non-Determinism ### Accidental Change + If you accidentally introduced non-determinism: + 1. Revert code to match what's in history 2. Restart worker 3. Workflow auto-recovers ### Intentional Change + If you need to change workflow logic: + 1. Use the **Patching API** to support both old and new code paths 2. Or terminate old workflows and start new ones with updated code diff --git a/references/core/dev-management.md b/references/core/dev-management.md index 01faed0..45385d3 100644 --- a/references/core/dev-management.md +++ b/references/core/dev-management.md @@ -20,7 +20,6 @@ When you need a new worker, you should start it in the background (and preferrab **Best practice**: As far as local development goes, run only ONE worker instance with the latest code. Don't keep stale workers (running old code) around. - ### Cleanup **Always kill workers when done.** Don't leave workers running. 
diff --git a/references/core/error-reference.md b/references/core/error-reference.md index 74570ae..29a40b7 100644 --- a/references/core/error-reference.md +++ b/references/core/error-reference.md @@ -6,14 +6,14 @@ | **Deadlock** | TMPRL1101 | `WorkflowTaskFailed` in history, worker logs | Workflow blocked too long (deadlock detected) | Remove blocking operations from workflow code (no I/O, no sleep, no threading locks). Use Temporal primitives instead. | https://github.com/temporalio/rules/blob/main/rules/TMPRL1101.md | | **Unfinished handlers** | TMPRL1102 | `WorkflowTaskFailed` in history | Workflow completed while update/signal handlers still running | Ensure all handlers complete before workflow finishes. Use `workflow.wait_condition()` to wait for handler completion. | https://github.com/temporalio/rules/blob/main/rules/TMPRL1102.md | | **Payload overflow** | TMPRL1103 | `WorkflowTaskFailed` or `ActivityTaskFailed` in history | Payload size limit exceeded (default 2MB) | Reduce payload size. Use external storage (S3, database) for large data and pass references instead. 
| https://github.com/temporalio/rules/blob/main/rules/TMPRL1103.md | -| **Workflow code bug** | | `WorkflowTaskFailed` in history | Bug in workflow logic | Fix code → Restart worker → Workflow auto-resumes | | -| **Missing workflow** | | Worker logs | Workflow not registered | Add to worker.py → Restart worker | | -| **Missing activity** | | Worker logs | Activity not registered | Add to worker.py → Restart worker | | -| **Activity bug** | | `ActivityTaskFailed` in history | Bug in activity code | Fix code → Restart worker → Auto-retries | | -| **Activity retries** | | `ActivityTaskFailed` (count >2) | Repeated failures | Fix code → Restart worker → Auto-retries | | -| **Sandbox violation** | | Worker logs | Bad imports in workflow | Fix workflow.py imports → Restart worker | | -| **Task queue mismatch** | | Workflow never starts | Different queues in starter/worker | Align task queue names | | -| **Timeout** | | Status = TIMED_OUT | Operation too slow | Increase timeout config | | +| **Workflow code bug** | | `WorkflowTaskFailed` in history | Bug in workflow logic | Fix code → Restart worker → Workflow auto-resumes | | +| **Missing workflow** | | Worker logs | Workflow not registered | Add to worker.py → Restart worker | | +| **Missing activity** | | Worker logs | Activity not registered | Add to worker.py → Restart worker | | +| **Activity bug** | | `ActivityTaskFailed` in history | Bug in activity code | Fix code → Restart worker → Auto-retries | | +| **Activity retries** | | `ActivityTaskFailed` (count >2) | Repeated failures | Fix code → Restart worker → Auto-retries | | +| **Sandbox violation** | | Worker logs | Bad imports in workflow | Fix workflow.py imports → Restart worker | | +| **Task queue mismatch** | | Workflow never starts | Different queues in starter/worker | Align task queue names | | +| **Timeout** | | Status = TIMED_OUT | Operation too slow | Increase timeout config | | ## Workflow Status Reference diff --git a/references/core/gotchas.md 
b/references/core/gotchas.md index 55b6ddb..677362f 100644 --- a/references/core/gotchas.md +++ b/references/core/gotchas.md @@ -9,6 +9,7 @@ This document provides a general overview of conceptual-level gotchas in Tempora **The Problem**: Activities may execute more than once due to retries or Worker failures. If an activity calls an external service without an idempotency key, you may charge a customer twice, send duplicate emails, or create duplicate records. **Symptoms**: + - Duplicate side effects (double charges, duplicate notifications) - Data inconsistencies after retries @@ -21,6 +22,7 @@ This document provides a general overview of conceptual-level gotchas in Tempora **The Problem**: Code in workflow functions runs on first execution AND on every replay. Any side effect (logging, notifications, metrics, etc.) will happen multiple times and non-deterministic code (IO, current time, random numbers, threading, etc.) won't replay correctly. **Symptoms**: + - Non-determinism errors - Sandbox violations, depending on SDK language - Duplicate log entries @@ -28,11 +30,12 @@ This document provides a general overview of conceptual-level gotchas in Tempora - Inflated metrics **The Fix**: + - Use Temporal replay-aware managed side effects for common, non-business logic cases: - - Temporal workflow logging - - Temporal date time - - Temporal UUID generation - - Temporal random number generation + - Temporal workflow logging + - Temporal date time + - Temporal UUID generation + - Temporal random number generation - Put all other side effects in Activities See `references/core/determinism.md` for more info. @@ -42,10 +45,12 @@ See `references/core/determinism.md` for more info. **The Problem**: If Worker A runs part of a workflow with code v1, then Worker B (with code v2) picks it up, replay may produce different Commands. 
**Symptoms**: + - Non-determinism errors after deploying new code - Errors mentioning "command mismatch" or "unexpected command" **The Fix**: + - Use Worker Versioning for production deployments - Use patching APIs - During development: kill old workers before starting new ones @@ -60,6 +65,7 @@ See `references/core/versioning.md` for more info. **The Problem**: Using aggressive activity retry policies that give up too easily. **Symptoms**: + - Workflows failing on transient errors - Unnecessary workflow failures during brief outages @@ -72,6 +78,7 @@ See `references/core/versioning.md` for more info. **The Problem**: Queries and update validators are read-only. Modifying state causes non-determinism on replay, and must strictly be avoided. **Symptoms**: + - State inconsistencies after workflow replay - Non-determinism errors @@ -82,6 +89,7 @@ See `references/core/versioning.md` for more info. **The Problem**: Queries and update validators must return immediately. They cannot await activities, child workflows, timers, or conditions. **Symptoms**: + - Query / update validators timeouts - Deadlocks @@ -110,6 +118,7 @@ See language-specific gotchas for details. **The Problem**: Not testing what happens when things go wrong. **Questions to answer**: + - What happens when an Activity exhausts all retries? - What happens when a workflow is cancelled mid-execution? - What happens during a Worker restart? @@ -121,6 +130,7 @@ See language-specific gotchas for details. **The Problem**: Changing workflow code without verifying existing workflows can still replay. **Symptoms**: + - Non-determinism errors after deployment - Stuck workflows that can't make progress @@ -133,6 +143,7 @@ See language-specific gotchas for details. **The Problem**: Catching errors without proper handling hides failures. **Symptoms**: + - Silent failures - Workflows completing "successfully" despite errors - Difficult debugging @@ -144,10 +155,12 @@ See language-specific gotchas for details. 
**The Problem**: Marking transient errors as non-retryable, or permanent errors as retryable. **Symptoms**: + - Workflows failing on temporary network issues (if marked non-retryable) - Infinite retries on invalid input (if marked retryable) **The Fix**: + - **Retryable**: Network errors, timeouts, rate limits, temporary unavailability - **Non-retryable**: Invalid input, authentication failures, business rule violations, resource not found @@ -158,6 +171,7 @@ See language-specific gotchas for details. **The Problem**: When a workflow is cancelled, cleanup code after the cancellation point doesn't run unless explicitly protected. **Symptoms**: + - Resources not released after cancellation - Incomplete compensation/rollback - Leaked state @@ -169,10 +183,12 @@ See language-specific gotchas for details. **The Problem**: Activities must opt in to receive cancellation. Without proper handling, a cancelled activity continues running to completion, wasting resources. **Requirements for activity cancellation**: + 1. **Heartbeating** - Cancellation is delivered via heartbeat. Activities that don't heartbeat won't know they've been cancelled. 2. **Checking for cancellation** - Activity must explicitly check for cancellation or await a cancellation signal. **Symptoms**: + - Cancelled activities running to completion - Wasted compute on work that will be discarded - Delayed workflow cancellation @@ -184,11 +200,13 @@ See language-specific gotchas for details. **The Problem**: Temporal has built-in limits on payload sizes. Exceeding them causes workflows to fail. 
**Limits**: + - Max 2MB per individual payload - Max 4MB per gRPC message -- Max 50MB for complete workflow history (aim for <10MB in practice) +- Max 50MB for complete workflow history (aim for < 10MB in practice) **Symptoms**: + - Payload too large errors - gRPC message size exceeded errors - Workflow history growing unboundedly diff --git a/references/core/patterns.md b/references/core/patterns.md index 2ab5b72..7e7c7a3 100644 --- a/references/core/patterns.md +++ b/references/core/patterns.md @@ -2,8 +2,9 @@ ## Overview -Common patterns for building robust Temporal workflows. +Common patterns for building robust Temporal workflows. See the language-specific references for the language you are working in: + - `references/{language}/{language}.md` for the root level documentation for that language - `references/{language}/patterns.md` for language-specific example code of the patterns in this file. @@ -12,18 +13,21 @@ See the language-specific references for the language you are working in: **Purpose**: Send data to a running workflow asynchronously (fire-and-forget). **When to Use**: + - Human approval workflows - Adding items to a workflow's queue - Notifying workflow of external events - Live configuration updates **Characteristics**: + - Asynchronous - sender doesn't wait for response - Can mutate workflow state - Durable - signals are persisted in history - Can be sent before workflow starts (signal-with-start) **Example Flow**: + ``` Client Workflow │ │ @@ -41,12 +45,14 @@ you want the external process to Heartbeat or receive Cancellation. If this may **Purpose**: Read workflow state synchronously without modifying it. 
**When to Use**: + - Building dashboards showing workflow progress - Health checks and monitoring - Debugging workflow state - Exposing current status to external systems **Characteristics**: + - Synchronous - caller waits for response - Read-only - must not modify state - Not recorded in history @@ -54,6 +60,7 @@ you want the external process to Heartbeat or receive Cancellation. If this may - Can run even on completed workflows **Example Flow**: + ``` Client Workflow │ │ @@ -67,12 +74,14 @@ Client Workflow **Purpose**: Modify workflow state and receive a response synchronously. **When to Use**: + - Operations that need confirmation (add item, return count) - Validation before accepting changes - Replace signal+query combinations - Request-response patterns within workflow **Characteristics**: + - Synchronous - caller waits for completion - Can mutate state AND return values - Supports validators to reject invalid updates before they even get persisted into history @@ -80,6 +89,7 @@ Client Workflow - Recorded in history **Example Flow**: + ``` Client Workflow │ │ @@ -91,34 +101,39 @@ Client Workflow ## Child Workflows **When to Use**: + - Prevent history from growing too large - Isolate failure domains (child can fail without failing parent) - Different retry policies for different parts **Characteristics**: + - Own history (doesn't bloat parent) - Independent lifecycle options (ParentClosePolicy) - Can be cancelled independently - Results returned to parent **Parent Close Policies**: + - `TERMINATE` - Child terminated when parent closes (default) - `ABANDON` - Child continues running independently - `REQUEST_CANCEL` - Cancellation requested but not forced -**Note:** Do not need to use child workflows simply for breaking complex logic down into smaller pieces. Standard programming abstractions within a workflow can already be used for that. +**Note:** Do not need to use child workflows simply for breaking complex logic down into smaller pieces. 
Standard programming abstractions within a workflow can already be used for that. ## Continue-as-New **Purpose**: Prevent unbounded history growth by "restarting" with fresh history. **When to Use**: + - Long-running workflows (entity workflows, subscriptions) - Workflows with many iterations - When history approaches 10,000+ events - Periodic cleanup of accumulated state **How It Works**: + ``` Workflow (history: 10,000 events) │ @@ -136,12 +151,14 @@ New Workflow Execution (history: 0 events) **Purpose**: Distributed transactions with compensation for failures. **When to Use**: + - Multi-step operations that span services - Operations requiring rollback on failure - Financial transactions, order processing - Booking systems with multiple reservations **How It Works**: + ``` Step 1: Reserve inventory └─ Compensation: Release inventory @@ -158,6 +175,7 @@ On failure at step 3: ``` **Implementation Pattern**: + 1. Track compensation actions as you complete each step 2. On failure, execute compensations in reverse order 3. Handle compensation failures gracefully (log, alert, manual intervention) @@ -167,12 +185,14 @@ On failure at step 3: **Purpose**: Run multiple independent operations concurrently. **When to Use**: + - Processing multiple items that don't depend on each other - Calling multiple APIs simultaneously - Fan-out/fan-in patterns - Reducing total workflow duration **Patterns**: + - `Promise` / `asyncio` - Use traditional concurrency helpers (e.g. wait for all, wait for first, etc) - Partial failure handling - Continue with successful results @@ -181,12 +201,14 @@ On failure at step 3: **Purpose**: Model long-lived entities as workflows that handle events. **When to Use**: + - Subscription management - User sessions - Shopping carts - Any stateful entity receiving events over time **How It Works**: + ``` Entity Workflow (user-123) │ @@ -207,12 +229,14 @@ Entity Workflow (user-123) **Purpose**: Durable delays that survive worker restarts. 
**Use Cases**: + - Scheduled reminders - Timeout handling - Delayed actions - Polling with intervals **Characteristics**: + - Timers are durable (persisted in history) - Can be cancelled @@ -256,12 +280,13 @@ To ensure that polling_activity is restarted in a timely manner, we make sure th Define an Activity which fails (raises an exception) exactly when polling is not completed. The polling loop is accomplished via activity retries, by setting the following Retry options: + - backoff_coefficient: to 1 - initial_interval: to the polling interval (e.g. 60 seconds) This will enable the Activity to be retried exactly on the set interval. -**Advantage:** Individual Activity retries are not recorded in Workflow History, so this approach can poll for a very long time without affecting the history size. +**Advantage:** Individual Activity retries are not recorded in Workflow History, so this approach can poll for a very long time without affecting the history size. ## Idempotency Patterns @@ -285,6 +310,7 @@ Activity: charge_payment(order_id, amount) ``` **Good idempotency key sources**: + - Workflow ID (unique per workflow execution) - Business identifier (order ID, transaction ID) - Workflow ID + activity name + attempt number @@ -337,13 +363,15 @@ This ensures that on replay, already-completed steps are skipped. **Purpose**: Handle data that exceeds Temporal's payload limits without polluting workflow history. **Limits** (see `references/core/gotchas.md` for details): + - Max 2MB per individual payload - Max 4MB per gRPC message -- Max 50MB for workflow history (aim for <10MB) +- Max 50MB for workflow history (aim for < 10MB) **Key Principle**: Large data should never flow through workflow history. Activities read and write large data directly, passing only small references through the workflow. **Wrong Approach**: + ``` Workflow │ @@ -357,6 +385,7 @@ Workflow This defeats the purpose—large data enters workflow history multiple times. 
**Correct Approach**: + ``` Workflow │ @@ -369,6 +398,7 @@ Workflow The workflow only handles references (small strings). The activity does all large data operations internally. **Implementation Pattern**: + 1. Accept a reference (URL, S3 key, database ID) as activity input 2. Download/fetch the large data inside the activity 3. Process the data inside the activity @@ -376,6 +406,7 @@ The workflow only handles references (small strings). The activity does all larg 5. Return only a reference to the result **Other Strategies**: + - **Compression**: Use a PayloadCodec to compress data automatically - **Chunking**: Split large collections across multiple activities, each handling a subset @@ -384,11 +415,13 @@ The workflow only handles references (small strings). The activity does all larg **Purpose**: Enable cancellation delivery and progress tracking for long-running activities. **Why Heartbeat**: + 1. **Support activity cancellation** - Cancellations are delivered to activities via heartbeat. Activities that don't heartbeat won't know they've been cancelled. 2. **Resume progress after failure** - Heartbeat details persist across retries, allowing activities to resume where they left off. 3. **Detect stuck activities** - If an activity stops heartbeating, Temporal can time it out and retry. **How Cancellation Works**: + ``` Workflow requests activity cancellation │ @@ -411,17 +444,20 @@ Activity calls heartbeat() **Purpose**: Reduce latency for short, lightweight operations by skipping the task queue. ONLY use these when necessary for performance. Do NOT use these by default, as they are not durable and distributed. 
**When to Use**: + - Short operations completing in milliseconds/seconds - High-frequency calls where task queue overhead is significant - Low-latency requirements where you can't afford task queue round-trip **Characteristics**: + - Executes on the same worker that runs the workflow - No task queue round-trip (lower latency) - Still recorded in history - Should complete quickly (default timeout is short) **Trade-offs**: + - Less visibility in Temporal UI (no separate task) - Must complete on the same worker - Not suitable for long-running operations diff --git a/references/core/troubleshooting.md b/references/core/troubleshooting.md index 952d4e2..1df80f9 100644 --- a/references/core/troubleshooting.md +++ b/references/core/troubleshooting.md @@ -59,19 +59,15 @@ Workflow stuck in RUNNING? 1. **No worker running** - See references/core/dev-management.md - 2. **Worker on wrong task queue** - Check: Worker logs for task queue name - Fix: Start worker with matching task queue - 3. **Worker has stale code** - Check: Worker startup time vs code changes - Fix: Restart worker with updated code - 4. **Workflow waiting for signal** - Check: Workflow history for pending signals - Fix: Send expected signal or check signal sender - 5. **Activity stuck/timing out** - Check: Activity retry attempts in history - Fix: Investigate activity failure, increase timeout @@ -107,6 +103,7 @@ NondeterminismError? ### Common Causes 1. **Changed call order** + ``` # Before # After (BREAKS) await activity_a await activity_b @@ -114,28 +111,33 @@ NondeterminismError? ``` 2. **Changed call name** + ``` # Before # After (BREAKS) await process_order(...) await handle_order(...) ``` 3. **Added/removed call** + - Adding new activity mid-workflow - Removing activity that was previously called 4. **Using non-deterministic code** + - `datetime.now()` in workflow (use `workflow.now()`) - `random.random()` in workflow (use `workflow.random()`) ### Recovery **Accidental Change:** + 1. 
Identify the change 2. Revert code to match history 3. Restart worker 4. Workflow automatically recovers **Intentional Change:** + 1. Use patching API for gradual migration 2. Or terminate old workflows, start new ones @@ -163,11 +165,9 @@ Workflow status = FAILED? 1. **Unhandled exception in workflow** - Check error message and stack trace - Fix bug in workflow code - 2. **Activity exhausted retries** - All retry attempts failed - Check activity logs for root cause - 3. **Non-retryable error thrown** - Error marked as non-retryable - Intentional failure, check business logic @@ -236,11 +236,9 @@ Activity retrying repeatedly? 1. **Bug in activity code** - Fix the bug - Consider marking certain errors as non-retryable - 2. **External service down** - Retries are working as intended - Monitor service recovery - 3. **Invalid input** - Validate inputs before activity - Return non-retryable error for bad input diff --git a/references/core/versioning.md b/references/core/versioning.md index 226bb83..3081dcb 100644 --- a/references/core/versioning.md +++ b/references/core/versioning.md @@ -40,14 +40,17 @@ else: ### Three-Phase Lifecycle **Phase 1: Patch In** + - Add both old and new code paths - New workflows take new path, old workflows take old path **Phase 2: Deprecate** + - After all old workflows complete, remove old code - Keep deprecation marker for history compatibility **Phase 3: Remove** + - After all deprecated workflows complete - Remove patch entirely, only new code remains @@ -116,6 +119,7 @@ Worker v2.0 (Build ID: def456) **Build ID**: Specific code version (e.g., git commit hash) **Versioning Behaviors**: + - `PINNED` - Workflows stay on original worker version - `AUTO_UPGRADE` - Workflows can move to newer versions diff --git a/references/go/advanced-features.md b/references/go/advanced-features.md index 55e4e57..b64ce94 100644 --- a/references/go/advanced-features.md +++ b/references/go/advanced-features.md @@ -174,12 +174,14 @@ func 
FileProcessingWorkflow(ctx workflow.Context, file FileParam) error { ``` Key points: + - `workflow.ErrSessionFailed` is returned if the worker hosting the session dies - `CompleteSession` releases resources -- always call it (use `defer`) - Use case: file processing (download, process, upload on same host), GPU workloads, or any pipeline needing local state - `MaxConcurrentSessionExecutionSize` on `worker.Options` limits how many sessions a single worker can handle **Limitations:** + - Sessions do not survive worker process restarts — if the worker dies, the session fails and activities must be retried from the workflow level - There is no server-side support for sessions — the Go SDK implements them entirely client-side using internal task queue routing - Session concurrency limiting is per-process, not per-host — only one worker process per host if you rely on this diff --git a/references/go/data-handling.md b/references/go/data-handling.md index e887e7b..18ccf57 100644 --- a/references/go/data-handling.md +++ b/references/go/data-handling.md @@ -125,11 +125,13 @@ dataConverter := converter.NewCompositeDataConverter( ## Protobuf Support Binary protobuf: + ```go converter.NewProtoPayloadConverter() ``` JSON protobuf: + ```go converter.NewProtoJSONPayloadConverter() ``` diff --git a/references/go/determinism-protection.md b/references/go/determinism-protection.md index b37d94a..2cdd829 100644 --- a/references/go/determinism-protection.md +++ b/references/go/determinism-protection.md @@ -29,6 +29,7 @@ workflowcheck -show-pos ./... ### What It Detects **Non-deterministic functions/variables:** + - `time.Now` -- obtaining current time - `time.Sleep` -- sleeping - `crypto/rand.Reader` -- crypto random reader @@ -36,6 +37,7 @@ workflowcheck -show-pos ./... 
- `os.Stdin`, `os.Stdout`, `os.Stderr` -- standard I/O streams **Non-deterministic Go constructs:** + - Starting a goroutine (`go func()`) - Sending to a channel - Receiving from a channel @@ -45,6 +47,7 @@ workflowcheck -show-pos ./... ### Limitations `workflowcheck` cannot catch everything. It does **not** detect: + - Global variable mutation - Non-determinism via reflection - Runtime-conditional non-determinism @@ -72,6 +75,7 @@ workflowcheck -config workflowcheck.config.yaml ./... ## Determinism Rules **You must:** + - Use `workflow.Go(ctx, func(ctx workflow.Context) { ... })` instead of `go` - Use `workflow.NewChannel(ctx)` instead of `chan` - Use `workflow.NewSelector(ctx)` instead of `select` @@ -81,6 +85,7 @@ workflowcheck -config workflowcheck.config.yaml ./... - Sort map keys before iterating, or use `workflow.SideEffect` / an activity **You must not:** + - Start native goroutines - Use native channels or `select` - Call `time.Now()` or `time.Sleep()` diff --git a/references/go/go.md b/references/go/go.md index 827d35c..546e1b1 100644 --- a/references/go/go.md +++ b/references/go/go.md @@ -7,11 +7,13 @@ The Temporal Go SDK (`go.temporal.io/sdk`) provides a strongly-typed, idiomatic ## Quick Start **Add Dependency:** In your Go module, add the Temporal SDK: + ```bash go get go.temporal.io/sdk ``` **workflows/greeting.go** - Workflow definition: + ```go package workflows @@ -37,6 +39,7 @@ func GreetingWorkflow(ctx workflow.Context, name string) (string, error) { ``` **activities/greet.go** - Activity definition: + ```go package activities @@ -53,6 +56,7 @@ func (a *Activities) Greet(ctx context.Context, name string) (string, error) { ``` **worker/main.go** - Worker setup: + ```go package main @@ -90,6 +94,7 @@ func main() { **Start the worker:** Run `go run worker/main.go` in the background. 
**starter/main.go** - Start a workflow execution: + ```go package main @@ -136,6 +141,7 @@ func main() { ## Key Concepts ### Workflow Definition + - Exported function with `workflow.Context` as the first parameter - Returns `(ResultType, error)` or just `error` - Signature: `func MyWorkflow(ctx workflow.Context, input MyInput) (MyOutput, error)` @@ -143,12 +149,14 @@ func main() { - Register with `w.RegisterWorkflow(MyWorkflow)` ### Activity Definition + - Regular function or struct methods with `context.Context` as the first parameter - Struct methods are preferred for dependency injection - Signature: `func (a *Activities) MyActivity(ctx context.Context, input string) (string, error)` - Register struct with `w.RegisterActivity(&Activities{})` (registers all exported methods) ### Worker Setup + - Create client with `client.Dial(client.Options{})` - Create worker with `worker.New(c, "task-queue", worker.Options{})` - Register workflows and activities @@ -159,6 +167,7 @@ func main() { **Workflow code must be deterministic!** The Go SDK has no sandbox -- determinism is enforced by convention and tooling. Use Temporal replacements instead of native Go constructs: + - `workflow.Go()` instead of `go` (goroutines) - `workflow.Channel` instead of `chan` - `workflow.Selector` instead of `select` @@ -167,6 +176,7 @@ Use Temporal replacements instead of native Go constructs: - `workflow.GetLogger()` instead of `log` / `fmt.Println` for replay-safe logging Use the **`workflowcheck`** static analysis tool to catch non-deterministic code: + ```bash go install go.temporal.io/sdk/contrib/tools/workflowcheck@latest workflowcheck ./... @@ -191,6 +201,7 @@ myapp/ ``` **Activities as struct methods for dependency injection:** + ```go // activities/greet.go type Activities struct { @@ -230,6 +241,7 @@ See `references/go/testing.md` for info on writing tests. 
## Additional Resources ### Reference Files + - **`references/go/patterns.md`** - Signals, queries, child workflows, saga pattern, etc. - **`references/go/determinism.md`** - Determinism rules, workflowcheck tool, safe alternatives - **`references/go/gotchas.md`** - Go-specific mistakes and anti-patterns diff --git a/references/go/gotchas.md b/references/go/gotchas.md index 4b7ddf3..6ba46ff 100644 --- a/references/go/gotchas.md +++ b/references/go/gotchas.md @@ -206,6 +206,7 @@ func GoodWorkflow(ctx workflow.Context) error { ### Not Handling Activity Cancellation Activities must **opt in** to receive cancellation. This requires: + 1. **Heartbeating** - Cancellation is delivered via heartbeat 2. **Checking ctx.Done()** - Detect when cancellation arrives diff --git a/references/go/observability.md b/references/go/observability.md index ba55140..23ad62f 100644 --- a/references/go/observability.md +++ b/references/go/observability.md @@ -28,6 +28,7 @@ func MyWorkflow(ctx workflow.Context, input string) (string, error) { ``` The workflow logger automatically: + - Suppresses duplicate logs during replay - Includes workflow context (workflow ID, run ID, etc.) 
@@ -45,6 +46,7 @@ func MyActivity(ctx context.Context, input string) (string, error) { ``` Activity logger includes: + - Activity ID, type, and task queue - Workflow ID and run ID - Attempt number (for retries) @@ -134,6 +136,7 @@ c, err := client.Dial(client.Options{ ``` Key SDK metrics: + - `temporal_workflow_task_execution_latency` -- Workflow task processing time - `temporal_activity_execution_latency` -- Activity execution time - `temporal_workflow_task_replay_latency` -- Replay duration diff --git a/references/go/patterns.md b/references/go/patterns.md index 732083f..298cca4 100644 --- a/references/go/patterns.md +++ b/references/go/patterns.md @@ -284,6 +284,7 @@ func ApprovalWorkflow(ctx workflow.Context) (string, error) { ``` Key points: + - `AddReceive(channel, callback)` -- fires when a channel has a message (must consume with `c.Receive`) - `AddFuture(future, callback)` -- fires when a future resolves (once per Selector) - `AddDefault(callback)` -- fires immediately if nothing else is ready @@ -457,10 +458,12 @@ func MyWorkflow(ctx workflow.Context) (string, error) { ## Activity Heartbeat Details ### WHY: + - **Support activity cancellation** -- Cancellations are delivered via heartbeat; activities that don't heartbeat won't know they've been cancelled - **Resume progress after worker failure** -- Heartbeat details persist across retries ### WHEN: + - **Cancellable activities** -- Any activity that should respond to cancellation - **Long-running activities** -- Track progress for resumability - **Checkpointing** -- Save progress periodically diff --git a/references/go/versioning.md b/references/go/versioning.md index b6b6c27..c8f7280 100644 --- a/references/go/versioning.md +++ b/references/go/versioning.md @@ -45,6 +45,7 @@ err = workflow.ExecuteActivity(ctx, ActivityC, data).Get(ctx, &result1) ``` Keep the `GetVersion` call even with a single branch. This ensures: + 1. 
If an older execution replays on this code, it fails fast instead of proceeding incorrectly 2. If you need further changes, you just bump `maxSupported` @@ -139,6 +140,7 @@ w := worker.New(c, "my-task-queue", worker.Options{ ``` **Configuration fields:** + - `UseVersioning`: enables Worker Versioning - `Version`: identifies the Worker Deployment Version (deployment name + build ID) - `DefaultVersioningBehavior`: `VersioningBehaviorPinned` or `VersioningBehaviorAutoUpgrade` @@ -151,6 +153,7 @@ w := worker.New(c, "my-task-queue", worker.Options{ Workflows stay locked to their original Worker version. **When to use PINNED:** + - Short-running workflows (minutes to hours) - Consistency is critical (e.g., financial transactions) - You want to eliminate version compatibility complexity @@ -161,6 +164,7 @@ Workflows stay locked to their original Worker version. Workflows can move to newer versions. **When to use AUTO_UPGRADE:** + - Long-running workflows (weeks or months) - Workflows need to benefit from bug fixes during execution - Migrating from traditional rolling deployments @@ -189,6 +193,7 @@ w := worker.New(c, "orders-task-queue", worker.Options{ **Blue-Green Deployments** Maintain two environments and switch traffic between them: + 1. Deploy new code to idle environment 2. Run tests and validation 3. Switch traffic to new environment @@ -197,6 +202,7 @@ Maintain two environments and switch traffic between them: **Rainbow Deployments** Multiple versions run simultaneously: + - New workflows use latest version - Existing workflows complete on their original version - Add new versions alongside existing ones diff --git a/references/java/gotchas.md b/references/java/gotchas.md index 567fb64..4943f0d 100644 --- a/references/java/gotchas.md +++ b/references/java/gotchas.md @@ -7,6 +7,7 @@ Java-specific mistakes and anti-patterns. 
See also [Common Gotchas](../core/gotc **Critical: The Java SDK has NO sandbox.** Unlike Python (which uses a sandbox) or TypeScript (which uses V8 isolation), the Java SDK relies entirely on developer conventions. Non-deterministic calls silently succeed during initial execution but cause `NonDeterministicException` on replay. Forbidden in workflow code — use the Temporal `Workflow.*` equivalents instead: + - `Thread.sleep` → `Workflow.sleep` - `UUID.randomUUID` → `Workflow.randomUUID` - `Math.random` → `Workflow.newRandom` @@ -103,6 +104,7 @@ public class GoodWorkflow implements MyWorkflow { ### Not Handling Activity Cancellation Activities must **opt in** to receive cancellation. This requires: + 1. **Heartbeating** - Cancellation is delivered via heartbeat 2. **Catching CanceledFailure** - Thrown when heartbeat detects cancellation diff --git a/references/java/java.md b/references/java/java.md index e18d723..b260424 100644 --- a/references/java/java.md +++ b/references/java/java.md @@ -9,11 +9,13 @@ The Temporal Java SDK (`io.temporal:temporal-sdk`) uses an interface + implement **Add Dependencies:** Gradle: + ```groovy implementation 'io.temporal:temporal-sdk:1.+' ``` Maven: + ```xml io.temporal @@ -23,6 +25,7 @@ Maven: ``` **GreetActivities.java** - Activity interface: + ```java package greetingapp; @@ -38,6 +41,7 @@ public interface GreetActivities { ``` **GreetActivitiesImpl.java** - Activity implementation: + ```java package greetingapp; @@ -51,6 +55,7 @@ public class GreetActivitiesImpl implements GreetActivities { ``` **GreetingWorkflow.java** - Workflow interface: + ```java package greetingapp; @@ -66,6 +71,7 @@ public interface GreetingWorkflow { ``` **GreetingWorkflowImpl.java** - Workflow implementation: + ```java package greetingapp; @@ -91,6 +97,7 @@ public class GreetingWorkflowImpl implements GreetingWorkflow { ``` **GreetingWorker.java** - Worker setup: + ```java package greetingapp; @@ -127,6 +134,7 @@ public class GreetingWorker { **Start the 
worker:** Run `GreetingWorker.main()` (e.g., `./gradlew run` or `mvn compile exec:java -Dexec.mainClass="greetingapp.GreetingWorker"`). **Starter.java** - Start a workflow execution: + ```java package greetingapp; @@ -161,6 +169,7 @@ public class Starter { ## Key Concepts ### Workflow Definition + - Annotate interface with `@WorkflowInterface` - Put any state initialization logic in the workflow constructor to guarantee that it happens before signals/updates arrive. If your state initialization logic requires the workflow parameters, then add the `@WorkflowInit` decorator and parameters to your constructor. - Annotate entry point method with `@WorkflowMethod` (exactly one per interface) @@ -170,12 +179,14 @@ public class Starter { - Implementation class implements the interface ### Activity Definition + - Annotate interface with `@ActivityInterface` - Optionally annotate methods with `@ActivityMethod` (for custom names) - Implementation class can throw any exception - Call from workflow via `Workflow.newActivityStub()` ### Worker Setup + - `WorkflowServiceStubs` -- gRPC connection to Temporal Server - `WorkflowClient` -- client used by worker to communicate with server - `WorkerFactory` -- creates Worker instances @@ -201,6 +212,7 @@ greetingapp/ The Java SDK has **no sandbox**. The developer is fully responsible for writing deterministic workflow code. All non-deterministic operations must happen in Activities. **Do not use in workflow code:** + - `Thread` / `new Thread()` -- use `Workflow.newTimer()` or `Async.function()` - `synchronized` / `Lock` -- workflow code is single-threaded - `UUID.randomUUID()` -- use `Workflow.randomUUID()` @@ -210,7 +222,8 @@ The Java SDK has **no sandbox**. 
The developer is fully responsible for writing - `Thread.sleep()` -- use `Workflow.sleep()` - Mutable static fields -- workflow instances must not share state -**Use Workflow.* APIs instead:** +**Use `Workflow.*` APIs instead:** + - `Workflow.sleep()` for timers - `Workflow.currentTimeMillis()` for current time - `Workflow.randomUUID()` for UUIDs @@ -238,6 +251,7 @@ See `references/java/testing.md` for info on writing tests. ## Additional Resources ### Reference Files + - **`references/java/patterns.md`** - Signals, queries, child workflows, saga pattern, etc. - **`references/java/determinism.md`** - Determinism rules and safe alternatives for Java - **`references/java/gotchas.md`** - Java-specific mistakes and anti-patterns diff --git a/references/java/observability.md b/references/java/observability.md index d7d9528..338fcb7 100644 --- a/references/java/observability.md +++ b/references/java/observability.md @@ -31,6 +31,7 @@ public class OrderWorkflowImpl implements OrderWorkflow { ``` The workflow logger automatically: + - Suppresses duplicate logs during replay - Includes workflow context (workflow ID, run ID, etc.) 
- Uses SLF4J under the hood diff --git a/references/java/patterns.md b/references/java/patterns.md index ed2fb37..e6428a9 100644 --- a/references/java/patterns.md +++ b/references/java/patterns.md @@ -423,10 +423,12 @@ public class MyWorkflowImpl implements MyWorkflow { ## Activity Heartbeat Details ### WHY: + - **Support activity cancellation** — Cancellations are delivered via heartbeat; activities that don't heartbeat won't know they've been cancelled - **Resume progress after worker failure** — Heartbeat details persist across retries ### WHEN: + - **Cancellable activities** — Any activity that should respond to cancellation - **Long-running activities** — Track progress for resumability - **Checkpointing** — Save progress periodically diff --git a/references/java/versioning.md b/references/java/versioning.md index d1a9205..0e520f2 100644 --- a/references/java/versioning.md +++ b/references/java/versioning.md @@ -38,6 +38,7 @@ public class ShippingWorkflowImpl implements ShippingWorkflow { ``` **How it works:** + - For new executions: returns `maxSupported` and records a marker in history - For replay with the marker: returns the recorded version - For replay without the marker: returns `DEFAULT_VERSION` (-1) diff --git a/references/python/advanced-features.md b/references/python/advanced-features.md index e0d3297..3584a64 100644 --- a/references/python/advanced-features.md +++ b/references/python/advanced-features.md @@ -85,6 +85,7 @@ The Python SDK runs workflows in a sandbox to help you ensure determinism. You c **The Python SDK is NOT compatible with gevent.** Gevent's monkey patching modifies Python's asyncio event loop in ways that break the SDK's deterministic execution model. 
If your application uses gevent: + - You cannot run Temporal workers in the same process - Consider running workers in a separate process without gevent - Use a message queue or HTTP API to communicate between gevent and Temporal processes @@ -163,4 +164,3 @@ worker = Worker( workflow_failure_exception_types=[ValueError, CustomBusinessError], ) ``` - diff --git a/references/python/data-handling.md b/references/python/data-handling.md index 662101e..65f4a99 100644 --- a/references/python/data-handling.md +++ b/references/python/data-handling.md @@ -7,6 +7,7 @@ The Python SDK uses data converters to serialize/deserialize workflow inputs, ou ## Default Data Converter The default converter handles: + - `None` - `bytes` (as binary) - Protobuf messages @@ -59,6 +60,7 @@ client = await Client.connect( ## Custom Data Conversion Usually the easiest way to do this is via implementing an EncodingPayloadConverter and CompositePayloadConverter. See: + - https://raw.githubusercontent.com/temporalio/samples-python/refs/heads/main/custom_converter/shared.py - https://raw.githubusercontent.com/temporalio/samples-python/refs/heads/main/custom_converter/starter.py diff --git a/references/python/determinism-protection.md b/references/python/determinism-protection.md index 1376ced..3ff9543 100644 --- a/references/python/determinism-protection.md +++ b/references/python/determinism-protection.md @@ -7,6 +7,7 @@ The Python SDK runs workflows in a sandbox that provides automatic protection ag ## How the Sandbox Works The sandbox: + - Isolates global state via `exec` compilation - Restricts non-deterministic library calls via proxy objects - Passes through standard library with restrictions @@ -35,6 +36,7 @@ with workflow.unsafe.imports_passed_through(): ``` **When to use pass-through:** + - Data classes and models (Pydantic, dataclasses) - Serialization libraries - Type definitions diff --git a/references/python/determinism.md b/references/python/determinism.md index e925f7c..e1b53a7 
100644 --- a/references/python/determinism.md +++ b/references/python/determinism.md @@ -34,6 +34,7 @@ Use the `Replayer` class to verify your code changes are compatible with existin ## Sandbox Behavior The sandbox: + - Isolates global state via `exec` compilation - Restricts non-deterministic library calls via proxy objects - Passes through standard library with restrictions diff --git a/references/python/gotchas.md b/references/python/gotchas.md index 95ebe8a..a32b045 100644 --- a/references/python/gotchas.md +++ b/references/python/gotchas.md @@ -211,10 +211,12 @@ class GoodWorkflow: ### Not Handling Activity Cancellation Activities must **opt in** to receive cancellation. This requires: + 1. **Heartbeating** - Cancellation is delivered via heartbeat 2. **Catching the cancellation exception** - Exception is raised when heartbeat detects cancellation **Cancellation exceptions:** + - Async activities: `asyncio.CancelledError` - Sync threaded activities: `temporalio.exceptions.CancelledError` diff --git a/references/python/observability.md b/references/python/observability.md index 26296c3..0130d89 100644 --- a/references/python/observability.md +++ b/references/python/observability.md @@ -27,6 +27,7 @@ class MyWorkflow: ``` The workflow logger automatically: + - Suppresses duplicate logs during replay - Includes workflow context (workflow ID, run ID, etc.) 
@@ -46,6 +47,7 @@ async def process_order(order_id: str) -> str: ``` Activity logger includes: + - Activity ID, type, and task queue - Workflow ID and run ID - Attempt number (for retries) @@ -92,7 +94,6 @@ Runtime.set_default(runtime, error_if_already_set=True) - `temporal_activity_execution_latency` - Activity execution time - `temporal_workflow_task_replay_latency` - Replay duration - ## Search Attributes (Visibility) See the Search Attributes section of `references/python/data-handling.md` diff --git a/references/python/patterns.md b/references/python/patterns.md index 6843985..ae70757 100644 --- a/references/python/patterns.md +++ b/references/python/patterns.md @@ -321,14 +321,17 @@ class MyWorkflow: ## Activity Heartbeat Details ### WHY: + - **Support activity cancellation** - Cancellations are delivered via heartbeat; activities that don't heartbeat won't know they've been cancelled - **Resume progress after worker failure** - Heartbeat details persist across retries **Cancellation exceptions:** + - Async activities: `asyncio.CancelledError` - Sync threaded activities: `temporalio.exceptions.CancelledError` ### WHEN: + - **Cancellable activities** - Any activity that should respond to cancellation - **Long-running activities** - Track progress for resumability - **Checkpointing** - Save progress periodically diff --git a/references/python/python.md b/references/python/python.md index 2c56843..bc0a0f3 100644 --- a/references/python/python.md +++ b/references/python/python.md @@ -9,6 +9,7 @@ The Temporal Python SDK (`temporalio`) provides a fully async, type-safe approac **Add Dependency on Temporal:** In the package management system of the Python project you are working on, add a dependency on `temporalio`. 
**activities/greet.py** - Activity definitions (separate file for performance): + ```python from temporalio import activity @@ -18,6 +19,7 @@ def greet(name: str) -> str: ``` **workflows/greeting.py** - Workflow definition (import activities through sandbox): + ```python from datetime import timedelta from temporalio import workflow @@ -35,6 +37,7 @@ class GreetingWorkflow: ``` **worker.py** - Worker setup (imports activity and workflow, runs indefinitely and processes tasks): + ```python import asyncio import concurrent.futures @@ -70,6 +73,7 @@ if __name__ == "__main__": **Start the worker:** Start `python worker.py` in the background (appropriately adjust command for your project, like `uv run python worker.py`) **starter.py** - Start a workflow execution: + ```python import asyncio from temporalio.client import Client @@ -93,10 +97,10 @@ if __name__ == "__main__": **Run the workflow:** Run `python starter.py` (or uv run, etc.). Should output: `Result: Hello, my-name!`. - ## Key Concepts ### Workflow Definition + - Use `@workflow.defn` decorator on class - Put any state initialization logic in the `__init__` of your workflow class to guarantee that it happens before signals/updates arrive. If your state initialization logic requires the workflow parameters, then add the `@workflow.init` decorator and parameters to your `__init__`. - Use `@workflow.run` on the entry point method @@ -104,6 +108,7 @@ if __name__ == "__main__": - Use `@workflow.signal`, `@workflow.query`, `@workflow.update` for handlers ### Activity Definition + - Use `@activity.defn` decorator - Can be sync or async functions - **Default to sync activities** - safer and easier to debug @@ -113,6 +118,7 @@ if __name__ == "__main__": See `sync-vs-async.md` for detailed guidance on choosing between sync and async. 
### Worker Setup + - Connect client, create Worker with workflows and activities - Run the worker - Activities can specify custom executor @@ -136,6 +142,7 @@ my_temporal_app/ ``` **In the Workflow file, import Activities through the sandbox:** + ```python # workflows/greeting.py from temporalio import workflow @@ -162,6 +169,7 @@ See `references/python/testing.md` for info on writing tests. ## Additional Resources ### Reference Files + - **`references/python/patterns.md`** - Signals, queries, child workflows, saga pattern, etc. - **`references/python/determinism.md`** - Sandbox behavior, safe alternatives, pass-through pattern, history replay - **`references/python/gotchas.md`** - Python-specific mistakes and anti-patterns diff --git a/references/python/sync-vs-async.md b/references/python/sync-vs-async.md index 7875582..247b0e5 100644 --- a/references/python/sync-vs-async.md +++ b/references/python/sync-vs-async.md @@ -19,6 +19,7 @@ Activities should be synchronous by default. Use async only when certain the cod The Python async event loop runs in a single thread. When any task runs, no other tasks can execute until an `await` is reached. If code makes a blocking call (file I/O, synchronous HTTP, etc.), the entire event loop freezes. 
**Consequences of blocking the event loop:** + - Worker cannot communicate with Temporal Server - Workflow progress blocks across the worker - Potential deadlocks and unpredictable behavior @@ -73,6 +74,7 @@ async def my_async_activity(name: str) -> str: | `httpx` | Both | Yes (use async mode) | **Example: Wrong way (blocks event loop)** + ```python @activity.defn async def bad_activity(url: str) -> str: @@ -82,6 +84,7 @@ async def bad_activity(url: str) -> str: ``` **Example: Correct way (async-safe)** + ```python @activity.defn async def good_activity(url: str) -> str: @@ -150,6 +153,7 @@ For CPU-bound work and multi-core usage: ### Separate Workers for Workflows vs Activities Some teams deploy: + - Workflow-only workers (CPU-bound, need deadlock detection) - Activity-only workers (I/O-bound, may need more parallelism) diff --git a/references/python/testing.md b/references/python/testing.md index e4a7823..71a47b1 100644 --- a/references/python/testing.md +++ b/references/python/testing.md @@ -140,7 +140,6 @@ async def test_replay(): ) ``` - ## Activity Testing ```python diff --git a/references/python/versioning.md b/references/python/versioning.md index abd4445..1daab78 100644 --- a/references/python/versioning.md +++ b/references/python/versioning.md @@ -30,6 +30,7 @@ class ShippingWorkflow: ``` **How it works:** + - For new executions: `patched()` returns `True` and records a marker in the Workflow history - For replay with the marker: `patched()` returns `True` (history includes this patch) - For replay without the marker: `patched()` returns `False` (history predates this patch) @@ -213,6 +214,7 @@ worker = Worker( ``` **Configuration parameters:** + - `use_worker_versioning`: Enables Worker Versioning - `version`: Identifies the Worker Deployment Version (deployment name + build ID) - Build ID: Typically a git commit hash, version number, or timestamp @@ -238,6 +240,7 @@ class StableWorkflow: ``` **When to use PINNED:** + - Short-running workflows (minutes 
to hours) - Consistency is critical (e.g., financial transactions) - You want to eliminate version compatibility complexity @@ -248,6 +251,7 @@ class StableWorkflow: Workflows can move to newer versions: **When to use AUTO_UPGRADE:** + - Long-running workflows (weeks or months) - Workflows need to benefit from bug fixes during execution - Migrating from traditional rolling deployments @@ -280,6 +284,7 @@ worker = Worker( **Blue-Green Deployments** Maintain two environments and switch traffic between them: + 1. Deploy new code to idle environment 2. Run tests and validation 3. Switch traffic to new environment @@ -288,6 +293,7 @@ Maintain two environments and switch traffic between them: **Rainbow Deployments** Multiple versions run simultaneously: + - New workflows use latest version - Existing workflows complete on their original version - Add new versions alongside existing ones diff --git a/references/typescript/advanced-features.md b/references/typescript/advanced-features.md index 17b7e61..ed9817d 100644 --- a/references/typescript/advanced-features.md +++ b/references/typescript/advanced-features.md @@ -39,6 +39,7 @@ await handle.delete(); Complete an activity asynchronously from outside the activity function. Useful when the activity needs to wait for an external event. 
**In the activity - return the task token:** + ```typescript import { CompleteAsyncError, activityInfo } from '@temporalio/activity'; @@ -50,6 +51,7 @@ export async function doSomethingAsync(): Promise { ``` **External completion (from another process, machine, etc.):** + ```typescript import { Client } from '@temporalio/client'; @@ -61,6 +63,7 @@ async function doSomeWork(taskToken: Uint8Array): Promise { ``` **When to use:** + - Waiting for human approval - Waiting for external webhook callback - Long-polling external systems @@ -93,6 +96,7 @@ const worker = await Worker.create({ ``` **Key settings:** + - `maxConcurrentWorkflowTaskExecutions`: Max workflows running simultaneously (default: 40) - `maxConcurrentActivityTaskExecutions`: Max activities running simultaneously (default: 100) - `shutdownGraceTime`: Time to wait for in-progress work before forced shutdown diff --git a/references/typescript/data-handling.md b/references/typescript/data-handling.md index bfd4925..c8be6f8 100644 --- a/references/typescript/data-handling.md +++ b/references/typescript/data-handling.md @@ -7,6 +7,7 @@ The TypeScript SDK uses data converters to serialize/deserialize workflow inputs ## Default Data Converter The default converter handles: + - `undefined` and `null` - `Uint8Array` (as binary) - JSON-serializable types diff --git a/references/typescript/determinism-protection.md b/references/typescript/determinism-protection.md index 54303ba..81c513a 100644 --- a/references/typescript/determinism-protection.md +++ b/references/typescript/determinism-protection.md @@ -29,7 +29,6 @@ const worker = await Worker.create({ Use this with *extreme caution*. - ## Function Replacement Functions like `Math.random()`, `Date`, and `setTimeout()` are replaced by deterministic versions. 
diff --git a/references/typescript/gotchas.md b/references/typescript/gotchas.md index d234f74..61763b3 100644 --- a/references/typescript/gotchas.md +++ b/references/typescript/gotchas.md @@ -145,6 +145,7 @@ export async function workflowWithCleanup(): Promise { ### Not Handling Activity Cancellation Activities must **opt in** to receive cancellation. This requires: + 1. **Heartbeating** - Cancellation is delivered via heartbeat 2. **Checking for cancellation** - Either await `Context.current().cancelled` or use `cancellationSignal()` diff --git a/references/typescript/patterns.md b/references/typescript/patterns.md index 3d59e23..6dc2b32 100644 --- a/references/typescript/patterns.md +++ b/references/typescript/patterns.md @@ -289,6 +289,7 @@ export async function scopedWorkflow(): Promise { **WHY**: Triggers provide a one-shot promise that resolves when a signal is received. Cleaner than condition() for single-value signals. **WHEN to use**: + - Waiting for a single response (approval, completion notification) - Converting signal-based events into awaitable promises @@ -351,10 +352,12 @@ export async function handlerAwareWorkflow(): Promise { ## Activity Heartbeat Details ### WHY: + - **Support activity cancellation** - Cancellations are delivered via heartbeat; activities that don't heartbeat won't know they've been cancelled - **Resume progress after worker failure** - Heartbeat details persist across retries ### WHEN: + - **Cancellable activities** - Any activity that should respond to cancellation - **Long-running activities** - Track progress for resumability - **Checkpointing** - Save progress periodically diff --git a/references/typescript/typescript.md b/references/typescript/typescript.md index 9918ee7..9e125cb 100644 --- a/references/typescript/typescript.md +++ b/references/typescript/typescript.md @@ -13,13 +13,15 @@ Temporal workflows are durable through history replay. 
For details on how this w ## Quick Start **Add Dependencies:** Install the Temporal SDK packages (use the package manager appropriate for your project): + ```bash npm install @temporalio/client @temporalio/worker @temporalio/workflow @temporalio/activity ``` -Note: if you are working in production, it is strongly advised to use ~ version constraints, i.e. `npm install ... --save-prefix='~'` if using NPM. +Note: if you are working in production, it is strongly advised to use ~ version constraints, i.e. `npm install ... --save-prefix='~'` if using NPM. **activities.ts** - Activity definitions (separate file to distinguish workflow vs activity code): + ```typescript export async function greet(name: string): Promise { return `Hello, ${name}!`; @@ -27,6 +29,7 @@ export async function greet(name: string): Promise { ``` **workflows.ts** - Workflow definition (use type-only imports for activities): + ```typescript import { proxyActivities } from '@temporalio/workflow'; import type * as activities from './activities'; @@ -41,6 +44,7 @@ export async function greetingWorkflow(name: string): Promise { ``` **worker.ts** - Worker setup (imports activities and workflows, runs indefinitely): + ```typescript import { Worker } from '@temporalio/worker'; import * as activities from './activities'; @@ -62,6 +66,7 @@ run().catch(console.error); **Start the worker:** Run `npx ts-node worker.ts` in the background. **client.ts** - Start a workflow execution: + ```typescript import { Client } from '@temporalio/client'; import { greetingWorkflow } from './workflows'; @@ -87,16 +92,19 @@ run().catch(console.error); ## Key Concepts ### Workflow Definition + - Async functions exported from workflow file - Use `proxyActivities()` with type-only imports - Use `defineSignal()`, `defineQuery()`, `defineUpdate()`, `setHandler()` for handlers ### Activity Definition + - Regular async functions - Can perform I/O, network calls, etc. 
- Use `heartbeat()` for long operations ### Worker Setup + - Use `Worker.create()` with `workflowsPath` (dev) or `workflowBundle` (production) - see `references/typescript/gotchas.md` - Import activities directly (not via proxy) @@ -115,6 +123,7 @@ my_temporal_app/ ``` **In the Workflow file, use type-only imports for activities:** + ```typescript // workflows/greeting.ts import { proxyActivities } from '@temporalio/workflow'; @@ -130,11 +139,13 @@ const { translate } = proxyActivities({ The TypeScript SDK runs workflows in an isolated V8 sandbox. **Automatic replacements:** + - `Math.random()` → deterministic seeded PRNG - `Date.now()` → workflow start time - `setTimeout` → deterministic timer **Safe to use:** + - `sleep()` from `@temporalio/workflow` - `condition()` for waiting - Standard JavaScript operations @@ -160,6 +171,7 @@ See `references/typescript/testing.md` for info on writing tests. ## Additional Resources ### Reference Files + - **`references/typescript/patterns.md`** - Signals, queries, child workflows, saga pattern, etc. 
- **`references/typescript/determinism.md`** - Essentials of determinism in TypeScript - **`references/typescript/gotchas.md`** - TypeScript-specific mistakes and anti-patterns diff --git a/references/typescript/versioning.md b/references/typescript/versioning.md index a9f57a2..b4b8e19 100644 --- a/references/typescript/versioning.md +++ b/references/typescript/versioning.md @@ -25,6 +25,7 @@ export async function myWorkflow(): Promise { ``` **How it works:** + - If the Workflow is running for the first time, `patched()` returns `true` and inserts a marker into the Event History - During replay, if the history contains a marker with the same `patchId`, `patched()` returns `true` - During replay, if no matching marker exists, `patched()` returns `false` @@ -175,6 +176,7 @@ const worker = await Worker.create({ ``` **Configuration options:** + - `useWorkerVersioning`: Enables Worker Versioning - `version.deploymentName`: Logical name for your service (consistent across versions) - `version.buildId`: Unique identifier for this build @@ -195,6 +197,7 @@ const worker = await Worker.create({ ### When to Use Worker Versioning Worker Versioning is best suited for: + - **Short-running Workflows**: Old Workers only need to run briefly during deployment transitions - **Frequent deployments**: Eliminates the need for code-level patching on every change - **Blue-green deployments**: Run old and new versions simultaneously with traffic control From 97155470117f08fa921836fe861258dc6437d82f Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Fri, 17 Apr 2026 10:12:07 -0400 Subject: [PATCH 21/42] Upstreaming https://github.com/temporalio/codex-temporal-plugin/pull/6/changes (#80) --- SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SKILL.md b/SKILL.md index 0ed18f0..5d2a338 100644 --- a/SKILL.md +++ b/SKILL.md @@ -1,6 +1,6 @@ --- name: temporal-developer -description: This skill should be used when the user asks to "create a Temporal workflow", "write a 
Temporal activity", "debug stuck workflow", "fix non-determinism error", "Temporal Python", "Temporal TypeScript", "Temporal Go", "Temporal Golang", "Temporal Java", "workflow replay", "activity timeout", "signal workflow", "query workflow", "worker not starting", "activity keeps retrying", "Temporal heartbeat", "continue-as-new", "child workflow", "saga pattern", "workflow versioning", "durable execution", "reliable distributed systems", or mentions Temporal SDK development. +description: Develop, debug, and manage Temporal applications across Python, TypeScript, Go, and Java. Use when the user is building workflows, activities, or workers with a Temporal SDK, debugging issues like non-determinism errors, stuck workflows, or activity retries, using Temporal CLI, Temporal Server, or Temporal Cloud, or working with durable execution concepts like signals, queries, heartbeats, versioning, continue-as-new, child workflows, or saga patterns. version: 0.2.0 --- From f4926d58f6285ec2ce3c26632b84f34f26450f43 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Fri, 17 Apr 2026 10:12:21 -0400 Subject: [PATCH 22/42] Add skill to plugin syncing workflow (#78) * Add skill to plugin syncing workflow * Fix Semgrep report --- .github/workflows/sync-skill-to-plugins.yml | 106 ++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/workflows/sync-skill-to-plugins.yml diff --git a/.github/workflows/sync-skill-to-plugins.yml b/.github/workflows/sync-skill-to-plugins.yml new file mode 100644 index 0000000..d5ca8a7 --- /dev/null +++ b/.github/workflows/sync-skill-to-plugins.yml @@ -0,0 +1,106 @@ +# ABOUTME: GitHub Actions workflow that syncs skill contents to the cursor and codex plugin repos. +# ABOUTME: Triggers when a new release is created (by the package-skill workflow) or manually. +# ABOUTME: Creates or updates a PR in each target repo rather than pushing directly to main. +# ABOUTME: Uses a GitHub App for cross-repo authentication. 
Required secrets: +# ABOUTME: SKILL_T_DEV_APP_ID — the GitHub App's ID +# ABOUTME: SKILL_T_DEV_KEY — the GitHub App's private key +# ABOUTME: The app must be installed on all three repos with Contents (write) and +# ABOUTME: Pull Requests (write) permissions. + +name: Sync Skill to Plugin Repos + +on: + release: + types: [published] + workflow_dispatch: + +permissions: + contents: read + +jobs: + sync: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - repo: temporalio/cursor-temporal-plugin + target_path: skills/temporal-developer + - repo: temporalio/codex-temporal-plugin + target_path: plugins/temporal-developer/skills/temporal-developer + - repo: temporalio/claude-temporal-plugin + target_path: skills/temporal-developer + + steps: + - name: Generate token from GitHub App + id: app-token + uses: actions/create-github-app-token@v2 + with: + app-id: ${{ secrets.SKILL_T_DEV_APP_ID }} + private-key: ${{ secrets.SKILL_T_DEV_KEY }} + owner: ${{ github.repository_owner }} + + - name: Checkout source + uses: actions/checkout@v4 + + - name: Checkout target repo + uses: actions/checkout@v4 + with: + repository: ${{ matrix.repo }} + token: ${{ steps.app-token.outputs.token }} + path: target-repo + + - name: Sync skill contents + working-directory: target-repo + run: | + BRANCH="sync/temporal-developer-skill" + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + # Create or reset the sync branch based on current main. + # -B ensures the branch always starts from main's tip, even if a + # stale remote branch exists from a previously merged PR. 
+ git checkout -B "$BRANCH" origin/main + + # Remove old contents and copy current + rm -rf "${{ matrix.target_path }}/SKILL.md" \ + "${{ matrix.target_path }}/references" + cp ../SKILL.md "${{ matrix.target_path }}/" + cp -r ../references "${{ matrix.target_path }}/" + + # Check for changes against main + git add "${{ matrix.target_path }}" + if git diff --cached --quiet; then + echo "no_changes=true" >> "$GITHUB_ENV" + echo "No changes to sync" + else + echo "no_changes=false" >> "$GITHUB_ENV" + version="${{ github.event.release.tag_name || 'manual' }}" + git commit -m "sync temporal-developer skill ${version} from source repo" + git push --force origin "$BRANCH" + fi + + - name: Create or update PR + if: env.no_changes == 'false' + working-directory: target-repo + env: + GH_TOKEN: ${{ steps.app-token.outputs.token }} + run: | + BRANCH="sync/temporal-developer-skill" + version="${{ github.event.release.tag_name || 'manual' }}" + + # Check if a PR already exists from this branch + existing_pr=$(gh pr list --head "$BRANCH" --state open --json number --jq '.[0].number') + + if [ -n "$existing_pr" ]; then + echo "PR #${existing_pr} already exists — updated by the force-push" + gh pr comment "$existing_pr" --body "Updated to ${version} from [skill-temporal-developer](https://github.com/${{ github.repository }})." + else + body="Automated sync of the temporal-developer skill ${version} from [skill-temporal-developer](https://github.com/${{ github.repository }}). + + This PR was created automatically by the [sync workflow](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})." 
+ + gh pr create \ + --title "Sync temporal-developer skill ${version}" \ + --body "$body" + fi From c92f5ecae73527f10e0989381be1dd45bf394b59 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Fri, 17 Apr 2026 10:25:14 -0400 Subject: [PATCH 23/42] Update to latest versions of actions, to address deprecation warnings (#82) --- .github/workflows/package-skill.yml | 6 +++--- .github/workflows/sync-skill-to-plugins.yml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/package-skill.yml b/.github/workflows/package-skill.yml index 69c48f5..637deb2 100644 --- a/.github/workflows/package-skill.yml +++ b/.github/workflows/package-skill.yml @@ -16,7 +16,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: fetch-depth: 0 @@ -44,14 +44,14 @@ jobs: -x '*.DS_Store' - name: Upload artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: temporal-developer-skill path: temporal-developer-skill.zip - name: Create release if: steps.tag_check.outputs.exists == 'false' - uses: softprops/action-gh-release@v2 + uses: softprops/action-gh-release@v3 with: tag_name: ${{ steps.version.outputs.tag }} name: ${{ steps.version.outputs.tag }} diff --git a/.github/workflows/sync-skill-to-plugins.yml b/.github/workflows/sync-skill-to-plugins.yml index d5ca8a7..1610430 100644 --- a/.github/workflows/sync-skill-to-plugins.yml +++ b/.github/workflows/sync-skill-to-plugins.yml @@ -34,17 +34,17 @@ jobs: steps: - name: Generate token from GitHub App id: app-token - uses: actions/create-github-app-token@v2 + uses: actions/create-github-app-token@v3 with: app-id: ${{ secrets.SKILL_T_DEV_APP_ID }} private-key: ${{ secrets.SKILL_T_DEV_KEY }} owner: ${{ github.repository_owner }} - name: Checkout source - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Checkout target repo - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: repository: ${{ matrix.repo }} token: ${{ 
steps.app-token.outputs.token }} From 418cbd7861b0fceaea517656675882735d1fea1f Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Fri, 17 Apr 2026 10:30:42 -0400 Subject: [PATCH 24/42] Improved Syncing UX: Changelogs + Step Summaries (#83) * Add changelog to syncing PRs. * Add step summary to syncing job * Also *edit* PR bodies with changelogs --- .github/workflows/sync-skill-to-plugins.yml | 48 ++++++++++++++++++--- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/.github/workflows/sync-skill-to-plugins.yml b/.github/workflows/sync-skill-to-plugins.yml index 1610430..2fb8aac 100644 --- a/.github/workflows/sync-skill-to-plugins.yml +++ b/.github/workflows/sync-skill-to-plugins.yml @@ -42,6 +42,8 @@ jobs: - name: Checkout source uses: actions/checkout@v6 + with: + fetch-depth: 0 - name: Checkout target repo uses: actions/checkout@v6 @@ -80,6 +82,27 @@ jobs: git push --force origin "$BRANCH" fi + - name: Build changelog + id: changelog + run: | + if [ "${{ github.event_name }}" = "release" ]; then + # Use the release body (auto-generated notes from package-skill) + changelog=$(cat <<'RELEASE_BODY' + ${{ github.event.release.body }} + RELEASE_BODY + ) + else + # Manual trigger: generate from git log since the previous tag + prev_tag=$(git describe --tags --abbrev=0 HEAD^ 2>/dev/null || echo "") + if [ -n "$prev_tag" ]; then + changelog=$(git log --oneline "${prev_tag}..HEAD") + else + changelog=$(git log --oneline -20) + fi + fi + # Write to a file to avoid shell quoting issues + echo "$changelog" > /tmp/changelog.md + - name: Create or update PR if: env.no_changes == 'false' working-directory: target-repo @@ -88,19 +111,34 @@ jobs: run: | BRANCH="sync/temporal-developer-skill" version="${{ github.event.release.tag_name || 'manual' }}" + changelog=$(cat /tmp/changelog.md) # Check if a PR already exists from this branch existing_pr=$(gh pr list --head "$BRANCH" --state open --json number --jq '.[0].number') if [ -n "$existing_pr" ]; then echo "PR 
#${existing_pr} already exists — updated by the force-push" + gh pr edit "$existing_pr" \ + --title "Sync temporal-developer skill ${version}" \ + --body "Automated sync of the temporal-developer skill ${version} from [skill-temporal-developer](https://github.com/${{ github.repository }}). + + This PR was updated automatically by the [sync workflow](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}). + + ## Changelog + ${changelog}" gh pr comment "$existing_pr" --body "Updated to ${version} from [skill-temporal-developer](https://github.com/${{ github.repository }})." + pr_url=$(gh pr view "$existing_pr" --json url --jq '.url') + echo "### ${{ matrix.repo }}" >> "$GITHUB_STEP_SUMMARY" + echo "Updated [PR #${existing_pr}](${pr_url})" >> "$GITHUB_STEP_SUMMARY" else - body="Automated sync of the temporal-developer skill ${version} from [skill-temporal-developer](https://github.com/${{ github.repository }}). + pr_url=$(gh pr create \ + --title "Sync temporal-developer skill ${version}" \ + --body "Automated sync of the temporal-developer skill ${version} from [skill-temporal-developer](https://github.com/${{ github.repository }}). - This PR was created automatically by the [sync workflow](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})." + This PR was created automatically by the [sync workflow](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}). 
- gh pr create \ - --title "Sync temporal-developer skill ${version}" \ - --body "$body" + ## Changelog + ${changelog}") + echo "### ${{ matrix.repo }}" >> "$GITHUB_STEP_SUMMARY" + echo "Created ${pr_url}" >> "$GITHUB_STEP_SUMMARY" fi From 44eba4e91c986e53b2f322d462037756c77962b7 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Fri, 17 Apr 2026 14:28:02 -0400 Subject: [PATCH 25/42] Add .NET SDK support to temporal-developer skill (#39) * Add .NET reference files for temporal-developer skill Created 11 .NET reference files covering: dotnet.md (overview/quick start), patterns.md, determinism.md, determinism-protection.md, error-handling.md, testing.md, versioning.md, observability.md, data-handling.md, gotchas.md, and advanced-features.md. Follows Python/TypeScript patterns with .NET-specific content for Task determinism, CancellationToken, dependency injection, etc. Co-Authored-By: Claude Opus 4.6 (1M context) * Fix .NET alignment issues from self-review - dotnet.md: Reduce Determinism Rules section to brief cross-reference (was duplicating determinism.md content) - patterns.md: Add ParentClosePolicy to Child Workflows example - gotchas.md: Add missing "Heartbeat Timeout Too Short" subsection - versioning.md: Add missing Key Concepts, Deployment Strategies, Query Filters, PINNED/AUTO_UPGRADE guidance, CLI examples - advanced-features.md: Add worker-level heading for exception types Co-Authored-By: Claude Opus 4.6 (1M context) * Fix .NET correctness issues from verification pass - patterns.md: Fix cancellation pattern to use official TemporalException.IsCanceledException(e) with detached CancellationTokenSource - advanced-features.md: Fix DI hosting example to use official AddHostedTemporalWorker(clientTargetHost:, clientNamespace:, taskQueue:) pattern Verified against official SDK README, API docs, and temporal-docs. 
Co-Authored-By: Claude Opus 4.6 (1M context) * Update supported language references to include .NET - SKILL.md: Add "Temporal .NET" and "Temporal C#" trigger phrases, update overview to mention .NET, add .NET entry in getting started - core/determinism.md: Add .NET entry in SDK Protection Mechanisms Co-Authored-By: Claude Opus 4.6 (1M context) * Edits to advanced features * edits to determinism protection, and move the .editorconfig section * missed one * edit determinism.md * edit error-handling.md * edit gotchas.md * edit patterns.md * edit versioning.md * edit observability.md * fix metrics * self-review round 1 * minor correctness fixed * Update references/dotnet/patterns.md Co-authored-by: Justin Anderson <44687433+jmaeagle99@users.noreply.github.com> * address comments, clarify reference to earlier code snippet * clarify that operations are forbidden IN WORKFLOWS * cleanup workflow cancellation handling example * add task token retrieval comment * update .net requirements * Fix propagation of workflow cancellation --------- Co-authored-by: Claude Opus 4.6 (1M context) Co-authored-by: Justin Anderson <44687433+jmaeagle99@users.noreply.github.com> --- SKILL.md | 7 +- references/core/determinism.md | 1 + references/dotnet/advanced-features.md | 203 ++++++++ references/dotnet/data-handling.md | 216 +++++++++ references/dotnet/determinism-protection.md | 49 ++ references/dotnet/determinism.md | 56 +++ references/dotnet/dotnet.md | 193 ++++++++ references/dotnet/error-handling.md | 157 +++++++ references/dotnet/gotchas.md | 261 +++++++++++ references/dotnet/observability.md | 107 +++++ references/dotnet/patterns.md | 493 ++++++++++++++++++++ references/dotnet/testing.md | 176 +++++++ references/dotnet/versioning.md | 301 ++++++++++++ references/go/determinism.md | 4 +- references/java/advanced-features.md | 1 + references/java/determinism-protection.md | 4 +- references/java/determinism.md | 4 +- references/java/error-handling.md | 5 + 
references/python/advanced-features.md | 1 + references/python/determinism-protection.md | 4 +- references/python/determinism.md | 4 +- references/python/error-handling.md | 5 +- references/typescript/determinism.md | 4 +- 23 files changed, 2244 insertions(+), 12 deletions(-) create mode 100644 references/dotnet/advanced-features.md create mode 100644 references/dotnet/data-handling.md create mode 100644 references/dotnet/determinism-protection.md create mode 100644 references/dotnet/determinism.md create mode 100644 references/dotnet/dotnet.md create mode 100644 references/dotnet/error-handling.md create mode 100644 references/dotnet/gotchas.md create mode 100644 references/dotnet/observability.md create mode 100644 references/dotnet/patterns.md create mode 100644 references/dotnet/testing.md create mode 100644 references/dotnet/versioning.md diff --git a/SKILL.md b/SKILL.md index 5d2a338..df322df 100644 --- a/SKILL.md +++ b/SKILL.md @@ -1,6 +1,6 @@ --- name: temporal-developer -description: Develop, debug, and manage Temporal applications across Python, TypeScript, Go, and Java. Use when the user is building workflows, activities, or workers with a Temporal SDK, debugging issues like non-determinism errors, stuck workflows, or activity retries, using Temporal CLI, Temporal Server, or Temporal Cloud, or working with durable execution concepts like signals, queries, heartbeats, versioning, continue-as-new, child workflows, or saga patterns. +description: Develop, debug, and manage Temporal applications across Python, TypeScript, Go, Java and .NET. Use when the user is building workflows, activities, or workers with a Temporal SDK, debugging issues like non-determinism errors, stuck workflows, or activity retries, using Temporal CLI, Temporal Server, or Temporal Cloud, or working with durable execution concepts like signals, queries, heartbeats, versioning, continue-as-new, child workflows, or saga patterns. 
version: 0.2.0 --- @@ -8,7 +8,7 @@ version: 0.2.0 ## Overview -Temporal is a durable execution platform that makes workflows survive failures automatically. This skill provides guidance for building Temporal applications in Python, TypeScript, Go, and Java. +Temporal is a durable execution platform that makes workflows survive failures automatically. This skill provides guidance for building Temporal applications in Python, TypeScript, Go, Java and .NET. ## Core Architecture @@ -79,8 +79,9 @@ Once you've downloaded the file, extract the downloaded archive and add the temp 1. First, read the getting started guide for the language you are working in: - Python -> read `references/python/python.md` - TypeScript -> read `references/typescript/typescript.md` - - Java -> read `references/java/java.md` - Go -> read `references/go/go.md` + - Java -> read `references/java/java.md` + - .NET (C#) -> read `references/dotnet/dotnet.md` 2. Second, read appropriate `core` and language-specific references for the task at hand. ## Primary References diff --git a/references/core/determinism.md b/references/core/determinism.md index 952cca4..f2439b4 100644 --- a/references/core/determinism.md +++ b/references/core/determinism.md @@ -88,6 +88,7 @@ Each Temporal SDK language provides a different level of protection against non- - TypeScript: The TypeScript SDK runs workflows in an isolated V8 sandbox, intercepting many common sources of non-determinism and replacing them automatically with deterministic variants. - Java: The Java SDK has no sandbox. Determinism is enforced by developer conventions — the SDK provides `Workflow.*` APIs as safe alternatives (e.g., `Workflow.sleep()` instead of `Thread.sleep()`), and non-determinism is only detected at replay time via `NonDeterministicException`. A static analysis tool (`temporal-workflowcheck`, beta) can catch violations at build time. Cooperative threading under a global lock eliminates the need for synchronization. 
- Go: The Go SDK has no runtime sandbox. Therefore, non-determinism bugs will never be immediately apparent, and are usually only observable during replay. The optional `workflowcheck` static analysis tool can be used to check for many sources of non-determinism at compile time. +- .NET: The .NET SDK has no sandbox. It uses a custom TaskScheduler and a runtime EventListener to detect invalid task scheduling. Developers must use Workflow.* safe alternatives (e.g., Workflow.DelayAsync instead of Task.Delay) and avoid non-deterministic .NET Task APIs. Regardless of which SDK you are using, it is your responsibility to ensure that workflow code does not contain sources of non-determinism. Use SDK-specific tools as well as replay tests for doing so. diff --git a/references/dotnet/advanced-features.md b/references/dotnet/advanced-features.md new file mode 100644 index 0000000..fd0f81e --- /dev/null +++ b/references/dotnet/advanced-features.md @@ -0,0 +1,203 @@ +# .NET SDK Advanced Features + +## Schedules + +Create recurring workflow executions. + +```csharp +using Temporalio.Client.Schedules; + +var scheduleId = "daily-report"; +await client.CreateScheduleAsync( + scheduleId, + new Schedule( + Action: ScheduleActionStartWorkflow.Create( + (DailyReportWorkflow wf) => wf.RunAsync(), + new(id: "daily-report", taskQueue: "reports")), + Spec: new ScheduleSpec + { + Intervals = new List<ScheduleIntervalSpec> + { + new(Every: TimeSpan.FromDays(1)), + }, + })); + +// Manage schedules +var handle = client.GetScheduleHandle(scheduleId); +await handle.PauseAsync("Maintenance window"); +await handle.UnpauseAsync(); +await handle.TriggerAsync(); // Run immediately +await handle.DeleteAsync(); +``` + +## Async Activity Completion + +For activities that complete asynchronously (e.g., human tasks, external callbacks). +If you configure a `HeartbeatTimeout` on this activity, the external completer is responsible for sending heartbeats via the async handle. 
+If you do NOT set a `HeartbeatTimeout`, no heartbeats are required. + +**Note:** If the external system that completes the asynchronous action can reliably be trusted to do the task and Signal back with the result, and it doesn't need to Heartbeat or receive Cancellation, then consider using **signals** instead. + +```csharp +using Temporalio.Activities; +using Temporalio.Client; + +[Activity] +public async Task RequestApprovalAsync(string requestId) +{ + var taskToken = ActivityExecutionContext.Current.Info.TaskToken; + + // Store task token for later completion (e.g., in database) + await StoreTaskTokenAsync(requestId, taskToken); + + // Mark this activity as waiting for external completion + throw new CompleteAsyncException(); +} + +// Later, complete the activity from another process +public async Task CompleteApprovalAsync(string requestId, bool approved) +{ + var client = await TemporalClient.ConnectAsync(new("localhost:7233")); + // Retrieve the task token from external storage (e.g., database) + var taskToken = await GetTaskTokenAsync(requestId); + + var handle = client.GetAsyncActivityHandle(taskToken); + + // Optional: if a HeartbeatTimeout was set, you can periodically: + // await handle.HeartbeatAsync(progressDetails); + + if (approved) + await handle.CompleteAsync("approved"); + else + // You can also fail or report cancellation via the handle + await handle.FailAsync(new ApplicationFailureException("Rejected")); +} +``` + +## Worker Tuning + +Configure worker performance settings. 
+ +```csharp +var worker = new TemporalWorker( + client, + new TemporalWorkerOptions("my-task-queue") + { + // Workflow task concurrency + MaxConcurrentWorkflowTasks = 100, + // Activity task concurrency + MaxConcurrentActivities = 100, + // Graceful shutdown timeout + GracefulShutdownTimeout = TimeSpan.FromSeconds(30), + } + .AddWorkflow() + .AddAllActivities(new MyActivities())); +``` + +## Workflow Init Attribute + +Use `[WorkflowInit]` on a constructor to run initialization code when a workflow is first created. + +**Purpose:** Execute some setup code before signal/update happens or run is invoked. + +```csharp +[Workflow] +public class MyWorkflow +{ + private readonly string _initialValue; + private readonly List _items = new(); + + [WorkflowInit] + public MyWorkflow(string initialValue) + { + _initialValue = initialValue; + } + + [WorkflowRun] + public async Task RunAsync(string initialValue) + { + // _initialValue and _items are already initialized + return _initialValue; + } +} +``` + +Constructor and `[WorkflowRun]` method must have the same parameters with the same types. You cannot make blocking calls (activities, sleeps, etc.) from the constructor. + +## Workflow Failure Exception Types + +Control which exceptions cause workflow failures vs workflow task retries. + +**Default behavior:** Only `ApplicationFailureException` fails a workflow. All other exceptions retry the workflow task forever (treated as bugs to fix with a code deployment). + +**Tip for testing:** Set `WorkflowFailureExceptionTypes` to include `Exception` so any unhandled exception fails the workflow immediately rather than retrying the workflow task forever. This surfaces bugs faster. 
+ +### Worker-Level Configuration + +```csharp +var worker = new TemporalWorker( + client, + new TemporalWorkerOptions("my-task-queue") + { + // These exception types will fail the workflow execution (not just the task) + WorkflowFailureExceptionTypes = new[] { typeof(ArgumentException), typeof(InvalidOperationException) }, + } + .AddWorkflow() + .AddAllActivities(new MyActivities())); +``` + +## Dependency Injection + +The .NET SDK supports dependency injection via the `Temporalio.Extensions.Hosting` package, which integrates with .NET's generic host. + +### Worker as Generic Host + +```csharp +using Temporalio.Extensions.Hosting; + +public class Program +{ + public static async Task Main(string[] args) + { + var host = Host.CreateDefaultBuilder(args) + .ConfigureServices(ctx => + ctx. + AddScoped(). + AddHostedTemporalWorker( + clientTargetHost: "localhost:7233", + clientNamespace: "default", + taskQueue: "my-task-queue"). + AddScopedActivities(). + AddWorkflow()) + .Build(); + await host.RunAsync(); + } +} +``` + +### Activity Dependency Injection + +As shown in the host setup above, activities can be registered with `AddScopedActivities()`, `AddSingletonActivities()`, or `AddTransientActivities()`. Activities registered this way are created via DI, allowing constructor injection: + +```csharp +public class MyActivities +{ + private readonly ILogger _logger; + private readonly IOrderRepository _repository; + + public MyActivities(ILogger logger, IOrderRepository repository) + { + _logger = logger; + _repository = repository; + } + + [Activity] + public async Task GetOrderAsync(string orderId) + { + _logger.LogInformation("Fetching order {OrderId}", orderId); + return await _repository.GetAsync(orderId); + } +} +``` + +**Note:** Dependency injection is NOT available in workflows — workflows must be self-contained for determinism. 
diff --git a/references/dotnet/data-handling.md b/references/dotnet/data-handling.md new file mode 100644 index 0000000..fc8d308 --- /dev/null +++ b/references/dotnet/data-handling.md @@ -0,0 +1,216 @@ +# .NET SDK Data Handling + +## Overview + +The .NET SDK uses data converters to serialize/deserialize workflow inputs, outputs, and activity parameters. + +## Default Data Converter + +The default converter handles: +- `null` +- `byte[]` (as binary) +- `Google.Protobuf.IMessage` instances +- Anything that `System.Text.Json` supports +- `IRawValue` as unconverted raw payloads + +## Custom Data Converter + +Customize serialization by extending `DefaultPayloadConverter`. For example, to use camelCase property naming: + +```csharp +using System.Text.Json; +using Temporalio.Client; +using Temporalio.Converters; + +public class CamelCasePayloadConverter : DefaultPayloadConverter +{ + public CamelCasePayloadConverter() + : base(new JsonSerializerOptions { PropertyNamingPolicy = JsonNamingPolicy.CamelCase }) + { + } +} + +var client = await TemporalClient.ConnectAsync(new() +{ + TargetHost = "localhost:7233", + Namespace = "my-namespace", + DataConverter = DataConverter.Default with + { + PayloadConverter = new CamelCasePayloadConverter(), + }, +}); +``` + +## Protobuf Support + +The default data converter includes built-in support for Protocol Buffer messages via `Google.Protobuf.IMessage`. Protobuf messages are automatically serialized using proto3 JSON. 
+ +```csharp +// Any Google.Protobuf.IMessage is automatically handled +[Workflow] +public class MyWorkflow +{ + [WorkflowRun] + public async Task RunAsync(MyProtoRequest request) + { + // Protobuf messages are serialized/deserialized automatically + return await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.ProcessAsync(request), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } +} +``` + +## Payload Encryption + +Encrypt sensitive workflow data using a custom `IPayloadCodec`: + +```csharp +using Temporalio.Converters; +using Google.Protobuf; + +public class EncryptionCodec : IPayloadCodec +{ + public Task<IReadOnlyCollection<Payload>> EncodeAsync( + IReadOnlyCollection<Payload> payloads) => + Task.FromResult<IReadOnlyCollection<Payload>>(payloads.Select(p => + new Payload + { + Metadata = { ["encoding"] = "binary/encrypted" }, + Data = ByteString.CopyFrom(Encrypt(p.ToByteArray())), + }).ToList()); + + public Task<IReadOnlyCollection<Payload>> DecodeAsync( + IReadOnlyCollection<Payload> payloads) => + Task.FromResult<IReadOnlyCollection<Payload>>(payloads.Select(p => + { + if (p.Metadata.GetValueOrDefault("encoding") != "binary/encrypted") + return p; + return Payload.Parser.ParseFrom(Decrypt(p.Data.ToByteArray())); + }).ToList()); + + private byte[] Encrypt(byte[] data) => /* your encryption logic */; + private byte[] Decrypt(byte[] data) => /* your decryption logic */; +} + +// Apply encryption codec +var client = await TemporalClient.ConnectAsync(new("localhost:7233") +{ + DataConverter = DataConverter.Default with + { + PayloadCodec = new EncryptionCodec(), + }, +}); +``` + +## Search Attributes + +Custom searchable fields for workflow visibility. 
These can be set at workflow start: + +```csharp +using Temporalio.Common; + +var handle = await client.StartWorkflowAsync( + (OrderWorkflow wf) => wf.RunAsync(order), + new(id: $"order-{order.Id}", taskQueue: "orders") + { + TypedSearchAttributes = new SearchAttributeCollection.Builder() + .Set(SearchAttributeKey.CreateKeyword("OrderId"), order.Id) + .Set(SearchAttributeKey.CreateKeyword("OrderStatus"), "pending") + .Set(SearchAttributeKey.CreateFloat("OrderTotal"), order.Total) + .Build(), + }); +``` + +Or upserted during workflow execution: + +```csharp +[Workflow] +public class OrderWorkflow +{ + [WorkflowRun] + public async Task RunAsync(Order order) + { + // ... process order ... + + // Update search attribute + Workflow.UpsertTypedSearchAttributes( + SearchAttributeKey.CreateKeyword("OrderStatus").ValueSet("completed")); + return "done"; + } +} +``` + +### Querying Workflows by Search Attributes + +```csharp +await foreach (var wf in client.ListWorkflowsAsync( + "OrderStatus = \"processing\" OR OrderStatus = \"pending\"")) +{ + Console.WriteLine($"Workflow {wf.Id} is still processing"); +} +``` + +## Workflow Memo + +Store arbitrary metadata with workflows (not searchable). + +```csharp +await client.ExecuteWorkflowAsync( + (OrderWorkflow wf) => wf.RunAsync(order), + new(id: $"order-{order.Id}", taskQueue: "orders") + { + Memo = new Dictionary + { + ["customer_name"] = order.CustomerName, + ["notes"] = "Priority customer", + }, + }); +``` + +```csharp +// Read memo from workflow +[Workflow] +public class OrderWorkflow +{ + [WorkflowRun] + public async Task RunAsync(Order order) + { + var notes = Workflow.Memo["notes"]; + // ... 
+ } +} +``` + +## Deterministic APIs for Values + +Use these APIs within workflows for deterministic random values and UUIDs: + +```csharp +[Workflow] +public class MyWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + // Deterministic GUID (same on replay) + var uniqueId = Workflow.NewGuid(); + + // Deterministic random (same on replay) + var value = Workflow.Random.Next(1, 100); + + // Deterministic current time + var now = Workflow.UtcNow; + + return uniqueId.ToString(); + } +} +``` + +## Best Practices + +1. Use records or classes with `System.Text.Json` support for input/output +2. Keep payloads small — see `references/core/gotchas.md` for limits +3. Encrypt sensitive data with `IPayloadCodec` +4. Use `Workflow.NewGuid()` and `Workflow.Random` for deterministic values +5. Use camelCase converter if interoperating with other SDKs diff --git a/references/dotnet/determinism-protection.md b/references/dotnet/determinism-protection.md new file mode 100644 index 0000000..f9c480d --- /dev/null +++ b/references/dotnet/determinism-protection.md @@ -0,0 +1,49 @@ +# .NET Determinism Protection + +## Overview + +The .NET SDK has no runtime sandbox. Determinism is enforced by **developer convention** and **runtime task detection**. Unlike the Python and TypeScript SDKs, the .NET SDK will not intercept or replace non-deterministic calls at compile time or import time. The SDK does provide a runtime `EventListener` that detects some invalid task scheduling, but catching all non-deterministic code requires following the rules below and testing, in particular replay tests (see `references/dotnet/testing.md`). + +## Runtime Task Detection + +By default, the .NET SDK enables an `EventListener` that monitors task events. When workflow code accidentally starts a task on the wrong scheduler (e.g., via `Task.Run`), an `InvalidWorkflowOperationException` is thrown. This causes the workflow task to fail, which will continuously retry until the code is fixed. 
+ +```csharp +// This will be detected at runtime and fail the workflow task +[Workflow] +public class BadWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + // BAD: Task.Run uses TaskScheduler.Default + await Task.Run(() => DoSomething()); + } +} +``` + +## .NET Task Determinism Rules + +Many .NET `Task` APIs implicitly use `TaskScheduler.Default`, which breaks determinism. Here are the key rules: + +**Do NOT use:** +- `Task.Run` — uses default scheduler. Use `Workflow.RunTaskAsync`. +- `Task.ConfigureAwait(false)` — leaves current context. Use `ConfigureAwait(true)` or omit. +- `Task.Delay` / `Task.Wait` / timeout-based `CancellationTokenSource` — uses system timers. Use `Workflow.DelayAsync` / `Workflow.WaitConditionAsync`. +- `Task.WhenAny` — use `Workflow.WhenAnyAsync`. +- `Task.WhenAll` — use `Workflow.WhenAllAsync` (technically safe currently, but wrapper is recommended). +- `CancellationTokenSource.CancelAsync` — use `CancellationTokenSource.Cancel`. +- `System.Threading.Semaphore` / `SemaphoreSlim` / `Mutex` — use `Temporalio.Workflows.Semaphore` / `Mutex`. + +**Be wary of:** +- Third-party libraries that implicitly use `TaskScheduler.Default` +- `Dataflow` blocks and similar concurrency libraries with hidden default scheduler usage + +## Best Practices + +1. **Always use `Workflow.*` alternatives** for Task operations in workflows +2. **Don't disable the `EventListener`** — it's on by default and catches mistakes at runtime +3. **Separate workflow and activity code** into different files/projects for clarity +4. **Use `SortedDictionary`** or sort collections before iterating — `Dictionary` iteration order is not guaranteed +5. **Test with replay** to catch non-determinism early +6. 
**Review third-party library usage** in workflow code for hidden default scheduler usage diff --git a/references/dotnet/determinism.md b/references/dotnet/determinism.md new file mode 100644 index 0000000..c1dbf56 --- /dev/null +++ b/references/dotnet/determinism.md @@ -0,0 +1,56 @@ +# .NET SDK Determinism + +## Overview + +The .NET SDK has NO runtime sandbox (unlike Python/TypeScript). Workflows must be deterministic for replay, and determinism is enforced by developer convention and runtime task detection via an `EventListener` (see `references/dotnet/determinism-protection.md`). + +## Why Determinism Matters: History Replay + +Temporal provides durable execution through **History Replay**. When a Worker restores workflow state, it re-executes workflow code from the beginning. This requires the code to be **deterministic**. See `references/core/determinism.md` for a deep explanation. + +## Forbidden Operations in Workflows + +The following are forbidden inside workflow code but are appropriate to use in activities. + +```csharp +// DO NOT do these in workflows: +await Task.Run(() => { }); // Uses default scheduler +await Task.Delay(TimeSpan.FromSeconds(1)); // System timer +var now = DateTime.UtcNow; // System clock +var r = new Random().Next(); // Non-deterministic +var id = Guid.NewGuid(); // Non-deterministic +File.ReadAllText("file.txt"); // I/O +await httpClient.GetAsync("..."); // Network I/O +``` + +Most non-determinism and side effects should be wrapped in Activities. 
+ +## Safe Builtin Alternatives + +| Forbidden | Safe Alternative | +|-----------|------------------| +| `DateTime.Now` / `DateTime.UtcNow` | `Workflow.UtcNow` | +| `Random` | `Workflow.Random` | +| `Guid.NewGuid()` | `Workflow.NewGuid()` | +| `Task.Delay` | `Workflow.DelayAsync` | +| `Thread.Sleep` | `Workflow.DelayAsync` | +| `Task.Run` | `Workflow.RunTaskAsync` | +| `Task.WhenAll` | `Workflow.WhenAllAsync` | +| `Task.WhenAny` | `Workflow.WhenAnyAsync` | +| `System.Threading.Mutex` | `Temporalio.Workflows.Mutex` | +| `System.Threading.Semaphore` | `Temporalio.Workflows.Semaphore` | +| `CancellationTokenSource.CancelAsync` | `CancellationTokenSource.Cancel` | + +## Testing Replay Compatibility + +Use `WorkflowReplayer` to verify your code changes are compatible with existing histories. See the Workflow Replay Testing section of `references/dotnet/testing.md`. + +## Best Practices + +1. Always use `Workflow.*` APIs instead of standard .NET equivalents (see table above) +2. Never use `ConfigureAwait(false)` in workflows +3. Use `SortedDictionary` or sort before iterating collections +4. Move all I/O operations (network, filesystem, database) into activities +5. Use `Workflow.Logger` instead of `Console.WriteLine` for replay-safe logging +6. Keep workflow code focused on orchestration; delegate non-deterministic work to activities +7. Test with replay after making changes to workflow definitions diff --git a/references/dotnet/dotnet.md b/references/dotnet/dotnet.md new file mode 100644 index 0000000..29b40aa --- /dev/null +++ b/references/dotnet/dotnet.md @@ -0,0 +1,193 @@ +# Temporal .NET SDK Reference + +## Overview + +The Temporal .NET SDK provides a high-performance, type-safe approach to building durable workflows using C# and .NET. Workflows use attributes (`[Workflow]`, `[WorkflowRun]`) and lambda expressions for type-safe invocations. Supports .NET Framework 4.6.2+ and .NET Core 3.1+ (including .NET 5+). + +**CRITICAL**: The .NET SDK has **no sandbox**. 
Developers must be careful to avoid non-deterministic code in workflows. See the Determinism Rules section below and `references/dotnet/determinism.md`. + +## Understanding Replay + +Temporal workflows are durable through history replay. For details on how this works, see `references/core/determinism.md`. + +## Quick Start + +**Add Dependency:** Install the Temporal SDK NuGet package: +```bash +dotnet add package Temporalio +``` + +**Activities.cs** - Activity definitions (separate file for clarity): +```csharp +using Temporalio.Activities; + +public class MyActivities +{ + [Activity] + public string Greet(string name) + { + return $"Hello, {name}!"; + } +} +``` + +**GreetingWorkflow.workflow.cs** - Workflow definition: +```csharp +using Temporalio.Workflows; + +[Workflow] +public class GreetingWorkflow +{ + [WorkflowRun] + public async Task<string> RunAsync(string name) + { + return await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.Greet(name), + new() { StartToCloseTimeout = TimeSpan.FromSeconds(30) }); + } +} +``` + +**Worker (Program.cs)** - Worker setup: +```csharp +using Temporalio.Client; +using Temporalio.Worker; + +var client = await TemporalClient.ConnectAsync(new("localhost:7233")); + +using var worker = new TemporalWorker( + client, + new TemporalWorkerOptions("my-task-queue") + .AddWorkflow<GreetingWorkflow>() + .AddAllActivities(new MyActivities())); + +await worker.ExecuteAsync(); +``` + +**Start the dev server:** Start `temporal server start-dev` in the background. + +**Start the worker:** Run `dotnet run` in the worker project. 
+ +**Starter (Program.cs)** - Start a workflow execution: +```csharp +using Temporalio.Client; + +var client = await TemporalClient.ConnectAsync(new("localhost:7233")); + +var result = await client.ExecuteWorkflowAsync( + (GreetingWorkflow wf) => wf.RunAsync("my name"), + new(id: $"greeting-{Guid.NewGuid()}", taskQueue: "my-task-queue")); + +Console.WriteLine($"Result: {result}"); +``` + +**Run the workflow:** Run `dotnet run` in the starter project. Should output: `Result: Hello, my name!`. + +## Key Concepts + +### Workflow Definition +- Use `[Workflow]` attribute on class +- Put any state initialization logic in the constructor of your workflow class to guarantee that it happens before signals/updates arrive. If your state initialization logic requires the workflow parameters, then add the `[WorkflowInit]` attribute and parameters to your constructor. +- Use `[WorkflowRun]` on the async entry point method +- Must return `Task` or `Task<T>` +- Use `[WorkflowSignal]`, `[WorkflowQuery]`, `[WorkflowUpdate]` for handlers + +### Activity Definition +- Use `[Activity]` attribute on methods +- Can be sync or async +- Instance methods support dependency injection +- Static methods are also supported + +### Worker Setup +- Connect client, create `TemporalWorker` with workflows and activities +- Use `AddWorkflow<T>()` and `AddAllActivities(instance)` or `AddActivity(method)` + +### Determinism + +**Workflow code must be deterministic!** The .NET SDK has no sandbox. See the Determinism Rules section below and `references/core/determinism.md` and `references/dotnet/determinism.md`. + +## File Organization Best Practice + +**Keep Workflow definitions in separate files from Activity definitions.** While not as critical as Python (no sandbox reloading), separation improves clarity and testability. Use the `.workflow.cs` extension for workflow files so the `.editorconfig` overrides (see below) apply only to workflow code. 
+ +``` +MyTemporalApp/ +├── Workflows/ +│ └── GreetingWorkflow.workflow.cs # Only Workflow classes +├── Activities/ +│ └── TranslateActivities.cs # Only Activity classes +├── Models/ +│ └── OrderInput.cs # Shared data models +├── Worker/ +│ └── Program.cs # Worker setup +└── Starter/ + └── Program.cs # Client code to start workflows +``` + +## Workflow .editorconfig + +Workflow code violates some standard .NET analyzer rules. The recommended approach is to use the `.workflow.cs` file extension for workflow files and scope the overrides to that extension: + +```ini +# Configuration specific for Temporal workflows +[*.workflow.cs] + +# We use getters for queries, they cannot be properties +dotnet_diagnostic.CA1024.severity = none + +# Don't force workflows to have static methods +dotnet_diagnostic.CA1822.severity = none + +# Do not need ConfigureAwait for workflows +dotnet_diagnostic.CA2007.severity = none + +# Do not need task scheduler for workflows +dotnet_diagnostic.CA2008.severity = none + +# Workflow randomness is intentionally deterministic +dotnet_diagnostic.CA5394.severity = none + +# Allow async methods to not have await in them +dotnet_diagnostic.CS1998.severity = none + +# Don't force workflows to call async methods +dotnet_diagnostic.VSTHRD103.severity = none + +# Don't avoid, but rather encourage things using TaskScheduler.Current in workflows +dotnet_diagnostic.VSTHRD105.severity = none +``` + +## Determinism Rules + +The .NET SDK has **no sandbox** like Python or TypeScript. Developers must avoid non-deterministic operations manually. Many standard .NET `Task` APIs use `TaskScheduler.Default` implicitly, which breaks determinism. + +See `references/dotnet/determinism.md` for the full list of forbidden operations, safe alternatives, and best practices. See `references/dotnet/determinism-protection.md` for details on the runtime detection mechanism. + +## Common Pitfalls + +1. 
**Using `Task.Run` in workflows** — Uses default scheduler, breaks determinism. Use `Workflow.RunTaskAsync`. +2. **Using `Task.Delay` in workflows** — Uses system timer. Use `Workflow.DelayAsync`. +3. **`ConfigureAwait(false)` in workflows** — Leaves the deterministic scheduler. Never use in workflows. +4. **Non-`ApplicationFailureException` in workflows** — Other exceptions retry the workflow task forever instead of failing the workflow. +5. **Dictionary iteration in workflows** — `Dictionary` has no guaranteed order. Use `SortedDictionary`. +6. **Forgetting to heartbeat** — Long-running activities need `ActivityExecutionContext.Current.Heartbeat()` calls. +7. **Using `CancellationTokenSource.CancelAsync`** — Use `CancellationTokenSource.Cancel` instead. +8. **Logging with `Console.WriteLine` in workflows** — Use `Workflow.Logger` for replay-safe logging. + +## Writing Tests + +See `references/dotnet/testing.md` for info on writing tests. + +## Additional Resources + +### Reference Files +- **`references/dotnet/patterns.md`** — Signals, queries, child workflows, saga pattern, etc. +- **`references/dotnet/determinism.md`** — Essentials of determinism in .NET +- **`references/dotnet/gotchas.md`** — .NET-specific mistakes and anti-patterns +- **`references/dotnet/error-handling.md`** — ApplicationFailureException, retry policies, non-retryable errors +- **`references/dotnet/observability.md`** — Logging, metrics, tracing +- **`references/dotnet/testing.md`** — WorkflowEnvironment, time-skipping, activity mocking +- **`references/dotnet/advanced-features.md`** — Schedules, worker tuning, dependency injection +- **`references/dotnet/data-handling.md`** — Data converters, payload encryption, etc. 
+- **`references/dotnet/versioning.md`** — Patching API, workflow type versioning, Worker Versioning +- **`references/dotnet/determinism-protection.md`** — Runtime task detection, .NET Task determinism rules diff --git a/references/dotnet/error-handling.md b/references/dotnet/error-handling.md new file mode 100644 index 0000000..f441620 --- /dev/null +++ b/references/dotnet/error-handling.md @@ -0,0 +1,157 @@ +# .NET SDK Error Handling + +## Overview + +The .NET SDK uses `ApplicationFailureException` for application-specific errors and provides comprehensive retry policy configuration. Generally, the following information about errors and retryability applies across activities, child workflows and Nexus operations. + +## Application Failures + +```csharp +using Temporalio.Activities; +using Temporalio.Exceptions; + +[Activity] +public async Task ValidateOrderAsync(Order order) +{ + if (!order.IsValid()) + { + throw new ApplicationFailureException( + "Invalid order", + errorType: "ValidationError"); + } +} +``` + +## Non-Retryable Errors + +```csharp +using Temporalio.Activities; +using Temporalio.Exceptions; + +[Activity] +public async Task ChargeCardAsync(ChargeCardInput input) +{ + if (!IsValidCard(input.CardNumber)) + { + throw new ApplicationFailureException( + "Permanent failure - invalid credit card", + errorType: "PaymentError", + nonRetryable: true); // Will not retry activity + } + return await ProcessPaymentAsync(input.CardNumber, input.Amount); +} +``` + +## Handling Activity Errors in Workflows + +```csharp +using Temporalio.Workflows; +using Temporalio.Exceptions; + +[Workflow] +public class MyWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + try + { + return await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.RiskyActivityAsync(), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } + catch (ActivityFailureException ex) when (!TemporalException.IsCanceledException(ex)) + { + Workflow.Logger.LogError(ex, "Activity 
failed"); + throw new ApplicationFailureException( + "Workflow failed due to activity error"); + } + } +} +``` + +## Retry Configuration + +```csharp +using Temporalio.Common; +using Temporalio.Workflows; + +[Workflow] +public class MyWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + return await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.MyActivityAsync(), + new() + { + StartToCloseTimeout = TimeSpan.FromMinutes(10), + RetryPolicy = new() + { + MaximumInterval = TimeSpan.FromMinutes(1), + MaximumAttempts = 5, + NonRetryableErrorTypes = new[] { "ValidationError", "PaymentError" }, + }, + }); + } +} +``` + +Only set options such as MaximumInterval, MaximumAttempts etc. if you have a domain-specific reason to. +If not, prefer to leave them at their defaults. + +## Timeout Configuration + +```csharp +[Workflow] +public class MyWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + return await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.MyActivityAsync(), + new() + { + StartToCloseTimeout = TimeSpan.FromMinutes(5), // Single attempt + ScheduleToCloseTimeout = TimeSpan.FromMinutes(30), // Including retries + HeartbeatTimeout = TimeSpan.FromMinutes(2), // Between heartbeats + }); + } +} +``` + +## Workflow Failure + +**Critical .NET behavior:** Only `ApplicationFailureException` will fail a workflow. All other exceptions (including standard .NET exceptions like `NullReferenceException`, `KeyNotFoundException`, etc.) will **retry the workflow task** indefinitely. This is by design — those are treated as bugs to be fixed with a code deployment, not reasons for the workflow to fail. 
+ +```csharp +[Workflow] +public class MyWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + if (someCondition) + { + throw new ApplicationFailureException( + "Cannot process order", + errorType: "BusinessError"); + } + return "success"; + } +} +``` + +**Note:** Do not use `nonRetryable:` with `ApplicationFailureException` inside a workflow (as opposed to an activity). + +## Best Practices + +1. Use specific error types for different failure modes +2. Mark permanent failures as non-retryable in activities +3. Configure appropriate retry policies +4. Log errors before re-raising +5. Use `ActivityFailureException` to catch activity failures in workflows +6. Design code to be idempotent for safe retries (see more at `references/core/patterns.md`) +7. Only throw `ApplicationFailureException` from workflows to fail them — other exceptions will retry the workflow task diff --git a/references/dotnet/gotchas.md b/references/dotnet/gotchas.md new file mode 100644 index 0000000..05213c2 --- /dev/null +++ b/references/dotnet/gotchas.md @@ -0,0 +1,261 @@ +# .NET Gotchas + +.NET-specific mistakes and anti-patterns. See also [Common Gotchas](references/core/gotchas.md) for language-agnostic concepts. + +## .NET Task Determinism + +The biggest .NET gotcha. Many `Task` APIs implicitly use `TaskScheduler.Default`, which breaks determinism. The SDK detects some of these at runtime via an `EventListener`, but not all. 
+ +### Task.Run + +```csharp +// BAD: Uses TaskScheduler.Default +await Task.Run(() => DoSomething()); + +// GOOD: Uses current (deterministic) scheduler +await Workflow.RunTaskAsync(() => DoSomething()); +``` + +### Task.Delay / Thread.Sleep + +```csharp +// BAD: Uses system timer +await Task.Delay(TimeSpan.FromMinutes(5)); + +// GOOD: Creates durable timer in event history +await Workflow.DelayAsync(TimeSpan.FromMinutes(5)); +``` + +### ConfigureAwait(false) + +```csharp +// BAD: Leaves the deterministic context +var result = await SomeCallAsync().ConfigureAwait(false); + +// GOOD: Stays on deterministic scheduler (or just omit ConfigureAwait) +var result = await SomeCallAsync().ConfigureAwait(true); +var result = await SomeCallAsync(); // Also fine +``` + +### Task.WhenAll / Task.WhenAny + +```csharp +// BAD: Potential non-determinism +await Task.WhenAll(task1, task2); +await Task.WhenAny(task1, task2); + +// GOOD: Deterministic wrappers +await Workflow.WhenAllAsync(task1, task2); +await Workflow.WhenAnyAsync(task1, task2); +``` + +### Threading Primitives + +```csharp +// BAD: System threading primitives +var mutex = new System.Threading.Mutex(); +var semaphore = new SemaphoreSlim(1); + +// GOOD: Temporal workflow-safe alternatives +var mutex = new Temporalio.Workflows.Mutex(); +var semaphore = new Temporalio.Workflows.Semaphore(1); +``` + +See `references/dotnet/determinism-protection.md` for the complete list. + +## Wrong Retry Classification + +**Example:** Transient network errors should be retried. Authentication errors should not be. +See `references/dotnet/error-handling.md` to understand how to classify errors. 
+ +## Heartbeating + +### Forgetting to Heartbeat Long Activities + +```csharp +// BAD: No heartbeat, can't detect stuck activities +[Activity] +public async Task ProcessLargeFileAsync(string path) +{ + foreach (var chunk in ReadChunks(path)) + await ProcessAsync(chunk); // Takes hours, no heartbeat +} + +// GOOD: Regular heartbeats with progress +[Activity] +public async Task ProcessLargeFileAsync(string path) +{ + var chunks = ReadChunks(path); + for (var i = 0; i < chunks.Count; i++) + { + ActivityExecutionContext.Current.Heartbeat($"Processing chunk {i}"); + await ProcessAsync(chunks[i]); + } +} +``` + +### Heartbeat Timeout Too Short + +```csharp +// BAD: Heartbeat timeout shorter than processing time +await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.ProcessChunkAsync(), + new() + { + StartToCloseTimeout = TimeSpan.FromMinutes(30), + HeartbeatTimeout = TimeSpan.FromSeconds(10), // Too short! + }); + +// GOOD: Heartbeat timeout allows for processing variance +await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.ProcessChunkAsync(), + new() + { + StartToCloseTimeout = TimeSpan.FromMinutes(30), + HeartbeatTimeout = TimeSpan.FromMinutes(2), + }); +``` + +Set heartbeat timeout as high as acceptable for your use case — each heartbeat counts as an action. + +## Cancellation + +### Not Handling Workflow Cancellation + +```csharp +// BAD: Cleanup doesn't run on cancellation +[Workflow] +public class BadWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.AcquireResourceAsync(), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.DoWorkAsync(), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.ReleaseResourceAsync(), // Never runs if cancelled!
+ new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } +} + +// GOOD: Use try/finally for cleanup +[Workflow] +public class GoodWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.AcquireResourceAsync(), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + try + { + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.DoWorkAsync(), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } + finally + { + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.ReleaseResourceAsync(), + new() + { + StartToCloseTimeout = TimeSpan.FromMinutes(5), + CancellationToken = CancellationToken.None, + }); + } + } +} +``` + +### Not Handling Activity Cancellation + +Activities must **opt in** to receive cancellation. This requires: +1. **Heartbeating** — Cancellation is delivered via heartbeat +2. **Checking the cancellation token** — Token is triggered when heartbeat detects cancellation + +```csharp +// BAD: Activity ignores cancellation +[Activity] +public async Task LongActivityAsync() +{ + await DoExpensiveWorkAsync(); // Runs to completion even if cancelled +} + +// GOOD: Heartbeat, check cancellation, and handle cleanup +[Activity] +public async Task LongActivityAsync() +{ + try + { + foreach (var item in items) + { + ActivityExecutionContext.Current.Heartbeat(); + ActivityExecutionContext.Current.CancellationToken.ThrowIfCancellationRequested(); + await ProcessAsync(item); + } + } + catch (OperationCanceledException) + { + await CleanupAsync(); + throw; + } +} +``` + +## Testing + +### Not Testing Failures + +It is important to make sure workflows work as expected under failure paths in addition to happy paths. Please see `references/dotnet/testing.md` for more info. + +### Not Testing Replay + +Replay tests help you test that you do not have hidden sources of non-determinism bugs in your workflow code. Please see `references/dotnet/testing.md` for more info. 
+ +## Timers and Sleep + +### Using Task.Delay + +```csharp +// BAD: Task.Delay uses system timer, not deterministic during replay +[Workflow] +public class BadWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + await Task.Delay(TimeSpan.FromMinutes(1)); // SDK will detect and fail the task + } +} + +// GOOD: Use Workflow.DelayAsync for deterministic timers +[Workflow] +public class GoodWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + await Workflow.DelayAsync(TimeSpan.FromMinutes(1)); // Deterministic + } +} +``` + +**Why this matters:** `Task.Delay` uses the system clock, which differs between original execution and replay. `Workflow.DelayAsync` creates a durable timer in the event history, ensuring consistent behavior during replay. + +## Dictionary Iteration Order + +```csharp +// BAD: Dictionary iteration order is not guaranteed +var dict = new Dictionary { ["b"] = 2, ["a"] = 1 }; +foreach (var kvp in dict) // Order may differ between executions! + await ProcessAsync(kvp.Key, kvp.Value); + +// GOOD: Use SortedDictionary or sort before iterating +var dict = new SortedDictionary { ["b"] = 2, ["a"] = 1 }; +foreach (var kvp in dict) // Always iterates in key order + await ProcessAsync(kvp.Key, kvp.Value); +``` diff --git a/references/dotnet/observability.md b/references/dotnet/observability.md new file mode 100644 index 0000000..1150f63 --- /dev/null +++ b/references/dotnet/observability.md @@ -0,0 +1,107 @@ +# .NET SDK Observability + +## Overview + +The .NET SDK provides observability through logging, metrics, and tracing using standard .NET patterns. 
+ +## Logging + +### Workflow Logging (Replay-Safe) + +Use `Workflow.Logger` for replay-safe logging that avoids duplicate messages: + +```csharp +[Workflow] +public class MyWorkflow +{ + [WorkflowRun] + public async Task RunAsync(string name) + { + Workflow.Logger.LogInformation("Workflow started for {Name}", name); + + var result = await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.MyActivityAsync(), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + + Workflow.Logger.LogInformation("Activity completed with {Result}", result); + return result; + } +} +``` + +The workflow logger automatically: +- Suppresses duplicate logs during replay +- Includes workflow context (workflow ID, run ID, etc.) + +### Activity Logging + +Use `ActivityExecutionContext.Current.Logger` for context-aware activity logging: + +```csharp +[Activity] +public async Task ProcessOrderAsync(string orderId) +{ + var logger = ActivityExecutionContext.Current.Logger; + logger.LogInformation("Processing order {OrderId}", orderId); + + // Perform work... + + logger.LogInformation("Order processed successfully"); + return "completed"; +} +``` + +### Customizing Logger Configuration + +```csharp +using Microsoft.Extensions.Logging; + +var client = await TemporalClient.ConnectAsync(new("localhost:7233") +{ + LoggerFactory = LoggerFactory.Create(builder => + builder + .AddSimpleConsole(options => options.TimestampFormat = "[HH:mm:ss] ") + .SetMinimumLevel(LogLevel.Information)), +}); +``` + +## Metrics + +### Enabling SDK Metrics + +Metrics are configured on `TemporalRuntime`. Create the runtime globally before any client/worker and set a Prometheus endpoint or custom metric meter. 
+ +```csharp +using Temporalio.Client; +using Temporalio.Runtime; + +// Create runtime with Prometheus endpoint +var runtime = new TemporalRuntime(new() +{ + Telemetry = new() { Metrics = new() { Prometheus = new("0.0.0.0:9000") } }, +}); + +// Use this runtime for all clients +var client = await TemporalClient.ConnectAsync( + new("localhost:7233") { Runtime = runtime }); +``` + +Alternatively, use `Temporalio.Extensions.DiagnosticSource` to bridge metrics to a .NET `System.Diagnostics.Metrics.Meter` for integration with OpenTelemetry or other .NET metrics pipelines. + +### Key SDK Metrics + +- `temporal_request` — Client requests to server +- `temporal_workflow_task_execution_latency` — Workflow task processing time +- `temporal_activity_execution_latency` — Activity execution time +- `temporal_workflow_task_replay_latency` — Replay duration + +## Search Attributes (Visibility) + +See the Search Attributes section of `references/dotnet/data-handling.md` + +## Best Practices + +1. Use `Workflow.Logger` in workflows, `ActivityExecutionContext.Current.Logger` in activities +2. Don't use `Console.WriteLine` in workflows — it will produce duplicate output on replay +3. Configure metrics for production monitoring +4. 
Use Search Attributes for business-level visibility diff --git a/references/dotnet/patterns.md b/references/dotnet/patterns.md new file mode 100644 index 0000000..19d3317 --- /dev/null +++ b/references/dotnet/patterns.md @@ -0,0 +1,493 @@ +# .NET SDK Patterns + +## Signals + +```csharp +[Workflow] +public class OrderWorkflow +{ + private bool _approved; + private readonly List _items = new(); + + [WorkflowSignal] + public async Task ApproveAsync() + { + _approved = true; + } + + [WorkflowSignal] + public async Task AddItemAsync(string item) + { + _items.Add(item); + } + + [WorkflowRun] + public async Task RunAsync() + { + await Workflow.WaitConditionAsync(() => _approved); + return $"Processed {_items.Count} items"; + } +} +``` + +## Dynamic Signal Handlers + +For handling signals with names not known at compile time. Use cases for this pattern are rare — most workflows should use statically defined signal handlers. + +```csharp +[Workflow] +public class DynamicSignalWorkflow +{ + private readonly Dictionary> _signals = new(); + + [WorkflowSignal(Dynamic = true)] + public async Task HandleSignalAsync(string signalName, IRawValue[] args) + { + if (!_signals.ContainsKey(signalName)) + _signals[signalName] = new List(); + var value = Workflow.PayloadConverter.ToValue(args.Single()); + _signals[signalName].Add(value); + } + + [WorkflowRun] + public async Task>> RunAsync() + { + await Workflow.WaitConditionAsync(() => _signals.ContainsKey("done")); + return _signals; + } +} +``` + +## Queries + +**Important:** Queries must NOT modify workflow state or have side effects. 
+ +```csharp +[Workflow] +public class StatusWorkflow +{ + private string _status = "pending"; + private int _progress; + + [WorkflowQuery] + public string GetStatus() => _status; + + [WorkflowQuery] + public int Progress => _progress; + + [WorkflowRun] + public async Task RunAsync() + { + _status = "running"; + for (var i = 0; i < 100; i++) + { + _progress = i; + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.ProcessItem(i), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(1) }); + } + _status = "completed"; + return "done"; + } +} +``` + +## Dynamic Query Handlers + +For handling queries with names not known at compile time. Use cases for this pattern are rare — most workflows should use statically defined query handlers. + +```csharp +[Workflow] +public class DynamicQueryWorkflow +{ + private readonly SortedDictionary _state = new() + { + ["status"] = "running", + ["progress"] = "0", + }; + + [WorkflowQuery(Dynamic = true)] + public string HandleQuery(string queryName, IRawValue[] args) + { + return _state.GetValueOrDefault(queryName, "unknown"); + } + + [WorkflowRun] + public async Task RunAsync() { /* ... */ } +} +``` + +## Updates + +```csharp +[Workflow] +public class OrderWorkflow +{ + private readonly List _items = new(); + + [WorkflowUpdate] + public async Task AddItemAsync(string item) + { + _items.Add(item); + return _items.Count; + } + + [WorkflowUpdateValidator(nameof(AddItemAsync))] + public void ValidateAddItem(string item) + { + if (string.IsNullOrEmpty(item)) + throw new ArgumentException("Item cannot be empty"); + if (_items.Count >= 100) + throw new InvalidOperationException("Order is full"); + } + + [WorkflowRun] + public async Task RunAsync() + { + await Workflow.WaitConditionAsync(() => _items.Count > 0); + return $"Order with {_items.Count} items"; + } +} +``` + +**Important:** Validators must NOT mutate workflow state or do anything blocking (no activities, sleeps, or other commands). 
They are read-only, similar to query handlers. Throw an exception to reject the update; return void to accept. + +## Child Workflows + +```csharp +[Workflow] +public class ParentWorkflow +{ + [WorkflowRun] + public async Task> RunAsync(List orders) + { + var results = new List(); + foreach (var order in orders) + { + var result = await Workflow.ExecuteChildWorkflowAsync( + (ProcessOrderWorkflow wf) => wf.RunAsync(order), + new() + { + Id = $"order-{order.Id}", + // Control what happens to child when parent completes + // Terminate (default), Abandon, RequestCancel + ParentClosePolicy = ParentClosePolicy.Abandon, + }); + results.Add(result); + } + return results; + } +} +``` + +## Handles to External Workflows + +```csharp +[Workflow] +public class CoordinatorWorkflow +{ + [WorkflowRun] + public async Task RunAsync(string targetWorkflowId) + { + var handle = Workflow.GetExternalWorkflowHandle(targetWorkflowId); + + // Signal the external workflow + await handle.SignalAsync(wf => wf.DataReadyAsync(new DataPayload())); + + // Or cancel it + await handle.CancelAsync(); + } +} +``` + +## Parallel Execution + +```csharp +[Workflow] +public class ParallelWorkflow +{ + [WorkflowRun] + public async Task RunAsync(string[] items) + { + var tasks = items.Select(item => + Workflow.ExecuteActivityAsync( + (MyActivities a) => a.ProcessItem(item), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) })); + + return await Workflow.WhenAllAsync(tasks); + } +} +``` + +## Deterministic Task Alternatives + +.NET `Task` APIs often use `TaskScheduler.Default` implicitly. 
Use Temporal's deterministic alternatives: + +```csharp +// Instead of Task.WhenAll: +await Workflow.WhenAllAsync(task1, task2, task3); + +// Instead of Task.WhenAny: +await Workflow.WhenAnyAsync(task1, task2); + +// Instead of Task.Run: +await Workflow.RunTaskAsync(() => SomeWork()); + +// Instead of Task.Delay: +await Workflow.DelayAsync(TimeSpan.FromMinutes(5)); + +// Instead of System.Threading.Mutex: +var mutex = new Temporalio.Workflows.Mutex(); +await mutex.WaitOneAsync(); +try { /* critical section */ } +finally { mutex.ReleaseMutex(); } + +// Instead of System.Threading.Semaphore: +var semaphore = new Temporalio.Workflows.Semaphore(3); +await semaphore.WaitAsync(); +try { /* limited concurrency section */ } +finally { semaphore.Release(); } +``` + +## Continue-as-New + +```csharp +[Workflow] +public class LongRunningWorkflow +{ + [WorkflowRun] + public async Task RunAsync(WorkflowState state) + { + while (true) + { + state = await ProcessNextBatch(state); + + if (state.IsComplete) + return "done"; + + if (Workflow.ContinueAsNewSuggested) + throw Workflow.CreateContinueAsNewException( + (LongRunningWorkflow wf) => wf.RunAsync(state)); + } + } +} +``` + +## Saga Pattern (Compensations) + +**Important:** Compensation activities should be idempotent — they may be retried (as with ALL activities). + +```csharp +[Workflow] +public class OrderSagaWorkflow +{ + [WorkflowRun] + public async Task RunAsync(Order order) + { + var compensations = new List>(); + + try + { + // IMPORTANT: Save compensation BEFORE calling the activity. + // If activity fails after completing but before returning, + // compensation must still be registered. 
+ compensations.Add(() => Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.ReleaseInventoryIfReservedAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5), CancellationToken = CancellationToken.None })); + await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.ReserveInventoryAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + + compensations.Add(() => Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.RefundPaymentIfChargedAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5), CancellationToken = CancellationToken.None })); + await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.ChargePaymentAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + + await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.ShipOrderAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + + return "Order completed"; + } + catch (Exception ex) + { + Workflow.Logger.LogError(ex, "Order failed, running compensations"); + // Compensations pass CancellationToken.None so they still run if the workflow itself was cancelled. + compensations.Reverse(); + foreach (var compensate in compensations) + { + try { await compensate(); } + catch (Exception compErr) + { + Workflow.Logger.LogError(compErr, "Compensation failed"); + } + } + throw; + } + } +} +``` + +## Cancellation Handling (CancellationToken) + +.NET uses standard `CancellationToken` for workflow cancellation. + +```csharp +[Workflow] +public class CancellableWorkflow +{ + [WorkflowRun] + public async Task<string> RunAsync() + { + try + { + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.LongRunningAsync(), + new() { StartToCloseTimeout = TimeSpan.FromHours(1) }); + return "completed"; + } + catch (Exception e) when (TemporalException.IsCanceledException(e)) + { + // The "when" clause above is because we only want to apply the logic to cancellation, but + // this kind of cleanup could be done on any/all exceptions too. + Workflow.Logger.LogError(e, "Cancellation occurred, performing cleanup"); + + // Call cleanup activity.
If this throws, it will swallow the original exception which we + // are ok with here. This could be changed to just log a failure and let the original + // cancellation continue. + // The default token on Workflow.CancellationToken is now marked + // cancelled, so we pass a different one. We use CancellationToken.None here because the + // cleanup activity itself doesn't need to be cancellable; if it did (e.g. you want to + // cancel cleanup from a timeout or another signal), create a new detached + // CancellationTokenSource and pass its Token instead. + await Workflow.ExecuteActivityAsync( + (MyActivities a) => a.MyCancellationCleanupActivity(), + new() + { + ScheduleToCloseTimeout = TimeSpan.FromMinutes(5), + CancellationToken = CancellationToken.None, + }); + + // Rethrow the cancellation + throw; + } + } +} +``` + +## Wait Condition with Timeout + +```csharp +[Workflow] +public class ApprovalWorkflow +{ + private bool _approved; + + [WorkflowSignal] + public async Task ApproveAsync() => _approved = true; + + [WorkflowRun] + public async Task RunAsync() + { + // Wait for approval with 24-hour timeout + var gotApproval = await Workflow.WaitConditionAsync( + () => _approved, + TimeSpan.FromHours(24)); + + return gotApproval ? "approved" : "auto-rejected due to timeout"; + } +} +``` + +## Waiting for All Handlers to Finish + +Signal and update handlers should generally be non-async (avoid running activities from them). Otherwise, the workflow may complete before handlers finish their execution. However, making handlers non-async sometimes requires workarounds that add complexity. + +When async handlers are necessary, use `WaitConditionAsync(AllHandlersFinished)` at the end of your workflow (or before continue-as-new) to prevent completion until all pending handlers complete. + +```csharp +[Workflow] +public class HandlerAwareWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + // ... main workflow logic ... 
+ + // Before exiting, wait for all handlers to finish + await Workflow.WaitConditionAsync(() => Workflow.AllHandlersFinished); + return "done"; + } +} +``` + +## Activity Heartbeat Details + +### WHY: +- **Support activity cancellation** — Cancellations are delivered via heartbeat; activities that don't heartbeat won't know they've been cancelled +- **Resume progress after worker failure** — Heartbeat details persist across retries + +### WHEN: +- **Cancellable activities** — Any activity that should respond to cancellation +- **Long-running activities** — Track progress for resumability +- **Checkpointing** — Save progress periodically + +```csharp +[Activity] +public async Task ProcessLargeFileAsync(string filePath) +{ + var info = ActivityExecutionContext.Current.Info; + // Get heartbeat details from previous attempt (if any) + var startLine = info.HeartbeatDetails.Count > 0 + ? await info.HeartbeatDetailAtAsync(0) + : 0; + + var lines = await File.ReadAllLinesAsync(filePath); + for (var i = startLine; i < lines.Length; i++) + { + await ProcessLineAsync(lines[i]); + + // Heartbeat with progress + // If cancelled, CancellationToken will be triggered + ActivityExecutionContext.Current.Heartbeat(i + 1); + ActivityExecutionContext.Current.CancellationToken.ThrowIfCancellationRequested(); + } + + return "completed"; +} +``` + +## Timers + +```csharp +[Workflow] +public class TimerWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + await Workflow.DelayAsync(TimeSpan.FromHours(1)); + return "Timer fired"; + } +} +``` + +## Local Activities + +**Purpose**: Reduce latency for short, lightweight operations by skipping the task queue. ONLY use these when necessary for performance. Do NOT use these by default, as they are not durable and distributed. 
+ +```csharp +[Workflow] +public class LocalActivityWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + var result = await Workflow.ExecuteLocalActivityAsync( + (MyActivities a) => a.QuickLookup("key"), + new() { StartToCloseTimeout = TimeSpan.FromSeconds(5) }); + return result; + } +} +``` diff --git a/references/dotnet/testing.md b/references/dotnet/testing.md new file mode 100644 index 0000000..8bea410 --- /dev/null +++ b/references/dotnet/testing.md @@ -0,0 +1,176 @@ +# .NET SDK Testing + +## Overview + +You test Temporal .NET Workflows using the `Temporalio.Testing` namespace plus a normal .NET test framework. The .NET SDK is compatible with any testing framework; most samples use xUnit. The SDK provides `WorkflowEnvironment` for testing workflows in a local environment and `ActivityEnvironment` for isolated activity testing. + +## Test Environment Setup + +The core pattern is: + +1. Start a `WorkflowEnvironment` (`WorkflowEnvironment.StartLocalAsync()`). +2. Create a `TemporalWorker` in that environment with your Workflow and Activities registered. +3. Use the environment's client to execute the Workflow, using a fresh GUID for the task queue name and workflow ID. +4. Assert on the result or status. + +```csharp +using Temporalio.Testing; +using Temporalio.Worker; + +[Fact] +public async Task TestWorkflow() +{ + await using var env = await WorkflowEnvironment.StartLocalAsync(); + + using var worker = new TemporalWorker( + env.Client, + new TemporalWorkerOptions($"task-queue-{Guid.NewGuid()}") + .AddWorkflow() + .AddAllActivities(new MyActivities())); + + await worker.ExecuteAsync(async () => + { + var result = await env.Client.ExecuteWorkflowAsync( + (MyWorkflow wf) => wf.RunAsync("input"), + new(id: $"wf-{Guid.NewGuid()}", taskQueue: worker.Options.TaskQueue!)); + Assert.Equal("expected", result); + }); +} +``` + +Conveniently, the local `env` can be shared among tests, e.g. via a fixture class. 
+ +If your workflows / tests involve long durations (such as using Temporal timers / sleeps), then you can use the time-skipping environment, via `WorkflowEnvironment.StartTimeSkippingAsync()`. Only use time-skipping if you must. It is not thread safe and cannot be shared among tests. + +## Activity Mocking + +The .NET SDK provides a straightforward way to mock Activities. Create a mock function with the `[Activity]` attribute and specify the name of the original Activity you want to mock: + +```csharp +[Fact] +public async Task TestWithMockActivity() +{ + await using var env = await WorkflowEnvironment.StartLocalAsync(); + + [Activity("MyActivity")] + static Task MockMyActivity(string input) => + Task.FromResult($"mocked: {input}"); + + using var worker = new TemporalWorker( + env.Client, + new TemporalWorkerOptions($"task-queue-{Guid.NewGuid()}") + .AddWorkflow() + .AddActivity(MockMyActivity)); + + await worker.ExecuteAsync(async () => + { + var result = await env.Client.ExecuteWorkflowAsync( + (MyWorkflow wf) => wf.RunAsync("test"), + new(id: $"wf-{Guid.NewGuid()}", taskQueue: worker.Options.TaskQueue!)); + Assert.Equal("mocked: test", result); + }); +} +``` + +**Note:** If the original activity method name ends with `Async` and returns a `Task`, the default activity name has `Async` trimmed off. For example, `MyActivityAsync` has default name `MyActivity`. + +## Testing Signals and Queries + +```csharp +[Fact] +public async Task TestSignalsAndQueries() +{ + await using var env = await WorkflowEnvironment.StartLocalAsync(); + + using var worker = new TemporalWorker(/* ... 
*/); + + await worker.ExecuteAsync(async () => + { + var handle = await env.Client.StartWorkflowAsync( + (MyWorkflow wf) => wf.RunAsync(), + new(id: $"wf-{Guid.NewGuid()}", taskQueue: worker.Options.TaskQueue!)); + + // Send signal + await handle.SignalAsync(wf => wf.MySignalAsync("data")); + + // Query state + var status = await handle.QueryAsync(wf => wf.GetStatus()); + Assert.Equal("expected", status); + + // Wait for completion + var result = await handle.GetResultAsync(); + }); +} +``` + +## Testing Failure Cases + +```csharp +[Fact] +public async Task TestActivityFailureHandling() +{ + await using var env = await WorkflowEnvironment.StartLocalAsync(); + + [Activity("RiskyActivity")] + static Task MockFailingActivity() => + throw new ApplicationFailureException("Simulated failure", nonRetryable: true); + + using var worker = new TemporalWorker(/* ... with mock activity */); + + await worker.ExecuteAsync(async () => + { + var ex = await Assert.ThrowsAsync(() => + env.Client.ExecuteWorkflowAsync( + (MyWorkflow wf) => wf.RunAsync(), + new(id: $"wf-{Guid.NewGuid()}", taskQueue: worker.Options.TaskQueue!))); + }); +} +``` + +## Replay Testing + +```csharp +using Temporalio.Worker; + +[Fact] +public async Task TestReplay() +{ + var historyJson = await File.ReadAllTextAsync("example-history.json"); + var replayer = new WorkflowReplayer( + new WorkflowReplayerOptions() + .AddWorkflow()); + + await replayer.ReplayWorkflowAsync( + WorkflowHistory.FromJson("my-workflow-id", historyJson)); +} +``` + +## Activity Testing + +```csharp +using Temporalio.Testing; + +[Fact] +public async Task TestActivity() +{ + var env = new ActivityEnvironment(); + var activities = new MyActivities(); + var result = await env.RunAsync(() => activities.MyActivity("arg1")); + Assert.Equal("expected", result); +} +``` + +The `ActivityEnvironment` provides: +- `Info` — Activity info, defaulted to basic values +- `CancellationTokenSource` — Token source for issuing cancellation +- `Heartbeater` — 
Callback invoked each heartbeat +- `Logger` — Activity logger + +## Best Practices + +1. Use the `WorkflowEnvironment.StartLocalAsync` environment for most testing +2. Use time-skipping environment for workflows with durable timers / durable sleeps +3. Mock external dependencies in activities +4. Test replay compatibility, especially when changing workflow code +5. Test signal/query handlers explicitly +6. Use unique workflow IDs and task queues per test to avoid conflicts — `Guid.NewGuid()` is easiest diff --git a/references/dotnet/versioning.md b/references/dotnet/versioning.md new file mode 100644 index 0000000..677a64c --- /dev/null +++ b/references/dotnet/versioning.md @@ -0,0 +1,301 @@ +# .NET SDK Versioning + +For conceptual overview and guidance on choosing an approach, see `references/core/versioning.md`. + +## Patching API + +### The Patched() Method + +The `Workflow.Patched()` method checks whether a Workflow should run new or old code: + +```csharp +[Workflow] +public class ShippingWorkflow +{ + [WorkflowRun] + public async Task RunAsync() + { + if (Workflow.Patched("send-email-instead-of-fax")) + { + // New code path + await Workflow.ExecuteActivityAsync( + (ShippingActivities a) => a.SendEmailAsync(), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } + else + { + // Old code path (for replay of existing workflows) + await Workflow.ExecuteActivityAsync( + (ShippingActivities a) => a.SendFaxAsync(), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } + } +} +``` + +**How it works:** +- For new executions: `Patched()` returns `true` and records a marker in the Workflow history +- For replay with the marker: `Patched()` returns `true` (history includes this patch) +- For replay without the marker: `Patched()` returns `false` (history predates this patch) + +### Three-Step Patching Process + +**Warning:** Failing to follow this process correctly will result in non-determinism errors for in-flight workflows. 
+ +**Step 1: Patch in New Code** + +```csharp +[Workflow] +public class OrderWorkflow +{ + [WorkflowRun] + public async Task RunAsync(Order order) + { + if (Workflow.Patched("add-fraud-check")) + { + await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.CheckFraudAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(2) }); + } + + return await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.ProcessPaymentAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } +} +``` + +**Step 2: Deprecate the Patch** + +Once all pre-patch Workflow Executions have completed: + +```csharp +[Workflow] +public class OrderWorkflow +{ + [WorkflowRun] + public async Task RunAsync(Order order) + { + Workflow.DeprecatePatch("add-fraud-check"); + + await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.CheckFraudAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(2) }); + + return await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.ProcessPaymentAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } +} +``` + +**Step 3: Remove the Patch** + +After all workflows with the deprecated patch marker have completed, remove the `DeprecatePatch()` call entirely: + +```csharp +[Workflow] +public class OrderWorkflow +{ + [WorkflowRun] + public async Task RunAsync(Order order) + { + await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.CheckFraudAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(2) }); + + return await Workflow.ExecuteActivityAsync( + (OrderActivities a) => a.ProcessPaymentAsync(order), + new() { StartToCloseTimeout = TimeSpan.FromMinutes(5) }); + } +} +``` + +### Query Filters for Finding Workflows by Version + +Use List Filters to find workflows with specific patch versions: + +```bash +# Find running workflows with a specific patch +temporal workflow list --query \ + 'WorkflowType = "OrderWorkflow" AND ExecutionStatus = "Running" AND 
TemporalChangeVersion = "add-fraud-check"' + +# Find running workflows without any patch (pre-patch versions) +temporal workflow list --query \ + 'WorkflowType = "OrderWorkflow" AND ExecutionStatus = "Running" AND TemporalChangeVersion IS NULL' +``` + +## Workflow Type Versioning + +For incompatible changes, create a new Workflow Type instead of using patches: + +```csharp +[Workflow("PizzaWorkflow")] +public class PizzaWorkflow +{ + [WorkflowRun] + public async Task RunAsync(PizzaOrder order) + { + return await ProcessOrderV1Async(order); + } +} + +[Workflow("PizzaWorkflowV2")] +public class PizzaWorkflowV2 +{ + [WorkflowRun] + public async Task RunAsync(PizzaOrder order) + { + return await ProcessOrderV2Async(order); + } +} +``` + +Register both with the Worker: + +```csharp +var worker = new TemporalWorker( + client, + new TemporalWorkerOptions("pizza-task-queue") + .AddWorkflow<PizzaWorkflow>() + .AddWorkflow<PizzaWorkflowV2>() + .AddAllActivities(new PizzaActivities())); +``` + +Update client code to start new workflows with the new type: + +```csharp +// Old workflows continue on PizzaWorkflow +// New workflows use PizzaWorkflowV2 +var handle = await client.StartWorkflowAsync( + (PizzaWorkflowV2 wf) => wf.RunAsync(order), + new(id: $"pizza-{order.Id}", taskQueue: "pizza-task-queue")); +``` + +Check for open executions before removing the old type: + +```bash +temporal workflow list --query 'WorkflowType = "PizzaWorkflow" AND ExecutionStatus = "Running"' +``` + +## Worker Versioning + +Worker Versioning manages versions at the deployment level, allowing multiple Worker versions to run simultaneously. + +### Key Concepts + +**Worker Deployment**: A logical service grouping similar Workers together (e.g., "loan-processor"). All versions of your code live under this umbrella. + +**Worker Deployment Version**: A specific snapshot of your code identified by a deployment name and Build ID (e.g., "loan-processor:v1.0" or "loan-processor:abc123"). 
+ +### Configuring Workers for Versioning + +```csharp +using Temporalio.Worker; + +var worker = new TemporalWorker( + client, + new TemporalWorkerOptions("my-task-queue") + { + DeploymentOptions = new WorkerDeploymentOptions( + DeploymentName: "my-service", + BuildId: Environment.GetEnvironmentVariable("BUILD_ID") ?? "dev"), + UseWorkerVersioning = true, + } + .AddWorkflow() + .AddAllActivities(new MyActivities())); +``` + +**Configuration parameters:** +- `UseWorkerVersioning`: Enables Worker Versioning +- `DeploymentOptions`: Identifies the Worker Deployment Version (deployment name + build ID) +- Build ID: Typically a git commit hash, version number, or timestamp + +### PINNED vs AUTO_UPGRADE Behaviors + +**PINNED Behavior** + +Workflows stay locked to their original Worker version: + +```csharp +[Workflow(VersioningBehavior = VersioningBehavior.Pinned)] +public class StableWorkflow { /* ... */ } +``` + +**When to use PINNED:** +- Short-running workflows (minutes to hours) +- Consistency is critical (e.g., financial transactions) +- You want to eliminate version compatibility complexity +- Building new applications and want simplest development experience + +**AUTO_UPGRADE Behavior** + +Workflows can move to newer versions: + +```csharp +[Workflow(VersioningBehavior = VersioningBehavior.AutoUpgrade)] +public class UpgradableWorkflow { /* ... */ } +``` + +**When to use AUTO_UPGRADE:** +- Long-running workflows (weeks or months) +- Workflows need to benefit from bug fixes during execution +- Migrating from traditional rolling deployments +- You are already using patching APIs for version transitions + +**Important:** AUTO_UPGRADE workflows still need patching to handle version transitions safely since they can move between Worker versions. 
+ +### Worker Configuration with Default Behavior + +```csharp +var worker = new TemporalWorker( + client, + new TemporalWorkerOptions("my-task-queue") + { + DeploymentOptions = new WorkerDeploymentOptions( + DeploymentName: "order-service", + BuildId: Environment.GetEnvironmentVariable("BUILD_ID") ?? "dev") + { + DefaultVersioningBehavior = VersioningBehavior.Pinned, + }, + UseWorkerVersioning = true, + } + .AddWorkflow() + .AddAllActivities(new OrderActivities())); +``` + +### Deployment Strategies + +**Blue-Green Deployments** + +Maintain two environments and switch traffic between them: +1. Deploy new code to idle environment +2. Run tests and validation +3. Switch traffic to new environment +4. Keep old environment for instant rollback + +**Rainbow Deployments** + +Multiple versions run simultaneously: +- New workflows use latest version +- Existing workflows complete on their original version +- Add new versions alongside existing ones +- Gradually sunset old versions as workflows complete + +### Querying Workflows by Worker Version + +```bash +# Find workflows on a specific Worker version +temporal workflow list --query \ + 'TemporalWorkerDeploymentVersion = "my-service:v1.0.0" AND ExecutionStatus = "Running"' +``` + +## Best Practices + +1. **Check for open executions** before removing old code paths +2. **Use descriptive patch IDs** that explain the change (e.g., "add-fraud-check" not "patch-1") +3. **Deploy patches incrementally**: patch, deprecate, remove +4. **Use PINNED for short workflows** to simplify version management +5. **Use AUTO_UPGRADE with patching** for long-running workflows that need updates +6. **Generate Build IDs from code** (git hash) to ensure changes produce new versions +7. 
**Avoid rolling deployments** for high-availability services with long-running workflows diff --git a/references/go/determinism.md b/references/go/determinism.md index 0cff905..c8b52b9 100644 --- a/references/go/determinism.md +++ b/references/go/determinism.md @@ -8,9 +8,9 @@ The Go SDK has NO runtime sandbox (unlike Python/TypeScript). Workflows must be Temporal provides durable execution through **History Replay**. When a Worker restores workflow state, it re-executes workflow code from the beginning. This requires the code to be **deterministic**. See `references/core/determinism.md` for a deep explanation. -## Forbidden Operations +## Forbidden Operations in Workflows -Do not use any of the following in workflow code: +Do not use any of the following in workflow code (they are appropriate to use in activities): - **Native goroutines** (`go func()`) -- use `workflow.Go()` instead - **Native channels** (`chan`, send, receive, `range` over channel) -- use `workflow.Channel` instead diff --git a/references/java/advanced-features.md b/references/java/advanced-features.md index e897bb1..e736da2 100644 --- a/references/java/advanced-features.md +++ b/references/java/advanced-features.md @@ -77,6 +77,7 @@ public void completeApproval(String requestId, boolean approved) { ActivityCompletionClient completionClient = client.newActivityCompletionClient(); + // Retrieve the task token from external storage (e.g., database) byte[] taskToken = getTaskToken(requestId); if (approved) { diff --git a/references/java/determinism-protection.md b/references/java/determinism-protection.md index 78c4446..1894644 100644 --- a/references/java/determinism-protection.md +++ b/references/java/determinism-protection.md @@ -4,7 +4,9 @@ The Java SDK has **no sandbox** (only Python and TypeScript have sandboxing). Java relies on developer conventions and runtime replay detection to enforce determinism. A static analysis tool (`temporal-workflowcheck`) is available in beta. 
-## Forbidden Operations +## Forbidden Operations in Workflows + +The following are forbidden inside workflow code but are appropriate to use in activities. ```java // BAD: Non-deterministic operations in workflow code diff --git a/references/java/determinism.md b/references/java/determinism.md index 1981d00..29f25d5 100644 --- a/references/java/determinism.md +++ b/references/java/determinism.md @@ -14,7 +14,9 @@ Java workflow code runs in a cooperative threading model where only one workflow `temporal-workflowcheck` (static analysis, beta) and `WorkflowReplayer` (replay testing) can help uncover some violations, but they are not exhaustive — careful code review and adherence to the rules below remain essential. -## Forbidden Operations +## Forbidden Operations in Workflows + +The following are forbidden inside workflow code but are appropriate to use in activities. - `Thread.sleep()` — blocks the real thread, bypasses Temporal timers - `new Thread()` or thread pools — breaks the cooperative threading model diff --git a/references/java/error-handling.md b/references/java/error-handling.md index 97d4cea..753d69a 100644 --- a/references/java/error-handling.md +++ b/references/java/error-handling.md @@ -77,6 +77,7 @@ Activity failures are always wrapped in `ActivityFailure`. 
The original exceptio ```java import io.temporal.failure.ActivityFailure; import io.temporal.failure.ApplicationFailure; +import io.temporal.failure.CanceledFailure; import io.temporal.failure.TimeoutFailure; import io.temporal.workflow.Workflow; @@ -86,6 +87,10 @@ public class MyWorkflowImpl implements MyWorkflow { try { return activities.riskyOperation(); } catch (ActivityFailure af) { + // Let cancellation propagate so the workflow is canceled, not failed + if (af.getCause() instanceof CanceledFailure) { + throw af; + } if (af.getCause() instanceof ApplicationFailure) { ApplicationFailure appFailure = (ApplicationFailure) af.getCause(); String type = appFailure.getType(); diff --git a/references/python/advanced-features.md b/references/python/advanced-features.md index 3584a64..3d86e9f 100644 --- a/references/python/advanced-features.md +++ b/references/python/advanced-features.md @@ -62,6 +62,7 @@ async def request_approval(request_id: str) -> None: # Later, complete the activity from another process async def complete_approval(request_id: str, approved: bool): client = await Client.connect("localhost:7233", namespace="default") + # Retrieve the task token from external storage (e.g., database) task_token = await get_task_token(request_id) handle = client.get_async_activity_handle(task_token=task_token) diff --git a/references/python/determinism-protection.md b/references/python/determinism-protection.md index 3ff9543..2eba418 100644 --- a/references/python/determinism-protection.md +++ b/references/python/determinism-protection.md @@ -13,9 +13,9 @@ The sandbox: - Passes through standard library with restrictions - Reloads workflow files on each execution -## Forbidden Operations +## Forbidden Operations in Workflows -These operations will fail in the sandbox: +These operations are forbidden inside workflow code (appropriate in activities) and will fail in the sandbox: - **Direct I/O**: Network calls, file reads/writes - **Threading**: `threading` module 
operations diff --git a/references/python/determinism.md b/references/python/determinism.md index e1b53a7..2be8f75 100644 --- a/references/python/determinism.md +++ b/references/python/determinism.md @@ -8,7 +8,9 @@ The Python SDK runs workflows in a sandbox that provides automatic protection ag Temporal provides durable execution through **History Replay**. When a Worker needs to restore workflow state (after a crash, cache eviction, or to continue after a long timer), it re-executes the workflow code from the beginning, which requires the workflow code to be **deterministic**. -## Forbidden Operations +## Forbidden Operations in Workflows + +The following are forbidden inside workflow code but are appropriate to use in activities. - Direct I/O (network, filesystem) - Threading operations diff --git a/references/python/error-handling.md b/references/python/error-handling.md index 19460cb..ed9e69d 100644 --- a/references/python/error-handling.md +++ b/references/python/error-handling.md @@ -47,7 +47,7 @@ async def charge_card(input: ChargeCardInput) -> str: ```python from datetime import timedelta from temporalio import workflow -from temporalio.exceptions import ActivityError, ApplicationError +from temporalio.exceptions import ActivityError, ApplicationError, is_cancelled_exception @workflow.defn class MyWorkflow: @@ -59,6 +59,9 @@ class MyWorkflow: start_to_close_timeout=timedelta(minutes=5), ) except ActivityError as e: + # Let cancellation propagate so the workflow is canceled, not failed + if is_cancelled_exception(e): + raise workflow.logger.error(f"Activity failed: {e}") # Handle or re-raise raise ApplicationError("Workflow failed due to activity error") diff --git a/references/typescript/determinism.md b/references/typescript/determinism.md index 47f8948..dfd3464 100644 --- a/references/typescript/determinism.md +++ b/references/typescript/determinism.md @@ -28,7 +28,9 @@ The Temporal workflow sandbox will use the same random seed when replaying a wor See 
`references/typescript/determinism-protection.md` for more information about the sandbox. -## Forbidden Operations +## Forbidden Operations in Workflows + +The following are forbidden inside workflow code but are appropriate to use in activities. ```typescript // DO NOT do these in workflows: From 0843752be47f13ae82c4ed65fe57873e3ecb0b1d Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Fri, 17 Apr 2026 14:41:28 -0400 Subject: [PATCH 26/42] Run formatter over .NET (#87) --- references/core/determinism.md | 2 +- references/dotnet/data-handling.md | 1 + references/dotnet/determinism-protection.md | 2 ++ references/dotnet/dotnet.md | 9 +++++++++ references/dotnet/gotchas.md | 1 + references/dotnet/observability.md | 1 + references/dotnet/patterns.md | 2 ++ references/dotnet/testing.md | 1 + references/dotnet/versioning.md | 6 ++++++ 9 files changed, 24 insertions(+), 1 deletion(-) diff --git a/references/core/determinism.md b/references/core/determinism.md index f2439b4..004f879 100644 --- a/references/core/determinism.md +++ b/references/core/determinism.md @@ -88,7 +88,7 @@ Each Temporal SDK language provides a different level of protection against non- - TypeScript: The TypeScript SDK runs workflows in an isolated V8 sandbox, intercepting many common sources of non-determinism and replacing them automatically with deterministic variants. - Java: The Java SDK has no sandbox. Determinism is enforced by developer conventions — the SDK provides `Workflow.*` APIs as safe alternatives (e.g., `Workflow.sleep()` instead of `Thread.sleep()`), and non-determinism is only detected at replay time via `NonDeterministicException`. A static analysis tool (`temporal-workflowcheck`, beta) can catch violations at build time. Cooperative threading under a global lock eliminates the need for synchronization. - Go: The Go SDK has no runtime sandbox. Therefore, non-determinism bugs will never be immediately appararent, and are usually only observable during replay. 
The optional `workflowcheck` static analysis tool can be used to check for many sources of non-determinism at compile time. -- .NET: The .NET SDK has no sandbox. It uses a custom TaskScheduler and a runtime EventListener to detect invalid task scheduling. Developers must use Workflow.* safe alternatives (e.g., Workflow.DelayAsync instead of Task.Delay) and avoid non-deterministic .NET Task APIs. +- .NET: The .NET SDK has no sandbox. It uses a custom TaskScheduler and a runtime EventListener to detect invalid task scheduling. Developers must use `Workflow.*` safe alternatives (e.g., Workflow.DelayAsync instead of Task.Delay) and avoid non-deterministic .NET Task APIs. Regardless of which SDK you are using, it is your responsibility to ensure that workflow code does not contain sources of non-determinism. Use SDK-specific tools as well as replay tests for doing so. diff --git a/references/dotnet/data-handling.md b/references/dotnet/data-handling.md index fc8d308..8d0bb23 100644 --- a/references/dotnet/data-handling.md +++ b/references/dotnet/data-handling.md @@ -7,6 +7,7 @@ The .NET SDK uses data converters to serialize/deserialize workflow inputs, outp ## Default Data Converter The default converter handles: + - `null` - `byte[]` (as binary) - `Google.Protobuf.IMessage` instances diff --git a/references/dotnet/determinism-protection.md b/references/dotnet/determinism-protection.md index f9c480d..8c7f331 100644 --- a/references/dotnet/determinism-protection.md +++ b/references/dotnet/determinism-protection.md @@ -27,6 +27,7 @@ public class BadWorkflow Many .NET `Task` APIs implicitly use `TaskScheduler.Default`, which breaks determinism. Here are the key rules: **Do NOT use:** + - `Task.Run` — uses default scheduler. Use `Workflow.RunTaskAsync`. - `Task.ConfigureAwait(false)` — leaves current context. Use `ConfigureAwait(true)` or omit. - `Task.Delay` / `Task.Wait` / timeout-based `CancellationTokenSource` — uses system timers. 
Use `Workflow.DelayAsync` / `Workflow.WaitConditionAsync`. @@ -36,6 +37,7 @@ Many .NET `Task` APIs implicitly use `TaskScheduler.Default`, which breaks deter - `System.Threading.Semaphore` / `SemaphoreSlim` / `Mutex` — use `Temporalio.Workflows.Semaphore` / `Mutex`. **Be wary of:** + - Third-party libraries that implicitly use `TaskScheduler.Default` - `Dataflow` blocks and similar concurrency libraries with hidden default scheduler usage diff --git a/references/dotnet/dotnet.md b/references/dotnet/dotnet.md index 29b40aa..437fcbb 100644 --- a/references/dotnet/dotnet.md +++ b/references/dotnet/dotnet.md @@ -13,11 +13,13 @@ Temporal workflows are durable through history replay. For details on how this w ## Quick Start **Add Dependency:** Install the Temporal SDK NuGet package: + ```bash dotnet add package Temporalio ``` **Activities.cs** - Activity definitions (separate file for clarity): + ```csharp using Temporalio.Activities; @@ -32,6 +34,7 @@ public class MyActivities ``` **GreetingWorkflow.workflow.cs** - Workflow definition: + ```csharp using Temporalio.Workflows; @@ -49,6 +52,7 @@ public class GreetingWorkflow ``` **Worker (Program.cs)** - Worker setup: + ```csharp using Temporalio.Client; using Temporalio.Worker; @@ -69,6 +73,7 @@ await worker.ExecuteAsync(); **Start the worker:** Run `dotnet run` in the worker project. **Starter (Program.cs)** - Start a workflow execution: + ```csharp using Temporalio.Client; @@ -86,6 +91,7 @@ Console.WriteLine($"Result: {result}"); ## Key Concepts ### Workflow Definition + - Use `[Workflow]` attribute on class - Put any state initialization logic in the constructor of your workflow class to guarantee that it happens before signals/updates arrive. If your state initialization logic requires the workflow parameters, then add the `[WorkflowInit]` attribute and parameters to your constructor. 
- Use `[WorkflowRun]` on the async entry point method @@ -93,12 +99,14 @@ Console.WriteLine($"Result: {result}"); - Use `[WorkflowSignal]`, `[WorkflowQuery]`, `[WorkflowUpdate]` for handlers ### Activity Definition + - Use `[Activity]` attribute on methods - Can be sync or async - Instance methods support dependency injection - Static methods are also supported ### Worker Setup + - Connect client, create `TemporalWorker` with workflows and activities - Use `AddWorkflow()` and `AddAllActivities(instance)` or `AddActivity(method)` @@ -181,6 +189,7 @@ See `references/dotnet/testing.md` for info on writing tests. ## Additional Resources ### Reference Files + - **`references/dotnet/patterns.md`** — Signals, queries, child workflows, saga pattern, etc. - **`references/dotnet/determinism.md`** — Essentials of determinism in .NET - **`references/dotnet/gotchas.md`** — .NET-specific mistakes and anti-patterns diff --git a/references/dotnet/gotchas.md b/references/dotnet/gotchas.md index 05213c2..9b5806c 100644 --- a/references/dotnet/gotchas.md +++ b/references/dotnet/gotchas.md @@ -174,6 +174,7 @@ public class GoodWorkflow ### Not Handling Activity Cancellation Activities must **opt in** to receive cancellation. This requires: + 1. **Heartbeating** — Cancellation is delivered via heartbeat 2. **Checking the cancellation token** — Token is triggered when heartbeat detects cancellation diff --git a/references/dotnet/observability.md b/references/dotnet/observability.md index 1150f63..6919207 100644 --- a/references/dotnet/observability.md +++ b/references/dotnet/observability.md @@ -30,6 +30,7 @@ public class MyWorkflow ``` The workflow logger automatically: + - Suppresses duplicate logs during replay - Includes workflow context (workflow ID, run ID, etc.) 
diff --git a/references/dotnet/patterns.md b/references/dotnet/patterns.md index 19d3317..586fab0 100644 --- a/references/dotnet/patterns.md +++ b/references/dotnet/patterns.md @@ -425,10 +425,12 @@ public class HandlerAwareWorkflow ## Activity Heartbeat Details ### WHY: + - **Support activity cancellation** — Cancellations are delivered via heartbeat; activities that don't heartbeat won't know they've been cancelled - **Resume progress after worker failure** — Heartbeat details persist across retries ### WHEN: + - **Cancellable activities** — Any activity that should respond to cancellation - **Long-running activities** — Track progress for resumability - **Checkpointing** — Save progress periodically diff --git a/references/dotnet/testing.md b/references/dotnet/testing.md index 8bea410..d60805a 100644 --- a/references/dotnet/testing.md +++ b/references/dotnet/testing.md @@ -161,6 +161,7 @@ public async Task TestActivity() ``` The `ActivityEnvironment` provides: + - `Info` — Activity info, defaulted to basic values - `CancellationTokenSource` — Token source for issuing cancellation - `Heartbeater` — Callback invoked each heartbeat diff --git a/references/dotnet/versioning.md b/references/dotnet/versioning.md index 677a64c..6371926 100644 --- a/references/dotnet/versioning.md +++ b/references/dotnet/versioning.md @@ -34,6 +34,7 @@ public class ShippingWorkflow ``` **How it works:** + - For new executions: `Patched()` returns `true` and records a marker in the Workflow history - For replay with the marker: `Patched()` returns `true` (history includes this patch) - For replay without the marker: `Patched()` returns `false` (history predates this patch) @@ -207,6 +208,7 @@ var worker = new TemporalWorker( ``` **Configuration parameters:** + - `UseWorkerVersioning`: Enables Worker Versioning - `DeploymentOptions`: Identifies the Worker Deployment Version (deployment name + build ID) - Build ID: Typically a git commit hash, version number, or timestamp @@ -223,6 +225,7 
@@ public class StableWorkflow { /* ... */ } ``` **When to use PINNED:** + - Short-running workflows (minutes to hours) - Consistency is critical (e.g., financial transactions) - You want to eliminate version compatibility complexity @@ -238,6 +241,7 @@ public class UpgradableWorkflow { /* ... */ } ``` **When to use AUTO_UPGRADE:** + - Long-running workflows (weeks or months) - Workflows need to benefit from bug fixes during execution - Migrating from traditional rolling deployments @@ -269,6 +273,7 @@ var worker = new TemporalWorker( **Blue-Green Deployments** Maintain two environments and switch traffic between them: + 1. Deploy new code to idle environment 2. Run tests and validation 3. Switch traffic to new environment @@ -277,6 +282,7 @@ Maintain two environments and switch traffic between them: **Rainbow Deployments** Multiple versions run simultaneously: + - New workflows use latest version - Existing workflows complete on their original version - Add new versions alongside existing ones From d68413af30f1b3627356cde094075471804dfd8c Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Fri, 17 Apr 2026 15:03:44 -0400 Subject: [PATCH 27/42] Minor fixes to versioning.md (#64) * Edits to python versioning fixes * add missing workflow imports --- references/python/versioning.md | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/references/python/versioning.md b/references/python/versioning.md index 1daab78..c1ad39a 100644 --- a/references/python/versioning.md +++ b/references/python/versioning.md @@ -226,13 +226,13 @@ worker = Worker( Workflows stay locked to their original Worker version: ```python -from temporalio.workflow import VersioningBehavior +from temporalio import workflow +from temporalio.common import VersioningBehavior -@workflow.defn +@workflow.defn(versioning_behavior=VersioningBehavior.PINNED) class StableWorkflow: @workflow.run async def run(self) -> str: - # This workflow will always run on its assigned 
version return await workflow.execute_activity( process_order, start_to_close_timeout=timedelta(minutes=5), @@ -250,6 +250,20 @@ class StableWorkflow: Workflows can move to newer versions: +```python +from temporalio import workflow +from temporalio.common import VersioningBehavior + +@workflow.defn(versioning_behavior=VersioningBehavior.AUTO_UPGRADE) +class UpgradableWorkflow: + @workflow.run + async def run(self) -> str: + return await workflow.execute_activity( + process_order, + start_to_close_timeout=timedelta(minutes=5), + ) +``` + **When to use AUTO_UPGRADE:** - Long-running workflows (weeks or months) @@ -262,7 +276,6 @@ Workflows can move to newer versions: ### Worker Configuration with Default Behavior ```python -# For short-running workflows, prefer PINNED worker = Worker( client, task_queue="orders-task-queue", @@ -274,7 +287,7 @@ worker = Worker( build_id=os.environ["BUILD_ID"], ), use_worker_versioning=True, - # default_versioning_behavior=VersioningBehavior.PINNED, + default_versioning_behavior=VersioningBehavior.PINNED, ), ) ``` From 999af7884d9c2e425506d08730d07488caa1a68f Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Fri, 17 Apr 2026 15:07:01 -0400 Subject: [PATCH 28/42] Update version of temporal-developer skill to 0.3.0 (#88) --- SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SKILL.md b/SKILL.md index df322df..e4950df 100644 --- a/SKILL.md +++ b/SKILL.md @@ -1,7 +1,7 @@ --- name: temporal-developer description: Develop, debug, and manage Temporal applications across Python, TypeScript, Go, Java and .NET. Use when the user is building workflows, activities, or workers with a Temporal SDK, debugging issues like non-determinism errors, stuck workflows, or activity retries, using Temporal CLI, Temporal Server, or Temporal Cloud, or working with durable execution concepts like signals, queries, heartbeats, versioning, continue-as-new, child workflows, or saga patterns. 
-version: 0.2.0 +version: 0.3.0 --- # Skill: temporal-developer From b4fe783d421965a6368642428d17c0f52fe5f533 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Fri, 17 Apr 2026 17:22:11 -0400 Subject: [PATCH 29/42] Change to use the app tokens, so hopefully the sync workflow gets triggered correctly. (#89) --- .github/workflows/package-skill.yml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.github/workflows/package-skill.yml b/.github/workflows/package-skill.yml index 637deb2..64cb96e 100644 --- a/.github/workflows/package-skill.yml +++ b/.github/workflows/package-skill.yml @@ -1,5 +1,7 @@ # ABOUTME: GitHub Actions workflow that packages the skill for upload to Claude.ai. # ABOUTME: Creates a ZIP artifact on every push to main and a GitHub Release when the version in SKILL.md increases. +# ABOUTME: Releases are created using a GitHub App token so the release event can trigger downstream workflows +# ABOUTME: (events fired by the default GITHUB_TOKEN do not trigger other workflows). 
name: Package Skill @@ -12,9 +14,17 @@ jobs: package: runs-on: ubuntu-latest permissions: - contents: write + contents: read steps: + - name: Generate token from GitHub App + id: app-token + uses: actions/create-github-app-token@v3 + with: + app-id: ${{ secrets.SKILL_T_DEV_APP_ID }} + private-key: ${{ secrets.SKILL_T_DEV_KEY }} + owner: ${{ github.repository_owner }} + - name: Checkout uses: actions/checkout@v6 with: @@ -53,6 +63,7 @@ jobs: if: steps.tag_check.outputs.exists == 'false' uses: softprops/action-gh-release@v3 with: + token: ${{ steps.app-token.outputs.token }} tag_name: ${{ steps.version.outputs.tag }} name: ${{ steps.version.outputs.tag }} files: temporal-developer-skill.zip From 801a48d0665e10ac6a33634786e582ceda44e71a Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Fri, 17 Apr 2026 17:25:40 -0400 Subject: [PATCH 30/42] Bump version to 0.3.1 in SKILL.md (#90) --- SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SKILL.md b/SKILL.md index e4950df..1bea2e2 100644 --- a/SKILL.md +++ b/SKILL.md @@ -1,7 +1,7 @@ --- name: temporal-developer description: Develop, debug, and manage Temporal applications across Python, TypeScript, Go, Java and .NET. Use when the user is building workflows, activities, or workers with a Temporal SDK, debugging issues like non-determinism errors, stuck workflows, or activity retries, using Temporal CLI, Temporal Server, or Temporal Cloud, or working with durable execution concepts like signals, queries, heartbeats, versioning, continue-as-new, child workflows, or saga patterns. 
-version: 0.3.0 +version: 0.3.1 --- # Skill: temporal-developer From 127d400c67ba9a89593ba1e3ea08c0166595b47a Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Fri, 17 Apr 2026 21:45:08 -0400 Subject: [PATCH 31/42] merge into 1 workflow (#92) --- .github/workflows/package-skill.yml | 156 ++++++++++++++++++-- .github/workflows/sync-skill-to-plugins.yml | 144 ------------------ 2 files changed, 141 insertions(+), 159 deletions(-) delete mode 100644 .github/workflows/sync-skill-to-plugins.yml diff --git a/.github/workflows/package-skill.yml b/.github/workflows/package-skill.yml index 64cb96e..a9e0dc2 100644 --- a/.github/workflows/package-skill.yml +++ b/.github/workflows/package-skill.yml @@ -1,9 +1,12 @@ -# ABOUTME: GitHub Actions workflow that packages the skill for upload to Claude.ai. -# ABOUTME: Creates a ZIP artifact on every push to main and a GitHub Release when the version in SKILL.md increases. -# ABOUTME: Releases are created using a GitHub App token so the release event can trigger downstream workflows -# ABOUTME: (events fired by the default GITHUB_TOKEN do not trigger other workflows). +# ABOUTME: Packages the skill on every push to main (as a ZIP artifact) and, if the version in SKILL.md +# ABOUTME: has been bumped, creates a GitHub Release and syncs the skill contents to three plugin repos +# ABOUTME: (cursor-temporal-plugin, codex-temporal-plugin, claude-temporal-plugin) via PRs. +# ABOUTME: Required secrets (used only by the sync job for cross-repo PRs): +# ABOUTME: SKILL_T_DEV_APP_ID — the GitHub App's ID +# ABOUTME: SKILL_T_DEV_KEY — the GitHub App's private key +# ABOUTME: The app must be installed on the three plugin repos with Contents (write) and Pull Requests (write). 
-name: Package Skill +name: Package and Sync Skill on: push: @@ -14,17 +17,13 @@ jobs: package: runs-on: ubuntu-latest permissions: - contents: read + contents: write + outputs: + version: ${{ steps.version.outputs.version }} + tag: ${{ steps.version.outputs.tag }} + released: ${{ steps.tag_check.outputs.exists == 'false' }} steps: - - name: Generate token from GitHub App - id: app-token - uses: actions/create-github-app-token@v3 - with: - app-id: ${{ secrets.SKILL_T_DEV_APP_ID }} - private-key: ${{ secrets.SKILL_T_DEV_KEY }} - owner: ${{ github.repository_owner }} - - name: Checkout uses: actions/checkout@v6 with: @@ -63,8 +62,135 @@ jobs: if: steps.tag_check.outputs.exists == 'false' uses: softprops/action-gh-release@v3 with: - token: ${{ steps.app-token.outputs.token }} tag_name: ${{ steps.version.outputs.tag }} name: ${{ steps.version.outputs.tag }} files: temporal-developer-skill.zip generate_release_notes: true + + sync: + needs: package + if: needs.package.outputs.released == 'true' || github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + permissions: + contents: read + strategy: + fail-fast: false + matrix: + include: + - repo: temporalio/cursor-temporal-plugin + target_path: skills/temporal-developer + - repo: temporalio/codex-temporal-plugin + target_path: plugins/temporal-developer/skills/temporal-developer + - repo: temporalio/claude-temporal-plugin + target_path: skills/temporal-developer + + steps: + - name: Generate token from GitHub App + id: app-token + uses: actions/create-github-app-token@v3 + with: + app-id: ${{ secrets.SKILL_T_DEV_APP_ID }} + private-key: ${{ secrets.SKILL_T_DEV_KEY }} + owner: ${{ github.repository_owner }} + + - name: Checkout source + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Checkout target repo + uses: actions/checkout@v6 + with: + repository: ${{ matrix.repo }} + token: ${{ steps.app-token.outputs.token }} + path: target-repo + + - name: Sync skill contents + working-directory: 
target-repo + run: | + BRANCH="sync/temporal-developer-skill" + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + # Create or reset the sync branch based on current main. + # -B ensures the branch always starts from main's tip, even if a + # stale remote branch exists from a previously merged PR. + git checkout -B "$BRANCH" origin/main + + # Remove old contents and copy current + rm -rf "${{ matrix.target_path }}/SKILL.md" \ + "${{ matrix.target_path }}/references" + cp ../SKILL.md "${{ matrix.target_path }}/" + cp -r ../references "${{ matrix.target_path }}/" + + # Check for changes against main + git add "${{ matrix.target_path }}" + if git diff --cached --quiet; then + echo "no_changes=true" >> "$GITHUB_ENV" + echo "No changes to sync" + else + echo "no_changes=false" >> "$GITHUB_ENV" + version="${{ needs.package.outputs.tag }}" + git commit -m "sync temporal-developer skill ${version} from source repo" + git push --force origin "$BRANCH" + fi + + - name: Build changelog + if: env.no_changes == 'false' + env: + GH_TOKEN: ${{ github.token }} + run: | + tag="${{ needs.package.outputs.tag }}" + + # Prefer the release body (auto-generated notes). Fall back to git log + # if no release exists for this tag (e.g. manual re-sync of an older version). 
+ if body=$(gh release view "$tag" --repo "${{ github.repository }}" --json body --jq '.body' 2>/dev/null) && [ -n "$body" ]; then + echo "$body" > /tmp/changelog.md + else + prev_tag=$(git describe --tags --abbrev=0 HEAD^ 2>/dev/null || echo "") + if [ -n "$prev_tag" ]; then + git log --oneline "${prev_tag}..HEAD" > /tmp/changelog.md + else + git log --oneline -20 > /tmp/changelog.md + fi + fi + + - name: Create or update PR + if: env.no_changes == 'false' + working-directory: target-repo + env: + GH_TOKEN: ${{ steps.app-token.outputs.token }} + run: | + BRANCH="sync/temporal-developer-skill" + version="${{ needs.package.outputs.tag }}" + changelog=$(cat /tmp/changelog.md) + + # Check if a PR already exists from this branch + existing_pr=$(gh pr list --head "$BRANCH" --state open --json number --jq '.[0].number') + + if [ -n "$existing_pr" ]; then + echo "PR #${existing_pr} already exists — updated by the force-push" + gh pr edit "$existing_pr" \ + --title "Sync temporal-developer skill ${version}" \ + --body "Automated sync of the temporal-developer skill ${version} from [skill-temporal-developer](https://github.com/${{ github.repository }}). + + This PR was updated automatically by the [sync workflow](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}). + + ## Changelog + ${changelog}" + gh pr comment "$existing_pr" --body "Updated to ${version} from [skill-temporal-developer](https://github.com/${{ github.repository }})." + pr_url=$(gh pr view "$existing_pr" --json url --jq '.url') + echo "### ${{ matrix.repo }}" >> "$GITHUB_STEP_SUMMARY" + echo "Updated [PR #${existing_pr}](${pr_url})" >> "$GITHUB_STEP_SUMMARY" + else + pr_url=$(gh pr create \ + --title "Sync temporal-developer skill ${version}" \ + --body "Automated sync of the temporal-developer skill ${version} from [skill-temporal-developer](https://github.com/${{ github.repository }}). 
+ + This PR was created automatically by the [sync workflow](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}). + + ## Changelog + ${changelog}") + echo "### ${{ matrix.repo }}" >> "$GITHUB_STEP_SUMMARY" + echo "Created ${pr_url}" >> "$GITHUB_STEP_SUMMARY" + fi diff --git a/.github/workflows/sync-skill-to-plugins.yml b/.github/workflows/sync-skill-to-plugins.yml deleted file mode 100644 index 2fb8aac..0000000 --- a/.github/workflows/sync-skill-to-plugins.yml +++ /dev/null @@ -1,144 +0,0 @@ -# ABOUTME: GitHub Actions workflow that syncs skill contents to the cursor and codex plugin repos. -# ABOUTME: Triggers when a new release is created (by the package-skill workflow) or manually. -# ABOUTME: Creates or updates a PR in each target repo rather than pushing directly to main. -# ABOUTME: Uses a GitHub App for cross-repo authentication. Required secrets: -# ABOUTME: SKILL_T_DEV_APP_ID — the GitHub App's ID -# ABOUTME: SKILL_T_DEV_KEY — the GitHub App's private key -# ABOUTME: The app must be installed on all three repos with Contents (write) and -# ABOUTME: Pull Requests (write) permissions. 
- -name: Sync Skill to Plugin Repos - -on: - release: - types: [published] - workflow_dispatch: - -permissions: - contents: read - -jobs: - sync: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - include: - - repo: temporalio/cursor-temporal-plugin - target_path: skills/temporal-developer - - repo: temporalio/codex-temporal-plugin - target_path: plugins/temporal-developer/skills/temporal-developer - - repo: temporalio/claude-temporal-plugin - target_path: skills/temporal-developer - - steps: - - name: Generate token from GitHub App - id: app-token - uses: actions/create-github-app-token@v3 - with: - app-id: ${{ secrets.SKILL_T_DEV_APP_ID }} - private-key: ${{ secrets.SKILL_T_DEV_KEY }} - owner: ${{ github.repository_owner }} - - - name: Checkout source - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - - name: Checkout target repo - uses: actions/checkout@v6 - with: - repository: ${{ matrix.repo }} - token: ${{ steps.app-token.outputs.token }} - path: target-repo - - - name: Sync skill contents - working-directory: target-repo - run: | - BRANCH="sync/temporal-developer-skill" - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - - # Create or reset the sync branch based on current main. - # -B ensures the branch always starts from main's tip, even if a - # stale remote branch exists from a previously merged PR. 
- git checkout -B "$BRANCH" origin/main - - # Remove old contents and copy current - rm -rf "${{ matrix.target_path }}/SKILL.md" \ - "${{ matrix.target_path }}/references" - cp ../SKILL.md "${{ matrix.target_path }}/" - cp -r ../references "${{ matrix.target_path }}/" - - # Check for changes against main - git add "${{ matrix.target_path }}" - if git diff --cached --quiet; then - echo "no_changes=true" >> "$GITHUB_ENV" - echo "No changes to sync" - else - echo "no_changes=false" >> "$GITHUB_ENV" - version="${{ github.event.release.tag_name || 'manual' }}" - git commit -m "sync temporal-developer skill ${version} from source repo" - git push --force origin "$BRANCH" - fi - - - name: Build changelog - id: changelog - run: | - if [ "${{ github.event_name }}" = "release" ]; then - # Use the release body (auto-generated notes from package-skill) - changelog=$(cat <<'RELEASE_BODY' - ${{ github.event.release.body }} - RELEASE_BODY - ) - else - # Manual trigger: generate from git log since the previous tag - prev_tag=$(git describe --tags --abbrev=0 HEAD^ 2>/dev/null || echo "") - if [ -n "$prev_tag" ]; then - changelog=$(git log --oneline "${prev_tag}..HEAD") - else - changelog=$(git log --oneline -20) - fi - fi - # Write to a file to avoid shell quoting issues - echo "$changelog" > /tmp/changelog.md - - - name: Create or update PR - if: env.no_changes == 'false' - working-directory: target-repo - env: - GH_TOKEN: ${{ steps.app-token.outputs.token }} - run: | - BRANCH="sync/temporal-developer-skill" - version="${{ github.event.release.tag_name || 'manual' }}" - changelog=$(cat /tmp/changelog.md) - - # Check if a PR already exists from this branch - existing_pr=$(gh pr list --head "$BRANCH" --state open --json number --jq '.[0].number') - - if [ -n "$existing_pr" ]; then - echo "PR #${existing_pr} already exists — updated by the force-push" - gh pr edit "$existing_pr" \ - --title "Sync temporal-developer skill ${version}" \ - --body "Automated sync of the 
temporal-developer skill ${version} from [skill-temporal-developer](https://github.com/${{ github.repository }}). - - This PR was updated automatically by the [sync workflow](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}). - - ## Changelog - ${changelog}" - gh pr comment "$existing_pr" --body "Updated to ${version} from [skill-temporal-developer](https://github.com/${{ github.repository }})." - pr_url=$(gh pr view "$existing_pr" --json url --jq '.url') - echo "### ${{ matrix.repo }}" >> "$GITHUB_STEP_SUMMARY" - echo "Updated [PR #${existing_pr}](${pr_url})" >> "$GITHUB_STEP_SUMMARY" - else - pr_url=$(gh pr create \ - --title "Sync temporal-developer skill ${version}" \ - --body "Automated sync of the temporal-developer skill ${version} from [skill-temporal-developer](https://github.com/${{ github.repository }}). - - This PR was created automatically by the [sync workflow](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}). - - ## Changelog - ${changelog}") - echo "### ${{ matrix.repo }}" >> "$GITHUB_STEP_SUMMARY" - echo "Created ${pr_url}" >> "$GITHUB_STEP_SUMMARY" - fi From 73fc5f025942187b61b76b6b94f87bb9a59345dc Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Mon, 20 Apr 2026 09:28:44 -0400 Subject: [PATCH 32/42] Improve syncing PR changelogs: include changelog from previous versions that were unmerged (#93) --- .github/workflows/package-skill.yml | 40 +++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/.github/workflows/package-skill.yml b/.github/workflows/package-skill.yml index a9e0dc2..3bfe16f 100644 --- a/.github/workflows/package-skill.yml +++ b/.github/workflows/package-skill.yml @@ -140,19 +140,37 @@ jobs: env: GH_TOKEN: ${{ github.token }} run: | - tag="${{ needs.package.outputs.tag }}" + current_tag="${{ needs.package.outputs.tag }}" + + # Determine the base for the changelog: the version currently on the + # target repo's main branch. 
This represents what was last merged, so + # the changelog spans every release since then — correctly accumulating + # unmerged versions if a prior sync PR is still open. + # + # Read the old SKILL.md from git (it's been overwritten on disk by the + # sync step) via `git show origin/main:...`. + target_version=$(git -C target-repo show "origin/main:${{ matrix.target_path }}/SKILL.md" 2>/dev/null \ + | grep '^version:' | sed 's/version:[[:space:]]*//' || echo "") + + if [ -n "$target_version" ]; then + base_tag="v${target_version}" + else + base_tag="" + fi - # Prefer the release body (auto-generated notes). Fall back to git log - # if no release exists for this tag (e.g. manual re-sync of an older version). - if body=$(gh release view "$tag" --repo "${{ github.repository }}" --json body --jq '.body' 2>/dev/null) && [ -n "$body" ]; then - echo "$body" > /tmp/changelog.md + # Prefer GitHub's auto-generated notes for the range (nicely formatted + # with PR links and contributors). Fall back to git log if unavailable. 
+ if [ -n "$base_tag" ] && notes=$(gh api \ + --method POST \ + "/repos/${{ github.repository }}/releases/generate-notes" \ + -f tag_name="${current_tag}" \ + -f previous_tag_name="${base_tag}" \ + --jq '.body' 2>/dev/null) && [ -n "$notes" ]; then + echo "$notes" > /tmp/changelog.md + elif [ -n "$base_tag" ]; then + git log --oneline "${base_tag}..HEAD" > /tmp/changelog.md else - prev_tag=$(git describe --tags --abbrev=0 HEAD^ 2>/dev/null || echo "") - if [ -n "$prev_tag" ]; then - git log --oneline "${prev_tag}..HEAD" > /tmp/changelog.md - else - git log --oneline -20 > /tmp/changelog.md - fi + git log --oneline -20 > /tmp/changelog.md fi - name: Create or update PR From 150c73371e4f13a40d3c0839f133639ae4abf5b8 Mon Sep 17 00:00:00 2001 From: Patrick Dewey <57921252+ptdewey@users.noreply.github.com> Date: Mon, 20 Apr 2026 09:41:19 -0400 Subject: [PATCH 33/42] docs: update go observability reference with up-to-date logging approach (#59) --- references/go/observability.md | 45 ++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/references/go/observability.md b/references/go/observability.md index 23ad62f..a7867b3 100644 --- a/references/go/observability.md +++ b/references/go/observability.md @@ -62,43 +62,68 @@ logger.Info("Processing order") // includes orderId and customerId ## Customizing the Logger -Set a custom logger via `client.Options{Logger: myLogger}`. Implement the `log.Logger` interface (Debug, Info, Warn, Error methods). +The SDK ships a single built-in **`slog` adapter** (`log.NewStructuredLogger`) and considers `slog` (Go 1.21+) the universal bridge to other logging libraries.
-### Using slog (Go 1.21+) +### The `log.Logger` Interface + +```go +// go.temporal.io/sdk/log +type Logger interface { + Debug(msg string, keyvals ...interface{}) + Info(msg string, keyvals ...interface{}) + Warn(msg string, keyvals ...interface{}) + Error(msg string, keyvals ...interface{}) +} +``` + +Optional companion interfaces: `WithLogger` (adds `.With()`) and `WithSkipCallers` (fixes caller frames). + +### Using slog (Recommended) ```go import ( "log/slog" "os" - tlog "go.temporal.io/sdk/log" + "go.temporal.io/sdk/log" ) slogHandler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelDebug}) -logger := tlog.NewStructuredLogger(slog.New(slogHandler)) +logger := log.NewStructuredLogger(slog.New(slogHandler)) c, err := client.Dial(client.Options{ Logger: logger, }) ``` -### Using Third-Party Loggers (Logrus, Zap, etc.) +### Using slog as a Bridge to Third-Party Loggers -Use the [logur](https://github.com/logur/logur) adapter package: +Any third-party logger that can back an `slog.Handler` works with `log.NewStructuredLogger` — this includes zap, zerolog, logrus, and most modern Go logging libraries. The pattern is: create an `slog.Handler` from your logger, then wrap it with `log.NewStructuredLogger`. + +**Example with Zap:** ```go import ( - "github.com/sirupsen/logrus" - logrusadapter "logur.dev/adapter/logrus" - "logur.dev/logur" + "log/slog" + + "go.uber.org/zap" + "go.uber.org/zap/exp/zapslog" + "go.temporal.io/sdk/log" ) -logger := logur.LoggerToKV(logrusadapter.New(logrus.New())) +zapLogger, _ := zap.NewProduction() +handler := zapslog.NewHandler(zapLogger.Core()) +logger := log.NewStructuredLogger(slog.New(handler)) + c, err := client.Dial(client.Options{ Logger: logger, }) ``` +### Direct Adapter (Alternative) + +If you cannot use the slog bridge, you can implement the `log.Logger` interface directly. 
The Temporal samples repo has a ~60-line [zap adapter](https://github.com/temporalio/samples-go/blob/main/zapadapter/zap_adapter.go) that implements `Logger`, `WithLogger`, and `WithSkipCallers` and can be copied into your project. + ## Metrics Use the Tally library (`go.temporal.io/sdk/contrib/tally`) with Prometheus: From c98e05a6fc31578476b34b84f5e5c7550d177520 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Mon, 20 Apr 2026 09:42:34 -0400 Subject: [PATCH 34/42] Update version of temporal-developer skill to 0.3.2 (#94) --- SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SKILL.md b/SKILL.md index 1bea2e2..9f12f6c 100644 --- a/SKILL.md +++ b/SKILL.md @@ -1,7 +1,7 @@ --- name: temporal-developer description: Develop, debug, and manage Temporal applications across Python, TypeScript, Go, Java and .NET. Use when the user is building workflows, activities, or workers with a Temporal SDK, debugging issues like non-determinism errors, stuck workflows, or activity retries, using Temporal CLI, Temporal Server, or Temporal Cloud, or working with durable execution concepts like signals, queries, heartbeats, versioning, continue-as-new, child workflows, or saga patterns. 
-version: 0.3.1 +version: 0.3.2 --- # Skill: temporal-developer From 62957d1e5683d29d245ffda8eb65f67b8288ccf7 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Mon, 20 Apr 2026 10:36:44 -0400 Subject: [PATCH 35/42] Fix permission for "reading" changelogs (#95) --- .github/workflows/package-skill.yml | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/.github/workflows/package-skill.yml b/.github/workflows/package-skill.yml index 3bfe16f..f2b89e3 100644 --- a/.github/workflows/package-skill.yml +++ b/.github/workflows/package-skill.yml @@ -72,7 +72,9 @@ jobs: if: needs.package.outputs.released == 'true' || github.event_name == 'workflow_dispatch' runs-on: ubuntu-latest permissions: - contents: read + # contents: write is required by the POST /releases/generate-notes endpoint, + # even though it only returns text and doesn't actually write anything. + contents: write strategy: fail-fast: false matrix: @@ -160,16 +162,22 @@ jobs: # Prefer GitHub's auto-generated notes for the range (nicely formatted # with PR links and contributors). Fall back to git log if unavailable. 
- if [ -n "$base_tag" ] && notes=$(gh api \ - --method POST \ - "/repos/${{ github.repository }}/releases/generate-notes" \ - -f tag_name="${current_tag}" \ - -f previous_tag_name="${base_tag}" \ - --jq '.body' 2>/dev/null) && [ -n "$notes" ]; then - echo "$notes" > /tmp/changelog.md - elif [ -n "$base_tag" ]; then - git log --oneline "${base_tag}..HEAD" > /tmp/changelog.md + echo "Base tag: ${base_tag:-} / Current tag: ${current_tag}" + if [ -n "$base_tag" ]; then + if notes=$(gh api \ + --method POST \ + "/repos/${{ github.repository }}/releases/generate-notes" \ + -f tag_name="${current_tag}" \ + -f previous_tag_name="${base_tag}" \ + --jq '.body') && [ -n "$notes" ]; then + echo "Using auto-generated release notes" + echo "$notes" > /tmp/changelog.md + else + echo "generate-notes API call failed or empty; falling back to git log" + git log --oneline "${base_tag}..HEAD" > /tmp/changelog.md + fi else + echo "No base tag found; using last 20 commits" git log --oneline -20 > /tmp/changelog.md fi From cf347103633038c30c097c5d8e167256a8094b0a Mon Sep 17 00:00:00 2001 From: Amir Benvenisti <128422269+starfleeth@users.noreply.github.com> Date: Tue, 21 Apr 2026 10:28:24 -0700 Subject: [PATCH 36/42] Update README to reflect new plugin packaging (#96) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update README to reflect new plugin packaging Remove reference to the old agent-skills mono-plugin repo. Link to the per-harness plugin repos (Claude Code, Cursor, Codex) and present the standalone installation options after the plugin links. Co-Authored-By: Claude Opus 4.6 * Mark .NET SDK as supported .NET support has landed — move from 🚧 to ✅. 
Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 --- README.md | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 8f2a0a5..a0ae73b 100644 --- a/README.md +++ b/README.md @@ -8,24 +8,26 @@ A comprehensive skill for developers to use when building [Temporal](https://tem ## Installation -### As a Claude Code Plugin +### As a Plugin -This skill is housed within a [Claude Code plugin](https://github.com/temporalio/agent-skills), which provides a simple way to install and receive future updates to the skill. +This skill is packaged as a plugin for major coding agents, which provides a simple way to install and receive future updates: -1. Run `/plugin marketplace add temporalio/agent-skills` -2. Run `/plugin` to open the plugin manager -3. Select **Marketplaces** -4. Choose `temporal-marketplace` from the list -5. Select **Enable auto-update** or **Disable auto-update** -6. run `/plugin install temporal-developer@temporalio-agent-skills` -7. Restart Claude Code +- **Claude Code**: [temporalio/claude-temporal-plugin](https://github.com/temporalio/claude-temporal-plugin) +- **Cursor**: [temporalio/cursor-temporal-plugin](https://github.com/temporalio/cursor-temporal-plugin) +- **OpenAI Codex**: [temporalio/codex-temporal-plugin](https://github.com/temporalio/codex-temporal-plugin) -### Via `npx skills` - supports all major coding agents +See each repo's README for installation instructions. + +### Standalone Installation + +If you prefer to install the skill directly without the plugin wrapper: + +#### Via `npx skills` — supports all major coding agents 1. `npx skills add temporalio/skill-temporal-developer` 2. Follow prompts -### Via manually cloning the skill repo: +#### Via manually cloning the skill repo 1. 
`mkdir -p ~/.claude/skills && git clone https://github.com/temporalio/skill-temporal-developer ~/.claude/skills/temporal-developer` @@ -37,6 +39,6 @@ Appropriately adjust the installation directory based on your coding agent. - [x] TypeScript ✅ - [x] Go ✅ - [x] Java ✅ -- [ ] .NET 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/39)) +- [x] .NET ✅ - [ ] Ruby 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/41)) - [ ] PHP 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/40)) From 2a1d8b394ff321286900c9c39faff2e011d80937 Mon Sep 17 00:00:00 2001 From: Maksim Mesilov Date: Sat, 25 Apr 2026 16:29:33 +0600 Subject: [PATCH 37/42] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a0ae73b..e607dcf 100644 --- a/README.md +++ b/README.md @@ -41,4 +41,4 @@ Appropriately adjust the installation directory based on your coding agent. - [x] Java ✅ - [x] .NET ✅ - [ ] Ruby 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/41)) -- [ ] PHP 🚧 ([PR](https://github.com/temporalio/skill-temporal-developer/pull/40)) +- [x] PHP ✅ From 26955d9cce215a7662e77622810842471a6851cb Mon Sep 17 00:00:00 2001 From: Amir Benvenisti <128422269+starfleeth@users.noreply.github.com> Date: Wed, 29 Apr 2026 09:23:58 -0700 Subject: [PATCH 38/42] Add Task Queue Priority and Fairness reference (#98) Add a core reference file covering Priority (keys 1-5, inheritance, when to use) and Fairness (keys, weights, virtual queues, round-robin dispatch, rate limiting, weight overrides, enabling, limitations) with SDK examples for Go, Java, Python, TypeScript, and .NET. Content is grounded in the official Temporal docs. Update SKILL.md to proactively surface Fairness when developers are building multi-tenant applications, before they run into tenant starvation problems. 
Co-authored-by: Claude Opus 4.6 --- SKILL.md | 8 + references/core/priority-fairness.md | 340 +++++++++++++++++++++++++++ 2 files changed, 348 insertions(+) create mode 100644 references/core/priority-fairness.md diff --git a/SKILL.md b/SKILL.md index 9f12f6c..fe6650c 100644 --- a/SKILL.md +++ b/SKILL.md @@ -101,6 +101,14 @@ Once you've downloaded the file, extract the downloaded archive and add the temp - **`references/core/ai-patterns.md`** - AI/LLM pattern concepts - Language-specific info at `references/{your_language}/ai-patterns.md`, if available. Currently Python only. +## Task Queue Priority and Fairness + +If the developer is building a **multi-tenant application**, proactively recommend Task Queue Fairness. Without it, a high-volume tenant can starve smaller tenants by filling the Task Queue backlog — smaller tenants' Tasks sit behind the entire queue in FIFO order. Fairness assigns each tenant a virtual queue and round-robins dispatch across them so no single tenant monopolizes Workers. + +Priority and Fairness also apply to tiered workloads (batch vs. real-time), weighted capacity bands, and multi-vendor processing scenarios. + +- **`references/core/priority-fairness.md`** - Priority keys, fairness keys and weights, rate limiting, SDK examples, and limitations + ## Additional Topics - **`references/{your_language}/observability.md`** - See for language-specific implementation guidance on observability in Temporal diff --git a/references/core/priority-fairness.md b/references/core/priority-fairness.md new file mode 100644 index 0000000..cb6930e --- /dev/null +++ b/references/core/priority-fairness.md @@ -0,0 +1,340 @@ +# Task Queue Priority and Fairness + +## Overview + +Priority and Fairness control how Tasks are distributed within a Task Queue. Priority determines execution order. Fairness prevents one group of Tasks from starving others. They can be used independently or together. + +Both features are in Public Preview. Priority is free. 
Fairness is a paid feature in Temporal Cloud. + +## Priority + +Priority lets you control execution order within a single Task Queue by assigning a priority key (integer 1-5, lower = higher priority). Each priority level acts as a sub-queue. All priority-1 Tasks dispatch before priority-2, and so on. Tasks at the same priority level dispatch in FIFO order. + +Default priority is 3. Activities inherit their parent workflow's priority unless explicitly overridden. + +### When to use Priority + +Use Priority to differentiate execution order between types of work sharing a single Task Queue and Worker pool. For example, process payment-related Tasks before less time-sensitive inventory management Tasks, or ensure real-time Tasks run ahead of batch Tasks. You can also use it to run urgent Tasks immediately by assigning them priority 1. + +### CLI + +``` +temporal workflow start \ + --type ChargeCustomer \ + --task-queue my-task-queue \ + --workflow-id my-workflow-id \ + --input '{"customerId":"12345"}' \ + --priority-key 1 +``` + +### Go + +```go +workflowOptions := client.StartWorkflowOptions{ + ID: "my-workflow-id", + TaskQueue: "my-task-queue", + Priority: temporal.Priority{PriorityKey: 1}, +} +we, err := c.ExecuteWorkflow(context.Background(), workflowOptions, MyWorkflow) +``` + +### Java + +```java +WorkflowOptions options = WorkflowOptions.newBuilder() + .setTaskQueue("my-task-queue") + .setPriority(Priority.newBuilder().setPriorityKey(1).build()) + .build(); +``` + +### Python + +```python +await client.start_workflow( + MyWorkflow.run, + args=["hello"], + id="my-workflow-id", + task_queue="my-task-queue", + priority=Priority(priority_key=1), +) +``` + +### TypeScript + +```ts +const handle = await startWorkflow(workflows.myWorkflow, { + args: [false, 1], + priority: { priorityKey: 1 }, +}); +``` + +### .NET + +```csharp +var handle = await Client.StartWorkflowAsync( + (MyWorkflow wf) => wf.RunAsync("hello"), + new StartWorkflowOptions(id: "my-workflow-id",
taskQueue: "my-task-queue") + { + Priority = new Priority(1), + } +); +``` + +## Fairness + +Fairness prevents one group of Tasks from monopolizing Worker capacity. Each fairness key creates a "virtual queue" within the Task Queue. The server uses round-robin dispatch across virtual queues so no single key can block others, even with a much larger backlog. + +### When to use Fairness + +Fairness solves the multi-tenant starvation problem. Without it, Tasks dispatch FIFO: if tenant-big enqueues 100k Tasks, tenant-small's 10 Tasks sit behind the entire backlog. With Fairness, each tenant gets its own virtual queue and Tasks are interleaved. + +Common scenarios: + +- **Multi-tenant applications** where large tenants should not block small ones. +- **Tiered capacity bands** where you want weighted distribution (e.g., 80% premium, 20% free) without limiting overall throughput when one band is empty. +- **Batch jobs** where some jobs run far more frequently than others. +- **Multi-vendor processing** where a few vendors generate the majority of work. + +If all your Tasks can be dispatched immediately (no backlog), you don't need Fairness. + +Fairness applies at Task dispatch time and considers each Task as having equal cost until dispatch. It does not account for Tasks currently being processed by Workers. So if you look at Tasks being processed by Workers, you might not see "fairness" across tenants — for example, if tenant-big already has Tasks being processed when tenant-small's Tasks are dispatched, it may still appear that tenant-big is using the most resources. + +### Fairness keys and weights + +A fairness key is a string, typically a tenant ID or workload category. Each unique key creates a virtual queue. + +A fairness weight (float, default 1.0) controls how often a key's Tasks are dispatched relative to others. A key with weight 2.0 dispatches twice as often as keys with weight 1.0. 
+ +Example with three tiers: + +| Fairness Key | Weight | Share of Dispatches | +|----------------|--------|---------------------| +| premium-tier | 5.0 | 50% | +| basic-tier | 3.0 | 30% | +| free-tier | 2.0 | 20% | + +Tasks without a fairness key are grouped under an implicit empty-string key with weight 1.0. Adoption is incremental: unkeyed Tasks participate in round-robin alongside keyed Tasks. + +### Using Fairness with Priority + +When combined, Priority determines which sub-queue Tasks go into (priority 1 before 2, etc.), and Fairness applies within each priority level. + +### SDK examples + +#### CLI + +``` +temporal workflow start \ + --type ChargeCustomer \ + --task-queue my-task-queue \ + --workflow-id my-workflow-id \ + --input '{"customerId":"12345"}' \ + --priority-key 1 \ + --fairness-key tenant-123 \ + --fairness-weight 2.0 +``` + +#### Go + +```go +workflowOptions := client.StartWorkflowOptions{ + ID: "my-workflow-id", + TaskQueue: "my-task-queue", + Priority: temporal.Priority{ + PriorityKey: 1, + FairnessKey: "tenant-123", + FairnessWeight: 2.0, + }, +} +we, err := c.ExecuteWorkflow(context.Background(), workflowOptions, MyWorkflow) +``` + +Activities: + +```go +ao := workflow.ActivityOptions{ + StartToCloseTimeout: time.Minute, + Priority: temporal.Priority{ + PriorityKey: 1, + FairnessKey: "tenant-123", + FairnessWeight: 2.0, + }, +} +ctx := workflow.WithActivityOptions(ctx, ao) +err := workflow.ExecuteActivity(ctx, MyActivity).Get(ctx, nil) +``` + +#### Java + +```java +WorkflowOptions options = WorkflowOptions.newBuilder() + .setTaskQueue("my-task-queue") + .setPriority(Priority.newBuilder() + .setPriorityKey(1) + .setFairnessKey("tenant-123") + .setFairnessWeight(2.0) + .build()) + .build(); +``` + +#### Python + +```python +await client.start_workflow( + MyWorkflow.run, + args=["hello"], + id="my-workflow-id", + task_queue="my-task-queue", + priority=Priority(priority_key=1, fairness_key="tenant-123", fairness_weight=2.0), +) +``` +
+Activities: + +```python +await workflow.execute_activity( + say_hello, + "hi", + priority=Priority(priority_key=1, fairness_key="tenant-123", fairness_weight=2.0), + start_to_close_timeout=timedelta(seconds=5), +) +``` + +#### TypeScript + +```ts +const handle = await startWorkflow(workflows.myWorkflow, { + args: [false, 1], + priority: { priorityKey: 1, fairnessKey: 'tenant-123', fairnessWeight: 2.0 }, +}); +``` + +#### .NET + +```csharp +var handle = await Client.StartWorkflowAsync( + (MyWorkflow wf) => wf.RunAsync("hello"), + new StartWorkflowOptions(id: "my-workflow-id", taskQueue: "my-task-queue") + { + Priority = new Priority( + priorityKey: 1, + fairnessKey: "tenant-123", + fairnessWeight: 2.0 + ) + } +); +``` + +#### Child Workflows + +Child workflows can set their own priority and fairness, overriding the parent. + +Go: + +```go +cwo := workflow.ChildWorkflowOptions{ + WorkflowID: "child-workflow-id", + TaskQueue: "child-task-queue", + Priority: temporal.Priority{ + PriorityKey: 1, + FairnessKey: "tenant-123", + FairnessWeight: 2.0, + }, +} +ctx := workflow.WithChildOptions(ctx, cwo) +err := workflow.ExecuteChildWorkflow(ctx, MyChildWorkflow).Get(ctx, nil) +``` + +Java: + +```java +ChildWorkflowOptions childOptions = ChildWorkflowOptions.newBuilder() + .setTaskQueue("child-task-queue") + .setWorkflowId("child-workflow-id") + .setPriority(Priority.newBuilder() + .setPriorityKey(1) + .setFairnessKey("tenant-123") + .setFairnessWeight(2.0) + .build()) + .build(); +MyChildWorkflow child = Workflow.newChildWorkflowStub(MyChildWorkflow.class, childOptions); +child.run(); +``` + +Python: + +```python +await workflow.execute_child_workflow( + MyChildWorkflow.run, + args=["hello child"], + priority=Priority(priority_key=1, fairness_key="tenant-123", fairness_weight=2.0), +) +``` + +TypeScript: + +```ts +const handle = await startChildWorkflow(workflows.myChildWorkflow, { + args: [false, 1], + priority: { priorityKey: 1, fairnessKey: 'tenant-123', fairnessWeight:
2.0 }, +}); +``` + +.NET: + +```csharp +await Workflow.ExecuteChildWorkflowAsync( + (MyChildWorkflow wf) => wf.RunAsync("hello child"), + new() { + Priority = new( + priorityKey: 1, + fairnessKey: "tenant-123", + fairnessWeight: 2.0 + ) + } +); +``` + +### Rate limiting + +Two rate-limiting controls work alongside Fairness: + +- **`queue-rps-limit`** — overall dispatch rate for the entire Task Queue. +- **`fairness-key-rps-limit-default`** — per-key rate limit, scaled by weight. If the default is 10 rps and a key has weight 2.5, that key's effective limit is 25 rps. + +``` +temporal task-queue config set \ + --task-queue my-task-queue \ + --task-queue-type activity \ + --namespace my-namespace \ + --queue-rps-limit 500 \ + --queue-rps-limit-reason "overall limit" \ + --fairness-key-rps-limit-default 33.3 \ + --fairness-key-rps-limit-reason "per-key limit" +``` + +If both limits are set, the more restrictive one applies. + +### Fairness weight overrides + +You can override the weights of up to 1000 keys through the config API. When an override is set for a key, the SDK-supplied weight is ignored. Overrides are per Task Queue and type (workflow vs. activity), so set them for both if needed. + +### Enabling Fairness + +When you start using fairness keys, it switches your active Task Queues to fairness mode. Existing queued Tasks are processed before any new fairness-mode ones. + +**Temporal Cloud**: automatically enabled when you start using fairness keys. + +**Self-hosted**: set these dynamic config flags to `true`: + +- `matching.useNewMatcher` +- `matching.enableFairness` +- `matching.enableMigration` (to drain existing backlogs after enabling) + +### Limitations + +- Accuracy can degrade with a very large number of distinct fairness keys. +- Task Queue partitioning can interfere with fairness distribution. Contact Temporal Support to set a Task Queue to a single partition if needed. +- Weights apply at schedule time, not dispatch time. 
Changing a weight does not reorder already-backlogged Tasks. +- Fairness is not guaranteed across different Worker versions when using Worker Versioning. +- After server restarts, less-active keys may briefly dispatch new Tasks ahead of their existing backlog until ordering normalizes. From 8d05f55a5d511bdfb018fde411f053456aac434d Mon Sep 17 00:00:00 2001 From: Devin Smaldore Date: Thu, 30 Apr 2026 13:21:48 -0400 Subject: [PATCH 39/42] Add Spring Boot integration reference for Java SDK (#74) Covers auto-discovery mechanics, annotation layering (@WorkflowImpl vs @ActivityImpl + @Component), WorkflowClient injection, worker lifecycle, testing strategies, and Spring-specific gotchas. Updates java.md and testing.md with pointers to the new reference. Co-authored-by: Donald Pinckney --- references/java/java.md | 3 + references/java/spring-boot.md | 287 +++++++++++++++++++++++++++++++++ references/java/testing.md | 71 ++++++++ 3 files changed, 361 insertions(+) create mode 100644 references/java/spring-boot.md diff --git a/references/java/java.md b/references/java/java.md index b260424..a0ba272 100644 --- a/references/java/java.md +++ b/references/java/java.md @@ -193,6 +193,8 @@ public class Starter { - `Worker` -- polls a single Task Queue, register workflows and activities on it - Call `factory.start()` to begin polling +For Spring Boot apps, `temporal-spring-boot-starter` handles all of the above automatically via auto-configuration. See `references/java/spring-boot.md`. + ## File Organization Best Practice **Keep Workflow and Activity definitions in separate files.** Separating them is good practice for clarity and maintainability. @@ -252,6 +254,7 @@ See `references/java/testing.md` for info on writing tests. ### Reference Files +- **`references/java/spring-boot.md`** - Spring Boot integration: auto-discovery, dependency injection, worker lifecycle, testing - **`references/java/patterns.md`** - Signals, queries, child workflows, saga pattern, etc. 
- **`references/java/determinism.md`** - Determinism rules and safe alternatives for Java - **`references/java/gotchas.md`** - Java-specific mistakes and anti-patterns diff --git a/references/java/spring-boot.md b/references/java/spring-boot.md new file mode 100644 index 0000000..ceaaaec --- /dev/null +++ b/references/java/spring-boot.md @@ -0,0 +1,287 @@ +# Temporal Spring Boot Integration + +## Overview + +`temporal-spring-boot-starter` auto-configures workers, registers workflow/activity implementations, and exposes `WorkflowClient` as a Spring bean. This eliminates the manual `WorkflowServiceStubs` → `WorkflowClient` → `WorkerFactory` setup required without Spring. + +## Dependency Setup + +Maven: +```xml + + io.temporal + temporal-spring-boot-starter + [1.0,) + +``` + +Gradle: +```groovy +implementation 'io.temporal:temporal-spring-boot-starter:1.+' +``` + +The starter transitively includes `temporal-sdk` and the autoconfigure module. You can declare both `temporal-sdk` and `temporal-spring-boot-starter` explicitly, but the starter alone is sufficient. + +## Minimal Configuration + +`application.properties`: +```properties +spring.temporal.connection.target=local +spring.temporal.start-workers=true +spring.temporal.workersAutoDiscovery.packages=greetingapp +``` + +`application.yml` equivalent: +```yaml +spring: + temporal: + connection: + target: local # shorthand for localhost:7233 + start-workers: true + workersAutoDiscovery: + packages: + - greetingapp + workers: + - task-queue: greeting-queue + name: greeting-worker +``` + +For self-hosted Temporal, replace `local` with the server address: +```properties +spring.temporal.connection.target=temporal.internal:7233 +``` + +## Interface Design + Spring Annotation Layering + +The key concept: Temporal SDK annotations go on **interfaces**, Spring Boot autoconfigure annotations go on **implementation classes**. This is identical to non-Spring usage at the interface level. 
+ +### Workflow Interface (unchanged from non-Spring) +```java +package greetingapp; + +import io.temporal.workflow.WorkflowInterface; +import io.temporal.workflow.WorkflowMethod; + +@WorkflowInterface +public interface GreetingWorkflow { + @WorkflowMethod + String greet(String name); +} +``` + +### Workflow Implementation +```java +package greetingapp; + +import io.temporal.activity.ActivityOptions; +import io.temporal.spring.boot.WorkflowImpl; +import io.temporal.workflow.Workflow; + +import java.time.Duration; + +// @WorkflowImpl replaces manual worker.registerWorkflowImplementationTypes() +// No @Component — workflows are NOT Spring beans; Temporal creates a new instance per execution +@WorkflowImpl(taskQueues = "greeting-queue") +public class GreetingWorkflowImpl implements GreetingWorkflow { + + // Activity stubs created via Workflow.newActivityStub() as usual + private final GreetActivities activities = Workflow.newActivityStub( + GreetActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(30)) + .setTaskQueue("greeting-queue") + .build() + ); + + @Override + public String greet(String name) { + return activities.greet(name); + } +} +``` + +### Activity Interface (unchanged from non-Spring) +```java +package greetingapp; + +import io.temporal.activity.ActivityInterface; +import io.temporal.activity.ActivityMethod; + +@ActivityInterface +public interface GreetActivities { + @ActivityMethod + String greet(String name); +} +``` + +### Activity Implementation +```java +package greetingapp; + +import io.temporal.spring.boot.ActivityImpl; +import org.springframework.stereotype.Component; + +// @Component makes this a Spring bean — dependencies can be injected normally +// @ActivityImpl replaces manual worker.registerActivitiesImplementations() +@Component +@ActivityImpl(taskQueues = "greeting-queue") +public class GreetActivitiesImpl implements GreetActivities { + + private final GreetingService greetingService; + + // 
Constructor injection works because this is a Spring bean + public GreetActivitiesImpl(GreetingService greetingService) { + this.greetingService = greetingService; + } + + @Override + public String greet(String name) { + return greetingService.composeGreeting(name); + } +} +``` + +## Auto-Discovery + +Auto-discovery is how the autoconfigure finds and registers implementations without explicit configuration. It requires **both** of the following: + +1. `@WorkflowImpl(taskQueues = "...")` or `@ActivityImpl(taskQueues = "...")` on the implementation class +2. `spring.temporal.workersAutoDiscovery.packages` pointing to a package that contains those classes + +Missing either one results in silent non-registration — no error, nothing polls the task queue. + +The `taskQueues` attribute routes implementations to the right worker when multiple task queues exist. A worker configured with task queue `"greeting-queue"` only picks up implementations annotated with `taskQueues = "greeting-queue"`. + +**Important:** `@ActivityImpl(taskQueues = "greeting-queue")` only registers the activity bean with that worker. It does not route individual activity task executions. Inside the workflow, `ActivityOptions.setTaskQueue("greeting-queue")` must also be set on the activity stub to route activity tasks to the correct queue. + +### Comparison: Auto-Discovery vs Explicit YAML Registration + +Auto-discovery via annotations: +```properties +spring.temporal.workersAutoDiscovery.packages=greetingapp +``` +```java +@Component +@ActivityImpl(taskQueues = "greeting-queue") +public class GreetActivitiesImpl implements GreetActivities { ... } +``` + +Explicit YAML registration (alternative): +```yaml +spring: + temporal: + workers: + - task-queue: greeting-queue + name: greeting-worker + activity-beans: + - greetActivitiesImpl + workflow-classes: + - greetingapp.GreetingWorkflowImpl +``` + +Use auto-discovery when implementations are colocated in a single package tree (most apps). 
Use explicit YAML when you need fine-grained control, want to exclude specific classes, or are registering beans defined elsewhere. + +## WorkflowClient Injection + +`WorkflowClient` is automatically registered as a Spring bean by the autoconfigure. Inject it into any `@Service` or `@RestController`: + +```java +package greetingapp; + +import io.temporal.client.WorkflowClient; +import io.temporal.client.WorkflowOptions; +import org.springframework.stereotype.Service; + +import java.util.UUID; + +@Service +public class GreetingStarter { + + private final WorkflowClient client; + + public GreetingStarter(WorkflowClient client) { + this.client = client; + } + + public String startGreeting(String name) { + var stub = client.newWorkflowStub( + GreetingWorkflow.class, + WorkflowOptions.newBuilder() + .setWorkflowId(UUID.randomUUID().toString()) + .setTaskQueue("greeting-queue") // must match the worker's task queue + .build() + ); + // Synchronous — blocks until workflow completes + return stub.greet(name); + } + + public void startGreetingAsync(String name) { + var stub = client.newWorkflowStub( + GreetingWorkflow.class, + WorkflowOptions.newBuilder() + .setWorkflowId(UUID.randomUUID().toString()) + .setTaskQueue("greeting-queue") + .build() + ); + // Fire-and-forget — returns immediately + WorkflowClient.start(stub::greet, name); + } +} +``` + +## Worker Lifecycle + +Workers start on `ApplicationReadyEvent` — after the full Spring context is initialized (DB migrations run, all beans wired). This means activity beans are fully ready before any workflow tasks are processed. + +To run a client-only app (one that submits workflows but does not execute them): +```properties +spring.temporal.start-workers=false +``` + +## Testing Strategies + +See `references/java/testing.md` for full details on both approaches. 
+ +**Spring integration tests** — use an embedded Temporal test server wired into the Spring context: +```properties +# src/test/resources/application-test.properties +spring.temporal.test-server.enabled=true +``` +```java +@SpringBootTest +@ActiveProfiles("test") +class GreetingIntegrationTest { + @Autowired WorkflowClient client; // points at the embedded test server + + @Test + void testWorkflowThroughSpringContext() { ... } +} +``` + +**Unit tests without Spring** — use `TestWorkflowEnvironment` or `TestWorkflowExtension` directly. No Spring context, faster startup, full time-skipping support: +```java +@RegisterExtension +static final TestWorkflowExtension testWorkflow = TestWorkflowExtension.newBuilder() + .setWorkflowTypes(GreetingWorkflowImpl.class) + .setDoNotStart(true) + .build(); +``` + +Do not mix approaches in the same test class — choose one or the other. + +## Spring-Specific Gotchas + +**Workflow impls must not have `@Component`** +Temporal creates a new workflow instance per execution via `beanFactory.createBean()` (not `getBean()`). Adding `@Component` means Spring also registers it as a singleton bean, which can cause confusing lifecycle behavior. Leave `@WorkflowImpl` classes as plain classes with no Spring annotations. + +**Activity beans are Spring singletons** +Temporal may invoke activity methods concurrently across many workflow executions. Keep activity implementations stateless — no mutable instance fields. Use injected services (which are themselves stateless or thread-safe) for all state. + +**`@WorkflowImpl` / `@ActivityImpl` without `workersAutoDiscovery.packages` → silently ignored** +This is the most common setup mistake. If auto-discovery packages are not configured, the annotations are never scanned and nothing registers with the worker. Verify with the Temporal UI that the worker is registering the expected workflow/activity types.
+ +**`ActivityOptions.setTaskQueue(...)` is required on activity stubs** +`@ActivityImpl(taskQueues = "greeting-queue")` registers the activity bean with the worker — it does not set the default task queue for activity execution. Inside workflow code, always set `.setTaskQueue(...)` in `ActivityOptions` to explicitly route activity tasks to the correct worker. + +**Multiple `DataConverter` beans** +If you define more than one `DataConverter` bean (e.g., a custom JSON converter and a default), the autoconfigure fails with an ambiguity error. Name one of them `mainDataConverter` to designate it as the primary. diff --git a/references/java/testing.md b/references/java/testing.md index 80ed9b2..b46db29 100644 --- a/references/java/testing.md +++ b/references/java/testing.md @@ -182,3 +182,74 @@ For activities that use `Activity.getExecutionContext()` or heartbeating, use `T 4. Test replay compatibility when changing workflow code (see `references/java/determinism.md`) 5. Test signal/query handlers explicitly 6. Use unique task queues per test to avoid conflicts (handled automatically by `TestWorkflowExtension`) + +## Spring Boot Testing + +Two strategies — choose one per test class, do not mix them. 
+ +### Embedded test server in Spring context + +For full integration tests that exercise the Spring context (DB, beans, config): + +```properties +# src/test/resources/application-test.properties +spring.temporal.test-server.enabled=true +``` + +```java +@SpringBootTest +@ActiveProfiles("test") +class TeeTimeMonitorIntegrationTest { + + @Autowired + WorkflowClient client; // auto-configured to point at the embedded test server + + @Test + void testWorkflow() { + var stub = client.newWorkflowStub( + TeeTimeMonitorWorkflow.class, + WorkflowOptions.newBuilder() + .setWorkflowId("test-" + UUID.randomUUID()) + .setTaskQueue("golfnow") + .build() + ); + var result = stub.monitorTeeTimes(new TTMonitorRequest(...)); + assertNotNull(result); + } +} +``` + +The embedded server does not support time-skipping. Use this when you need Spring beans (real DB, email service, etc.) wired alongside Temporal. + +### Unit tests without Spring context + +For faster, isolated tests with time-skipping support, use `TestWorkflowExtension` or `TestWorkflowEnvironment` directly. 
No Spring context starts, so activity dependencies must be provided manually (real instances or Mockito mocks): + +```java +public class TeeTimeMonitorWorkflowTest { + + @RegisterExtension + static final TestWorkflowExtension testWorkflow = TestWorkflowExtension.newBuilder() + .setWorkflowTypes(TeeTimeMonitorWorkflowImpl.class) + .setDoNotStart(true) + .build(); + + @Test + void testWorkflow(TestWorkflowEnvironment env, Worker worker, WorkflowClient client) { + GolfNowActivities activities = mock(GolfNowActivities.class, withSettings().withoutAnnotations()); + when(activities.searchTeeTimes(any())).thenReturn(List.of()); + + worker.registerActivitiesImplementations(activities); + env.start(); + + var stub = client.newWorkflowStub( + TeeTimeMonitorWorkflow.class, + WorkflowOptions.newBuilder().setTaskQueue(worker.getTaskQueue()).build() + ); + stub.monitorTeeTimes(new TTMonitorRequest(...)); + verify(activities).searchTeeTimes(any()); + } +} +``` + +See the sections above for more detail on mocking, signals/queries, and replay testing. 
From 0e7a3125f861eca091f0c6465c3c204e693fc3e5 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Thu, 30 Apr 2026 16:06:59 -0400 Subject: [PATCH 40/42] Small tweak to worker setup wording (#100) --- references/dotnet/dotnet.md | 2 +- references/go/go.md | 2 +- references/java/java.md | 2 +- references/python/python.md | 2 +- references/typescript/typescript.md | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/references/dotnet/dotnet.md b/references/dotnet/dotnet.md index 437fcbb..a7f1c54 100644 --- a/references/dotnet/dotnet.md +++ b/references/dotnet/dotnet.md @@ -51,7 +51,7 @@ public class GreetingWorkflow } ``` -**Worker (Program.cs)** - Worker setup: +**Worker (Program.cs)** - Worker setup (registers activity and workflow, runs indefinitely and processes tasks): ```csharp using Temporalio.Client; diff --git a/references/go/go.md b/references/go/go.md index 546e1b1..6c42bed 100644 --- a/references/go/go.md +++ b/references/go/go.md @@ -55,7 +55,7 @@ func (a *Activities) Greet(ctx context.Context, name string) (string, error) { } ``` -**worker/main.go** - Worker setup: +**worker/main.go** - Worker setup (registers activity and workflow, runs indefinitely and processes tasks): ```go package main diff --git a/references/java/java.md b/references/java/java.md index a0ba272..2adfc6d 100644 --- a/references/java/java.md +++ b/references/java/java.md @@ -96,7 +96,7 @@ public class GreetingWorkflowImpl implements GreetingWorkflow { } ``` -**GreetingWorker.java** - Worker setup: +**GreetingWorker.java** - Worker setup (registers activity and workflow, runs indefinitely and processes tasks): ```java package greetingapp; diff --git a/references/python/python.md b/references/python/python.md index bc0a0f3..d3c0e9c 100644 --- a/references/python/python.md +++ b/references/python/python.md @@ -36,7 +36,7 @@ class GreetingWorkflow: ) ``` -**worker.py** - Worker setup (imports activity and workflow, runs indefinitely and processes tasks): +**worker.py** - 
Worker setup (registers activity and workflow, runs indefinitely and processes tasks): ```python import asyncio diff --git a/references/typescript/typescript.md b/references/typescript/typescript.md index 9e125cb..96fc089 100644 --- a/references/typescript/typescript.md +++ b/references/typescript/typescript.md @@ -43,7 +43,7 @@ export async function greetingWorkflow(name: string): Promise { } ``` -**worker.ts** - Worker setup (imports activities and workflows, runs indefinitely): +**worker.ts** - Worker setup (registers activity and workflow, runs indefinitely and processes tasks): ```typescript import { Worker } from '@temporalio/worker'; From 6495e5927f0cb93fc0c16c09ad93628df98a3095 Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Thu, 30 Apr 2026 16:44:30 -0400 Subject: [PATCH 41/42] Cleanup and correct language of workflow initializers (#101) --- references/dotnet/advanced-features.md | 6 +++--- references/java/advanced-features.md | 24 ++++++++++++++++++++++++ references/python/advanced-features.md | 8 +++++--- 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/references/dotnet/advanced-features.md b/references/dotnet/advanced-features.md index fd0f81e..dd844d0 100644 --- a/references/dotnet/advanced-features.md +++ b/references/dotnet/advanced-features.md @@ -96,9 +96,9 @@ var worker = new TemporalWorker( ## Workflow Init Attribute -Use `[WorkflowInit]` on a constructor to run initialization code when a workflow is first created. +You should always put state initialization logic in the constructor of your workflow class, so that it happens before signals/updates arrive. -**Purpose:** Execute some setup code before signal/update happens or run is invoked. +Normally, your constructor must have no arguments. 
However, if you add the `[WorkflowInit]` attribute, then your constructor instead receives the same workflow arguments that `[WorkflowRun]` receives: ```csharp [Workflow] @@ -122,7 +122,7 @@ public class MyWorkflow } ``` -Constructor and `[WorkflowRun]` method must have the same parameters with the same types. You cannot make blocking calls (activities, sleeps, etc.) from the constructor. +Constructor (with `[WorkflowInit]`) and `[WorkflowRun]` method must have the same parameters with the same types. You cannot make blocking calls (activities, sleeps, etc.) from the constructor. ## Workflow Failure Exception Types diff --git a/references/java/advanced-features.md b/references/java/advanced-features.md index e736da2..9db730c 100644 --- a/references/java/advanced-features.md +++ b/references/java/advanced-features.md @@ -116,6 +116,30 @@ worker.registerActivitiesImplementations(new MyActivitiesImpl()); factory.start(); ``` +## Workflow Init Annotation + +You should always put state initialization logic in the constructor of your workflow class, so that it happens before signals/updates arrive. + +Normally, your constructor must have no arguments. However, if you add the `@WorkflowInit` annotation, then your constructor instead receives the same workflow arguments that `run` receives: + +```java +public class MyWorkflowImpl implements MyWorkflow { + private final int foo; + + @WorkflowInit + public MyWorkflowImpl(ClusterManagerInput input) { + foo = 1234; + } + + @Override + public ClusterManagerResult run(ClusterManagerInput input) { + // this.foo is already initialized + } +} +``` + +Constructor (with `@WorkflowInit`) and `run` method must have the same parameters with the same types. You cannot make blocking calls (activities, sleeps, etc.) from the constructor. + ## Workflow Failure Exception Types Control which exceptions cause workflow failures vs workflow task failures.
diff --git a/references/python/advanced-features.md b/references/python/advanced-features.md index 3d86e9f..c5ec1b3 100644 --- a/references/python/advanced-features.md +++ b/references/python/advanced-features.md @@ -116,9 +116,9 @@ worker = Worker( ## Workflow Init Decorator -Use `@workflow.init` to run initialization code when a workflow is first created. +You should always put state initialization logic in the `__init__` of your workflow class, so that it happens before signals/updates arrive. -**Purpose:** Execute some setup code before signal/update happens or run is invoked. +Normally, your `__init__` must have no arguments. However, if you add the `@workflow.init` decorator, then your `__init__` instead receives the same workflow arguments that `@workflow.run` receives: ```python @workflow.defn @@ -130,11 +130,13 @@ class MyWorkflow: self._items: list[str] = [] @workflow.run - async def run(self) -> str: + async def run(self, initial_value: str) -> str: # self._value and self._items are already initialized return self._value ``` +`__init__` (with `@workflow.init`) and `@workflow.run` must have the same parameters with the same types. You cannot make blocking calls (activities, sleeps, etc.) from the `__init__`. + ## Workflow Failure Exception Types Control which exceptions cause workflow task failures vs workflow failures. From bd513b1f11840d99eb793b2af29bf0c83ee0ae7e Mon Sep 17 00:00:00 2001 From: Donald Pinckney Date: Thu, 30 Apr 2026 16:52:10 -0400 Subject: [PATCH 42/42] Separate out CLI installation instructions to file (#102) --- SKILL.md | 26 +------------------------- references/core/install_cli.md | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 25 deletions(-) create mode 100644 references/core/install_cli.md diff --git a/SKILL.md b/SKILL.md index fe6650c..ff770cd 100644 --- a/SKILL.md +++ b/SKILL.md @@ -48,31 +48,7 @@ See `references/core/determinism.md` for detailed explanation. 
### Ensure Temporal CLI is installed -Check if `temporal` CLI is installed. If not, follow these instructions: - -#### macOS - -``` -brew install temporal -``` - -#### Linux - -Check your machine's architecture and download the appropriate archive: - -- [Linux amd64](https://temporal.download/cli/archive/latest?platform=linux&arch=amd64) -- [Linux arm64](https://temporal.download/cli/archive/latest?platform=linux&arch=arm64) - -Once you've downloaded the file, extract the downloaded archive and add the temporal binary to your PATH by copying it to a directory like /usr/local/bin - -#### Windows - -Check your machine's architecture and download the appropriate archive: - -- [Windows amd64](https://temporal.download/cli/archive/latest?platform=windows&arch=amd64) -- [Windows arm64](https://temporal.download/cli/archive/latest?platform=windows&arch=arm64) - -Once you've downloaded the file, extract the downloaded archive and add the temporal.exe binary to your PATH. +Check if `temporal` CLI is installed. If not, follow the instructions at `references/core/install_cli.md` to install it for your platform. 
### Read All Relevant References diff --git a/references/core/install_cli.md b/references/core/install_cli.md new file mode 100644 index 0000000..4421172 --- /dev/null +++ b/references/core/install_cli.md @@ -0,0 +1,25 @@ +# How to install Temporal CLI + +## macOS + +``` +brew install temporal +``` + +## Linux + +Check your machine's architecture and download the appropriate archive: + +- [Linux amd64](https://temporal.download/cli/archive/latest?platform=linux&arch=amd64) +- [Linux arm64](https://temporal.download/cli/archive/latest?platform=linux&arch=arm64) + +Once you've downloaded the file, extract the downloaded archive and add the temporal binary to your PATH by copying it to a directory like `/usr/local/bin`. + +## Windows + +Check your machine's architecture and download the appropriate archive: + +- [Windows amd64](https://temporal.download/cli/archive/latest?platform=windows&arch=amd64) +- [Windows arm64](https://temporal.download/cli/archive/latest?platform=windows&arch=arm64) + +Once you've downloaded the file, extract the downloaded archive and add the temporal.exe binary to your PATH. \ No newline at end of file