diff --git a/docs/design-patterns/activity-dependency-injection.mdx b/docs/design-patterns/activity-dependency-injection.mdx new file mode 100644 index 0000000000..448c5f7283 --- /dev/null +++ b/docs/design-patterns/activity-dependency-injection.mdx @@ -0,0 +1,500 @@ +--- +id: activity-dependency-injection +title: "Activity Dependency Injection" +sidebar_label: "Activity Dependency Injection" +description: "Injects external dependencies into Activities at Worker startup, keeping Workflow code deterministic and Activities testable." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +The Activity Dependency Injection pattern separates the creation of external dependencies (database connections, API clients, configuration) from Activity business logic by injecting them at Worker startup. +This approach keeps Workflow code deterministic, makes Activities testable in isolation, and ensures expensive resources are initialized once per Worker process rather than once per Activity execution. + +## Problem + +Activities often need access to external resources such as database connection pools, HTTP clients, third-party API credentials, or shared caches. +Without a structured approach, you face several challenges: + +- **Reinitializing resources per execution.** Creating a new database connection or API client on every Activity invocation wastes resources and increases latency. +- **Hardcoded dependencies.** Embedding connection logic directly inside Activity functions couples business logic to infrastructure, making it difficult to swap implementations across environments. +- **Difficult testing.** When Activities construct their own dependencies internally, you cannot substitute test doubles without modifying production code. +- **Non-determinism risk.** Passing dependencies directly into Workflow code breaks Temporal's determinism guarantees, because dependency state can change between replays. 
+ +## Solution + +You define Activities as methods on a struct or class that holds dependencies as fields. At Worker startup, you instantiate the struct or class with real implementations and register it with the Worker. The Workflow references Activity methods without knowing about the underlying dependencies. + +```mermaid +flowchart LR + subgraph Worker Startup + D[Dependencies<br/>DB, API Client, Config] --> S[Activity Struct / Class] + S --> R[Register with Worker] + end + + subgraph Workflow Execution + W[Workflow] -->|"execute activity"| A[Activity Method] + A -->|"uses"| S + end + + subgraph Testing + M[Mock / Stub] --> T[Test Environment] + T -->|"execute activity"| A2[Activity Method] + end +``` + +The following describes each path in the diagram: + +1. At Worker startup, you create dependency instances (database pools, API clients) and inject them into an Activity struct or class, which you then register with the Worker. +2. During Workflow execution, the Workflow calls Activity methods by reference. The Temporal runtime routes the call to the registered instance on the Worker, where the method accesses the injected dependencies. +3. During testing, you substitute mock or stub implementations into the same Activity struct or class, allowing you to verify behavior without external services. + +## Implementation + +### Define Activities with dependencies + +Define Activities as methods on a struct or class that accepts dependencies through its constructor or fields. Each method acts as a separate Activity Type. 
+ + + + +```go +// activities.go +package payment + +import ( + "context" + + "go.temporal.io/sdk/activity" +) + +type Activities struct { + DBClient DBClient + EmailClient EmailClient +} + +func (a *Activities) ChargeCustomer(ctx context.Context, orderID string, amount int) (string, error) { + logger := activity.GetLogger(ctx) + logger.Info("Charging customer", "orderID", orderID, "amount", amount) + + receiptID, err := a.DBClient.ProcessPayment(orderID, amount) + if err != nil { + return "", err + } + + return receiptID, nil +} + +func (a *Activities) SendReceipt(ctx context.Context, email string, receiptID string) error { + return a.EmailClient.Send(email, "Payment Receipt", receiptID) +} +``` + + + + +```python +# activities.py +from dataclasses import dataclass +from temporalio import activity + + +@dataclass +class PaymentActivities: + db_client: DBClient + email_client: EmailClient + + @activity.defn + async def charge_customer(self, order_id: str, amount: int) -> str: + activity.logger.info( + "Charging customer", extra={"order_id": order_id, "amount": amount} + ) + receipt_id = await self.db_client.process_payment(order_id, amount) + return receipt_id + + @activity.defn + async def send_receipt(self, email: str, receipt_id: str) -> None: + await self.email_client.send(email, "Payment Receipt", receipt_id) +``` + + + + +```java +// PaymentActivities.java +@ActivityInterface +public interface PaymentActivities { + String chargeCustomer(String orderId, int amount); + void sendReceipt(String email, String receiptId); +} + +// PaymentActivitiesImpl.java +public class PaymentActivitiesImpl implements PaymentActivities { + private final DBClient dbClient; + private final EmailClient emailClient; + + public PaymentActivitiesImpl(DBClient dbClient, EmailClient emailClient) { + this.dbClient = dbClient; + this.emailClient = emailClient; + } + + @Override + public String chargeCustomer(String orderId, int amount) { + return dbClient.processPayment(orderId, amount); + 
+ } + + @Override + public void sendReceipt(String email, String receiptId) { + emailClient.send(email, "Payment Receipt", receiptId); + } +} +``` + + + + +```typescript +// activities.ts +export interface DB { + processPayment(orderId: string, amount: number): Promise<string>; +} + +export interface EmailClient { + send(to: string, subject: string, body: string): Promise<void>; +} + +export const createActivities = (db: DB, emailClient: EmailClient) => ({ + async chargeCustomer(orderId: string, amount: number): Promise<string> { + const receiptId = await db.processPayment(orderId, amount); + return receiptId; + }, + + async sendReceipt(email: string, receiptId: string): Promise<void> { + await emailClient.send(email, 'Payment Receipt', receiptId); + }, +}); +``` + + + + +Each SDK uses a different mechanism to group Activities with their dependencies: + +- **Go**: Methods on a struct. The struct fields hold dependencies. +- **Python**: A `@dataclass` with `@activity.defn` methods. Fields hold dependencies. +- **Java**: An `@ActivityInterface` with a separate implementation class. Dependencies are passed through the constructor. +- **TypeScript**: A factory function that closes over dependencies and returns an object of Activity functions. + +### Reference Activities from the Workflow + +The Workflow references Activity methods without any knowledge of the injected dependencies. Each SDK provides a type-safe way to call Activities. + + + + +```go +// workflow.go +package payment + +import ( + "time" + + "go.temporal.io/sdk/workflow" +) + +func PaymentWorkflow(ctx workflow.Context, orderID string, amount int, email string) error { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Second, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + // Use a nil struct pointer to reference Activity methods. + // This provides compile-time type safety without instantiating the struct. 
+ var a *Activities + var receiptID string + err := workflow.ExecuteActivity(ctx, a.ChargeCustomer, orderID, amount).Get(ctx, &receiptID) + if err != nil { + return err + } + + return workflow.ExecuteActivity(ctx, a.SendReceipt, email, receiptID).Get(ctx, nil) +} +``` + + + + +```python +# workflows.py +from datetime import timedelta +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities import PaymentActivities + + +@workflow.defn +class PaymentWorkflow: + @workflow.run + async def run(self, order_id: str, amount: int, email: str) -> None: + receipt_id = await workflow.execute_activity_method( + PaymentActivities.charge_customer, + args=[order_id, amount], + start_to_close_timeout=timedelta(seconds=30), + ) + + await workflow.execute_activity_method( + PaymentActivities.send_receipt, + args=[email, receipt_id], + start_to_close_timeout=timedelta(seconds=30), + ) +``` + + + + +```java +// PaymentWorkflowImpl.java +public class PaymentWorkflowImpl implements PaymentWorkflow { + private final PaymentActivities activities = Workflow.newActivityStub( + PaymentActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(30)) + .build() + ); + + @Override + public void processPayment(String orderId, int amount, String email) { + String receiptId = activities.chargeCustomer(orderId, amount); + activities.sendReceipt(email, receiptId); + } +} +``` + + + + +```typescript +// workflows.ts +import { proxyActivities } from '@temporalio/workflow'; +import type { createActivities } from './activities'; + +// Use ReturnType to extract the Activity types from the factory function +const { chargeCustomer, sendReceipt } = proxyActivities< + ReturnType<typeof createActivities> +>({ + startToCloseTimeout: '30s', +}); + +export async function paymentWorkflow( + orderId: string, + amount: number, + email: string +): Promise<void> { + const receiptId = await chargeCustomer(orderId, amount); + await sendReceipt(email, receiptId); +} +``` + + 
+ + +Key points for each SDK: + +- **Go**: A nil pointer of the Activity struct type (`var a *Activities`) provides compile-time method references without instantiating the struct. The Temporal runtime resolves the actual registered instance at execution time. +- **Python**: `workflow.execute_activity_method` references the class method directly and resolves to the registered instance on the Worker. +- **Java**: `Workflow.newActivityStub` creates a typed proxy from the Activity interface. The Temporal runtime routes calls to the registered implementation. +- **TypeScript**: `proxyActivities<ReturnType<typeof createActivities>>` infers the Activity types from the factory function's return type. Activities are always referenced by name at runtime. + +### Register Activities with the Worker + +At Worker startup, you instantiate the Activity struct or class with real dependency implementations and register it. + + + + +```go +// worker/main.go +package main + +import ( + "log" + + "go.temporal.io/sdk/client" + "go.temporal.io/sdk/worker" + + "example/payment" +) + +func main() { + c, err := client.Dial(client.Options{}) + if err != nil { + log.Fatalln("Unable to create client", err) + } + defer c.Close() + + w := worker.New(c, "payment", worker.Options{}) + + w.RegisterWorkflow(payment.PaymentWorkflow) + + // Inject real dependencies at Worker startup + w.RegisterActivity(&payment.Activities{ + DBClient: payment.NewPostgresClient("postgres://localhost:5432/payments"), + EmailClient: payment.NewSMTPClient("smtp://mail.example.com"), + }) + + err = w.Run(worker.InterruptCh()) + if err != nil { + log.Fatalln("Unable to start worker", err) + } +} +``` + + + + +```python +# worker.py +import asyncio +from temporalio.client import Client +from temporalio.worker import Worker + +from activities import PaymentActivities +from workflows import PaymentWorkflow + + +async def main(): + client = await Client.connect("localhost:7233") + + # Inject real dependencies at Worker startup + payment_activities = 
PaymentActivities( + db_client=PostgresClient("postgres://localhost:5432/payments"), + email_client=SMTPClient("smtp://mail.example.com"), + ) + + worker = Worker( + client, + task_queue="payment", + workflows=[PaymentWorkflow], + activities=[ + payment_activities.charge_customer, + payment_activities.send_receipt, + ], + ) + await worker.run() + + +if __name__ == "__main__": + asyncio.run(main()) +``` + + + + +```java +// PaymentWorker.java +public class PaymentWorker { + public static void main(String[] args) { + WorkflowServiceStubs service = WorkflowServiceStubs.newLocalServiceStubs(); + WorkflowClient client = WorkflowClient.newInstance(service); + WorkerFactory factory = WorkerFactory.newInstance(client); + + Worker worker = factory.newWorker("payment"); + worker.registerWorkflowImplementationTypes(PaymentWorkflowImpl.class); + + // Inject real dependencies at Worker startup + worker.registerActivitiesImplementations( + new PaymentActivitiesImpl( + new PostgresClient("postgres://localhost:5432/payments"), + new SMTPClient("smtp://mail.example.com") + ) + ); + + factory.start(); + } +} +``` + + + + +```typescript +// worker.ts +import { Worker } from '@temporalio/worker'; +import { createActivities } from './activities'; + +async function run() { + // Initialize dependencies at Worker startup + const db = new PostgresClient('postgres://localhost:5432/payments'); + const emailClient = new SMTPClient('smtp://mail.example.com'); + + const worker = await Worker.create({ + taskQueue: 'payment', + workflowsPath: require.resolve('./workflows'), + // Inject dependencies through the factory function + activities: createActivities(db, emailClient), + }); + + await worker.run(); +} + +run().catch((err) => { + console.error(err); + process.exit(1); +}); +``` + + + + +Dependencies are initialized once when the Worker process starts. 
All Activity executions on that Worker share the same instances, which is appropriate for thread-safe resources like connection pools and HTTP clients. + +## When to use + +This pattern is a good fit when your Activities access external services such as databases, message queues, or third-party APIs. It is appropriate when you want to initialize expensive resources once per Worker process, when you need to test Activity logic without connecting to real services, or when you operate in multiple environments (development, staging, production) that require different dependency configurations. + +This pattern is not necessary for Activities that are pure functions with no external dependencies, or for Activities that only use Temporal-provided context like heartbeating and logging. + +## Benefits and trade-offs + +Injecting dependencies at the Worker level provides several advantages. Resources like connection pools are initialized once and shared across all Activity executions, reducing overhead. Substituting mock implementations in tests requires no changes to Activity or Workflow code. Switching between environments involves changing only the Worker configuration. + +The trade-off is that all Activity executions on a given Worker share the same dependency instances. If an Activity requires per-execution isolation (for example, a database transaction scoped to a single Activity), you need to manage that within the Activity method itself. Dependencies must also be thread-safe, since multiple Activity executions may run concurrently on the same Worker. + +## Best practices + +- **Keep dependencies thread-safe.** Multiple Activity executions run concurrently on the same Worker. Use connection pools rather than single connections, and avoid mutable shared state. +- **Define dependencies as interfaces.** In Go, Python, and Java, using interfaces (or protocols in Python) for dependencies makes it possible to swap implementations for testing or different environments. 
+- **Do not inject dependencies into Workflows.** Workflow code must remain deterministic. If a Workflow needs configuration, retrieve it through a Local Activity so the value gets recorded in the Event History. +- **Initialize dependencies before Worker startup.** Create and validate all connections before calling `worker.Run()` or its equivalent. This ensures that the Worker does not start accepting tasks until all dependencies are ready. +- **Group related Activities on a single struct or class.** Activities that share the same dependencies belong together. If two groups of Activities have different dependencies, use separate structs or classes for each group. + +## Common pitfalls + +- **Constructing dependencies inside Activity methods.** Creating a new database connection or API client per execution leads to resource exhaustion and increased latency. +- **Injecting dependencies into Workflows.** This breaks determinism because dependency state can change between the original execution and a replay. The Temporal Java SDK documentation explicitly warns against this. +- **Using non-thread-safe dependencies.** A single mutable object shared across concurrent Activity executions causes race conditions. Use connection pools and ensure all injected objects are safe for concurrent use. +- **Registering unbound class methods in Python.** If you register `PaymentActivities.charge_customer` (the unbound method on the class) instead of `payment_activities.charge_customer` (a method bound to an instance), the `self` parameter is not bound, causing a missing argument error at runtime. +- **Forgetting to bind methods in TypeScript.** When using a class instead of a factory function, class methods must be defined as arrow functions or explicitly bound in the constructor. Otherwise, `this` is `undefined` when Temporal invokes the Activity. 
+ +## Related patterns + +- **[Entity Workflow](/design-patterns/entity-workflow)**: Long-lived Workflows that manage stateful entities, often using Activities with injected dependencies. +- **[Worker-Specific Task Queues](/design-patterns/worker-specific-taskqueue)**: Routing Activities to specific Workers, which can have different injected dependencies. + +## Sample code + +### Go +- [Greetings](https://github.com/temporalio/samples-go/tree/main/greetings) — Activities as struct methods with injected dependencies. +- [Large Payload Fixture](https://github.com/temporalio/samples-go/tree/main/temporal-fixtures/largepayload) — The reference sample using struct-based Activity dependency injection. + +### Python +- [Hello Activity Method](https://github.com/temporalio/samples-python/blob/main/hello/hello_activity_method.py) — Activities defined as class methods with dependency injection. + +### Java +- [Hello World](https://github.com/temporalio/hello-world-project-template-java) — Activity interface and implementation with Worker registration. + +### TypeScript +- [Activities Dependency Injection](https://github.com/temporalio/samples-typescript/tree/main/activities-dependency-injection) — Factory function pattern for sharing dependencies between Activities. diff --git a/docs/design-patterns/approval.mdx b/docs/design-patterns/approval.mdx new file mode 100644 index 0000000000..e510ade162 --- /dev/null +++ b/docs/design-patterns/approval.mdx @@ -0,0 +1,1028 @@ +--- +id: approval +title: "Approval Pattern" +sidebar_label: "Approval" +description: "Human-in-the-loop Workflows that block until external approval decisions are made. Uses Signals to capture approval data with metadata." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +The Approval pattern implements human-in-the-loop Workflows where execution blocks until an external decision is made. 
+It uses Workflow Signals with custom input data to unblock Workflows, enabling approval processes, manual reviews, and decision gates in automated business processes. + + + +## Problem + +In many business processes, you need Workflows that wait for human approval before proceeding. +These Workflows must capture approval decisions along with metadata such as the approver's identity, a reason, and a timestamp. +They must also support multiple outcomes — approval, rejection, or escalation — and handle timeout scenarios when no decision arrives. + +Without a structured approval pattern, you are forced to poll external systems for approval status, implement complex state machines by hand, and manage race conditions between timeouts and incoming approvals. +You also risk losing approval context and metadata, and you must build custom audit logging to meet compliance requirements. + +## Solution + +The Approval pattern uses a blocking wait with timeout to pause execution until a Signal is received. +The Signal carries custom data — the approval decision, approver details, and comments — that the Workflow captures and uses to determine next steps. + +```mermaid +sequenceDiagram + participant Requester + participant Workflow + participant Approver + + Requester->>Workflow: Start approval request + activate Workflow + Workflow->>Workflow: Wait with timeout + Note over Workflow: Waiting for approval... + + alt Approval received + Approver->>Workflow: Signal: submitApproval(data) + Workflow->>Workflow: Process approval data + Workflow-->>Requester: Approved + else Timeout + Note over Workflow: Timeout expires + Workflow-->>Requester: Timeout/Rejected + end + deactivate Workflow +``` + +The following describes each step in the diagram: + +1. The requester starts the Workflow with an approval request. +2. 
The Workflow blocks execution with a timeout — using `Workflow.await()` in Java, `condition()` in TypeScript, `workflow.wait_condition()` in Python, or `workflow.AwaitWithTimeout()` in Go. +3. If an approver sends a Signal before the timeout expires, the Workflow receives the approval data, processes the decision, and returns the result to the requester. +4. If the timeout expires before any Signal arrives, the Workflow unblocks and follows the timeout path, which typically results in rejection or escalation. + +The approval data object carries the decision context through the Workflow. +Define a type to hold the approver's identity, the decision, any comments, and a timestamp: + + + + +```python +# models.py +from dataclasses import dataclass + +@dataclass +class ApprovalData: + approver: str + decision: str # "APPROVED", "REJECTED", "ESCALATED" + comments: str + timestamp: int +``` + + + + +```go +// types.go +type ApprovalData struct { + Approver string + Decision string // "APPROVED", "REJECTED", "ESCALATED" + Comments string + Timestamp int64 +} +``` + + + + +```java +// ApprovalData.java +public class ApprovalData { + private String approver; + private String decision; // "APPROVED", "REJECTED", "ESCALATED" + private String comments; + private long timestamp; + + // Constructor, getters, setters +} +``` + + + + +```typescript +// types.ts +export interface ApprovalData { + approver: string; + decision: 'APPROVED' | 'REJECTED' | 'ESCALATED'; + comments: string; + timestamp: number; +} +``` + + + + +This type gives you a structured way to pass rich context through the Signal rather than a plain boolean. + +Next, define the Workflow contract. +The Workflow accepts a request ID and a timeout duration. +A Signal receives the approval data from an external system. 
+A Query exposes the current status without modifying Workflow state: + + + + +```python +# workflows.py +from temporalio import workflow +from models import ApprovalData +``` + + + + +```go +// workflow.go +// In Go, signals are received via named channels and +// queries are registered with workflow.SetQueryHandler. +// There is no separate interface definition. +``` + + + + +```java +// ApprovalWorkflow.java +@WorkflowInterface +public interface ApprovalWorkflow { + @WorkflowMethod + String execute(String requestId, Duration timeout); + + @SignalMethod + void submitApproval(ApprovalData approvalData); + + @QueryMethod + String getStatus(); +} +``` + + + + +```typescript +// workflows.ts +import * as wf from '@temporalio/workflow'; +import { ApprovalData } from './types'; + +export const submitApprovalSignal = wf.defineSignal<[ApprovalData]>('submitApproval'); +export const getStatusQuery = wf.defineQuery('getStatus'); +``` + + + + +These definitions form the contract for any approval Workflow implementation. + +The implementation ties everything together. 
+The Workflow blocks until either the approval data arrives via Signal or the timeout expires: + + + + +```python +# workflows.py +import asyncio +from datetime import timedelta +from temporalio import workflow +from models import ApprovalData + +@workflow.defn +class ApprovalWorkflow: + def __init__(self) -> None: + self.approval_data: ApprovalData | None = None + self.status = "PENDING" + + @workflow.run + async def run(self, request_id: str, timeout_seconds: int) -> str: + try: + await workflow.wait_condition( + lambda: self.approval_data is not None, + timeout=timedelta(seconds=timeout_seconds), + ) + self.status = self.approval_data.decision + return f"Request {request_id} {self.status} by {self.approval_data.approver}" + except asyncio.TimeoutError: + self.status = "TIMEOUT" + return f"Request {request_id} timed out" + + @workflow.signal + def submit_approval(self, data: ApprovalData) -> None: + self.approval_data = data + + @workflow.query + def get_status(self) -> str: + return self.status +``` + + + + +```go +// workflow.go +func ApprovalWorkflow(ctx workflow.Context, requestId string, timeout time.Duration) (string, error) { + var approvalData *ApprovalData + status := "PENDING" + + err := workflow.SetQueryHandler(ctx, "getStatus", func() (string, error) { + return status, nil + }) + if err != nil { + return "", err + } + + // Listen for the approval signal in a goroutine + workflow.Go(ctx, func(ctx workflow.Context) { + signalChan := workflow.GetSignalChannel(ctx, "submitApproval") + signalChan.Receive(ctx, &approvalData) + }) + + approved, err := workflow.AwaitWithTimeout(ctx, timeout, func() bool { + return approvalData != nil + }) + if err != nil { + return "", err + } + + if approved { + status = approvalData.Decision + return fmt.Sprintf("Request %s %s by %s", requestId, status, approvalData.Approver), nil + } + status = "TIMEOUT" + return fmt.Sprintf("Request %s timed out", requestId), nil +} +``` + + + + +```java +// ApprovalWorkflowImpl.java 
+public class ApprovalWorkflowImpl implements ApprovalWorkflow { + private ApprovalData approvalData; + private String status = "PENDING"; + + @Override + public String execute(String requestId, Duration timeout) { + boolean approved = Workflow.await(timeout, () -> approvalData != null); + + if (approved) { + status = approvalData.getDecision(); + return "Request " + requestId + " " + status + " by " + approvalData.getApprover(); + } else { + status = "TIMEOUT"; + return "Request " + requestId + " timed out"; + } + } + + @Override + public void submitApproval(ApprovalData data) { + this.approvalData = data; + } + + @Override + public String getStatus() { + return status; + } +} +``` + + + + +```typescript +// workflows.ts +import * as wf from '@temporalio/workflow'; +import { ApprovalData } from './types'; + +export const submitApprovalSignal = wf.defineSignal<[ApprovalData]>('submitApproval'); +export const getStatusQuery = wf.defineQuery('getStatus'); + +export async function approvalWorkflow( + requestId: string, + timeout: string | number, // ms or Duration string +): Promise { + let approvalData: ApprovalData | undefined; + let status = 'PENDING'; + + wf.setHandler(submitApprovalSignal, (data: ApprovalData) => { + approvalData = data; + }); + + wf.setHandler(getStatusQuery, () => status); + + const approved = await wf.condition(() => approvalData !== undefined, timeout); + + if (approved) { + status = approvalData!.decision; + return `Request ${requestId} ${status} by ${approvalData!.approver}`; + } else { + status = 'TIMEOUT'; + return `Request ${requestId} timed out`; + } +} +``` + + + + +Each SDK uses a different mechanism to block with a timeout, but the core pattern is the same. +In Java, `Workflow.await()` takes a timeout and a condition lambda, returning `false` on timeout. +In TypeScript, `condition()` takes a predicate and a timeout, returning `false` on timeout. 
+In Python, `workflow.wait_condition()` takes a lambda and a timeout, raising `asyncio.TimeoutError` on timeout. +In Go, `workflow.AwaitWithTimeout()` takes a timeout and a condition function, returning `ok=false` on timeout. + +The condition is evaluated on every state transition, so it must not call blocking operations, mutate Workflow state, or use time-based checks. +When the Signal handler sets the approval data, the condition evaluates to `true` and the Workflow unblocks. + +## Implementation + +### Basic approval with timeout + +The following implementation shows the minimal version of the pattern. +The Workflow waits for a boolean approval flag to be set via Signal, and falls back to auto-rejection on timeout: + + + + +```python +# workflows.py +import asyncio +from datetime import timedelta +from temporalio import workflow + +@workflow.defn +class SimpleApprovalWorkflow: + def __init__(self) -> None: + self.approved = False + self.approver: str | None = None + + @workflow.run + async def run(self, request_id: str, timeout_seconds: int) -> str: + try: + await workflow.wait_condition( + lambda: self.approved, + timeout=timedelta(seconds=timeout_seconds), + ) + return f"Approved by {self.approver}" + except asyncio.TimeoutError: + return "Approval timeout - auto-rejected" + + @workflow.signal + def submit_approval(self, approver_name: str) -> None: + self.approved = True + self.approver = approver_name +``` + + + + +```go +// workflow.go +func SimpleApprovalWorkflow(ctx workflow.Context, requestId string, timeout time.Duration) (string, error) { + approved := false + var approver string + + workflow.Go(ctx, func(ctx workflow.Context) { + signalChan := workflow.GetSignalChannel(ctx, "submitApproval") + signalChan.Receive(ctx, &approver) + approved = true + }) + + ok, err := workflow.AwaitWithTimeout(ctx, timeout, func() bool { + return approved + }) + if err != nil { + return "", err + } + + if ok { + return fmt.Sprintf("Approved by %s", approver), nil + } + 
return "Approval timeout - auto-rejected", nil +} +``` + + + + +```java +// SimpleApprovalWorkflowImpl.java +public class SimpleApprovalWorkflowImpl implements ApprovalWorkflow { + private boolean approved = false; + private String approver; + + @Override + public String execute(String requestId, Duration timeout) { + Workflow.await(timeout, () -> approved); + + if (approved) { + return "Approved by " + approver; + } else { + return "Approval timeout - auto-rejected"; + } + } + + @Override + public void submitApproval(String approverName) { + this.approved = true; + this.approver = approverName; + } +} +``` + + + + +```typescript +// workflows.ts +import * as wf from '@temporalio/workflow'; + +export const submitApprovalSignal = wf.defineSignal<[string]>('submitApproval'); + +export async function simpleApprovalWorkflow( + requestId: string, + timeout: string | number, +): Promise { + let approved = false; + let approver: string | undefined; + + wf.setHandler(submitApprovalSignal, (approverName: string) => { + approved = true; + approver = approverName; + }); + + await wf.condition(() => approved, timeout); + + if (approved) { + return `Approved by ${approver}`; + } else { + return 'Approval timeout - auto-rejected'; + } +} +``` + + + + +The Signal handler sets both the approval flag and the approver's name. +When the wait unblocks, the Workflow checks the flag and returns the appropriate result. + +### Multi-level approval chain + +Some business processes require approvals from multiple levels of authority in sequence. 
+The following implementation iterates through a list of required approval levels, waiting for a Signal at each level before proceeding to the next: + + + + +```python +# models.py +from dataclasses import dataclass + +@dataclass +class MultiLevelApprovalData: + level: str # "L1", "L2", "L3" + approver: str + decision: str + comments: str +``` + + + + +```go +// types.go +type MultiLevelApprovalData struct { + Level string // "L1", "L2", "L3" + Approver string + Decision string + Comments string +} +``` + + + + +```java +// MultiLevelApprovalData.java +public class MultiLevelApprovalData { + private String level; // "L1", "L2", "L3" + private String approver; + private String decision; + private String comments; +} +``` + + + + +```typescript +// types.ts +export interface MultiLevelApprovalData { + level: 'L1' | 'L2' | 'L3'; + approver: string; + decision: string; + comments: string; +} +``` + + + + +This data type extends the basic approval data with a `level` field that identifies which approval tier the decision belongs to. 
+ + + + +```python +# workflows.py +import asyncio +from datetime import timedelta +from temporalio import workflow +from models import MultiLevelApprovalData + +@workflow.defn +class MultiLevelApprovalWorkflow: + def __init__(self) -> None: + self.approvals: list[MultiLevelApprovalData] = [] + + @workflow.run + async def run(self, request_id: str, timeout_per_level_seconds: int) -> str: + required_levels = ["L1", "L2", "L3"] + timeout = timedelta(seconds=timeout_per_level_seconds) + + for level in required_levels: + try: + await workflow.wait_condition( + lambda lv=level: any(a.level == lv for a in self.approvals), + timeout=timeout, + ) + except asyncio.TimeoutError: + return f"Timeout at {level}" + + approval = next(a for a in self.approvals if a.level == level) + if approval.decision == "REJECTED": + return f"Rejected at {level} by {approval.approver}" + + return "Fully approved through all levels" + + @workflow.signal + def submit_approval(self, data: MultiLevelApprovalData) -> None: + self.approvals.append(data) +``` + + + + +```go +// workflow.go +func MultiLevelApprovalWorkflow(ctx workflow.Context, requestId string, timeoutPerLevel time.Duration) (string, error) { + var approvals []MultiLevelApprovalData + requiredLevels := []string{"L1", "L2", "L3"} + + workflow.Go(ctx, func(ctx workflow.Context) { + signalChan := workflow.GetSignalChannel(ctx, "submitApproval") + for { + var data MultiLevelApprovalData + signalChan.Receive(ctx, &data) + approvals = append(approvals, data) + } + }) + + for _, level := range requiredLevels { + lv := level + ok, err := workflow.AwaitWithTimeout(ctx, timeoutPerLevel, func() bool { + for _, a := range approvals { + if a.Level == lv { + return true + } + } + return false + }) + if err != nil { + return "", err + } + if !ok { + return fmt.Sprintf("Timeout at %s", lv), nil + } + + var approval MultiLevelApprovalData + for _, a := range approvals { + if a.Level == lv { + approval = a + break + } + } + if approval.Decision == 
"REJECTED" { + return fmt.Sprintf("Rejected at %s by %s", lv, approval.Approver), nil + } + } + + return "Fully approved through all levels", nil +} +``` + + + + +```java +// MultiLevelApprovalWorkflowImpl.java +public class MultiLevelApprovalWorkflowImpl implements ApprovalWorkflow { + private List<MultiLevelApprovalData> approvals = new ArrayList<>(); + private String[] requiredLevels = {"L1", "L2", "L3"}; + + @Override + public String execute(String requestId, Duration timeoutPerLevel) { + for (String level : requiredLevels) { + boolean received = Workflow.await( + timeoutPerLevel, + () -> hasApprovalForLevel(level)); + + if (!received) { + return "Timeout at " + level; + } + + MultiLevelApprovalData approval = getApprovalForLevel(level); + if (approval.getDecision().equals("REJECTED")) { + return "Rejected at " + level + " by " + approval.getApprover(); + } + } + + return "Fully approved through all levels"; + } + + @Override + public void submitApproval(MultiLevelApprovalData data) { + approvals.add(data); + } + + private boolean hasApprovalForLevel(String level) { + return approvals.stream().anyMatch(a -> a.getLevel().equals(level)); + } + + private MultiLevelApprovalData getApprovalForLevel(String level) { + return approvals.stream() + .filter(a -> a.getLevel().equals(level)) + .findFirst() + .orElse(null); + } +} +``` + + + + +```typescript +// workflows.ts +import * as wf from '@temporalio/workflow'; +import { MultiLevelApprovalData } from './types'; + +export const submitApprovalSignal = wf.defineSignal<[MultiLevelApprovalData]>('submitApproval'); + +export async function multiLevelApprovalWorkflow( + requestId: string, + timeoutPerLevelMs: number, +): Promise<string> { + const approvals: MultiLevelApprovalData[] = []; + const requiredLevels = ['L1', 'L2', 'L3'] as const; + + wf.setHandler(submitApprovalSignal, (data: MultiLevelApprovalData) => { + approvals.push(data); + }); + + for (const level of requiredLevels) { + const received = await wf.condition( + () => approvals.some((a) => 
a.level === level), + timeoutPerLevelMs, + ); + + if (!received) { + return `Timeout at ${level}`; + } + + const approval = approvals.find((a) => a.level === level)!; + if (approval.decision === 'REJECTED') { + return `Rejected at ${level} by ${approval.approver}`; + } + } + + return 'Fully approved through all levels'; +} +``` + + + + +The Workflow loops through each required level and waits with a per-level timeout. +The helper logic checks whether a Signal has arrived for the current level. +If a timeout occurs at any level, the Workflow exits with a timeout result. +If any level returns a rejection, the Workflow exits immediately without proceeding to subsequent levels. + +### Approval with escalation + +When an initial approval times out, you may want to escalate the request to a manager rather than rejecting it outright. +The following implementation adds an escalation step with an extended timeout: + + + + +```python +# workflows.py +import asyncio +from datetime import timedelta +from temporalio import workflow +from models import ApprovalData + +with workflow.unsafe.imports_passed_through(): + from activities import send_escalation_email + +@workflow.defn +class EscalatingApprovalWorkflow: + def __init__(self) -> None: + self.approval_data: ApprovalData | None = None + self.escalated = False + + @workflow.run + async def run(self, request_id: str, initial_timeout_seconds: int) -> str: + try: + await workflow.wait_condition( + lambda: self.approval_data is not None, + timeout=timedelta(seconds=initial_timeout_seconds), + ) + except asyncio.TimeoutError: + self.escalated = True + await workflow.execute_activity( + send_escalation_email, + start_to_close_timeout=timedelta(seconds=10), + ) + + try: + await workflow.wait_condition( + lambda: self.approval_data is not None, + timeout=timedelta(hours=24), + ) + except asyncio.TimeoutError: + return "Escalation timeout - auto-rejected" + + decision = self.approval_data.decision + approver = 
self.approval_data.approver + escalation_note = " (escalated)" if self.escalated else "" + + return f"{decision} by {approver}{escalation_note}" + + @workflow.signal + def submit_approval(self, data: ApprovalData) -> None: + self.approval_data = data +``` + + + + +```go +// workflow.go +func EscalatingApprovalWorkflow(ctx workflow.Context, requestId string, initialTimeout time.Duration) (string, error) { + var approvalData *ApprovalData + escalated := false + + workflow.Go(ctx, func(ctx workflow.Context) { + signalChan := workflow.GetSignalChannel(ctx, "submitApproval") + signalChan.Receive(ctx, &approvalData) + }) + + ok, err := workflow.AwaitWithTimeout(ctx, initialTimeout, func() bool { + return approvalData != nil + }) + if err != nil { + return "", err + } + + if !ok { + escalated = true + + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 10 * time.Second, + } + actCtx := workflow.WithActivityOptions(ctx, ao) + err = workflow.ExecuteActivity(actCtx, SendEscalationEmail).Get(ctx, nil) + if err != nil { + return "", err + } + + ok, err = workflow.AwaitWithTimeout(ctx, 24*time.Hour, func() bool { + return approvalData != nil + }) + if err != nil { + return "", err + } + if !ok { + return "Escalation timeout - auto-rejected", nil + } + } + + escalationNote := "" + if escalated { + escalationNote = " (escalated)" + } + + return fmt.Sprintf("%s by %s%s", approvalData.Decision, approvalData.Approver, escalationNote), nil +} +``` + + + + +```java +// EscalatingApprovalWorkflowImpl.java +public class EscalatingApprovalWorkflowImpl implements ApprovalWorkflow { + private ApprovalData approvalData; + private boolean escalated = false; + + @Override + public String execute(String requestId, Duration initialTimeout) { + boolean received = Workflow.await(initialTimeout, () -> approvalData != null); + + if (!received) { + escalated = true; + sendEscalationNotification(); + + received = Workflow.await( + Duration.ofHours(24), + () -> approvalData != null); + + if 
(!received) { + return "Escalation timeout - auto-rejected"; + } + } + + String decision = approvalData.getDecision(); + String approver = approvalData.getApprover(); + String escalationNote = escalated ? " (escalated)" : ""; + + return decision + " by " + approver + escalationNote; + } + + @Override + public void submitApproval(ApprovalData data) { + this.approvalData = data; + } + + private void sendEscalationNotification() { + ActivityOptions options = ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(10)) + .build(); + NotificationActivities activities = + Workflow.newActivityStub(NotificationActivities.class, options); + activities.sendEscalationEmail(); + } +} +``` + + + + +```typescript +// workflows.ts +import * as wf from '@temporalio/workflow'; +import type * as activities from './activities'; +import { ApprovalData } from './types'; + +const { sendEscalationEmail } = wf.proxyActivities<typeof activities>({ + startToCloseTimeout: '10 seconds', +}); + +export const submitApprovalSignal = wf.defineSignal<[ApprovalData]>('submitApproval'); + +export async function escalatingApprovalWorkflow( + requestId: string, + initialTimeoutMs: number, +): Promise<string> { + let approvalData: ApprovalData | undefined; + let escalated = false; + + wf.setHandler(submitApprovalSignal, (data: ApprovalData) => { + approvalData = data; + }); + + let received = await wf.condition( + () => approvalData !== undefined, + initialTimeoutMs, + ); + + if (!received) { + escalated = true; + await sendEscalationEmail(); + + received = await wf.condition( + () => approvalData !== undefined, + '24 hours', + ); + + if (!received) { + return 'Escalation timeout - auto-rejected'; + } + } + + const { decision, approver } = approvalData!; + const escalationNote = escalated ? ' (escalated)' : ''; + + return `${decision} by ${approver}${escalationNote}`; +} +``` + + + + +The Workflow first waits for the initial timeout. 
+If no Signal arrives, it sets the `escalated` flag, executes a notification Activity to alert the manager, and then waits again with a 24-hour extended timeout. +The notification Activity uses a short start-to-close timeout, since sending an email should complete quickly. +The final result includes an escalation note so the caller knows the request was escalated before approval. + +## When to use + +The Approval pattern is a good fit for purchase order approvals, expense report reviews, code deployment gates, contract signing Workflows, manual quality checks, compliance reviews, budget authorization, and access request approvals. + +It is not a good fit for fully automated processes that require no human input, real-time decisions that need synchronous API responses, or processes that require sub-second response times. +If you only need a boolean yes/no without any context, a plain boolean Signal may be sufficient. + +## Benefits and trade-offs + +The Approval pattern captures rich context — approver identity, reasons, and timestamps — alongside each decision. +All approval data is recorded in the Workflow history as Signal events, giving you a built-in audit trail. +Timeout handling is automatic: you define the maximum wait time and the Workflow handles the fallback. +The pattern supports multi-level, conditional, and escalating approval chains, and you can check approval status at any time through Query methods without modifying Workflow state. +Because all decisions are recorded in the event history, the Workflow is deterministic and replay-safe. + +The trade-offs to consider are that the pattern requires an external system to send approval Signals, which means you need a separate approval interface. +The Workflow blocks until the approval arrives or the timeout expires, so you must define a maximum wait time. +Large approval data objects increase the size of the Workflow history. 
+ +## Comparison with alternatives + +| Approach | Rich data | Built-in wait | Caller gets result | Complexity | Use case | +| :--- | :--- | :--- | :--- | :--- | :--- | +| Signal with data | Yes | Yes | No | Low | Approval Workflows | +| Update | Yes | No | Yes | Low | Synchronous validation with immediate confirmation | +| Boolean Signal | No | Yes | No | Low | Yes/no decisions | +| Polling Activity | Yes | Yes | Yes | High | External approval systems | + +Signals are fire-and-forget: the caller receives an acknowledgement from the server but cannot wait for the Workflow to process the Signal or receive a result. +Updates are synchronous: the caller blocks until the handler completes and can receive a return value or error. +If the approver's interface needs immediate confirmation that the approval was accepted and valid, consider using an Update with a validator instead of a Signal. + +## Best practices + +- **Use custom data objects.** Capture rich approval context — approver identity, comments, timestamps — rather than a plain boolean. +- **Set reasonable timeouts.** Balance responsiveness with the time approvers realistically need to respond. +- **Add Query methods.** Expose the current approval status so external systems can check progress without sending a Signal. +- **Validate Signal data.** Verify approver permissions and data completeness before accepting an approval. +- **Log approval events.** Record each decision for audit trails and compliance. +- **Handle timeouts gracefully.** Define clear timeout behavior such as rejection, escalation, or notification. +- **Support cancellation.** Allow Workflows to be cancelled if the request is withdrawn. +- **Ensure idempotency.** Handle duplicate approval Signals safely so that re-delivery does not corrupt state. Signals [may be duplicated in rare cases](https://docs.temporal.io/workflows#signal), so use idempotency keys when necessary. 
+- **Include timestamps.** Record when each approval was submitted to support time-based auditing. +- **Expose approval history.** Provide a Query method that returns all approval attempts, not only the final decision. + +## Common pitfalls + +- **No timeout.** Without a timeout, the Workflow waits indefinitely for an approval that may never arrive. +- **Missing validation.** Accepting approvals from unauthorized users compromises the integrity of the process. +- **Lost context.** Failing to capture the approver's identity or reason makes audit trails incomplete. +- **Assuming non-deterministic races.** Temporal processes events in a deterministic, single-threaded order, so a Signal and a timer cannot truly "race." However, if the Signal arrives after the timer fires in the event history, the wait will have already returned with a timeout result. Design your timeout path to account for late-arriving Signals. +- **No audit trail.** Skipping approval logging makes it difficult to meet compliance requirements. +- **Tight timeouts.** Setting the timeout too short causes legitimate approvals to be rejected. +- **Boolean-only Signals.** Using a plain boolean instead of a rich data object limits your ability to capture decision context. +- **No status Query.** Without a Query method, external systems have no way to check approval progress. +- **No duplicate handling.** Receiving multiple approval Signals without deduplication can overwrite earlier decisions. +- **No escalation path.** Without a fallback when the initial approval times out, requests stall or are silently rejected. + +## Related patterns + +- [Signal-Based Event Handling](/design-patterns/signal-with-start): Receiving external events through Signals. +- [Updatable Timer](/design-patterns/updatable-timer): Extending approval deadlines dynamically. +- [Saga Pattern](/design-patterns/saga-pattern): Executing compensating actions on rejection. 
+ +## Sample code + +**Java** +- [Hello Signal](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/hello/HelloSignal.java) — Basic Signal handling in a Workflow. +- [Safe Message Passing](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/safemessagepassing) — Concurrent Signal handling with validation. + +**TypeScript** +- [Signals and Queries](https://github.com/temporalio/samples-typescript/tree/main/signals-queries) — Signal and Query usage in a Workflow. +- [Message Passing](https://github.com/temporalio/samples-typescript/tree/main/message-passing-intro) — Introduction to message passing with Signals, Queries, and Updates. + +**Python** +- [Hello Signal](https://github.com/temporalio/samples-python/tree/main/hello/hello_signal.py) — Basic Signal handling in a Workflow. +- [Message Passing](https://github.com/temporalio/samples-python/tree/main/message_passing/introduction) — Introduction to message passing with Signals, Queries, and Updates. + +**Go** +- [Await Signals](https://github.com/temporalio/samples-go/tree/main/await-signals) — Waiting for Signals with timeout using `AwaitWithTimeout`. +- [Message Passing](https://github.com/temporalio/samples-go/tree/message-passing/message-passing-intro) — Introduction to message passing with Signals, Queries, and Updates. diff --git a/docs/design-patterns/batch-iterator.mdx b/docs/design-patterns/batch-iterator.mdx new file mode 100644 index 0000000000..61f932bb93 --- /dev/null +++ b/docs/design-patterns/batch-iterator.mdx @@ -0,0 +1,252 @@ +--- +id: batch-iterator +title: "Batch Iterator" +sidebar_label: "Batch Iterator" +description: "Pages through unbounded datasets using Continue-As-New to prevent history overflow while maintaining exactly-once processing guarantees." 
+--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +:::info[TLDR] +**Process one page at a time** and call Continue-as-New with the next offset after each page so the Workflow's event history never grows without bound. This lets you process an unbounded number of pages. Use this when your record set is arbitrarily large, you need a durable checkpoint after every page, and sequential page-by-page throughput is acceptable. +::: + +## Overview + +The Batch Iterator pattern processes a large record set one page at a time. Each Workflow run processes a single page and then calls Continue-as-New with the next offset, producing a chain of short-lived runs that together cover the entire record set without accumulating unbounded event history. + +## Problem + +A single Workflow run is limited to 50,000 history events (aim for 2,000) and 2,000 in-flight Activities. Processing millions of records in one run is not possible within these bounds. + +You need a way to process an arbitrarily large record set reliably, with the ability to resume from a checkpoint if the Workflow is interrupted, and without overwhelming downstream systems with a burst of concurrent requests. + +## Solution + +Each Workflow run fetches one page of records using a persistent `offset` parameter, processes each record sequentially, and then calls `continueAsNew` with the incremented offset. The next run picks up exactly where the previous one left off. + +Because each run processes only a bounded number of records, history stays well within limits. The offset acts as a durable checkpoint: if the Workflow is interrupted mid-page, replay covers only the current run's history (the current page), not the entire record set. 
+ +```mermaid +flowchart TD + DB[("Data Source\n(paginated)")] + WF1["Workflow Run 1\n(offset=0)"] + WF2["Workflow Run 2\n(offset=PAGE_SIZE)"] + WF3["Workflow Run N\n(offset=N×PAGE_SIZE)"] + Done(["Complete"]) + + DB -->|"fetch page 1"| WF1 + WF1 -->|"processRecord ×PAGE_SIZE"| Acts1["Activities"] + WF1 -->|"continueAsNew\n(offset=PAGE_SIZE)"| WF2 + + DB -->|"fetch page 2"| WF2 + WF2 -->|"processRecord ×PAGE_SIZE"| Acts2["Activities"] + WF2 -->|"continueAsNew\n(offset=N×PAGE_SIZE)"| WF3 + + DB -->|"fetch page N"| WF3 + WF3 -->|"processRecord ×PAGE_SIZE"| Acts3["Activities"] + WF3 -->|"last page → return"| Done +``` + +The following describes each step in the diagram: + +1. The Workflow starts with `offset=0` and calls `fetchPage(offset, pageSize)` to retrieve the first page of records. +2. It processes each record in the page by executing the `processRecord` Activity. +3. After the page is fully processed, it calls `continueAsNew` with `offset + pageSize`, passing the updated offset to the next run. +4. The next run begins with a clean history and repeats the same steps for the next page. +5. When `fetchPage` returns fewer records than `pageSize`, the Workflow knows it has reached the last page and returns normally. + +## Implementation + + +The following examples show how each SDK implements the Batch Iterator pattern. 
+ + + +```typescript +// workflows.ts +import { continueAsNew, log, proxyActivities } from "@temporalio/workflow"; +import type * as activities from "./activities"; +import { PAGE_SIZE } from "./shared"; + +const { fetchPage, processRecord } = proxyActivities<typeof activities>({ + startToCloseTimeout: "10 seconds", +}); + +export async function batchIteratorWorkflow( + offset: number = 0, + totalProcessed: number = 0 +): Promise<number> { + const page = await fetchPage(offset, PAGE_SIZE); + + for (const record of page) { + await processRecord(record); + totalProcessed++; + } + + log.info(`Processed page at offset ${offset} (${page.length} records, running total: ${totalProcessed})`); + + if (page.length === PAGE_SIZE) { + await continueAsNew(offset + PAGE_SIZE, totalProcessed); + } + + return totalProcessed; +} +``` + + + + +```python +# workflows.py +from temporalio import workflow +from temporalio.workflow import continue_as_new +from datetime import timedelta +from activities import fetch_page, process_record +from shared import PAGE_SIZE + + +@workflow.defn +class BatchIteratorWorkflow: + @workflow.run + async def run(self, offset: int = 0, total_processed: int = 0) -> int: + page = await workflow.execute_activity( + fetch_page, + args=[offset, PAGE_SIZE], + start_to_close_timeout=timedelta(seconds=10), + ) + + for record in page: + await workflow.execute_activity( + process_record, + record, + start_to_close_timeout=timedelta(seconds=10), + ) + total_processed += 1 + + workflow.logger.info( + f"Processed page at offset {offset} ({len(page)} records, running total: {total_processed})" + ) + + if len(page) == PAGE_SIZE: + continue_as_new(args=[offset + PAGE_SIZE, total_processed]) + + return total_processed +``` + + + + +```go +// workflows.go +package main + +import ( + "time" + + "go.temporal.io/sdk/workflow" +) + +func BatchIteratorWorkflow(ctx workflow.Context, offset int, totalProcessed int) (int, error) { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 10 * time.Second, + } + ctx = 
workflow.WithActivityOptions(ctx, ao) + + var page []Record + if err := workflow.ExecuteActivity(ctx, FetchPage, offset, PageSize).Get(ctx, &page); err != nil { + return totalProcessed, err + } + + for _, record := range page { + if err := workflow.ExecuteActivity(ctx, ProcessRecord, record).Get(ctx, nil); err != nil { + return totalProcessed, err + } + totalProcessed++ + } + + workflow.GetLogger(ctx).Info("Processed page", + "offset", offset, + "pageSize", len(page), + "totalProcessed", totalProcessed) + + if len(page) == PageSize { + return totalProcessed, workflow.NewContinueAsNewError(ctx, BatchIteratorWorkflow, offset+PageSize, totalProcessed) + } + + return totalProcessed, nil +} +``` + + + + +```java +// BatchIteratorWorkflow.java +import io.temporal.activity.ActivityOptions; +import io.temporal.workflow.*; +import java.time.Duration; +import java.util.List; + +@WorkflowInterface +public interface BatchIteratorWorkflow { + @WorkflowMethod + int run(int offset, int totalProcessed); +} + +// BatchIteratorWorkflowImpl.java +public class BatchIteratorWorkflowImpl implements BatchIteratorWorkflow { + private final Activities activities = Workflow.newActivityStub( + Activities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(10)) + .build() + ); + + @Override + public int run(int offset, int totalProcessed) { + List<Record> page = activities.fetchPage(offset, Shared.PAGE_SIZE); + + for (Record record : page) { + activities.processRecord(record); + totalProcessed++; + } + + Workflow.getLogger(BatchIteratorWorkflowImpl.class).info( + "Processed page at offset " + offset + " (" + page.size() + " records, total: " + totalProcessed + ")" + ); + + if (page.size() == Shared.PAGE_SIZE) { + // Calling a method on the Continue-As-New stub never returns normally; it ends this run and starts the next one. + return Workflow.newContinueAsNewStub(BatchIteratorWorkflow.class) + .run(offset + Shared.PAGE_SIZE, totalProcessed); + } + + return totalProcessed; + } +} +``` + + + + +## Best Practices + +- **Choose a page size that keeps history under 2,000 events.** Each 
page produces roughly `3 × pageSize` history events (`ActivityTaskScheduled` + `ActivityTaskStarted` + `ActivityTaskCompleted`). A page size of 500–800 records is a safe target. +- **Include `totalProcessed` (or a similar counter) in the `continueAsNew` args.** This lets you observe overall progress via the Workflow input visible in the UI without querying internal state. +- **Fetch inside an Activity, not the Workflow.** The `fetchPage` call must be an Activity — not inline Workflow code — so it can interact with external systems and be retried independently. +- **Make `processRecord` idempotent.** Activities have at-least-once execution semantics. If a worker crashes after an Activity completes externally but before the completion is recorded in history, Temporal will retry it. Your downstream system must tolerate receiving the same record more than once. +- **Avoid accumulating large local state between pages.** `continueAsNew` does not carry over in-memory state; only the arguments you pass are available in the next run. + +## Common Pitfalls + +- **Calling `continueAsNew` unconditionally.** If you call `continueAsNew` after every page without checking for the last one, the Workflow loops forever even when the data source is exhausted. Check whether the returned page is shorter than `pageSize` before continuing. +- **Passing unnecessary state into `continueAsNew`.** All arguments are serialized and stored in history. Pass only the minimal state needed (offset, counters) — not accumulated result lists or large collections that grow with each page. +- **Sequential processing bottlenecks.** The default implementation processes one record at a time per page. You can fan out Activities concurrently within a page using the SDK's async primitives for higher per-page throughput — note this increases per-page event count accordingly. 
If record-set-wide throughput matters more than rate limiting, consider [Sliding Window](/design-patterns/sliding-window) or [MapReduce Tree](/design-patterns/mapreduce-tree). + +## Related Resources + +- [Continue-as-New pattern](/design-patterns/continue-as-new) — core concepts for history management via `continueAsNew` +- [Sliding Window](/design-patterns/sliding-window) — bounded concurrency that progresses at the rate of the fastest processor +- [MapReduce Tree](/design-patterns/mapreduce-tree) — fully parallel processing for maximum speed +- [Temporal limits reference](https://docs.temporal.io/cloud/limits) +- [Batch samples (Java)](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/batch/iterator) diff --git a/docs/design-patterns/batch-processing-patterns.mdx b/docs/design-patterns/batch-processing-patterns.mdx new file mode 100644 index 0000000000..a43ae24040 --- /dev/null +++ b/docs/design-patterns/batch-processing-patterns.mdx @@ -0,0 +1,134 @@ +--- +id: batch-processing-patterns +title: Batch Processing Patterns +sidebar_label: Overview +description: Patterns for processing large volumes of records reliably, at scale, and without overwhelming downstream systems. +--- + +import PatternCards from '@site/src/components/PatternCards'; + +Patterns for processing large volumes of records reliably, at scale, and without overwhelming downstream systems. + +Choose based on your throughput requirements, record set size, and whether you need rate limiting or maximum parallelism. 
+ +## When to use which pattern + +| Pattern | Record set size | Parallelism model | Workflow-based rate control | +|---|---|---|---| +| [Basic Workflow](#basic-workflow-single-tier-fan-out) | Small (up to a few hundred records) | Sequential or parallel activities in one Workflow | No | +| [Fan-Out with Child Workflows](/design-patterns/fanout-child-workflows) | Up to ~4M records | Fixed concurrency (one child per chunk) | No | +| [Batch Iterator](/design-patterns/batch-iterator) | Unlimited | Limited (activities per page) | Yes — fixed page rate | +| [Sliding Window](/design-patterns/sliding-window) | Unlimited | Bounded window of concurrent children | Yes — configurable window | +| [MapReduce Tree](/design-patterns/mapreduce-tree) | Unlimited | Fully parallel recursive tree | No — maximum speed | + + + +--- + +## Schedules + +Schedules allow Workflows to be executed on a recurring basis — think of them as a more powerful cron. + +- Supports `start` / `pause` / `stop` / `update` / `backfill` of scheduled Workflow executions +- Configurable **Overlap Policies** control what happens when the previous run is still running +- Full execution history visibility in the Temporal UI +- Schedules can be created via the UI, CLI, or SDK + +```bash +temporal schedule create \ + --schedule-id 'your-schedule-id' \ + --workflow-id 'your-workflow-id' \ + --task-queue 'your-task-queue' \ + --workflow-type 'YourWorkflowType' +``` + +**References:** +- [Temporal Schedules](https://docs.temporal.io/workflows#schedule) +- [CLI schedule commands](https://docs.temporal.io/cli/schedule) + +--- + +## Basic Workflow (single-tier fan-out) + +The simplest form of batch processing: the Workflow fetches or receives record IDs and executes one Activity per record. 
+ +- Activities can be executed sequentially or concurrently (using the SDK's async primitives) +- **Limit: 2,000 in-flight Activities per Workflow run** (aim for 500) +- If total event count is likely to exceed 2,000 (hard limit: 50,000), use the [Batch Iterator](/design-patterns/batch-iterator) instead + +**Pros:** Simple +**Cons:** Hard cap on concurrent Activities; all-or-nothing failure model; can overwhelm downstream systems + +```mermaid +flowchart TD + Records["📋 Record IDs
(fetched or passed in)"] + WF["Workflow"] + A1["Activity"] + A2["Activity"] + AN["Activity ..."] + + Records --> WF + WF --> A1 + WF --> A2 + WF --> AN +``` + +--- + +## Batch Signalling + +The Temporal CLI lets you signal, reset, cancel, or terminate multiple Workflows with a single command using a visibility query. + +- 1 running batch job per namespace +- 50 Workflows per second per batch + +```bash +# Signal all running Workflows of a given type +temporal workflow signal \ + --name MySignal \ + --input '{"Input": "As-JSON"}' \ + --query 'ExecutionStatus = "Running" AND WorkflowType="YourWorkflow"' \ + --reason "Testing" + +# Terminate all running Workflows of a given type +temporal workflow terminate \ + --query 'ExecutionStatus = "Running" AND WorkflowType="SomeWorkflowType"' \ + --reason "Terminate Test Workflows" +``` + +**Reference:** [CLI batch commands](https://docs.temporal.io/cli/batch) + +--- + +## Key Limits + +Full reference: [Temporal Cloud limits](https://docs.temporal.io/cloud/limits) + +| Limit | Value | +|---|---| +| Unfinished actions per Workflow | 2,000 max (aim for 500). Includes Activities, Signals, Child Workflows, cancellation requests | +| Events per Workflow history | 50,000 events max (aim for 2,000) **or** 50 MB total history size | +| Signals per Workflow | 10,000 | +| Updates per Workflow | 10 in-flight, 2,000 total | +| Batch Signalling | 1 batch job per namespace; 50 Workflows/sec per batch | diff --git a/docs/design-patterns/child-workflows.mdx b/docs/design-patterns/child-workflows.mdx new file mode 100644 index 0000000000..2d2bdf2a53 --- /dev/null +++ b/docs/design-patterns/child-workflows.mdx @@ -0,0 +1,725 @@ +--- +id: child-workflows +title: "Child Workflows Pattern" +sidebar_label: "Child Workflows" +description: "Decomposes complex Workflows into smaller, reusable units. Each child has an independent Workflow ID, history, and lifecycle." 
+--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +Child Workflows enable decomposition of complex business logic into smaller, reusable Workflow units. +Each child executes as an independent Workflow with its own Workflow ID, event history (50K event limit), and lifecycle. +Unlike Activities which execute code, Child Workflows orchestrate processes and provide Workflow-level semantics: independent tracking, querying, timeouts, and the ability to outlive the parent. + +Key capabilities: + +- **Independent identity**: Each child has a unique Workflow ID visible in the UI for tracking and querying. +- **Separate history**: Each child maintains its own event history, preventing parent history bloat. +- **Flexible invocation**: Synchronous (blocking) or asynchronous (non-blocking) execution. +- **Lifecycle control**: Parent close policies (TERMINATE, ABANDON, REQUEST_CANCEL) determine child behavior when the parent completes. +- **Task Queue routing**: Children can execute on different Task Queues with specialized Workers. +- **Reusability**: The same Child Workflow logic can be invoked by multiple different parent Workflows. + +## Problem + +In distributed systems, you often need Workflows that break down complex processes into modular, reusable components, execute sub-processes that may outlive the parent Workflow, coordinate multiple independent Workflows with different lifecycles, isolate failure domains while maintaining orchestration control, and reuse Workflow logic across different parent Workflows. + +Without Child Workflows, you must implement all logic in a single monolithic Workflow, manually coordinate separate Workflows via Signals and Queries, duplicate Workflow logic across multiple implementations, and manage complex state machines for sub-process coordination. + +## Solution + +You invoke Child Workflows from within parent Workflows using the SDK's Child Workflow API. 
+You can call them synchronously (blocking until completion) or asynchronously (fire-and-forget). +The `ParentClosePolicy` determines what happens to children when the parent completes. + +```mermaid +sequenceDiagram + participant Parent + participant Child1 + participant Child2 + + Parent->>+Child1: Start (sync) + activate Parent + Child1->>Child1: Execute + Child1-->>-Parent: Result + + Parent->>+Child2: Start (async) + Note over Parent,Child2: Parent continues immediately + Parent->>Parent: Do other work + Child2->>Child2: Execute independently + + alt Parent completes first + Parent->>Parent: Complete + deactivate Parent + Note over Child2: Policy: ABANDON
Child continues + Child2->>Child2: Keep running + Child2-->>-Child2: Complete + end +``` + +The following describes each step in the diagram: + +1. The parent starts Child 1 synchronously and blocks until it completes. +2. The parent starts Child 2 asynchronously and continues doing other work immediately. +3. If the parent completes before Child 2, the ABANDON policy allows Child 2 to continue running independently. + +### Synchronous Child Workflow + +The following example creates a Child Workflow and calls it synchronously. +The parent blocks until the child completes and returns a result: + + + + +```python +# workflows.py +from temporalio import workflow + +from child_workflows import ChildWorkflow + +@workflow.defn +class ParentWorkflow: + @workflow.run + async def run(self, input: str) -> str: + # Synchronous call - awaits until child completes + result = await workflow.execute_child_workflow( + ChildWorkflow.run, + input, + id=f"child-{workflow.uuid4()}", + ) + + return f"Parent received: {result}" +``` + + + + +```go +// parent_workflow.go +func ParentWorkflow(ctx workflow.Context, input string) (string, error) { + cwo := workflow.ChildWorkflowOptions{} + ctx = workflow.WithChildOptions(ctx, cwo) + + // Synchronous call - blocks until child completes + var result string + err := workflow.ExecuteChildWorkflow(ctx, ChildWorkflow, input).Get(ctx, &result) + if err != nil { + return "", err + } + + return "Parent received: " + result, nil +} +``` + + + + +```java +// ParentWorkflowImpl.java +@WorkflowInterface +public interface ParentWorkflow { + @WorkflowMethod + String execute(String input); +} + +public class ParentWorkflowImpl implements ParentWorkflow { + @Override + public String execute(String input) { + ChildWorkflow child = Workflow.newChildWorkflowStub(ChildWorkflow.class); + + // Synchronous call - blocks until child completes + String result = child.processData(input); + + return "Parent received: " + result; + } +} +``` + + + + +```typescript +// 
workflows.ts +import { executeChild } from '@temporalio/workflow'; +import { childWorkflow } from './child-workflows'; + +export async function parentWorkflow(input: string): Promise { + // Synchronous call - awaits until child completes + const result = await executeChild(childWorkflow, { + args: [input], + }); + + return `Parent received: ${result}`; +} +``` + + + + +In Java, `Workflow.newChildWorkflowStub()` creates a typed stub and calling a method on it blocks the parent. +In TypeScript, `executeChild()` starts the child and awaits its completion. +In Python, `workflow.execute_child_workflow()` starts the child and awaits its completion. +In Go, `workflow.ExecuteChildWorkflow()` returns a `ChildWorkflowFuture`, and calling `.Get()` blocks until the child completes. + +### Asynchronous Child Workflow + +The following example starts a Child Workflow asynchronously with an ABANDON policy. +The parent receives the child's execution info without waiting for completion: + + + + +```python +# workflows.py +from temporalio import workflow +from temporalio.workflow import ParentClosePolicy + +from child_workflows import ChildWorkflow + +@workflow.defn +class ParentWorkflow: + @workflow.run + async def run(self, input: str) -> str: + # Async call - returns handle once child starts + handle = await workflow.start_child_workflow( + ChildWorkflow.run, + input, + id=f"child-{workflow.uuid4()}", + parent_close_policy=ParentClosePolicy.ABANDON, + ) + + # Parent continues without waiting for child completion + return handle.id +``` + + + + +```go +// parent_workflow.go +import ( + enumspb "go.temporal.io/api/enums/v1" + "go.temporal.io/sdk/workflow" +) + +func ParentWorkflow(ctx workflow.Context, input string) (string, error) { + cwo := workflow.ChildWorkflowOptions{ + ParentClosePolicy: enumspb.PARENT_CLOSE_POLICY_ABANDON, + } + ctx = workflow.WithChildOptions(ctx, cwo) + + childFuture := workflow.ExecuteChildWorkflow(ctx, ChildWorkflow, input) + + // Wait for child to start, 
not complete + var childWE workflow.Execution + if err := childFuture.GetChildWorkflowExecution().Get(ctx, &childWE); err != nil { + return "", err + } + + // Parent continues without waiting for child completion + return childWE.ID, nil +} +``` + + + + +```java +// ParentWorkflowImpl.java +public class ParentWorkflowImpl implements ParentWorkflow { + @Override + public WorkflowExecution execute(String input) { + ChildWorkflowOptions options = ChildWorkflowOptions.newBuilder() + .setWorkflowId("child-" + Workflow.randomUUID()) + .setParentClosePolicy(ParentClosePolicy.PARENT_CLOSE_POLICY_ABANDON) + .build(); + + ChildWorkflow child = Workflow.newChildWorkflowStub(ChildWorkflow.class, options); + + // Async call - returns immediately + Async.function(child::processData, input); + + // Get child execution info without waiting for completion + Promise childExecution = Workflow.getWorkflowExecution(child); + return childExecution.get(); // Blocks only until child starts + } +} +``` + + + + +```typescript +// workflows.ts +import { startChild, ParentClosePolicy } from '@temporalio/workflow'; +import { childWorkflow } from './child-workflows'; + +export async function parentWorkflow(input: string): Promise { + const childHandle = await startChild(childWorkflow, { + args: [input], + parentClosePolicy: ParentClosePolicy.PARENT_CLOSE_POLICY_ABANDON, + }); + + // Parent continues without waiting for child completion + // childHandle.workflowId and childHandle.firstExecutionRunId are available + return childHandle.workflowId; +} +``` + + + + +In Java, `Async.function()` starts the child asynchronously. +`Workflow.getWorkflowExecution(child)` returns a Promise that resolves when the child starts (not when it completes). +In TypeScript, `startChild()` returns a handle once the child has started. +In Python, `workflow.start_child_workflow()` returns a handle once the child has started, without waiting for completion. 
+In Go, `childFuture.GetChildWorkflowExecution().Get()` blocks until the child has started. +The ABANDON policy ensures the child continues running even if the parent completes first. + +## Parent close policy + +The `ParentClosePolicy` determines Child Workflow behavior when the parent closes: + +| Policy | Behavior | Use case | +| :--- | :--- | :--- | +| `TERMINATE` | Child is terminated when parent closes | Tightly coupled processes | +| `ABANDON` | Child continues independently | Fire-and-forget, long-running tasks | +| `REQUEST_CANCEL` | Child receives cancellation request | Graceful cleanup | + +## Implementation + +### Parallel Child Workflows + +The following example starts multiple Child Workflows in parallel and waits for all of them to complete: + + + + +```python +# workflows.py +import asyncio +from temporalio import workflow + +from child_workflows import ChildWorkflow + +@workflow.defn +class ParallelParentWorkflow: + @workflow.run + async def run(self, items: list[str]) -> str: + # Start all children concurrently using asyncio.gather + results = await asyncio.gather( + *[ + workflow.execute_child_workflow( + ChildWorkflow.run, + item, + id=f"child-{workflow.uuid4()}", + ) + for item in items + ] + ) + + return ", ".join(results) +``` + + + + +```go +// parallel_parent_workflow.go +func ParallelParentWorkflow(ctx workflow.Context, items []string) (string, error) { + cwo := workflow.ChildWorkflowOptions{} + ctx = workflow.WithChildOptions(ctx, cwo) + + // Start all children - ExecuteChildWorkflow returns immediately + var futures []workflow.ChildWorkflowFuture + for _, item := range items { + futures = append(futures, workflow.ExecuteChildWorkflow(ctx, ChildWorkflow, item)) + } + + // Wait for all children to complete + var results []string + for _, future := range futures { + var result string + if err := future.Get(ctx, &result); err != nil { + return "", err + } + results = append(results, result) + } + + return strings.Join(results, ", "), nil +} 
+``` + + + + +```java +// ParallelParentWorkflowImpl.java +public class ParallelParentWorkflowImpl implements ParentWorkflow { + @Override + public String execute(List items) { + List> promises = new ArrayList<>(); + + for (String item : items) { + ChildWorkflow child = Workflow.newChildWorkflowStub(ChildWorkflow.class); + promises.add(Async.function(child::process, item)); + } + + // Wait for all children to complete + Promise.allOf(promises).get(); + + return promises.stream() + .map(Promise::get) + .collect(Collectors.joining(", ")); + } +} +``` + + + + +```typescript +// workflows.ts +import { executeChild } from '@temporalio/workflow'; +import { childWorkflow } from './child-workflows'; + +export async function parallelParentWorkflow(items: string[]): Promise { + // Start all children concurrently using Promise.all + const results = await Promise.all( + items.map((item) => + executeChild(childWorkflow, { + args: [item], + }) + ) + ); + + return results.join(', '); +} +``` + + + + +In Java, each child starts asynchronously via `Async.function()`, and `Promise.allOf(promises).get()` blocks until every child completes. +In TypeScript, `Promise.all()` starts all children concurrently and awaits all results. +In Python, `asyncio.gather()` starts all children concurrently and awaits all results. +In Go, `workflow.ExecuteChildWorkflow()` returns a Future immediately without blocking, so starting all children in a loop launches them in parallel. Calling `.Get()` on each Future afterward collects the results. 
+ +### Fire-and-forget + +The following example starts a Child Workflow with the ABANDON policy and returns immediately without waiting: + + + + +```python +# workflows.py +from temporalio import workflow +from temporalio.workflow import ParentClosePolicy + +from child_workflows import LongRunningChildWorkflow + +@workflow.defn +class FireAndForgetParentWorkflow: + @workflow.run + async def run(self, data: str) -> None: + # Start child with ABANDON policy - child survives parent completion + await workflow.start_child_workflow( + LongRunningChildWorkflow.run, + data, + id=f"child-{workflow.uuid4()}", + parent_close_policy=ParentClosePolicy.ABANDON, + ) + + # start_child_workflow resolves once the child has started + # Parent completes, child continues independently +``` + + + + +```go +// fire_and_forget_workflow.go +import ( + enumspb "go.temporal.io/api/enums/v1" + "go.temporal.io/sdk/workflow" +) + +func FireAndForgetParentWorkflow(ctx workflow.Context, data string) error { + cwo := workflow.ChildWorkflowOptions{ + ParentClosePolicy: enumspb.PARENT_CLOSE_POLICY_ABANDON, + } + ctx = workflow.WithChildOptions(ctx, cwo) + + childFuture := workflow.ExecuteChildWorkflow(ctx, LongRunningChildWorkflow, data) + + // Wait for child to start before parent completes + if err := childFuture.GetChildWorkflowExecution().Get(ctx, nil); err != nil { + return err + } + + // Parent completes, child continues independently + return nil +} +``` + + + + +```java +// FireAndForgetParentWorkflowImpl.java +public class FireAndForgetParentWorkflowImpl implements ParentWorkflow { + @Override + public void execute(String data) { + ChildWorkflowOptions options = ChildWorkflowOptions.newBuilder() + .setParentClosePolicy(ParentClosePolicy.PARENT_CLOSE_POLICY_ABANDON) + .build(); + + ChildWorkflow child = Workflow.newChildWorkflowStub(ChildWorkflow.class, options); + + // Start child and don't wait for completion + Async.function(child::longRunningProcess, data); + + // Wait for child to 
start before parent completes + Workflow.getWorkflowExecution(child).get(); + + // Parent completes, child continues independently + } +} +``` + + + + +```typescript +// workflows.ts +import { startChild, ParentClosePolicy } from '@temporalio/workflow'; +import { longRunningChildWorkflow } from './child-workflows'; + +export async function fireAndForgetParentWorkflow(data: string): Promise { + // Start child with ABANDON policy - child survives parent completion + await startChild(longRunningChildWorkflow, { + args: [data], + parentClosePolicy: ParentClosePolicy.PARENT_CLOSE_POLICY_ABANDON, + }); + + // startChild resolves once the child has started + // Parent completes, child continues independently +} +``` + + + + +You must wait for the child to start before the parent completes. +Without this, the parent could complete before the child is scheduled, and the child would never execute. +The ABANDON policy ensures the child continues running after the parent completes. + +### Conditional child execution + +The following example conditionally starts different Child Workflows based on business logic: + + + + +```python +# workflows.py +from temporalio import workflow + +from child_workflows import ApprovalWorkflow, FulfillmentWorkflow + +@workflow.defn +class ConditionalParentWorkflow: + @workflow.run + async def run(self, order: Order) -> str: + if order.requires_approval: + approved = await workflow.execute_child_workflow( + ApprovalWorkflow.run, + order, + id=f"approval-{order.id}", + ) + + if not approved: + return "Order rejected" + + return await workflow.execute_child_workflow( + FulfillmentWorkflow.run, + order, + id=f"fulfillment-{order.id}", + ) +``` + + + + +```go +// conditional_parent_workflow.go +func ConditionalParentWorkflow(ctx workflow.Context, order Order) (string, error) { + cwo := workflow.ChildWorkflowOptions{} + ctx = workflow.WithChildOptions(ctx, cwo) + + if order.RequiresApproval { + var approved bool + err := 
workflow.ExecuteChildWorkflow(ctx, ApprovalWorkflow, order).Get(ctx, &approved) + if err != nil { + return "", err + } + + if !approved { + return "Order rejected", nil + } + } + + var result string + err := workflow.ExecuteChildWorkflow(ctx, FulfillmentWorkflow, order).Get(ctx, &result) + if err != nil { + return "", err + } + + return result, nil +} +``` + + + + +```java +// ConditionalParentWorkflowImpl.java +public class ConditionalParentWorkflowImpl implements ParentWorkflow { + @Override + public String execute(Order order) { + if (order.requiresApproval()) { + ApprovalWorkflow approval = Workflow.newChildWorkflowStub(ApprovalWorkflow.class); + boolean approved = approval.requestApproval(order); + + if (!approved) { + return "Order rejected"; + } + } + + FulfillmentWorkflow fulfillment = Workflow.newChildWorkflowStub(FulfillmentWorkflow.class); + return fulfillment.fulfill(order); + } +} +``` + + + + +```typescript +// workflows.ts +import { executeChild } from '@temporalio/workflow'; +import { approvalWorkflow, fulfillmentWorkflow } from './child-workflows'; + +export async function conditionalParentWorkflow(order: Order): Promise { + if (order.requiresApproval) { + const approved = await executeChild(approvalWorkflow, { + args: [order], + }); + + if (!approved) { + return 'Order rejected'; + } + } + + return await executeChild(fulfillmentWorkflow, { + args: [order], + }); +} +``` + + + + +The parent checks whether the order requires approval and only starts the approval Child Workflow when needed. + +## When to use + +Child Workflows and Activities serve different purposes. + +Use Child Workflows when: + +- You need a separate Workflow ID for tracking and querying. +- The operation may outlive the parent Workflow. +- You need to reuse Workflow logic across multiple parents. +- You want to execute Workflows on different Task Queues. +- You need independent history and event limits. 
+- You want to apply different timeouts or retry policies at the Workflow level. + +Use Activities when: + +- You are executing external operations (API calls, database queries). +- The operation is short-lived. +- You do not need independent Workflow tracking. +- The operation is tightly coupled to the parent Workflow lifecycle. +- Lower overhead is important. + +The key distinction is that Activities are for executing code (especially external operations), while Child Workflows are for orchestrating processes that benefit from independent Workflow semantics. + +## Benefits and trade-offs + +Child Workflows provide modularity by breaking complex logic into reusable units. +Each child is a first-class Workflow with its own ID for tracking, its own 50K event history limit, and its own execution timeout configuration. +Children can outlive parents with the ABANDON policy, and you can start multiple children concurrently. +The parent can catch and handle a child's failure instead of failing itself (a child failure that the parent awaits but does not handle still fails the parent), and the same Child Workflow can be reused by multiple parents. + +The trade-offs to consider are that each child is a separate Workflow execution with its own history (overhead). +There are more moving parts than a single Workflow. +Child execution details are not in the parent history (but are queryable independently). +Async children require explicit synchronization if needed. +More Workflow executions mean higher resource usage. +Starting a Child Workflow has more overhead than starting an Activity. 
+ +## Comparison with alternatives + +| Approach | Modularity | Independent history | Can outlive parent | Overhead | Separate Workflow ID | +| :--- | :--- | :--- | :--- | :--- | :--- | +| Child Workflow | High | Yes | Yes (ABANDON) | Medium | Yes | +| Activity | Medium | No | No | Low | No | +| Separate Workflow + Signals | High | Yes | Yes | High | Yes | +| Async Lambda | Low | No | No | Very Low | No | + +## Best practices + +- **Use unique Workflow IDs.** Generate unique IDs for Child Workflows to avoid conflicts. +- **Choose the appropriate policy.** Use TERMINATE for tightly coupled children, ABANDON for independent children. +- **Handle child failures.** Catch and handle Child Workflow exceptions appropriately. +- **Limit parallelism.** Do not spawn unlimited children; use batch patterns for large datasets. +- **Consider Activities first.** Use Activities for operations that do not need independent Workflow tracking. +- **Set timeouts.** Configure appropriate Workflow execution timeouts for children. +- **Use typed stubs.** Prefer typed stubs over untyped for compile-time safety. +- **Monitor child executions.** Track Child Workflow IDs for observability and debugging. + +## Common pitfalls + +- **Treating Child Workflows like Activities.** Child Workflows are for orchestration, not for executing external code. If you only need to call an API or run a function, use an Activity instead. +- **Spawning unbounded children in a loop.** Starting thousands of Child Workflows without batching can overwhelm the Temporal Service and bloat the parent's event history. Use fixed-size batches or a sliding window. +- **Ignoring the Parent Close Policy.** The default policy is TERMINATE, which kills children when the parent closes. If children must outlive the parent, set the policy to ABANDON explicitly. +- **Using synchronous calls when async is needed.** Calling a Child Workflow synchronously blocks the parent until the child completes. 
For long-running children, use the async API (`Async.function()` in Java, `startChild()` in TypeScript, `start_child_workflow()` in Python, or collect Futures without calling `.Get()` in Go) to avoid stalling the parent. +- **Omitting Workflow IDs.** Without explicit Workflow IDs, you lose the ability to deduplicate or look up Child Workflows by a meaningful identifier. Generate deterministic IDs based on business keys. +- **Not handling child failures.** Child Workflow failures propagate as `ChildWorkflowFailure` exceptions. If you do not catch and handle them, the parent Workflow fails as well. + +## Related patterns + +- **[Parallel Execution](/design-patterns/parallel-execution)**: Running multiple children concurrently. +- **[Continue-As-New](/design-patterns/continue-as-new)**: Child Workflows can use Continue-As-New independently. +- **[Saga Pattern](/design-patterns/saga-pattern)**: Children as compensatable transactions. + +## Sample code + +**Java:** +- [HelloChild](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/hello/HelloChild.java) — Basic synchronous Child Workflow. +- [Async Child Workflow](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/asyncchild) — Asynchronous child with ABANDON policy. +- [Async Untyped Child](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/asyncuntypedchild) — Untyped async Child Workflow. + +**TypeScript:** +- [Child Workflows](https://github.com/temporalio/samples-typescript/tree/main/child-workflows) — Parent and child Workflow using `executeChild` and `startChild`. + +**Python:** +- [Child Workflows](https://github.com/temporalio/samples-python/tree/main/hello/hello_child_workflow.py) — Basic Child Workflow using `execute_child_workflow`. + +**Go:** +- [Child Workflow](https://github.com/temporalio/samples-go/tree/main/child-workflow) — Synchronous and async Child Workflow patterns. 
diff --git a/docs/design-patterns/continue-as-new.mdx b/docs/design-patterns/continue-as-new.mdx new file mode 100644 index 0000000000..963cac0560 --- /dev/null +++ b/docs/design-patterns/continue-as-new.mdx @@ -0,0 +1,438 @@ +--- +id: continue-as-new +title: "Continue-As-New Pattern" +sidebar_label: "Continue-As-New" +description: "Prevents unbounded history growth by completing the current execution and starting a new one with fresh history." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +The Continue-As-New pattern allows long-running Workflows to reset their event history by completing the current execution and immediately starting a new one with an empty event history, carrying forward only the state you pass as arguments. +This prevents Workflows from hitting Temporal's event history limits while maintaining logical continuity, making it essential for periodic tasks, infinite loops, and Workflows that process unbounded data streams. +By archiving old event history and starting fresh, Continue-As-New also reduces active storage costs — only the current execution's history remains in active storage while previous runs are moved to cheaper archived storage. + +## Problem + +In long-running Workflows, you often need to execute periodic tasks indefinitely, process unbounded streams of data without accumulating history, implement infinite loops that run for months or years, avoid hitting the 50,000 event history limit, and maintain Workflow state across logical restarts. + +Without Continue-As-New, you must manually stop and restart Workflows (losing continuity), risk hitting history limits and Workflow failures, implement external orchestration to manage Workflow lifecycle, and accept degraded performance as history grows large. + +## Solution + +Continue-As-New completes the current Workflow execution and atomically starts a new one with the same Workflow ID. 
+The new execution begins with a fresh event history while preserving logical continuity. 
+You pass state as arguments to the new execution. + +```mermaid +flowchart LR + Start([Start]) --> Exec1[Execution 1
Process batch] + Exec1 --> Check1{History
large?} + Check1 -->|Yes| CAN1[Continue-As-New] + Check1 -->|No| More1{More
data?} + More1 -->|Yes| Exec1 + More1 -->|No| End1([Complete]) + + CAN1 -.->|Fresh history
Same Workflow ID| Exec2[Execution 2
Process batch] + Exec2 --> Check2{History
large?} + Check2 -->|Yes| CAN2[Continue-As-New] + Check2 -->|No| More2{More
data?} + More2 -->|Yes| Exec2 + More2 -->|No| End2([Complete]) + + CAN2 -.-> Etc[...] + classDef highlight stroke-width:1px + class CAN1,CAN2 highlight +``` + +The following describes each step in the diagram: + +1. The Workflow starts Execution 1 and processes a batch of data. +2. After each batch, the Workflow checks whether the history is getting large. +3. If the history is large, the Workflow calls Continue-As-New, which starts Execution 2 with a fresh history and the same Workflow ID. +4. If the history is not large and more data remains, the Workflow loops and processes the next batch. +5. If no more data remains, the Workflow completes normally. + +The following implementation shows a data processor that passes a cursor and a running total across executions. +When the batch is full (indicating more data to process), the Workflow calls Continue-As-New with the updated state: + + + + +```python +# workflows.py +from datetime import timedelta +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities import fetchBatch, process + +BATCH_SIZE = 100 + +@workflow.defn +class DataProcessorWorkflow: + @workflow.run + async def run(self, cursor: str, total_processed: int = 0) -> None: + batch = await workflow.execute_activity( + fetchBatch, args=[cursor, BATCH_SIZE], + start_to_close_timeout=timedelta(seconds=60), + ) + + for record in batch: + await workflow.execute_activity( + process, record, + start_to_close_timeout=timedelta(seconds=60), + ) + total_processed += 1 + cursor = record.id + + if len(batch) == BATCH_SIZE: + # More data to process - continue as new with updated state + workflow.continue_as_new(cursor, total_processed) + # Otherwise complete normally +``` + + + + +```go +// data_processor_workflow.go +const BatchSize = 100 + +func DataProcessorWorkflow(ctx workflow.Context, cursor string, totalProcessed int) error { + ao := workflow.ActivityOptions{StartToCloseTimeout: time.Minute} + ctx = 
workflow.WithActivityOptions(ctx, ao) + + var batch []Record + err := workflow.ExecuteActivity(ctx, FetchBatch, cursor, BatchSize).Get(ctx, &batch) + if err != nil { + return err + } + + for _, record := range batch { + err = workflow.ExecuteActivity(ctx, Process, record).Get(ctx, nil) + if err != nil { + return err + } + totalProcessed++ + cursor = record.ID + } + + if len(batch) == BatchSize { + // More data to process - continue as new with updated state + return workflow.NewContinueAsNewError(ctx, DataProcessorWorkflow, cursor, totalProcessed) + } + // Otherwise complete normally + return nil +} +``` + + + + +```java +// DataProcessorWorkflowImpl.java +@WorkflowInterface +public interface DataProcessorWorkflow { + @WorkflowMethod + void processData(String cursor, int totalProcessed); +} + +public class DataProcessorWorkflowImpl implements DataProcessorWorkflow { + private static final int BATCH_SIZE = 100; + + @Override + public void processData(String cursor, int totalProcessed) { + List batch = activities.fetchBatch(cursor, BATCH_SIZE); + + for (Record record : batch) { + activities.process(record); + totalProcessed++; + cursor = record.getId(); + } + + if (batch.size() == BATCH_SIZE) { + // More data to process - continue as new with updated state + DataProcessorWorkflow continueAsNew = + Workflow.newContinueAsNewStub(DataProcessorWorkflow.class); + continueAsNew.processData(cursor, totalProcessed); + } + // Otherwise complete normally + } +} +``` + + + + +```typescript +// workflows.ts +import { continueAsNew, proxyActivities } from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { fetchBatch, process } = proxyActivities({ + startToCloseTimeout: '1 minute', +}); + +const BATCH_SIZE = 100; + +export async function dataProcessorWorkflow( + cursor: string, + totalProcessed: number = 0 +): Promise { + const batch = await fetchBatch(cursor, BATCH_SIZE); + + for (const record of batch) { + await process(record); + 
totalProcessed++; + cursor = record.id; + } + + if (batch.length === BATCH_SIZE) { + // More data to process - continue as new with updated state + await continueAsNew(cursor, totalProcessed); + } + // Otherwise complete normally +} +``` + + + + +The Workflow fetches a batch of records, processes each one, and updates the cursor. +If the batch is full, the Workflow triggers Continue-As-New with the updated cursor and total. +In Java, `Workflow.newContinueAsNewStub()` creates a typed stub. +In TypeScript, `continueAsNew()` throws a special error that the runtime intercepts. +In Python, `workflow.continue_as_new()` immediately stops the current execution and starts a new one. +In Go, `workflow.NewContinueAsNewError()` returns a special error that signals the runtime to continue as new. +If the batch is smaller than `BATCH_SIZE`, no more data remains and the Workflow completes. + +## Implementation + +### Using the continue-as-new suggestion + +Instead of tracking iteration counts manually, you can use the SDK's built-in suggestion to let Temporal tell you when the history is getting large: + + + + + +```python +# workflows.py +from datetime import timedelta +from temporalio import workflow + +@workflow.defn +class DataProcessorWorkflow: + @workflow.run + async def run(self, cursor: str, total_processed: int = 0) -> None: + batch = await workflow.execute_activity( + fetchBatch, args=[cursor, BATCH_SIZE], + start_to_close_timeout=timedelta(seconds=60), + ) + + for record in batch: + await workflow.execute_activity( + process, record, + start_to_close_timeout=timedelta(seconds=60), + ) + total_processed += 1 + cursor = record.id + + # Check if history is getting large + if workflow.info().is_continue_as_new_suggested(): + workflow.continue_as_new(cursor, total_processed) + + # Continue processing or complete +``` + + + + +```go +// data_processor_workflow.go +func DataProcessorWorkflow(ctx workflow.Context, cursor string, totalProcessed int) error { + ao := 
workflow.ActivityOptions{StartToCloseTimeout: time.Minute} + ctx = workflow.WithActivityOptions(ctx, ao) + + var batch []Record + err := workflow.ExecuteActivity(ctx, FetchBatch, cursor, BatchSize).Get(ctx, &batch) + if err != nil { + return err + } + + for _, record := range batch { + err = workflow.ExecuteActivity(ctx, Process, record).Get(ctx, nil) + if err != nil { + return err + } + totalProcessed++ + cursor = record.ID + + // Check if history is getting large + if workflow.GetInfo(ctx).GetContinueAsNewSuggested() { + return workflow.NewContinueAsNewError(ctx, DataProcessorWorkflow, cursor, totalProcessed) + } + } + + // Continue processing or complete + return nil +} +``` + + + + +```java +// DataProcessorWorkflowImpl.java +public class DataProcessorWorkflowImpl implements DataProcessorWorkflow { + + @Override + public void processData(String cursor, int totalProcessed) { + List<Record> batch = activities.fetchBatch(cursor, BATCH_SIZE); + + for (Record record : batch) { + activities.process(record); + totalProcessed++; + cursor = record.getId(); + + // Check if history is getting large + if (Workflow.getInfo().isContinueAsNewSuggested()) { + DataProcessorWorkflow continueAsNew = + Workflow.newContinueAsNewStub(DataProcessorWorkflow.class); + continueAsNew.processData(cursor, totalProcessed); + return; + } + } + + // Continue processing or complete + } +} +``` + + + + +```typescript +// workflows.ts +import { continueAsNew, workflowInfo, proxyActivities } from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { fetchBatch, process } = proxyActivities<typeof activities>({ + startToCloseTimeout: '1 minute', +}); + +export async function dataProcessorWorkflow( + cursor: string, + totalProcessed: number = 0 +): Promise<void> { + const batch = await fetchBatch(cursor, BATCH_SIZE); + + for (const record of batch) { + await process(record); + totalProcessed++; + cursor = record.id; + + // Check if history is getting large + if 
(workflowInfo().continueAsNewSuggested) { + 
await continueAsNew(cursor, totalProcessed); + } + } + + // Continue processing or complete +} +``` + + + + +Each SDK provides a method to check if the history is approaching the limit: +- Java: `Workflow.getInfo().shouldContinueAsNew()` +- TypeScript: `workflowInfo().continueAsNewSuggested` +- Python: `workflow.info().is_continue_as_new_suggested()` +- Go: `workflow.GetInfo(ctx).GetContinueAsNewSuggested()` + +This approach is more reliable than a fixed iteration count because it accounts for the actual number of events generated per iteration. + +## When to use + +The Continue-As-New pattern is a good fit for periodic Workflows running indefinitely (cron-like behavior), processing unbounded data streams, long-running Workflows with repetitive patterns, Workflows that accumulate state over many iterations, and preventing event history from growing too large. + +It is not a good fit for short-lived Workflows (under 1000 events), Workflows that naturally complete, one-time batch processing, or Workflows that require full history for audit purposes. + +## Benefits and trade-offs + +Continue-As-New allows you to run Workflows indefinitely without history limits. +Fresh history keeps Workflow execution fast. +It reduces active storage costs by archiving old event history — more aggressive iteration limits mean more frequent archiving, keeping active storage minimal. +The transition is atomic with no gap between old and new execution. +You pass state as arguments to the new execution, and the Workflow ID remains the same, maintaining logical continuity for Queries and Signals. + +The trade-offs to consider are that previous execution history is archived separately. +You must explicitly pass state as arguments (manual state management). +Queries only see the current execution's state. +Debugging requires tracing across multiple execution runs. +You cannot undo Continue-As-New once triggered. 
+ +## Comparison with alternatives + +| Approach | History reset | State continuity | Use case | +| :--- | :--- | :--- | :--- | +| Continue-As-New | Yes | Manual | Long-running periodic | +| Child Workflows | Per child | Automatic | Parallel processing | +| Cron Schedule | Yes | None | Fixed schedule tasks | +| Manual Restart | Yes | None | One-time Workflows | + +## Best practices + +- **Use the continue-as-new suggestion.** Check the SDK's built-in suggestion (`shouldContinueAsNew()` in Java, `continueAsNewSuggested` in TypeScript, `is_continue_as_new_suggested()` in Python, `GetContinueAsNewSuggested()` in Go) to automatically detect when history is large. +- **Set aggressive iteration limits.** Continue as new every 100–1000 iterations to prevent history buildup and reduce storage costs. Balance frequency with the overhead of creating new executions. +- **Pass minimal state.** Only pass necessary state to keep arguments small. +- **Add exit Signals.** Allow graceful termination via Signals. +- **Log transitions.** Log when continuing as new for observability. +- **Version carefully.** Ensure new code can handle state from old executions. +- **Monitor history size.** Track event count and continue before hitting limits. +- **Use typed APIs.** In Java, prefer `newContinueAsNewStub()` over untyped `continueAsNew()`. In TypeScript, use the generic `continueAsNew()` for type safety. +- **Consider cron.** For fixed Schedules, use Temporal Schedules instead. +- **Test state transfer.** Verify state correctly passes between executions. + +## Common pitfalls + +- **Passing too much state.** Continue-As-New arguments are serialized into the first event of the new execution. Large payloads slow down startup and increase storage costs. Pass only the minimal state needed. +- **Forgetting to drain Signals before continuing.** Any Signals received but not yet processed are lost when Continue-As-New starts a fresh execution. 
Drain your Signal channel and carry pending Signals forward as arguments. +- **Using a fixed iteration count instead of the built-in suggestion.** Different Workflow paths generate different numbers of events per iteration. A fixed count may continue too early or too late. Use the SDK's built-in continue-as-new suggestion for accurate detection. +- **Not versioning state arguments.** When you change the Workflow method signature or state shape, in-flight executions may continue as new into code that cannot deserialize the old arguments. Use versioning or backward-compatible argument types. +- **Calling Continue-As-New from a Signal handler.** Triggering Continue-As-New inside a Signal handler can cause Signal loss because the handler may preempt other pending Signals. Always set a flag in the Signal handler and call Continue-As-New from the main Workflow thread, where all Signal handlers are guaranteed to have run first. +- **Not accounting for Child Workflows.** Continue-As-New closes the current Workflow Execution, which triggers the Parent Close Policy on all Child Workflows. By default, children are terminated. If children must survive, set `ParentClosePolicy` to `ABANDON` and pass their Workflow IDs to the new execution so you can interact with them via external handles. +- **Caching Run IDs for external interaction.** Continue-As-New creates a new Run ID. If external callers cache the old Run ID for Signals or Queries, they will get a "workflow execution already completed" error. Always use Workflow ID without a Run ID (or an empty Run ID) so the request routes to the currently running execution. +- **Catching the Continue-As-New exception.** In TypeScript and Python, Continue-As-New is implemented by throwing a special exception. Wrapping it in a try-catch or try-except can suppress the transition and cause unexpected behavior. Let the exception propagate unhandled. In Go, return the `ContinueAsNewError` from the Workflow function without wrapping it. 
+ +## Related patterns + +- **[Entity Workflow](/design-patterns/entity-workflow)**: Long-lived Workflows that model business entities, relying on Continue-As-New to prevent unbounded history. +- **[Child Workflows](/design-patterns/child-workflows)**: Decomposing work into sub-Workflows. Consider Parent Close Policy when combining with Continue-As-New. +- **[Signal with Start](/design-patterns/signal-with-start)**: Idempotent Workflow start with an initial Signal — use Workflow ID without Run ID to interact with continued executions. + +## Sample code + +**Java:** +- [Cron Workflow](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/hello/HelloCron.java) — Periodic Workflow using Continue-As-New. +- [Heartbeating Activity Batch](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/batch/heartbeatingactivity) — Batch processing with Continue-As-New for large datasets. + +**TypeScript:** +- [Continue-As-New](https://github.com/temporalio/samples-typescript/tree/main/continue-as-new) — Basic Continue-As-New with `continueAsNew()`. +- [Safe Message Handlers](https://github.com/temporalio/samples-typescript/tree/main/message-passing/safe-message-handlers) — Entity Workflow with Continue-As-New and `continueAsNewSuggested`. + +**Python:** +- [Safe Message Handlers](https://github.com/temporalio/samples-python/tree/main/message_passing/safe_message_handlers) — Entity Workflow with Continue-As-New and `is_continue_as_new_suggested()`. + +**Go:** +- [Safe Message Handlers](https://github.com/temporalio/samples-go/tree/main/safe_message_handler) — Entity Workflow with Continue-As-New and `GetContinueAsNewSuggested()`. + +## References + +- [Temporal Docs: Continue-As-New](https://docs.temporal.io/workflow-execution/continue-as-new) — Official documentation on the Continue-As-New mechanism. 
+- [Temporal Blog: How to Keep a Workflow Running Indefinitely Long](https://temporal.io/blog/very-long-running-workflows) — Detailed guidance on managing Workflows that run forever. +- [Temporal Blog: Workflows as Actors](https://temporal.io/blog/workflows-as-actors-is-it-really-possible) — Using Continue-As-New with the Entity Workflow / Actor pattern. diff --git a/docs/design-patterns/delayed-callback.mdx b/docs/design-patterns/delayed-callback.mdx new file mode 100644 index 0000000000..6800ae0373 --- /dev/null +++ b/docs/design-patterns/delayed-callback.mdx @@ -0,0 +1,757 @@ +--- +id: delayed-callback +title: "Delayed Callback (Webhooks)" +sidebar_label: "Delayed Callback" +description: "Durable webhook and callback handling: fire delayed outbound callbacks with durable timers and complete work asynchronously via task tokens." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview +Delayed Callback patterns in Temporal are exactly what they sound like: manage delayed completion notification between two systems. They leverage durable timers to make this extremely simple and defined only in code. + +### Webhooks +[Webhooks](https://www.redhat.com/en/topics/automation/what-is-a-webhook) are easy to build and configure with the Delayed Callback patterns. There are two recommended patterns for integrating Temporal Workflows to communicate through HTTP callbacks: +- waiting and receiving inbound webhooks +- firing delayed outbound callbacks + +Included is a pattern for completing activities asynchronously via a callback token and some guidance on when to use it. + +Temporal lets you build durable, observable webhook-based integrations without ad-hoc queues, cron jobs, or fragile state machines. + + +## Problem + +Waiting for another system without Durable Execution is hard. You must implement your own: +- durable timers (e.g. 
with a cron per timer) +- retry queues +- state stores +- reconciliation jobs + +All just for a simple delayed callback. + +Webhook-based integrations have several failure modes that are difficult to handle without durable infrastructure: + +- An inbound message fires before the target system is ready or in the proper state, causing the message to be ignored or lost. +- An outbound HTTP callback fails midway through a multi-step cross-system process, and there is no record of what was sent, retried, or skipped. +- An external job (payment processor, ML pipeline, etc.) completes and calls back, but the in-process state that was waiting for the callback has been lost to poor state management or an application restart. +- A delayed callback is scheduled via a cron job or message queue, but the scheduling system and the application process have no shared recovery mechanism. + + +## Solution + +Temporal makes solving these problems simple with the use of [durable timers](https://docs.temporal.io/workflow-execution/timers-delays) in a workflow. [Signals or Updates](https://docs.temporal.io/encyclopedia/workflow-message-passing/) are used to send events to a Workflow. All of these are just Temporal code - no extra infrastructure to deploy or manage. + +- **Pattern 1 — Inbound Callback:** Route the incoming HTTP request to a Temporal Signal-with-Start. The Workflow is the durable recipient; if it is not running yet, Temporal creates it and delivers the Signal atomically. +- **Pattern 2 — Delayed Outbound Callbacks:** Use a durable `workflow.sleep()` to set the proper delay before executing the outbound HTTP activity. The sleep timer survives worker and server restarts; the activity retries automatically on failure. +- **Pattern 3 — Async Activity Completion:** The activity records a task token before returning, and your callback endpoint uses that token to complete the activity from the outside. The Workflow resumes with the result as if the activity had returned normally. 
+ +### Pattern 1 — Inbound Webhooks + +```mermaid +sequenceDiagram + participant E as External Service + participant A as Your API Handler + participant T as Temporal Cluster + participant W as Workflow + + E->>A: POST /webhook (payload) + A->>T: signal_with_start(workflow_id, "payment_received", payload) + Note over T: Atomic: start if not running, then signal + T->>W: Start workflow (if not running) + deliver signal + W->>W: Wake up, process payload + W->>W: Execute follow-on activities +``` + +The following describes each step in the diagram: + +1. An external service sends an HTTP POST to your API handler — this is the inbound webhook. +2. Your handler calls `signal_with_start` on the Temporal client with the Workflow ID and payload. The handler can return an HTTP 200 immediately after this call; Temporal takes responsibility for delivery. +3. Temporal atomically starts the Workflow if it is not already running, then delivers the Signal — no race condition between "start" and "signal." +4. The Workflow wakes up exactly where it was blocked waiting (or begins execution if newly created) and processes the payload. + +### Pattern 2 — Delayed Outbound Callbacks + +```mermaid +sequenceDiagram + participant C as Client + participant W as Workflow + participant T as Temporal Cluster + participant E as External Service + + C->>T: Start DelayedCallbackWorkflow(url, data, delay) + T->>W: Schedule first task + W->>T: workflow.sleep(delay) + Note over T: Durable timer (survives restarts) + T->>W: Timer fires after delay + W->>W: execute_activity(send_callback, url, data) + W->>E: POST callback_url (data) + E-->>W: HTTP 200 + W-->>C: Workflow complete +``` + +The following describes each step in the diagram: + +1. The client starts the Workflow with a target URL, payload, and delay duration. +2. The Workflow calls `workflow.sleep()`. This stores a durable timer in the Temporal cluster — not in process memory. +3. If any worker restarts during the delay, the timer continues. 
When it fires, Temporal schedules the next Workflow Task on a healthy worker. +4. The Workflow executes an activity that performs the outbound HTTP POST. If the POST fails, Temporal retries it with the configured retry policy. + +### Pattern 3 — Async Activity Completion + +```mermaid +sequenceDiagram + participant W as Workflow + participant A as Activity + participant E as External Service + participant CB as Your Callback API + + W->>A: execute_activity(submit_job) + A->>E: Submit job, record task_token + A-->>W: Return (workflow paused waiting) + Note over W: Waiting for external completion... + E->>CB: POST /callback (result) + CB->>W: complete_async_activity(task_token, result) + W->>W: Resume with result +``` + +The following describes each step in the diagram: + +1. The Workflow executes an activity that submits a job to an external system. +2. The activity records its task token (an opaque handle Temporal provides) alongside the submitted job ID — for example, in a database row. +3. The activity returns without waiting; the Workflow is now paused waiting for the activity to complete externally. +4. When the external system finishes, it calls your callback endpoint with the result. +5. Your callback handler retrieves the task token from the database and calls `complete_async_activity` on the Temporal client. The Workflow resumes immediately with the result. + +## Implementation + + + +### Pattern 1 — Inbound Webhooks via Signal-with-Start + +The following examples show an `OrderWorkflow` that waits for a `payment_received` Signal. The starter uses Signal-with-Start to atomically create the Workflow and deliver the payment signal in one call — exactly what your HTTP handler would do on a real POST. 
+ + + + +```python +# workflows.py +import asyncio +from dataclasses import dataclass +from datetime import timedelta +from typing import Optional + +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities import process_payment + from shared import OrderInput, PaymentPayload + + +@workflow.defn +class OrderWorkflow: + def __init__(self) -> None: + self._payment: Optional[PaymentPayload] = None + + @workflow.run + async def run(self, order: OrderInput) -> str: + workflow.logger.info(f"Order {order.order_id}: waiting for payment webhook") + + # Block until the inbound webhook signal arrives (or timeout after 24 hours) + await workflow.wait_condition( + lambda: self._payment is not None, + timeout=timedelta(hours=24), + ) + + if self._payment is None: + return f"Order {order.order_id}: timed out waiting for payment" + + result = await workflow.execute_activity( + process_payment, + self._payment, + start_to_close_timeout=timedelta(seconds=30), + ) + return result + + @workflow.signal + async def payment_received(self, payload: PaymentPayload) -> None: + workflow.logger.info(f"Payment signal received: {payload.payment_id}") + self._payment = payload +``` + + + + +```java +// OrderWorkflow.java +import io.temporal.activity.ActivityOptions; +import io.temporal.workflow.SignalMethod; +import io.temporal.workflow.Workflow; +import io.temporal.workflow.WorkflowInterface; +import io.temporal.workflow.WorkflowMethod; + +import java.time.Duration; + +@WorkflowInterface +public interface OrderWorkflow { + @WorkflowMethod + String run(Shared.OrderInput order); + + @SignalMethod + void paymentReceived(Shared.PaymentPayload payload); + + final class Impl implements OrderWorkflow { + private final Activities activities = Workflow.newActivityStub( + Activities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(30)) + .build()); + + private Shared.PaymentPayload payment = null; + + @Override + public String 
run(Shared.OrderInput order) { + System.out.println("Order " + order.orderId() + ": waiting for payment webhook"); + + // Block until the inbound webhook signal arrives (or timeout after 24 hours) + boolean received = Workflow.await(Duration.ofHours(24), () -> payment != null); + if (!received) { + return "Order " + order.orderId() + ": timed out waiting for payment"; + } + + return activities.processPayment(payment); + } + + @Override + public void paymentReceived(Shared.PaymentPayload payload) { + System.out.println("Payment signal received: " + payload.paymentId()); + this.payment = payload; + } + } +} +``` + + + + +```go +// workflows.go +package main + +import ( + "time" + + "go.temporal.io/sdk/workflow" +) + +func OrderWorkflow(ctx workflow.Context, order OrderInput) (string, error) { + workflow.GetLogger(ctx).Info("Order waiting for payment webhook", "order_id", order.OrderID) + + var payment *PaymentPayload + + // Block until the inbound webhook signal arrives (or timeout after 24 hours) + selector := workflow.NewSelector(ctx) + timerFired := false + + timerCtx, cancelTimer := workflow.WithCancel(ctx) + timer := workflow.NewTimer(timerCtx, 24*time.Hour) + + signalCh := workflow.GetSignalChannel(ctx, SignalName) + selector.AddReceive(signalCh, func(ch workflow.ReceiveChannel, more bool) { + ch.Receive(ctx, &payment) + cancelTimer() + }) + selector.AddFuture(timer, func(f workflow.Future) { + if err := f.Get(ctx, nil); err == nil { + timerFired = true + } + }) + + selector.Select(ctx) + + if timerFired || payment == nil { + return "Order " + order.OrderID + ": timed out waiting for payment", nil + } + + ao := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Second, + }) + + var result string + err := workflow.ExecuteActivity(ao, ProcessPayment, payment).Get(ao, &result) + return result, err +} +``` + + + + +The Starter uses Signal-with-Start to atomically create the Workflow (if needed) and deliver the simulated 
webhook payload: + + + + +```python +# starter.py +import asyncio +import time + +from temporalio.client import Client + +from shared import TASK_QUEUE, OrderInput, PaymentPayload +from workflows import OrderWorkflow + + +async def main() -> None: + client = await Client.connect("localhost:7233") + + order_id = f"order-{int(time.time() * 1000)}" + order = OrderInput(order_id=order_id, amount=99.99) + payment = PaymentPayload(payment_id=f"pay-{int(time.time() * 1000)}", amount=99.99) + + print(f"Sending webhook for order {order_id}") + + # Signal-with-Start: atomically starts the workflow (if not running) and + # delivers the payment signal — this is exactly what your HTTP handler would do. + handle = await client.start_workflow( + OrderWorkflow.run, + order, + id=f"order-{order_id}", + task_queue=TASK_QUEUE, + start_signal="payment_received", + start_signal_args=[payment], + ) + print(f"Webhook signal sent: {payment.payment_id}") + + result = await handle.result() + print(f"Order completed: {result}") + + +if __name__ == "__main__": + asyncio.run(main()) +``` + + + + +```java +// Starter.java +import io.temporal.client.BatchRequest; +import io.temporal.client.WorkflowClient; +import io.temporal.client.WorkflowOptions; +import io.temporal.client.WorkflowStub; +import io.temporal.serviceclient.WorkflowServiceStubs; + +public class Starter { + public static void main(String[] args) throws Exception { + WorkflowServiceStubs service = WorkflowServiceStubs.newLocalServiceStubs(); + WorkflowClient client = WorkflowClient.newInstance(service); + + String orderId = "order-" + System.currentTimeMillis(); + Shared.PaymentPayload payment = new Shared.PaymentPayload( + "pay-" + System.currentTimeMillis(), 99.99); + + OrderWorkflow workflow = client.newWorkflowStub( + OrderWorkflow.class, + WorkflowOptions.newBuilder() + .setTaskQueue(Shared.TASK_QUEUE) + .setWorkflowId("order-" + orderId) + .build()); + + System.out.println("Sending webhook for order " + orderId); + + // 
Signal-with-Start: atomically starts the workflow (if not running) and + // delivers the payment signal — this is exactly what your HTTP handler would do. + BatchRequest request = client.newSignalWithStartRequest(); + request.add(workflow::run, new Shared.OrderInput(orderId, 99.99)); + request.add(workflow::paymentReceived, payment); + client.signalWithStart(request); + System.out.println("Webhook signal sent: " + payment.paymentId()); + + // Wait for the workflow to complete + String result = WorkflowStub.fromTyped(workflow).getResult(String.class); + System.out.println("Order completed: " + result); + + System.exit(0); + } +} +``` + + + + +```go +// starter.go +package main + +import ( + "context" + "fmt" + "log" + "time" + + "go.temporal.io/sdk/client" +) + +func main() { + c, err := client.Dial(client.Options{HostPort: "localhost:7233"}) + if err != nil { + log.Fatalln("Unable to create client:", err) + } + defer c.Close() + + ctx := context.Background() + orderID := fmt.Sprintf("order-%d", time.Now().UnixMilli()) + workflowID := "order-" + orderID + + order := OrderInput{OrderID: orderID, Amount: 99.99} + payment := PaymentPayload{ + PaymentID: fmt.Sprintf("pay-%d", time.Now().UnixMilli()), + Amount: 99.99, + } + + fmt.Printf("Sending webhook for order %s\n", orderID) + + // Signal-with-Start: atomically starts the workflow (if not running) and + // delivers the payment signal — this is exactly what your HTTP handler would do. 
+ we, err := c.SignalWithStartWorkflow( + ctx, + workflowID, + SignalName, + payment, + client.StartWorkflowOptions{ + ID: workflowID, + TaskQueue: TaskQueue, + }, + OrderWorkflow, + order, + ) + if err != nil { + log.Fatalln("SignalWithStart failed:", err) + } + fmt.Printf("Webhook signal sent: %s\n", payment.PaymentID) + + var result string + if err := we.Get(ctx, &result); err != nil { + log.Fatalln("Workflow result failed:", err) + } + fmt.Printf("Order completed: %s\n", result) +} +``` + + + + +### Pattern 2 — Delayed Outbound Callbacks + +Use a durable `workflow.sleep()` before the outbound activity. The timer is stored in the Temporal cluster, not in process memory — it survives any number of worker restarts. + + + + +```python +# delayed_callback_workflow.py +from datetime import timedelta + +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities import send_webhook_callback + from shared import CallbackInput + + +@workflow.defn +class DelayedCallbackWorkflow: + @workflow.run + async def run(self, input: CallbackInput) -> str: + workflow.logger.info( + f"Sleeping {input.delay_seconds}s before calling {input.callback_url}" + ) + + # Durable sleep — survives worker restarts, server restarts, everything + await workflow.sleep(timedelta(seconds=input.delay_seconds)) + + # Fire the outbound callback; Temporal retries on HTTP failure + result = await workflow.execute_activity( + send_webhook_callback, + input, + start_to_close_timeout=timedelta(minutes=5), + ) + workflow.logger.info(f"Callback delivered to {input.callback_url}") + return result +``` + + + + +```java +// DelayedCallbackWorkflow.java +import io.temporal.activity.ActivityOptions; +import io.temporal.workflow.Workflow; +import io.temporal.workflow.WorkflowInterface; +import io.temporal.workflow.WorkflowMethod; + +import java.time.Duration; + +@WorkflowInterface +public interface DelayedCallbackWorkflow { + @WorkflowMethod + void run(Shared.CallbackInput 
input); + + final class Impl implements DelayedCallbackWorkflow { + private final Activities activities = Workflow.newActivityStub( + Activities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(5)) + .build()); + + @Override + public void run(Shared.CallbackInput input) { + System.out.println("Sleeping " + input.delaySeconds() + "s before calling " + input.callbackUrl()); + + // Durable sleep — survives worker restarts, server restarts, everything + Workflow.sleep(Duration.ofSeconds(input.delaySeconds())); + + // Fire the outbound callback; Temporal retries on HTTP failure + activities.sendWebhookCallback(input); + System.out.println("Callback delivered to " + input.callbackUrl()); + } + } +} +``` + + + + +```go +// delayed_callback.go (add to workflows.go) +package main + +import ( + "time" + + "go.temporal.io/sdk/workflow" +) + +func DelayedCallbackWorkflow(ctx workflow.Context, input CallbackInput) error { + workflow.GetLogger(ctx).Info("Sleeping before callback", + "delay", input.DelaySeconds, "url", input.CallbackURL) + + // Durable sleep — survives worker restarts, server restarts, everything + if err := workflow.Sleep(ctx, time.Duration(input.DelaySeconds)*time.Second); err != nil { + return err + } + + // Fire the outbound callback; Temporal retries on HTTP failure + ao := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Minute, + }) + return workflow.ExecuteActivity(ao, SendWebhookCallback, input).Get(ao, nil) +} +``` + + + + +### Pattern 3 — Async Activity Completion + +The activity records a task token before returning. Your callback endpoint later uses that token to complete the activity and unblock the Workflow. 
+ + + + +```python +# async_completion_activities.py +import asyncio +from temporalio import activity +from temporalio.client import Client + +from shared import JobInput, JobResult + + +@activity.defn +async def submit_job(input: JobInput) -> str: + """Submit job to external system and return immediately. + The activity completes asynchronously when the callback arrives.""" + + # Get the task token — this is the claim ticket + task_token = activity.info().task_token + + # Submit the job to the external system, persisting the task token + # so your callback handler can retrieve it later + job_id = await external_service.submit( + payload=input.payload, + callback_url=f"https://your-api.example.com/callback", + task_token_hex=task_token.hex(), # store alongside job_id + ) + activity.logger.info(f"Job {job_id} submitted; waiting for async callback") + + # Tell Temporal not to mark the activity complete yet; this raises the + # SDK's internal CompleteAsyncError, so do not catch it + activity.raise_complete_async() + + +# In your webhook callback handler (e.g., FastAPI route): +async def handle_callback(job_id: str, result: str, task_token_hex: str) -> None: + token = bytes.fromhex(task_token_hex) + client = await Client.connect("localhost:7233") + handle = client.get_async_activity_handle(task_token=token) + await handle.complete(result) + # Workflow resumes with `result` immediately +``` + + + + +```java +// AsyncCompletionActivity.java — submit side +import io.temporal.activity.Activity; +import io.temporal.activity.ActivityExecutionContext; +import io.temporal.activity.ActivityInterface; +import io.temporal.activity.ActivityMethod; +import io.temporal.client.ActivityCompletionClient; + +@ActivityInterface +public interface AsyncJobActivity { + @ActivityMethod + String submitJob(Shared.JobInput input); +} + +public class AsyncJobActivityImpl implements AsyncJobActivity { + private final ActivityCompletionClient completionClient; + + public AsyncJobActivityImpl(ActivityCompletionClient completionClient) { + this.completionClient = 
completionClient; + } + + @Override + public String submitJob(Shared.JobInput input) { + ActivityExecutionContext context = Activity.getExecutionContext(); + + // Get the task token — this is the claim ticket + byte[] taskToken = context.getTaskToken(); + + // Submit to external system, persisting the task token so the callback can retrieve it + String jobId = ExternalService.submit(input.payload(), taskToken); + System.out.println("Job " + jobId + " submitted; waiting for async callback"); + + // Tell Temporal not to mark the activity complete yet + context.doNotCompleteOnReturn(); + return null; // ignored + } +} + +// In your callback handler: +// completionClient.complete(taskToken, result); +``` + + + + +```go +// async_completion.go +package main + +import ( + "context" + "encoding/hex" + "fmt" + + "go.temporal.io/sdk/activity" + "go.temporal.io/sdk/client" +) + +// SubmitJob submits a job and returns immediately; the activity completes +// when the external callback arrives and calls CompleteActivity. +func SubmitJob(ctx context.Context, input JobInput) (string, error) { + info := activity.GetInfo(ctx) + + // Persist the task token so the callback handler can retrieve it by job ID + taskToken := info.TaskToken + jobID := fmt.Sprintf("job-%d", info.StartedTime.UnixMilli()) + + if err := persistTaskToken(jobID, hex.EncodeToString(taskToken)); err != nil { + return "", err + } + + fmt.Printf("Job %s submitted; waiting for async callback\n", jobID) + + // Return ErrResultPending to tell Temporal not to mark the activity complete yet + return "", activity.ErrResultPending +} + +// CompleteJob is called by your webhook callback handler to unblock the workflow. 
+func CompleteJob(ctx context.Context, c client.Client, jobID string, result string) error { + tokenHex, err := loadTaskToken(jobID) + if err != nil { + return err + } + taskToken, _ := hex.DecodeString(tokenHex) + return c.CompleteActivity(ctx, taskToken, result, nil) +} +``` + + + + +## When to use + +| Scenario | Pattern | +| :--- | :--- | +| External service POSTs a webhook (Workflow may or may not be running) | Signal-with-Start (Pattern 1) | +| Fire an outbound HTTP callback after a delay (seconds to years) | `workflow.sleep()` + activity (Pattern 2) | +| Submit a job to an external system; wait for its completion webhook | Async activity completion (Pattern 3) | +| Poll an external system that does not support webhooks | [Polling External Services](/design-patterns/polling) pattern | + +**Do not use** Pattern 2 for delays shorter than one second as you should not rely on [sub-second accuracy for timers](https://docs.temporal.io/workflow-execution/timers-delays). + +## Benefits and trade-offs + +**Benefits** + +- Retries and backoff on outbound HTTP calls come for free via Temporal's retry policy — no custom retry queues needed. +- Workflow state survives worker restarts, deploys, and infrastructure failures; durable timers continue without a running process. +- Every in-flight delayed callback is visible in the Temporal UI with its scheduled time, payload, and retry count. +- Signal-with-Start eliminates the race condition between "does the Workflow exist?" and "deliver the event." +- Async activity completion decouples job submission from job completion without polling. + +**Trade-offs** + +- Your inbound webhook handler requires a Temporal client; you need the client library in the service receiving webhooks. +- Task tokens for async completion must be persisted outside Temporal (e.g., in a database); if that store is unavailable the callback cannot complete. 
+- Workflow IDs must be deterministic and stable across webhook deliveries (order ID, user ID, etc.) so that Signal-with-Start routes to the correct instance. + +## Comparison with alternatives + +| Approach | Durability | Retries | Observability | Complexity | +| :--- | :--- | :--- | :--- | :--- | +| Temporal Signals + Workflows | Durable — survives restarts | Built-in, configurable | Full Temporal UI | Low — primitives compose naturally | +| Message queue (SQS, Kafka) | Durable (queue level) | Limited, manual DLQ | Requires external tooling | Medium — must handle ordering, DLQ | +| Redis `SET` + cron job | In-memory/volatile | Manual | None | High — cron + polling + error handling | +| Direct HTTP retry loops | Process lifetime only | Manual with `time.sleep` | None | High — fragile without process supervisor | + +## Best practices + +- Use stable, business-meaningful Workflow IDs (for example, `order-{order_id}`) so that Signal-with-Start and queries always route to the right Workflow. +- Return HTTP 200 from your inbound webhook handler as soon as you have called `signal_with_start`; do not wait for the Workflow to process the payload. +- Set a realistic `start_to_close_timeout` on outbound callback activities — long enough for the destination to respond, short enough to surface failures quickly. +- For async activity completion, persist the task token in a transactional write alongside the job submission so you never lose the token. +- Add a timeout to the `workflow.wait_condition` / `Workflow.await` call in inbound webhook Workflows so they do not wait indefinitely if the webhook is never delivered. +- For Pattern 3, use `heartbeat` if the external job takes longer than the activity heartbeat timeout to report back — heartbeating keeps the activity lease alive. + +## Common pitfalls + +- **Sending a plain Signal to a Workflow that does not exist** causes an error. Use Signal-with-Start when the Workflow may not be running. 
+- **Using `time.sleep()` (non-durable)** in Pattern 2 instead of `workflow.sleep()`. A process sleep disappears on restart; only Temporal's timer is durable. +- **Non-deterministic Workflow IDs** — generating IDs from timestamps or random values means Signal-with-Start creates a new Workflow on every webhook delivery instead of routing to the existing one. +- **Losing the task token** in Pattern 3. If the service storing task tokens is unavailable when the callback arrives, the activity can never complete. Store the token durably (database, not in-process cache). +- **Forgetting `doNotCompleteOnReturn()` / `CompleteAsyncError`** in Pattern 3. Without this, Temporal marks the activity as completed immediately when the function returns, before the external callback arrives. + +## Related patterns + +- [Signal with Start](/design-patterns/signal-with-start) — deeper coverage of the Signal-with-Start API for entity Workflows +- [Approval](/design-patterns/approval) — blocking wait for an external human decision using Signals +- [Polling External Services](/design-patterns/polling) — alternative to callbacks when the external system does not support webhooks +- [Delayed Start](/design-patterns/delayed-start) — defer Workflow execution to a future time without `workflow.sleep()` +- [Long-Running Activity](/design-patterns/long-running-activity) — heartbeating pattern for activities that run for extended periods +- **In the future** - Org-to-Org Nexus, stay tuned. diff --git a/docs/design-patterns/delayed-retry.mdx b/docs/design-patterns/delayed-retry.mdx new file mode 100644 index 0000000000..5b03c66414 --- /dev/null +++ b/docs/design-patterns/delayed-retry.mdx @@ -0,0 +1,268 @@ +--- +id: delayed-retry +title: "Delayed Retry" +sidebar_label: "Delayed Retry" +description: "Override the next retry interval for a specific failure using nextRetryDelay on ApplicationFailure." 
+--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +:::info[TLDR] +Throw an `ApplicationFailure` with `nextRetryDelay` set inside the Activity to **delay the next retry for a fixed time.** Use this when an error carries its own timing information — such as an HTTP 429 `Retry-After` header or a known maintenance window — so Temporal waits exactly as long as needed instead of following the generic backoff schedule. +::: + +## Overview + +The Delayed Retry pattern overrides the next retry interval for a specific failure by throwing an `ApplicationFailure` with a `nextRetryDelay` field set from inside the Activity. +Use it when a particular error carries information about how long to wait before retrying — such as a rate-limit response with a `Retry-After` header, or a known maintenance window with a fixed end time. + +## Problem + +A `RetryPolicy` applies a single backoff schedule to all failures from an Activity. +This works well for generic transient errors, but some errors carry specific information about how long the caller must wait: + +- An HTTP 429 response includes a `Retry-After: 60` header telling you exactly when the quota resets. +- A downstream system returns an error message saying "maintenance until 02:00 UTC" — a precise, known delay. +- A database error includes a lock timeout duration that indicates when the resource will be available. + +With a global `RetryPolicy`, you have two options, neither of which is what you need: set a short interval and retry too early (wasting quota and adding load), or set a long interval and wait longer than necessary. +What you need is to set the next retry delay *specific to this failure*, based on the information the error itself provides. + +## Solution + +Throw an `ApplicationFailure` with the `nextRetryDelay` field set from inside the Activity. +Temporal replaces the RetryPolicy-calculated interval for that single retry with the value you specify. 
+Subsequent retries (if the next attempt also fails) return to the normal RetryPolicy schedule unless you set `nextRetryDelay` again. + +```mermaid +sequenceDiagram + participant Workflow + participant Temporal as Temporal Service + participant API as Rate-Limited API + + Workflow->>Temporal: Schedule activity (RetryPolicy: initialInterval=1s) + Temporal->>+API: Attempt 1 + API-->>-Temporal: HTTP 429 — Retry-After: 60s + Note over Temporal: Activity throws ApplicationFailure(nextRetryDelay=60s) + Note over Temporal: Override: wait 60s (ignoring RetryPolicy interval) + Temporal->>+API: Attempt 2 + API-->>-Temporal: Success + Temporal-->>Workflow: Result +``` + +The following describes each step: + +1. The Activity calls the API. It receives an HTTP 429 with a `Retry-After: 60` header. +2. The Activity extracts the retry delay from the response and throws `ApplicationFailure` with `nextRetryDelay=60s`. +3. Temporal ignores the RetryPolicy's calculated interval for this retry and waits exactly 60 seconds instead. +4. The next attempt succeeds and Temporal delivers the result to the Workflow. + +## Implementation + + +### Overriding the retry delay from the response + +Extract the wait duration from the error or response and pass it to `ApplicationFailure`. +The RetryPolicy's `MaximumAttempts` and `ScheduleToCloseTimeout` still apply — only the interval for the next retry is overridden. 
+ + + + +```java +// RateLimitedActivityImpl.java +import io.temporal.activity.Activity; +import io.temporal.failure.ApplicationFailure; +import java.time.Duration; + +public class RateLimitedActivityImpl implements RateLimitedActivity { + @Override + public String callApi(String endpoint) { + ApiResponse response = httpClient.get(endpoint); + + if (response.getStatusCode() == 429) { + int retryAfterSeconds = response.getHeaderInt("Retry-After", 0); + if (retryAfterSeconds > 0) { + throw ApplicationFailure.newFailureWithCauseAndDelay( + "Rate limited — retrying after " + retryAfterSeconds + "s", + "RateLimitError", + null, + Duration.ofSeconds(retryAfterSeconds) + ); + } + throw ApplicationFailure.newFailure("Rate limited — retrying per RetryPolicy", "RateLimitError"); + } + + return response.getBody(); + } +} +``` + + + + +```typescript +// activities.ts +import { ApplicationFailure } from '@temporalio/activity'; + +export async function callApi(endpoint: string): Promise<string> { + const response = await fetch(endpoint); + + if (response.status === 429) { + const retryAfterHeader = response.headers.get('Retry-After'); + // Retry-After may also be an HTTP-date; only use it when it parses to a positive number of seconds + const parsedSeconds = retryAfterHeader != null ? Number.parseInt(retryAfterHeader, 10) : Number.NaN; + const retryAfterSeconds = Number.isFinite(parsedSeconds) && parsedSeconds > 0 ? parsedSeconds : undefined; + throw ApplicationFailure.create({ + message: retryAfterSeconds != null + ? `Rate limited — retrying after ${retryAfterSeconds}s` + : 'Rate limited — retrying per RetryPolicy', + type: 'RateLimitError', + // Only override the interval when the header carries a usable delay; fall back to RetryPolicy otherwise + nextRetryDelay: retryAfterSeconds != null ? `${retryAfterSeconds}s` : undefined, + }); + } + + return response.text(); +} +``` + + + + +### Attempt-proportional delay + +You can also set the delay dynamically based on the attempt number — for example, to implement a custom backoff that differs from exponential, or to add a known base delay on top of the standard backoff. 
+ + + + +```java +// BackoffActivityImpl.java +import io.temporal.activity.Activity; +import io.temporal.failure.ApplicationFailure; +import java.time.Duration; + +public class BackoffActivityImpl implements BackoffActivity { + @Override + public String process(String input) { + int attempt = Activity.getExecutionContext().getInfo().getAttempt(); + + try { + return downstreamService.call(input); + } catch (ServiceUnavailableException e) { + // Custom delay: 3 seconds × attempt number (3s, 6s, 9s, …) + throw ApplicationFailure.newFailureWithCauseAndDelay( + "Service unavailable on attempt " + attempt, + "ServiceUnavailable", + e, + Duration.ofSeconds(3L * attempt) + ); + } + } +} +``` + + + + +```typescript +// activities.ts +import { ApplicationFailure, activityInfo } from '@temporalio/activity'; + +export async function process(input: string): Promise<string> { + const { attempt } = activityInfo(); + + try { + return await downstreamService.call(input); + } catch (e) { + // Custom delay: 3 seconds × attempt number (3s, 6s, 9s, …) + throw ApplicationFailure.create({ + message: `Service unavailable on attempt ${attempt}`, + type: 'ServiceUnavailable', + cause: e instanceof Error ? e : undefined, + nextRetryDelay: `${3 * attempt}s`, + }); + } +} +``` + + + + +### Workflow configuration + +The Workflow sets a normal `RetryPolicy`. +The `nextRetryDelay` set in the Activity overrides the interval only for the retry following that specific failure — subsequent attempts fall back to the RetryPolicy schedule if `nextRetryDelay` is not set again. 
+ + + + +```java +// ApiWorkflowImpl.java +public class ApiWorkflowImpl implements ApiWorkflow { + private final RateLimitedActivity activities = Workflow.newActivityStub( + RateLimitedActivity.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(10)) + .setRetryOptions(RetryOptions.newBuilder() + .setInitialInterval(Duration.ofSeconds(1)) + .setBackoffCoefficient(2.0) + .setMaximumAttempts(10) + .build()) + .build() + ); + + @Override + public String run(String endpoint) { + return activities.callApi(endpoint); + } +} +``` + + + + +```typescript +// workflows.ts +import * as wf from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { callApi } = wf.proxyActivities<typeof activities>({ + startToCloseTimeout: '10s', + retry: { + initialInterval: '1s', + backoffCoefficient: 2, + maximumAttempts: 10, + }, +}); + +export async function apiWorkflow(endpoint: string): Promise<string> { + return await callApi(endpoint); +} +``` + + + + +## Best practices + +- **Use the error's own delay information when available.** HTTP 429 `Retry-After`, database lock timeouts, and API-provided backoff hints are more accurate than any value you could configure statically. +- **Fall back to the RetryPolicy for unknown errors.** Only set `nextRetryDelay` for error types where you have reliable delay information. Let the RetryPolicy handle all other failures normally. +- **Still set a meaningful RetryPolicy.** `nextRetryDelay` overrides the interval for a single retry; the RetryPolicy still governs maximum attempts and the intervals for attempts where `nextRetryDelay` is not set. Also ensure `scheduleToCloseTimeout` is long enough to accommodate the maximum possible `nextRetryDelay` value — a tight budget can cause the Activity to expire before the delayed retry executes. 
+- **Surface the delay in the failure message.** Include the delay value and its source in the `ApplicationFailure` message (for example, `"Rate limited — retrying after 60s (Retry-After header)"`) so it appears directly in the Activity failure details in the Workflow history. This makes it clear why the Activity waited an unusual amount of time without requiring separate log correlation. + +## Common pitfalls + +- **Assuming `nextRetryDelay` persists across all retries.** It only applies to the immediate next retry. If the following attempt also fails without setting `nextRetryDelay`, the RetryPolicy interval resumes. +- **Setting `nextRetryDelay` longer than `ScheduleToCloseTimeout`.** If the override delay exceeds the remaining `ScheduleToCloseTimeout` budget, the retry will never execute — Temporal will expire the Activity before the delay elapses. + + +## Related patterns + +- [Fixed Count of Retries](/design-patterns/fixed-count-retries): Cap total attempts to prevent unbounded retry cost. +- [Fixed Wall-Time Retries](/design-patterns/fixed-wall-time-retries): Enforce a total elapsed time budget across all attempts. +- [Fast/Slow Retries](/design-patterns/fast-slow-retries): Shift from a fast retry cadence to a slow one after initial attempts are exhausted. +- [Error Handling & Retry Patterns](/design-patterns/error-handling-patterns): Overview and decision tree for all retry patterns. + +## References + +- [Per-error next Retry delay](https://docs.temporal.io/encyclopedia/retry-policies#per-error-next-retry-delay) diff --git a/docs/design-patterns/delayed-start.mdx b/docs/design-patterns/delayed-start.mdx new file mode 100644 index 0000000000..6840ecabba --- /dev/null +++ b/docs/design-patterns/delayed-start.mdx @@ -0,0 +1,455 @@ +--- +id: delayed-start +title: "Delayed Start Pattern" +sidebar_label: "Delayed Start" +description: "Creates Workflows immediately but defers execution until a specified delay expires. 
Fits one-time scheduled operations and grace periods." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +The Delayed Start pattern enables Workflows to be created immediately but begin execution after a specified delay. +The Workflow execution is registered in Temporal right away, but the first Workflow Task is scheduled to run only after the delay period expires, making it suitable for scheduled operations, grace periods, and deferred processing. + +## Problem + +In business processes, you often need Workflows that start execution at a future time, are created immediately for tracking but execute later, avoid external scheduling systems or cron jobs for one-time delays, and maintain Workflow identity and queryability before execution begins. + +Without delayed start, you must use external schedulers to trigger Workflow creation later, start Workflows immediately and sleep as the first operation (which wastes resources), implement complex queueing systems for deferred execution, or use Temporal Schedules for one-time delays (which is more than you need). + +## Solution + +The Delayed Start uses a start delay option in WorkflowOptions to defer the first Workflow Task. +The Workflow execution is created immediately with a `firstWorkflowTaskBackoff` set to the delay duration, but no Workflow code runs until the delay expires. + +```mermaid +sequenceDiagram + participant Client + participant Temporal + participant Workflow + + Client->>Temporal: Start with setStartDelay(30s) + Temporal->>Temporal: Create execution + Note over Temporal: Execution visible
but not running + Temporal-->>Client: Workflow ID + + Note over Temporal: Delay period (30s)... + + opt During delay + Client->>Temporal: Signal-With-Start + Note over Temporal: Bypasses remaining delay + end + + Note over Temporal: Delay expires (if not bypassed) + Temporal->>+Workflow: Schedule first task + Workflow->>Workflow: Execute + Workflow-->>-Temporal: Complete +``` + +The following describes each step in the diagram: + +1. The client starts the Workflow with a 30-second delay. Temporal creates the execution immediately. +2. The execution is visible and queryable, but no Workflow code runs during the delay. +3. If the client sends a Signal-With-Start or Update-With-Start during the delay, the remaining delay is bypassed and a Workflow Task is dispatched immediately. Regular Signals do not interrupt the delay. +4. After the delay expires, Temporal schedules the first Workflow Task and the Workflow begins execution. + +The following example creates a Workflow with a 30-second start delay: + + + + +```python +# client.py +from datetime import timedelta + +handle = await client.start_workflow( + DelayedStartWorkflow.run, + id=WORKFLOW_ID, + task_queue=TASK_QUEUE, + start_delay=timedelta(seconds=30), +) +# Created now, executes in 30 seconds +``` + + + + +```go +// starter/main.go +workflowOptions := client.StartWorkflowOptions{ + ID: WorkflowID, + TaskQueue: TaskQueue, + StartDelay: 30 * time.Second, +} + +we, err := c.ExecuteWorkflow(context.Background(), workflowOptions, DelayedStartWorkflow) +// Created now, executes in 30 seconds +``` + + + + +```java +// Client.java +DelayedStartWorkflow workflow = client.newWorkflowStub( + DelayedStartWorkflow.class, + WorkflowOptions.newBuilder() + .setWorkflowId(WORKFLOW_ID) + .setTaskQueue(TASK_QUEUE) + .setStartDelay(Duration.ofSeconds(30)) + .build()); + +workflow.start(); // Created now, executes in 30 seconds +``` + + + + +```typescript +// client.ts +const handle = await 
client.workflow.start(delayedStartWorkflow, { + workflowId: WORKFLOW_ID, + taskQueue: TASK_QUEUE, + startDelay: '30 seconds', +}); +// Created now, executes in 30 seconds +``` + + + + +The start delay option sets the `firstWorkflowTaskBackoff` on the execution. +The Workflow is created and visible in the UI immediately, but the Worker does not receive a Task until the delay expires. + +## Implementation + +### Basic delayed notification + +The following implementation sends a notification after a one-hour delay. +The Workflow code runs only after the delay expires: + + + + +```python +# workflows.py +from temporalio import workflow + +@workflow.defn +class NotificationWorkflow: + @workflow.run + async def run(self, message: str) -> None: + workflow.logger.info(f"Sending notification: {message}") + +# client.py +from datetime import timedelta + +handle = await client.start_workflow( + NotificationWorkflow.run, + "Your trial expires soon", + task_queue=TASK_QUEUE, + start_delay=timedelta(hours=1), +) +``` + + + + +```go +// workflow.go +func NotificationWorkflow(ctx workflow.Context, message string) error { + logger := workflow.GetLogger(ctx) + logger.Info("Sending notification: " + message) + return nil +} + +// starter/main.go +workflowOptions := client.StartWorkflowOptions{ + TaskQueue: TaskQueue, + StartDelay: 1 * time.Hour, +} + +we, err := c.ExecuteWorkflow( + context.Background(), workflowOptions, NotificationWorkflow, "Your trial expires soon", +) +``` + + + + +```java +// NotificationWorkflowImpl.java +@WorkflowInterface +public interface NotificationWorkflow { + @WorkflowMethod + void sendNotification(String message); +} + +public class NotificationWorkflowImpl implements NotificationWorkflow { + @Override + public void sendNotification(String message) { + Workflow.getLogger(NotificationWorkflowImpl.class) + .info("Sending notification: " + message); + } +} + +// Client.java +NotificationWorkflow workflow = client.newWorkflowStub( + 
NotificationWorkflow.class, + WorkflowOptions.newBuilder() + .setTaskQueue(TASK_QUEUE) + .setStartDelay(Duration.ofHours(1)) + .build()); + +// Start asynchronously; calling workflow.sendNotification(...) directly would block until the Workflow completes +WorkflowClient.start(workflow::sendNotification, "Your trial expires soon"); +``` + + + + +```typescript +// workflows.ts +import * as wf from '@temporalio/workflow'; + +export async function notificationWorkflow(message: string): Promise<void> { + wf.log.info(`Sending notification: ${message}`); +} + +// client.ts +const handle = await client.workflow.start(notificationWorkflow, { + args: ['Your trial expires soon'], + workflowId: WORKFLOW_ID, + taskQueue: TASK_QUEUE, + startDelay: '1 hour', +}); +``` + + + + +The Workflow is created immediately, but the notification logic does not execute until one hour later. + +### Cancellable delayed execution + +The following implementation adds Signal handlers for cancellation and a Query for status. +A `cancel` Signal sent during the delay is buffered and checked when the Workflow starts executing; the status Query can be answered only after execution begins: + + + + +```python +# workflows.py +from temporalio import workflow + +@workflow.defn +class DelayedOrderWorkflow: + def __init__(self) -> None: + self._cancelled = False + self._status = "SCHEDULED" + + @workflow.run + async def run(self, order_id: str) -> None: + if self._cancelled: + self._status = "CANCELLED" + return + + self._status = "PROCESSING" + # Process order logic + self._status = "COMPLETED" + + @workflow.signal + async def cancel(self) -> None: + self._cancelled = True + + @workflow.query + def get_status(self) -> str: + return self._status +``` + + + + +```go +// workflow.go +func DelayedOrderWorkflow(ctx workflow.Context, orderID string) error { + logger := workflow.GetLogger(ctx) + cancelled := false + status := "SCHEDULED" + + // Register Signal handler for cancellation + cancelCh := workflow.GetSignalChannel(ctx, "cancel") + // Drain any pending signals without blocking + for { + var signal interface{} + ok := cancelCh.ReceiveAsync(&signal) + if !ok { + break + } + cancelled = true + } + + // Register Query handler for status + err := 
workflow.SetQueryHandler(ctx, "getStatus", func() (string, error) { + return status, nil + }) + if err != nil { + return err + } + + if cancelled { + status = "CANCELLED" + logger.Info("Order cancelled before processing", "orderId", orderID) + return nil + } + + status = "PROCESSING" + // Process order logic + status = "COMPLETED" + return nil +} +``` + + + + +```java +// DelayedOrderWorkflowImpl.java +@WorkflowInterface +public interface DelayedOrderWorkflow { + @WorkflowMethod + void processOrder(String orderId); + + @SignalMethod + void cancel(); + + @QueryMethod + String getStatus(); +} + +public class DelayedOrderWorkflowImpl implements DelayedOrderWorkflow { + private boolean cancelled = false; + private String status = "SCHEDULED"; + + @Override + public void processOrder(String orderId) { + if (cancelled) { + status = "CANCELLED"; + return; + } + + status = "PROCESSING"; + // Process order logic + status = "COMPLETED"; + } + + @Override + public void cancel() { + cancelled = true; + } + + @Override + public String getStatus() { + return status; + } +} +``` + + + + +```typescript +// workflows.ts +import * as wf from '@temporalio/workflow'; + +const cancelSignal = wf.defineSignal('cancel'); +const getStatusQuery = wf.defineQuery<string>('getStatus'); + +export async function delayedOrderWorkflow(orderId: string): Promise<void> { + let cancelled = false; + let status = 'SCHEDULED'; + + wf.setHandler(cancelSignal, () => { + cancelled = true; + }); + + wf.setHandler(getStatusQuery, () => status); + + if (cancelled) { + status = 'CANCELLED'; + return; + } + + status = 'PROCESSING'; + // Process order logic + status = 'COMPLETED'; +} +``` + + + + +The `cancel` Signal handler sets a flag that the Workflow checks when it starts executing. +Note that Signal handlers and Query handlers only run after the delay expires and the first Workflow Task is dispatched. +To cancel before execution, use `Signal-With-Start` to bypass the delay, or cancel the Workflow Execution directly. 
+ +## When to use + +The Delayed Start pattern is a good fit for scheduled one-time operations (send a reminder in 24 hours), grace periods before processing (cancel a subscription in 7 days), delayed notifications and alerts, deferred batch processing, and trial expiration Workflows. + +It is not a good fit for recurring Schedules (use Temporal Schedules), immediate execution with internal delays (use Workflow sleep — `Workflow.sleep()` in Java, `wf.sleep()` in TypeScript, `workflow.sleep()` in Python, `workflow.Sleep()` in Go), complex scheduling logic (use Schedules with cron), or sub-second delays (minimal benefit). + +## Benefits and trade-offs + +The Workflow is queryable before execution starts (immediate visibility). +No Worker resources are consumed during the delay. +You can cancel the Workflow Execution before it runs. +A Signal-With-Start or Update-With-Start bypasses the remaining delay. +Regular Signals sent during the delay do not interrupt it. +The API is a single configuration option with no external schedulers needed. +The delay is managed by Temporal, ensuring deterministic behavior. + +The trade-offs to consider are that you cannot dynamically adjust the delay after creation (use the Updatable Timer pattern for that). +The pattern is for one-time delays only — for recurring Schedules, use Temporal Schedules. +Very short delays (milliseconds) provide minimal benefit — the minimum timer duration is 1 second. +The delay is time-based only, not condition-based. +Regular Signals sent during the delay are not delivered until the first Workflow Task fires, so Query and Signal handlers are not available until execution begins. 
+ +## Comparison with alternatives + +| Approach | Immediate visibility | Resource usage | Cancellable | Use case | +| :--- | :--- | :--- | :--- | :--- | +| Delayed Start | Yes | None during delay | Yes | One-time future execution | +| Workflow sleep | Yes | Worker resources | Yes | Internal delays | +| Temporal Schedules | Yes | None | Yes | Recurring Schedules | +| External Scheduler | No | External system | Depends | Complex scheduling | + +## Best practices + +- **Use for one-time delays.** For recurring Schedules, use Temporal Schedules instead. +- **Set Workflow ID.** Always set an explicit Workflow ID for tracking and cancellation. +- **Add Query methods.** Expose status via Queries to check state during the delay. +- **Enable cancellation.** Add Signal handlers to cancel before execution. +- **Validate delay duration.** Ensure the delay is reasonable (not too short or too long). +- **Monitor backoff.** Check `firstWorkflowTaskBackoff` in history for verification. +- **Consider time zones.** Use absolute timestamps if the delay depends on a specific time. +- **Document behavior.** Clearly indicate that the Workflow does not execute immediately. + +## Common pitfalls + +- **Using Signals during the delay.** Regular Signals do not interrupt the Start Delay. Only Signal-With-Start or Update-With-Start bypass the delay. Signals sent to a delayed Workflow are buffered but the Workflow code has not started, so there is no handler to process them until the delay expires. +- **Querying before the Workflow starts.** Queries have no state to return during the delay because no Workflow code has executed yet. Clients may receive errors or empty results. +- **Setting delays shorter than 1 second.** The minimum timer resolution is 1 second. Sub-second delays are not supported. +- **Forgetting that the Workflow ID is reserved.** A delayed Workflow reserves its Workflow ID immediately. Starting another Workflow with the same ID will fail depending on the ID reuse policy. 
+ +## Related patterns + +- **Temporal Schedules**: For recurring Workflow execution. +- **[Updatable Timer](/design-patterns/updatable-timer)**: For dynamically adjustable delays within Workflows. +- **[Signal with Start](/design-patterns/signal-with-start)**: Interacting with Workflows before execution. + +## Sample code + +- [Java Sample](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/hello/HelloDelayedStart.java) — Delayed start with `setStartDelay()`. +- [TypeScript Sample](https://github.com/temporalio/samples-typescript/tree/main/start-delay) — Delayed start with `startDelay` option. +- [Python Sample](https://github.com/temporalio/samples-python/tree/main/start_delay) — Delayed start with `start_delay` parameter. +- [Go Sample](https://github.com/temporalio/samples-go/tree/main/start-delay) — Delayed start with `StartDelay` option. diff --git a/docs/design-patterns/distributed-transaction-patterns.mdx b/docs/design-patterns/distributed-transaction-patterns.mdx new file mode 100644 index 0000000000..fbf7622389 --- /dev/null +++ b/docs/design-patterns/distributed-transaction-patterns.mdx @@ -0,0 +1,25 @@ +--- +id: distributed-transaction-patterns +title: Distributed transaction patterns +sidebar_label: Overview +description: Patterns for managing distributed transactions with compensating actions and coordination across multiple services. +--- + +import PatternCards from '@site/src/components/PatternCards'; + +Patterns for managing distributed transactions with compensating actions and coordination across multiple services. 
+ + diff --git a/docs/design-patterns/downstream-rate-limiting.mdx b/docs/design-patterns/downstream-rate-limiting.mdx new file mode 100644 index 0000000000..30e27ad2b8 --- /dev/null +++ b/docs/design-patterns/downstream-rate-limiting.mdx @@ -0,0 +1,295 @@ +--- +id: downstream-rate-limiting +title: "Downstream Rate Limiting" +sidebar_label: "Downstream Rate Limiting" +description: "Caps Activity execution rate against a downstream service by routing throttled Activities to a dedicated Task Queue." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +:::info[TLDR] +**Rate limit outbound requests to a downstream service.** Use this when concurrent Workflows would otherwise exceed the request rate that a third-party API, payment processor, or other external system can accept. +::: + +## Overview + +The Downstream Rate Limiting pattern, also known as Task Queue rate limiting, caps how many Activities execute per second against a downstream service. +You place throttled Activities on a dedicated Task Queue backed by Workers configured with `MaxTaskQueueActivitiesPerSecond`. +The Temporal matching service enforces this limit before dispatching tasks, so the downstream service receives a controlled request rate regardless of how many Worker instances or Workflow executions are running concurrently. + + +## Problem + +Many downstream systems — LLM providers, payment processors, third-party REST APIs — enforce requests-per-second limits. Some systems cannot handle more than a defined level of requests per second. +When many Temporal Workflows schedule Activities concurrently, the resulting burst can saturate those limits, causing request failures, cascading retries, and increased latency for all callers. + +Without centralized throttling, each Activity implementation must manage backpressure independently, which scatters policy across the codebase and provides no enforcement at the Temporal scheduling layer. 
+ +## Solution + +You assign rate-limited Activities to a dedicated Task Queue and Worker set and configure the Workers on that queue with a throughput cap. +Because the limit applies to the Task Queue, it is enforced before any Worker executes an Activity, and it holds across all Worker replicas without coordination. + +The Workflow routes the throttled Activity to the dedicated queue by specifying an explicit `task_queue` override in the Activity options. + +```mermaid +flowchart LR + subgraph Workflows + WA[Workflow A] + WB[Workflow B] + WC[Workflow C] + end + + subgraph Temporal Server + TQ["rate-limited-tq\n─────────────\ntask 1\ntask 2\ntask 3\ntask 4\n…"] + end + + subgraph Workers + WK1["Worker 1\nMaxTaskQueueActivitiesPerSecond\n= 5 RPS"] + WK2["Worker 2\nMaxTaskQueueActivitiesPerSecond\n= 5 RPS"] + end + + DS["Downstream API\n(rate limit: 5 RPS)"] + + WA -->|"schedule callApi\ntask_queue=rate-limited-tq"| TQ + WB -->|"schedule callApi\ntask_queue=rate-limited-tq"| TQ + WC -->|"schedule callApi\ntask_queue=rate-limited-tq"| TQ + TQ -->|"dispatch ≤2.5/sec"| WK1 + TQ -->|"dispatch ≤2.5/sec"| WK2 + WK1 -->|"≤5 req/sec combined"| DS + WK2 --> DS +``` + +The following describes each step in the diagram: + +1. Any number of Workflows schedule `callApi` Activities to the dedicated `rate-limited-tq` Task Queue via an explicit `task_queue` override in their Activity options. +2. The Temporal server holds tasks in `rate-limited-tq`. The queue depth grows if submission rate exceeds dispatch capacity. +3. Two Workers poll the queue. Each is configured with `MaxTaskQueueActivitiesPerSecond = 5`, so together they dispatch at most 5 Activity tasks per second — matching the downstream API's rate limit. +4. The downstream API receives a steady, controlled request rate regardless of how many Workflows are running concurrently. 
+ +## Implementation + +### Worker configured with a throughput cap + + + + +```python +# worker.py +# This is a dedicated worker for rate-limited activities. +# You will also need a separate worker registered on your workflow task queue. +from temporalio.worker import Worker +from activities import call_api + +async def run_worker(client): + worker = Worker( + client, + task_queue="rate-limited-tq", + activities=[call_api], + max_task_queue_activities_per_second=5.0, + ) + await worker.run() +``` + + + + +```go +// main.go +// This is a dedicated worker for rate-limited activities. +// You will also need a separate worker registered on your workflow task queue. +w := worker.New(c, "rate-limited-tq", worker.Options{ + TaskQueueActivitiesPerSecond: 5.0, +}) +w.RegisterActivity(CallApi) +if err := w.Run(worker.InterruptCh()); err != nil { + log.Fatalf("worker error: %v", err) +} +``` + + + + +```java +// WorkerSetup.java +// This is a dedicated worker for rate-limited activities. +// You will also need a separate worker registered on your workflow task queue. 
+WorkerOptions rateLimitedOptions = WorkerOptions.newBuilder() + .setMaxTaskQueueActivitiesPerSecond(5.0) + .build(); + +Worker rateLimitedWorker = factory.newWorker("rate-limited-tq", rateLimitedOptions); +rateLimitedWorker.registerActivitiesImplementations(new RateLimitedActivitiesImpl()); +factory.start(); +``` + + + + +### Activity definition + + + + +```python +# activities.py +from temporalio import activity + +@activity.defn +async def call_api(input: str) -> str: + return await downstream_api.call(input) +``` + + + + +```go +// activities.go +func CallApi(ctx context.Context, input string) (string, error) { + return downstreamApi.Call(input) +} +``` + + + + +```java +// RateLimitedActivities.java +@ActivityInterface +public interface RateLimitedActivities { + @ActivityMethod + String callApi(String input); +} + +public class RateLimitedActivitiesImpl implements RateLimitedActivities { + @Override + public String callApi(String input) { + return downstreamApi.call(input); + } +} +``` + + + + +### Workflow routing to the rate-limited queue + + + + +```python +# workflows.py +from datetime import timedelta +from temporalio import workflow +from activities import call_api + +@workflow.defn +class MyWorkflow: + @workflow.run + async def run(self, input: str) -> str: + return await workflow.execute_activity( + call_api, + input, + task_queue="rate-limited-tq", + start_to_close_timeout=timedelta(seconds=30), + ) +``` + + + + +```go +// workflow.go +func MyWorkflow(ctx workflow.Context, input string) (string, error) { + ao := workflow.ActivityOptions{ + TaskQueue: "rate-limited-tq", + StartToCloseTimeout: 30 * time.Second, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + var result string + err := workflow.ExecuteActivity(ctx, CallApi, input).Get(ctx, &result) + return result, err +} +``` + + + + +```java +// MyWorkflowImpl.java +public class MyWorkflowImpl implements MyWorkflow { + private final RateLimitedActivities rateLimitedActivities = + 
Workflow.newActivityStub(RateLimitedActivities.class, + ActivityOptions.newBuilder() + .setTaskQueue("rate-limited-tq") + .setStartToCloseTimeout(Duration.ofSeconds(30)) + .build() + ); + + @Override + public String run(String input) { + return rateLimitedActivities.callApi(input); + } +} +``` + + + + +## When to use + +This pattern is a good fit when your Workflow calls a downstream service with explicit requests-per-second limits, when you need throughput enforcement that holds across many concurrent Workflow instances without per-Activity logic, and when only a subset of Activity types requires throttling and others should run without restriction. + +It is not a good fit when you need concurrency limits rather than throughput limits (see [Priority Task Queues](/design-patterns/priority-task-queues)), when the downstream system has no rate limit and throughput is bounded only by Workflow logic, or when all Activities require the same limit and a single shared queue suffices. + +## Benefits and trade-offs + +Centralizing rate limiting at the Task Queue ensures enforcement even when any number of Workflow instances run in parallel. +Because the Temporal matching service enforces the limit for the whole Task Queue, it holds regardless of how many Worker replicas are running — provided every Worker on the queue is configured with the same `MaxTaskQueueActivitiesPerSecond` value. + +Dedicated Task Queues require operating additional Workers. +If the throughput cap is set too low relative to demand, the queue depth grows and scheduling latency increases. +You must size the Worker pool so that slot availability does not become the bottleneck before the rate limit is reached.
+ +## Comparison with alternatives + +| Approach | Enforcement point | Works across Workers | Runtime adjustable | Complexity | +| :--- | :--- | :--- | :--- | :--- | +| `MaxTaskQueueActivitiesPerSecond` | Temporal matching service (server-side) | Yes | No (requires redeploy) | Low | +| `MaxWorkerActivitiesPerSecond` | Worker SDK poller (worker-side) | No — per-worker only | No (requires redeploy) | Low | +| Concurrency slots (`MaxConcurrentActivityExecutionSize`, `MaxConcurrentWorkflowTaskExecutionSize`, `MaxConcurrentLocalActivityExecutionSize`) | Worker executor | No — per-worker only | No (requires redeploy) | Low | +| Sleep-based throttle in Workflow | Workflow scheduler | No | Via signal | Low | +| Client-side token bucket in Activity | Activity execution | Per-worker only | No | Medium | +| API gateway rate limiting | Network layer | Yes | Yes | High | + +Three distinct layers of worker-side control exist alongside the server-side queue limit. `MaxWorkerActivitiesPerSecond` instructs the SDK to self-throttle its polling — the Worker will not request a new Activity task if doing so would push it over this rate. Because the limit is per-process, multiple Workers on the same queue each apply it independently, so the effective queue throughput is the per-worker cap multiplied by Worker count. By contrast, `MaxTaskQueueActivitiesPerSecond` is a server-side instruction: the Temporal matching service slows dispatch for the entire queue regardless of how many Workers are polling, making it the correct tool for protecting a shared downstream service. + +The concurrency slots (`MaxConcurrentActivityExecutionSize`, `MaxConcurrentWorkflowTaskExecutionSize`, `MaxConcurrentLocalActivityExecutionSize`) are not throughput limits but define the number of execution slots available on a Worker. A Worker will not accept more tasks than it has open slots, so a low slot count acts as an indirect throughput ceiling. 
+ +## Best practices + +- **Use a separate Task Queue for each rate limit.** `MaxTaskQueueActivitiesPerSecond` applies to every Activity on the queue. Mixing rate-limited and unrestricted Activities on the same queue will throttle the unrestricted ones too. +- **Run at least two Worker processes per queue for availability.** A single Worker process is a single point of failure. Configure every Worker on the queue with the same `MaxTaskQueueActivitiesPerSecond` value, equal to the full target RPS: the server enforces the limit across the whole queue, so do not divide it by the Worker count. +- **Monitor queue depth and schedule latency.** Track the `temporal_activity_schedule_to_start_latency` metric on the rate-limited queue; sustained growth signals that demand consistently exceeds the configured cap. You can also query the Task Queue's `ApproximateBacklogCount` via the `DescribeTaskQueue` API — a steadily growing backlog count is a direct indicator that the configured RPS cap is too low for the current submission rate. + +## Common pitfalls + +- **Forgetting to override the task queue in Activity options.** If the Workflow does not explicitly specify `task_queue` in the Activity options, the Activity runs on the Workflow's default queue and bypasses the rate-limited Worker entirely. +- **Setting conflicting `MaxTaskQueueActivitiesPerSecond` limits in Workers.** This value is configured on each Worker and sent to the Task Queue when that Worker polls. If Workers on the same queue use different values, they overwrite each other's setting as they poll, and the effective limit becomes unpredictable. +- **Confusing throughput limits with concurrency limits.** `MaxTaskQueueActivitiesPerSecond` controls starts per second; `MaxConcurrentActivityExecutionSize` controls simultaneous executions. Long-running Activities that hold slots for minutes may exhaust concurrency before the RPS cap applies. +- **Setting the cap far below actual demand.** A cap much lower than the actual submission rate causes the queue to grow without bound. Monitor queue depth and raise the cap when throughput requirements grow, as far as the downstream service allows; adding Workers alone does not raise a queue-wide limit, it only helps when Worker slots are the bottleneck.
+ +## Related patterns + +- **[Priority Task Queues](/design-patterns/priority-task-queues)**: Route work to separate queues by urgency, with different concurrency budgets per tier. +- **[Fairness](/design-patterns/fairness)**: Give each tenant an equal throughput share when multiple tenants share capacity. +- **[Worker-Specific Task Queues](/design-patterns/worker-specific-taskqueue)**: Route Activities to a specific Worker host for resource or data affinity. + +## References + +- **Python** — [`max_task_queue_activities_per_second`](https://python.temporal.io/temporalio.worker.WorkerConfig.html#max_task_queue_activities_per_second) on [`Worker`](https://python.temporal.io/temporalio.worker.Worker.html) +- **Go** — [`TaskQueueActivitiesPerSecond`](https://pkg.go.dev/go.temporal.io/sdk/internal#WorkerOptions) in [`worker.Options`](https://pkg.go.dev/go.temporal.io/sdk/worker#Options) +- **Java** — [`setMaxTaskQueueActivitiesPerSecond`](https://www.javadoc.io/doc/io.temporal/temporal-sdk/latest/io/temporal/worker/WorkerOptions.Builder.html) on `WorkerOptions.Builder` +- **Temporal Community** — [Rate limit configuration and best practices](https://community.temporal.io/t/rate-limit-configuration-and-best-practices/5498) diff --git a/docs/design-patterns/early-return.mdx b/docs/design-patterns/early-return.mdx new file mode 100644 index 0000000000..d7c8c07dac --- /dev/null +++ b/docs/design-patterns/early-return.mdx @@ -0,0 +1,437 @@ +--- +id: early-return +title: "Early Return (Update with Start)" +sidebar_label: "Early Return" +description: "Synchronous initialization with asynchronous completion. Returns results immediately while processing continues in the background." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +The Early Return pattern returns initialization results to the caller immediately while continuing asynchronous processing in the background. 
+ +## Problem + +Clients need immediate feedback on whether an operation can proceed, but the full operation takes significant time to complete. +Blocking the client for the entire operation duration creates a poor user experience and ties up resources. + +## Solution + +You use Update-with-Start to split operations into two phases: a fast synchronous initialization phase that validates and returns results immediately, and a slower asynchronous completion phase that runs in the background. +The Workflow uses a local Activity for quick initialization, returns the initialization result to the caller through an Update handler, and then either completes or cancels the operation depending on whether initialization succeeded. + +```mermaid +sequenceDiagram + participant Client + participant Workflow + participant Activity + + Client->>+Workflow: Update-with-Start + activate Workflow + Workflow->>+Activity: Phase 1: Init (fast) + Activity-->>-Workflow: Result + Workflow-->>Client: Init Result (early return) + deactivate Workflow + + Note over Workflow: Workflow continues executing + Workflow->>+Activity: Phase 2: Complete (slow) + Activity-->>-Workflow: Done + deactivate Workflow +``` + +The following describes each step in the diagram: + +1. The client sends an Update-with-Start request to the Workflow. +2. The Workflow executes a fast initialization Activity (Phase 1) and returns the result to the client immediately. +3. The client receives the initialization result while the Workflow continues executing. +4. The Workflow executes the slower completion Activity (Phase 2) in the background, or a cancellation Activity if initialization failed. + +## Implementation + +The following examples show how each SDK implements this pattern. +The Workflow registers an Update handler that blocks until initialization completes, then returns the result to the caller. +The client receives the initialization result in a single round trip while the Workflow continues processing.
+ + + + + +```python +# workflow.py +from dataclasses import dataclass +from datetime import timedelta + +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities import init_transaction, complete_transaction, cancel_transaction + + +@dataclass +class TransactionRequest: + amount: float + currency: str + + +@dataclass +class Transaction: + id: str + status: str + + +@workflow.defn +class TransactionWorkflow: + def __init__(self) -> None: + self.tx: Transaction | None = None + self.init_done = False + self.init_err: Exception | None = None + + @workflow.run + async def run(self, tx_request: TransactionRequest) -> Transaction | None: + # Phase 1: Fast synchronous initialization (local activity) + try: + self.tx = await workflow.execute_local_activity( + init_transaction, + tx_request, + schedule_to_close_timeout=timedelta(seconds=5), + ) + except Exception as e: + self.init_err = e + finally: + self.init_done = True # Signal update handler + + # Phase 2: Slow asynchronous completion + if self.init_err is not None: + await workflow.execute_activity( + cancel_transaction, + self.tx, + start_to_close_timeout=timedelta(seconds=30), + ) + return None + + await workflow.execute_activity( + complete_transaction, + self.tx, + start_to_close_timeout=timedelta(seconds=30), + ) + return self.tx + + @workflow.update + async def return_init_result(self) -> Transaction: + await workflow.wait_condition(lambda: self.init_done) + if self.init_err is not None: + raise self.init_err + return self.tx + + +# client.py +from temporalio.client import ( + Client, + WithStartWorkflowOperation, + WorkflowUpdateStage, +) + +client = await Client.connect("localhost:7233") + +start_op = WithStartWorkflowOperation( + TransactionWorkflow.run, + tx_request, + id="transaction-123", + task_queue="transactions", + id_conflict_policy=common_pb2.WORKFLOW_ID_CONFLICT_POLICY_FAIL, +) + +update_handle = await client.start_update_with_start_workflow( + 
TransactionWorkflow.return_init_result, + wait_for_stage=WorkflowUpdateStage.COMPLETED, + start_workflow_operation=start_op, +) + +# Get initialization result immediately +tx = await update_handle.result() + +# Use transaction ID immediately while workflow continues +print(f"Transaction initialized: {tx.id}") +``` + + + + +```go +// workflow.go +func Workflow(ctx workflow.Context, txRequest TransactionRequest) (*Transaction, error) { + var tx *Transaction + var initDone bool + var initErr error + + // Register update handler that waits for initialization + workflow.SetUpdateHandler(ctx, UpdateName, + func(ctx workflow.Context) (*Transaction, error) { + workflow.Await(ctx, func() bool { return initDone }) + return tx, initErr + }, + ) + + // Phase 1: Fast synchronous initialization (local activity) + localOpts := workflow.WithLocalActivityOptions(ctx, workflow.LocalActivityOptions{ + ScheduleToCloseTimeout: 5 * time.Second, + }) + initErr = workflow.ExecuteLocalActivity(localOpts, txRequest.Init).Get(ctx, &tx) + initDone = true // Signal update handler + + // Phase 2: Slow asynchronous completion + activityCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Second, + }) + + if initErr != nil { + // Cancel on initialization failure + return nil, workflow.ExecuteActivity(activityCtx, CancelTransaction, tx).Get(ctx, nil) + } + + // Complete on initialization success + return tx, workflow.ExecuteActivity(activityCtx, CompleteTransaction, tx).Get(ctx, nil) +} + +// client.go +startOp := client.NewWithStartWorkflowOperation( + client.StartWorkflowOptions{ + ID: "transaction-123", + TaskQueue: "transactions", + WorkflowIDConflictPolicy: enumspb.WORKFLOW_ID_CONFLICT_POLICY_FAIL, + }, + Workflow, + txRequest, +) + +updateHandle, err := client.UpdateWithStartWorkflow(ctx, + client.UpdateWithStartWorkflowOptions{ + StartWorkflowOperation: startOp, + UpdateOptions: client.UpdateWorkflowOptions{ + UpdateName: UpdateName, + 
WaitForStage: client.WorkflowUpdateStageCompleted, + }, + }, +) + +// Get initialization result immediately +var tx Transaction +err = updateHandle.Get(ctx, &tx) +if err != nil { + return err +} + +// Use transaction ID immediately while workflow continues +fmt.Printf("Transaction initialized: %s\n", tx.ID) +``` + + + + +```java +// TransactionWorkflowImpl.java +public class TransactionWorkflowImpl implements TransactionWorkflow { + private boolean initDone = false; + private Transaction tx; + private Exception initError = null; + + @Override + public TxResult processTransaction(TransactionRequest txRequest) { + this.tx = activities.mintTransactionId(txRequest); + + // Phase 1: Fast synchronous initialization + try { + this.tx = activities.initTransaction(this.tx); + } catch (Exception e) { + initError = e; + } finally { + initDone = true; // Signal update handler + } + + // Phase 2: Slow asynchronous completion + if (initError != null) { + activities.cancelTransaction(this.tx); + return new TxResult("", "Transaction cancelled."); + } else { + activities.completeTransaction(this.tx); + return new TxResult(this.tx.getId(), "Transaction completed successfully."); + } + } + + @Override + public TxResult returnInitResult() { + Workflow.await(() -> initDone); // Wait for initialization + if (initError != null) { + throw Workflow.wrap(initError); + } + return new TxResult(tx.getId(), "Initialization successful"); + } +} + +// Client.java +TransactionWorkflow workflow = client.newWorkflowStub( + TransactionWorkflow.class, + WorkflowOptions.newBuilder() + .setWorkflowId("transaction-123") + .setTaskQueue("transactions") + .setWorkflowIdConflictPolicy( + WorkflowIdConflictPolicy.WORKFLOW_ID_CONFLICT_POLICY_FAIL) + .build()); + +WorkflowUpdateHandle updateHandle = + WorkflowClient.startUpdateWithStart( + workflow::returnInitResult, + UpdateOptions.newBuilder().build(), + new WithStartWorkflowOperation<>(workflow::processTransaction, txRequest)); + +// Get initialization 
result immediately +TxResult result = updateHandle.getResultAsync().get(); + +// Use transaction ID immediately while workflow continues +System.out.println("Transaction initialized: " + result.getId()); +``` + + + + +```typescript +// workflow.ts +import { defineUpdate, setHandler, condition } from '@temporalio/workflow'; +import * as activities from './activities'; + +const { initTransaction, completeTransaction, cancelTransaction } = + proxyLocalActivities({ + scheduleToCloseTimeout: '5s', + }); + +export const returnInitResultUpdate = defineUpdate('returnInitResult'); + +export async function transactionWorkflow(txRequest: TransactionRequest): Promise { + let tx: Transaction | undefined; + let initDone = false; + let initError: Error | undefined; + + // Register update handler that waits for initialization + setHandler(returnInitResultUpdate, async () => { + await condition(() => initDone); + if (initError) { + throw initError; + } + return tx!; + }); + + // Phase 1: Fast synchronous initialization (local activity) + try { + tx = await initTransaction(txRequest); + } catch (err) { + initError = err as Error; + } finally { + initDone = true; // Signal update handler + } + + // Phase 2: Slow asynchronous completion + if (initError) { + await cancelTransaction(tx!); + throw initError; + } + + await completeTransaction(tx); + return tx; +} + +// client.ts +const startWorkflowOperation = new WithStartWorkflowOperation( + transactionWorkflow, + { + workflowId: 'transaction-123', + args: [txRequest], + taskQueue: 'transactions', + workflowIdConflictPolicy: 'FAIL', + }, +); + +const tx = await client.workflow.executeUpdateWithStart( + returnInitResultUpdate, + { startWorkflowOperation }, +); + +const wfHandle = await startWorkflowOperation.workflowHandle(); + +// Use transaction ID immediately while workflow continues +console.log(`Transaction initialized: ${tx.id}`); + +// Optionally wait for the workflow to complete +const finalResult = await wfHandle.result(); +``` 
+ + + + +The key points across all SDKs are: + +- **Update-with-Start** is a single API call that starts the Workflow and returns the initialization result. +- The Workflow uses an Update handler with a condition/await to block until initialization completes. +- The client receives the result immediately while the Workflow continues executing in the background. +- A `WorkflowIdConflictPolicy` must be specified. For early return, use `FAIL` to assert a new Workflow is created. +- Update-with-Start is **not atomic**. If the Update cannot be delivered (for example, no Worker is available), the Workflow Execution will still start. The SDKs will retry the Update request, but there is no guarantee the Update will succeed. + +## When to use + +The Early Return pattern is a good fit when clients need immediate feedback but operations take time to complete, validation or initialization can be done quickly (under 5 seconds), the operation can be safely cancelled if initialization fails, and the initialization result determines whether to proceed or abort. +Common use cases include e-commerce payment processing (immediate authorization while settlement runs in the background), user onboarding and KYC verification (quick user ID return while background checks continue), resource provisioning (fast validation results while infrastructure is set up), document processing (immediate receipt confirmation while OCR and content analysis continue), and order processing (fast inventory check while fulfillment runs in the background). + +It is not a good fit for fully automated processes that require no intermediate feedback, operations that cannot be split into fast and slow phases, or fire-and-forget operations where no immediate response is needed (use Signals). + +## Benefits and trade-offs + +The Early Return pattern provides immediate client feedback via Update-with-Start in a single round trip. +Clients do not wait for full operation completion. 
+Local Activities avoid extra server roundtrips during initialization, and there is a clear separation between validation and execution phases. +Automatic cancellation handling runs on initialization failure. + +The trade-offs to consider are that you must tune timeouts carefully for local Activities. +There is a concurrent Update limit (10 per Workflow) that can bottleneck high-throughput scenarios requiring multiple simultaneous Updates. +Clients must handle asynchronous completion separately, and initialization must complete within a single Workflow Task. +The pattern is limited to operations that you can split into fast and slow phases. + +## Comparison with alternatives + +| Approach | Immediate response | Consistency | Complexity | Use case | +| :--- | :--- | :--- | :--- | :--- | +| Early Return (Update-with-Start) | Yes (typed) | Strong | Medium | Synchronous init + async completion | +| Signal + Query polling | Yes (eventual) | Eventual | High | Fire-and-forget with status checks | +| Child Workflow split | Yes (Workflow ID) | Strong | High | Separate init and completion Workflows | +| Blocking until completion | Yes (final) | Strong | Low | Short operations only | + +## Best practices + +- **Set WorkflowIdConflictPolicy to FAIL.** For early return, use `FAIL` to assert a new Workflow is created per request. Use `USE_EXISTING` only for lazy initialization patterns. +- **Use Workflow.await in the Update handler.** Keep the Update handler lightweight — block on a condition flag (`workflow.Await` in Go, `Workflow.await` in Java, `condition` in TypeScript, `workflow.wait_condition` in Python) and let the main Workflow method do the real work. +- **Use local Activities for initialization.** Local Activities avoid extra server roundtrips, keeping the synchronous phase fast (under 5 seconds). +- **Handle Update-with-Start non-atomicity.** Update-with-Start is not atomic. The Workflow may start even if the Update fails. 
Ensure Workers are running and handle the case where the Update is not delivered. +- **Set a timeout on the Update result.** Use a timeout when waiting for the Update result to avoid blocking the client indefinitely if the Worker is unavailable. +- **Be aware of the concurrent Update limit.** The default `maxInFlightUpdates` is 10 per Workflow. If you expect high concurrency, design accordingly or use separate Workflows. +- **Provide a unique Update ID.** Use a unique Update ID for idempotency so retried requests attach to the same Update rather than creating duplicates. +- **Avoid Workflow timeouts.** Do not set Workflow Execution timeouts when using early return, as the background phase may take longer than expected. + +## Common pitfalls + +- **Assuming Update-with-Start is atomic.** Unlike Signal-with-Start, Update-with-Start is not atomic. The Workflow may start even if the Update fails (for example, if no Worker is available). Handle this by checking Workflow state after the call. +- **Missing WorkflowIdConflictPolicy.** Update-with-Start requires a `WorkflowIdConflictPolicy`. Omitting it causes an error. Use `FAIL` for early return (one Workflow per request) or `USE_EXISTING` for lazy initialization. +- **Blocking too long in the Update handler.** The Update handler should return quickly. Perform long-running work in the main Workflow method and use `Workflow.await` in the Update handler to wait for a result. +- **Swallowing the ContinueAsNew exception.** In TypeScript, `continueAsNew` throws a special exception. Catching it in a try-catch without re-throwing (or returning in a `finally` block) silently prevents Continue-As-New. + +## Related patterns + +- **[Saga Pattern](/design-patterns/saga-pattern)**: You can combine this with the Saga pattern to add compensation for failed completions. +- **[Signal with Start](/design-patterns/signal-with-start)**: For fire-and-forget operations that do not need an immediate response. 
+- **[Request-Response via Updates](/design-patterns/request-response-via-updates)**: For synchronous state modifications on already-running Workflows. + +## Sample code + +- [Python Sample](https://github.com/temporalio/samples-python/tree/main/early_return) — Early return with Update-with-Start. +- [Go Sample](https://github.com/temporalio/samples-go/tree/main/early-return) — Early return with Update-with-Start. +- [TypeScript Sample](https://github.com/temporalio/samples-typescript/tree/main/early-return) — Early return with local Activities. +- [Java Sample](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/earlyreturn) — Early return with Update handler. diff --git a/docs/design-patterns/entity-lifecycle-patterns.mdx b/docs/design-patterns/entity-lifecycle-patterns.mdx new file mode 100644 index 0000000000..c1bb4206ca --- /dev/null +++ b/docs/design-patterns/entity-lifecycle-patterns.mdx @@ -0,0 +1,31 @@ +--- +id: entity-lifecycle-patterns +title: Entity & lifecycle patterns +sidebar_label: Overview +description: Patterns for modeling long-lived stateful entities and managing Workflow history growth over time. +--- + +import PatternCards from '@site/src/components/PatternCards'; + +Patterns for modeling long-lived stateful entities and managing Workflow history growth over time. + + diff --git a/docs/design-patterns/entity-workflow.mdx b/docs/design-patterns/entity-workflow.mdx new file mode 100644 index 0000000000..29127d2f0f --- /dev/null +++ b/docs/design-patterns/entity-workflow.mdx @@ -0,0 +1,508 @@ +--- +id: entity-workflow +title: "Entity Workflow Pattern" +sidebar_label: "Entity Workflow" +description: "Models long-lived business entities as individual Workflows that persist for the entity's entire lifetime, handling all state transitions through Signals and Updates." 
+--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +The Entity Workflow pattern models long-lived business entities (users, accounts, devices, orders) as individual Workflows that persist for the entity's entire lifetime — potentially months or years. +Each entity gets its own Workflow instance identified by the entity ID, handling all state transitions and operations through Signals and Updates. + +## Problem + +Many business domains have entities that exist for extended periods, undergo multiple state transitions over their lifetime, need to maintain consistent state across operations, require audit trails of all changes, and must handle concurrent operations safely. + +Traditional approaches struggle with these requirements: + +- **Database-centric**: Complex locking, race conditions, scattered business logic. +- **Event Sourcing**: Requires rebuilding state from events, complex infrastructure. +- **Stateless Services**: No built-in consistency, must coordinate state externally. +- **Short-lived Workflows**: Do not model the full entity lifecycle. + +## Solution + +You create one Workflow per entity, using the entity ID as the Workflow ID. +The Workflow runs for the entity's entire lifetime, maintaining state in Workflow variables and handling operations via Signals and Updates. +You use Continue-As-New periodically to prevent unbounded history growth. 
+ +```mermaid +sequenceDiagram + participant Client + participant UserWorkflow + participant NotificationWorkflow + participant Activities + + Client->>UserWorkflow: Start(userId="user-123") + activate UserWorkflow + Note over UserWorkflow: State: ACTIVE + + Client->>UserWorkflow: Update: updateProfile(data) + UserWorkflow->>Activities: validateProfile(data) + Activities-->>UserWorkflow: valid + UserWorkflow->>Activities: updateDatabase(userId, data) + Activities-->>UserWorkflow: success + Note over UserWorkflow: Profile updated + UserWorkflow-->>Client: Success + + Client->>UserWorkflow: Signal: suspend() + Note over UserWorkflow: State: SUSPENDED + UserWorkflow->>NotificationWorkflow: Start child workflow + activate NotificationWorkflow + NotificationWorkflow->>Activities: sendEmail(userId, "suspended") + deactivate NotificationWorkflow + + Client->>UserWorkflow: Update: reactivate() + Note over UserWorkflow: State: ACTIVE + UserWorkflow-->>Client: Success + + Note over UserWorkflow: After 1000 operations... + UserWorkflow->>UserWorkflow: Continue-As-New + Note over UserWorkflow: Fresh history, same state + + Client->>UserWorkflow: Signal: delete() + Note over UserWorkflow: State: DELETED + UserWorkflow-->>UserWorkflow: Complete + deactivate UserWorkflow +``` + +The following describes each step in the diagram: + +1. The client starts the Workflow with a user ID. The Workflow initializes in the ACTIVE state. +2. The client sends an Update to modify the profile. The Workflow validates the data via an Activity, persists the change, and returns success. +3. The client sends a Signal to suspend the account. The Workflow transitions to SUSPENDED and starts a Child Workflow to send a notification email. +4. The client sends an Update to reactivate the account. The Workflow transitions back to ACTIVE. +5. After 1000 operations, the Workflow calls Continue-As-New to reset its history while preserving state. +6. The client sends a Signal to delete the account. 
The Workflow transitions to DELETED and completes. + +## Implementation + +The following examples show how each SDK implements the Entity Workflow pattern. +Each implementation defines Update handlers for synchronous operations, Signal handlers for asynchronous events, and Query handlers for state inspection. + + + + +```python +# workflows.py +from dataclasses import dataclass +from datetime import datetime, timedelta +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities import validate_profile + +@dataclass +class UserState: + status: str = "ACTIVE" + profile: ProfileData | None = None + pending_email: str | None = None + created_at: datetime | None = None + updated_at: datetime | None = None + +@workflow.defn +class UserAccountWorkflow: + def __init__(self) -> None: + self.state = UserState(created_at=datetime.utcnow()) + self.deleted = False + self.operation_count = 0 + + @workflow.run + async def run(self, user_id: str) -> None: + # Block until deleted or Continue-As-New is suggested + await workflow.wait_condition( + lambda: self.deleted or workflow.info().is_continue_as_new_suggested() + ) + + if not self.deleted and workflow.info().is_continue_as_new_suggested(): + await workflow.wait_condition(workflow.all_handlers_finished) + workflow.continue_as_new(user_id) + + self.state.status = "DELETED" + + @workflow.update + async def update_profile(self, data: ProfileData) -> None: + if self.deleted: + raise ValueError("User account is deleted") + + await workflow.execute_activity( + validate_profile, data, + start_to_close_timeout=timedelta(seconds=30), + ) + + self.state.profile = data + self.state.updated_at = datetime.utcnow() + self.operation_count += 1 + + @workflow.update + async def suspend(self) -> None: + if not self.deleted and self.state.status != "SUSPENDED": + self.state.status = "SUSPENDED" + self.state.updated_at = datetime.utcnow() + self.operation_count += 1 + + @workflow.update + async def 
reactivate(self) -> None: + if not self.deleted and self.state.status == "SUSPENDED": + self.state.status = "ACTIVE" + self.state.updated_at = datetime.utcnow() + self.operation_count += 1 + + @workflow.signal + def delete(self) -> None: + self.deleted = True + + @workflow.query + def get_state(self) -> UserState: + return self.state +``` + + + + +```go +// workflow.go +type UserAccountWorkflow struct{} + +type UserState struct { + Status string + Profile ProfileData + PendingEmail string + CreatedAt time.Time + UpdatedAt time.Time +} + +func (w *UserAccountWorkflow) Run(ctx workflow.Context, userId string) error { + state := UserState{ + Status: "ACTIVE", + CreatedAt: workflow.Now(ctx), + } + deleted := false + operationCount := 0 + + err := workflow.SetUpdateHandler(ctx, "updateProfile", func(ctx workflow.Context, data ProfileData) error { + if deleted { + return errors.New("user account is deleted") + } + + if err := workflow.ExecuteActivity(ctx, ValidateProfile, data).Get(ctx, nil); err != nil { + return err + } + + state.Profile = data + state.UpdatedAt = workflow.Now(ctx) + operationCount++ + + return nil + }) + if err != nil { + return err + } + + err = workflow.SetUpdateHandler(ctx, "suspend", func(ctx workflow.Context) error { + if !deleted && state.Status != "SUSPENDED" { + state.Status = "SUSPENDED" + state.UpdatedAt = workflow.Now(ctx) + operationCount++ + } + return nil + }) + if err != nil { + return err + } + + // Block until deleted or Continue-As-New is suggested + for { + selector := workflow.NewSelector(ctx) + selector.AddReceive(workflow.GetSignalChannel(ctx, "delete"), func(c workflow.ReceiveChannel, more bool) { + c.Receive(ctx, nil) + deleted = true + }) + selector.Select(ctx) + + if deleted { + state.Status = "DELETED" + return nil + } + if workflow.GetInfo(ctx).GetContinueAsNewSuggested() { + return workflow.NewContinueAsNewError(ctx, w.Run, userId) + } + } +} +``` + + + + +```java +// UserAccountWorkflow.java +@WorkflowInterface +public 
interface UserAccountWorkflow { + @WorkflowMethod + void run(String userId); + + @UpdateMethod + void updateProfile(ProfileData data); + + @UpdateMethod + void changeEmail(String newEmail); + + @SignalMethod + void suspend(); + + @SignalMethod + void reactivate(); + + @SignalMethod + void delete(); + + @QueryMethod + UserState getState(); +} + +public class UserAccountWorkflowImpl implements UserAccountWorkflow { + private String userId; + private UserState state = new UserState(); + private boolean deleted = false; + private int operationCount = 0; + private static final int CONTINUE_AS_NEW_THRESHOLD = 1000; + + @Override + public void run(String userId) { + this.userId = userId; + if (state.getStatus() == null) { + state.setStatus("ACTIVE"); + state.setCreatedAt(Workflow.currentTimeMillis()); + } + + // Run until deleted or Continue-As-New is needed + Workflow.await(() -> deleted || Workflow.getInfo().isContinueAsNewSuggested()); + + if (!deleted && Workflow.getInfo().isContinueAsNewSuggested()) { + Workflow.continueAsNew(userId, state); + } + + state.setStatus("DELETED"); + state.setDeletedAt(Workflow.currentTimeMillis()); + } + + @Override + public void updateProfile(ProfileData data) { + validateNotDeleted(); + Activities.validateProfile(data); + state.setProfile(data); + state.setUpdatedAt(Workflow.currentTimeMillis()); + incrementOperationCount(); + } + + @Override + public void changeEmail(String newEmail) { + validateNotDeleted(); + Activities.sendVerificationEmail(userId, newEmail); + state.setPendingEmail(newEmail); + state.setUpdatedAt(Workflow.currentTimeMillis()); + incrementOperationCount(); + } + + @Override + public void suspend() { + if (!deleted && !"SUSPENDED".equals(state.getStatus())) { + state.setStatus("SUSPENDED"); + state.setUpdatedAt(Workflow.currentTimeMillis()); + incrementOperationCount(); + } + } + + @Override + public void reactivate() { + if (!deleted && "SUSPENDED".equals(state.getStatus())) { + state.setStatus("ACTIVE"); + 
state.setUpdatedAt(Workflow.currentTimeMillis()); + incrementOperationCount(); + } + } + + @Override + public void delete() { + deleted = true; + } + + @Override + public UserState getState() { + return state; + } + + private void validateNotDeleted() { + if (deleted) { + throw new IllegalStateException("User account is deleted"); + } + } + + private void incrementOperationCount() { + operationCount++; + } +} +``` + + + + +```typescript +// workflow.ts +import { condition, allHandlersFinished, defineUpdate, defineSignal, defineQuery, setHandler, continueAsNew, workflowInfo, proxyActivities } from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { validateProfile } = proxyActivities({ + startToCloseTimeout: '30s', +}); + +interface UserState { + status: string; + profile?: ProfileData; + pendingEmail?: string; + createdAt: number; + updatedAt: number; +} + +export const updateProfileUpdate = defineUpdate('updateProfile'); +export const suspendSignal = defineSignal('suspend'); +export const deleteSignal = defineSignal('delete'); +export const getStateQuery = defineQuery('getState'); + +export async function userAccountWorkflow(userId: string): Promise { + const state: UserState = { + status: 'ACTIVE', + createdAt: Date.now(), + updatedAt: Date.now(), + }; + + let deleted = false; + let operationCount = 0; + + setHandler(updateProfileUpdate, async (data: ProfileData) => { + if (deleted) { + throw new Error('User account is deleted'); + } + + await validateProfile(data); + + state.profile = data; + state.updatedAt = Date.now(); + operationCount++; + + + }); + + setHandler(suspendSignal, () => { + if (!deleted && state.status !== 'SUSPENDED') { + state.status = 'SUSPENDED'; + state.updatedAt = Date.now(); + operationCount++; + } + }); + + setHandler(deleteSignal, () => { + deleted = true; + }); + + setHandler(getStateQuery, () => state); + + // Block until deleted or Continue-As-New is suggested + await condition(() => deleted || 
workflowInfo().continueAsNewSuggested); + + if (!deleted && workflowInfo().continueAsNewSuggested) { + await condition(allHandlersFinished); + await continueAsNew(userId); + } + + state.status = 'DELETED'; +} +``` + + + + +The Workflow blocks on `Workflow.await(() -> deleted)` (Java), `condition(() => deleted)` (TypeScript), `workflow.wait_condition(lambda: self.deleted)` (Python), or `selector.Select(ctx)` (Go) until the delete Signal arrives. +All state transitions happen through Signal and Update handlers, ensuring that every operation on the entity goes through a single Workflow with no race conditions. +Continue-As-New is triggered from the main Workflow method (not from handlers) when `isContinueAsNewSuggested()` returns true. +All SDK docs explicitly warn: do not call Continue-As-New from Update or Signal handlers. +Instead, handlers set state, and the main Workflow method checks whether to Continue-As-New. + +## When to use + +The Entity Workflow pattern is a good fit for user accounts and profiles, IoT devices and sensors, customer relationships (CRM), shopping carts and orders, financial accounts, subscription management, device provisioning and lifecycle, and multi-tenant resources. + +It is not a good fit for short-lived processes (use regular Workflows), stateless operations (use Activities), high-frequency updates (more than 100 per second per entity), or entities with only CRUD operations (use a database). + +## Benefits and trade-offs + +All operations on an entity go through a single Workflow, eliminating race conditions. +The Workflow history provides a complete audit trail of all state changes. +All entity logic lives in one place, and state survives process crashes and restarts. +Temporal provides exactly-once execution and automatic retries. +You can inspect current state through Queries without side effects. + +The trade-offs to consider are that you must use Continue-As-New to prevent unbounded history growth. 
+A single Workflow handles all operations for one entity, which limits throughput. +State is kept in Workflow memory, so you should use Activities for large data. +One Workflow per entity means you should consider costs at scale. +The first operation after an idle period may have latency. + +## Comparison with alternatives + +| Approach | Consistency | Audit trail | Complexity | Scalability | +| :--- | :--- | :--- | :--- | :--- | +| Entity Workflow | Strong | Complete | Low | High (per entity) | +| Database + Locks | Eventual | Manual | High | Very High | +| Event Sourcing | Strong | Complete | High | High | +| Stateless Service | Weak | Manual | Medium | Very High | + +## Best practices + +- **Use entity ID as Workflow ID.** This ensures uniqueness and idempotent starts. +- **Implement Continue-As-New.** Use `isContinueAsNewSuggested()` to check when to continue. Always call Continue-As-New from the main Workflow method, never from handlers. Wait for all handlers to finish before continuing. +- **Validate in Updates.** Use Updates for operations that require validation and a return value. +- **Use Signals for events.** Use Signals for asynchronous notifications that do not need responses. +- **Keep state minimal.** Store large data externally and reference it in the Workflow. +- **Add Queries.** Expose state for monitoring and debugging. +- **Handle deletion.** Implement an explicit deletion or decommission Signal. +- **Version carefully.** Use Worker versioning for Workflow code changes. +- **Set timeouts.** Use Workflow execution timeout as a safety net. +- **Monitor history size.** Alert when approaching the Continue-As-New threshold. + +## Common pitfalls + +- **Calling Continue-As-New from Signal or Update handlers.** Continue-As-New must be called from the main Workflow method, never from inside a handler. Calling it from a handler causes non-determinism errors. 
+- **Not waiting for handlers to finish before Continue-As-New.** Use `allHandlersFinished` (TypeScript), `Workflow.isEveryHandlerFinished()` (Java), or `workflow.all_handlers_finished()` (Python) to ensure in-flight handlers complete before transitioning. +- **Losing Update ID deduplication across Continue-As-New.** Update IDs are scoped to a single Workflow Execution. After Continue-As-New, the same Update ID can be accepted again. Carry processed IDs in the Continue-As-New input if deduplication is needed. +- **Exceeding the 2 MB payload limit on Continue-As-New input.** State passed to Continue-As-New is subject to the same 2 MB blob size limit as Workflow inputs. Use external storage for large state. +- **Using a hardcoded counter instead of `isContinueAsNewSuggested`.** The SDK provides `isContinueAsNewSuggested()` which accounts for actual history size. Hardcoded thresholds may be too aggressive or too lenient. + +## Related patterns + +- **[Continue-As-New](/design-patterns/continue-as-new)**: Essential for preventing unbounded history. +- **[Request-Response via Updates](/design-patterns/request-response-via-updates)**: Synchronous operations with validation. +- **[Signal with Start](/design-patterns/signal-with-start)**: Idempotent Workflow start with an initial Signal. + +## Sample code + +**Python:** +- [Safe Message Handlers](https://github.com/temporalio/samples-python/tree/main/message_passing/safe_message_handlers) — Entity Workflow with Updates, Signals, and Continue-As-New. + +**Go:** +- [Safe Message Handlers](https://github.com/temporalio/samples-go/tree/main/safe_message_handler) — Entity Workflow with Updates, Signals, and Continue-As-New. + +**Java:** +- [Safe Message Handlers](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/safemessagehandler) — Entity Workflow with Updates, Signals, and Continue-As-New. 
+ +**TypeScript:** +- [Safe Message Handlers](https://github.com/temporalio/samples-typescript/tree/main/message-passing/safe-message-handlers) — Entity Workflow with Updates, Signals, and Continue-As-New. + +## References + +- [Temporal Blog: Very Long-Running Workflows](https://temporal.io/blog/very-long-running-workflows) — Guidance on managing Workflows that run for extended periods. +- [Temporal Docs: Continue-As-New](https://docs.temporal.io/workflows#continue-as-new) — Official documentation on the Continue-As-New mechanism. diff --git a/docs/design-patterns/error-handling-patterns.mdx b/docs/design-patterns/error-handling-patterns.mdx new file mode 100644 index 0000000000..5c85ca6545 --- /dev/null +++ b/docs/design-patterns/error-handling-patterns.mdx @@ -0,0 +1,115 @@ +--- +id: error-handling-patterns +title: Error Handling & Retry Patterns +sidebar_label: Overview +description: Patterns for controlling how Temporal retries Activities, surfaces persistent failures, and recovers from errors that require human intervention. +--- + +import PatternCards from '@site/src/components/PatternCards'; + +Patterns for controlling how Temporal retries Activities, surfaces persistent failures, and recovers from errors that require human intervention. + + + +## Choosing the right pattern + +The following decision tree helps you select the appropriate retry strategy for your use case. + +```mermaid +flowchart TD + Start([Activity failing]) --> Q1{Each attempt
costs money or
consumes quota?} + Q1 -->|Yes| FixedCount[Fixed Count of Retries
Cap MaximumAttempts] + Q1 -->|No| Q2{Will this error
ever succeed
automatically?} + Q2 -->|No| Q2a{Can a human
correct and retry?} + Q2a -->|Yes| Resumable[Resumable Activity
Park and await signal] + Q2a -->|No| NonRetryable[Non-Retryable Errors
Fail fast] + Q2 -->|Yes| Q3{Downstream has
a predictable
unavailability window?} + Q3 -->|Yes| Delayed[Delayed Retry
Fixed interval backoff] + Q3 -->|No| Q4{Must resolve
within a
time budget?} + Q4 -->|Yes| WallTime[Fixed Wall-Time Retries
ScheduleToCloseTimeout] + Q4 -->|No| Q5{Want aggressive
initial retries then
patient recovery?} + Q5 -->|Yes| FastSlow[Fast/Slow Retries
Two-phase retry policy] + Q5 -->|No| Metrics[Retry Alerting via Metrics
Emit metrics at attempt threshold] +``` + +The following describes each decision point: + +1. If each attempt consumes a paid API call, a rate-limited token, or another scarce resource, use **Fixed Count of Retries** to cap total consumption. +2. If the error is structural — a missing record, invalid input, or authorization failure — and cannot be corrected automatically, ask whether a human can fix it: if so, use **Resumable Activity** to park the Workflow and await a correction signal; otherwise use **Non-Retryable Errors** to fail fast. +3. If the downstream system has a scheduled maintenance window and you know approximately how long it will be unavailable, use **Delayed Retry** with a fixed interval. +4. If the process must resolve (one way or another) within a business SLA window such as 24 hours, use **Fixed Wall-Time Retries** with `ScheduleToCloseTimeout`. +5. If you want to recover from transient errors quickly but also wait indefinitely for the downstream system to come back, use **Fast/Slow Retries**. +6. For any long-running retry scenario, add **Retry Alerting via Metrics** to surface persistent failures before they breach an SLA. + +## How Temporal retries work + +Temporal's default `RetryPolicy` retries Activities indefinitely with exponential backoff. +Unless you configure a policy, a failing Activity keeps retrying until its `ScheduleToCloseTimeout` elapses or the Workflow itself completes. + +The key `RetryPolicy` fields are: + +| Field | Default | Effect | +| :--- | :--- | :--- | +| `MaximumAttempts` | 0 (unlimited) | Caps total attempts including the first | +| `InitialInterval` | 1 second | Delay before the first retry | +| `BackoffCoefficient` | 2.0 | Multiplier applied after each retry | +| `MaximumInterval` | 100× InitialInterval | Upper bound on the backoff delay | +| `NonRetryableErrorTypes` | `[]` | Error types that skip retries entirely | + +`ScheduleToCloseTimeout` is set on the Activity call options, not in `RetryPolicy`.
+It caps the total wall-clock time from when the Activity is first scheduled to when it must complete — across all retry attempts. + +## Related patterns + +- [Long Running Activity](/design-patterns/long-running-activity): Heartbeating and resumable progress for Activities that run for minutes to hours. +- [Polling External Services](/design-patterns/polling): Periodic status checks when the downstream system is asynchronous. +- [Approval](/design-patterns/approval): Human-in-the-loop gate before a Workflow proceeds. + +## References + +- [Temporal Retry Policies](https://docs.temporal.io/encyclopedia/retry-policies) +- [Understanding Workflow Retries and Failures](https://community.temporal.io/t/understanding-workflow-retries-and-failures/122) +- [Failure Handling in Practice](https://temporal.io/blog/failure-handling-in-practice) diff --git a/docs/design-patterns/event-accumulator.mdx b/docs/design-patterns/event-accumulator.mdx new file mode 100644 index 0000000000..ca4109550b --- /dev/null +++ b/docs/design-patterns/event-accumulator.mdx @@ -0,0 +1,409 @@ +--- +id: event-accumulator +title: "Event Accumulator Pattern" +sidebar_label: "Event Accumulator" +description: "Buffers a stream of incoming Signals and processes them together in batches based on size or time thresholds." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +:::info[TL;DR] +Use the Event Accumulator pattern to **durably collect and process events from multiple senders over an unlimited time.** The workflow accumulates signals, deduplicates by a stable item key, and processes the batch after a sliding inactivity timeout — no external coordination, no lost events on retry. +::: + +## Overview + +The Accumulator pattern groups a stream of incoming events by a key and processes them together as a batch. 
+A *group key* is a stable, domain-specific identifier — for example, an order ID, customer ID, or session token — that logically binds related events belonging to the same accumulation window. +A single workflow instance per group key receives signals as events arrive, deduplicates them, and waits with a sliding inactivity timer. +When no new events arrive within the timeout window, the workflow calls a batch processing activity and completes. + +## Problem + +In distributed systems, events for the same logical entity arrive asynchronously from multiple producers at unpredictable rates. +Processing each event individually wastes downstream resources and makes throughput harder to control. +Ensuring exactly one active collector per group key — even under concurrent producers — requires coordination logic that is difficult to build reliably without distributed state. + +Without the Accumulator pattern, you must: + +- Handle race conditions when multiple producers try to start the same collection workflow simultaneously. +- Implement deduplication externally, because at-least-once delivery is common in event streams. +- Manage a resettable timer that extends the collection window each time a new event arrives, without a reliable durable timer primitive. +- Persist collection state externally across restarts and failures. +- Gracefully handle the case where a long accumulation period grows the workflow history beyond safe limits. + +## Solution + +You assign each group a deterministic workflow ID derived from the group key (for example, `accumulator-order-123`). +The steps and code samples below refer to the group key as the *bucket key*. +Producers call Signal-With-Start so the workflow is created on first use and receives additional signals on subsequent calls — without any client-side coordination. +Inside the workflow, a wait condition with a timeout (`Workflow.await()` in Java, `condition()` in TypeScript, `workflow.wait_condition()` in Python, or a `Selector` with a timer in Go) implements a sliding inactivity window: each arriving signal resets the countdown.
+When the countdown expires — or when an explicit flush signal is sent — the workflow passes all accumulated, deduplicated events to a batch processing activity and completes. +If the accumulation period is long enough to grow the workflow history near its limit, the workflow uses Continue-As-New to carry its state forward into a fresh run. + +```mermaid +sequenceDiagram + participant P as Producers + participant T as Temporal + participant W as Accumulator Workflow + participant A as Batch Activity + + P->>T: SignalWithStart(order-A, item-1) + T->>W: Start workflow + deliver add-item(item-1) + activate W + Note over W: Accumulate item-1 · reset timer + + P->>T: SignalWithStart(order-A, item-2) + T->>W: Deliver add-item(item-2) + Note over W: Accumulate item-2 · reset timer + + P->>T: SignalWithStart(order-A, item-1) ← duplicate + T->>W: Deliver add-item(item-1) + Note over W: Deduplicate: item-1 already seen + + Note over W: Timer expires — no new signals + + W->>A: processItems(order-A, [item-1, item-2]) + A-->>W: "Order order-A: 2 items fulfilled" + deactivate W +``` + +The following describes each step in the diagram: + +1. A producer calls Signal-With-Start with `order-A` as the bucket key. Temporal starts `accumulator-order-A` and delivers the first `add-item` signal. The workflow adds `item-1` to its list and starts the inactivity timer. +2. A second producer calls Signal-With-Start for the same order. Temporal finds the workflow already running and delivers the signal — no new instance is created. The workflow adds `item-2` and resets the timer. +3. The first producer retries `item-1` due to at-least-once delivery. The workflow checks its deduplication set, finds `item-1` already recorded, and discards the duplicate. Alternatively, you can replace the existing record with the new payload — useful when a producer may resend an updated version of the same event under the same key. +4. No new signals arrive within the inactivity window. 
The `Workflow.await()` condition times out. +5. The workflow calls `processItems` with all accumulated items. The activity returns a result, and the workflow completes. + +## Implementation + + +The following examples highlight the key mechanics of the accumulator loop. +Full implementations including the worker, starter, and shared types are available in the runner above. + + + + +```typescript +// workflows.ts +export async function accumulatorWorkflow( + bucketKey: string, + accumulated: OrderItem[] = [], + seenKeys: string[] = [], +): Promise { + const seenSet = new Set(seenKeys); + const items: OrderItem[] = [...accumulated]; + const unprocessed: OrderItem[] = []; + let flushRequested = false; + + setHandler(addItemSignal, (item: OrderItem) => { + unprocessed.push(item); + }); + setHandler(flushSignal, () => { + flushRequested = true; + }); + + do { + // Sliding window: wait for a signal or let the inactivity timer fire + const timedOut = !(await condition( + () => unprocessed.length > 0 || flushRequested, + "10 seconds", + )); + + // Drain and deduplicate incoming signals + while (unprocessed.length > 0) { + const item = unprocessed.shift()!; + if (item.orderId === bucketKey && !seenSet.has(item.itemId)) { + seenSet.add(item.itemId); + items.push(item); + } + } + + if (timedOut || flushRequested) { + const result = await processItems(bucketKey, items); + if (unprocessed.length === 0) return result; + // More signals arrived after timeout/flush — loop to process them + } + } while (unprocessed.length > 0 || !workflowInfo().continueAsNewSuggested); + + // History growing large — continue as new, carrying accumulated state forward + await continueAsNew(bucketKey, items, [...seenSet]); + return ""; // unreachable +} +``` + + + + +```python +# workflows.py +@workflow.defn +class AccumulatorWorkflow: + def __init__(self) -> None: + self._unprocessed: deque[OrderItem] = deque() + self._flush_requested = False + + @workflow.signal(name="add-item") + async def 
add_item(self, item: OrderItem) -> None: + self._unprocessed.append(item) + + @workflow.signal(name="flush") + async def flush(self) -> None: + self._flush_requested = True + + @workflow.run + async def run( + self, + bucket_key: str, + accumulated: list[OrderItem] | None = None, + seen_keys: list[str] | None = None, + ) -> str: + items = list(accumulated or []) + seen_set = set(seen_keys or []) + + while True: + # Sliding window: wait for a signal or let the inactivity timer fire + timed_out = not await workflow.wait_condition( + lambda: bool(self._unprocessed) or self._flush_requested, + timeout=timedelta(seconds=10), + ) + + # Drain and deduplicate the signal queue + while self._unprocessed: + item = self._unprocessed.popleft() + if item.order_id == bucket_key and item.item_id not in seen_set: + seen_set.add(item.item_id) + items.append(item) + + if timed_out or self._flush_requested: + result = await workflow.execute_activity( + process_items, + args=[bucket_key, items], + start_to_close_timeout=timedelta(seconds=10), + ) + if not self._unprocessed: + return result + # More signals arrived after timeout/flush — loop to process them + + if not self._unprocessed and workflow.info().is_continue_as_new_suggested(): + workflow.continue_as_new(args=[bucket_key, items, sorted(seen_set)]) +``` + + + + +```go +// workflows.go +func AccumulatorWorkflow(ctx workflow.Context, bucketKey string, items []OrderItem, seenKeys []string) (string, error) { + seenSet := make(map[string]bool) + for _, k := range seenKeys { + seenSet[k] = true + } + accumulated := append([]OrderItem{}, items...) 
+ + addItemCh := workflow.GetSignalChannel(ctx, "add-item") + flushCh := workflow.GetSignalChannel(ctx, "flush") + flushRequested := false + + for { + // Drain any signals buffered before this iteration + for { + var item OrderItem + if !addItemCh.ReceiveAsync(&item) { + break + } + if item.OrderID == bucketKey && !seenSet[item.ItemID] { + seenSet[item.ItemID] = true + accumulated = append(accumulated, item) + } + } + var voidFlush interface{} + if flushCh.ReceiveAsync(&voidFlush) { + flushRequested = true + } + if flushRequested { + break + } + if workflow.GetInfo(ctx).GetContinueAsNewSuggested() { + keys := make([]string, 0, len(seenSet)) + for k := range seenSet { + keys = append(keys, k) + } + sort.Strings(keys) // deterministic order for replay + return "", workflow.NewContinueAsNewError(ctx, AccumulatorWorkflow, bucketKey, accumulated, keys) + } + + // Sliding window: wait for a signal or let the inactivity timer fire + timedOut := false + timerCtx, cancelTimer := workflow.WithCancel(ctx) + timer := workflow.NewTimer(timerCtx, maxAwaitTime) + selector := workflow.NewSelector(ctx) + selector.AddFuture(timer, func(f workflow.Future) { timedOut = true }) + selector.AddReceive(addItemCh, func(c workflow.ReceiveChannel, _ bool) { + var item OrderItem + c.Receive(ctx, &item) + if item.OrderID == bucketKey && !seenSet[item.ItemID] { + seenSet[item.ItemID] = true + accumulated = append(accumulated, item) + } + }) + selector.AddReceive(flushCh, func(c workflow.ReceiveChannel, _ bool) { + var void interface{} + c.Receive(ctx, &void) + flushRequested = true + }) + selector.Select(ctx) + cancelTimer() // no-op if timer already fired; cancels timer if a signal arrived + + if timedOut || flushRequested { + break + } + } + + ao := workflow.ActivityOptions{StartToCloseTimeout: 10 * time.Second} + actCtx := workflow.WithActivityOptions(ctx, ao) + var result string + if err := workflow.ExecuteActivity(actCtx, ProcessItems, bucketKey, accumulated).Get(ctx, &result); err != nil 
{ + return "", err + } + workflow.GetLogger(ctx).Info("Processed order batch", "bucketKey", bucketKey, "count", len(accumulated)) + return result, nil +} +``` + + + + +```java +// AccumulatorWorkflow.java +@WorkflowInterface +public interface AccumulatorWorkflow { + @WorkflowMethod + String accumulate(String bucketKey, List items, List seenKeys); + + @SignalMethod + void addItem(Shared.OrderItem item); + + @SignalMethod + void flush(); + + class Impl implements AccumulatorWorkflow { + private final Activities activities = Workflow.newActivityStub( + Activities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Shared.MAX_AWAIT_TIME.plusSeconds(10)) + .build()); + + private final ArrayDeque unprocessed = new ArrayDeque<>(); + private boolean flushRequested = false; + + @Override + public String accumulate(String bucketKey, List itemsInput, List seenKeysInput) { + List items = new ArrayList<>(itemsInput); + Set seenSet = new HashSet<>(seenKeysInput); + + do { + // Sliding window: wait for a signal or let the inactivity timer fire + boolean timedOut = !Workflow.await( + Shared.MAX_AWAIT_TIME, + () -> !unprocessed.isEmpty() || flushRequested); + + // Drain and deduplicate the signal queue + while (!unprocessed.isEmpty()) { + Shared.OrderItem item = unprocessed.removeFirst(); + if (item.orderId.equals(bucketKey) && seenSet.add(item.itemId)) { + items.add(item); + } + } + + if (timedOut || flushRequested) { + String result = activities.processItems(bucketKey, items); + if (unprocessed.isEmpty()) return result; + // More signals arrived after timeout/flush — loop to process them + } + } while (!unprocessed.isEmpty() || !Workflow.getInfo().isContinueAsNewSuggested()); + + // History growing large — continue as new, carrying accumulated state forward + AccumulatorWorkflow continueAsNew = Workflow.newContinueAsNewStub(AccumulatorWorkflow.class); + List seenKeysList = new ArrayList<>(seenSet); + java.util.Collections.sort(seenKeysList); // deterministic order for 
replay + continueAsNew.accumulate(bucketKey, items, seenKeysList); + return ""; // unreachable + } + + @Override + public void addItem(Shared.OrderItem item) { + unprocessed.add(item); + } + + @Override + public void flush() { + flushRequested = true; + } + } +} +``` + + + + +The key differences between SDKs: + +- **TypeScript**: Uses `setHandler` to register signal handlers and `condition()` with a string duration for the sliding window. `continueAsNew()` carries accumulated state to the next run. +- **Python**: Uses `@workflow.signal(name=...)` decorators and `workflow.wait_condition()` with a `timedelta` timeout. `workflow.continue_as_new()` accepts the new run arguments. +- **Go**: Uses `workflow.GetSignalChannel` to obtain named signal channels, then builds a `Selector` per iteration with a `NewTimer` future and two channel receivers. Cancels the old timer whenever a signal arrives before it fires. +- **Java**: Uses `@SignalMethod` annotations and `Workflow.await(Duration, Supplier)`. `Workflow.newContinueAsNewStub` carries state to the next run. + +Producers in all SDKs call Signal-With-Start to atomically start the workflow if not running and deliver the first signal without any client-side coordination. + +## When to use + +The Accumulator pattern is well suited when events related to the same entity or group arrive from multiple producers — including a single consumer (for example, a Kafka consumer) polling a topic that carries events for multiple groups simultaneously — and can be processed together as batches, when downstream systems prefer batched calls rather than one call per event, and when at-least-once event delivery makes deduplication necessary at the collection layer. 
+ +It is not a good fit for use cases that require processing every event individually in _strict order_, for cases where the batch size is known in advance and all events arrive within a short deterministic window (a standard workflow is sufficient), or when events for different keys must be correlated at processing time (consider fan-in with e.g. Child Workflows or multiple levels of Accumulator). + +## Benefits and trade-offs + +The Accumulator pattern reduces downstream load by consolidating many individual events into a single batch and activity call. +Signal-With-Start eliminates client-side coordination logic for starting or locating the collector workflow. +Temporal's durable execution guarantees that accumulated state survives Worker restarts, and Continue-As-New prevents history growth from becoming a long-term problem. + +The trade-offs to consider are that events are not processed until the inactivity timeout fires or a flush signal is sent, introducing intentional latency. +The timeout value is a domain-specific trade-off between latency and batch size. +If producers stop sending signals but never send a flush, the workflow holds open resources until the timeout fires. + +## Best practices + +- **Use a deterministic workflow ID per bucket key.** Encode the group key and, optionally, an accumulation period (for example, `accumulator-order-123-2026-05-14`) to control when a new window starts. +- **Always use Signal-With-Start from producers.** Calling start and signal separately is not atomic: a signal sent between the two calls can be lost if the workflow has not yet started. +- **Include a deduplication key in every signal payload.** At-least-once delivery is common in event streams; without a dedup key, retried events add duplicate entries to the batch. +- **Size the inactivity timeout to your domain's quiet period.** The timeout should reflect how long you are confident no more events will arrive for this batch. 
Tune it based on observed producer behavior, not an arbitrary constant. +- **Pass accumulated state as workflow arguments into each Continue-As-New run.** Workflow state does not survive a Continue-As-New transition automatically; always pass items and the seen-keys list as arguments to the new run. If you omit the seen-keys list, any signal delivered to both the old and new run during the Continue-As-New handoff will be processed again in the new run, producing duplicate entries in the batch. +- **Keep signal handlers fast and side-effect free.** Signal handlers must not call activities or yield to the scheduler. Buffer incoming signals and process them in the main workflow loop. +- **Add a flush signal for testing and operational runbooks.** An explicit flush signal lets you trigger early batch processing without waiting for the timeout, which is useful for end-to-end tests and manual intervention. +- **Keep producer signal rate below 5/sec per workflow instance.** Each signal briefly locks the workflow execution. A sustained rate above roughly 5 signals/second causes workflow task backlog, limits throughput, and can eventually prevent Continue-As-New from completing. If your producer rate is higher, partition by a finer-grained key so each accumulator workflow receives a manageable share of the total signal volume. +- **Account for the 10,000-signal-per-run limit on Temporal Cloud.** A single workflow run in Temporal Cloud can receive at most 10,000 signals. If your accumulation window is long and producers are active, ensure your Continue-As-New trigger fires well before the per-run signal count reaches this limit. + +## Common pitfalls + +- **Non-deterministic or random workflow IDs.** If the workflow ID is not derived deterministically from the group key, multiple accumulator instances are created for the same group, splitting the batch. +- **Calling start followed by a separate signal.** These are not atomic. 
A signal sent between the two calls will be lost if the workflow has not yet started. Use Signal-With-Start instead. +- **Omitting a deduplication key.** Retried or re-delivered events add duplicate entries to the batch. Every signal payload must carry a stable, unique key. +- **Timeout set too short.** The workflow processes a partial batch while more events are still in flight, forcing producers to re-send unprocessed events. Profile your producer arrival rate before choosing a timeout. +- **Forgetting to pass accumulated state into Continue-As-New.** The new run starts with empty state and re-processes events from scratch, producing duplicate batches. +- **Calling activities inside signal handlers.** Signal handlers run synchronously in the workflow thread and must not block or call activities. Buffer the item and let the main loop handle activity calls. +- **Assuming workflow completion means all events were captured.** Producers that send signals after the workflow completes will start a new accumulator instance. Decide whether this is intentional (a new accumulation window) or an error. +- **Signal rate too high to allow Continue-As-New to complete.** Continue-As-New requires a brief window (~100 ms) with no unhandled signals. If producers send signals continuously without pause, the workflow can never enter that window, history grows without bound, and Temporal will eventually terminate the workflow. Partition by a finer-grained key, throttle producers, or batch multiple events into a single signal payload to keep the per-instance signal rate low enough for CAN to succeed. +- **Not draining the signal queue before calling Continue-As-New.** Any signal that arrives between the CAN decision and the actual CAN execution can be lost if the signal buffer is not empty when CAN fires. 
All SDK implementations in this pattern guard against this by re-checking the unprocessed queue before continuing; do not remove that guard or call CAN unconditionally on the history-size trigger. + +## Related patterns + +- **[Signal with Start](/design-patterns/signal-with-start)** — the atomic start-and-signal primitive this pattern uses to create or locate the accumulator workflow. +- **[Continue-As-New](/design-patterns/continue-as-new)** — used to reset workflow history when the accumulation period is long. +- **[Updatable Timer](/design-patterns/updatable-timer)** — an alternative approach for a resettable timer that does not require signals. +- **[Entity Workflow](/design-patterns/entity-workflow)** — a broader pattern for long-lived, keyed workflow instances. + +## Sample code + +- [Java Sample](https://github.com/temporalio/samples-java/blob/main/core/src/main/java/io/temporal/samples/hello/HelloAccumulator.java) — the canonical accumulator example from the Temporal Java samples repository. diff --git a/docs/design-patterns/external-interaction-patterns.mdx b/docs/design-patterns/external-interaction-patterns.mdx new file mode 100644 index 0000000000..15c3abca43 --- /dev/null +++ b/docs/design-patterns/external-interaction-patterns.mdx @@ -0,0 +1,43 @@ +--- +id: external-interaction-patterns +title: External interaction patterns +sidebar_label: Overview +description: Patterns for waiting on or interacting with systems and actors outside the Workflow, including external APIs, human decisions, and scheduled delays. +--- + +import PatternCards from '@site/src/components/PatternCards'; + +Patterns for waiting on or interacting with systems and actors outside the Workflow, including external APIs, human decisions, and scheduled delays. 
+ + diff --git a/docs/design-patterns/fairness.mdx b/docs/design-patterns/fairness.mdx new file mode 100644 index 0000000000..b9224dfe7b --- /dev/null +++ b/docs/design-patterns/fairness.mdx @@ -0,0 +1,300 @@ +--- +id: fairness +title: "Fairness" +sidebar_label: "Fairness" +description: "Distributes Worker capacity evenly across tenants or users so that a burst from one caller does not starve the others." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +:::info[TLDR] +Assign a `FairnessKey` and weight to Workflows and Activities so each tenant or group receives the **correct proportional share of Worker capacity** on a shared Task Queue. Use this when a high-volume caller would otherwise starve other tenants and you want to avoid maintaining a separate queue per tenant. +::: + +## Overview + +The Fairness pattern distributes Worker capacity proportionally across tenants or user groups within a single Task Queue so that a burst from one caller cannot starve others. Each group is assigned a fairness key and an optional weight; the Temporal matching service dispatches tasks in weighted round-robin order across all keys. + +## Problem + +When multiple tenants (e.g. customers) share a single Task Queue, a high-volume tenant can fill the queue and occupy all Worker slots. Other tenants receive no service until the dominant tenant's backlog drains. This starvation violates throughput guarantees and makes latency for lower-volume tenants unpredictable under burst conditions. + +The classic workaround—assigning one Task Queue per tenant—scales poorly: each new tenant requires a new Worker deployment, idle capacity on low-traffic tenants cannot be used by busy ones, and queue management complexity grows with tenant count.
+ +## Solution + +Temporal's native Fairness feature lets you assign a `FairnessKey` (a string identifier such as a tenant name or tier) and an optional `FairnessWeight` (a positive float, default 1.0) to Workflows, Activities, and Child Workflows. The Temporal matching service creates a virtual queue for each key and dispatches tasks in proportion to their weights. A single shared Worker pool serves all keys; no extra queues or routing logic is required. + +For example, assigning weights of 5.0, 3.0, and 2.0 to `premium`, `basic`, and `free` tiers causes 50% of dispatched tasks to come from `premium`, 30% from `basic`, and 20% from `free`—regardless of backlog depth. Within a single fairness key, tasks are dispatched in FIFO order. + +```mermaid +flowchart TD + WA["Workflow\nfairness_key=tenant-big\n(weight 1.0)"] --> TQ["my-task-queue"] + WB["Workflow\nfairness_key=tenant-mid\n(weight 1.0)"] --> TQ + WC["Workflow\nfairness_key=tenant-small\n(weight 1.0)"] --> TQ + TQ --> VQ1["Virtual Queue\ntenant-big"] + TQ --> VQ2["Virtual Queue\ntenant-mid"] + TQ --> VQ3["Virtual Queue\ntenant-small"] + VQ1 -->|round-robin| W["Shared Workers"] + VQ2 -->|round-robin| W + VQ3 -->|round-robin| W + W --> DS["Downstream\nService"] +``` + +The following describes each step in the diagram: + +1. Workflows start with a `FairnessKey` matching the tenant or group identity. +2. The Temporal matching service routes each task to the corresponding virtual queue inside the single Task Queue. +3. Workers poll the Task Queue and receive tasks in weighted round-robin order across all fairness keys. +4. Tenant-big's large backlog does not prevent tenant-mid or tenant-small from receiving service. + +## Implementation + +### Enable Fairness + +**Temporal Cloud:** Navigate to the Namespace's Overview page in the UI and activate the Fairness toggle. Fairness is a paid feature in Temporal Cloud. 
+ +**Self-hosted Temporal:** Set `matching.enableFairness` to `true` in the [dynamic config](https://docs.temporal.io/temporal-service/configuration#dynamic-configuration) for the relevant Task Queues or Namespaces. + +### Set fairness key and weight at Workflow start + + + + +```python +from temporalio.common import Priority + +handle = await client.start_workflow( + ProcessOrder.run, + id="process-order-wf", + task_queue="my-task-queue", + priority=Priority( + fairness_key="tenant-a", + fairness_weight=2.0, + ), +) +``` + + + + +```go +we, err := c.ExecuteWorkflow( + context.Background(), + client.StartWorkflowOptions{ + ID: "process-order-wf", + TaskQueue: "my-task-queue", + Priority: temporal.Priority{ + FairnessKey: "tenant-a", + FairnessWeight: 2.0, + }, + }, + ProcessOrder, +) +``` + + + + +```java +WorkflowOptions options = WorkflowOptions.newBuilder() + .setWorkflowId("process-order-wf") + .setTaskQueue("my-task-queue") + .setPriority(Priority.newBuilder() + .setFairnessKey("tenant-a") + .setFairnessWeight(2.0f) + .build()) + .build(); +ProcessOrder workflow = client.newWorkflowStub(ProcessOrder.class, options); +WorkflowClient.start(workflow::run); +``` + + + + +### Set fairness key and weight on Activities + +Activities inherit the parent Workflow's fairness key and weight. Override them in `ActivityOptions` when an Activity should belong to a different fairness group than its Workflow. Each field (`priority_key`, `fairness_key`, `fairness_weight`) is resolved independently in this order: Task Queue weight overrides (highest precedence), value set explicitly in the options, value inherited from the calling Workflow, then the default. Workflows started with Continue-As-New inherit the current execution's priority values unless you pass explicit values. See [Inheritance](https://docs.temporal.io/develop/task-queue-priority-fairness#inheritance) in the Temporal docs for the full resolution diagram. 
+ + + + +```python +from temporalio.common import Priority + +# inside the workflow +result = await workflow.execute_activity( + process_for_tenant, + tenant_request, + start_to_close_timeout=timedelta(minutes=1), + priority=Priority( + fairness_key="tenant-a", + fairness_weight=2.0, + ), +) +``` + + + + +```go +ao := workflow.ActivityOptions{ + StartToCloseTimeout: time.Minute, + Priority: temporal.Priority{ + FairnessKey: "tenant-a", + FairnessWeight: 2.0, + }, +} +ctx = workflow.WithActivityOptions(ctx, ao) +err := workflow.ExecuteActivity(ctx, ProcessForTenant, req).Get(ctx, nil) +``` + + + + +```java +ActivityOptions options = ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(1)) + .setPriority(Priority.newBuilder() + .setFairnessKey("tenant-a") + .setFairnessWeight(2.0f) + .build()) + .build(); +TenantActivity activity = Workflow.newActivityStub(TenantActivity.class, options); +activity.processForTenant(request); +``` + + + + +### Set queue-level and per-key rate limits via CLI + +You can rate-limit the entire Task Queue and set a default per-fairness-key limit. The per-key limit is scaled by the fairness weight for that key, so a key with weight 2.5 and a default per-key limit of 10 gets an effective limit of 25 tasks/second. + +```sh +temporal task-queue config set \ + --task-queue my-task-queue \ + --task-queue-type activity \ + --namespace my-namespace \ + --queue-rps-limit 500 \ + --queue-rps-limit-reason "overall limit" \ + --fairness-key-rps-limit-default 33.3 \ + --fairness-key-rps-limit-reason "per-key limit" +``` + +### Override fairness weights via CLI + +When it is more convenient to manage weights through configuration than to embed them in client code, you can override weights for up to 1000 keys per Task Queue. Overrides take precedence over the weight attached to a task's options and can be updated without a code deploy. 
+ +```sh +temporal task-queue config set \ + --task-queue my-task-queue \ + --task-queue-type workflow \ + --namespace my-namespace \ + --fairness-key-weight premium=5.0 \ + --fairness-key-weight basic=3.0 \ + --fairness-key-weight free=2.0 +``` + +### Using Priority and Fairness together + +Priority and Fairness can be combined. Priority determines which sub-queue (1–5) a task enters; Fairness determines the dispatch order within each priority level. Set both `PriorityKey` and `FairnessKey` on the same options object. + + + + +```python +from temporalio.common import Priority + +handle = await client.start_workflow( + ChargeCustomer.run, + id="charge-customer-wf", + task_queue="my-task-queue", + priority=Priority( + priority_key=1, + fairness_key="tenant-a", + fairness_weight=2.0, + ), +) +``` + + + + +```go +we, err := c.ExecuteWorkflow( + context.Background(), + client.StartWorkflowOptions{ + ID: "charge-customer-wf", + TaskQueue: "my-task-queue", + Priority: temporal.Priority{ + PriorityKey: 1, + FairnessKey: "tenant-a", + FairnessWeight: 2.0, + }, + }, + ChargeCustomer, +) +``` + + + + +```java +WorkflowOptions options = WorkflowOptions.newBuilder() + .setWorkflowId("charge-customer-wf") + .setTaskQueue("my-task-queue") + .setPriority(Priority.newBuilder() + .setPriorityKey(1) + .setFairnessKey("tenant-a") + .setFairnessWeight(2.0f) + .build()) + .build(); +``` + + + + +## When to use + +This pattern is a good fit for multi-tenant applications where large tenants should not block small tenants, for workloads that need proportional capacity allocation across groups without hard rate limits, and when the set of tenants or groups is dynamic (new keys can be introduced without deploying new Workers). For a broader look at multi-tenancy strategies in Temporal, see [Multi-Tenant Patterns](https://docs.temporal.io/production-deployment/multi-tenant-patterns). 
+ +It is not a good fit when absolute throughput isolation is required (dedicated queues per tenant or [task queue priorities](/design-patterns/priority-task-queues) remain the appropriate choice). + +## Benefits and trade-offs + +A single Worker pool serves all tenants; idle capacity from a low-traffic tenant automatically benefits high-traffic tenants rather than going to waste. New tenants require no Worker deployment—add a fairness key and Temporal starts dispatching their tasks immediately. Weights can be updated via CLI without redeploying application code. + +Fairness requires explicit enablement on Temporal Cloud and self-hosted deployments. Accuracy can degrade with a very large number of fairness keys. Fairness weight applies at schedule time, not dispatch time: changing a weight does not retroactively reorder tasks already in the backlog. + +## Comparison with alternatives + +| Approach | Tenant isolation | Dynamic tenants | Shares idle capacity | Complexity | +| :--- | :--- | :--- | :--- | :--- | +| Temporal FairnessKey (native) | Soft | Yes | Yes | Low | +| Dedicated queue per tenant | Hard | No | No | Medium | +| Single shared queue (no control) | None | Yes | Yes | Lowest | +| External queue with per-tenant consumer groups | Hard | Yes | No | High | + +## Best practices + +- **Use stable, consistent naming for fairness keys.** Use account IDs or tenant slugs rather than display names. Key names cannot be changed retroactively on tasks already in the backlog. +- **Combine Priority and Fairness for multi-class, multi-tenant workloads.** Priority separates urgent from batch work; Fairness prevents any single tenant from dominating within each priority level. +- **Monitor queue depth by fairness key.** Sustained backlog growth for a particular key means that key's weighted share of Worker capacity cannot keep up with its submission rate.
+ +## Common pitfalls + +- **Expecting Fairness to reorder the existing backlog.** Fairness weight is evaluated at schedule time. Enabling Fairness on a Namespace with an existing backlog drains that backlog in its original order first; the fairness-aware dispatch mode takes effect only for newly submitted tasks. +- **Using Fairness as a hard rate limiter.** Fairness controls proportional dispatch but does not cap the absolute throughput of any one key. For hard throughput caps, combine Fairness with per-fairness-key RPS limits via the CLI. +- **Assuming unkeyed tasks bypass Fairness.** Tasks without a `FairnessKey` are grouped under an implicit empty-string key and participate in round-robin dispatch alongside named keys with a weight of 1.0. They do not bypass Fairness; they compete with the named keys as a single group. +- **Task Queue partitioning reducing accuracy.** Task Queues are internally partitioned and tasks are distributed to partitions randomly, which can interfere with fair dispatch proportions. If your workload requires higher accuracy, contact Temporal Support to configure a single-partition Task Queue. +- **Assuming Fairness applies across Worker Versioning boundaries.** When using Worker Versioning and moving Workflows between versions, Priority still applies across versions but Fairness is only guaranteed among tasks originally queued on the same Worker version. Tasks moved from one version to another may not dispatch in fairness order relative to tasks on the destination version. +- **Expecting consistent fairness immediately after a server restart.** Fairness ordering is preserved across restarts for the most active keys. Less active keys may briefly dispatch new tasks ahead of their existing backlog until ordering normalizes. +- **Expecting the running task mix to immediately reflect fair dispatch.** Fairness governs which task is dispatched next; it does not account for tasks already running on Workers. 
The mix of in-flight tasks at any moment may not match the configured weight ratios. + +## Related patterns + +- **[Priority Task Queues](/design-patterns/priority-task-queues)**: Order tasks by urgency level within the same Task Queue using `PriorityKey`. +- **[Downstream Rate Limiting](/design-patterns/downstream-rate-limiting)**: Cap absolute throughput to a downstream service with a queue RPS setting. +- **[Worker-Specific Task Queues](/design-patterns/worker-specific-taskqueue)**: Route Activities to a specific Worker host for resource or data affinity. diff --git a/docs/design-patterns/fanout-child-workflows.mdx b/docs/design-patterns/fanout-child-workflows.mdx new file mode 100644 index 0000000000..e72f7a6e5c --- /dev/null +++ b/docs/design-patterns/fanout-child-workflows.mdx @@ -0,0 +1,326 @@ +--- +id: fanout-child-workflows +title: "Fan-Out with Child Workflows" +sidebar_label: "Fan-Out with Child Workflows" +description: "Distributes a large record set across parallel Child Workflows for concurrent processing with automatic scaling." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +:::info[TLDR] +Split your record set into fixed-size chunks and start **one child Workflow per chunk** so that each chunk's history stays within Temporal's limits. Use this when you want maximum concurrency with no rate control and you can pre-compute how many chunks you need before the job starts. Keep the total number of children per parent under 1,000; use [Sliding Window](/design-patterns/sliding-window) or [Batch Iterator](/design-patterns/batch-iterator) for larger workloads. +::: + +## Overview + +The Fan-Out pattern distributes a large record set across multiple independent child Workflows, each responsible for processing a fixed-size chunk. The parent Workflow assigns work by offset and length so that no record IDs need to be passed over the wire — only two integers per child. 
+ +## Problem + +A single Workflow run can have at most 2,000 in-flight Activities (aim for 500) and at most 50,000 history events. Processing millions of records in a single Workflow run is therefore not possible. + +You need a way to partition a large record set, process each partition independently, and coordinate the overall job while keeping each Workflow's history within safe bounds. + +## Solution + +You split the total record count into fixed-size chunks and start one child Workflow per chunk. Each child is given an `offset` and a `length` so it knows which slice of the record set to fetch and process independently. + +The parent Workflow starts all children concurrently and waits for them all to complete. If a child fails the parent can retry that child without re-processing the records handled by other children. + +```mermaid +flowchart TD + Records["📋 Total record set\n(N records)"] + Parent["Parent Workflow\n(fanOutWorkflow)"] + C1["Child Workflow\n(offset=0, length=chunk)"] + C2["Child Workflow\n(offset=chunk, length=chunk)"] + C3["Child Workflow\n(offset=2×chunk, length=chunk)"] + + Records --> Parent + Parent -->|"start child 1"| C1 + Parent -->|"start child 2"| C2 + Parent -->|"start child 3"| C3 + + C1 --> A1["processRecord ×chunk"] + C2 --> A2["processRecord ×chunk"] + C3 --> A3["processRecord ×chunk"] + + A1 -->|"done"| Parent + A2 -->|"done"| Parent + A3 -->|"done"| Parent +``` + +The following describes each step in the diagram: + +1. The parent Workflow receives the total record count and a configured chunk size. +2. It divides the total into chunks and starts one child Workflow per chunk, passing only `offset` and `length`. +3. Each child independently fetches its slice of records (using the offset and length) and calls `processRecord` for each one. +4. Each child completes and returns its result to the parent. +5. The parent blocks until all children have completed, then returns the aggregated result. 
+ +## Implementation + + +The following examples show how each SDK implements the Fan-Out pattern. + + + + +```typescript +// workflows.ts +import { + executeChild, + proxyActivities, + workflowInfo, +} from "@temporalio/workflow"; +import type * as activities from "./activities"; +import { TASK_QUEUE, CHUNK_SIZE } from "./shared"; + +const { processRecord } = proxyActivities({ + startToCloseTimeout: "10 seconds", +}); + +export async function fanOutWorkflow( + totalRecords: number, + chunkSize: number = CHUNK_SIZE +): Promise { + const children: Promise[] = []; + + for (let offset = 0; offset < totalRecords; offset += chunkSize) { + const length = Math.min(chunkSize, totalRecords - offset); + children.push( + executeChild(recordBatchWorkflow, { + args: [offset, length], + taskQueue: TASK_QUEUE, + workflowId: `${workflowInfo().workflowId}/batch-${offset}`, + }) + ); + } + + const results = await Promise.all(children); + return results.reduce((sum, n) => sum + n, 0); +} + +export async function recordBatchWorkflow( + offset: number, + length: number +): Promise { + let processed = 0; + for (let i = offset; i < offset + length; i++) { + await processRecord(i); + processed++; + } + return processed; +} +``` + + + + +```python +# workflows.py +from datetime import timedelta +from temporalio import workflow +from temporalio.workflow import ChildWorkflowHandle +import asyncio +from activities import process_record +from shared import TASK_QUEUE, CHUNK_SIZE + + +@workflow.defn +class RecordBatchWorkflow: + @workflow.run + async def run(self, offset: int, length: int) -> int: + processed = 0 + for i in range(offset, offset + length): + await workflow.execute_activity( + process_record, + i, + start_to_close_timeout=timedelta(seconds=10), + ) + processed += 1 + return processed + + +@workflow.defn +class FanOutWorkflow: + @workflow.run + async def run(self, total_records: int, chunk_size: int = CHUNK_SIZE) -> int: + handles: list[ChildWorkflowHandle] = [] + parent_id = 
workflow.info().workflow_id + + offset = 0 + while offset < total_records: + length = min(chunk_size, total_records - offset) + handle = await workflow.start_child_workflow( + RecordBatchWorkflow.run, + args=[offset, length], + id=f"{parent_id}/batch-{offset}", + task_queue=TASK_QUEUE, + ) + handles.append(handle) + offset += chunk_size + + results = await asyncio.gather(*handles) + return sum(results) +``` + + + + +```go +// workflows.go +package main + +import ( + "fmt" + "time" + + "go.temporal.io/sdk/workflow" +) + +func FanOutWorkflow(ctx workflow.Context, totalRecords int, chunkSize int) (int, error) { + if chunkSize <= 0 { + chunkSize = ChunkSize + } + + var futures []workflow.Future + parentID := workflow.GetInfo(ctx).WorkflowExecution.ID + + for offset := 0; offset < totalRecords; offset += chunkSize { + length := chunkSize + if offset+chunkSize > totalRecords { + length = totalRecords - offset + } + off := offset // capture loop variable + cwo := workflow.ChildWorkflowOptions{ + WorkflowID: parentID + "/batch-" + fmt.Sprintf("%d", off), + TaskQueue: TaskQueue, + } + cctx := workflow.WithChildOptions(ctx, cwo) + futures = append(futures, workflow.ExecuteChildWorkflow(cctx, RecordBatchWorkflow, off, length)) + } + + total := 0 + for _, f := range futures { + var n int + if err := f.Get(ctx, &n); err != nil { + return total, err + } + total += n + } + return total, nil +} + +func RecordBatchWorkflow(ctx workflow.Context, offset int, length int) (int, error) { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 10 * time.Second, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + processed := 0 + for i := offset; i < offset+length; i++ { + if err := workflow.ExecuteActivity(ctx, ProcessRecord, i).Get(ctx, nil); err != nil { + return processed, err + } + processed++ + } + return processed, nil +} +``` + + + + +```java +// FanOutWorkflow.java +import io.temporal.activity.ActivityOptions; +import io.temporal.workflow.*; +import java.time.Duration; +import 
java.util.ArrayList; +import java.util.List; + +@WorkflowInterface +public interface FanOutWorkflow { + @WorkflowMethod + int run(int totalRecords, int chunkSize); +} + +// FanOutWorkflowImpl.java +public class FanOutWorkflowImpl implements FanOutWorkflow { + @Override + public int run(int totalRecords, int chunkSize) { + if (chunkSize <= 0) chunkSize = Shared.CHUNK_SIZE; + + List> promises = new ArrayList<>(); + String parentId = Workflow.getInfo().getWorkflowId(); + + for (int offset = 0; offset < totalRecords; offset += chunkSize) { + int length = Math.min(chunkSize, totalRecords - offset); + ChildWorkflowOptions opts = ChildWorkflowOptions.newBuilder() + .setWorkflowId(parentId + "/batch-" + offset) + .setTaskQueue(Shared.TASK_QUEUE) + .build(); + RecordBatchWorkflow child = Workflow.newChildWorkflowStub(RecordBatchWorkflow.class, opts); + promises.add(Async.function(child::run, offset, length)); + } + + int total = 0; + for (Promise p : promises) { + total += p.get(); + } + return total; + } +} + +// RecordBatchWorkflow.java +@WorkflowInterface +public interface RecordBatchWorkflow { + @WorkflowMethod + int run(int offset, int length); +} + +// RecordBatchWorkflowImpl.java +public class RecordBatchWorkflowImpl implements RecordBatchWorkflow { + private final Activities activities = Workflow.newActivityStub( + Activities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(10)) + .build() + ); + + @Override + public int run(int offset, int length) { + int processed = 0; + for (int i = offset; i < offset + length; i++) { + activities.processRecord(i); + processed++; + } + return processed; + } +} +``` + + + + +## Best Practices + +- **Use offset and length, not explicit IDs.** Pass only two integers to each child rather than a full slice of IDs. The child fetches its own records. This keeps history events small. +- **Size chunks to stay under the Activity limit.** Each child Workflow can have at most 2,000 in-flight Activities. 
Aim for chunks of 500 records or fewer if each record maps to one Activity. +- **Cap concurrent children in the parent.** Starting thousands of child Workflows simultaneously puts pressure on the namespace. Consider batching child starts or using [Sliding Window](/design-patterns/sliding-window) if you need tighter concurrency control. +- **Set `PARENT_CLOSE_POLICY_ABANDON`** for fire-and-forget fan-outs where the parent does not need to collect results. With the default `TERMINATE` policy, cancelling or timing out the parent will terminate all in-flight children. +- **Give each child a deterministic Workflow ID** (`parentId/batch-<offset>`). This makes it safe to re-run the parent: Temporal deduplicates child starts by Workflow ID, so already-completed children are not re-executed. + +## Common Pitfalls + +- **Starting too many children at once.** Each child start adds to the parent's history. Keep total children per parent under 1,000 per [Temporal guidance](https://docs.temporal.io/workflows#when-to-use-child-workflows). If you need more children, switch to [MapReduce Tree](/design-patterns/mapreduce-tree) or [Sliding Window](/design-patterns/sliding-window). +- **Passing large lists of IDs.** Workflow inputs are stored in event history. Passing millions of record IDs as a list will blow the history size limit. Use offset + length instead. +- **Ignoring child failures.** A failed child does not automatically fail the parent unless you await all results. Always await child handles and handle errors explicitly.
+ +## Related Resources + +- [Child Workflows pattern](/design-patterns/child-workflows) — core concepts for parent/child Workflow coordination +- [Batch Iterator](/design-patterns/batch-iterator) — unbounded record sets with Continue-as-New pagination +- [Sliding Window](/design-patterns/sliding-window) — bounded concurrency with maximum throughput +- [Temporal limits reference](https://docs.temporal.io/cloud/limits) diff --git a/docs/design-patterns/fast-slow-retries.mdx b/docs/design-patterns/fast-slow-retries.mdx new file mode 100644 index 0000000000..d1bdf43be7 --- /dev/null +++ b/docs/design-patterns/fast-slow-retries.mdx @@ -0,0 +1,310 @@ +--- +id: fast-slow-retries +title: "Fast/Slow Retries" +sidebar_label: "Fast/Slow Retries" +description: "Try aggressively with a short interval first, then shift to a long interval when fast retries are exhausted." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +:::info[TLDR] +Orchestrate two retry phases in the Workflow: a fast phase with short intervals and bounded attempts for transient errors, followed by a slow phase with long intervals and unlimited retries for extended outages. **Use this when a single `RetryPolicy` should not cover both brief blips and hour-long outages or maintenance windows.** +::: + +## Overview + +The Fast/Slow Retries pattern runs an Activity through two distinct retry phases: a fast phase with a short interval and bounded attempt count, followed by an unlimited slow phase with a long fixed interval managed by the Temporal Service. +Use it when transient errors are common and worth recovering from quickly, but the downstream system may also suffer extended outages that require patient, indefinite waiting. + +## Problem + +Conventional retry policies force you to choose between: + +- **Low `MaximumAttempts`**: Recovers from transient errors quickly but abandons the request when the downstream system has a longer outage. 
+- **Unlimited or high `MaximumAttempts` with short interval**: Floods a degraded downstream system with retries and accumulates noisy failures in logs. Incurs processing cost with each attempt. +- **Long fixed interval with unlimited retries**: Recovers from outages eventually, but is too slow to recover from transient errors that would have resolved in seconds. + +None of these options handles both scenarios well — a downstream system that sometimes has brief 503 errors *and* occasionally goes down for an extended maintenance window. + +## Solution + +Use the Workflow itself as a retry orchestrator across two phases: + +**Phase 1 — Fast retries**: Execute the Activity with a short `InitialInterval` and a bounded `MaximumAttempts`. This phase recovers from transient errors within seconds or minutes. + +**Phase 2 — Slow retries**: When the fast retry policy is exhausted, catch the `ActivityError` in the Workflow and execute the Activity again with a long `InitialInterval` and unlimited `MaximumAttempts`. The Temporal Service owns the slow retry management; the Workflow blocks until the Activity eventually succeeds. + +This design is impractical in conventional retry libraries because it requires the retry orchestrator to be a durable, resumable process — exactly what a Temporal Workflow is. + +```mermaid +flowchart TD + Start([Workflow starts]) --> Phase1 + + subgraph Phase1 [Phase 1 — Fast Retries] + F1[Execute Activity\ninitialInterval=1s\nmaxAttempts=10] -->|Success| Done + F1 -->|Failure| FCheck{Attempts\nexhausted?} + FCheck -->|No| F1 + FCheck -->|Yes| Log[Log: switching to slow phase] + end + + Log --> Phase2 + + subgraph Phase2 [Phase 2 — Slow Retries, Unlimited] + S1[Execute Activity\ninitialInterval=5m\nunlimited attempts] -->|Success| Done + S1 -->|Failure| SWait[Temporal waits 5m\nthen retries] + SWait --> S1 + end + + Done([Return result]) +``` + +The following describes each step: + +1. 
The Workflow first tries the Activity with a fast policy: 1-second initial interval and a maximum of 10 total attempts. +2. If the Activity succeeds during the fast phase, the Workflow returns the result immediately. +3. If all fast attempts are exhausted, the Workflow logs a warning and transitions to the slow phase. +4. In the slow phase, the Workflow executes the Activity with a 5-minute fixed interval and unlimited retries. The Temporal Service manages the wait between attempts. +5. When the Activity eventually succeeds — after the downstream system recovers — the Workflow returns the result. + +## Implementation + + +### Two-phase workflow retry management + +The key change between phases is the retry interval and attempt count. +In Phase 1, the Temporal Service manages a fast set of retries: short interval, bounded attempts. +In Phase 2, the Temporal Service manages a slow set of retries: long fixed interval, unlimited attempts. + + + + +```python +# workflows.py +from datetime import timedelta +from temporalio import workflow +from temporalio.common import RetryPolicy +from temporalio.exceptions import ActivityError +import activities + +@workflow.defn +class FastSlowRetryWorkflow: + @workflow.run + async def run(self, request: str) -> str: + # Phase 1: fast retries + fast_policy = RetryPolicy( + initial_interval=timedelta(seconds=1), + backoff_coefficient=1.5, + maximum_interval=timedelta(seconds=30), + maximum_attempts=10, + ) + try: + return await workflow.execute_activity( + activities.call_downstream, + request, + start_to_close_timeout=timedelta(seconds=30), + retry_policy=fast_policy, + ) + except ActivityError: + workflow.logger.warning( + "Fast retries exhausted — switching to slow retry phase", + extra={"request": request}, + ) + + # Phase 2: slow retries + slow_policy = RetryPolicy( + initial_interval=timedelta(minutes=5), + backoff_coefficient=1.0, + ) + return await workflow.execute_activity( + activities.call_downstream, + request, + 
start_to_close_timeout=timedelta(seconds=30), + retry_policy=slow_policy, + ) +``` + + + + +```go +// workflow.go +package downstream + +import ( + "time" + + "go.temporal.io/sdk/temporal" + "go.temporal.io/sdk/workflow" +) + +func FastSlowRetryWorkflow(ctx workflow.Context, request string) (string, error) { + log := workflow.GetLogger(ctx) + + // Phase 1: fast retries + fastCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Second, + RetryPolicy: &temporal.RetryPolicy{ + InitialInterval: time.Second, + BackoffCoefficient: 1.5, + MaximumInterval: 30 * time.Second, + MaximumAttempts: 10, + }, + }) + + var result string + err := workflow.ExecuteActivity(fastCtx, CallDownstream, request).Get(fastCtx, &result) + if err != nil { + log.Warn("Fast retries exhausted — switching to slow retry phase", + "request", request) + + // Phase 2: slow retries + slowCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Second, + RetryPolicy: &temporal.RetryPolicy{ + InitialInterval: 5 * time.Minute, + BackoffCoefficient: 1.0, + // MaximumAttempts defaults to 0 (unlimited) + }, + }) + + err = workflow.ExecuteActivity(slowCtx, CallDownstream, request).Get(slowCtx, &result) + } + return result, err +} +``` + + + + +```java +// FastSlowRetryWorkflowImpl.java +import io.temporal.activity.ActivityOptions; +import io.temporal.common.RetryOptions; +import io.temporal.failure.ActivityFailure; +import io.temporal.workflow.Workflow; +import java.time.Duration; + +public class FastSlowRetryWorkflowImpl implements FastSlowRetryWorkflow { + @Override + public String run(String request) { + // Phase 1: fast retries + DownstreamActivities fastActivities = Workflow.newActivityStub( + DownstreamActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(30)) + .setRetryOptions(RetryOptions.newBuilder() + .setInitialInterval(Duration.ofSeconds(1)) + 
.setBackoffCoefficient(1.5) + .setMaximumInterval(Duration.ofSeconds(30)) + .setMaximumAttempts(10) + .build()) + .build() + ); + + try { + return fastActivities.callDownstream(request); + } catch (ActivityFailure e) { + Workflow.getLogger(getClass()).warn( + "Fast retries exhausted — switching to slow retry phase: " + request + ); + + // Phase 2: slow retries + DownstreamActivities slowActivities = Workflow.newActivityStub( + DownstreamActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(30)) + .setRetryOptions(RetryOptions.newBuilder() + .setInitialInterval(Duration.ofMinutes(5)) + .setBackoffCoefficient(1.0) + // setMaximumAttempts not set — defaults to unlimited + .build()) + .build() + ); + + return slowActivities.callDownstream(request); + } + } +} +``` + + + + +```typescript +// workflows.ts +import * as wf from '@temporalio/workflow'; +import type * as activities from './activities'; + +const fastDownstream = wf.proxyActivities<typeof activities>({ + startToCloseTimeout: '30s', + retry: { + initialInterval: '1s', + backoffCoefficient: 1.5, + maximumInterval: '30s', + maximumAttempts: 10, + }, +}); + +const slowDownstream = wf.proxyActivities<typeof activities>({ + startToCloseTimeout: '30s', + retry: { + initialInterval: '5m', + backoffCoefficient: 1, + // maximumAttempts defaults to unlimited + }, +}); + +export async function fastSlowRetryWorkflow(request: string): Promise<string> { + // Phase 1: fast retries + try { + return await fastDownstream.callDownstream(request); + } catch { + wf.log.warn('Fast retries exhausted — switching to slow retry phase', { request }); + + // Phase 2: slow retries + return await slowDownstream.callDownstream(request); + } +} +``` + + + + +## Tuning the phases + +Adjust the phase parameters to match the characteristics of your downstream system: + +| Parameter | Typical value | Rationale | +| :--- | :--- | :--- | +| Phase 1 `InitialInterval` | 1–5 seconds | Recover from transient errors within seconds | +| Phase 1
`BackoffCoefficient` | 1.5–2.0 | Spread retries to avoid overwhelming a briefly degraded system | +| Phase 1 `MaximumAttempts` | 5–20 | Enough attempts to cover a short transient period | +| Phase 2 `InitialInterval` | 1–15 minutes | Long enough to avoid hammering a down system; short enough to recover promptly | +| Phase 2 `BackoffCoefficient` | 1.0 | Keeps the interval fixed; the default 2.0 would exponentially increase delays between slow-phase attempts | + +Phase 2 runs indefinitely by default. +If the business process has a maximum wait time, add a `ScheduleToCloseTimeout` or use a Workflow execution timeout to impose an outer bound. + +## Best practices + +- **Log the phase transition.** The transition from fast to slow is a meaningful signal that the downstream system may have a sustained problem. Log it with enough context — request identifier, attempt count, timestamp — to aid diagnosis. +- **Leave `MaximumAttempts` unset in Phase 2.** Omitting `MaximumAttempts` (or setting it to 0) gives the slow phase unlimited retries. The Temporal Service manages the wait between attempts via `InitialInterval`; the Workflow simply blocks until the Activity eventually succeeds. +- **Combine with Retry Alerting via Metrics.** Add a metric counter inside the Activity to surface slow-phase attempts to on-call teams. See [Retry Alerting via Metrics](/design-patterns/retry-metrics). + +## Common pitfalls + +- **Catching too broadly in Phase 1.** Catch `ActivityError` specifically. Catching all exceptions in Phase 1 may swallow errors that should propagate immediately (such as `CancelledError` in Python or a `PanicError` in Go). +- **Setting `MaximumAttempts` in Phase 2.** If you set a finite `MaximumAttempts` in the slow phase, it will eventually exhaust and propagate a failure to the Workflow. Only add a limit if the business process has a defined maximum wait time; in that case, pair it with a `ScheduleToCloseTimeout` to make the budget explicit. 
+- **Using exponential backoff in Phase 2.** The default `BackoffCoefficient` is 2.0, which doubles the interval with each attempt. Set `BackoffCoefficient=1.0` in the slow phase to keep the interval fixed and predictable. + +## Related patterns + +- [Retry Alerting via Metrics](/design-patterns/retry-metrics): Emit a metric in the slow-phase Activity to surface sustained failures to on-call teams. +- [Delayed Retry](/design-patterns/delayed-retry): Override the retry interval per error type using `nextRetryDelay` on `ApplicationFailure`. +- [Error Handling & Retry Patterns](/design-patterns/error-handling-patterns): Overview and decision tree for all retry patterns. + +## References + +- [Understanding Workflow Retries and Failures](https://community.temporal.io/t/understanding-workflow-retries-and-failures/122) +- [Failure Handling in Practice](https://temporal.io/blog/failure-handling-in-practice) diff --git a/docs/design-patterns/fixed-count-retries.mdx b/docs/design-patterns/fixed-count-retries.mdx new file mode 100644 index 0000000000..863a292ce6 --- /dev/null +++ b/docs/design-patterns/fixed-count-retries.mdx @@ -0,0 +1,304 @@ +--- +id: fixed-count-retries +title: "Fixed Count of Retries" +sidebar_label: "Fixed Count of Retries" +description: "Cap the number of Activity retry attempts to control cost when each attempt consumes a paid or limited resource." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +:::info[TLDR] +Set `MaximumAttempts` on the `RetryPolicy` to **cap how many times Temporal will attempt an Activity**. Use this when each attempt consumes a paid API call, a rate-limited token, or any scarce resource where unbounded retries translate directly to unbounded cost. +::: + +## Overview + +The Fixed Count of Retries pattern caps the total number of Activity execution attempts by setting `MaximumAttempts` on the `RetryPolicy`. 
+Use it when each attempt consumes a paid API call, a rate-limited token, or any scarce resource where unbounded retries translate directly to unbounded cost. + +## Problem + +Temporal's default retry policy retries Activities indefinitely with exponential backoff. +This is appropriate for most infrastructure failures, but it creates problems when the Activity calls a paid third-party API: + +- A credit card authorization that fails due to a transient network error will be retried dozens of times, each attempt charging a per-call fee. +- A generative AI API with a per-token pricing model will accumulate costs silently while the Workflow waits. +- A rate-limited partner API will exhaust its quota across all callers if one Workflow retries without bound. + +Without a cap, a single stuck Workflow can generate costs that are orders of magnitude larger than the intended spend. + +## Solution + +Set `MaximumAttempts` on the `RetryPolicy` passed to the Activity call. +Temporal counts the initial attempt and each retry toward the limit. +When the limit is reached, Temporal stops retrying and delivers an `ActivityError` to the Workflow. +The Workflow can catch that error and decide whether to fail, alert, or escalate. + +```mermaid +sequenceDiagram + participant Workflow + participant Temporal as Temporal Service + participant API as Payment API + + Workflow->>Temporal: Schedule activity (MaximumAttempts=3) + Temporal->>+API: Attempt 1 + API-->>-Temporal: Failure + Note over Temporal: Retry 1 of 2 + Temporal->>+API: Attempt 2 + API-->>-Temporal: Failure + Note over Temporal: Retry 2 of 2 + Temporal->>+API: Attempt 3 + API-->>-Temporal: Failure + Note over Temporal: MaximumAttempts reached — no more retries + Temporal-->>Workflow: ActivityError + Workflow->>Workflow: Handle failure (alert, compensate, or escalate) +``` + +The following describes each step: + +1. The Workflow schedules the Activity with a `RetryPolicy` that caps attempts at 3. +2. 
The Temporal Service executes the Activity. On failure, it schedules a retry. +3. After 3 total attempts (1 initial + 2 retries), Temporal delivers an `ActivityError` to the Workflow. +4. The Workflow catches the error and handles it — logging, compensating, or escalating — rather than accumulating further cost. + +## Implementation + + +### Capping attempts + +Set `maximum_attempts` (Python), `MaximumAttempts` (Go / Java), or `maximumAttempts` (TypeScript) on the retry policy. +The count includes the initial attempt, so `maximum_attempts=3` means one attempt plus two retries. + + + + +```python +# workflows.py +from datetime import timedelta +from temporalio import workflow +from temporalio.common import RetryPolicy +from temporalio.exceptions import ActivityError, RetryState +import activities + +@workflow.defn +class PaymentWorkflow: + @workflow.run + async def run(self, order_id: str) -> str: + try: + return await workflow.execute_activity( + activities.charge_payment_api, + order_id, + start_to_close_timeout=timedelta(seconds=10), + retry_policy=RetryPolicy(maximum_attempts=3), + ) + except ActivityError as e: + if e.retry_state == RetryState.MAXIMUM_ATTEMPTS_REACHED: + # All retries exhausted — handle the failure here. + # Options: alert on-call, trigger a compensation activity, or escalate to a human. 
+ workflow.logger.error( + "Payment failed: all 3 attempts exhausted", + extra={"order_id": order_id}, + ) + raise +``` + + + + +```go +// workflow.go +package payments + +import ( + "errors" + "time" + + enumspb "go.temporal.io/api/enums/v1" + "go.temporal.io/sdk/temporal" + "go.temporal.io/sdk/workflow" +) + +func PaymentWorkflow(ctx workflow.Context, orderID string) (string, error) { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 10 * time.Second, + RetryPolicy: &temporal.RetryPolicy{ + MaximumAttempts: 3, + }, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + var result string + err := workflow.ExecuteActivity(ctx, ChargePaymentAPI, orderID).Get(ctx, &result) + if err != nil { + var actErr *temporal.ActivityError + if errors.As(err, &actErr) && actErr.RetryState() == enumspb.RETRY_STATE_MAXIMUM_ATTEMPTS_REACHED { + // All retries exhausted — handle the failure here. + // Options: alert on-call, trigger a compensation activity, or escalate to a human. + workflow.GetLogger(ctx).Error("Payment failed: all 3 attempts exhausted", + "orderID", orderID) + } + return "", err + } + return result, nil +} +``` + + + + +```java +// PaymentWorkflowImpl.java +import io.temporal.activity.ActivityOptions; +import io.temporal.api.enums.v1.RetryState; +import io.temporal.common.RetryOptions; +import io.temporal.failure.ActivityFailure; +import io.temporal.workflow.Workflow; +import java.time.Duration; + +public class PaymentWorkflowImpl implements PaymentWorkflow { + private final PaymentActivities activities = Workflow.newActivityStub( + PaymentActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(10)) + .setRetryOptions(RetryOptions.newBuilder() + .setMaximumAttempts(3) + .build()) + .build() + ); + + @Override + public String run(String orderId) { + try { + return activities.chargePaymentApi(orderId); + } catch (ActivityFailure e) { + if (e.getRetryState() == RetryState.RETRY_STATE_MAXIMUM_ATTEMPTS_REACHED) { + // All 
retries exhausted — handle the failure here. + // Options: alert on-call, trigger a compensation activity, or escalate to a human. + Workflow.getLogger(getClass()).error( + "Payment failed: all 3 attempts exhausted: " + orderId, e); + } + throw e; + } + } +} +``` + + + + +```typescript +// workflows.ts +import * as wf from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { chargePaymentApi } = wf.proxyActivities<typeof activities>({ + startToCloseTimeout: '10s', + retry: { maximumAttempts: 3 }, +}); + +export async function paymentWorkflow(orderId: string): Promise<string> { + try { + return await chargePaymentApi(orderId); + } catch (err) { + if (err instanceof wf.ActivityFailure && err.retryState === wf.RetryState.MAXIMUM_ATTEMPTS_REACHED) { + // All retries exhausted — handle the failure here. + // Options: alert on-call, trigger a compensation activity, or escalate to a human. + wf.log.error('Payment failed: all 3 attempts exhausted', { orderId }); + } + throw err; + } +} +``` + + + + +### Disabling retries entirely + +Set `maximum_attempts=1` to disable retries. +The Activity starts once and any failure is immediately delivered to the Workflow. +This is appropriate when the operation is not idempotent and a second attempt would cause a duplicate side effect such as a double charge or a duplicate email.
+ + + + +```python +# workflows.py +result = await workflow.execute_activity( + activities.send_welcome_email, + user_id, + start_to_close_timeout=timedelta(seconds=10), + retry_policy=RetryPolicy(maximum_attempts=1), +) +``` + + + + +```go +// workflow.go +ao := workflow.ActivityOptions{ + StartToCloseTimeout: 10 * time.Second, + RetryPolicy: &temporal.RetryPolicy{ + MaximumAttempts: 1, + }, +} +``` + + + + +```java +// Workflow.java +ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(10)) + .setRetryOptions(RetryOptions.newBuilder() + .setMaximumAttempts(1) + .build()) + .build() +``` + + + + +```typescript +// workflows.ts +const { sendWelcomeEmail } = wf.proxyActivities({ + startToCloseTimeout: '10s', + retry: { maximumAttempts: 1 }, +}); +``` + + + + +If a Worker crashes after the API call succeeds but before the result is recorded, Temporal will not retry — the call is lost. +An idempotency key (a stable identifier derived from the Workflow and Activity IDs) lets the downstream system detect and discard duplicates if a retry is needed in future. If you have idempotency keys, there's little need to cap retries at 1. + +## Best practices + +- **Match the cap to the cost model.** If the API charges per call, set `maximum_attempts` to the maximum number of calls you are willing to pay for per Workflow execution. +- **Combine with `StartToCloseTimeout`.** A per-attempt timeout prevents a slow response from consuming the entire retry budget on a single hanging call. +- **Catch `ActivityError` in the Workflow.** Handle the exhausted-retries case explicitly — log, alert, compensate, or escalate — rather than letting it fail the Workflow silently. +- **Use idempotency keys.** When retrying, it's vital to have downstream systems detect and discard duplicate calls to avoid duplicate downstream effects. 
+- **Prefer non-retryable errors for structural failures.** If the failure is not transient (for example, invalid input), mark it as non-retryable rather than relying solely on `maximum_attempts`. + +## Common pitfalls + +- **Confusing `MaximumAttempts` with allowed retry count.** `MaximumAttempts=3` means 3 total attempts (1 initial + 2 retries), not 3 retries after the initial attempt. +- **Setting no timeout alongside a low attempt cap.** Without `StartToCloseTimeout`, a single hanging attempt can block all retries for minutes or hours. +- **Ignoring the `ActivityError` in the Workflow.** Exhausted retries raise an error in the Workflow. If you do not catch it, the Workflow fails without any compensation or alerting. +- **Disabling retries on operations without safeguards.** `maximum_attempts=1` on a call means any failure — including a Worker crash after the API responded — results in a permanent gap. + +## Related patterns + +- [Non-Retryable Errors](/design-patterns/non-retryable-errors): Fail immediately for errors that will never succeed regardless of how many times you try. +- [Fixed Wall-Time Retries](/design-patterns/fixed-wall-time-retries): Bound by total elapsed time rather than attempt count. +- [Idempotent Distributed Transactions](/design-patterns/idempotent-distributed-transactions): Design Activities to be safe to retry without duplicate side effects. +- [Error Handling & Retry Patterns](/design-patterns/error-handling-patterns): Overview and decision tree for all retry patterns. 
+ +## References + +- [Temporal Retry Policies](https://docs.temporal.io/encyclopedia/retry-policies) +- [Idempotency and Durable Execution](https://temporal.io/blog/idempotency-and-durable-execution) diff --git a/docs/design-patterns/fixed-wall-time-retries.mdx b/docs/design-patterns/fixed-wall-time-retries.mdx new file mode 100644 index 0000000000..ce9a87391d --- /dev/null +++ b/docs/design-patterns/fixed-wall-time-retries.mdx @@ -0,0 +1,322 @@ +--- +id: fixed-wall-time-retries +title: "Fixed Wall-Time Retries" +sidebar_label: "Fixed Wall-Time Retries" +description: "Bound the total elapsed time across all retry attempts to enforce a business SLA, regardless of how many individual attempts occur." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +:::info[TLDR] +Set `ScheduleToCloseTimeout` on the Activity call to enforce a hard time budget across all retry attempts. Use this when a business SLA requires the Activity to **succeed or fail within a defined window**, regardless of how many individual attempts occur. +::: + +## Overview + +The Fixed Wall-Time Retries pattern enforces a maximum total elapsed time across all Activity retry attempts using `ScheduleToCloseTimeout`. +Use it when a business process must _succeed or fail_ within a defined time budget, regardless of how many individual attempts occur. + +## Problem + +`StartToCloseTimeout` limits how long a single Activity attempt may run before Temporal cancels it and schedules a retry. +It does not limit how long retries collectively may run. + +A process with `StartToCloseTimeout=5m` and the default unlimited retry policy can run for days — each attempt times out at 5 minutes, then Temporal waits for the backoff delay and tries again, indefinitely. 
+ +When a business SLA exists and violating it counts as a failure — for example, a payment must be charged within two minutes, or an authorization check must complete within 30 seconds — you need a hard outer boundary that Temporal enforces automatically, without requiring the Workflow to track elapsed time itself. + +## Solution + +Set `ScheduleToCloseTimeout` on the Activity call options. +The timer starts when the Activity is first scheduled and keeps running across every attempt and backoff delay, regardless of how many attempts have occurred. +If the timeout expires during an attempt, that attempt is cancelled. +If it expires between retries, the pending retry is abandoned. In either case, Temporal delivers an `ActivityError` to the Workflow. + +```mermaid +sequenceDiagram + participant Workflow + participant Temporal as Temporal Service + participant Service as Downstream Service + + Note over Temporal: ScheduleToCloseTimeout = 2m starts + Workflow->>Temporal: Schedule activity + Temporal->>+Service: Attempt 1 (StartToClose = 30s) + Service-->>-Temporal: Failure + Note over Temporal: Backoff delay (5s) + Temporal->>+Service: Attempt 2 (StartToClose = 30s) + Service-->>-Temporal: Failure + Note over Temporal: Backoff delay (5s), ...retries continue + Note over Temporal: 2m elapsed — ScheduleToClose exceeded + Temporal-->>Workflow: ActivityError (schedule-to-close timeout) + Workflow->>Workflow: Handle SLA breach +``` + +The following describes each step: + +1. The two-minute budget clock starts the moment the Workflow schedules the Activity. +2. Each attempt runs up to 30 seconds (`StartToCloseTimeout`). On failure, Temporal waits the backoff delay and retries. +3. Retries continue until either the Activity succeeds or the two-minute budget is exhausted. +4. When the budget expires, Temporal delivers an `ActivityError` to the Workflow, which can log, alert, or compensate.
+ +## Implementation + + +### Enforcing a 2-minute SLA + +Set both `schedule_to_close_timeout` (the total budget) and `start_to_close_timeout` (the per-attempt cap). +The retry policy controls the interval between attempts. +Temporal stops retrying automatically when the budget runs out. + + + + +```python +# workflows.py +from datetime import timedelta +from temporalio import workflow +from temporalio.common import RetryPolicy +from temporalio.exceptions import ActivityError, TimeoutError, TimeoutType +import activities + +@workflow.defn +class PaymentAuthWorkflow: + @workflow.run + async def run(self, transaction_id: str) -> str: + try: + return await workflow.execute_activity( + activities.authorize_transaction, + transaction_id, + schedule_to_close_timeout=timedelta(minutes=2), # total budget + start_to_close_timeout=timedelta(seconds=30), # per attempt + retry_policy=RetryPolicy( + initial_interval=timedelta(seconds=5), + backoff_coefficient=1.5, + maximum_interval=timedelta(seconds=30), + ), + ) + except ActivityError as e: + cause = e.__cause__ + if isinstance(cause, TimeoutError) and cause.type == TimeoutType.SCHEDULE_TO_CLOSE: + workflow.logger.error( + "Authorization failed — 2-minute SLA breached", + extra={"transaction_id": transaction_id}, + ) + raise +``` + + + + +```go +// workflow.go +package shipment + +import ( + "errors" + "time" + + enumspb "go.temporal.io/api/enums/v1" + "go.temporal.io/sdk/temporal" + "go.temporal.io/sdk/workflow" +) + +func PaymentAuthWorkflow(ctx workflow.Context, transactionID string) (string, error) { + ao := workflow.ActivityOptions{ + ScheduleToCloseTimeout: 2 * time.Minute, // total budget + StartToCloseTimeout: 30 * time.Second, // per attempt + RetryPolicy: &temporal.RetryPolicy{ + InitialInterval: 5 * time.Second, + BackoffCoefficient: 1.5, + MaximumInterval: 30 * time.Second, + }, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + var result string + err := workflow.ExecuteActivity(ctx, AuthorizeTransaction, 
transactionID).Get(ctx, &result) + if err != nil { + var timeoutErr *temporal.TimeoutError + if errors.As(err, &timeoutErr) && timeoutErr.TimeoutType() == enumspb.TIMEOUT_TYPE_SCHEDULE_TO_CLOSE { + workflow.GetLogger(ctx).Error( + "Authorization failed — 2-minute SLA breached", + "transactionID", transactionID, + ) + } + return "", err + } + return result, nil +} +``` + + + + +```java +// PaymentAuthWorkflowImpl.java +import io.temporal.activity.ActivityOptions; +import io.temporal.api.enums.v1.TimeoutType; +import io.temporal.common.RetryOptions; +import io.temporal.failure.ActivityFailure; +import io.temporal.failure.TimeoutFailure; +import io.temporal.workflow.Workflow; +import java.time.Duration; + +public class PaymentAuthWorkflowImpl implements PaymentAuthWorkflow { + private final PaymentActivities activities = Workflow.newActivityStub( + PaymentActivities.class, + ActivityOptions.newBuilder() + .setScheduleToCloseTimeout(Duration.ofMinutes(2)) // total budget + .setStartToCloseTimeout(Duration.ofSeconds(30)) // per attempt + .setRetryOptions(RetryOptions.newBuilder() + .setInitialInterval(Duration.ofSeconds(5)) + .setBackoffCoefficient(1.5) + .setMaximumInterval(Duration.ofSeconds(30)) + .build()) + .build() + ); + + @Override + public String run(String transactionId) { + try { + return activities.authorizeTransaction(transactionId); + } catch (ActivityFailure e) { + if (e.getCause() instanceof TimeoutFailure tf + && tf.getTimeoutType() == TimeoutType.TIMEOUT_TYPE_SCHEDULE_TO_CLOSE) { + Workflow.getLogger(getClass()).error( + "Authorization failed — 2-minute SLA breached: " + transactionId, e + ); + } + throw e; + } + } +} +``` + + + + +```typescript +// workflows.ts +import * as wf from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { authorizeTransaction } = wf.proxyActivities<typeof activities>({ + scheduleToCloseTimeout: '2m', // total budget + startToCloseTimeout: '30s', // per attempt + retry: { + initialInterval: '5s', +
backoffCoefficient: 1.5, + maximumInterval: '30s', + }, +}); + +export async function paymentAuthWorkflow(transactionId: string): Promise<string> { + try { + return await authorizeTransaction(transactionId); + } catch (err) { + if (err instanceof wf.ActivityFailure) { + const cause = err.cause; + if (cause instanceof wf.TimeoutFailure && cause.type === wf.TimeoutType.SCHEDULE_TO_CLOSE) { + wf.log.error('Authorization failed — 2-minute SLA breached', { transactionId }); + } + } + throw err; + } +} +``` + + + + +### Short SLA without a per-attempt timeout + +For tighter budgets — such as a 30-second authorization window — you may omit `StartToCloseTimeout` and let `ScheduleToCloseTimeout` act as the only bound. +Temporal requires at least one timeout to be set; `ScheduleToCloseTimeout` alone satisfies that requirement. + + + + +```python +# workflows.py +result = await workflow.execute_activity( + activities.authorize_transaction, + transaction_id, + schedule_to_close_timeout=timedelta(seconds=30), + retry_policy=RetryPolicy( + initial_interval=timedelta(seconds=3), + backoff_coefficient=1.5, + ), +) +``` + + + + +```go +// workflow.go +ao := workflow.ActivityOptions{ + ScheduleToCloseTimeout: 30 * time.Second, + RetryPolicy: &temporal.RetryPolicy{ + InitialInterval: 3 * time.Second, + BackoffCoefficient: 1.5, + }, +} +``` + + + + +```java +// Workflow.java +ActivityOptions.newBuilder() + .setScheduleToCloseTimeout(Duration.ofSeconds(30)) + .setRetryOptions(RetryOptions.newBuilder() + .setInitialInterval(Duration.ofSeconds(3)) + .setBackoffCoefficient(1.5) + .build()) + .build() +``` + + + + +```typescript +// workflows.ts +const { authorizeTransaction } = wf.proxyActivities<typeof activities>({ + scheduleToCloseTimeout: '30s', + retry: { + initialInterval: '3s', + backoffCoefficient: 1.5, + }, +}); +``` + + + + +## Best practices + +- **Set both timeouts for clarity.** Use `ScheduleToCloseTimeout` as the total SLA and `StartToCloseTimeout` as a per-attempt safety valve.
Omitting `StartToCloseTimeout` means a single slow response can consume the entire budget. +- **Cap `MaximumInterval` well below the SLA.** If `MaximumInterval` is 2 hours and the SLA is 24 hours, at most 12 retries fit in the window. Tune the interval so the backoff plateaus at a value that allows meaningful retries within the budget. +- **Handle `ActivityError` explicitly.** When the SLA expires, Temporal delivers an error to the Workflow. Catch it to send an alert, trigger a compensation, or record a breach in an audit log. +- **Distinguish SLA breaches from transient errors.** Inspect the error cause: an SLA breach surfaces as an `ActivityError` whose cause is a `TimeoutError` with `TimeoutType.SCHEDULE_TO_CLOSE` (Python), a `TimeoutFailure` with `TimeoutType.SCHEDULE_TO_CLOSE` (TypeScript), or a timeout of type `TIMEOUT_TYPE_SCHEDULE_TO_CLOSE` (Go/Java); any other cause is an application failure. This lets you log or alert specifically on SLA violations rather than treating all activity errors the same way. + +## Common pitfalls + +- **Not accounting for `ScheduleToStart` delay in the budget.** `ScheduleToCloseTimeout` begins when the Activity is first scheduled, which includes the time the task waits in the queue before a Worker picks it up. Under high load or insufficient Worker capacity, tasks can sit in the queue for seconds or minutes before the first attempt starts — consuming SLA budget before any work is done. Provision Workers with enough capacity for peak traffic, or use autoscaling, to keep `ScheduleToStart` latency negligible relative to the SLA window. +- **Using `StartToCloseTimeout` alone for SLA enforcement.** `StartToCloseTimeout` bounds only a single attempt. Every retry gets a fresh per-attempt clock, so with the default unlimited retry policy a slow or failing downstream system can keep the Activity retrying indefinitely. +- **Setting `ScheduleToCloseTimeout` shorter than `StartToCloseTimeout`.** If the total budget is shorter than a single attempt's maximum, the per-attempt timeout can never fire — the overall budget always expires first, so any attempt that needs longer than the budget is cancelled before it finishes and is never retried.
+- **Ignoring the breach in the Workflow.** Letting the `ActivityError` propagate without handling it means SLA breaches go unlogged and uncompensated. +- **Not accounting for backoff delays in the budget.** The total time includes both attempt durations and the backoff delays between them. A 1-hour budget with a 30-minute initial interval and coefficient 2.0 leaves room for only one or two attempts. + +## Related patterns + +- [Fixed Count of Retries](/design-patterns/fixed-count-retries): Bound by attempt count rather than elapsed time. +- [Delayed Retry](/design-patterns/delayed-retry): Fixed-interval retry when the downstream unavailability window is known. +- [Error Handling & Retry Patterns](/design-patterns/error-handling-patterns): Overview and decision tree for all retry patterns. + +## References + +- [Activity Timeouts](https://temporal.io/blog/activity-timeouts) +- [Temporal Retry Policies](https://docs.temporal.io/encyclopedia/retry-policies) diff --git a/docs/design-patterns/idempotent-distributed-transactions.mdx b/docs/design-patterns/idempotent-distributed-transactions.mdx new file mode 100644 index 0000000000..f231575e57 --- /dev/null +++ b/docs/design-patterns/idempotent-distributed-transactions.mdx @@ -0,0 +1,1404 @@ +--- +id: idempotent-distributed-transactions +title: "Idempotent Distributed Transactions" +sidebar_label: "Idempotent Distributed Transactions" +description: "Coordinates multi-step operations across external services with safe retries, automatic rollback on failure, and protection against duplicate submissions." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +The Idempotent Distributed Transactions pattern coordinates a multi-step operation across external services such that every step is safe to retry, every failure triggers an automatic rollback, and duplicate client submissions never produce duplicate side effects. 
+ +A payment workflow is used as the concrete example throughout because it makes the consequences of non-idempotent execution immediately tangible, but the pattern applies to any multi-step transaction that touches more than one external system. + +## Problem + +Any workflow that calls multiple external services in sequence faces three hard failure modes: + +1. **Double execution on retry.** Your service crashes after calling an external system but before recording the result. +On restart, you retry the call — but the remote system already processed the first request and now processes a second. +In a payment context, the payer is charged twice. +In an inventory context, stock is decremented twice. +In an email context, the user receives two copies of the same notification. + +2. **Lost compensation on failure.** Step A succeeds, step B times out, and the process crashes. +When it restarts, it has no record of the in-flight state. +Step A is never reversed. +The system is left in a partially committed state that requires manual repair. + +3. **Duplicate submissions from clients.** The client retries a timed-out HTTP request. +Two executions of the same logical transaction are now running concurrently. +Both reach the first mutating step simultaneously. + +Conventional mitigations — distributed locks, outbox tables, idempotency middleware, custom saga orchestrators — each address one failure mode but require significant infrastructure, careful coordination, and extensive testing to hold together under real conditions. + +## Solution + +Temporal replaces this custom infrastructure with three built-in guarantees: + +1. **Exactly-once Workflow execution per ID.** If you use the client-supplied request reference as the Workflow ID, Temporal rejects any duplicate `StartWorkflow` call while an execution with that ID is running. To also reject starts after that execution has closed, set the Workflow ID Reuse Policy to Reject Duplicate — the default policy allows the ID of a closed execution to be reused. With that policy in place, client retries are handled automatically at the platform level. + +2. 
**Durable activity state.** When a Worker crashes mid-execution, Temporal replays the Workflow from its event history on any available Worker. Activities that already completed are not re-executed — their results are replayed from history. Activities that were in-flight are retried from scratch, so each activity must be idempotent, but no activity runs more than once to completion. + +3. **Deterministic idempotency keys.** Because the Workflow replays deterministically, any value derived from `workflow.info().workflowId` and a per-step constant is identical on every replay attempt. You pass these keys to external systems so that retried activity calls are recognized as duplicates and skipped. + +The example used throughout this pattern is an **outward interbank payment** from a source bank to a government/industry-managed payment switch (such as NPCI/UPI, SEPA, NPP, FedNow/RTP or SWIFT). +The destination bank leg is out of scope — we focus only on what the source bank's system must do to send money out reliably. + +The workflow has four steps, each with a registered compensation: + +1. **Validate input** — check the request fields locally; no external calls +2. **Reserve and block** — generate a transaction ID, block the transfer amount in the customer's account, and notify the customer that a transfer is pending +3. **Submit to switch and wait for callback** — send the transfer instruction to the switch, then pause the Workflow and wait for the switch's asynchronous confirmation to arrive as a Signal; use check-and-skip on the submit activity to avoid double submission if the Worker crashes mid-flight +4. **Debit and settle** — once the switch confirmation Signal is received, move the blocked funds from the source account to the central escrow account, then notify the customer of success + +The switch confirmation is **asynchronous**: the switch does not respond inline to the HTTP call. 
+Instead, it processes the transfer and POSTs a callback to your service's webhook endpoint. +Your webhook handler uses a Temporal Client to send the `switchConfirmed` Signal, which delivers the confirmation into the waiting Workflow. + +The Workflow resumes from exactly where it paused — no polling, no database, no separate coordinator. + +The key insight is that **funds are only debited after the switch signals confirmation**. +Blocking the amount in step 2 prevents the customer from spending the money while the transfer is in-flight, but the actual debit does not happen until step 4. +If the switch rejects the transfer, signals a failure, or the SLA deadline expires with no signal received, the block is released and no debit ever occurs. + +### Transaction flow (happy path) + +```mermaid +sequenceDiagram + participant Customer + participant Workflow as Temporal Workflow + participant Ledger as Source Bank Ledger + participant Switch as Payment Switch + participant Webhook as Webhook Handler + + Customer->>Workflow: StartWorkflow(paymentId, amount, ...) + Note over Workflow: paymentId = Workflow ID
Temporal rejects duplicate starts + + Workflow->>Ledger: reserveAndBlock(key: paymentId:reserve) + Note over Ledger: Block amount, generate txID + Ledger-->>Workflow: ReservationRecord(txID) + Workflow->>Customer: Notification: transfer pending + + Workflow->>Switch: submitToSwitch(key: txID:submit)
Debit central escrow → send to destination via switch + Note over Workflow,Switch: Check-and-skip guard runs first.
Switch debits central escrow and routes to destination bank.
ScheduleToCloseTimeout = SLA deadline for HTTP submit. + Switch-->>Workflow: HTTP 202 Accepted (async — will callback) + + Note over Workflow: Workflow pauses here.
Waiting for switchConfirmed Signal. + + Switch->>Webhook: POST /callback (txID, status=SUCCESS) + Webhook->>Workflow: Signal switchConfirmed(txID) + + Note over Workflow: Signal received — Workflow resumes. + + Workflow->>Ledger: debitAndSettle(key: txID:debit) + Note over Ledger: Debit source account
Credit central escrow + Ledger-->>Workflow: settled + + Workflow->>Customer: Notification: transfer sent + Workflow-->>Customer: WorkflowResult(txID) +``` + +The following describes each step: + +1. The customer starts the payment. The `paymentId` is used as the Workflow ID — Temporal rejects any duplicate start for the same ID, handling customer retries automatically. +2. The Workflow validates all fields locally with no external calls. If validation fails, the Workflow ends with no side effects. +3. The `reserveAndBlock` activity generates a `txID`, blocks the transfer amount in the customer's account, and sends a "transfer pending" notification. The idempotency key `paymentId:reserve` makes this safe to retry. The compensation for this step unblocks the amount. +4. The `submitToSwitch` activity runs the check-and-skip guard (query switch before sending), then sends the transfer instruction. The instruction tells the switch to debit the source bank's central escrow account and route the funds to the destination bank. The switch responds with HTTP 202 — it has accepted the instruction but has not yet settled it. The `ScheduleToCloseTimeout` caps the total time spent on this activity across all retries. +5. The Workflow calls `workflow.wait_condition` (or `workflow.await`) on the `switchConfirmed` signal and pauses. No polling. No thread blocking. The Workflow is durably suspended — if the Worker restarts, Temporal replays it back to this exact waiting state. +6. The switch processes the transfer asynchronously and POSTs a callback to your webhook. The webhook handler calls `temporal_client.signal_workflow(workflow_id=payment_id, signal="switchConfirmed", ...)`. The Workflow resumes immediately. +7. The `debitAndSettle` activity debits the customer's source account and credits the bank's central escrow account (key `txID:debit`). 
This is the first and only time the customer's balance is reduced — and it happens only because the switch already confirmed that it debited the central escrow and delivered the funds to the destination bank. The debit step completes the internal settlement: escrow is now square. +8. The customer receives a "transfer sent" notification and the Workflow completes. + +### Compensation flow on mid-step failure + +There are two distinct failure scenarios after step 3 — and they require different compensation paths. + +**Scenario A — submit activity fails (Worker crash or switch error):** The submit activity returns an error. We do not know if the switch received the instruction, so we must query before cancelling. + +**Scenario B — SLA deadline exceeded waiting for the callback signal:** The submit activity succeeded (switch accepted the instruction), but the switch never sent the callback within the SLA window. The Workflow's signal wait times out. + +```mermaid +flowchart TD + Submit[Step 3a: submitToSwitch\nKey: txID:submit] -->|Activity error| QS + + Submit -->|HTTP 202 received| Wait[Step 3b: Wait for switchConfirmed Signal\nworkflow.wait with SLA timeout] + Wait -->|Signal received: SUCCESS| Debit[Step 4: debitAndSettle] + Wait -->|Signal received: FAILURE\nor SLA timeout — no signal| QS + + QS{Check-and-skip:\nQuery switch for txID:submit} + QS -->|Switch has a record| Cancel[Compensate: cancelWithSwitch\nKey: txID:comp-submit] + QS -->|No record on switch| Skip[Skip cancellation\nnothing to undo] + + Cancel --> Unblock[Compensate: unblockAndCancel\nKey: txID:comp-reserve\nRelease blocked amount] + Skip --> Unblock + + Debit --> Done([Success]) + Unblock --> Fail([End: Failed\nCustomer: transfer cancelled]) + classDef success stroke-width:1px + classDef wait stroke-width:1px + classDef compensation stroke-width:1px + classDef complete stroke-width:1px + classDef fail stroke-width:1px + class Submit,Debit success + class Wait,Skip wait + class Cancel,Unblock 
compensation + class Done complete + class Fail fail +``` + +The following describes both failure paths: + +**Path A — submit activity errors:** +1. The `submitToSwitch` activity fails (Worker crashed, network error, switch returned non-retryable error, or `ScheduleToCloseTimeout` expired across all retries). +2. The check-and-skip guard queries the switch for `txID:submit` to resolve the ambiguity: did the instruction reach the switch before the failure? +3. If the switch has a record, the transfer was accepted. The compensation sends a cancellation with key `txID:comp-submit`. +4. If the switch has no record, the instruction never arrived. Cancellation is skipped. +5. Either way, the source account block is released with key `txID:comp-reserve`. The customer is notified. + +**Path B — SLA timeout waiting for callback:** +1. The `submitToSwitch` activity returns successfully (switch responded HTTP 202). The Workflow begins waiting for the `switchConfirmed` signal with a deadline. +2. The deadline passes with no signal. This means the switch accepted the instruction but has not called back yet — it may still be processing, or the callback was lost. +3. The same check-and-skip guard runs. The switch is queried for `txID:submit`. +4. Since the switch accepted the original submission (it returned 202), a record will likely exist. The compensation sends a cancellation. +5. The source account block is released. The customer is notified that the transfer timed out. + +### Idempotency strategies for the submit activity + +The riskiest moment in the entire workflow is when the `submitToSwitch` activity sends the transfer instruction, the switch accepts it, but the Worker crashes before Temporal records the activity result. +On retry, the activity runs again. +Without a guard, the switch receives the same instruction twice and may process it twice. + +There are four strategies for handling this. 
They differ in what the switch must support and what infrastructure you must maintain. + +#### Option A — Query switch by idempotency key (check-and-skip) + +Before sending, query the switch for an existing record with `txID:submit`. +If a record exists, the previous attempt reached the switch — return immediately without sending again. + +``` +query switch for txID:submit + → found → return (skip send) + → not found → send instruction +``` + +| | | +|---|---| +| **Pros** | No extra infrastructure. Works even after a Worker crash mid-send. Compensation also uses the same guard to decide whether to cancel. | +| **Cons** | Switch must expose a query-by-key endpoint. Not all switches do. | +| **Best for** | Switches that support idempotency key lookup (most modern payment APIs). | + +This is the approach shown in the implementation code in the Implementation section below. + +--- + +#### Option B — Client-supplied transaction reference in the request body + +Many switches accept a `clientTransactionId` or `endToEndId` field in the request body and use it to de-duplicate on their side. +You derive this field deterministically from the Workflow ID in the Workflow layer — the same value on every retry. +The switch silently returns the same response for a duplicate reference without processing it twice. + +```python +# The clientTransactionId is derived in the Workflow (not in the activity) +# so it is identical on every replay attempt. +client_tx_id = f"{workflow_id}:submit" # e.g. "pay-abc123:submit" + +await switch_client.submit( + client_transaction_id=client_tx_id, + amount=params["amount"], + ... +) +``` + +| | | +|---|---| +| **Pros** | No query round-trip. No extra infrastructure on your side. The switch absorbs duplicates transparently. | +| **Cons** | Switch must honour the client-supplied field for de-duplication. Requires confirming behaviour with the switch operator. | +| **Best for** | Switches that accept a client reference field (UPI, SEPA, SWIFT, Stripe).
This is the most common case for government-managed interbank switches. | + +--- + +#### Option C — Write-ahead submission log in your own database + +Before calling the switch, the activity writes a record to your own database: + +``` +INSERT INTO payment_submissions (tx_id, status) +VALUES (:tx_id, 'PENDING') +ON CONFLICT (tx_id) DO NOTHING +``` + +On retry, the activity reads the record first: +- `status = SENT` → switch was called and succeeded. Skip. Return recorded result. +- `status = PENDING` → previous attempt crashed before or after calling the switch. The state is ambiguous — fall through to Option D to resolve. + +```mermaid +flowchart TD + A[Activity starts] --> B{Check DB:\ntx_id record exists?} + B -->|status=SENT| Skip([Return recorded result\ndo not call switch]) + B -->|status=PENDING| Ambiguous[Ambiguous: crash before or after send\nuse Option D to resolve] + B -->|No record| C[Write PENDING to DB] + C --> D[Call switch] + D --> E[Update DB: status=SENT] + E --> Done([Return result]) + classDef success stroke-width:1px + classDef wait stroke-width:1px + classDef complete stroke-width:1px + class Skip success + class Ambiguous wait + class Done complete +``` + +| | | +|---|---| +| **Pros** | Fully within your control. No switch query API needed. Cleanly handles the "before the call" crash case. | +| **Cons** | Does not resolve the ambiguous window between calling the switch and updating the DB. That window still requires Option A or D. Best used in combination. | +| **Best for** | Adding a first layer of protection on top of Option D. | + +--- + +#### Option D — Non-retryable submit + reconciliation activity + +Set `MaximumAttempts: 1` in the Retry Policy of the submit activity (shown as `MaxAttempts=1` in the diagram below). +If it fails for any reason (including a crash after the switch accepted), the activity fails immediately — no automatic retry. +The Workflow then executes a separate `reconcileSubmission` activity that reads from your own webhook callback log to determine what actually happened.
+ +```mermaid +flowchart TD + Submit[submitToSwitch\nMaxAttempts=1] -->|Success| Wait[Wait for switchConfirmed Signal] + Submit -->|Failure| Reconcile[reconcileSubmission\nPoll your own webhook log] + + Reconcile -->|status=SUCCESS\nin webhook log| Wait + Reconcile -->|status=FAILED\nin webhook log| Compensate[Run compensations] + Reconcile -->|status=UNKNOWN\nnot in log yet| Retry[Retry reconcile\nafter delay] + Retry --> Reconcile + classDef success stroke-width:1px + classDef wait stroke-width:1px + classDef compensation stroke-width:1px + classDef complete stroke-width:1px + class Submit success + class Reconcile wait + class Compensate compensation + class Wait complete +``` + +Your webhook handler stores every incoming switch callback in a `switch_callbacks` table keyed by `tx_id`. +The `reconcileSubmission` activity reads from that table. + + + + +```python +@activity.defn +async def reconcile_submission(params: dict) -> str: + """ + Read from the local webhook callback log to determine what the switch + decided. Called only when submitToSwitch fails. + Returns "SUCCESS", "FAILED", or "UNKNOWN". + """ + row = await db.query_one( + "SELECT status FROM switch_callbacks WHERE tx_id = :tx_id", + tx_id=params["tx_id"], + ) + if row is None: + return "UNKNOWN" # callback not received yet — caller should retry + return row["status"] # "SUCCESS" or "FAILED" +``` + + + + +```go +func ReconcileSubmission(ctx context.Context, params ReconcileParams) (string, error) { + // Read from the local webhook callback log. + // Called only when SubmitToSwitch fails. + // Returns "SUCCESS", "FAILED", or "UNKNOWN". 
+ row, err := db.QueryOne(ctx, + "SELECT status FROM switch_callbacks WHERE tx_id = $1", params.TxID) + if err != nil { + return "", err + } + if row == nil { + return "UNKNOWN", nil // callback not received yet — caller should retry + } + return row.Status, nil // "SUCCESS" or "FAILED" +} +``` + + + + +```java +@ActivityMethod +public String reconcileSubmission(ReconcileParams params) { + // Read from the local webhook callback log. + // Called only when submitToSwitch fails. + // Returns "SUCCESS", "FAILED", or "UNKNOWN". + Optional row = db.queryOne( + "SELECT status FROM switch_callbacks WHERE tx_id = ?", params.txId()); + if (row.isEmpty()) { + return "UNKNOWN"; // callback not received yet — caller should retry + } + return row.get().status(); // "SUCCESS" or "FAILED" +} +``` + + + + +```typescript +export async function reconcileSubmission(params: ReconcileParams): Promise { + // Read from the local webhook callback log. + // Called only when submitToSwitch fails. + // Returns 'SUCCESS', 'FAILED', or 'UNKNOWN'. + const row = await db.queryOne( + 'SELECT status FROM switch_callbacks WHERE tx_id = $1', [params.txId] + ); + if (!row) return 'UNKNOWN'; // callback not received yet — caller should retry + return row.status; // 'SUCCESS' or 'FAILED' +} +``` + + + + +| | | +|---|---| +| **Pros** | Most robust. Resolves every ambiguous crash scenario. Does not depend on the switch having a query API. Reconciliation reads your own data. | +| **Cons** | Requires a `switch_callbacks` table and a webhook handler that writes to it. More moving parts than Options A–C. | +| **Best for** | Switches with no query API and no client reference field. Also the right fallback when Options A or B are unavailable. 
| + +--- + +#### Comparison and recommendation + +| Strategy | Switch query API needed | Your DB needed | Handles crash-after-send | Complexity | +| :--- | :--- | :--- | :--- | :--- | +| A — Query switch by key | Yes | No | Yes | Low | +| B — Client reference in request body | No (switch de-duplicates on field) | No | Yes | Low | +| C — Write-ahead submission log | No | Yes (submissions table) | Partially — ambiguous window remains | Medium | +| D — Non-retryable + reconciliation | No | Yes (callbacks table) | Yes | Medium | + +**Recommended approach:** + +1. **If your switch accepts a `clientTransactionId` or equivalent field** → use Option B. It requires nothing extra and the switch handles de-duplication. Verify the switch API documentation confirms idempotent behaviour on that field. + +2. **If your switch exposes a query-by-key endpoint** → use Option A (check-and-skip). This is what the implementation code in this pattern demonstrates. + +3. **If your switch has neither** → use Option D (non-retryable + reconciliation). Your webhook handler already stores callbacks to drive the `switchConfirmed` Signal — extend the same table to also serve the reconciliation activity. Option C can be layered on top of D to make the `PENDING` case cheaper to resolve. + +In practice, most government-managed interbank switches (UPI, SEPA, SWIFT gpi) support either a client end-to-end reference (Option B) or a status query endpoint (Option A), so Options C and D are primarily fallback strategies. 
+ +### Before Temporal: the conventional approach + +```mermaid +flowchart TD + Client([Client]) --> LB[Load Balancer] + LB --> API[Transaction API Service] + API --> Lock[Distributed Lock\nRedis / ZooKeeper] + API --> Outbox[Outbox Table\nPostgres] + Outbox --> Worker[Outbox Worker\nPolling job] + Worker --> ExtSys[External System] + Worker --> Store[Data Store] + API --> SagaDB[Saga State Table\nPostgres] + SagaDB --> SagaWorker[Saga Orchestrator\nCustom service] + SagaWorker --> Store + SagaWorker --> ExtSys + classDef compensation stroke-width:1px + class Lock,Outbox,SagaDB,SagaWorker,Worker compensation +``` + +The conventional approach requires you to build and maintain: + +- A distributed lock to prevent concurrent duplicate executions +- An outbox table and polling worker to guarantee at-least-once delivery to external systems +- A saga state table to persist compensation state across process restarts +- A saga orchestrator service to resume and compensate interrupted transactions +- Idempotency middleware to de-duplicate retried HTTP calls + +Each component is an independent failure point requiring its own monitoring, schema migrations, retry logic, and operational runbook. +The surface area for bugs is large, and the bugs tend to be rare, hard to reproduce, and high-impact. + +### After Temporal: durable execution as infrastructure + +```mermaid +flowchart TD + Client([Client]) --> Temporal[Temporal Workflow\nrequestId = Workflow ID] + Temporal --> StoreA[Source Data Store] + Temporal --> Escrow[Intermediate Holding State] + Temporal --> ExtSys[External System] + Temporal --> StoreB[Destination Data Store] + Temporal --> Notify[Notification Service] + classDef complete stroke-width:1px + class Temporal complete +``` + +Temporal's durable execution engine replaces the lock, the outbox, the saga state table, and the orchestrator with a single programming model. +The Workflow function is the authoritative record of execution state. 
+When a Worker crashes, Temporal replays the Workflow from its event history on any available Worker, skipping completed activities and retrying in-flight ones. + +### Idempotency key derivation + +You derive idempotency keys from the Workflow ID and a per-step suffix inside the Workflow function, not inside Activities. +The key is stable across all replay attempts because it is derived from deterministic inputs. + +``` +idempotency_key = workflow_id + ":" + step_name +``` + +Example keys for a Workflow ID `pay-abc123`: + +| Step | Key | Used by | +| :--- | :--- | :--- | +| Reserve and block | `pay-abc123:reserve` | Source bank ledger | +| Submit to switch | `pay-abc123:submit` | Payment switch | +| Debit and settle | `pay-abc123:debit` | Source bank ledger | +| Unblock (comp. for reserve) | `pay-abc123:comp-reserve` | Source bank ledger | +| Cancel with switch (comp. for submit) | `pay-abc123:comp-submit` | Payment switch | + +Each system uses this key to de-duplicate requests — if the same key arrives twice, the system returns the result of the first call instead of executing again. +Forward and compensation steps must use distinct keys so the switch does not reject a cancellation as a duplicate of the original submission. + +In the payment example, only the reserve step is keyed on the Workflow ID directly (`paymentId:reserve`). +Later steps are keyed on the `txID` returned by the reserve Activity (for example, `txID:submit`). +Because that Activity result is recorded in the Workflow's event history, the `txID` is just as stable across replays as the Workflow ID. + +### Intermediate holding state + +The intermediate holding state is any platform-controlled resource that sits between the source mutation and the destination mutation. +In the payment example it is a ledger escrow account. +In other domains it could be a reservation record, a staging queue, or a provisional inventory allocation. + +The invariant is: at any point in time, the total across source + holding + destination equals the original total. +No resources are created or destroyed by a partial failure — they are either fully committed or fully reversed. + +### SLA enforcement on external calls + +External systems have contractual SLAs (for example, respond within 30 seconds).
+You enforce the SLA by setting `ScheduleToCloseTimeout` on the delivery activity. +This timeout covers all retries of the activity, not just a single attempt. +If the external system does not confirm within the SLA window, the activity fails deterministically and compensation begins. + +## Implementation + +### Define payment types and idempotency key helper + +Define the payment request type and the key derivation helper in the Workflow layer. +The helper must live here — not inside activities — so that every key computed during a replay is identical to the key computed during the original execution. + + + + +```python +# payment_types.py +from dataclasses import dataclass + +@dataclass +class PaymentRequest: + payment_id: str # Client-supplied reference — used as the Workflow ID + source_account: str + destination_account: str + amount: int # In minor units (e.g. cents) + currency: str + +@dataclass +class ReservationRecord: + tx_id: str # Unique transaction ID generated during reserve step + +def step_key(tx_id: str, step: str) -> str: + """Derive a stable idempotency key for a workflow step. + + Keys are derived here (in Workflow code) not inside activities, so + the same key is produced on every replay attempt. + """ + return f"{tx_id}:{step}" +``` + + + + +```go +// payment_types.go +package payment + +import "fmt" + +type PaymentRequest struct { + PaymentID string // Client-supplied reference — used as the Workflow ID + SourceAccount string + DestinationAccount string + Amount int64 // In minor units (e.g. cents) + Currency string +} + +type ReservationRecord struct { + TxID string // Unique transaction ID generated during reserve step +} + +// StepKey derives a stable idempotency key for a workflow step. +// Must be called in Workflow code, not inside activities, so the key +// is identical on every replay attempt. 
+func StepKey(txID, step string) string { + return fmt.Sprintf("%s:%s", txID, step) +} +``` + + + + +```java +// PaymentTypes.java +public class PaymentTypes { + + public record PaymentRequest( + String paymentId, // Client-supplied reference — used as the Workflow ID + String sourceAccount, + String destinationAccount, + long amount, // In minor units (e.g. cents) + String currency + ) {} + + public record ReservationRecord( + String txId // Unique transaction ID generated during reserve step + ) {} + + /** + * Derives a stable idempotency key for a workflow step. + * Must be called in Workflow code, not inside activities, so the key + * is identical on every replay attempt. + */ + public static String stepKey(String txId, String step) { + return txId + ":" + step; + } +} +``` + + + + +```typescript +// paymentTypes.ts +export interface PaymentRequest { + paymentId: string; // Client-supplied reference — used as the Workflow ID + sourceAccount: string; + destinationAccount: string; + amount: number; // In minor units (e.g. cents) + currency: string; +} + +export interface ReservationRecord { + txId: string; // Unique transaction ID generated during reserve step +} + +/** + * Derives a stable idempotency key for a workflow step. + * Must be called in Workflow code, not inside activities, so the key + * is identical on every replay attempt. + */ +export function stepKey(txId: string, step: string): string { + return `${txId}:${step}`; +} +``` + + + + +### Implement the Workflow with Signal and Saga compensations + +The Workflow defines a `switchConfirmed` Signal handler that the webhook calls to deliver the switch's asynchronous response. +After submitting to the switch, the Workflow pauses with `workflow.wait_condition` / `workflow.await` until either the Signal arrives or the SLA deadline passes. +Idempotency keys are derived in the Workflow function — never inside activities — so they are stable across all replay attempts. 
+ + + + +```python +# payment_workflow.py +from datetime import timedelta +from dataclasses import dataclass, field +from temporalio import workflow +from payment_types import PaymentRequest, step_key + +@dataclass +class SwitchCallback: + tx_id: str + status: str # "SUCCESS" or "FAILURE" + reason: str = "" + +@workflow.defn +class InterBankPaymentWorkflow: + def __init__(self) -> None: + # Holds the callback payload when the signal arrives + self._switch_callback: SwitchCallback | None = None + + @workflow.signal + def switch_confirmed(self, callback: SwitchCallback) -> None: + """ + Called by your webhook handler when the payment switch posts a callback. + The webhook sends: temporal_client.signal_workflow( + workflow_id=payment_id, + signal_name="switch_confirmed", + arg=SwitchCallback(tx_id, status)) + """ + self._switch_callback = callback + + @workflow.run + async def run(self, req: PaymentRequest) -> str: + compensations = [] + + try: + # Step 1: Validate input — local only, no external calls + if req.amount <= 0 or not req.source_account or not req.destination_account: + raise ValueError("Invalid payment request") + + # Step 2: Reserve and block + # Generates txID, blocks the amount, notifies customer "transfer pending" + reservation = await workflow.execute_activity( + reserve_and_block, + { + "idempotency_key": f"{req.payment_id}:reserve", + "payment_id": req.payment_id, + "source_account": req.source_account, + "amount": req.amount, + }, + start_to_close_timeout=timedelta(seconds=10), + ) + # Compensation: unblock the amount if anything goes wrong + compensations.append( + lambda r=reservation: workflow.execute_activity( + unblock_and_cancel, + { + "idempotency_key": step_key(r.tx_id, "comp-reserve"), + "tx_id": r.tx_id, + "source_account": req.source_account, + "amount": req.amount, + }, + start_to_close_timeout=timedelta(seconds=10), + ) + ) + + # Step 3a: Submit to payment switch + # ScheduleToCloseTimeout = total budget for the HTTP submit across all 
retries + # Key derived here — stable even if Worker crashes and replays + await workflow.execute_activity( + submit_to_switch, + { + "idempotency_key": step_key(reservation.tx_id, "submit"), + "tx_id": reservation.tx_id, + "source_account": req.source_account, + "destination_account": req.destination_account, + "amount": req.amount, + "currency": req.currency, + }, + schedule_to_close_timeout=timedelta(seconds=10), + ) + # Compensation: cancel with switch if debit step fails after confirmation + compensations.append( + lambda r=reservation: workflow.execute_activity( + cancel_with_switch, + { + "idempotency_key": step_key(r.tx_id, "comp-submit"), + "tx_id": r.tx_id, + }, + start_to_close_timeout=timedelta(seconds=15), + ) + ) + + # Step 3b: Wait for switch callback signal (SLA = 60 seconds) + # The Workflow is durably suspended here — no polling, no thread blocking. + # If the Worker restarts, Temporal replays to this point and waits again. + confirmed = await workflow.wait_condition( + lambda: self._switch_callback is not None, + timeout=timedelta(seconds=60), + ) + if not confirmed or self._switch_callback.status != "SUCCESS": + reason = self._switch_callback.reason if self._switch_callback else "SLA timeout" + raise RuntimeError(f"Switch did not confirm transfer: {reason}") + + # Step 4: Debit and settle + # Only reached after switch signals SUCCESS — now safe to move funds + await workflow.execute_activity( + debit_and_settle, + { + "idempotency_key": step_key(reservation.tx_id, "debit"), + "tx_id": reservation.tx_id, + "source_account": req.source_account, + "amount": req.amount, + }, + start_to_close_timeout=timedelta(seconds=15), + ) + + return reservation.tx_id + + except Exception: + # Run compensations in reverse (LIFO) — most recent step first + for compensation in reversed(compensations): + await compensation() + raise +``` + + + + +```go +// payment_workflow.go +package payment + +import ( + "fmt" + "time" + + "go.temporal.io/sdk/workflow" +) + +// 
SwitchCallback is the payload delivered via the switchConfirmed Signal. +type SwitchCallback struct { + TxID string + Status string // "SUCCESS" or "FAILURE" + Reason string +} + +func InterBankPaymentWorkflow(ctx workflow.Context, req PaymentRequest) (string, error) { + var compensations []func() error + var err error + + // defer runs compensations in LIFO order if err is non-nil at return + defer func() { + if err != nil { + for i := len(compensations) - 1; i >= 0; i-- { + _ = compensations[i]() + } + } + }() + + // switchCallback is set when the switchConfirmed Signal arrives. + // The webhook handler calls: temporal_client.SignalWorkflow(ctx, paymentID, + // "", "switchConfirmed", SwitchCallback{...}) + var switchCallback *SwitchCallback + workflow.SetSignalHandler(ctx, "switchConfirmed", func(cb SwitchCallback) { + switchCallback = &cb + }) + + // Step 1: Validate input — local only, no external calls + if req.Amount <= 0 || req.SourceAccount == "" || req.DestinationAccount == "" { + err = fmt.Errorf("invalid payment request") + return "", err + } + + ao := workflow.ActivityOptions{StartToCloseTimeout: 10 * time.Second} + ctx = workflow.WithActivityOptions(ctx, ao) + + // Step 2: Reserve and block + // Generates txID, blocks the amount, notifies customer "transfer pending" + var reservation ReservationRecord + err = workflow.ExecuteActivity(ctx, ReserveAndBlock, ReserveParams{ + IdempotencyKey: req.PaymentID + ":reserve", + PaymentID: req.PaymentID, + SourceAccount: req.SourceAccount, + Amount: req.Amount, + }).Get(ctx, &reservation) + if err != nil { + return "", err + } + // Compensation: unblock the amount if anything goes wrong + compensations = append(compensations, func() error { + return workflow.ExecuteActivity(ctx, UnblockAndCancel, UnblockParams{ + IdempotencyKey: StepKey(reservation.TxID, "comp-reserve"), + TxID: reservation.TxID, + SourceAccount: req.SourceAccount, + Amount: req.Amount, + }).Get(ctx, nil) + }) + + // Step 3a: Submit to payment 
switch + // Key derived here — stable even if Worker crashes and replays + switchCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + ScheduleToCloseTimeout: 10 * time.Second, + }) + err = workflow.ExecuteActivity(switchCtx, SubmitToSwitch, SubmitParams{ + IdempotencyKey: StepKey(reservation.TxID, "submit"), + TxID: reservation.TxID, + SourceAccount: req.SourceAccount, + DestinationAccount: req.DestinationAccount, + Amount: req.Amount, + Currency: req.Currency, + }).Get(ctx, nil) + if err != nil { + return "", err + } + // Compensation: cancel with switch if debit step fails after confirmation + compensations = append(compensations, func() error { + return workflow.ExecuteActivity(ctx, CancelWithSwitch, CancelParams{ + IdempotencyKey: StepKey(reservation.TxID, "comp-submit"), + TxID: reservation.TxID, + }).Get(ctx, nil) + }) + + // Step 3b: Wait for switch callback signal (SLA = 60 seconds) + // The Workflow is durably suspended here — no polling, no thread blocking. + // If the Worker restarts, Temporal replays to this point and waits again. 
+ slaCtx, cancel := workflow.WithCancel(ctx) + defer cancel() + _ = workflow.NewTimer(slaCtx, 60*time.Second) + + workflow.Await(ctx, func() bool { + return switchCallback != nil + }) + + if switchCallback == nil { + err = fmt.Errorf("SLA timeout: switch did not confirm within 60s") + return "", err + } + if switchCallback.Status != "SUCCESS" { + err = fmt.Errorf("switch rejected transfer: %s", switchCallback.Reason) + return "", err + } + + // Step 4: Debit and settle + // Only reached after switch signals SUCCESS — now safe to move funds + err = workflow.ExecuteActivity(ctx, DebitAndSettle, DebitParams{ + IdempotencyKey: StepKey(reservation.TxID, "debit"), + TxID: reservation.TxID, + SourceAccount: req.SourceAccount, + Amount: req.Amount, + }).Get(ctx, nil) + return reservation.TxID, err +} +``` + + + + +```java +// InterBankPaymentWorkflowImpl.java +import io.temporal.workflow.Workflow; +import io.temporal.workflow.SignalMethod; +import io.temporal.activity.ActivityOptions; +import java.time.Duration; + +public class InterBankPaymentWorkflowImpl implements InterBankPaymentWorkflow { + + // Set by the switchConfirmed Signal when the webhook calls back + private SwitchCallback switchCallback = null; + + /** + * Signal handler — called by your webhook endpoint via: + * WorkflowStub stub = client.newUntypedWorkflowStub(paymentId); + * stub.signal("switchConfirmed", new SwitchCallback(txId, "SUCCESS", "")); + */ + @SignalMethod + public void switchConfirmed(SwitchCallback callback) { + this.switchCallback = callback; + } + + private final PaymentActivities activities = Workflow.newActivityStub( + PaymentActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(10)) + .build() + ); + + private final PaymentActivities switchActivities = Workflow.newActivityStub( + PaymentActivities.class, + ActivityOptions.newBuilder() + .setScheduleToCloseTimeout(Duration.ofSeconds(10)) + .build() + ); + + @Override + public String 
processPayment(PaymentRequest req) { + // Step 1: Validate input — local only, no external calls + if (req.amount() <= 0 || req.sourceAccount().isEmpty()) { + throw new IllegalArgumentException("Invalid payment request"); + } + + Saga saga = new Saga(new Saga.Options.Builder() + .setParallelCompensation(false) + .build()); + + try { + // Step 2: Reserve and block + ReservationRecord reservation = activities.reserveAndBlock(new ReserveParams( + req.paymentId() + ":reserve", + req.paymentId(), req.sourceAccount(), req.amount() + )); + saga.addCompensation(activities::unblockAndCancel, new UnblockParams( + PaymentTypes.stepKey(reservation.txId(), "comp-reserve"), + reservation.txId(), req.sourceAccount(), req.amount() + )); + + // Step 3a: Submit to payment switch + // Key derived here — stable even if Worker crashes and replays + switchActivities.submitToSwitch(new SubmitParams( + PaymentTypes.stepKey(reservation.txId(), "submit"), + reservation.txId(), req.sourceAccount(), + req.destinationAccount(), req.amount(), req.currency() + )); + saga.addCompensation(activities::cancelWithSwitch, new CancelParams( + PaymentTypes.stepKey(reservation.txId(), "comp-submit"), + reservation.txId() + )); + + // Step 3b: Wait for switch callback signal (SLA = 60 seconds) + // The Workflow is durably suspended here — no polling, no thread blocking. + // If the Worker restarts, Temporal replays to this point and waits again. + boolean received = Workflow.await( + Duration.ofSeconds(60), + () -> this.switchCallback != null + ); + if (!received || !"SUCCESS".equals(this.switchCallback.status())) { + String reason = received ? 
this.switchCallback.reason() : "SLA timeout";
+                throw new RuntimeException("Switch did not confirm transfer: " + reason);
+            }
+
+            // Step 4: Debit and settle
+            // Only reached after switch signals SUCCESS — now safe to move funds
+            activities.debitAndSettle(new DebitParams(
+                PaymentTypes.stepKey(reservation.txId(), "debit"),
+                reservation.txId(), req.sourceAccount(), req.amount()
+            ));
+
+            return reservation.txId();
+
+        } catch (Exception e) {
+            saga.compensate();
+            throw e;
+        }
+    }
+}
+```
+
+
+
+```typescript
+// paymentWorkflow.ts
+import {
+  proxyActivities, ActivityOptions,
+  defineSignal, setHandler, condition
+} from '@temporalio/workflow';
+import { PaymentRequest, ReservationRecord, stepKey } from './paymentTypes';
+// Type-only import: Activity implementations must not be bundled into Workflow code
+import type * as activities from './paymentActivities';
+
+// Signal definition — webhook handler calls:
+//   handle.signal(switchConfirmedSignal, { txId, status: 'SUCCESS', reason: '' })
+export const switchConfirmedSignal = defineSignal<[SwitchCallback]>('switchConfirmed');
+
+interface SwitchCallback {
+  txId: string;
+  status: 'SUCCESS' | 'FAILURE';
+  reason: string;
+}
+
+const defaultOptions: ActivityOptions = { startToCloseTimeout: '10s' };
+const switchOptions: ActivityOptions = { scheduleToCloseTimeout: '10s' };
+const SLA_SECONDS = 60;
+
+// Activities are called through typed proxies; each proxy carries its own timeout options.
+const { reserveAndBlock, unblockAndCancel, cancelWithSwitch, debitAndSettle } =
+  proxyActivities<typeof activities>(defaultOptions);
+// submitToSwitch uses scheduleToCloseTimeout so the budget covers all retries.
+const { submitToSwitch } = proxyActivities<typeof activities>(switchOptions);
+
+export async function interBankPaymentWorkflow(req: PaymentRequest): Promise<string> {
+  type Compensation = () => Promise<unknown>;
+  const compensations: Compensation[] = [];
+
+  // switchCallback is set when the Signal arrives from the webhook
+  let switchCallback: SwitchCallback | null = null;
+  setHandler(switchConfirmedSignal, (cb: SwitchCallback) => {
+    switchCallback = cb;
+  });
+
+  try {
+    // Step 1: Validate input — local only, no external calls
+    if (req.amount <= 0 || !req.sourceAccount || !req.destinationAccount) {
+      throw new Error('Invalid payment request');
+    }
+
+    // Step 2: Reserve and block
+    const reservation: ReservationRecord = await reserveAndBlock({
+      idempotencyKey: `${req.paymentId}:reserve`,
+      paymentId: req.paymentId,
+      sourceAccount: req.sourceAccount,
+      amount: req.amount,
+    });
+    compensations.unshift(() =>
+      unblockAndCancel({
+        idempotencyKey: stepKey(reservation.txId, 'comp-reserve'),
+        txId: reservation.txId,
+        sourceAccount: req.sourceAccount,
+        amount: req.amount,
+      })
+    );
+
+    // Step 3a: Submit to payment switch
+    // Key derived here — stable even if Worker crashes and replays
+    await submitToSwitch({
+      idempotencyKey: stepKey(reservation.txId, 'submit'),
+      txId: reservation.txId,
+      sourceAccount: req.sourceAccount,
+      destinationAccount: req.destinationAccount,
+      amount: req.amount,
+      currency: req.currency,
+    });
+    compensations.unshift(() =>
+      cancelWithSwitch({
+        idempotencyKey: stepKey(reservation.txId, 'comp-submit'),
+        txId: reservation.txId,
+      })
+    );
+
+    // Step 3b: Wait for switch callback signal (SLA = 60 seconds)
+    // The Workflow is durably suspended here — no polling, no thread blocking.
+    // If the Worker restarts, Temporal replays to this point and waits again.
+    const received = await condition(
+      () => switchCallback !== null,
+      `${SLA_SECONDS}s`
+    );
+    if (!received || switchCallback!.status !== 'SUCCESS') {
+      const reason = received ? switchCallback!.reason : 'SLA timeout';
+      throw new Error(`Switch did not confirm transfer: ${reason}`);
+    }
+
+    // Step 4: Debit and settle
+    // Only reached after switch signals SUCCESS — now safe to move funds
+    await debitAndSettle({
+      idempotencyKey: stepKey(reservation.txId, 'debit'),
+      txId: reservation.txId,
+      sourceAccount: req.sourceAccount,
+      amount: req.amount,
+    });
+
+    return reservation.txId;
+
+  } catch (err) {
+    // unshift() keeps the array in reverse registration order, so a forward
+    // iteration runs the most recent compensation first (LIFO).
+    for (const comp of compensations) {
+      await comp();
+    }
+    throw err;
+  }
+}
+```
+
+
+
+The key differences between SDKs for the Signal wait are:
+
+- **Python**: `workflow.wait_condition(lambda: ..., timeout=timedelta(...))` — raises `asyncio.TimeoutError` on timeout.
+- **Go**: `workflow.Await(ctx, func() bool { ... })` with a timer-based cancel context for the SLA deadline.
+- **Java**: `Workflow.await(Duration.ofSeconds(60), () -> condition)` — returns `false` on timeout.
+- **TypeScript**: `condition(() => ..., '60s')` — returns `false` on timeout.
+
+In all SDKs, if the Worker restarts while waiting, Temporal replays the Workflow deterministically back to the `await`/`wait_condition` call and continues waiting — no message is lost.
+
+The key differences between SDKs for running compensations are:
+
+- **Go**: Uses `defer` with a captured error variable to run compensations in LIFO order when the function returns with a non-nil error.
+- **Python**: Appends compensations to a list and iterates with `reversed()` on exception.
+- **TypeScript**: Uses `unshift()` to prepend each new compensation so the array is always in reverse registration order.
+- **Java**: Uses the built-in `Saga` object which tracks and executes compensations in reverse order automatically.
+
+### Implement the switch activity with check-and-skip
+
+The submit-to-switch activity is the most dangerous step in the workflow. 
+Consider this scenario: the Worker sends the transfer instruction to the switch, the switch accepts it, but the network drops before the response reaches the Worker. +The Worker marks the activity as failed and retries — but the switch already processed the first request. +Without check-and-skip, the switch receives a duplicate instruction and could process the transfer twice. + +The fix is a two-step guard inside the activity: +1. Query the switch for an existing record with the idempotency key before sending. +2. If a record exists, return the existing result immediately — do not send again. + +The same guard applies to the cancellation compensation: if the switch never received the original instruction, there is nothing to cancel. + + + + +```python +# payment_activities.py +from temporalio import activity + +@activity.defn +async def submit_to_switch(params: dict) -> None: + """ + Submit a transfer to the payment switch with exactly-once protection. + + Scenario this solves: Worker crashes after the switch accepts the instruction + but before the response arrives. On retry, we must not submit again. + + Guard: + 1. Query switch for existing record with this idempotency key. + 2. If found — switch already processed it. Return immediately. + 3. If not found — safe to send. + """ + existing = await switch_client.query_by_key(params["idempotency_key"]) + if existing is not None: + return # Switch already has this — do not send again + + await switch_client.submit( + idempotency_key=params["idempotency_key"], + tx_id=params["tx_id"], + source_account=params["source_account"], + destination_account=params["destination_account"], + amount=params["amount"], + currency=params["currency"], + ) + +@activity.defn +async def cancel_with_switch(params: dict) -> None: + """ + Cancel a previously submitted transfer with the payment switch. + + Scenario this solves: debit step fails after the switch confirmed the transfer. + We need to cancel on the switch side. 
+ + Guard: + 1. Query switch for the original submission record. + 2. If not found — transfer never reached the switch. Nothing to cancel. + 3. If found — send cancellation instruction. + """ + existing = await switch_client.query_by_key(params["idempotency_key"]) + if existing is None: + return # Transfer never reached switch — nothing to cancel + + await switch_client.cancel( + idempotency_key=params["idempotency_key"] + ":cancel", + original_key=params["idempotency_key"], + tx_id=params["tx_id"], + ) +``` + + + + +```go +// switch_activity.go + +// SubmitToSwitch sends a transfer instruction to the payment switch +// with exactly-once protection via check-and-skip. +// +// Scenario this solves: Worker crashes after the switch accepts the instruction +// but before the response arrives. On retry, we must not submit again. +func SubmitToSwitch(ctx context.Context, params SubmitParams) error { + // 1. Query switch for existing record — did a previous attempt get through? + existing, err := switchClient.QueryByKey(ctx, params.IdempotencyKey) + if err != nil { + return err + } + if existing != nil { + return nil // Switch already has this — do not send again + } + + // 2. Safe to send — switch has no record of this key + return switchClient.Submit(ctx, SubmitRequest{ + IdempotencyKey: params.IdempotencyKey, + TxID: params.TxID, + SourceAccount: params.SourceAccount, + DestinationAccount: params.DestinationAccount, + Amount: params.Amount, + Currency: params.Currency, + }) +} + +// CancelWithSwitch cancels a previously submitted transfer. +// +// Scenario this solves: debit step fails after the switch confirmed. +// Guard: if switch has no record of the original submission, there is nothing to cancel. 
+func CancelWithSwitch(ctx context.Context, params CancelParams) error { + existing, err := switchClient.QueryByKey(ctx, params.IdempotencyKey) + if err != nil { + return err + } + if existing == nil { + return nil // Transfer never reached switch — nothing to cancel + } + return switchClient.Cancel(ctx, CancelRequest{ + IdempotencyKey: params.IdempotencyKey + ":cancel", + OriginalKey: params.IdempotencyKey, + TxID: params.TxID, + }) +} +``` + + + + +```java +// SwitchActivity.java + +/** + * Submits a transfer instruction to the payment switch with exactly-once protection. + * + * Scenario this solves: Worker crashes after the switch accepts the instruction + * but before the response arrives. On retry, we must not submit again. + */ +@ActivityMethod +public void submitToSwitch(SubmitParams params) { + // 1. Query switch for existing record — did a previous attempt get through? + SwitchRecord existing = switchClient.queryByKey(params.idempotencyKey()); + if (existing != null) { + return; // Switch already has this — do not send again + } + + // 2. Safe to send — switch has no record of this key + switchClient.submit(SubmitRequest.builder() + .idempotencyKey(params.idempotencyKey()) + .txId(params.txId()) + .sourceAccount(params.sourceAccount()) + .destinationAccount(params.destinationAccount()) + .amount(params.amount()) + .currency(params.currency()) + .build()); +} + +/** + * Cancels a previously submitted transfer. + * Guard: if switch has no record of the original submission, there is nothing to cancel. 
+ */ +@ActivityMethod +public void cancelWithSwitch(CancelParams params) { + SwitchRecord existing = switchClient.queryByKey(params.idempotencyKey()); + if (existing == null) { + return; // Transfer never reached switch — nothing to cancel + } + switchClient.cancel(CancelRequest.builder() + .idempotencyKey(params.idempotencyKey() + ":cancel") + .originalKey(params.idempotencyKey()) + .txId(params.txId()) + .build()); +} +``` + + + + +```typescript +// switchActivity.ts + +/** + * Submits a transfer instruction to the payment switch with exactly-once protection. + * + * Scenario this solves: Worker crashes after the switch accepts the instruction + * but before the response arrives. On retry, we must not submit again. + */ +export async function submitToSwitch(params: SubmitParams): Promise { + // 1. Query switch for existing record — did a previous attempt get through? + const existing = await switchClient.queryByKey(params.idempotencyKey); + if (existing !== null) { + return; // Switch already has this — do not send again + } + + // 2. Safe to send — switch has no record of this key + await switchClient.submit({ + idempotencyKey: params.idempotencyKey, + txId: params.txId, + sourceAccount: params.sourceAccount, + destinationAccount: params.destinationAccount, + amount: params.amount, + currency: params.currency, + }); +} + +/** + * Cancels a previously submitted transfer. + * Guard: if switch has no record of the original submission, there is nothing to cancel. 
+ */
+export async function cancelWithSwitch(params: CancelParams): Promise<void> {
+  // params.idempotencyKey is this compensation's own key ('comp-submit').
+  // The Workflow submits under stepKey(txId, 'submit') (stepKey is exported from
+  // ./paymentTypes), so the lookup must use that key; querying with the
+  // compensation key would never match and the cancellation would be skipped.
+  const originalKey = stepKey(params.txId, 'submit');
+  const existing = await switchClient.queryByKey(originalKey);
+  if (existing === null) {
+    return; // Transfer never reached switch — nothing to cancel
+  }
+  await switchClient.cancel({
+    idempotencyKey: params.idempotencyKey,
+    originalKey,
+    txId: params.txId,
+  });
+}
+```
+
+
+
+## When to use
+
+The Idempotent Distributed Transactions pattern is a good fit when you coordinate writes across two or more external systems in a single logical operation, any individual step must be safe to retry without producing duplicate side effects, a failure partway through must trigger automatic rollback across all completed steps, and your external systems support idempotency keys on write operations.
+
+It is not a good fit for single-service operations that can use a local database transaction, processes where no meaningful compensation can be defined for a completed step, external systems that do not support idempotency keys or a query-by-key endpoint, or operations that require strong ACID consistency rather than eventual consistency.
+
+In the payment domain specifically, use this pattern for fund transfers, disbursements, and multi-leg settlements where duplicate charges or lost credits are unacceptable.
+
+### Target audience
+
+This pattern is for engineers building any multi-step workflow that coordinates writes across more than one external system and requires exactly-once execution, full rollback on failure, and protection against duplicate submissions.
+Common domains include financial transactions, order fulfilment, inventory reservation, and multi-service provisioning flows. 
+ +### Prerequisites + +- Familiarity with the [Saga Pattern](/design-patterns/saga-pattern) and compensating transactions +- External systems that support idempotency keys on write operations +- An external system that supports a query-by-key endpoint so you can check whether a previous call was processed +- An intermediate holding resource (escrow account, reservation slot, staging allocation) controlled by your platform + +### People and process considerations + +- **Reconciliation team**: Ensure reconciliation jobs can distinguish resources held in the intermediate state from fully committed resources. A transaction stuck in the holding state after a failed compensation requires manual investigation. +- **Operations runbook**: Define escalation procedures for compensation failures. If the external system's reversal endpoint is also unavailable, resources remain in the holding state until it recovers. Alert on any Workflow that remains in the compensating state longer than a defined threshold. +- **Idempotency key agreement**: Agree with each external system operator on the key format and expiry policy before integration. Mismatches in key format cause the system to treat each retry as a new operation. +- **Holding state monitoring**: Monitor the age and size of resources in the intermediate holding state continuously. Growth indicates stuck or uncommitted transactions. + +## Benefits + +This pattern provides exactly-once delivery semantics for each step without requiring custom infrastructure. +Temporal's durable execution guarantees that compensations run even after Worker failures, network partitions, or process crashes. +The intermediate holding state ensures that at any point in time the total resource count across source, holding, and destination is consistent — no resources are created or destroyed by a partial failure. +Using the client-supplied reference as the Workflow ID makes the entire end-to-end operation idempotent at the API boundary. 
+ + +The following table maps each failure scenario in the interbank payment example to the before and after behaviour: + +| Failure scenario | Without Temporal | With this pattern | +| :--- | :--- | :--- | +| Worker crashes after switch accepts instruction, before response arrives | Retry submits again — switch processes transfer twice | Check-and-skip detects existing record; second submission skipped | +| SLA expires — switch never responds | Funds blocked forever; manual repair | `ScheduleToCloseTimeout` fires; compensation unblocks funds automatically | +| Worker crashes after reserve step, before submit | Amount stays blocked; no record of in-flight state | Temporal replays Workflow; reservation key recognised; compensation unblocks on failure | +| Customer submits same payment twice | Two concurrent Workflows; possible double transfer | Temporal rejects duplicate `paymentId` Workflow ID; first result returned | +| New Worker picks up in-flight Workflow | Completed activities re-execute; reserve runs again | Temporal replays from history; completed activities skipped deterministically | +| Debit step fails after switch confirms | Funds stuck in escrow; switch already transferred | Compensation sends `comp-submit` cancellation to switch; unblocks source funds | + +## Comparison with alternatives + +| Approach | Duplicate protection | Auto compensation | Infrastructure required | Consistency | +| :--- | :--- | :--- | :--- | :--- | +| Idempotent Distributed Transactions (this pattern) | Per-step via idempotency keys | Automatic via Saga | Temporal only | Eventual | +| Custom saga + outbox | Per-step via outbox table | Manual orchestrator | Lock + outbox + state table + orchestrator | Eventual | +| Two-phase commit | Transaction-level | Database rollback | Distributed lock manager | Strong (ACID) | +| Best-effort retry | None | None | None | None | + +## Best practices + +- **Derive idempotency keys in the Workflow, not the Activity.** Keys generated inside an 
Activity are recomputed on each retry and defeat de-duplication. Keys derived from `workflow.info().workflowId` and a step constant are stable across all replays. +- **Use the client-supplied request reference as the Workflow ID.** This makes the Workflow ID the outer idempotency key. Temporal rejects a duplicate `StartWorkflow` call for a running or completed Workflow automatically. +- **Use `ScheduleToCloseTimeout` for external SLA enforcement.** `StartToCloseTimeout` restarts on each retry. `ScheduleToCloseTimeout` covers the full activity lifecycle including all retries and maps directly to a contractual SLA. +- **Use distinct keys for forward and compensation activities.** Reusing the same key causes the external system to reject the compensation as a duplicate of the original call. +- **Register compensations before Activity execution.** This ensures cleanup runs even if the Activity fails after a partial side effect has occurred. +- **Always re-raise the original exception after compensating.** Swallowing the error marks the Workflow as successful even though it rolled back, making incident investigation very difficult. +- **Alert on resources stuck in the holding state.** A transaction that completes the source mutation but fails all delivery attempts leaves resources in holding indefinitely. Alert when holding-state age exceeds your maximum retry window. + +## Common pitfalls + +- **Generating idempotency keys inside an Activity.** Each retry produces a different key. The external system treats every retry as a new operation, defeating the entire purpose of the pattern. +- **Using `StartToCloseTimeout` for SLA enforcement on external calls.** A slow-but-responding system keeps resetting the per-attempt timeout indefinitely. Use `ScheduleToCloseTimeout` to cap total elapsed time across all retries. 
+- **Skipping the check-and-skip on compensation.** If the external system processed the delivery but the response was lost, and compensation skips the query, the reversal is never sent. The source is refunded but the destination retains the resource. +- **Reusing the same key for forward and compensation activities.** The compensation is rejected as a duplicate of the forward call and the rollback silently fails. +- **Swallowing the original exception after compensating.** The Workflow is recorded as successful in history even though it rolled back. +- **Not handling compensation failures.** If a compensation activity fails with a non-retryable error, the rollback is incomplete. Always alert for manual intervention rather than silently swallowing compensation errors. + +## Related patterns + +- **[Saga Pattern](/design-patterns/saga-pattern)**: The base pattern this extends. Covers compensation registration strategies and SDK-specific differences in detail. +- **[Child Workflows](/design-patterns/child-workflows)**: Extract each phase into a Child Workflow to scope history growth and apply independent retry policies per phase. +- **[Polling External Services](/design-patterns/polling)**: Replace the synchronous delivery call with a polling loop when the external system is asynchronous. +- **[Long Running Activity](/design-patterns/long-running-activity)**: Add heartbeating to the delivery activity when the external system uses a long-polling acknowledgement model. +- **[Continue-As-New](/design-patterns/continue-as-new)**: Bound history size for high-throughput processors running many transactions per long-lived Workflow. + +## Sample code + +- [Go Saga Sample](https://github.com/temporalio/samples-go/tree/main/saga) — Saga with `defer`-based compensations, adaptable to the idempotency key pattern. 
+- [Java Saga Sample](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/hello/HelloSaga.java) — Saga using the built-in `Saga` API. +- [TypeScript Saga Sample](https://github.com/temporalio/samples-typescript/tree/main/saga) — Saga with array-based compensations. +- [Python Samples](https://github.com/temporalio/samples-python) — Saga with list-based compensations. diff --git a/docs/design-patterns/index.mdx b/docs/design-patterns/index.mdx new file mode 100644 index 0000000000..0e9f375798 --- /dev/null +++ b/docs/design-patterns/index.mdx @@ -0,0 +1,251 @@ +--- +id: index +title: Temporal Design Patterns +sidebar_label: Overview +description: A catalog of common, reusable, and proven design patterns for Temporal Workflows, organized by problem domain. +slug: /design-patterns +displayed_sidebar: designPatterns +--- + +import PatternCards from '@site/src/components/PatternCards'; + +Temporal provides a set of durable execution primitives that you can compose into common, reusable, and proven patterns. Having these patterns in your toolbox helps you solve recurring problems in a battle-tested way. 
+ +## Distributed transaction patterns + + + +## Entity & lifecycle patterns + + + +## Workflow messaging patterns + + + +## Task orchestration patterns + + + +## External interaction patterns + + + +## Worker configuration patterns + + + +## Error handling & retry patterns + + + +## QoS & throughput patterns + + + +## Batch processing patterns + + diff --git a/docs/design-patterns/long-running-activity.mdx b/docs/design-patterns/long-running-activity.mdx new file mode 100644 index 0000000000..c35d5c1e5a --- /dev/null +++ b/docs/design-patterns/long-running-activity.mdx @@ -0,0 +1,619 @@ +--- +id: long-running-activity +title: "Long-Running Activity - Tracking Progress and Handling Cancellation with Heartbeats" +sidebar_label: "Long Running Activity" +description: "Long-running Activities report progress via heartbeats and enable resumption after failures with cancellation support." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +The Activity Heartbeat pattern enables long-running Activities to report progress, handle cancellation gracefully, and resume from the last checkpoint after failures. +Heartbeats inform Temporal that the Activity is still alive and allow storing progress details that survive Worker restarts. + +## Problem + +In long-running operations, you often need Activities that process large datasets or perform time-consuming operations (minutes to hours), report progress to avoid appearing stuck or timing out, resume from the last checkpoint after Worker crashes or restarts, handle cancellation requests gracefully and clean up resources, and avoid reprocessing already-completed work. + +Without heartbeats, you must set very long Activity timeouts that delay failure detection, reprocess entire batches from the beginning on failures, accept no visibility into Activity progress, risk zombie Activities that appear alive but are stuck, and implement custom checkpointing and recovery logic. 
+ +## Solution + +Activity heartbeats periodically report progress to the Temporal Service. +The heartbeat details are persisted and available to retry attempts, enabling resumption from the last checkpoint. +Heartbeat timeouts detect stuck Activities faster than execution timeouts. + +```mermaid +sequenceDiagram + participant Workflow + participant Activity + participant Temporal + + Workflow->>+Activity: Start (with heartbeat timeout) + loop Process items + Activity->>Activity: Process item + Activity->>Temporal: heartbeat(progress) + Note over Temporal: Store progress + end + + alt Activity completes + Activity-->>-Workflow: Result + else Worker crashes + Note over Activity: Heartbeat timeout expires + Temporal->>+Activity: Retry on new worker + Activity->>Temporal: getHeartbeatDetails() + Temporal-->>Activity: Last progress + Activity->>Activity: Resume from checkpoint + Activity-->>-Workflow: Result + end +``` + +The following describes each step in the diagram: + +1. The Workflow starts the Activity with a heartbeat timeout. +2. The Activity processes items in a loop, heartbeating progress after each batch. +3. If the Activity completes normally, it returns the result to the Workflow. +4. If the Worker crashes, the heartbeat timeout expires and Temporal retries the Activity on a new Worker. The new attempt retrieves the last heartbeat details and resumes from the checkpoint. + +## Implementation + +### Basic progress tracking + +The following implementation processes a large file line by line, heartbeating every 100 lines. 
+On retry, it retrieves the last processed line number and skips ahead: + + + + +```python +# activities.py +from temporalio import activity + +@activity.defn +async def process_large_file(file_path: str) -> None: + details = activity.info().heartbeat_details + start_line = details[0] if details else 0 + + with open(file_path, "r") as f: + for i, line in enumerate(f): + if i < start_line: + continue + + process_line(line) + + if (i + 1) % 100 == 0: + activity.heartbeat(i + 1) +``` + + + + +```go +// activities.go +func ProcessLargeFile(ctx context.Context, filePath string) error { + startLine := 0 + if activity.HasHeartbeatDetails(ctx) { + if err := activity.GetHeartbeatDetails(ctx, &startLine); err != nil { + return err + } + } + + file, err := os.Open(filePath) + if err != nil { + return err + } + defer file.Close() + + scanner := bufio.NewScanner(file) + currentLine := 0 + for scanner.Scan() { + if currentLine < startLine { + currentLine++ + continue + } + + processLine(scanner.Text()) + currentLine++ + + if currentLine%100 == 0 { + activity.RecordHeartbeat(ctx, currentLine) + } + } + return scanner.Err() +} +``` + + + + +```java +// FileProcessingActivityImpl.java +@ActivityInterface +public interface FileProcessingActivity { + void processLargeFile(String filePath); +} + +public class FileProcessingActivityImpl implements FileProcessingActivity { + @Override + public void processLargeFile(String filePath) { + ActivityExecutionContext context = Activity.getExecutionContext(); + Optional lastProcessedLine = context.getHeartbeatDetails(Integer.class); + int startLine = lastProcessedLine.orElse(0); + + try (BufferedReader reader = new BufferedReader(new FileReader(filePath))) { + for (int i = 0; i < startLine; i++) { + reader.readLine(); + } + + String line; + int currentLine = startLine; + while ((line = reader.readLine()) != null) { + processLine(line); + currentLine++; + + if (currentLine % 100 == 0) { + context.heartbeat(currentLine); + } + } + } + } +} +``` + 
+ + + +```typescript +// activities.ts +import { heartbeat, activityInfo } from '@temporalio/activity'; +import { createReadStream } from 'fs'; +import { createInterface } from 'readline'; + +export async function processLargeFile(filePath: string): Promise<void> { + const startLine = activityInfo().heartbeatDetails ?? 0; + + const rl = createInterface({ + input: createReadStream(filePath), + }); + + let currentLine = 0; + for await (const line of rl) { + if (currentLine < startLine) { + currentLine++; + continue; + } + + processLine(line); + currentLine++; + + if (currentLine % 100 === 0) { + heartbeat(currentLine); + } + } +} +``` + + + + +The heartbeat details call retrieves the last heartbeat value from a previous attempt. +If this is the first attempt, there are no details and the Activity starts from line 0. +The Activity heartbeats every 100 lines, storing the current line number as the checkpoint. + +### Handling cancellation + +The following implementation adds cancellation support. +The Activity checks for cancellation on each heartbeat and cleans up resources before exiting: + + + + +```python +# activities.py +import asyncio +from temporalio import activity + +@activity.defn +async def process_large_file(file_path: str) -> None: + details = activity.info().heartbeat_details + current_line = details[0] if details else 0 + + try: + with open(file_path, "r") as f: + for i, line in enumerate(f): + if i < current_line: + continue + + activity.heartbeat(i) + process_line(line) + current_line = i + 1 + except asyncio.CancelledError: + cleanup_resources() + raise +``` + + + + +```go +// activities.go +func ProcessLargeFile(ctx context.Context, filePath string) error { + startLine := 0 + if activity.HasHeartbeatDetails(ctx) { + if err := activity.GetHeartbeatDetails(ctx, &startLine); err != nil { + return err + } + } + + file, err := os.Open(filePath) + if err != nil { + return err + } + defer file.Close() + + scanner := bufio.NewScanner(file) + currentLine := 0 + for scanner.Scan() { + 
if currentLine < startLine { + currentLine++ + continue + } + + activity.RecordHeartbeat(ctx, currentLine) + + // Check if the Activity has been cancelled + select { + case <-ctx.Done(): + cleanupResources() + return ctx.Err() + default: + } + + processLine(scanner.Text()) + currentLine++ + } + return scanner.Err() +} +``` + + + + +```java +// FileProcessingActivityImpl.java +public class FileProcessingActivityImpl implements FileProcessingActivity { + @Override + public void processLargeFile(String filePath) { + ActivityExecutionContext context = Activity.getExecutionContext(); + Optional<Integer> lastProcessedLine = context.getHeartbeatDetails(Integer.class); + int currentLine = lastProcessedLine.orElse(0); + + try (BufferedReader reader = new BufferedReader(new FileReader(filePath))) { + for (int i = 0; i < currentLine; i++) { + reader.readLine(); + } + + String line; + while ((line = reader.readLine()) != null) { + context.heartbeat(currentLine); + processLine(line); + currentLine++; + } + } catch (ActivityCanceledException e) { + cleanupResources(); + throw e; + } + } +} +``` + + + + +```typescript +// activities.ts +import { heartbeat, activityInfo, sleep } from '@temporalio/activity'; +import { CancelledFailure } from '@temporalio/common'; +import { createReadStream } from 'fs'; +import { createInterface } from 'readline'; + +export async function processLargeFile(filePath: string): Promise<void> { + const startLine = activityInfo().heartbeatDetails ?? 0; + + const rl = createInterface({ + input: createReadStream(filePath), + }); + + let currentLine = 0; + try { + for await (const line of rl) { + if (currentLine < startLine) { + currentLine++; + continue; + } + + heartbeat(currentLine); + processLine(line); + currentLine++; + } + } catch (err) { + if (err instanceof CancelledFailure) { + cleanupResources(); + } + throw err; + } +} +``` + + + + +Cancellation is delivered to the Activity when it heartbeats. +In Java, the next `heartbeat()` call throws an `ActivityCanceledException`. 
+In TypeScript, cancellation is delivered as a `CancelledFailure` via `sleep()` or `Context.current().cancelled`. +In Python, cancellation is delivered as an `asyncio.CancelledError`. +In Go, the context is cancelled and `ctx.Done()` becomes readable. +The catch/error handling block performs cleanup before re-throwing the error. + +### Complex progress state + +The following implementation tracks multiple progress fields -- processed count, failed count, and the last processed ID: + + + + +```python +# activities.py +from dataclasses import dataclass +from temporalio import activity + +@dataclass +class ProgressState: + processed_count: int = 0 + failed_count: int = 0 + last_processed_id: str = "" + +@activity.defn +async def process_batch(item_ids: list[str]) -> dict: + details = activity.info().heartbeat_details + progress = details[0] if details else ProgressState() + + start_index = ( + item_ids.index(progress.last_processed_id) + 1 + if progress.last_processed_id + else 0 + ) + + for i in range(start_index, len(item_ids)): + item_id = item_ids[i] + + try: + await process_item(item_id) + progress.processed_count += 1 + except Exception: + progress.failed_count += 1 + + progress.last_processed_id = item_id + activity.heartbeat(progress) + + return { + "processed_count": progress.processed_count, + "failed_count": progress.failed_count, + } +``` + + + + +```go +// activities.go +type ProgressState struct { + ProcessedCount int `json:"processedCount"` + FailedCount int `json:"failedCount"` + LastProcessedID string `json:"lastProcessedId"` +} + +type BatchResult struct { + ProcessedCount int `json:"processedCount"` + FailedCount int `json:"failedCount"` +} + +func ProcessBatch(ctx context.Context, itemIDs []string) (BatchResult, error) { + progress := ProgressState{} + if activity.HasHeartbeatDetails(ctx) { + if err := activity.GetHeartbeatDetails(ctx, &progress); err != nil { + return BatchResult{}, err + } + } + + startIndex := 0 + if progress.LastProcessedID != 
"" { + for i, id := range itemIDs { + if id == progress.LastProcessedID { + startIndex = i + 1 + break + } + } + } + + for i := startIndex; i < len(itemIDs); i++ { + itemID := itemIDs[i] + + if err := processItem(ctx, itemID); err != nil { + progress.FailedCount++ + } else { + progress.ProcessedCount++ + } + + progress.LastProcessedID = itemID + activity.RecordHeartbeat(ctx, progress) + } + + return BatchResult{ + ProcessedCount: progress.ProcessedCount, + FailedCount: progress.FailedCount, + }, nil +} +``` + + + + +```java +// BatchProcessingActivityImpl.java +public class BatchProcessingActivityImpl implements BatchProcessingActivity { + + static class ProgressState { + int processedCount; + int failedCount; + String lastProcessedId; + } + + @Override + public BatchResult processBatch(List itemIds) { + ActivityExecutionContext context = Activity.getExecutionContext(); + Optional details = context.getHeartbeatDetails(ProgressState.class); + ProgressState progress = details.orElse(new ProgressState()); + + int startIndex = itemIds.indexOf(progress.lastProcessedId) + 1; + + for (int i = startIndex; i < itemIds.size(); i++) { + String itemId = itemIds.get(i); + + try { + processItem(itemId); + progress.processedCount++; + } catch (Exception e) { + progress.failedCount++; + } + + progress.lastProcessedId = itemId; + context.heartbeat(progress); + } + + return new BatchResult(progress.processedCount, progress.failedCount); + } +} +``` + + + + +```typescript +// activities.ts +import { heartbeat, activityInfo } from '@temporalio/activity'; + +interface ProgressState { + processedCount: number; + failedCount: number; + lastProcessedId: string; +} + +export async function processBatch(itemIds: string[]): Promise { + const saved: ProgressState = activityInfo().heartbeatDetails ?? { + processedCount: 0, + failedCount: 0, + lastProcessedId: '', + }; + + const startIndex = saved.lastProcessedId + ? 
itemIds.indexOf(saved.lastProcessedId) + 1 + : 0; + + const progress = { ...saved }; + + for (let i = startIndex; i < itemIds.length; i++) { + const itemId = itemIds[i]; + + try { + await processItem(itemId); + progress.processedCount++; + } catch { + progress.failedCount++; + } + + progress.lastProcessedId = itemId; + heartbeat(progress); + } + + return { processedCount: progress.processedCount, failedCount: progress.failedCount }; +} +``` + + + + +The progress state object stores all the checkpoint data needed to resume. +On retry, the Activity finds the index of the last processed ID and starts from the next item. +Each heartbeat stores the full progress state, so the next attempt has everything it needs to resume. + +## When to use + +The Heartbeat pattern is a good fit for batch processing of large datasets, file uploads and downloads with progress tracking, database migrations or bulk operations, long-running computations (ML training, video encoding), external API polling with multiple attempts, and any Activity running longer than 30 seconds. + +It is not a good fit for quick operations (under 10 seconds), operations that cannot be checkpointed, Activities requiring exactly-once semantics without idempotency, or real-time streaming (use Workflows instead). + +## Benefits and trade-offs + +Heartbeats enable fault tolerance by resuming from the last checkpoint after failures. +Heartbeat timeouts detect stuck Activities faster than execution timeouts. +You gain visibility into Activity progress in real time. +Activities can handle cancellation gracefully and clean up resources. +Completed work is not reprocessed, and Activities can move between Workers. + +The trade-offs to consider are that frequent heartbeats increase network traffic. +You must implement checkpointing logic and state management. +You must handle partial reprocessing of the last checkpoint (idempotency). +You need to balance heartbeat frequency between responsiveness and overhead. 
+Heartbeat details have size limits, so you should avoid large objects. + +## Comparison with alternatives + +| Approach | Progress tracking | Resumable | Cancellation | Complexity | +| :--- | :--- | :--- | :--- | :--- | +| Heartbeat | Yes | Yes | Graceful | Medium | +| Long Timeout | No | No | Delayed | Low | +| Child Workflows | Yes | Yes | Immediate | High | +| Local Activity | No | No | N/A | Low | + +## Best practices + +- **Set heartbeat timeout.** Configure it to 2-3x the expected heartbeat interval. +- **Heartbeat at regular intervals.** Balance between responsiveness (every 10-30 seconds) and overhead. +- **Checkpoint strategically.** Save progress at meaningful boundaries (records, pages, chunks). +- **Keep details small.** Store minimal state (IDs, offsets, counts), not full objects. +- **Handle idempotency.** Ensure reprocessing the last checkpoint is safe. +- **Check cancellation.** Heartbeat regularly to detect cancellation quickly. +- **Clean up on cancel.** Handle cancellation errors appropriately: catch `ActivityCanceledException` (Java), `CancelledFailure` (TypeScript), `asyncio.CancelledError` (Python), or check `ctx.Done()` (Go). +- **Log progress.** Log heartbeat details for debugging and monitoring. +- **Test resumption.** Verify Activities resume correctly after simulated failures. +- **Avoid heartbeat spam.** Do not heartbeat on every iteration of tight loops. + +## Common pitfalls + +- **Missing HeartbeatTimeout.** Without a HeartbeatTimeout, Temporal cannot detect a stuck or crashed Worker until the StartToCloseTimeout expires. Always set HeartbeatTimeout shorter than StartToCloseTimeout. +- **Heartbeating too infrequently.** Cancellation is only delivered on the next heartbeat. If the Activity heartbeats every 5 minutes, cancellation takes up to 5 minutes to propagate. 
+- **Not resuming from heartbeat progress on retry.** When an Activity retries, retrieve the last heartbeat details -- `context.getHeartbeatDetails()` (Java), `activityInfo().heartbeatDetails` (TypeScript), `activity.info().heartbeat_details` (Python), or `activity.GetHeartbeatDetails()` (Go) -- and resume from the last checkpoint instead of restarting from scratch. +- **Catching the wrong exception for cancellation.** Cancellation is SDK-specific: `ActivityCanceledException` (Java), `CancelledFailure` (TypeScript), `asyncio.CancelledError` (Python), or `ctx.Err()` returning `context.Canceled` (Go). + +## Related patterns + +- **[Saga Pattern](/design-patterns/saga-pattern)**: Compensating transactions with long-running steps. +- **[Polling](/design-patterns/polling)**: Heartbeating Activity for frequent polling. + +## Sample code + +### Java +- [Heartbeating Activity Batch](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/batch/heartbeatingactivity) -- Complete batch processing implementation. +- [Auto-Heartbeating](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/autoheartbeat) -- Automatic heartbeating via interceptor. + +### TypeScript +- [Activities Cancellation and Heartbeating](https://github.com/temporalio/samples-typescript/tree/main/activities-cancellation-heartbeating) -- Activity cancellation and heartbeat-based resumption. + +### Python +- [Hello Cancellation](https://github.com/temporalio/samples-python/blob/main/hello/hello_cancellation.py) -- Activity heartbeating with cancellation handling. +- [Custom Decorator Heartbeat](https://github.com/temporalio/samples-python/blob/main/custom_decorator/activity_utils.py) -- Automatic heartbeating via decorator. + +### Go +- [Cancellation](https://github.com/temporalio/samples-go/tree/main/cancellation) -- Workflow and Activity cancellation with heartbeating. 
diff --git a/docs/design-patterns/mapreduce-tree.mdx b/docs/design-patterns/mapreduce-tree.mdx new file mode 100644 index 0000000000..1fdbe2d4d5 --- /dev/null +++ b/docs/design-patterns/mapreduce-tree.mdx @@ -0,0 +1,436 @@ +--- +id: mapreduce-tree +title: "MapReduce Tree" +sidebar_label: "MapReduce Tree" +description: "Recursively splits a dataset into a binary tree of Child Workflows, processes leaves in parallel, then aggregates results back up the tree." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +:::info[TLDR] +Recursively split a record set into a tree of child Workflows — each node fans out to N sub-slices (two by default) — process every leaf in parallel, and signal results back up the tree to the root. Use this when you need **maximum throughput** for an embarrassingly parallel workload and downstream systems can absorb an unbounded burst of concurrent requests. +::: + +## Overview + +The MapReduce Tree pattern processes a large record set with maximum parallelism by recursively splitting it into smaller chunks and distributing each chunk to a child Workflow. Results are signalled back up the tree to the parent. It is best suited for embarrassingly parallel workloads where speed matters more than rate limiting. + +## Problem + +Both the [Batch Iterator](/design-patterns/batch-iterator) and [Sliding Window](/design-patterns/sliding-window) patterns bound concurrency, which limits throughput. When you need to process a large record set as fast as possible and downstream systems can handle the load, you want to fan out work across as many concurrent processors as possible without a fixed window. + +You also need a way to handle record sets larger than what a single Workflow's concurrency limits allow, without pre-partitioning data into fixed chunks before the job starts. + +## Solution + +A Node Workflow receives a slice of records. 
If the slice is small enough (at or below a configurable `leafThreshold`), it starts one Leaf Workflow per record. Otherwise it splits the slice into `n` sub-slices and starts `n` Node child Workflows recursively. + +Each Leaf Workflow runs the actual processing Activity and signals its result back to its parent Node. Each Node aggregates the results it receives and signals them up to its own parent. The Root Node returns the final aggregated result. + +```mermaid +flowchart TD + Records["📋 Full record set"] + Root["Root Node Workflow\n(depth=0)"] + Node1["Node Workflow\n(depth=1, chunk 1)"] + Node2["Node Workflow\n(depth=1, chunk 2)"] + L1["Leaf Workflow\n(record A)"] + L2["Leaf Workflow\n(record B)"] + L3["Leaf Workflow\n(record C)"] + L4["Leaf Workflow\n(record D)"] + L5["Leaf Workflow\n(record E)"] + L6["Leaf Workflow\n(record F)"] + + Records --> Root + Root -->|"split → chunk 1"| Node1 + Root -->|"split → chunk 2"| Node2 + + Node1 --> L1 + Node1 --> L2 + Node1 --> L3 + + Node2 --> L4 + Node2 --> L5 + Node2 --> L6 + + L1 -->|"Signal result"| Node1 + L2 -->|"Signal result"| Node1 + L3 -->|"Signal result"| Node1 + + L4 -->|"Signal result"| Node2 + L5 -->|"Signal result"| Node2 + L6 -->|"Signal result"| Node2 + + Node1 -->|"Signal result"| Root + Node2 -->|"Signal result"| Root +``` + +The following describes each step in the diagram: + +1. The Root Node Workflow receives the full record set and `depth=0`. +2. Because the record set is larger than `leafThreshold`, the Root splits it into N chunks and starts N child Node Workflows (two in this example, but the split factor is configurable). +3. Each Node Workflow receives its chunk and checks its size against `leafThreshold`. In this example, each chunk is small enough, so each Node starts one Leaf Workflow per record. +4. Each Leaf Workflow calls the `processRecord` Activity and, when complete, signals its result back to its parent Node using `signalExternalWorkflow`. +5. 
Each Node collects all leaf results via signal handlers, aggregates them, and signals the aggregated result back to the Root. +6. The Root collects both node results and returns the final aggregate. + +## Implementation + + +The following examples show how each SDK implements the MapReduce Tree pattern. + + + + +```typescript +// workflows.ts +import { + condition, + defineSignal, + getExternalWorkflowHandle, + proxyActivities, + setHandler, + startChild, + workflowInfo, +} from "@temporalio/workflow"; +import type * as activities from "./activities"; +import { TASK_QUEUE, LEAF_THRESHOLD, MAX_DEPTH } from "./shared"; + +const { processRecord } = proxyActivities({ + startToCloseTimeout: "30 seconds", +}); + +interface ResultPayload { + id: string; + results: string[]; +} + +export const resultSignal = defineSignal<[ResultPayload]>("leafResult"); + +export async function leafWorkflow( + record: string, + parentWorkflowId: string +): Promise { + const result = await processRecord(record); + // Signal result back to parent node. + const parent = getExternalWorkflowHandle(parentWorkflowId); + await parent.signal(resultSignal, { id: record, results: [result] }); +} + +export async function nodeWorkflow( + records: string[], + depth: number = 0, + parentWorkflowId: string = "" +): Promise { + if (depth > MAX_DEPTH) { + throw new Error(`Tree depth exceeded ${MAX_DEPTH}`); + } + + const myId = workflowInfo().workflowId; + const collectedResults: string[] = []; + let received = 0; + let expected = 0; + + setHandler(resultSignal, (payload: ResultPayload) => { + collectedResults.push(...payload.results); + received++; + }); + + if (records.length <= LEAF_THRESHOLD) { + // Start one leaf per record. + expected = records.length; + for (const record of records) { + void startChild(leafWorkflow, { + args: [record, myId], + workflowId: `${myId}/leaf-${record}`, + taskQueue: TASK_QUEUE, + }); + } + } else { + // Split and recurse. 
+ const mid = Math.floor(records.length / 2); + const chunks = [records.slice(0, mid), records.slice(mid)]; + expected = chunks.length; + for (let i = 0; i < chunks.length; i++) { + void startChild(nodeWorkflow, { + args: [chunks[i], depth + 1, myId], + workflowId: `${myId}/node-d${depth + 1}-${i}`, + taskQueue: TASK_QUEUE, + }); + } + } + + // Wait until all expected signals have arrived. + await condition(() => received >= expected); + + // Signal aggregated results up to parent (if this is not the root). + if (parentWorkflowId) { + const parent = getExternalWorkflowHandle(parentWorkflowId); + await parent.signal(resultSignal, { id: myId, results: collectedResults }); + } + + return collectedResults; +} +``` + + + + +```python +# workflows.py +import asyncio +from datetime import timedelta +from temporalio import workflow +from activities import process_record +from shared import TASK_QUEUE, LEAF_THRESHOLD, MAX_DEPTH + +RESULT_SIGNAL = "leafResult" + + +@workflow.defn +class LeafWorkflow: + @workflow.run + async def run(self, record: str, parent_workflow_id: str) -> None: + result = await workflow.execute_activity( + process_record, + record, + start_to_close_timeout=timedelta(seconds=30), + ) + handle = workflow.get_external_workflow_handle(parent_workflow_id) + await handle.signal(RESULT_SIGNAL, args=[record, result]) + + +@workflow.defn +class NodeWorkflow: + def __init__(self) -> None: + self._results: list[str] = [] + + @workflow.signal(name=RESULT_SIGNAL) + def leaf_result(self, record: str, result: str) -> None: + self._results.append(result) + + @workflow.run + async def run( + self, + records: list[str], + depth: int = 0, + parent_workflow_id: str = "", + ) -> list[str]: + if depth > MAX_DEPTH: + raise RuntimeError(f"Tree depth exceeded {MAX_DEPTH}") + + my_id = workflow.info().workflow_id + expected = 0 + + if len(records) <= LEAF_THRESHOLD: + for record in records: + await workflow.start_child_workflow( + LeafWorkflow.run, + args=[record, my_id], + 
id=f"{my_id}/leaf-{record}", + task_queue=TASK_QUEUE, + ) + expected = len(records) + else: + mid = len(records) // 2 + chunks = [records[:mid], records[mid:]] + for i, chunk in enumerate(chunks): + await workflow.start_child_workflow( + NodeWorkflow.run, + args=[chunk, depth + 1, my_id], + id=f"{my_id}/node-d{depth+1}-{i}", + task_queue=TASK_QUEUE, + ) + expected = len(chunks) + + await workflow.wait_condition(lambda: len(self._results) >= expected) + + if parent_workflow_id: + handle = workflow.get_external_workflow_handle(parent_workflow_id) + await handle.signal(RESULT_SIGNAL, args=[my_id, ",".join(self._results)]) + + return self._results +``` + + + + +```go +// workflows.go +package main + +import ( + "fmt" + "strings" + "time" + + "go.temporal.io/sdk/workflow" +) + +const ResultSignal = "leafResult" + +type ResultPayload struct { + ID string + Result string +} + +func LeafWorkflow(ctx workflow.Context, record string, parentWorkflowID string) error { + ao := workflow.ActivityOptions{StartToCloseTimeout: 30 * time.Second} + ctx = workflow.WithActivityOptions(ctx, ao) + + var result string + if err := workflow.ExecuteActivity(ctx, ProcessRecord, record).Get(ctx, &result); err != nil { + return err + } + + return workflow.SignalExternalWorkflow(ctx, parentWorkflowID, "", ResultSignal, + ResultPayload{ID: record, Result: result}).Get(ctx, nil) +} + +func NodeWorkflow(ctx workflow.Context, records []string, depth int, parentWorkflowID string) ([]string, error) { + if depth > MaxDepth { + return nil, fmt.Errorf("tree depth exceeded %d", MaxDepth) + } + + myID := workflow.GetInfo(ctx).WorkflowExecution.ID + resultCh := workflow.GetSignalChannel(ctx, ResultSignal) + + var results []string + expected := 0 + + if len(records) <= LeafThreshold { + for _, record := range records { + cwo := workflow.ChildWorkflowOptions{ + WorkflowID: myID + "/leaf-" + record, + TaskQueue: TaskQueue, + } + workflow.ExecuteChildWorkflow(workflow.WithChildOptions(ctx, cwo), LeafWorkflow, record, 
myID) + expected++ + } + } else { + mid := len(records) / 2 + chunks := [][]string{records[:mid], records[mid:]} + for i, chunk := range chunks { + cwo := workflow.ChildWorkflowOptions{ + WorkflowID: fmt.Sprintf("%s/node-d%d-%d", myID, depth+1, i), + TaskQueue: TaskQueue, + } + workflow.ExecuteChildWorkflow(workflow.WithChildOptions(ctx, cwo), NodeWorkflow, chunk, depth+1, myID) + expected++ + } + } + + for i := 0; i < expected; i++ { + var payload ResultPayload + resultCh.Receive(ctx, &payload) + results = append(results, payload.Result) + } + + if parentWorkflowID != "" { + err := workflow.SignalExternalWorkflow(ctx, parentWorkflowID, "", ResultSignal, + ResultPayload{ID: myID, Result: strings.Join(results, ",")}).Get(ctx, nil) + if err != nil { + return results, err + } + } + + return results, nil +} +``` + + + + +```java +// NodeWorkflow.java +import io.temporal.workflow.*; +import java.util.*; + +@WorkflowInterface +public interface NodeWorkflow { + @WorkflowMethod + List run(List records, int depth, String parentWorkflowId); + + @SignalMethod + void leafResult(String id, String result); +} + +// NodeWorkflowImpl.java +public class NodeWorkflowImpl implements NodeWorkflow { + private final List results = new ArrayList<>(); + + @Override + public void leafResult(String id, String result) { + results.add(result); + } + + @Override + public List run(List records, int depth, String parentWorkflowId) { + if (depth > Shared.MAX_DEPTH) { + throw new RuntimeException("Tree depth exceeded " + Shared.MAX_DEPTH); + } + + String myId = Workflow.getInfo().getWorkflowId(); + int expected; + + if (records.size() <= Shared.LEAF_THRESHOLD) { + for (String record : records) { + ChildWorkflowOptions opts = ChildWorkflowOptions.newBuilder() + .setWorkflowId(myId + "/leaf-" + record) + .setTaskQueue(Shared.TASK_QUEUE) + .build(); + LeafWorkflow leaf = Workflow.newChildWorkflowStub(LeafWorkflow.class, opts); + Async.procedure(leaf::run, record, myId); + } + expected = 
records.size(); + } else { + int mid = records.size() / 2; + List> chunks = List.of(records.subList(0, mid), records.subList(mid, records.size())); + for (int i = 0; i < chunks.size(); i++) { + ChildWorkflowOptions opts = ChildWorkflowOptions.newBuilder() + .setWorkflowId(String.format("%s/node-d%d-%d", myId, depth + 1, i)) + .setTaskQueue(Shared.TASK_QUEUE) + .build(); + NodeWorkflow child = Workflow.newChildWorkflowStub(NodeWorkflow.class, opts); + Async.function(child::run, chunks.get(i), depth + 1, myId); + } + expected = chunks.size(); + } + + Workflow.await(() -> results.size() >= expected); + + if (parentWorkflowId != null && !parentWorkflowId.isEmpty()) { + ExternalWorkflowStub parent = Workflow.newUntypedExternalWorkflowStub(parentWorkflowId, ""); + parent.signal("leafResult", myId, String.join(",", results)); + } + + return results; + } +} +``` + + + + +## Best Practices + +- **Set a `leafThreshold` to control tree depth.** A threshold of 3–10 records per leaf is typical. Too small a threshold creates excessive Workflow overhead; too large prevents full parallelism. +- **Set a `MAX_DEPTH` guard.** Recursive fan-out without a depth limit can produce extremely deep trees for large record sets. Fail fast if depth exceeds your expected maximum (e.g. `log2(totalRecords / leafThreshold) + 2`). +- **Avoid external writes in Node Workflows.** Node Workflows only aggregate results from children. Leaf Workflows perform the actual work. Keeping the roles separate prevents duplicate external writes if a Node is retried. +- **Use signals for result aggregation, not return values.** A parent cannot directly await a child started in a previous Workflow run. Signals decouple the result delivery from the parent-child lifetime, making the pattern resilient to replays. 
+- **Skip the reduce phase if results are not needed.** If you only need the side effects of processing each record (writes to a database, messages sent), omit the signal-back entirely and set `PARENT_CLOSE_POLICY_ABANDON` on all children. +- **Consider replacing Leaf Workflows with Activities for simpler workloads.** Leaf Workflows give each record its own history, independent cancellation, and dedicated visibility in the UI — useful when per-record observability matters. If those properties are not required, executing the `processRecord` Activity directly from a Node Workflow reduces overhead: each Leaf Workflow start and completion adds roughly 3 extra history events to the Node's history compared to a direct Activity call. + +## Common Pitfalls + +- **Thundering herd.** The MapReduce Tree fans out exponentially. For large record sets, all leaf Activities start nearly simultaneously. Ensure your downstream system can absorb the burst, or switch to [Sliding Window](/design-patterns/sliding-window) for rate limiting. +- **Signal storms.** If thousands of leaves all signal a single Node at the same time, the Node's signal queue can become a bottleneck. A two-level tree (Root → Nodes → Leaves) distributes this load; a deeper tree helps even more. +- **History bloat in the Root Workflow.** Each child start and signal received adds events to the Root's history. For very large record sets, consider adding an extra tree level to keep the Root from receiving too many direct signals. +- **Attempting external/downstream writes from Node Workflows.** Nodes may be retried. Any external write in a Node Workflow will be executed multiple times. Keep all side effects in Leaf Workflows (or Activities called by Leaves). 
+ +## Related Resources + +- [Fan-Out with Child Workflows](/design-patterns/fanout-child-workflows) — simpler flat fan-out for smaller record sets +- [Sliding Window](/design-patterns/sliding-window) — bounded concurrency with rate limiting +- [Child Workflows pattern](/design-patterns/child-workflows) — core concepts for parent/child coordination +- [Temporal limits reference](https://docs.temporal.io/cloud/limits) diff --git a/docs/design-patterns/non-retryable-errors.mdx b/docs/design-patterns/non-retryable-errors.mdx new file mode 100644 index 0000000000..80b9bef1bb --- /dev/null +++ b/docs/design-patterns/non-retryable-errors.mdx @@ -0,0 +1,487 @@ +--- +id: non-retryable-errors +title: "Non-Retryable Errors" +sidebar_label: "Non-Retryable Errors" +description: "Mark error types that will never succeed so Temporal fails fast instead of retrying indefinitely." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +:::info[TLDR] +Raise a non-retryable `ApplicationError` from the Activity — or list error types in the `RetryPolicy` — so **the Temporal Activity fails fast instead of retrying indefinitely**. Use this for permanent failures such as invalid input, missing records, or authorization errors where repeating the same call will never succeed. +::: + +## Overview + +The Non-Retryable Errors pattern marks specific error types so Temporal stops retrying immediately when one is raised. +Use it for failures where the root cause is structural — invalid input, a missing record, an authorization problem — where repeating the same call will never produce a different result. + +## Problem + +Temporal retries all Activity failures by default. +For transient infrastructure errors such as network timeouts or service restarts, this is the right behavior. +But some failures are permanent: no amount of retrying will fix them. 
+ +Retrying a permanent failure wastes time and resources: + +- A transfer to a non-existent account number will fail on attempt 1, 2, and 3 in exactly the same way. +- An API call with a malformed request body will be rejected every time. +- A request from a revoked API key will receive an authorization error on every attempt. + +With the default unlimited retry policy, Temporal keeps retrying the Activity with exponential backoff — for minutes, hours, or indefinitely if no Schedule-To-Close timeout is set — before the error ever reaches the Workflow, when it could have failed in milliseconds. + +## Solution + +Raise a non-retryable error from inside the Activity when the failure is known to be permanent. +Temporal detects the error type and skips all remaining retries, delivering the failure to the Workflow immediately. + +There are two complementary mechanisms: + +1. **Mark the error as non-retryable at the throw site** — the Activity explicitly signals that this specific failure should not be retried. +2. **Register non-retryable error types in the `RetryPolicy`** — the Workflow declares which error type names should never be retried, regardless of how the Activity raises them. + +Both mechanisms can be used together. + +```mermaid +flowchart TD + Workflow -->|Schedule Activity| Activity + Activity -->|Raises error| Check{Is error type\nnon-retryable?} + Check -->|Yes — marked at throw site\nor listed in RetryPolicy| Fail([Deliver ActivityError\nto Workflow immediately]) + Check -->|No| Retry[Schedule retry\nwith backoff] + Retry --> Activity + Fail --> Handle[Workflow handles error:\nlog, compensate, or escalate] +``` + +The following describes each path: + +1. The Activity raises an error. Temporal inspects whether the error type is non-retryable. +2. If non-retryable — either because the Activity flagged it or the `RetryPolicy` lists the type — Temporal delivers the `ActivityError` to the Workflow without delay. +3. If retryable, Temporal schedules another attempt after the configured backoff. +4. 
The Workflow catches the `ActivityError` and handles it according to the business logic. + +## Implementation + + +### Marking an error as non-retryable at the throw site + +Use the SDK's `ApplicationError` (or equivalent) with the non-retryable flag. +Temporal propagates the error type name and the flag to the Workflow without retrying. + + + + +```python +# activities.py +from temporalio import activity +from temporalio.exceptions import ApplicationError + +@activity.defn +async def process_order(order_id: str) -> str: + response = await http_client.post(f"/orders/{order_id}/process") + if response.status_code == 404: + raise ApplicationError( + f"Order {order_id} not found", + type="OrderNotFoundError", + non_retryable=True, + ) + if response.status_code == 422: + raise ApplicationError( + f"Order {order_id} rejected: {response.json().get('detail', 'validation error')}", + type="ValidationError", + non_retryable=True, + ) + + response.raise_for_status() + return response.json()["confirmation_id"] +``` + + + + +```go +// activities.go +package orders + +import ( + "context" + "fmt" + + "go.temporal.io/sdk/temporal" +) + +func ProcessOrder(ctx context.Context, orderID string) (string, error) { + resp, err := httpClient.Post(fmt.Sprintf("/orders/%s/process", orderID)) + if err != nil { + return "", err + } + if resp.StatusCode == 404 { + return "", temporal.NewNonRetryableApplicationError( + fmt.Sprintf("order %s not found", orderID), + "OrderNotFoundError", + nil, + ) + } + if resp.StatusCode == 422 { + return "", temporal.NewNonRetryableApplicationError( + fmt.Sprintf("order %s rejected: %s", orderID, resp.ErrorDetail), + "ValidationError", + nil, + ) + } + return resp.ConfirmationID, nil +} +``` + + + + +```java +// ProcessOrderActivityImpl.java +import io.temporal.failure.ApplicationFailure; + +public class ProcessOrderActivityImpl implements ProcessOrderActivity { + @Override + public String processOrder(String orderId) { + HttpResponse response = 
httpClient.post("/orders/" + orderId + "/process"); + if (response.getStatusCode() == 404) { + throw ApplicationFailure.newNonRetryableFailure( + "Order " + orderId + " not found", + "OrderNotFoundError" + ); + } + if (response.getStatusCode() == 422) { + throw ApplicationFailure.newNonRetryableFailure( + "Order " + orderId + " rejected: " + response.getErrorDetail(), + "ValidationError" + ); + } + return response.getConfirmationId(); + } +} +``` + + + + +```typescript +// activities.ts +import { ApplicationFailure } from '@temporalio/activity'; + +export async function processOrder(orderId: string): Promise { + const response = await fetch(`/orders/${orderId}/process`, { method: 'POST' }); + if (response.status === 404) { + throw ApplicationFailure.nonRetryable( + `Order ${orderId} not found`, + 'OrderNotFoundError', + ); + } + if (response.status === 422) { + const body = await response.json(); + throw ApplicationFailure.nonRetryable( + `Order ${orderId} rejected: ${body.detail ?? 'validation error'}`, + 'ValidationError', + ); + } + + if (!response.ok) throw new Error(`API error ${response.status}`); + const body = await response.json(); + return body.confirmation_id; +} +``` + + + + +### Declaring non-retryable types in the RetryPolicy + +Alternatively, list error type names in the `RetryPolicy` at the Workflow call site. +Temporal stops retrying when the Activity raises an error whose type matches any name in the list. +This approach separates the retry decision from the Activity code, which is useful when the Activity is shared and the non-retryable classification depends on the caller's context. 
+ +The Activity raises a standard `ApplicationError` with a type name but without the non-retryable flag — the retry decision is delegated to the Workflow's `RetryPolicy`: + + + + +```python +# activities.py +@activity.defn +async def process_order(order_id: str) -> str: + response = await http_client.post(f"/orders/{order_id}/process") + if response.status_code == 404: + # No non_retryable=True — the RetryPolicy in the Workflow controls retry behavior + raise ApplicationError(f"Order {order_id} not found", type="OrderNotFoundError") + if response.status_code == 422: + raise ApplicationError( + f"Order {order_id} rejected: {response.json().get('detail', 'validation error')}", + type="ValidationError", + ) + response.raise_for_status() + return response.json()["confirmation_id"] +``` + + + + +```go +// activities.go +func ProcessOrder(ctx context.Context, orderID string) (string, error) { + resp, err := httpClient.Post(fmt.Sprintf("/orders/%s/process", orderID)) + if err != nil { + return "", err + } + if resp.StatusCode == 404 { + // Use NewApplicationError, not NewNonRetryableApplicationError + return "", temporal.NewApplicationError( + fmt.Sprintf("order %s not found", orderID), "OrderNotFoundError", + ) + } + if resp.StatusCode == 422 { + return "", temporal.NewApplicationError( + fmt.Sprintf("order %s rejected: %s", orderID, resp.ErrorDetail), "ValidationError", + ) + } + return resp.ConfirmationID, nil +} +``` + + + + +```java +// ProcessOrderActivityImpl.java +public String processOrder(String orderId) { + HttpResponse response = httpClient.post("/orders/" + orderId + "/process"); + if (response.getStatusCode() == 404) { + // Use newFailure, not newNonRetryableFailure + throw ApplicationFailure.newFailure("Order " + orderId + " not found", "OrderNotFoundError"); + } + if (response.getStatusCode() == 422) { + throw ApplicationFailure.newFailure( + "Order " + orderId + " rejected: " + response.getErrorDetail(), "ValidationError"); + } + return 
response.getConfirmationId(); +} +``` + + + + +```typescript +// activities.ts +export async function processOrder(orderId: string): Promise { + const response = await fetch(`/orders/${orderId}/process`, { method: 'POST' }); + if (response.status === 404) { + // Use ApplicationFailure.create, not .nonRetryable + throw ApplicationFailure.create({ message: `Order ${orderId} not found`, type: 'OrderNotFoundError' }); + } + if (response.status === 422) { + const body = await response.json(); + throw ApplicationFailure.create({ + message: `Order ${orderId} rejected: ${body.detail ?? 'validation error'}`, + type: 'ValidationError', + }); + } + if (!response.ok) throw new Error(`API error ${response.status}`); + const body = await response.json(); + return body.confirmation_id; +} +``` + + + + +The Workflow lists which error type names to never retry: + + + + +```python +# workflows.py +from datetime import timedelta +from temporalio import workflow +from temporalio.common import RetryPolicy +import activities + +@workflow.defn +class OrderWorkflow: + @workflow.run + async def run(self, order_id: str) -> str: + return await workflow.execute_activity( + activities.process_order, + order_id, + start_to_close_timeout=timedelta(seconds=10), + retry_policy=RetryPolicy( + non_retryable_error_types=["OrderNotFoundError", "ValidationError"], + ), + ) +``` + + + + +```go +// workflow.go +ao := workflow.ActivityOptions{ + StartToCloseTimeout: 10 * time.Second, + RetryPolicy: &temporal.RetryPolicy{ + NonRetryableErrorTypes: []string{"OrderNotFoundError", "ValidationError"}, + }, +} +ctx = workflow.WithActivityOptions(ctx, ao) +``` + + + + +```java +// OrderWorkflowImpl.java +private final ProcessOrderActivity activities = Workflow.newActivityStub( + ProcessOrderActivity.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(10)) + .setRetryOptions(RetryOptions.newBuilder() + .setDoNotRetry("OrderNotFoundError", "ValidationError") + .build()) + .build() 
+); +``` + + + + +```typescript +// workflows.ts +const { processOrder } = wf.proxyActivities({ + startToCloseTimeout: '10s', + retry: { + nonRetryableErrorTypes: ['OrderNotFoundError', 'ValidationError'], + }, +}); +``` + + + + +### Handling non-retryable errors in the Workflow + +Catch the `ActivityError` in the Workflow to distinguish between permanent failures and transient ones. +Inspect the underlying cause to route to the appropriate compensation or escalation path. + + + + +```python +# workflows.py +from temporalio.exceptions import ActivityError, ApplicationError + +@workflow.defn +class OrderWorkflow: + @workflow.run + async def run(self, order_id: str) -> str: + try: + return await workflow.execute_activity( + activities.process_order, + order_id, + start_to_close_timeout=timedelta(seconds=10), + retry_policy=RetryPolicy( + non_retryable_error_types=["OrderNotFoundError", "ValidationError"], + ), + ) + except ActivityError as e: + cause = e.__cause__ + if isinstance(cause, ApplicationError): + if cause.type == "ValidationError": + return f"Order rejected: {cause}" + if cause.type == "OrderNotFoundError": + return f"Order not found: {cause}" + raise +``` + + + + +```go +// workflow.go +var result string +err := workflow.ExecuteActivity(ctx, ProcessOrder, orderID).Get(ctx, &result) +if err != nil { + var appErr *temporal.ApplicationError + if errors.As(err, &appErr) { + switch appErr.Type() { + case "ValidationError": + return "", fmt.Errorf("order rejected: %w", appErr) + case "OrderNotFoundError": + return "", fmt.Errorf("order not found: %w", appErr) + } + } + return "", err +} +``` + + + + +```java +// OrderWorkflowImpl.java +try { + return activities.processOrder(orderId); +} catch (ActivityFailure e) { + if (e.getCause() instanceof ApplicationFailure appFailure) { + switch (appFailure.getType()) { + case "ValidationError" -> { return "Order rejected: " + appFailure.getMessage(); } + case "OrderNotFoundError" -> { return "Order not found: " + 
appFailure.getMessage(); } + } + } + throw e; +} +``` + + + + +```typescript +// workflows.ts +try { + return await processOrder(orderId); +} catch (err) { + if (err instanceof wf.ActivityFailure && err.cause instanceof wf.ApplicationFailure) { + if (err.cause.type === 'ValidationError') { + return `Order rejected: ${err.cause.message}`; + } + if (err.cause.type === 'OrderNotFoundError') { + return `Order not found: ${err.cause.message}`; + } + } + throw err; +} +``` + + + + +## Best practices + +- **Validate input before scheduling the Activity.** If the Workflow can detect invalid input upfront — using an `Update` validator or by inspecting the input data — fail fast in the Workflow rather than paying the cost of an Activity execution. +- **Use specific error type names.** Generic names like `"Error"` or `"Failure"` match too broadly. Use domain-specific names like `"OrderNotFoundError"` or `"InsufficientFundsError"` so the Workflow can distinguish between failure causes. +- **Reserve non-retryable for truly permanent failures.** A rate-limit error (HTTP 429) is transient — the same call will succeed after a delay. A not-found error (HTTP 404) is typically permanent. Match the non-retryable classification to the nature of the error. +- **Combine both mechanisms for defence in depth.** Mark the error as non-retryable at the throw site so the Activity is self-describing, and also list the type in the `RetryPolicy` so the classification is enforced even if the Activity code changes. + +## Common pitfalls + +- **Marking transient errors as non-retryable.** Network timeouts and service unavailability are transient. Marking them non-retryable removes Temporal's ability to recover automatically. +- **Using the error message instead of a type name.** `RetryPolicy.NonRetryableErrorTypes` matches on type names, not message strings. Without a type name, the policy cannot identify the error. 
+- **Swallowing the `ActivityError` without logging.** Because non-retryable errors fail fast, a `catch` block that discards the error hides the failure entirely. Always log the failure before re-raising or returning an error result. +- **Confusing non-retryable errors with Workflow failures.** A non-retryable error fails the Activity and is delivered to the Workflow as an `ActivityError`. The Workflow itself fails only if it lets that error propagate — by not catching it, or by re-raising it after catching it. + +## Related patterns + +- [Fixed Count of Retries](/design-patterns/fixed-count-retries): Limit retries for transient errors that are worth retrying a bounded number of times. +- [Resumable Activity](/design-patterns/resumable-activity): Park the Workflow and accept a corrected input via Signal when retries are exhausted. +- [Error Handling & Retry Patterns](/design-patterns/error-handling-patterns): Overview and decision tree for all retry patterns. + +## References + +- [Temporal Retry Policies](https://docs.temporal.io/encyclopedia/retry-policies) +- [Failure Handling in Practice](https://temporal.io/blog/failure-handling-in-practice) diff --git a/docs/design-patterns/parallel-execution.mdx b/docs/design-patterns/parallel-execution.mdx new file mode 100644 index 0000000000..251e048df3 --- /dev/null +++ b/docs/design-patterns/parallel-execution.mdx @@ -0,0 +1,537 @@ +--- +id: parallel-execution +title: "Parallel Execution" +sidebar_label: "Parallel Execution" +description: "Executes multiple Activities concurrently for maximum throughput with error handling and controlled parallelism." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +The Parallel Execution pattern enables concurrent execution of multiple Activities or Child Workflows to maximize throughput and minimize total execution time. +Using Temporal's async APIs, you can launch multiple operations asynchronously and wait for their completion. 
+ +## Problem + +In sequential execution, operations run one after another, causing unnecessary delays when multiple independent operations could run simultaneously. +Total execution time equals the sum of all operation durations, resources sit idle while waiting, and batch processing takes hours when it could take minutes. + +Without parallel execution, you must accept slow sequential processing, implement complex threading or async logic manually, risk inconsistent state management across threads, and handle thread safety and synchronization issues. + +## Solution + +Each SDK provides its own mechanism for launching Activities concurrently and waiting for the results: + +- **Java**: `Async.function()` schedules Activities that return `Promise` objects. `Promise.allOf()` waits for all of them. +- **TypeScript**: Activity proxy functions return native `Promise` objects. `Promise.all()` waits for all of them. +- **Python**: `workflow.execute_activity()` returns awaitables. `asyncio.gather()` waits for all of them. +- **Go**: `workflow.ExecuteActivity()` returns `Future` objects. You call `.Get()` on each Future to collect results. + +```mermaid +sequenceDiagram + participant Workflow + participant Activity1 + participant Activity2 + participant Activity3 + + Workflow->>+Activity1: Start async + Workflow->>+Activity2: Start async + Workflow->>+Activity3: Start async + Note over Workflow: Returns immediately with Futures/Promises + + par Parallel Execution + Activity1->>Activity1: Execute + Activity2->>Activity2: Execute + Activity3->>Activity3: Execute + end + + Activity1-->>-Workflow: Result 1 + Activity2-->>-Workflow: Result 2 + Activity3-->>-Workflow: Result 3 + + Workflow->>Workflow: Await all results + Note over Workflow: Collect all results +``` + +The following describes each step in the diagram: + +1. The Workflow starts three Activities asynchronously, which returns immediately with Futures or Promises. +2. 
All three Activities execute in parallel on available Workers. +3. As each Activity completes, its Future or Promise resolves with the result. +4. The Workflow waits until all results are available, then collects them. + +## Implementation + +### Basic parallel Activities + +The following implementation starts one Activity per item in a list and waits for all of them to complete: + + + + +```python +# workflows.py +import asyncio +from datetime import timedelta +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities import process + +@workflow.defn +class ParallelWorkflow: + @workflow.run + async def run(self, items: list[str]) -> list[str]: + results = await asyncio.gather( + *[ + workflow.execute_activity( + process, + item, + start_to_close_timeout=timedelta(seconds=30), + ) + for item in items + ] + ) + return list(results) +``` + + + + +```go +// parallel_workflow.go +func ProcessInParallel(ctx workflow.Context, items []string) ([]string, error) { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Second, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + futures := make([]workflow.Future, len(items)) + for i, item := range items { + futures[i] = workflow.ExecuteActivity(ctx, Process, item) + } + + results := make([]string, len(items)) + for i, future := range futures { + if err := future.Get(ctx, &results[i]); err != nil { + return nil, err + } + } + return results, nil +} +``` + + + + +```java +// ParallelWorkflowImpl.java +@WorkflowInterface +public interface ParallelWorkflow { + @WorkflowMethod + List processInParallel(List items); +} + +public class ParallelWorkflowImpl implements ParallelWorkflow { + private final ProcessingActivity activity = + Workflow.newActivityStub(ProcessingActivity.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(30)) + .build()); + + @Override + public List processInParallel(List items) { + List> promises = new ArrayList<>(); + + for 
(String item : items) { + Promise promise = Async.function(activity::process, item); + promises.add(promise); + } + + Promise.allOf(promises).get(); + return promises.stream().map(Promise::get).collect(Collectors.toList()); + } +} +``` + + + + +```typescript +// workflows.ts +import { proxyActivities } from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { process } = proxyActivities({ + startToCloseTimeout: '30s', +}); + +export async function processInParallel(items: string[]): Promise { + const promises = items.map((item) => process(item)); + return await Promise.all(promises); +} +``` + + + + +Each SDK schedules all Activities before waiting for any results. +In Java, `Async.function()` returns a `Promise`; in TypeScript, calling the activity proxy without `await` returns a native `Promise`; in Python, `workflow.execute_activity()` returns an awaitable; and in Go, `workflow.ExecuteActivity()` returns a `Future`. +The Workflow then waits for all of them to complete and collects the results. 
+ +### Controlled parallelism + +The following implementation limits the number of concurrent Activities by processing items in batches: + + + + +```python +# workflows.py +import asyncio +from datetime import timedelta +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities import process + +@workflow.defn +class BatchWorkflow: + @workflow.run + async def run(self, items: list[str], max_parallel: int) -> list[str]: + results: list[str] = [] + + for i in range(0, len(items), max_parallel): + batch = items[i : i + max_parallel] + batch_results = await asyncio.gather( + *[ + workflow.execute_activity( + process, + item, + start_to_close_timeout=timedelta(seconds=30), + ) + for item in batch + ] + ) + results.extend(batch_results) + + return results +``` + + + + +```go +// batch_workflow.go +func ProcessBatch(ctx workflow.Context, items []string, maxParallel int) ([]string, error) { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Second, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + var results []string + + for i := 0; i < len(items); i += maxParallel { + end := i + maxParallel + if end > len(items) { + end = len(items) + } + batch := items[i:end] + + futures := make([]workflow.Future, len(batch)) + for j, item := range batch { + futures[j] = workflow.ExecuteActivity(ctx, Process, item) + } + + for _, future := range futures { + var result string + if err := future.Get(ctx, &result); err != nil { + return nil, err + } + results = append(results, result) + } + } + + return results, nil +} +``` + + + + +```java +// BatchWorkflowImpl.java +public class BatchWorkflowImpl implements BatchWorkflow { + private final ProcessingActivity activity = + Workflow.newActivityStub(ProcessingActivity.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(30)) + .build()); + + @Override + public BatchResult processBatch(List items, int maxParallel) { + List results = new ArrayList<>(); + 
+ for (int i = 0; i < items.size(); i += maxParallel) { + int end = Math.min(i + maxParallel, items.size()); + List batch = items.subList(i, end); + + List> promises = batch.stream() + .map(item -> Async.function(activity::process, item)) + .collect(Collectors.toList()); + + Promise.allOf(promises).get(); + results.addAll(promises.stream().map(Promise::get).collect(Collectors.toList())); + } + + return new BatchResult(results); + } +} +``` + + + + +```typescript +// workflows.ts +import { proxyActivities } from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { process } = proxyActivities({ + startToCloseTimeout: '30s', +}); + +export async function processBatch( + items: string[], + maxParallel: number +): Promise { + const results: string[] = []; + + for (let i = 0; i < items.length; i += maxParallel) { + const batch = items.slice(i, i + maxParallel); + const batchResults = await Promise.all(batch.map((item) => process(item))); + results.push(...batchResults); + } + + return results; +} +``` + + + + +The Workflow processes items in chunks of `maxParallel`. +Each chunk runs in parallel, and the Workflow waits for the entire chunk to complete before starting the next one. +This prevents overwhelming Workers or external services. 
+ +### Error handling + +The following implementation wraps each Activity in error handling so that individual failures do not prevent other Activities from completing: + + + + +```python +# workflows.py +import asyncio +from dataclasses import dataclass +from datetime import timedelta +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities import process + +@dataclass +class Result: + item: str + output: str | None = None + error: str | None = None + +@workflow.defn +class ResilientParallelWorkflow: + @workflow.run + async def run(self, items: list[str]) -> list[Result]: + tasks = [ + workflow.execute_activity( + process, + item, + start_to_close_timeout=timedelta(seconds=30), + ) + for item in items + ] + outcomes = await asyncio.gather(*tasks, return_exceptions=True) + + results: list[Result] = [] + for item, outcome in zip(items, outcomes): + if isinstance(outcome, BaseException): + results.append(Result(item=item, error=str(outcome))) + else: + results.append(Result(item=item, output=outcome)) + return results +``` + + + + +```go +// resilient_parallel_workflow.go +func ProcessWithErrorHandling(ctx workflow.Context, items []string) ([]Result, error) { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Second, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + futures := make([]workflow.Future, len(items)) + for i, item := range items { + futures[i] = workflow.ExecuteActivity(ctx, Process, item) + } + + results := make([]Result, len(items)) + for i, future := range futures { + var output string + if err := future.Get(ctx, &output); err != nil { + results[i] = Result{Item: items[i], Error: err.Error()} + } else { + results[i] = Result{Item: items[i], Output: output} + } + } + return results, nil +} +``` + + + + +```java +// ResilientParallelWorkflowImpl.java +public class ResilientParallelWorkflowImpl implements ParallelWorkflow { + + @Override + public ProcessingReport processWithErrorHandling(List 
items) { + List> promises = new ArrayList<>(); + + for (String item : items) { + Promise promise = Async.function(() -> { + try { + return activity.process(item); + } catch (Exception e) { + return Result.failed(item, e.getMessage()); + } + }); + promises.add(promise); + } + + Promise.allOf(promises).get(); + + List results = promises.stream().map(Promise::get).collect(Collectors.toList()); + return new ProcessingReport(results); + } +} +``` + + + + +```typescript +// workflows.ts +import { proxyActivities } from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { process } = proxyActivities({ + startToCloseTimeout: '30s', +}); + +interface Result { + item: string; + output?: string; + error?: string; +} + +export async function processWithErrorHandling(items: string[]): Promise { + const settled = await Promise.allSettled(items.map((item) => process(item))); + + return settled.map((outcome, i) => { + if (outcome.status === 'fulfilled') { + return { item: items[i], output: outcome.value }; + } + return { item: items[i], error: String(outcome.reason) }; + }); +} +``` + + + + +Each Activity handles its own errors so that the Workflow can collect results from all Activities, including those that failed. +In TypeScript, `Promise.allSettled()` is especially convenient for this pattern. +In Python, `asyncio.gather()` with `return_exceptions=True` captures errors alongside successes. +In Go, each `Future.Get()` call is checked individually for errors. + +## When to use + +The Parallel Execution pattern is a good fit for processing independent items in batches, calling multiple external services simultaneously, fan-out/fan-in patterns, parallel data transformations, concurrent API requests, and multi-step pipelines with independent stages. 
+ +It is not a good fit for operations with dependencies between them, resource-constrained environments (use controlled parallelism), operations requiring strict ordering, or a single fast operation (the overhead is not worth it). + +## Benefits and trade-offs + +Parallel execution reduces total execution time dramatically and maximizes Worker and external service usage. +Temporal handles retries and failures per operation, and you do not need manual thread management or synchronization. + +Parallelism also has costs: more concurrent operations require more Worker capacity, +error handling across parallel operations is harder, +tracing becomes more difficult, +external services can be overwhelmed without throttling, and holding many Futures or Promises consumes Workflow memory. + +## Comparison with alternatives + +| Approach | Parallelism | Complexity | Control | Use case | +| :--- | :--- | :--- | :--- | :--- | +| Async Activities | High | Low | Medium | Independent operations | +| Sequential | None | Very Low | Full | Dependent operations | +| Child Workflows | High | Medium | High | Complex sub-processes | +| Continue-As-New | None | Medium | Full | Large iterations | + +## Best practices + +- **Limit concurrency.** Use batching to avoid overwhelming Workers or external services. +- **Handle failures.** Wrap operations in error handling or use Activity retry policies. +- **Set timeouts.** Configure appropriate Activity timeouts for parallel operations. +- **Monitor resources.** Ensure sufficient Workers for the desired parallelism. +- **Aggregate carefully.** Consider memory when collecting large result sets. +- **Use Child Workflows.** Prefer them for complex parallel operations that carry their own state. +- **Test scalability.** Verify performance with realistic parallel loads. +- **Rate limit.** Implement throttling for external API calls. +- **Support partial results.** Consider returning partial results on some failures. 
+- **Avoid premature blocking.** Schedule all Activities before waiting for any results. + +## Common pitfalls + +- **Exceeding the pending Activities limit.** A single Workflow Execution can have at most 2,000 pending (concurrently running) Activities. Scheduling more causes Workflow Task failures. Batch Activities or use Child Workflows for higher concurrency. +- **Ignoring errors from individual Activities.** Waiting for all results (e.g., `Promise.allOf()` in Java, `Promise.all()` in TypeScript, `asyncio.gather()` in Python) fails on the first error by default. If you need partial results, catch errors inside each async function or use `Promise.allSettled()` / `return_exceptions=True` / per-Future error checking. +- **Blowing the 4 MB gRPC message limit.** Scheduling hundreds of Activities in a single Workflow Task can exceed the 4 MB gRPC message size limit if their combined inputs are large. Schedule them in batches so the commands are spread across multiple Workflow Tasks. +- **Not using Continue-As-New for large fan-outs.** Each Activity adds several events (scheduled, started, and completed or failed) to the Event History, so fan-outs that run into the thousands of Activities can quickly approach the 50K event limit. Use Continue-As-New or Child Workflows to partition work. + +## Related patterns + +- **[Child Workflows](/design-patterns/child-workflows)**: For complex parallel operations with their own state. +- **[Saga Pattern](/design-patterns/saga-pattern)**: Parallel operations with compensation. + +## Sample code + +**Java:** +- [HelloParallelActivity](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/hello/HelloParallelActivity.java) — Basic parallel Activity execution. +- [HelloAsync](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/hello/HelloAsync.java) — Async execution with Promises. +- [Sliding Window Batch](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/batch/slidingwindow) — Controlled parallel Child Workflows. 
+ +**TypeScript:** +- [activities-examples](https://github.com/temporalio/samples-typescript/tree/main/activities-examples) — Activity patterns including parallel execution with `Promise.all()`. + +**Python:** +- [hello_parallel_activity](https://github.com/temporalio/samples-python/blob/main/hello/hello_parallel_activity.py) — Basic parallel Activity execution with `asyncio.gather()`. + +**Go:** +- [splitmerge-future](https://github.com/temporalio/samples-go/tree/main/splitmerge-future) — Parallel Activity execution with Futures. +- [splitmerge-selector](https://github.com/temporalio/samples-go/tree/main/splitmerge-selector) — Parallel Activities with Selector for first-completion handling. diff --git a/docs/design-patterns/pick-first.mdx b/docs/design-patterns/pick-first.mdx new file mode 100644 index 0000000000..3896999515 --- /dev/null +++ b/docs/design-patterns/pick-first.mdx @@ -0,0 +1,558 @@ +--- +id: pick-first +title: "Pick First Pattern" +sidebar_label: "Pick First (Race)" +description: "Starts multiple Activities in parallel and uses the first result, cancelling the rest." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +The Pick First pattern executes multiple Activities in parallel and returns the result of whichever completes first, then cancels the remaining Activities. +It is suitable for racing multiple approaches to the same task, implementing timeout alternatives, or optimizing for fastest response when multiple options are available. + +## Problem + +In distributed systems, you often need Workflows that execute multiple Activities that can accomplish the same goal, return as soon as any one succeeds (fastest wins), cancel remaining Activities to avoid wasted resources, and handle scenarios where speed matters more than trying all options. 
+ +Without the Pick First pattern, you must wait for all Activities to complete even when only one result is needed, manually track which Activity finished first, implement complex cancellation logic for remaining Activities, and waste compute resources on Activities whose results will not be used. + +## Solution + +The Pick First pattern races multiple Activities simultaneously, captures the first result, then cancels remaining Activities using a cancellation mechanism provided by each SDK. + +```mermaid +sequenceDiagram + participant Workflow + participant Activity1 + participant Activity2 + participant Activity3 + + Workflow->>+Activity1: Start (shared ctx) + Workflow->>+Activity2: Start (shared ctx) + Workflow->>+Activity3: Start (shared ctx) + Note over Workflow: Selector waits for first + + par Race + Activity1->>Activity1: Execute (slow) + Activity2->>Activity2: Execute (fast) + Activity3->>Activity3: Execute (medium) + end + + Activity2-->>Workflow: Result (FIRST!) + Note over Workflow: Selector returns + Workflow->>Workflow: cancelHandler() + + Workflow->>Activity1: Cancel + Workflow->>Activity3: Cancel + Activity1-->>-Workflow: Cancelled + Activity3-->>-Workflow: Cancelled + deactivate Activity2 +``` + +The following describes each step in the diagram: + +1. The Workflow starts three Activities in parallel using a shared cancellable context. +2. The Workflow waits for the first Activity to complete. +3. Activity 2 completes first. The Workflow captures its result. +4. The Workflow cancels the shared context, which cancels Activities 1 and 3. + +The following implementation shows the core pattern. 
+The Workflow creates a cancellable context, starts two Activities, and captures the first result: + + + + +```python +# workflows.py +import asyncio +from datetime import timedelta +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities import sample_activity + +@workflow.defn +class PickFirstWorkflow: + @workflow.run + async def run(self) -> str: + task1 = asyncio.create_task( + workflow.execute_activity( + sample_activity, + "option1", + start_to_close_timeout=timedelta(minutes=2), + heartbeat_timeout=timedelta(seconds=10), + ) + ) + task2 = asyncio.create_task( + workflow.execute_activity( + sample_activity, + "option2", + start_to_close_timeout=timedelta(minutes=2), + heartbeat_timeout=timedelta(seconds=10), + ) + ) + + done, pending = await workflow.wait( + [task1, task2], return_when=asyncio.FIRST_COMPLETED + ) + + for task in pending: + task.cancel() + + return done.pop().result() +``` + + + + +```go +// workflow.go +func PickFirstWorkflow(ctx workflow.Context) (string, error) { + selector := workflow.NewSelector(ctx) + var firstResponse string + + childCtx, cancelHandler := workflow.WithCancel(ctx) + childCtx = workflow.WithActivityOptions(childCtx, activityOptions) + + f1 := workflow.ExecuteActivity(childCtx, Activity, "option1") + f2 := workflow.ExecuteActivity(childCtx, Activity, "option2") + + selector.AddFuture(f1, func(f workflow.Future) { + _ = f.Get(ctx, &firstResponse) + }).AddFuture(f2, func(f workflow.Future) { + _ = f.Get(ctx, &firstResponse) + }) + + selector.Select(ctx) // Blocks until first completes + cancelHandler() // Cancel remaining activities + + return firstResponse, nil +} +``` + + + + +```java +// PickFirstWorkflow.java +public class PickFirstWorkflowImpl implements PickFirstWorkflow { + + private final SampleActivities activities = + Workflow.newActivityStub( + SampleActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(2)) + 
.setHeartbeatTimeout(Duration.ofSeconds(10)) + .build()); + + @Override + public String pickFirst() { + List> results = new ArrayList<>(); + + CancellationScope scope = + Workflow.newCancellationScope( + () -> { + results.add(Async.function(activities::sampleActivity, "option1")); + results.add(Async.function(activities::sampleActivity, "option2")); + }); + + scope.run(); + + String firstResponse = Promise.anyOf(results).get(); + + scope.cancel(); + + return firstResponse; + } +} +``` + + + + +```typescript +// workflows.ts +import { proxyActivities, CancellationScope } from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { sampleActivity } = proxyActivities({ + startToCloseTimeout: '2m', + heartbeatTimeout: '10s', +}); + +export async function pickFirstWorkflow(): Promise { + return await CancellationScope.cancellable(async () => { + const p1 = sampleActivity('option1'); + const p2 = sampleActivity('option2'); + const firstResponse = await Promise.race([p1, p2]); + CancellationScope.current().cancel(); + return firstResponse; + }); +} +``` + + + + +Each SDK provides a different mechanism for racing Activities and cancelling the rest: + +- **Go** uses `workflow.NewSelector()` with `AddFuture()` callbacks and `workflow.WithCancel(ctx)` for cancellation. +- **TypeScript** uses `Promise.race()` within a `CancellationScope.cancellable()`. Calling `CancellationScope.current().cancel()` cancels all Activities started in that scope. +- **Python** uses `asyncio.create_task()` to start Activities concurrently, then `workflow.wait()` with `return_when=asyncio.FIRST_COMPLETED` to get the first result. Pending tasks are cancelled explicitly. +- **Java** uses `Async.function()` to start Activities inside a `CancellationScope`, then `Promise.anyOf()` to wait for the first result. Calling `scope.cancel()` cancels the remaining Activities. 
+ +## Implementation + +### Activity with cancellation support + +For the Pick First pattern to work efficiently, Activities must detect cancellation via heartbeats and respond to the cancellation signal provided by their SDK: + + + + +```python +# activities.py +import asyncio +from temporalio import activity + +@activity.defn +async def sample_activity(branch_id: str) -> str: + try: + for elapsed in range(60): + await asyncio.sleep(1) + activity.heartbeat("status-report") + return f"Branch {branch_id} completed" + except asyncio.CancelledError: + activity.logger.info(f"Branch {branch_id} cancelled") + raise +``` + + + + +```go +// activity.go +func SampleActivity(ctx context.Context, branchID int, duration time.Duration) (string, error) { + logger := activity.GetLogger(ctx) + elapsed := time.Nanosecond + + for elapsed < duration { + time.Sleep(time.Second) + elapsed += time.Second + + activity.RecordHeartbeat(ctx, "status-report") + + select { + case <-ctx.Done(): + msg := fmt.Sprintf("Branch %d cancelled", branchID) + logger.Info(msg) + return msg, ctx.Err() + default: + // Continue working + } + } + + return fmt.Sprintf("Branch %d completed", branchID), nil +} +``` + + + + +```java +// SampleActivityImpl.java +public class SampleActivityImpl implements SampleActivities { + + @Override + public String sampleActivity(String branchID) { + for (int elapsed = 0; elapsed < 60; elapsed++) { + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw Activity.wrap(e); + } + Activity.getExecutionContext().heartbeat("status-report"); + // Heartbeat throws CanceledFailure if cancellation was requested + } + return "Branch " + branchID + " completed"; + } +} +``` + + + + +```typescript +// activities.ts +import { Context, heartbeat } from '@temporalio/activity'; + +export async function sampleActivity(branchID: string): Promise { + for (let elapsed = 0; elapsed < 60; elapsed++) { + await new Promise((resolve) => 
setTimeout(resolve, 1000)); + heartbeat('status-report'); + Context.current().cancellationSignal.throwIfAborted(); + } + return `Branch ${branchID} completed`; +} +``` + + + + +The Activity heartbeats on each iteration, which allows the Temporal Server to deliver cancellation notifications promptly. +When cancellation is detected, the Activity performs any necessary cleanup and exits. + +### Wait for cancellation completion + +The following implementation waits for all Activities to finish their cleanup before returning: + + + + +```python +# workflows.py +import asyncio +from datetime import timedelta +from temporalio import workflow +from temporalio.common import RetryPolicy + +with workflow.unsafe.imports_passed_through(): + from activities import sample_activity + +@workflow.defn +class PickFirstWithCleanup: + @workflow.run + async def run(self) -> str: + task1 = asyncio.create_task( + workflow.execute_activity( + sample_activity, + "branch1", + start_to_close_timeout=timedelta(minutes=2), + heartbeat_timeout=timedelta(seconds=10), + cancellation_type=workflow.ActivityCancellationType.WAIT_CANCELLATION_COMPLETED, + ) + ) + task2 = asyncio.create_task( + workflow.execute_activity( + sample_activity, + "branch2", + start_to_close_timeout=timedelta(minutes=2), + heartbeat_timeout=timedelta(seconds=10), + cancellation_type=workflow.ActivityCancellationType.WAIT_CANCELLATION_COMPLETED, + ) + ) + + done, pending = await workflow.wait( + [task1, task2], return_when=asyncio.FIRST_COMPLETED + ) + + for task in pending: + task.cancel() + + # Wait for all activities to finish cancellation + for task in pending: + try: + await task + except asyncio.CancelledError: + pass + + return done.pop().result() +``` + + + + +```go +// workflow.go +func PickFirstWithCleanup(ctx workflow.Context) (string, error) { + selector := workflow.NewSelector(ctx) + var firstResponse string + + childCtx, cancelHandler := workflow.WithCancel(ctx) + childCtx = 
workflow.WithActivityOptions(childCtx, workflow.ActivityOptions{ + StartToCloseTimeout: 2 * time.Minute, + WaitForCancellation: true, + }) + + f1 := workflow.ExecuteActivity(childCtx, Activity, "branch1") + f2 := workflow.ExecuteActivity(childCtx, Activity, "branch2") + pendingFutures := []workflow.Future{f1, f2} + + selector.AddFuture(f1, func(f workflow.Future) { + _ = f.Get(ctx, &firstResponse) + }).AddFuture(f2, func(f workflow.Future) { + _ = f.Get(ctx, &firstResponse) + }) + + selector.Select(ctx) + cancelHandler() + + // Wait for all activities to finish cancellation + for _, f := range pendingFutures { + _ = f.Get(ctx, nil) + } + + return firstResponse, nil +} +``` + + + + +```java +// PickFirstWithCleanupImpl.java +public class PickFirstWithCleanupImpl implements PickFirstWorkflow { + + private final SampleActivities activities = + Workflow.newActivityStub( + SampleActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(2)) + .setHeartbeatTimeout(Duration.ofSeconds(10)) + .setCancellationType( + ActivityCancellationType.WAIT_CANCELLATION_COMPLETED) + .build()); + + @Override + public String pickFirst() { + List> results = new ArrayList<>(); + + CancellationScope scope = + Workflow.newCancellationScope( + () -> { + results.add(Async.function(activities::sampleActivity, "branch1")); + results.add(Async.function(activities::sampleActivity, "branch2")); + }); + + scope.run(); + + String firstResponse = Promise.anyOf(results).get(); + + scope.cancel(); + + // Wait for all activities to finish cancellation + for (Promise activityResult : results) { + try { + activityResult.get(); + } catch (ActivityFailure e) { + if (!(e.getCause() instanceof CanceledFailure)) { + throw e; + } + } + } + + return firstResponse; + } +} +``` + + + + +```typescript +// workflows.ts +import { + proxyActivities, + CancellationScope, + ActivityCancellationType, + isCancellation, +} from '@temporalio/workflow'; +import type * as activities from 
'./activities'; + +const { sampleActivity } = proxyActivities({ + startToCloseTimeout: '2m', + heartbeatTimeout: '10s', + cancellationType: ActivityCancellationType.WAIT_CANCELLATION_COMPLETED, +}); + +export async function pickFirstWithCleanup(): Promise { + return await CancellationScope.cancellable(async () => { + const p1 = sampleActivity('branch1'); + const p2 = sampleActivity('branch2'); + const firstResponse = await Promise.race([p1, p2]); + CancellationScope.current().cancel(); + + // Wait for all activities to finish cancellation + const results = [p1, p2]; + for (const p of results) { + try { + await p; + } catch (err) { + if (!isCancellation(err)) throw err; + } + } + + return firstResponse; + }); +} +``` + + + + +Each SDK provides a way to wait for cancelled Activities to finish their cleanup: + +- **Go** sets `WaitForCancellation: true` in the Activity options, then calls `Get` on all futures after cancelling. +- **TypeScript** sets `cancellationType: ActivityCancellationType.WAIT_CANCELLATION_COMPLETED` in the Activity options, then awaits all promises while catching cancellation errors with `isCancellation()`. +- **Python** sets `cancellation_type=workflow.ActivityCancellationType.WAIT_CANCELLATION_COMPLETED`, then awaits pending tasks while catching `asyncio.CancelledError`. +- **Java** sets `setCancellationType(ActivityCancellationType.WAIT_CANCELLATION_COMPLETED)`, then calls `get()` on all promises while catching `ActivityFailure` with a `CanceledFailure` cause. + +## When to use + +The Pick First pattern is a good fit for racing multiple data sources (primary vs backup), trying multiple algorithms and picking the fastest, implementing fallback strategies with timeout, optimizing for latency when multiple options exist, and testing multiple service endpoints for fastest response. 
+ +It is not a good fit when you need results from all Activities (use parallel execution), Activities have side effects that should not be cancelled, order matters (use sequential execution), or all Activities must complete. + +## Benefits and trade-offs + +The pattern returns as soon as the fastest option completes, optimizing for latency. +Unnecessary work is cancelled automatically. +Each SDK's race mechanism ensures replay consistency, and cancellation cleanup is handled properly. + +The trade-offs to consider are that cancelled Activities may have done partial work. +Activities need heartbeats to detect cancellation quickly. +Activities do not cancel instantly (they wait for the next heartbeat). +You must implement proper cancellation handling in Activities. +Only the first result is used; others are discarded. + +## Comparison with alternatives + +| Approach | Returns first | Cancels others | Complexity | Use case | +| :--- | :--- | :--- | :--- | :--- | +| Pick First | Yes | Yes | Medium | Race for fastest | +| Parallel Execution | No | No | Low | All must complete | +| Sequential | No | N/A | Low | Order matters | +| Split/Merge | No | No | Medium | Aggregate results | + +## Best practices + +- **Use heartbeats.** Activities must heartbeat to detect cancellation quickly. +- **Configure cancellation wait behavior.** Decide if the Workflow should wait for cleanup to complete before returning. +- **Handle cancellation in Activities.** Activities must check for cancellation signals and exit cleanly. +- **Use a shared cancellable context.** Use a single cancellable context or scope for all raced Activities. +- **Track futures or tasks.** Keep references to all futures or tasks if waiting for cleanup. +- **Set Activity timeouts.** Configure appropriate StartToCloseTimeout and HeartbeatTimeout. +- **Log cancellations.** Log when Activities are cancelled for observability. +- **Design idempotent Activities.** Ensure Activities handle cancellation safely. 
+ +## Common pitfalls + +- **Missing heartbeats in Activities.** Activities must heartbeat to detect cancellation. Without heartbeats, cancelled Activities continue running until they finish or their StartToCloseTimeout expires, wasting resources. +- **Not waiting for cancellation cleanup.** Without configuring the cancellation type to wait for completion (e.g., `WaitForCancellation: true` in Go, `WAIT_CANCELLATION_COMPLETED` in other SDKs), fetching a cancelled Activity's result returns a cancellation error immediately, before the Activity has finished cleanup. Configure this setting if you need to wait for cleanup to complete. +- **Ignoring errors from the winning Activity.** The first Activity to complete might return an error. Always check the result for errors instead of assuming success. +- **Forgetting to cancel remaining Activities.** If you forget to cancel the shared context or scope after receiving the first result, the remaining Activities continue running until they complete or time out, consuming Worker resources for results that are never used. + +## Related patterns + +- **[Parallel Execution](/design-patterns/parallel-execution)**: Execute in parallel and combine all results. + +## Sample code + +- [Go Sample](https://github.com/temporalio/samples-go/tree/main/pickfirst) — Complete implementation with Worker and starter. +- [TypeScript Sample](https://github.com/temporalio/samples-typescript/tree/main/activities-cancellation-heartbeating) — Activities with cancellation and heartbeating. +- [Java Sample](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/hello) — Hello samples including cancellation scope patterns. +- [Python Sample](https://github.com/temporalio/samples-python) — Python SDK samples with async patterns. 
diff --git a/docs/design-patterns/polling.mdx b/docs/design-patterns/polling.mdx new file mode 100644 index 0000000000..2df5ba0593 --- /dev/null +++ b/docs/design-patterns/polling.mdx @@ -0,0 +1,742 @@ +--- +id: polling +title: "Polling External Services" +sidebar_label: "Polling External Services" +description: "Strategies for polling external resources with varying frequencies: frequent, infrequent, and periodic patterns." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +The Polling External Services pattern implements strategies for periodically checking external systems until a desired state is reached. +It enables Workflows to wait for asynchronous operations in third-party services that do not support callbacks, making it essential for integrating with REST APIs, job queues, and batch processing systems. + +## Problem + +In distributed systems, you often need Workflows that wait for external jobs to complete, poll REST APIs that do not provide webhooks, check the status of long-running operations in third-party systems, handle varying poll frequencies, and avoid overwhelming external services with requests. + +Without proper polling strategies, you must implement complex retry logic manually, risk unbounded Workflow history growth, choose between responsiveness and resource efficiency, and handle heartbeating and timeout management yourself. + +## Solution + +You can use Temporal to implement three distinct polling strategies, each optimized for different polling frequencies and requirements: + +1. **Frequent Polling (1 second or faster)**: Loop inside an Activity with heartbeats. +2. **Infrequent Polling (1 minute or slower)**: Use Activity retries with fixed backoff. +3. **Periodic Sequence**: Use Child Workflows for complex polling sequences. + +```mermaid +flowchart TD + Start([Polling Required]) --> Freq{Polling
<br/>Frequency?} + + Freq -->|≤1 second| Fast[Frequent Polling] + Freq -->|≥1 minute| Slow[Infrequent Polling] + Freq -->|Complex| Complex[Periodic Sequence] + + Fast --> FastImpl[Activity with loop<br/>+ heartbeats] + Slow --> SlowImpl[Activity retries<br/>backoffCoefficient=1] + Complex --> ComplexImpl[Child workflow<br/>
+ Continue-As-New] +``` + +The following describes each path in the diagram: + +1. If you need polling at 1-second intervals or faster, use frequent polling with an Activity loop and heartbeats. +2. If you need polling at 1-minute intervals or slower, use infrequent polling with Activity retries and a fixed backoff coefficient. +3. If you need complex multi-step polling or changing parameters between attempts, use a periodic sequence with Child Workflows and Continue-As-New. + +## Implementation + +### Frequent polling (fast response required) + +For polling intervals of 1 second or faster, implement the polling loop inside the Activity with heartbeats. +The heartbeat reports progress and enables Temporal to detect stuck Activities: + + + + +```python +# activities.py +from temporalio import activity +import asyncio + +@activity.defn +async def do_poll() -> str: + while True: + activity.heartbeat() + + result = await external_service.check_status() + + if result == "COMPLETED": + return result + + await asyncio.sleep(1) +``` + + + + +```go +// activities.go +func DoPoll(ctx context.Context) (string, error) { + for { + activity.RecordHeartbeat(ctx) + + result, err := externalService.CheckStatus() + if err != nil { + return "", err + } + + if result == "COMPLETED" { + return result, nil + } + + select { + case <-ctx.Done(): + return "", ctx.Err() + case <-time.After(1 * time.Second): + } + } +} +``` + + + + +```java +// FrequentPollingActivityImpl.java +@ActivityInterface +public interface PollingActivities { + String doPoll(); +} + +public class FrequentPollingActivityImpl implements PollingActivities { + @Override + public String doPoll() { + while (true) { + Activity.getExecutionContext().heartbeat(null); + + String result = externalService.checkStatus(); + + if (result.equals("COMPLETED")) { + return result; + } + + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + throw Activity.wrap(e); + } + } + } +} +``` + + + + +```typescript +// 
activities.ts +import { heartbeat, sleep } from '@temporalio/activity'; + +export async function doPoll(): Promise { + while (true) { + heartbeat(); + + const result = await externalService.checkStatus(); + + if (result === 'COMPLETED') { + return result; + } + + await sleep('1s'); + } +} +``` + + + + +The Activity loops indefinitely, heartbeating on each iteration. +If the Worker crashes, the heartbeat timeout expires and Temporal retries the Activity on another Worker. + +The Workflow configures the Activity with a heartbeat timeout shorter than the start-to-close timeout: + + + + +```python +# workflows.py +from datetime import timedelta +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities import do_poll + +@workflow.defn +class FrequentPollingWorkflow: + @workflow.run + async def run(self) -> str: + return await workflow.execute_activity( + do_poll, + start_to_close_timeout=timedelta(seconds=60), + heartbeat_timeout=timedelta(seconds=2), + ) +``` + + + + +```go +// workflow.go +func FrequentPollingWorkflow(ctx workflow.Context) (string, error) { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 60 * time.Second, + HeartbeatTimeout: 2 * time.Second, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + var result string + err := workflow.ExecuteActivity(ctx, DoPoll).Get(ctx, &result) + return result, err +} +``` + + + + +```java +// FrequentPollingWorkflowImpl.java +public class FrequentPollingWorkflowImpl implements PollingWorkflow { + @Override + public String exec() { + ActivityOptions options = ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(60)) + .setHeartbeatTimeout(Duration.ofSeconds(2)) + .build(); + + PollingActivities activities = Workflow.newActivityStub(PollingActivities.class, options); + return activities.doPoll(); + } +} +``` + + + + +```typescript +// workflows.ts +import { proxyActivities } from '@temporalio/workflow'; +import type * as activities from 
'./activities'; + +const { doPoll } = proxyActivities({ + startToCloseTimeout: '60s', + heartbeatTimeout: '2s', +}); + +export async function frequentPollingWorkflow(): Promise { + return await doPoll(); +} +``` + + + + +The heartbeat timeout (2 seconds) is shorter than the start-to-close timeout (60 seconds). +If the Activity misses a heartbeat, Temporal detects the failure and retries the Activity. + +### Infrequent polling (resource efficient) + +For polling intervals of 1 minute or slower, use Activity retries with a backoff coefficient of 1. +The Activity throws an exception when the external service is not ready, and Temporal retries after the configured interval: + + + + +```python +# activities.py +from temporalio import activity +from temporalio.exceptions import ApplicationError + +@activity.defn +async def do_poll() -> str: + result = await external_service.check_status() + + if result != "COMPLETED": + raise ApplicationError("Service not ready, will retry") + + return result +``` + + + + +```go +// activities.go +func DoPoll(ctx context.Context) (string, error) { + result, err := externalService.CheckStatus() + if err != nil { + return "", err + } + + if result != "COMPLETED" { + return "", fmt.Errorf("service not ready, will retry") + } + + return result, nil +} +``` + + + + +```java +// InfrequentPollingActivityImpl.java +public class InfrequentPollingActivityImpl implements PollingActivities { + @Override + public String doPoll() { + String result = externalService.checkStatus(); + + if (!result.equals("COMPLETED")) { + throw new RuntimeException("Service not ready, will retry"); + } + + return result; + } +} +``` + + + + +```typescript +// activities.ts +import { ApplicationFailure } from '@temporalio/activity'; + +export async function doPoll(): Promise { + const result = await externalService.checkStatus(); + + if (result !== 'COMPLETED') { + throw ApplicationFailure.retryable('Service not ready, will retry'); + } + + return result; +} +``` + + + 
+ +The Activity performs a single poll and throws if the service is not ready. +Temporal handles the retry scheduling. + +The Workflow configures the retry policy with a fixed interval: + + + + +```python +# workflows.py +from datetime import timedelta +from temporalio import workflow +from temporalio.common import RetryPolicy + +with workflow.unsafe.imports_passed_through(): + from activities import do_poll + +@workflow.defn +class InfrequentPollingWorkflow: + @workflow.run + async def run(self) -> str: + return await workflow.execute_activity( + do_poll, + start_to_close_timeout=timedelta(seconds=2), + retry_policy=RetryPolicy( + backoff_coefficient=1, + initial_interval=timedelta(seconds=60), + ), + ) +``` + + + + +```go +// workflow.go +func InfrequentPollingWorkflow(ctx workflow.Context) (string, error) { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 2 * time.Second, + RetryPolicy: &temporal.RetryPolicy{ + BackoffCoefficient: 1, + InitialInterval: 60 * time.Second, + }, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + var result string + err := workflow.ExecuteActivity(ctx, DoPoll).Get(ctx, &result) + return result, err +} +``` + + + + +```java +// InfrequentPollingWorkflowImpl.java +public class InfrequentPollingWorkflowImpl implements PollingWorkflow { + @Override + public String exec() { + ActivityOptions options = ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(2)) + .setRetryOptions( + RetryOptions.newBuilder() + .setBackoffCoefficient(1) + .setInitialInterval(Duration.ofSeconds(60)) + .build()) + .build(); + + PollingActivities activities = Workflow.newActivityStub(PollingActivities.class, options); + return activities.doPoll(); + } +} +``` + + + + +```typescript +// workflows.ts +import { proxyActivities } from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { doPoll } = proxyActivities({ + startToCloseTimeout: '2s', + retry: { + backoffCoefficient: 1, + initialInterval: '60s', 
+ }, +}); + +export async function infrequentPollingWorkflow(): Promise { + return await doPoll(); +} +``` + + + + +Setting the backoff coefficient to 1 creates a fixed retry interval. +The initial interval of 60 seconds sets the polling frequency. +Retries do not add events to the Workflow history, keeping it small. + +### Periodic sequence (complex polling) + +For polling that requires multiple Activities or changing parameters between attempts, use Child Workflows with Continue-As-New. +The Child Workflow polls in a loop and calls Continue-As-New to prevent unbounded history: + + + + +```python +# workflows.py +from datetime import timedelta +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities import do_poll + +@workflow.defn +class PollingChildWorkflow: + @workflow.run + async def run(self, polling_interval_seconds: int) -> str: + max_attempts = 10 + + for _ in range(max_attempts): + result = await workflow.execute_activity( + do_poll, + start_to_close_timeout=timedelta(seconds=10), + ) + + if result == "COMPLETED": + return result + + await workflow.sleep(polling_interval_seconds) + + # Continue-as-new to prevent unbounded history + workflow.continue_as_new(polling_interval_seconds) +``` + + + + +```go +// workflow.go +func PollingChildWorkflow(ctx workflow.Context, pollingIntervalSeconds int) (string, error) { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 10 * time.Second, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + maxAttempts := 10 + for i := 0; i < maxAttempts; i++ { + var result string + err := workflow.ExecuteActivity(ctx, DoPoll).Get(ctx, &result) + if err != nil { + return "", err + } + + if result == "COMPLETED" { + return result, nil + } + + workflow.Sleep(ctx, time.Duration(pollingIntervalSeconds)*time.Second) + } + + // Continue-as-new to prevent unbounded history + return "", workflow.NewContinueAsNewError(ctx, PollingChildWorkflow, pollingIntervalSeconds) +} +``` + + + + 
+```java +// PeriodicPollingChildWorkflowImpl.java +@WorkflowInterface +public interface PollingChildWorkflow { + @WorkflowMethod + String exec(int pollingIntervalInSeconds); +} + +public class PeriodicPollingChildWorkflowImpl implements PollingChildWorkflow { + @Override + public String exec(int pollingIntervalInSeconds) { + ActivityOptions options = ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(10)) + .build(); + + PollingActivities activities = Workflow.newActivityStub(PollingActivities.class, options); + + int maxAttempts = 10; + for (int i = 0; i < maxAttempts; i++) { + String result = activities.doPoll(); + + if (result.equals("COMPLETED")) { + return result; + } + + Workflow.sleep(Duration.ofSeconds(pollingIntervalInSeconds)); + } + + // Continue-as-new to prevent unbounded history + PollingChildWorkflow continueAsNew = Workflow.newContinueAsNewStub(PollingChildWorkflow.class); + continueAsNew.exec(pollingIntervalInSeconds); + return null; + } +} +``` + + + + +```typescript +// workflows.ts +import { proxyActivities, sleep, continueAsNew } from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { doPoll } = proxyActivities({ + startToCloseTimeout: '10s', +}); + +export async function pollingChildWorkflow( + pollingIntervalSeconds: number +): Promise { + const maxAttempts = 10; + + for (let i = 0; i < maxAttempts; i++) { + const result = await doPoll(); + + if (result === 'COMPLETED') { + return result; + } + + await sleep(`${pollingIntervalSeconds}s`); + } + + // Continue-as-new to prevent unbounded history + await continueAsNew(pollingIntervalSeconds); + return ''; // unreachable +} +``` + + + + +The Child Workflow polls up to 10 times, sleeping between attempts. +After 10 attempts, it calls Continue-As-New to start a fresh execution with the same parameters. 
+ +The parent Workflow starts the Child Workflow and waits for its result: + + + + +```python +# workflows.py +from temporalio import workflow + +@workflow.defn +class PeriodicPollingWorkflow: + @workflow.run + async def run(self) -> str: + return await workflow.execute_child_workflow( + PollingChildWorkflow.run, + 5, + id="ChildWorkflowPoll", + ) +``` + + + + +```go +// workflow.go +func PeriodicPollingWorkflow(ctx workflow.Context) (string, error) { + cwo := workflow.ChildWorkflowOptions{ + WorkflowID: "ChildWorkflowPoll", + } + ctx = workflow.WithChildOptions(ctx, cwo) + + var result string + err := workflow.ExecuteChildWorkflow(ctx, PollingChildWorkflow, 5).Get(ctx, &result) + return result, err +} +``` + + + + +```java +// PeriodicPollingWorkflowImpl.java +public class PeriodicPollingWorkflowImpl implements PollingWorkflow { + @Override + public String exec() { + PollingChildWorkflow childWorkflow = Workflow.newChildWorkflowStub( + PollingChildWorkflow.class, + ChildWorkflowOptions.newBuilder() + .setWorkflowId("ChildWorkflowPoll") + .build()); + + return childWorkflow.exec(5); + } +} +``` + + + + +```typescript +// workflows.ts +import { executeChild } from '@temporalio/workflow'; +import { pollingChildWorkflow } from './polling-child-workflow'; + +export async function periodicPollingWorkflow(): Promise { + return await executeChild(pollingChildWorkflow, { + args: [5], + workflowId: 'ChildWorkflowPoll', + }); +} +``` + + + + +The parent remains blocked and is unaware of the child's Continue-As-New calls. +When the child completes, the parent receives the result. + +## When to use + +### Frequent polling (1 second or faster) + +This strategy is a good fit for real-time status checks, high-priority operations requiring fast response, and short-lived external operations (minutes, not hours). +It is not a good fit for long-running operations (hours or days), rate-limited APIs, or resource-constrained external services. 
+ +### Infrequent polling (1 minute or slower) + +This strategy is a good fit for batch job completion checks, long-running external processes, rate-limited APIs, and operations that may take hours or days. +It is not a good fit for sub-minute polling requirements or operations requiring immediate response. + +### Periodic sequence + +This strategy is a good fit for multi-step polling sequences, changing Activity parameters between polls, and very long-running polls requiring Continue-As-New. +It is not a good fit when the frequent or infrequent patterns are sufficient. + +## Benefits and trade-offs + +All three strategies work with any external service that does not support callbacks. +Temporal handles retry scheduling and fault tolerance automatically. +All timing is based on Workflow time, ensuring deterministic behavior. + +Frequent polling provides fast response but consumes more resources and requires heartbeating. +Infrequent polling is resource-efficient with minimal history growth but has a minimum 1-minute interval. +Periodic sequence is the most flexible but adds complexity through Child Workflow management. + +## Comparison with alternatives + +| Strategy | Poll frequency | History impact | Complexity | Best for | +| :--- | :--- | :--- | :--- | :--- | +| Frequent Polling | 1 second or faster | Medium | Low | Real-time checks | +| Infrequent Polling | 1 minute or slower | Minimal | Low | Long operations | +| Periodic Sequence | Any | Low (with CAN) | Medium | Complex sequences | +| Workflow Timer Loop | Any | High | Medium | Avoid this approach | + +## Best practices + +- **Choose the right strategy.** Match polling frequency to the pattern. +- **Set appropriate timeouts.** HeartbeatTimeout must be shorter than StartToCloseTimeout for frequent polling. +- **Handle failures gracefully.** Distinguish transient from permanent failures. +- **Add exponential backoff.** Use backoff for error cases (not normal polling). 
+- **Implement circuit breakers.** Protect external services from overload. +- **Use Continue-As-New.** Prevent unbounded history in periodic sequences. +- **Monitor polling metrics.** Track poll attempts, success rates, and durations. +- **Respect rate limits.** Adjust polling frequency to API constraints. +- **Add jitter.** Prevent thundering herd when many Workflows poll simultaneously. +- **Consider webhooks.** If the external service supports callbacks, use async completion instead. + +## Common pitfalls + +- **Wrong pattern choice.** Using frequent polling for hour-long operations wastes resources. +- **Missing heartbeats.** Frequent polling without heartbeats causes delayed failure detection. +- **Unbounded history.** Not using Continue-As-New in periodic sequences leads to history limit failures. +- **Tight polling loops.** Polling too frequently overwhelms external services. +- **No timeout.** Polling indefinitely without max attempts or a deadline risks runaway Workflows. +- **Ignoring errors.** Not distinguishing between retryable and permanent failures leads to wasted retries. +- **Workflow timer loops.** Using Workflow timers instead of proper polling patterns bloats history. + +## Related patterns + +- **[Long-Running Activity](/design-patterns/long-running-activity)**: Reporting progress in long Activities. +- **[Continue-As-New](/design-patterns/continue-as-new)**: Managing unbounded Workflow history. + +## Sample code + +### Java +- [Frequent Polling](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/polling/frequent) — Fast polling with heartbeats. +- [Infrequent Polling](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/polling/infrequent) — Efficient long-interval polling. +- [Periodic Sequence](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/polling/periodicsequence) — Complex polling with Child Workflows. 
+ +### TypeScript +- [Frequent Polling](https://github.com/temporalio/samples-typescript/tree/main/polling/frequent) — Fast polling with heartbeats. +- [Infrequent Polling](https://github.com/temporalio/samples-typescript/tree/main/polling/infrequent) — Efficient long-interval polling. +- [Periodic Sequence](https://github.com/temporalio/samples-typescript/tree/main/polling/periodic-sequence) — Complex polling with Child Workflows. + +### Python +- [Frequent Polling](https://github.com/temporalio/samples-python/tree/main/polling/frequent) — Fast polling with heartbeats. +- [Infrequent Polling](https://github.com/temporalio/samples-python/tree/main/polling/infrequent) — Efficient long-interval polling. +- [Periodic Sequence](https://github.com/temporalio/samples-python/tree/main/polling/periodic_sequence) — Complex polling with Child Workflows. + +### Go +- [Frequent Polling](https://github.com/temporalio/samples-go/tree/main/polling/frequent) — Fast polling with heartbeats. +- [Infrequent Polling](https://github.com/temporalio/samples-go/tree/main/polling/infrequent) — Efficient long-interval polling. +- [Periodic Sequence](https://github.com/temporalio/samples-go/tree/main/polling/periodicsequence) — Complex polling with Child Workflows. diff --git a/docs/design-patterns/priority-task-queues.mdx b/docs/design-patterns/priority-task-queues.mdx new file mode 100644 index 0000000000..dab4a7ac66 --- /dev/null +++ b/docs/design-patterns/priority-task-queues.mdx @@ -0,0 +1,248 @@ +--- +id: priority-task-queues +title: "Priority Task Queues" +sidebar_label: "Priority Task Queues" +description: "Assigns a priority level to Workflows and Activities so that time-sensitive work executes ahead of lower-priority work within a single Task Queue." 
+--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +:::info[TLDR] +Assign a `PriorityKey` (1–5) to Workflows and Activities so **high-priority work executes ahead of lower-priority work** on a shared Task Queue. Use this when a flood of batch or background tasks would otherwise delay high-urgency requests. +::: + +## Overview + +The Priority Task Queues pattern assigns a `PriorityKey` to Workflows, Activities, and Child Workflows so that time-sensitive work executes ahead of lower-priority work within a single Task Queue—without requiring separate queues or routing logic. + +## Problem + +In a shared Task Queue, tasks generally execute in first-in-first-out (FIFO) order. When a large batch of low-priority work—nightly reports, bulk imports, background processing—floods the queue before time-sensitive requests arrive, the higher-priority requests wait behind the entire batch. A single queue with no ordering mechanism gives equal treatment to all tasks regardless of business urgency. + +## Solution + +Temporal's native Priority feature lets you assign a `PriorityKey` (an integer from 1 to 5, where 1 is the highest priority and 5 is the lowest) to any Workflow, Activity, or Child Workflow. The Temporal matching service maintains a sub-queue for each priority level and exhausts all tasks at a given level before dispatching to the next. Tasks default to priority 3 when no key is set. Activities and Child Workflows inherit the parent Workflow's priority unless they set their own.
+ +```mermaid +flowchart TD + WF1["Workflow\nPriorityKey=1\n(payment)"] --> TQ["my-task-queue"] + WF2["Workflow\nPriorityKey=3\n(default)"] --> TQ + WF3["Workflow\nPriorityKey=5\n(batch report)"] --> TQ + TQ --> P1["Priority 1\nsub-queue"] + TQ --> P3["Priority 3\nsub-queue"] + TQ --> P5["Priority 5\nsub-queue"] + P1 -->|dispatched first| W["Shared Workers"] + P3 -->|dispatched second| W + P5 -->|dispatched last| W + W --> DS["Downstream\nService"] +``` + +The following describes each step in the diagram: + +1. Workflows start with a `PriorityKey` in their start options. Payment workflows use priority 1; routine workflows default to 3; nightly batch reports use priority 5. +2. The Temporal matching service routes each task to the corresponding priority sub-queue inside the single Task Queue. +3. Workers poll the Task Queue and receive tasks in priority order: all priority-1 tasks are dispatched before any priority-2 task, and so on. +4. Activities and Child Workflows inherit the parent Workflow's `PriorityKey` unless they explicitly set their own. + +## Implementation + +Priority is enabled by default in Temporal Cloud and self-hosted Temporal. 
+ +### Set Workflow priority at start + + + + +```python +from temporalio.common import Priority + +handle = await client.start_workflow( + ChargeCustomer.run, + id="charge-customer-wf", + task_queue="my-task-queue", + priority=Priority(priority_key=1), +) +``` + + + + +```go +we, err := c.ExecuteWorkflow( + context.Background(), + client.StartWorkflowOptions{ + ID: "charge-customer-wf", + TaskQueue: "my-task-queue", + Priority: temporal.Priority{PriorityKey: 1}, + }, + ChargeCustomer, +) +``` + + + + +```java +WorkflowOptions options = WorkflowOptions.newBuilder() + .setWorkflowId("charge-customer-wf") + .setTaskQueue("my-task-queue") + .setPriority(Priority.newBuilder().setPriorityKey(1).build()) + .build(); +ChargeCustomer workflow = client.newWorkflowStub(ChargeCustomer.class, options); +WorkflowClient.start(workflow::run); +``` + + + + +### Set Activity priority + +Activities inherit the parent Workflow's priority. Override the `PriorityKey` in `ActivityOptions` when an individual Activity should run at a different level than its Workflow. 
+ + + + +```python +from temporalio.common import Priority + +# inside the workflow +result = await workflow.execute_activity( + process_payment, + start_to_close_timeout=timedelta(minutes=1), + priority=Priority(priority_key=1), +) +``` + + + + +```go +ao := workflow.ActivityOptions{ + StartToCloseTimeout: time.Minute, + Priority: temporal.Priority{PriorityKey: 1}, +} +ctx = workflow.WithActivityOptions(ctx, ao) +err := workflow.ExecuteActivity(ctx, ProcessPayment).Get(ctx, nil) +``` + + + + +```java +ActivityOptions options = ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofMinutes(1)) + .setPriority(Priority.newBuilder().setPriorityKey(1).build()) + .build(); +PaymentActivities activities = Workflow.newActivityStub(PaymentActivities.class, options); +activities.processPayment(); +``` + + + + +### Set Child Workflow priority + + + + +```python +from temporalio.common import Priority + +# inside the parent workflow +result = await workflow.execute_child_workflow( + ProcessOrder.run, + id="process-order-child", + task_queue="my-task-queue", + priority=Priority(priority_key=2), +) +``` + + + + +```go +cwo := workflow.ChildWorkflowOptions{ + WorkflowID: "process-order-child", + TaskQueue: "my-task-queue", + Priority: temporal.Priority{PriorityKey: 2}, +} +ctx = workflow.WithChildOptions(ctx, cwo) +err := workflow.ExecuteChildWorkflow(ctx, ProcessOrder).Get(ctx, nil) +``` + + + + +```java +ChildWorkflowOptions options = ChildWorkflowOptions.newBuilder() + .setWorkflowId("process-order-child") + .setTaskQueue("my-task-queue") + .setPriority(Priority.newBuilder().setPriorityKey(2).build()) + .build(); +ProcessOrder child = Workflow.newChildWorkflowStub(ProcessOrder.class, options); +child.run(); +``` + + + + +### Set priority via CLI + +```sh +temporal workflow start \ + --type ChargeCustomer \ + --task-queue my-task-queue \ + --workflow-id charge-customer-wf \ + --input '{"customerId":"12345"}' \ + --priority-key 1 +``` + +## When to use + +This 
pattern is a good fit when your system mixes time-sensitive operations (payment processing, user-facing requests) with background or batch work (reporting, data imports, inventory management), and you want urgent tasks to proceed even during periods of high load. It also works well when you need to mark urgent tasks that should override normal processing—for example, triggering immediate re-runs of failed critical tasks. + +It is not a good fit when all work is effectively equal in urgency, when a continuously replenished high-priority backlog could starve lower-priority work indefinitely, or when you need hard capacity isolation between tiers (in that case, consider dedicated Task Queues per tier as a supplementary measure). If your concern is prioritizing work amongst tenants or customers, consider the [Fairness](/design-patterns/fairness) pattern, which distributes capacity proportionally using weighted fairness keys rather than strict ordering. + +## Benefits and trade-offs + +Native priority requires no extra queues, routing logic, or additional Worker pools. A single pool of Workers serves all priority levels, so idle capacity at low-priority levels is automatically used by higher-priority work without any additional configuration. + +Lower-priority tasks are not dispatched until all tasks at higher priority levels have been drained. In an environment with a continuously replenished high-priority backlog, low-priority tasks may be significantly delayed. The built-in `PriorityKey` range is 1–5; if more than five distinct levels are needed, the feature cannot accommodate them.
+ +## Comparison with alternatives + +| Approach | Isolation | Dynamic priority | Complexity | Scales to many priorities | +| :--- | :--- | :--- | :--- | :--- | +| Temporal PriorityKey (native) | Soft | Yes | Low | Limited (5 levels) | +| [Fairness](/design-patterns/fairness) | Soft | Yes | Low | Yes (unlimited keys) | +| Separate Task Queues per tier | Hard | No | Medium | No (static tiers) | +| Single queue (no control) | None | N/A | Lowest | N/A | +| External queue (Kafka, SQS) | Hard | Yes | High | Yes | + +## Best practices + +- **Use no more than five priority levels.** The `PriorityKey` range is 1–5. Keep levels coarse—for example, 1 = urgent, 3 = normal, 5 = batch—rather than mapping fine-grained business importance to many values. +- **Reserve priority 1 for genuinely urgent work.** If callers fall back to priority 1 whenever they have no specific priority in mind, the highest level fills with routine work and the feature provides no benefit. Leave the key unset for routine work; the default is 3 when no key is set. +- **Set `PriorityKey` at Workflow start, not inside Workflow code.** Workflow code cannot change its own priority after it starts. Set the priority in the start options before execution begins. +- **Override Activity priority deliberately.** Activities inherit the parent Workflow's priority by default. Override only when a specific Activity must run at a different level than its Workflow. +- **Monitor queue depth per priority level.** Sustained backlog growth at a priority level signals that Worker capacity is insufficient for the submitted load at that level. + +## Common pitfalls + +- **Assigning priority 1 to all work by default.** When every caller sets the highest priority, the feature provides no ordering benefit. Establish an explicit policy for which work types qualify for each level. +- **Neglecting low-priority starvation.** Under sustained high load, priority-5 tasks may wait indefinitely. Use `ScheduleToStartTimeout` on low-priority Activities to surface starvation as a visible failure.
+- **Changing priority after scheduling.** `PriorityKey` is evaluated when a task enters the queue and cannot be changed while it waits. To re-prioritize an already-queued task, cancel it and reschedule with the new priority. +- **Assuming hard isolation between priority levels.** Priority controls dispatch order, not Worker capacity allocation. A priority-5 task may still consume a Worker slot that is then unavailable for a priority-1 task arriving a moment later. + +## Related patterns + +- **[Fairness](/design-patterns/fairness)**: Distribute capacity proportionally across tenants within a priority level using fairness keys. +- **[Downstream Rate Limiting](/design-patterns/downstream-rate-limiting)**: Cap absolute throughput to a downstream service regardless of task priority. +- **[Worker-Specific Task Queues](/design-patterns/worker-specific-taskqueue)**: Route Activities to a specific Worker host for resource or data affinity. + +## Sample code + +The official Temporal documentation provides SDK code examples for setting priority keys on Workflows, Activities, and Child Workflows across all supported languages: + +- [Task Queue Priority and Fairness — Temporal docs](https://docs.temporal.io/develop/task-queue-priority-fairness#task-queue-priority) diff --git a/docs/design-patterns/qos-throughput-patterns.mdx b/docs/design-patterns/qos-throughput-patterns.mdx new file mode 100644 index 0000000000..aa13ea266f --- /dev/null +++ b/docs/design-patterns/qos-throughput-patterns.mdx @@ -0,0 +1,31 @@ +--- +id: qos-throughput-patterns +title: QoS & Throughput Patterns +sidebar_label: Overview +description: Patterns for controlling execution rate, protecting downstream services from overload, and ensuring fair capacity distribution across tenants. 
+--- + +import PatternCards from '@site/src/components/PatternCards'; + +Patterns for controlling how fast work executes, protecting downstream services from overload, and ensuring that no single caller or tenant can monopolize Worker capacity at the expense of others. + + diff --git a/docs/design-patterns/request-response-via-updates.mdx b/docs/design-patterns/request-response-via-updates.mdx new file mode 100644 index 0000000000..c99625ee91 --- /dev/null +++ b/docs/design-patterns/request-response-via-updates.mdx @@ -0,0 +1,307 @@ +--- +id: request-response-via-updates +title: "Request-Response via Updates" +sidebar_label: "Request-Response via Updates" +description: "Synchronous request-response with validation. Updates modify state and return results directly." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +Workflow Updates enable synchronous request-response interactions where clients receive immediate, typed responses while the Workflow continues processing. +Updates modify Workflow state, validate inputs, and return results directly to the caller with strong consistency guarantees. + +## Problem + +In distributed systems, you often need Workflows that provide immediate feedback to clients (validation results, confirmation IDs), require strong consistency guarantees for operations, need typed error handling for validation failures, should validate inputs before accepting work, and allow external systems to modify Workflow state synchronously. + +Without Updates, clients must use Signals and poll via Queries (complex, eventually consistent), wait for entire Workflow completion (slow), implement complex coordination logic, and handle race conditions between Signals and Queries. + +## Solution + +Temporal's Update API executes an Update handler that can validate inputs, modify state, and return values synchronously. +The Update is recorded in Workflow history before returning, providing strong consistency. 
+ +```mermaid +sequenceDiagram + participant Client + participant Workflow + participant State + + Client->>+Workflow: Update Request + Workflow->>Workflow: Validate Input + alt Validation Fails + Workflow-->>Client: Error Response + else Validation Succeeds + Workflow->>State: Modify State + State-->>Workflow: Updated + Workflow-->>Client: Typed Response + end + deactivate Workflow +``` + +The following describes each step in the diagram: + +1. The client sends an Update request to the Workflow. +2. The Workflow validates the input. If validation fails, it returns a typed error response. +3. If validation succeeds, the Workflow modifies its state and returns a typed response to the client. + +## Implementation + +The following examples show a task assignment Workflow that accepts tasks via Updates with validation. +The Update validator rejects requests when the task limit is reached, and the Update handler assigns the task and returns a result. + + + + +```python +# workflows.py +import uuid +from dataclasses import dataclass +from temporalio import workflow + +MAX_TASKS = 10 + +@dataclass +class AssignmentResult: + assignment_id: str + task_name: str + total_tasks: int + +@workflow.defn +class TaskWorkflow: + def __init__(self) -> None: + self.tasks: list[str] = [] + + @workflow.run + async def run(self) -> None: + await workflow.wait_condition(lambda: False) + + @workflow.update + async def assign_task(self, task_name: str) -> AssignmentResult: + assignment_id = str(uuid.uuid4()) + self.tasks.append(task_name) + return AssignmentResult( + assignment_id=assignment_id, + task_name=task_name, + total_tasks=len(self.tasks), + ) + + @assign_task.validator + def validate_assign_task(self, task_name: str) -> None: + if len(self.tasks) >= MAX_TASKS: + raise ValueError("Task limit reached") + + @workflow.query + def get_tasks(self) -> list[str]: + return list(self.tasks) +``` + + + + +```go +// workflow.go +type TaskWorkflow struct{} + +const MaxTasks = 10 + +func (w 
*TaskWorkflow) Run(ctx workflow.Context) error { + tasks := []string{} + + err := workflow.SetUpdateHandlerWithOptions( + ctx, + "AssignTask", + func(ctx workflow.Context, taskName string) (AssignmentResult, error) { + assignmentID := uuid.New().String() + tasks = append(tasks, taskName) + return AssignmentResult{ + AssignmentID: assignmentID, + TaskName: taskName, + TotalTasks: len(tasks), + }, nil + }, + workflow.UpdateHandlerOptions{ + Validator: func(taskName string) error { + if len(tasks) >= MaxTasks { + return fmt.Errorf("task limit reached") + } + return nil + }, + }, + ) + if err != nil { + return err + } + + err = workflow.SetQueryHandler(ctx, "GetTasks", func() ([]string, error) { + return tasks, nil + }) + if err != nil { + return err + } + + workflow.GetSignalChannel(ctx, "").Receive(ctx, nil) + return nil +} +``` + + + + +```java +// TaskWorkflow.java +@WorkflowInterface +public interface TaskWorkflow { + @WorkflowMethod + void run(); + + @UpdateMethod + AssignmentResult assignTask(String taskName); + + @QueryMethod + List getTasks(); +} + +public class TaskWorkflowImpl implements TaskWorkflow { + private static final int MAX_TASKS = 10; + private List tasks = new ArrayList<>(); + + @Override + public void run() { + Workflow.await(() -> false); + } + + @UpdateValidatorMethod(updateName = "assignTask") + protected void validateAssignTask(String taskName) { + if (tasks.size() >= MAX_TASKS) { + throw new IllegalStateException("Task limit reached"); + } + } + + @Override + public AssignmentResult assignTask(String taskName) { + String assignmentId = UUID.randomUUID().toString(); + tasks.add(taskName); + + return new AssignmentResult(assignmentId, taskName, tasks.size()); + } + + @Override + public List getTasks() { + return new ArrayList<>(tasks); + } +} +``` + + + + +```typescript +// workflow.ts +import * as wf from '@temporalio/workflow'; + +interface AssignmentResult { + assignmentId: string; + taskName: string; + totalTasks: number; +} + +export 
const assignTaskUpdate = wf.defineUpdate('assignTask'); +export const getTasksQuery = wf.defineQuery('getTasks'); + +const MAX_TASKS = 10; + +export async function taskWorkflow(): Promise { + const tasks: string[] = []; + + wf.setHandler( + assignTaskUpdate, + (taskName: string): AssignmentResult => { + const assignmentId = wf.uuid4(); + tasks.push(taskName); + return { assignmentId, taskName, totalTasks: tasks.length }; + }, + { + validator: (taskName: string): void => { + if (tasks.length >= MAX_TASKS) { + throw new Error('Task limit reached'); + } + }, + } + ); + + wf.setHandler(getTasksQuery, (): string[] => tasks); + + await wf.condition(() => false); +} +``` + + + + +In all SDKs, the validator runs before the Update handler. +If the validator throws an exception, the Update is rejected and the client receives a typed error. +If the validator passes, the Update handler modifies state and returns a typed result. +The Update is recorded in Workflow history before the response is returned to the client. + +## When to use + +The Update pattern is a good fit for request-response patterns requiring immediate confirmation, input validation before accepting work, synchronous state modifications with typed responses, operations requiring strong consistency guarantees, and entity Workflows that need external state Updates. + +It is not a good fit for fire-and-forget operations (use Signals), read-only operations (use Queries), high-throughput scenarios where latency matters (Updates are slower than Signals), or operations that do not need an immediate response. + +## Benefits and trade-offs + +Updates provide a synchronous response — the client receives a typed return value immediately. +Validation failures return as typed exceptions. +The Update is recorded in history before returning, providing strong consistency. +You can modify Workflow state directly from external systems. 
+The trade-offs are as follows: Updates are slower than Signals because they require a history write. +The Update handler blocks Workflow Task execution. +Update handlers consume Workflow Task execution time. +Updates are more complex than Signals for notifications. +Update arguments and return values are limited by the Workflow history event size (typically 2 MB per event). +Each Update adds events to Workflow history, contributing to the 50K event limit. +There is a maximum of 10 in-flight Updates per Workflow execution and a maximum of 2,000 total Updates in Workflow history. + +## Comparison with alternatives + +| Approach | Use case | Response type | Latency | Consistency | +| :--- | :--- | :--- | :--- | :--- | +| Update | Request-response | Sync typed value | Higher | Strong | +| Signal | Fire-and-forget | None | Lower | Eventual | +| Query | Read-only | Sync typed value | Lowest | Eventual | + +## Best practices + +- **Validate early.** Check inputs in the Update validator so that invalid requests are rejected before the Update handler runs. +- **Handle errors.** Throw typed exceptions for validation failures. +- **Return quickly.** Do not perform long operations in the Update handler. +- **Ensure idempotency.** Track processed Update IDs if Updates can be retried. +- **Set timeouts.** Configure appropriate Update timeouts. +- **Maintain state consistency.** Ensure state modifications are atomic within the handler. + +## Common pitfalls + +- **Performing long operations in the Update handler.** Update handlers block Workflow Task execution. Offload long-running work to Activities and use `Workflow.await` in the handler to wait for results. +- **Exceeding the 2,000 total Updates limit.** Each accepted Update adds events to history. Use Continue-As-New before reaching the limit. The server sets `SuggestContinueAsNew` at 90% of the limit. +- **Not setting Update timeouts.** Without a client-side timeout, the caller blocks indefinitely if the Worker is unavailable.
Always set a context timeout or deadline. +- **Ignoring Update ID for deduplication.** Without an Update ID, retried requests create duplicate Updates. Provide a unique `updateId` for idempotency, especially with Update-with-Start. +- **Using Updates for fire-and-forget.** Updates require a Worker to be online and responsive. For fire-and-forget operations, use Signals instead. + +## Related patterns + +- **Signal**: Fire-and-forget state modifications. +- **Query**: Read-only state inspection. +- **[Entity Workflow](/design-patterns/entity-workflow)**: Long-running Workflows representing business entities. +- **[Early Return](/design-patterns/early-return)**: Returning intermediate results before Workflow completion. + +## Sample code + +- [Safe Message Handlers (Python)](https://github.com/temporalio/samples-python/tree/main/message_passing/safe_message_handlers) — Concurrent Update handling with validation. +- [Safe Message Passing (Java)](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/safemessagepassing) — Concurrent Update handling with validation. +- [Update with Start - Shopping Cart (Go)](https://github.com/temporalio/samples-go/tree/main/shoppingcart) — Update-with-Start for lazy initialization. diff --git a/docs/design-patterns/resumable-activity.mdx b/docs/design-patterns/resumable-activity.mdx new file mode 100644 index 0000000000..1da02d0214 --- /dev/null +++ b/docs/design-patterns/resumable-activity.mdx @@ -0,0 +1,563 @@ +--- +id: resumable-activity +title: "Resumable Activity (AKA Pause On Failure)" +sidebar_label: "Resumable Activity" +description: "Park the Workflow after retries are exhausted and wait for a human to signal a correction, then resume execution from where it left off." 
+--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +:::info[TLDR] +After retries are exhausted, **park the Workflow in a waiting state and block on a Signal that notifies the Workflow to proceed or optionally delivers corrected input from a human operator, then re-execute the Activity.** Use this when failures are caused by bad input that can be fixed externally — so the Workflow resumes exactly where it left off instead of being restarted from scratch. +::: + +## Overview + +The Resumable Activity pattern parks a Workflow in a durable waiting state after Activity retries are exhausted, waits for a corrective Signal from a human operator, and then re-executes the Activity with the corrected input. +Use it when failures are caused by bad input data that can be corrected externally — a wrong account number, an invalid reference, a missing record that will be created — and abandoning the Workflow is worse than pausing it. + +## Problem + +When an Activity fails due to bad input, retrying with the same input will never succeed. +The standard options are: + +- **Fail the Workflow immediately.** The client must restart the process from scratch, often re-entering the same data that caused the failure. +- **Mark the error as non-retryable.** Same result — the Workflow fails and the client has no way to inject a correction. +- **Poll for the correction from inside the Workflow.** Wastes resources for a fix that may never come. + +What you actually want is for the Workflow to *pause* — consuming zero resources — until an authorized operator provides corrected data, then *resume* exactly where it left off. +Temporal's durable execution model makes this possible without any external database, queue, or polling mechanism. + +## Solution + +Use a bounded `RetryPolicy` to allow a few automatic retries (in case the failure is transient), then catch the resulting `ActivityError` in the Workflow once those retries are exhausted.
+Transition to an `AWAITING_CORRECTION` state and block on `workflow.wait_condition` (or equivalent). +Register a Signal handler that accepts the corrected input and unblocks the condition. +When the Signal arrives, re-execute the Activity with the corrected input. +A second Signal gates the final approval before completing. + +```mermaid +sequenceDiagram + participant Client + participant Admin + participant Workflow + participant Activity as Transfer Activity + + Client->>+Workflow: Start transfer(from, invalid-account, $500) + Workflow->>Workflow: status = TRANSFERRING + + loop maxAttempts=3 + Workflow->>+Activity: executeTransfer(invalid-account) + Activity-->>-Workflow: Failure — account not found + end + + Note over Workflow: Retries exhausted + Workflow->>Workflow: status = AWAITING_CORRECTION + Note over Workflow: Parked in Temporal — zero cost, no polling + + Admin->>Workflow: Signal: retryWithCorrection("account-123") + Workflow->>Workflow: status = TRANSFERRING + + Workflow->>+Activity: executeTransfer(account-123) + Activity-->>-Workflow: Success + + Workflow->>Workflow: status = AWAITING_APPROVAL + Note over Workflow: Parked again — waiting for client approval + + Client->>Workflow: Signal: approve(true) + Workflow-->>-Client: Transfer completed +``` + +The following describes each step: + +1. The client starts the Workflow with an invalid account number. +2. The Activity fails. Temporal retries automatically up to the configured `maxAttempts`. +3. When retries are exhausted, the Workflow catches the `ActivityError` and sets its status to `AWAITING_CORRECTION`. +4. The Workflow parks itself using `wait_condition` — it consumes no CPU, no polling, no timers. Its state is fully persisted in Temporal. +5. An admin notices the problem (via Temporal UI, an alert, or an operations dashboard) and sends a `retryWithCorrection` Signal with the corrected account number. +6. 
The Workflow wakes up, applies the correction, and re-executes the Activity — which now succeeds. +7. The Workflow transitions to `AWAITING_APPROVAL` and parks again, waiting for the client to approve the transfer. +8. The client sends an `approve` Signal. The Workflow completes and returns the result. + +The key insight: **the Workflow never died**. It survived bad input, waited indefinitely without polling, accepted an external correction, and completed cleanly. Its entire state — status, corrected account, approval decision — is durable in Temporal throughout. + +## Implementation + + +### Workflow with correction and approval signals + +The Workflow maintains state as named fields. +Signal handlers set the fields, and `wait_condition` blocks until they are non-null. + + + + +```python +# workflows.py +from dataclasses import dataclass +from datetime import timedelta +from temporalio import workflow +from temporalio.common import RetryPolicy, SearchAttributeKey +from temporalio.exceptions import ActivityError +import activities + +TRANSFER_STATUS_KEY = SearchAttributeKey.for_keyword("TransferStatus") + +@dataclass +class TransferInput: + from_account: str + to_account: str + amount: float + +@workflow.defn +class TransferWorkflow: + def __init__(self) -> None: + self._status = "PENDING" + self._corrected_account: str | None = None + self._approval: bool | None = None + + @workflow.run + async def run(self, transfer: TransferInput) -> str: + account = transfer.to_account + correction_attempts = 0 + + while True: + self._status = "TRANSFERRING" + try: + result = await workflow.execute_activity( + activities.execute_transfer, + TransferInput(transfer.from_account, account, transfer.amount), + start_to_close_timeout=timedelta(seconds=30), + retry_policy=RetryPolicy(maximum_attempts=3), + ) + break # Activity succeeded — exit the correction loop + except ActivityError: + correction_attempts += 1 + if correction_attempts > 5: + self._status = "FAILED" + 
workflow.upsert_search_attributes([TRANSFER_STATUS_KEY.value_set(self._status)]) + raise + self._status = "AWAITING_CORRECTION" + workflow.upsert_search_attributes([TRANSFER_STATUS_KEY.value_set(self._status)]) + workflow.logger.warning( + "Transfer failed — waiting for account correction", + extra={"to_account": account}, + ) + # Park until the admin sends a correction signal + await workflow.wait_condition( + lambda: self._corrected_account is not None + ) + account = self._corrected_account + self._corrected_account = None + + self._status = "AWAITING_APPROVAL" + await workflow.wait_condition(lambda: self._approval is not None) + + if self._approval: + self._status = "COMPLETED" + return f"Transfer of {transfer.amount} to {account} completed" + self._status = "REJECTED" + return "Transfer rejected by client" + + @workflow.signal + def retry_with_correction(self, corrected_account: str) -> None: + self._corrected_account = corrected_account + + @workflow.signal + def approve(self, approved: bool) -> None: + self._approval = approved + + @workflow.query + def get_status(self) -> str: + return self._status +``` + + + + +```go +// workflow.go +package transfer + +import ( + "fmt" + "time" + + "go.temporal.io/sdk/temporal" + "go.temporal.io/sdk/workflow" +) + +type TransferInput struct { + FromAccount string + ToAccount string + Amount float64 +} + +func TransferWorkflow(ctx workflow.Context, input TransferInput) (string, error) { + status := "PENDING" + if err := workflow.SetQueryHandler(ctx, "getStatus", func() (string, error) { + return status, nil + }); err != nil { + return "", err + } + + correctionCh := workflow.GetSignalChannel(ctx, "retryWithCorrection") + approvalCh := workflow.GetSignalChannel(ctx, "approve") + + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Second, + RetryPolicy: &temporal.RetryPolicy{MaximumAttempts: 3}, + } + actCtx := workflow.WithActivityOptions(ctx, ao) + + account := input.ToAccount + correctionCount := 0 + for { 
+ status = "TRANSFERRING" + err := workflow.ExecuteActivity(actCtx, ExecuteTransfer, TransferInput{ + FromAccount: input.FromAccount, + ToAccount: account, + Amount: input.Amount, + }).Get(actCtx, nil) + + if err == nil { + break // Activity succeeded — exit the correction loop + } + + correctionCount++ + if correctionCount > 5 { // for long correction cycles, consider Continue As New + status = "FAILED" + _ = workflow.UpsertSearchAttributes(ctx, map[string]interface{}{"TransferStatus": status}) + return "", err + } + status = "AWAITING_CORRECTION" + _ = workflow.UpsertSearchAttributes(ctx, map[string]interface{}{"TransferStatus": status}) + workflow.GetLogger(ctx).Warn("Transfer failed — waiting for account correction", + "to_account", account) + + // Park until the admin sends a correction signal + var corrected string + _ = workflow.Await(ctx, func() bool { + return correctionCh.ReceiveAsync(&corrected) + }) + account = corrected + } + + status = "AWAITING_APPROVAL" + var approved bool + _ = workflow.Await(ctx, func() bool { + return approvalCh.ReceiveAsync(&approved) + }) + + if approved { + status = "COMPLETED" + return fmt.Sprintf("Transfer of %.2f to %s completed", input.Amount, account), nil + } + status = "REJECTED" + return "Transfer rejected by client", nil +} +``` + + + + +```java +// TransferWorkflowImpl.java +import io.temporal.activity.ActivityOptions; +import io.temporal.common.RetryOptions; +import io.temporal.common.SearchAttributeKey; +import io.temporal.failure.ActivityFailure; +import io.temporal.workflow.SignalMethod; +import io.temporal.workflow.QueryMethod; +import io.temporal.workflow.WorkflowInterface; +import io.temporal.workflow.WorkflowMethod; +import io.temporal.workflow.Workflow; +import java.time.Duration; + +@WorkflowInterface +public interface TransferWorkflow { + @WorkflowMethod + String run(TransferInput input); + + @SignalMethod + void retryWithCorrection(String correctedAccount); + + @SignalMethod + void approve(boolean 
approved); + + @QueryMethod + String getStatus(); +} + +public class TransferWorkflowImpl implements TransferWorkflow { + private String status = "PENDING"; + private String correctedAccount; + private Boolean approval; + + private final TransferActivities activities = Workflow.newActivityStub( + TransferActivities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(30)) + .setRetryOptions(RetryOptions.newBuilder() + .setMaximumAttempts(3) + .build()) + .build() + ); + + @Override + public String run(TransferInput input) { + String account = input.getToAccount(); + int correctionCount = 0; + + while (true) { + status = "TRANSFERRING"; + try { + activities.executeTransfer( + new TransferInput(input.getFromAccount(), account, input.getAmount()) + ); + break; // Activity succeeded — exit the correction loop + } catch (ActivityFailure e) { + correctionCount++; + if (correctionCount > 5) { // for long correction cycles, consider Continue As New + status = "FAILED"; + Workflow.upsertTypedSearchAttributes( + SearchAttributeKey.forKeyword("TransferStatus").valueSet(status) + ); + throw e; + } + status = "AWAITING_CORRECTION"; + Workflow.upsertTypedSearchAttributes( + SearchAttributeKey.forKeyword("TransferStatus").valueSet(status) + ); + Workflow.getLogger(getClass()).warn( + "Transfer failed — waiting for account correction: " + account + ); + // Park until the admin sends a correction signal + Workflow.await(() -> correctedAccount != null); + account = correctedAccount; + correctedAccount = null; + } + } + + status = "AWAITING_APPROVAL"; + Workflow.await(() -> approval != null); + + if (approval) { + status = "COMPLETED"; + return String.format("Transfer of %.2f to %s completed", input.getAmount(), account); + } + status = "REJECTED"; + return "Transfer rejected by client"; + } + + @Override + public void retryWithCorrection(String account) { + this.correctedAccount = account; + } + + @Override + public void approve(boolean decision) { + 
this.approval = decision; + } + + @Override + public String getStatus() { + return status; + } +} +``` + + + + +```typescript +// workflows.ts +import * as wf from '@temporalio/workflow'; +import type * as activities from './activities'; + +export interface TransferInput { + fromAccount: string; + toAccount: string; + amount: number; +} + +export const retryWithCorrectionSignal = wf.defineSignal<[string]>('retryWithCorrection'); +export const approveSignal = wf.defineSignal<[boolean]>('approve'); +export const getStatusQuery = wf.defineQuery('getStatus'); + +const { executeTransfer } = wf.proxyActivities({ + startToCloseTimeout: '30s', + retry: { maximumAttempts: 3 }, +}); + +export async function transferWorkflow(input: TransferInput): Promise { + let status = 'PENDING'; + let correctedAccount: string | undefined; + let approval: boolean | undefined; + + wf.setHandler(retryWithCorrectionSignal, (account: string) => { + correctedAccount = account; + }); + wf.setHandler(approveSignal, (decision: boolean) => { + approval = decision; + }); + wf.setHandler(getStatusQuery, () => status); + + let account = input.toAccount; + let correctionCount = 0; + + while (true) { + status = 'TRANSFERRING'; + try { + await executeTransfer({ ...input, toAccount: account }); + break; // Activity succeeded — exit the correction loop + } catch (err) { + correctionCount++; + if (correctionCount > 5) { + status = 'FAILED'; + wf.upsertSearchAttributes({ TransferStatus: [status] }); + throw err; + } + status = 'AWAITING_CORRECTION'; + wf.upsertSearchAttributes({ TransferStatus: [status] }); + wf.log.warn('Transfer failed — waiting for account correction', { account }); + // Park until the admin sends a correction signal + await wf.condition(() => correctedAccount !== undefined); + account = correctedAccount!; + correctedAccount = undefined; + } + } + + status = 'AWAITING_APPROVAL'; + await wf.condition(() => approval !== undefined); + + if (approval) { + status = 'COMPLETED'; + return 
`Transfer of ${input.amount} to ${account} completed`; + } + status = 'REJECTED'; + return 'Transfer rejected by client'; +} +``` + + + + +### Sending signals + +An operator sends the correction Signal using the Temporal CLI or any SDK client. +The Workflow wakes immediately when the Signal is delivered. + +```bash +# Correct the account number +temporal workflow signal \ + --workflow-id transfer-wf-001 \ + --name retryWithCorrection \ + --input '"account-123"' + +# Approve the transfer +temporal workflow signal \ + --workflow-id transfer-wf-001 \ + --name approve \ + --input 'true' +``` + +### Activity implementation + +The `executeTransfer` Activity must distinguish between permanent failures — such as an invalid account number — and transient failures that Temporal should retry automatically. +Throw a non-retryable `ApplicationFailure` for permanent input errors so the Workflow catches the `ActivityError` immediately and transitions to `AWAITING_CORRECTION` instead of exhausting all retry attempts first. +Let all other exceptions propagate so the RetryPolicy handles transient failures. + + + + +```python +# activities.py +from temporalio import activity +from temporalio.exceptions import ApplicationError + +@activity.defn +async def execute_transfer(transfer: TransferInput) -> str: + # Non-retryable: bad account number requires a human correction, not a retry. + if not await account_service.exists(transfer.to_account): + raise ApplicationError( + f"Account {transfer.to_account} not found", + type="AccountNotFoundError", + non_retryable=True, + ) + # Other exceptions propagate as retryable so the RetryPolicy handles them. 
+ return await payment_service.transfer( + transfer.from_account, transfer.to_account, transfer.amount + ) +``` + + + + +```typescript +// activities.ts +import { ApplicationFailure } from '@temporalio/activity'; +import type { TransferInput } from './workflows'; + +export async function executeTransfer(transfer: TransferInput): Promise { + // Non-retryable: bad account number requires a human correction, not a retry. + const accountExists = await accountService.exists(transfer.toAccount); + if (!accountExists) { + throw ApplicationFailure.nonRetryable( + `Account ${transfer.toAccount} not found`, + 'AccountNotFoundError', + ); + } + // Other exceptions propagate as retryable so the RetryPolicy handles them. + return paymentService.transfer(transfer.fromAccount, transfer.toAccount, transfer.amount); +} +``` + + + + +## State Diagram + +The Workflow transitions through a well-defined set of states. +Query `getStatus` at any time to observe the current state. + +```mermaid +stateDiagram-v2 + [*] --> PENDING + PENDING --> TRANSFERRING : Workflow starts + TRANSFERRING --> AWAITING_CORRECTION : Activity retries exhausted + AWAITING_CORRECTION --> TRANSFERRING : retryWithCorrection signal received + TRANSFERRING --> FAILED : 5 correction attempts exceeded + TRANSFERRING --> AWAITING_APPROVAL : Activity succeeds + AWAITING_APPROVAL --> COMPLETED : approve(true) signal + AWAITING_APPROVAL --> REJECTED : approve(false) signal + COMPLETED --> [*] + REJECTED --> [*] + FAILED --> [*] +``` + +## Best practices + +- **Use a bounded `MaximumAttempts` before parking.** Allow a few automatic retries to recover from transient failures. Parking immediately on the first failure forces operators to intervene for problems that would have resolved on their own. 
+- **Manage history growth in long-running correction loops.** Each correction cycle — park, receive signal, re-execute Activity — adds events to the Workflow history (signal received, state transitions, Activity scheduled/completed). For workflows that may receive many corrections over time, use [Continue-As-New](/design-patterns/continue-as-new) to carry the current state into a fresh execution before the history grows too large, rather than relying solely on an arbitrary correction counter. +- **Expose status via a Query method.** The `getStatus` Query gives operations tooling visibility into where the Workflow is parked without requiring access to the Workflow history. +- **Validate the correction in the Signal handler.** Check that the corrected account is non-empty and matches the expected format before setting the state. An invalid correction just parks the Workflow again, but a clear error message helps operators. +- **Log and record state at every transition.** The `AWAITING_CORRECTION` and `AWAITING_APPROVAL` states can last hours or days. Structured log lines at each transition make the audit trail clear. For operational visibility, also update a [Search Attribute](https://docs.temporal.io/visibility) at each transition (for example, a `Keyword` attribute storing the current status) so operators can filter and query workflows by state directly from the Temporal UI or CLI. +- **Notify the operator proactively.** The `AWAITING_CORRECTION` transition is a good point to send an alert — an email, a Slack message, or a ticket — rather than waiting for the operator to notice in the Temporal UI. +- **Distinguish this from the Approval pattern.** The [Approval](/design-patterns/approval) pattern gates forward progress on a human decision. This pattern recovers from failure with a human-supplied data correction. Both use Signals and `wait_condition`, but serve different roles in a process. 
+ +## Common pitfalls + +- **Waiting without a timeout.** If operators never send the correction Signal, the Workflow waits indefinitely. Add a durable timer if the process must resolve within a time bound. +- **Not clearing the correction state before re-entering the loop.** After applying the correction, set `corrected_account = None` (or equivalent) before the next Activity attempt. Otherwise, if the Activity also fails with the corrected input, the Workflow immediately re-uses the previous correction instead of waiting for a new one. +- **Accepting corrections in the wrong state.** If a Signal arrives while the Activity is running (not parked), the correction should be queued and applied after the current attempt completes. The Signal handler always runs — the condition check (`wait_condition`) determines when the Workflow acts on it. +- **Conflating the correction loop with a general retry loop.** This pattern is for correcting *input data*. For retrying the same call against a temporarily unavailable system, use [Fast/Slow Retries](/design-patterns/fast-slow-retries) instead. + +## Related patterns + +- [Approval](/design-patterns/approval): Human-in-the-loop gate for forward progress rather than failure recovery. +- [Non-Retryable Errors](/design-patterns/non-retryable-errors): Fail immediately without parking when the error is structural and no correction is expected. +- [Fast/Slow Retries](/design-patterns/fast-slow-retries): Infinite patient retries when the downstream system is temporarily unavailable. +- [Signal with Start](/design-patterns/signal-with-start): Start the Workflow and send the correction Signal atomically. +- [Error Handling & Retry Patterns](/design-patterns/error-handling-patterns): Overview and decision tree for all retry patterns. 
diff --git a/docs/design-patterns/retry-metrics.mdx b/docs/design-patterns/retry-metrics.mdx new file mode 100644 index 0000000000..1d1197fa3d --- /dev/null +++ b/docs/design-patterns/retry-metrics.mdx @@ -0,0 +1,375 @@ +--- +id: retry-metrics +title: "Retry Alerting via Metrics" +sidebar_label: "Retry Alerting via Metrics" +description: "Emit a custom metric from inside the Activity when the attempt count crosses a threshold, surfacing silent persistent failures before an SLA breach." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +:::info[TLDR] +Emit a counter metric from inside the Activity when the attempt number exceeds a threshold, using the SDK's built-in metrics scope. **Use this to surface silent, persistent failures to on-call teams before they breach an SLA** — without changing retry behavior or adding Workflow-level tracking. +::: + +## Overview + +The Retry Alerting via Metrics pattern emits a custom metric counter from inside the Activity whenever the attempt number exceeds a threshold. +Use it to surface silent, persistent failures to on-call teams before they breach an SLA — without modifying retry behavior or adding Workflow-level tracking. + +## Problem + +When an Activity retries indefinitely, failures are invisible at the system level until something breaks. +The Temporal UI shows the current attempt number, but on-call teams do not watch the UI continuously. +Without a metric or alert, a downstream system can be down for hours while the Workflow keeps retrying silently — and the first sign of a problem is an SLA breach or a user complaint. + +Common gaps: + +- A payment gateway is down. Workflows are retrying every 5 minutes. No alert fires until the merchant escalates. +- An email provider is rejecting requests. Activities are on attempt 50. No metric has been emitted. On-call has no signal. +- A third-party API degraded. Retries are accumulating. 
The engineering team learns about the problem from a customer, not from their own alerting. + +## Solution + +Read the current attempt number from the Activity execution context and emit a counter metric when it exceeds a threshold. +The metric is sent through the Temporal SDK's built-in metrics scope — the same pipeline used for SDK-internal metrics — so it flows to whatever metrics backend your Workers are already configured to use (Prometheus, StatsD, etc.) without additional setup. + +```mermaid +sequenceDiagram + participant Temporal as Temporal Service + participant Activity + participant Metrics as Metrics Backend + + loop Each retry attempt + Temporal->>+Activity: Execute (attempt N) + Activity->>Activity: Check attempt number + alt attempt > threshold (e.g. 5) + Activity->>Metrics: increment high_activity_error_count + end + Activity-->>-Temporal: Failure + Note over Temporal: Wait backoff interval + end + Note over Metrics: Alert fires when counter crosses threshold +``` + +The following describes each step: + +1. Temporal executes the Activity, passing the current attempt number in the execution context. +2. The Activity checks whether the attempt number exceeds the threshold. +3. If it does, the Activity increments a counter metric using the SDK's built-in metrics scope. +4. On failure, Temporal waits the backoff interval and retries. +5. The metrics backend accumulates the counter. Your alerting system fires when the counter or rate crosses a configured threshold. + +## Implementation + + +### Emitting a counter at high attempt counts + +Read the attempt number from the Activity info and emit a counter through the SDK metrics scope. +Configure the retry policy separately — the metric emission does not change retry behavior. 
+ + + + +```python +# activities.py +from temporalio import activity +from temporalio.exceptions import ApplicationError + +ALERT_THRESHOLD = 5 + +@activity.defn +async def call_downstream_service(endpoint: str) -> str: + info = activity.info() + + if info.attempt > ALERT_THRESHOLD: + meter = activity.metric_meter() + meter.create_counter( + "high_activity_error_count", + "Activity has exceeded the failure attempt threshold", + ).add(1) + + # Attempt the actual work — raises on failure, triggering a retry + response = await downstream.call(endpoint) + return response.data +``` + + + + +```go +// activities.go +package downstream + +import ( + "context" + + "go.temporal.io/sdk/activity" +) + +const alertThreshold = 5 + +func CallDownstreamService(ctx context.Context, endpoint string) (string, error) { + info := activity.GetInfo(ctx) + + if info.Attempt > alertThreshold { + activity.GetMetricsHandler(ctx). + Counter("high_activity_error_count"). + Inc(1) + } + + // Attempt the actual work — returns an error on failure, triggering a retry + response, err := downstream.Call(endpoint) + if err != nil { + return "", err + } + return response.Data, nil +} +``` + + + + +```java +// CallDownstreamActivityImpl.java +import io.temporal.activity.Activity; +import io.temporal.activity.ActivityExecutionContext; + +public class CallDownstreamActivityImpl implements CallDownstreamActivity { + private static final int ALERT_THRESHOLD = 5; + + @Override + public String callDownstreamService(String endpoint) { + ActivityExecutionContext ctx = Activity.getExecutionContext(); + + if (ctx.getInfo().getAttempt() > ALERT_THRESHOLD) { + ctx.getMetricsScope() + .counter("HighActivityErrorCount") + .inc(1); + } + + // Attempt the actual work — throws on failure, triggering a retry + return downstream.call(endpoint).getData(); + } +} +``` + + + + +```typescript +// activities.ts +import { Context } from '@temporalio/activity'; + +const ALERT_THRESHOLD = 5; + +export async function 
callDownstreamService(endpoint: string): Promise { + const ctx = Context.current(); + + if (ctx.info.attempt > ALERT_THRESHOLD) { + ctx.metricMeter + .createCounter('high_activity_error_count') + .add(1); + } + + // Attempt the actual work — throws on failure, triggering a retry + const response = await downstream.call(endpoint); + return response.data; +} +``` + + + + +### Workflow configuration + +Configure the Activity in the Workflow with the desired retry policy. +The metric emission inside the Activity is independent of the retry configuration. + + + + +```python +# workflows.py +from datetime import timedelta +from temporalio import workflow +from temporalio.common import RetryPolicy +import activities + +@workflow.defn +class MonitoredRetryWorkflow: + @workflow.run + async def run(self, endpoint: str) -> str: + return await workflow.execute_activity( + activities.call_downstream_service, + endpoint, + start_to_close_timeout=timedelta(seconds=30), + retry_policy=RetryPolicy( + initial_interval=timedelta(seconds=5), + backoff_coefficient=2.0, + maximum_interval=timedelta(minutes=5), + # No maximum_attempts — retries indefinitely until success + ), + ) +``` + + + + +```go +// workflow.go +func MonitoredRetryWorkflow(ctx workflow.Context, endpoint string) (string, error) { + ao := workflow.ActivityOptions{ + StartToCloseTimeout: 30 * time.Second, + RetryPolicy: &temporal.RetryPolicy{ + InitialInterval: 5 * time.Second, + BackoffCoefficient: 2.0, + MaximumInterval: 5 * time.Minute, + // No MaximumAttempts — retries indefinitely until success + }, + } + ctx = workflow.WithActivityOptions(ctx, ao) + + var result string + err := workflow.ExecuteActivity(ctx, CallDownstreamService, endpoint).Get(ctx, &result) + return result, err +} +``` + + + + +```java +// MonitoredRetryWorkflowImpl.java +public class MonitoredRetryWorkflowImpl implements MonitoredRetryWorkflow { + private final CallDownstreamActivity activities = Workflow.newActivityStub( + 
CallDownstreamActivity.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(30)) + .setRetryOptions(RetryOptions.newBuilder() + .setInitialInterval(Duration.ofSeconds(5)) + .setBackoffCoefficient(2.0) + .setMaximumInterval(Duration.ofMinutes(5)) + // No setMaximumAttempts — retries indefinitely until success + .build()) + .build() + ); + + @Override + public String run(String endpoint) { + return activities.callDownstreamService(endpoint); + } +} +``` + + + + +```typescript +// workflows.ts +import * as wf from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { callDownstreamService } = wf.proxyActivities({ + startToCloseTimeout: '30s', + retry: { + initialInterval: '5s', + backoffCoefficient: 2, + maximumInterval: '5m', + // No maximumAttempts — retries indefinitely until success + }, +}); + +export async function monitoredRetryWorkflow(endpoint: string): Promise { + return await callDownstreamService(endpoint); +} +``` + + + + +### Adding dimension labels to the metric + +Add labels (tags) to the metric to identify which Activity type, endpoint, or Workflow is producing the high attempt counts. +This makes the metric actionable in dashboards and alerts. + + + + +```python +# activities.py +if info.attempt > ALERT_THRESHOLD: + meter = activity.metric_meter() + meter.create_counter( + "high_activity_error_count", + "Activity has exceeded the failure attempt threshold", + ).add(1, {"activity_type": info.activity_type, "endpoint": endpoint}) +``` + + + + +```go +// activities.go +if info.Attempt > alertThreshold { + activity.GetMetricsHandler(ctx). + WithTags(map[string]string{ + "activity_type": info.ActivityType.Name, + "endpoint": endpoint, + }). + Counter("high_activity_error_count"). 
+ Inc(1) +} +``` + + + + +```java +// CallDownstreamActivityImpl.java +if (ctx.getInfo().getAttempt() > ALERT_THRESHOLD) { + ctx.getMetricsScope() + .tagged(ImmutableMap.of( + "activity_type", ctx.getInfo().getActivityType(), + "endpoint", endpoint + )) + .counter("HighActivityErrorCount") + .inc(1); +} +``` + + + + +```typescript +// activities.ts +if (ctx.info.attempt > ALERT_THRESHOLD) { + ctx.metricMeter + .createCounter('high_activity_error_count') + .add(1, { activity_type: ctx.info.activityType, endpoint }); +} +``` + + + + +## Best practices + +- **Choose a threshold above normal transient noise.** If your downstream system occasionally has 1–2 retry attempts under normal conditions, set the threshold at 5 or 10 so the metric only fires for genuinely sustained failures. +- **Emit on every attempt above the threshold, not just once.** Incrementing the counter on each high-attempt invocation allows alerting systems to detect both the onset and the duration of a problem by watching the counter rate. +- **Use the SDK metrics scope, not a third-party library.** The SDK scope integrates with your Worker's existing metrics pipeline and adds default labels such as namespace and task queue automatically. +- **Set up rate-based alerts, not count-based.** A count alert requires resetting or remembering the baseline. A rate alert (e.g., "more than 3 increments per minute") fires when the problem is active and clears when it resolves. +- **Combine with Fast/Slow Retries.** Emit the metric in the slow-phase Activity of a [Fast/Slow Retries](/design-patterns/fast-slow-retries) pattern to alert when the Workflow has been in the slow phase long enough to be a concern. + +## Common pitfalls + +- **Emitting the metric in the Workflow instead of the Activity.** The Workflow does not have access to the Activity's attempt number without passing it explicitly. The Activity context always has the current attempt number — use it there. 
+- **Alerting on the total counter value instead of the rate.** If the counter is cumulative, a single high-attempt event in the past will keep the counter elevated forever. Alert on the increment rate (events per minute) rather than the absolute count. +- **Not resetting alerting context on success.** If the Activity eventually succeeds after 50 attempts, the high-attempt metric has already fired. Ensure your alerting system can resolve the alert when the metric rate drops to zero. +- **Setting the threshold too low.** A threshold of 1 means the metric fires on the very first retry — which is normal behavior. Calibrate the threshold to your system's expected transient error rate. + +## Related patterns + +- [Fast/Slow Retries](/design-patterns/fast-slow-retries): Combine by emitting this metric inside the slow-phase Activity to alert when patient waiting has gone on too long. +- [Fixed Count of Retries](/design-patterns/fixed-count-retries): Cap attempts at a fixed number instead of alerting at a threshold. +- [Error Handling & Retry Patterns](/design-patterns/error-handling-patterns): Overview and decision tree for all retry patterns. diff --git a/docs/design-patterns/saga-pattern.mdx b/docs/design-patterns/saga-pattern.mdx new file mode 100644 index 0000000000..e43b427439 --- /dev/null +++ b/docs/design-patterns/saga-pattern.mdx @@ -0,0 +1,357 @@ +--- +id: saga-pattern +title: Saga Pattern +sidebar_label: Saga Pattern +description: Manages distributed transactions with compensating actions. Each step has a compensation that undoes its effects if subsequent steps fail. +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +The Saga pattern manages distributed transactions across multiple services by coordinating a sequence of local transactions, each with a compensating action that can undo its effects if subsequent steps fail. 
+ +## Problem + +In distributed systems, you need to maintain data consistency across multiple services or databases without using traditional ACID transactions. +When a multi-step business process fails partway through, you must undo the effects of completed steps to maintain system consistency. +Traditional two-phase commit does not scale well and creates tight coupling between services. + +## Solution + +You implement each step as a local transaction with a corresponding compensation transaction. +If any step fails, you execute compensation transactions in reverse order to undo the effects of all completed steps. +You register a compensation for each step — before the step runs when a partial failure could leave side effects behind, as in the example below — then automatically trigger the registered compensations when an error occurs to ensure cleanup happens reliably. + +The following diagram shows the worked example: opening a customer account in four steps, where `addBankAccount` simulates a downstream failure to trigger compensation. + +```mermaid +flowchart TD + Start([Start Saga]) --> Step1[Step 1: createAccount] + Step1 -->|Success| Step2[Step 2: addAddress] + Step1 -->|Failure| End([End: Failed]) + + Step2 -->|Success| Step3[Step 3: addClient] + Step2 -->|Failure| Comp1[clearPostalAddresses] + + Step3 -->|Success| Step4[Step 4: addBankAccount] + Step3 -->|Failure| Comp2[removeClient] + + Step4 -->|Success| Complete([End: Success]) + Step4 -->|Failure| Comp3[disconnectBankAccounts] + + Comp3 --> Comp2 + Comp2 --> Comp1 + Comp1 --> End + classDef success stroke-width:1px + classDef compensation stroke-width:1px + classDef complete stroke-width:1px + classDef fail stroke-width:1px + class Step1,Step2,Step3,Step4 success + class Comp1,Comp2,Comp3 compensation + class Complete complete + class End fail +``` + +The following describes each step in the diagram: + +1. The Saga begins by executing Step 1 (`createAccount`). +2. If Step 1 succeeds, the Workflow proceeds to Step 2 (`addAddress`). If it fails, the Saga ends immediately — no compensations are registered yet. +3. 
If Step 2 succeeds, the Workflow proceeds to Step 3 (`addClient`). If it fails, the Workflow runs `clearPostalAddresses`. +4. If Step 3 succeeds, the Workflow proceeds to Step 4 (`addBankAccount`). If it fails, the Workflow runs `removeClient`, then `clearPostalAddresses`. +5. If Step 4 succeeds, the Saga completes. If it fails, the Workflow runs all three compensations in reverse: `disconnectBankAccounts`, `removeClient`, `clearPostalAddresses`. Note that `disconnectBankAccounts` is registered before `addBankAccount` runs, so it executes even if `addBankAccount` failed mid-flight — its implementation must be idempotent. + +## Implementation + +The following examples show how each SDK implements the Saga pattern. +Each language uses a different mechanism to register and execute compensations, but the core principle is the same: register a compensation before or after each step, and run all compensations in reverse order on failure. + + + + +```python +# workflows.py +from temporalio import workflow + +@workflow.defn +class OpenAccountWorkflow: + @workflow.run + async def run(self, req: OpenAccountRequest) -> str: + compensations = [] + + try: + # Step 1: createAccount has no compensation — leaving an empty + # account stub on later failure is acceptable. 
+ await workflow.execute_activity( + create_account, req, + start_to_close_timeout=timedelta(seconds=10), + ) + + # Register compensation for Step 2 BEFORE execution + compensations.append( + lambda: workflow.execute_activity( + clear_postal_addresses, req, + start_to_close_timeout=timedelta(seconds=10), + ) + ) + # Step 2: Add postal address + await workflow.execute_activity( + add_address, req, + start_to_close_timeout=timedelta(seconds=10), + ) + + # Register compensation for Step 3 BEFORE execution + compensations.append( + lambda: workflow.execute_activity( + remove_client, req, + start_to_close_timeout=timedelta(seconds=10), + ) + ) + # Step 3: Add client record + await workflow.execute_activity( + add_client, req, + start_to_close_timeout=timedelta(seconds=10), + ) + + # Register compensation for Step 4 BEFORE execution + compensations.append( + lambda: workflow.execute_activity( + disconnect_bank_accounts, req, + start_to_close_timeout=timedelta(seconds=10), + ) + ) + # Step 4: Link bank account (this step fails in the demo) + await workflow.execute_activity( + add_bank_account, req, + start_to_close_timeout=timedelta(seconds=10), + ) + except Exception: + # On error, run compensations in reverse order + for compensation in reversed(compensations): + await compensation() + raise +``` + + + + +```go +// open_account_workflow.go +func OpenAccountWorkflow(ctx workflow.Context, req OpenAccountRequest) error { + var compensations []func() + runCompensations := func() { + for i := len(compensations) - 1; i >= 0; i-- { + compensations[i]() + } + } + + // Step 1: CreateAccount has no compensation — leaving an empty account + // stub on later failure is acceptable. 
+ if err := workflow.ExecuteActivity(ctx, CreateAccount, req).Get(ctx, nil); err != nil { + return err + } + + // Register compensation for Step 2 BEFORE execution + compensations = append(compensations, func() { + _ = workflow.ExecuteActivity(ctx, ClearPostalAddresses, req).Get(ctx, nil) + }) + if err := workflow.ExecuteActivity(ctx, AddAddress, req).Get(ctx, nil); err != nil { + runCompensations() + return err + } + + // Register compensation for Step 3 BEFORE execution + compensations = append(compensations, func() { + _ = workflow.ExecuteActivity(ctx, RemoveClient, req).Get(ctx, nil) + }) + if err := workflow.ExecuteActivity(ctx, AddClient, req).Get(ctx, nil); err != nil { + runCompensations() + return err + } + + // Register compensation for Step 4 BEFORE execution + compensations = append(compensations, func() { + _ = workflow.ExecuteActivity(ctx, DisconnectBankAccounts, req).Get(ctx, nil) + }) + if err := workflow.ExecuteActivity(ctx, AddBankAccount, req).Get(ctx, nil); err != nil { + runCompensations() + return err + } + + return nil +} +``` + + + + +```java +// OpenAccountWorkflow.java +@WorkflowInterface +public interface OpenAccountWorkflow { + @WorkflowMethod + String openAccount(OpenAccountRequest req); +} + +public class OpenAccountWorkflowImpl implements OpenAccountWorkflow { + private final Activities activities = Workflow.newActivityStub( + Activities.class, + ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(10)) + .build()); + + @Override + public String openAccount(OpenAccountRequest req) { + // Create a Saga instance with compensation options + Saga saga = new Saga(new Saga.Options.Builder() + .setParallelCompensation(false) // Run compensations sequentially + .build()); + + try { + // Step 1: createAccount has no compensation — leaving an empty + // account stub on later failure is acceptable. 
+ activities.createAccount(req); + + // Register compensation for Step 2 BEFORE execution + saga.addCompensation(activities::clearPostalAddresses, req); + activities.addAddress(req); + + // Register compensation for Step 3 BEFORE execution + saga.addCompensation(activities::removeClient, req); + activities.addClient(req); + + // Register compensation for Step 4 BEFORE execution + saga.addCompensation(activities::disconnectBankAccounts, req); + activities.addBankAccount(req); + + return "Account " + req.accountId() + " opened"; + + } catch (Exception e) { + // On any error, run all registered compensations in reverse order + saga.compensate(); + throw e; + } + } +} +``` + + + + +```typescript +// workflows.ts +type Compensation = () => Promise<void>; + +export async function openAccount(req: OpenAccountRequest): Promise<string> { + const compensations: Compensation[] = []; + + try { + // Step 1: createAccount has no compensation — leaving an empty account + // stub on later failure is acceptable. + await acts.createAccount(req); + + // Register compensation for Step 2 BEFORE execution + compensations.unshift(() => acts.clearPostalAddresses(req)); + await acts.addAddress(req); + + // Register compensation for Step 3 BEFORE execution + compensations.unshift(() => acts.removeClient(req)); + await acts.addClient(req); + + // Register compensation for Step 4 BEFORE execution + compensations.unshift(() => acts.disconnectBankAccounts(req)); + await acts.addBankAccount(req); + + return `Account ${req.accountId} opened`; + } catch (err) { + // On error, run all compensations in reverse order (unshift keeps them in LIFO already) + for (const compensate of compensations) { + await compensate(); + } + throw err; + } +} +``` + + + + +The key differences between SDKs are: + +- **Go**: Uses a slice of closures and iterates from the end on error. (Some samples use `defer` instead — both achieve LIFO; the slice form makes the rollback trigger explicit.) 
+- **Python**: Uses a list with `reversed()` to iterate compensations in LIFO order on error. +- **TypeScript**: Uses an array with `unshift()` to maintain LIFO order, and manually iterates on error. +- **Java**: Uses the SDK's `Saga` helper to track compensations and trigger them with `saga.compensate()`. + +In all SDKs, compensations are registered before Activity execution and run in reverse order of registration. +All compensations must be idempotent and able to handle cases where the forward Activity never executed. + +### When to register compensations + +There are two approaches for when to register compensation Activities: + +1. **Register before Activity execution** (recommended for safety): This ensures the compensation runs even if the Activity fails after partial completion. For example, a credit card may be charged but the Activity fails before returning success. The compensation must be idempotent and handle cases where the forward Activity never executed (no-op). This is the safer default when Activities have side effects that may occur before failure. + +2. **Register after Activity execution** (appropriate when safe): This only compensates Activities that completed successfully. The compensation logic is simpler because you do not need to check whether the forward action occurred. This approach is appropriate when Activities are truly atomic (all-or-nothing). The risk is partial completion without compensation if the Activity fails mid-execution. + +The choice depends on your Activity's failure characteristics and whether the compensation can safely handle cases where the forward Activity never executed. +When in doubt, register compensations before execution and ensure they are idempotent. 
+ +## When to use + +The Saga pattern is a good fit when you need to maintain consistency across multiple services or databases, traditional distributed transactions (two-phase commit) are too slow or unavailable, you can define compensating actions for each step in your business process, eventual consistency is acceptable for your use case, and you need to handle long-running transactions that may span hours or days. + +It is not a good fit for operations that require strong ACID consistency, single-service transactions that can use a local database transaction, processes where compensations cannot be defined, or operations that must appear atomic to external observers. + +## Benefits and trade-offs + +The Saga pattern maintains eventual consistency without distributed locks, and each service can use its own database and transaction model. +Temporal's durable execution guarantees that compensations will execute even after Worker failures. +The pattern scales better than two-phase commit protocols. + +The trade-offs to consider are that only eventual consistency is provided — intermediate states are visible to other processes. +You must design idempotent compensation Activities, and compensation logic must be maintained alongside forward logic. +Some operations may not have meaningful compensations. + +## Comparison with alternatives + +| Approach | Consistency | Rollback mechanism | Coupling | Scalability | +| :--- | :--- | :--- | :--- | :--- | +| Saga (orchestration) | Eventual | Compensating transactions | Loose | High | +| Two-phase commit | Strong (ACID) | Distributed lock/rollback | Tight | Low | +| Saga (choreography) | Eventual | Event-driven compensations | Very loose | High | +| Local transaction | Strong (ACID) | Database rollback | None | Single service | + +## Best practices + +- **Make all compensations idempotent.** Compensations may run even when the forward Activity never executed (if registered before execution) or may run multiple times on retry. 
Use idempotency keys to ensure safe re-execution. +- **Register compensations before Activity execution.** This ensures cleanup runs even if the Activity fails after partial completion. The compensation must handle the case where the forward action never occurred (no-op). +- **Use idempotency keys for forward Activities.** Pass a unique identifier (such as a client ID or Workflow ID) to each Activity so retries do not create duplicate side effects. +- **Set StartToCloseTimeout on compensation Activities.** Set a `StartToCloseTimeout` but avoid `ScheduleToCloseTimeout` on compensations. Do not set Workflow-level timeouts — let compensations retry until they succeed. +- **Use a disconnected context for cancellation compensation.** In Go, use `NewDisconnectedContext` to run compensation Activities after Workflow cancellation, since the original context is already cancelled. +- **Keep compensation payloads small.** Pass references (IDs, URLs) instead of full data objects to avoid exceeding the 2 MB payload limit. +- **Log compensation failures but continue.** If a compensation fails, log the error and continue executing remaining compensations. In production, alert for manual intervention on persistent compensation failures. +- **Re-throw the original error after compensating.** Always re-throw the original exception after running compensations so the Workflow reports the correct failure reason. + +## Common pitfalls + +- **Non-idempotent compensations.** Compensations may run even when the forward Activity never executed (if registered before execution) or may run multiple times on retry. All compensations must be idempotent. +- **Forgetting to register a compensation.** If a step succeeds but its compensation was never registered, a later failure leaves that step's effects permanently in place. +- **Compensations that can fail permanently.** If a compensation Activity fails with a non-retryable error, the Saga cannot fully roll back. 
Design compensations with generous retry policies. +- **Large payloads in compensation state.** Passing large objects through the compensation chain can exceed the 2 MB payload limit. Use references (IDs, URLs) instead of full data. +- **Swallowing the ContinueAsNew exception in TypeScript.** In TypeScript, `continueAsNew` works by throwing a special exception. A `catch` block that does not re-throw it, or a `finally` block that returns a value, silently prevents Continue-As-New. + +## Related patterns + +- **Retry Policies**: Often combined with the Saga pattern to handle transient failures before compensating. +- **[Child Workflows](/design-patterns/child-workflows)**: You can use Child Workflows to organize complex Sagas with multiple sub-Sagas. +- **[Long-Running Activity](/design-patterns/long-running-activity)**: Heartbeats work well with long-running compensation Activities. +- **[Early Return](/design-patterns/early-return)**: You can combine Early Return with the Saga pattern to return initialization results before compensation runs. + +## Sample code + +- [Go Sample](https://github.com/temporalio/samples-go/tree/main/saga) — Saga with `defer`-based compensations. +- [Java Sample](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/hello/HelloSaga.java) — Saga with the `Saga` API. +- [TypeScript Sample](https://github.com/temporalio/samples-typescript/tree/main/saga) — Saga with array-based compensations. +- [Python Sample](https://github.com/temporalio/samples-python) — Saga with list-based compensations. diff --git a/docs/design-patterns/signal-with-start.mdx b/docs/design-patterns/signal-with-start.mdx new file mode 100644 index 0000000000..ecd81acc7f --- /dev/null +++ b/docs/design-patterns/signal-with-start.mdx @@ -0,0 +1,325 @@ +--- +id: signal-with-start +title: "Signal with Start Pattern" +sidebar_label: "Signal with Start" +description: "Starts a Workflow when Signaling it if it does not already exist. 
If already running, it receives the Signal directly." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +Signal with Start is a pattern that lazily creates Workflows when Signaling them. +If the Workflow is already running, it receives the Signal; if not, the Workflow starts first and then receives the Signal. +This enables entity Workflows that only exist when needed and can receive operations throughout their lifetime. + +## Problem + +In distributed systems, you often need Workflows that represent long-lived entities (accounts, shopping carts, user sessions), consume events from streams (Kafka, SQS) and trigger certain behaviors of an aggregate or entity, receive multiple operations over time, should only exist when there is work to do, and need to handle the first operation without special client logic. + +Without Signal with Start, clients must check if the Workflow exists before Signaling, start the Workflow if it does not exist and then Signal it, handle race conditions when multiple clients try to start the same Workflow, and write complex coordination logic. + +## Solution + +Temporal's Signal with Start API atomically starts a Workflow (if not running) and delivers a Signal in a single operation. +The client does not need to know whether the Workflow exists — the platform handles it automatically. + +```mermaid +sequenceDiagram + participant C as Client + participant T as Temporal + participant W as Workflow + + C->>T: SignalWithStart(workflowId, signal, args) + T->>T: Check if workflow exists + alt Workflow does not exist + T->>W: Start Workflow + activate W + T->>W: Deliver Signal + W->>W: Process Signal + end + alt Workflow already running + T->>W: Deliver Signal only (no start) + W->>W: Process Signal + end + deactivate W +``` + +The following describes each step in the diagram: + +1. The client calls SignalWithStart with a Workflow ID, Signal name, and arguments. +2. 
Temporal checks whether a Workflow with that ID is already running. +3. If the Workflow does not exist, Temporal starts it and then delivers the Signal. +4. If the Workflow is already running, Temporal delivers the Signal without starting a new instance. + +## Implementation + +### Basic Signal with Start + +The following examples show a shopping cart entity Workflow that is created lazily when the first item is added. +Each SDK uses its own API to atomically start the Workflow (if needed) and deliver the Signal. + + + + +```python +# client.py +from temporalio.client import Client +from workflows import ShoppingCartWorkflow, AddItemSignal + +async def add_item(client: Client, cart_id: str, item_id: str, product_id: str, quantity: int) -> None: + # Atomically start workflow (if needed) and deliver signal + await client.start_workflow( + ShoppingCartWorkflow.run, + id=f"cart-{cart_id}", + task_queue="carts", + start_signal="add_item", + start_signal_args=[AddItemSignal(item_id=item_id, product_id=product_id, quantity=quantity)], + ) + +# workflows.py +from dataclasses import dataclass +from temporalio import workflow + +@dataclass +class AddItemSignal: + item_id: str + product_id: str + quantity: int + +@dataclass +class CartItem: + product_id: str + quantity: int + +@workflow.defn +class ShoppingCartWorkflow: + def __init__(self) -> None: + self.processed_items: set[str] = set() + self.items: list[CartItem] = [] + + @workflow.run + async def run(self) -> None: + await workflow.wait_condition(lambda: False) # Run forever (entity workflow) + + @workflow.signal + def add_item(self, sig: AddItemSignal) -> None: + if sig.item_id in self.processed_items: + return # Idempotency: ignore duplicate signals + self.processed_items.add(sig.item_id) + self.items.append(CartItem(product_id=sig.product_id, quantity=sig.quantity)) +``` + + + + +```go +// client.go +func AddItem(ctx context.Context, cartID, itemID, productID string, quantity int) error { + opts := 
client.StartWorkflowOptions{ + ID: "cart-" + cartID, + TaskQueue: "carts", + } + + // Atomically start workflow (if needed) and deliver signal + sig := AddItemSignal{ItemID: itemID, ProductID: productID, Quantity: quantity} + _, err := c.SignalWithStartWorkflow(ctx, "cart-"+cartID, "addItem", sig, opts, ShoppingCartWorkflow) + return err +} + +// workflow.go +func ShoppingCartWorkflow(ctx workflow.Context) error { + processedItems := make(map[string]bool) + var items []CartItem + + addItemCh := workflow.GetSignalChannel(ctx, "addItem") + workflow.Go(ctx, func(ctx workflow.Context) { + for { + var sig AddItemSignal + addItemCh.Receive(ctx, &sig) + if processedItems[sig.ItemID] { + continue // Idempotency: ignore duplicate signals + } + processedItems[sig.ItemID] = true + items = append(items, CartItem{ProductID: sig.ProductID, Quantity: sig.Quantity}) + } + }) + + workflow.Await(ctx, func() bool { return false }) // Run forever (entity workflow) + return nil +} +``` + + + + +```java +// ShoppingCartManager.java +public class ShoppingCartManager { + public void addItem(String cartId, String itemId, String productId, int quantity) { + WorkflowOptions options = WorkflowOptions.newBuilder() + .setWorkflowId("cart-" + cartId) + .setTaskQueue("carts") + .build(); + + ShoppingCartWorkflow workflow = + workflowClient.newWorkflowStub(ShoppingCartWorkflow.class, options); + + // Atomically start workflow (if needed) and deliver signal + BatchRequest request = workflowClient.newSignalWithStartRequest(); + request.add(workflow::run); + request.add(workflow::addItem, itemId, productId, quantity); + workflowClient.signalWithStart(request); + } +} + +// ShoppingCartWorkflow.java +@WorkflowInterface +public interface ShoppingCartWorkflow { + @WorkflowMethod + void run(); + + @SignalMethod + void addItem(String itemId, String productId, int quantity); +} + +public class ShoppingCartWorkflowImpl implements ShoppingCartWorkflow { + private Set<String> processedItems = new HashSet<>(); + 
private List<CartItem> items = new ArrayList<>(); + + @Override + public void run() { + Workflow.await(() -> false); // Run forever (entity workflow) + } + + @Override + public void addItem(String itemId, String productId, int quantity) { + if (!processedItems.add(itemId)) { + return; // Idempotency: ignore duplicate signals + } + items.add(new CartItem(productId, quantity)); + } +} +``` + + + + +```typescript +// client.ts +export async function addItem( + cartId: string, + itemId: string, + productId: string, + quantity: number +) { + // Atomically start workflow (if needed) and deliver signal + const handle = await client.workflow.signalWithStart(shoppingCartWorkflow, { + workflowId: `cart-${cartId}`, + taskQueue: 'carts', + signal: 'addItem', + signalArgs: [itemId, productId, quantity], + }); +} + +// workflow.ts +export async function shoppingCartWorkflow(): Promise<void> { + const processedItems = new Set<string>(); + const items: CartItem[] = []; + + setHandler(addItemSignal, (itemId: string, productId: string, quantity: number) => { + if (processedItems.has(itemId)) { + return; // Idempotency: ignore duplicate signals + } + processedItems.add(itemId); + items.push({ productId, quantity }); + }); + + await condition(() => false); // Run forever (entity workflow) +} + +export const addItemSignal = defineSignal<[string, string, number]>('addItem'); +``` + + + + +In all SDKs, the Workflow ID is derived from the business entity (the cart ID), ensuring one Workflow per entity. +The Signal handler checks a set of processed item IDs to prevent duplicate processing. +The Workflow blocks indefinitely, acting as a long-lived entity that receives operations over its lifetime. 
+ +## When to use + +The Signal with Start pattern is a good fit for entity Workflows (accounts, shopping carts, user sessions, clusters), event-driven architectures (Kafka consumers, message queue processors), Workflows that receive multiple operations over their lifetime, lazy entity creation where you only create when the first operation arrives, and fire-and-forget operations where immediate response is not needed. + +It is not a good fit for one-time operations (use REJECT_DUPLICATE policy instead), request-response patterns requiring synchronous confirmation (use Update with Start), or operations that need immediate return values. + +## Benefits and trade-offs + +Signal with Start provides an atomic operation — start and Signal happen atomically with no race conditions. +Workflows only exist when needed (lazy creation). +The client does not need to check if the Workflow exists. +The operation is safe to retry because duplicate starts are handled by the Workflow ID. +The pattern is a natural fit for long-lived business entities. + +The trade-offs to consider are that Signals are fire-and-forget with no immediate confirmation that the Signal was processed. +You still need to track processed operation IDs in the Workflow for Signal idempotency. +Workflows must handle unbounded execution (use Continue-As-New). +Signals do not return values — use Queries or Updates for that. + +The Workflow ID Reuse Policy controls whether Signal with Start may start a new execution after a previous one with the same ID has closed. ALLOW_DUPLICATE and ALLOW_DUPLICATE_FAILED_ONLY both work well with Signal with Start: + +- **ALLOW_DUPLICATE** (default): Allows a new Workflow Execution with the same ID after the previous one has closed (completed, failed, timed out, terminated, or cancelled). Does not affect a currently running Workflow — Signal with Start delivers the Signal to the running execution. +- **ALLOW_DUPLICATE_FAILED_ONLY**: Allows a new Workflow Execution with the same ID only if the previous one closed without completing successfully (failed, timed out, terminated, or cancelled) — prevents re-running an entity whose Workflow already completed successfully. Like ALLOW_DUPLICATE, it does not affect a currently running Workflow. 
+- **REJECT_DUPLICATE**: Prevents any duplicate starts — useful for one-time operations, not entity Workflows. +- **TERMINATE_IF_RUNNING**: Terminates the running Workflow and starts a new one — use with caution. + +## Comparison with alternatives + +| Approach | Use case | Response type | Idempotency | +| :--- | :--- | :--- | :--- | +| Signal with Start | Entity Workflows | Fire-and-forget | Signal-level | +| Update with Start | Request-response | Sync return value | Update-level | +| REJECT_DUPLICATE | One-time operations | Async (Workflow ID) | Workflow-level | + +## Best practices + +- **Derive Workflow ID from entity.** Use stable business identifiers (account ID, user ID). +- **Implement Signal idempotency.** Track processed operation IDs to prevent duplicates. +- **Use WorkflowInit.** Initialize state before Signals are delivered (Java, .NET, and Python's `__init__`). +- **Handle unbounded execution.** Use Continue-As-New for long-running entity Workflows. +- **Choose the right Workflow ID policy.** Use ALLOW_DUPLICATE_FAILED_ONLY for entity Workflows. +- **Include operation IDs.** Every Signal should include a unique operation or reference ID. +- **Return early.** Check for duplicates at the start of Signal handlers. + +## Common pitfalls + +- **Not implementing Signal idempotency.** Signals can be delivered more than once (for example, client retries). Without tracking processed operation IDs, the Workflow processes duplicates. +- **Unbounded history growth.** Entity Workflows that receive many Signals without calling Continue-As-New will hit the 50K event or 10K Signal limit. Use `isContinueAsNewSuggested()` to trigger Continue-As-New. +- **Losing pending Signals on Continue-As-New.** Drain all pending Signals before calling Continue-As-New, and pass unprocessed ones as input to the new execution. +- **Expecting a return value from Signals.** Signals are fire-and-forget. If you need a synchronous response, use Updates or Update-with-Start instead. 
+- **Race between SignalWithStart and Continue-As-New.** Temporal prevents this race — if a Signal arrives while the Workflow is completing via Continue-As-New, the Workflow rewinds to process the Signal first. + +## Related patterns + +- **[Entity Workflow](/design-patterns/entity-workflow)**: Long-running Workflows representing business entities. +- **[Continue-As-New](/design-patterns/continue-as-new)**: Managing unbounded Workflow history. +- **[Request-Response via Updates](/design-patterns/request-response-via-updates)**: When you need synchronous responses instead of fire-and-forget. +- **[Early Return](/design-patterns/early-return)**: Update-with-Start for request-response with lazy initialization. + +## Sample code + +**Python** +- [Hello Signal](https://github.com/temporalio/samples-python/tree/main/hello/hello_signal.py) — Basic Signal handling in a Workflow. +- [Message Passing](https://github.com/temporalio/samples-python/tree/main/message_passing/introduction) — Introduction to message passing with Signals, Queries, and Updates. + +**Java** +- [Hello Signal](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/hello/HelloSignal.java) — Basic Signal handling in a Workflow. +- [Safe Message Passing](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/safemessagepassing) — Concurrent Signal handling with validation. + +**TypeScript** +- [Signals and Queries](https://github.com/temporalio/samples-typescript/tree/main/signals-queries) — Signal and Query usage in a Workflow. + +**Go** +- [Await Signals](https://github.com/temporalio/samples-go/tree/main/await-signals) — Waiting for Signals with timeout. 
diff --git a/docs/design-patterns/sliding-window.mdx b/docs/design-patterns/sliding-window.mdx new file mode 100644 index 0000000000..3f1a224d75 --- /dev/null +++ b/docs/design-patterns/sliding-window.mdx @@ -0,0 +1,508 @@ +--- +id: sliding-window +title: "Sliding Window" +sidebar_label: "Sliding Window" +description: "Maintains a fixed number of concurrently active Child Workflows, starting a new one each time an existing one completes." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +:::info[TLDR] +Keep exactly `windowSize` child Workflows running at all times — each completion signal triggers the next record to start immediately. Use this when your record set is arbitrarily large, you need **bounded concurrency** to protect downstream systems, and you want higher throughput than a sequential Batch Iterator provides. +::: + +## Overview + +The Sliding Window pattern maintains a fixed-size pool of concurrently running child Workflows. As each child completes it signals the parent, which immediately starts a replacement — keeping the concurrency level constant and progressing at the rate of the fastest processor. Continue-as-New prevents the parent's history from growing without bound. + +## Problem + +The [Batch Iterator](/design-patterns/batch-iterator) processes records sequentially — the overall throughput is limited by the slowest record in each page. The [Fan-Out](/design-patterns/fanout-child-workflows) pattern starts all children at once, which can overwhelm downstream systems when the record set is large. + +You need a way to process an arbitrarily large record set with bounded concurrency, maximum throughput within that bound, and protection against history bloat. + +## Solution + +The parent Workflow starts exactly `windowSize` child Workflows simultaneously. Each child processes one record and, when finished, signals the parent with a completion notification. 
The parent maintains a count of completed children and starts a new child for the next record as soon as a slot becomes free. + +Continue-as-New is called after the parent has started `windowSize` children. Because child Workflows have stable Workflow IDs and Continue-as-New preserves the parent's Workflow ID, children started by a previous run can still signal the current run. + +```mermaid +flowchart TD + Records["📋 Record IDs\n[r0, r1, r2, ...]"] + Parent["Parent Workflow\n(window size = W)"] + C1["Child r0\n✅ done"] + C2["Child r1\n⏳ running"] + C3["Child r2\n⏳ running"] + C4["Child r3\n🆕 started"] + CAN["continueAsNew\n(startIndex + W)"] + + Records --> Parent + Parent -->|"start W children"| C1 + Parent --> C2 + Parent --> C3 + + C1 -->|"Signal: complete"| Parent + Parent -->|"slot free → start next"| C4 + + Parent -->|"after W children started"| CAN +``` + +The following describes each step in the diagram: + +1. The parent Workflow starts with a list of record IDs and a configured `windowSize`. +2. It starts the first `windowSize` children concurrently, one per record, each receiving the parent's Workflow ID so they know where to signal. +3. As each child completes, it sends a completion signal to the parent. +4. The parent receives the signal, increments its completion counter, and starts the next child (the next record in the list). +5. After starting `windowSize` children in total, the parent calls `continueAsNew` with the updated start index. The window slides forward without gaps because the parent's Workflow ID is preserved across runs. +6. Children from previous runs that have not yet signalled will find the new run when they send the signal, because the parent Workflow ID remains the same. + +## Implementation + + +The following examples show how each SDK implements the Sliding Window pattern. 
+ + + +```typescript +// workflows.ts +import { + ApplicationFailure, + ParentClosePolicy, + condition, + continueAsNew, + defineSignal, + getExternalWorkflowHandle, + proxyActivities, + setHandler, + startChild, + workflowInfo, +} from "@temporalio/workflow"; +import type * as activities from "./activities"; +import { TASK_QUEUE, WINDOW_SIZE } from "./shared"; + +const { processRecord } = proxyActivities<typeof activities>({ + startToCloseTimeout: "30 seconds", +}); + +export const completionSignal = defineSignal<[string]>("recordCompleted"); + +export async function recordProcessorWorkflow( + recordId: string, + parentWorkflowId: string +): Promise<void> { + await processRecord(recordId); + // Ignore NOT_FOUND — the parent's final run may have already completed. + try { + const parent = getExternalWorkflowHandle(parentWorkflowId); + await parent.signal(completionSignal, recordId); + } catch (err) { + if (!(err instanceof ApplicationFailure && err.type === 'NOT_FOUND')) throw err; + } +} + +export async function slidingWindowWorkflow(input: SlidingWindowInput): Promise<number> { + const { + recordIds, + windowSize = WINDOW_SIZE, + startIndex = 0, + inFlight = 0, + } = input; + let totalProcessed = input.totalProcessed ?? 0; + const parentId = workflowInfo().workflowId; + let pendingSignals = 0; + let nextIndex = startIndex; + let dispatched = 0; + let active = inFlight; + + // Signal handler: each completion frees a slot and increments the total. + setHandler(completionSignal, (_recordId: string) => { + pendingSignals++; + totalProcessed++; + }); + + // Only start (windowSize - inFlight) new children. Carried-over in-flight + // children from the previous run will signal us when they complete. 
+ const newFill = Math.min(windowSize - inFlight, recordIds.length - startIndex); + for (let i = 0; i < newFill; i++) { + await startChild(recordProcessorWorkflow, { + args: [recordIds[nextIndex], parentId], + workflowId: `${parentId}/record-${recordIds[nextIndex]}`, + taskQueue: TASK_QUEUE, + parentClosePolicy: ParentClosePolicy.ABANDON, + }); + nextIndex++; + dispatched++; + active++; + } + + // If the initial fill alone started windowSize children, continue-as-new immediately. + // Completions already counted in pendingSignals will not signal again, so carry + // over only the children that are still running (active - pendingSignals). + if (dispatched >= windowSize) { + await continueAsNew({ recordIds, windowSize, startIndex: nextIndex, totalProcessed, inFlight: active - pendingSignals }); + return; + } + + // Slide the window: as each slot frees, start the next child. + while (nextIndex < recordIds.length) { + await condition(() => pendingSignals > 0); + pendingSignals--; + active--; + await startChild(recordProcessorWorkflow, { + args: [recordIds[nextIndex], parentId], + workflowId: `${parentId}/record-${recordIds[nextIndex]}`, + taskQueue: TASK_QUEUE, + parentClosePolicy: ParentClosePolicy.ABANDON, + }); + nextIndex++; + dispatched++; + active++; + + // Continue-as-New after starting windowSize children to keep history short. + // Pass nextIndex (next unstarted record) and the number of children still running. + // Signals that arrived but were not yet consumed (pendingSignals) are already in + // totalProcessed; subtract them so the next run does not wait for them again. + if (dispatched >= windowSize) { + await continueAsNew({ + recordIds, + windowSize, + startIndex: nextIndex, + totalProcessed, + inFlight: active - pendingSignals, + }); + return; + } + } + + // Wait for all remaining in-flight children to complete. 
+ await condition(() => pendingSignals >= active); + return totalProcessed; +} +``` + + + + +```python +# workflows.py +import asyncio +from datetime import timedelta +from temporalio import workflow +from temporalio.exceptions import ApplicationError +from temporalio.workflow import ParentClosePolicy, continue_as_new +from activities import process_record +from shared import TASK_QUEUE, WINDOW_SIZE + +COMPLETION_SIGNAL = "recordCompleted" + + +@workflow.defn +class RecordProcessorWorkflow: + @workflow.run + async def run(self, record_id: str, parent_workflow_id: str) -> None: + await workflow.execute_activity( + process_record, + record_id, + start_to_close_timeout=timedelta(seconds=30), + ) + # Ignore NOT_FOUND — the parent's final run may have already completed. + try: + handle = workflow.get_external_workflow_handle(parent_workflow_id) + await handle.signal(COMPLETION_SIGNAL, record_id) + except ApplicationError as e: + if "not found" not in str(e).lower(): + raise + + +@workflow.defn +class SlidingWindowWorkflow: + def __init__(self) -> None: + self._pending_signals = 0 + self._total_processed = 0 + + @workflow.signal(name=COMPLETION_SIGNAL) + def record_completed(self, record_id: str) -> None: + self._pending_signals += 1 + self._total_processed += 1 + + @workflow.run + async def run(self, input: SlidingWindowInput) -> int: + self._total_processed += input.total_processed + record_ids = input.record_ids + window_size = input.window_size + start_index = input.start_index + in_flight = input.in_flight + parent_id = workflow.info().workflow_id + next_index = start_index + dispatched = 0 + active = in_flight + + # Only start (window_size - in_flight) new children. Carried-over in-flight + # children from the previous run will signal us when they complete. 
+ new_fill = min(window_size - in_flight, len(record_ids) - start_index) + for _ in range(new_fill): + await workflow.start_child_workflow( + RecordProcessorWorkflow.run, + args=[record_ids[next_index], parent_id], + id=f"{parent_id}/record-{record_ids[next_index]}", + task_queue=TASK_QUEUE, + parent_close_policy=ParentClosePolicy.ABANDON, + ) + next_index += 1 + dispatched += 1 + active += 1 + + # Slide the window. + while next_index < len(record_ids): + await workflow.wait_condition(lambda: self._pending_signals > 0) + self._pending_signals -= 1 + active -= 1 + await workflow.start_child_workflow( + RecordProcessorWorkflow.run, + args=[record_ids[next_index], parent_id], + id=f"{parent_id}/record-{record_ids[next_index]}", + task_queue=TASK_QUEUE, + parent_close_policy=ParentClosePolicy.ABANDON, + ) + next_index += 1 + dispatched += 1 + active += 1 + + # Pass next_index (next unstarted record) and in_flight=window_size (window is full). + if dispatched >= window_size: + continue_as_new(args=[SlidingWindowInput( + record_ids=record_ids, + window_size=window_size, + start_index=next_index, + total_processed=self._total_processed, + in_flight=window_size, + )]) + + # Wait for all remaining in-flight children to complete. + await workflow.wait_condition(lambda: self._pending_signals >= active) + return self._total_processed +``` + + + + +```go +// workflows.go +package main + +import ( + "strings" + "time" + + enums "go.temporal.io/api/enums/v1" + "go.temporal.io/sdk/workflow" +) + +const CompletionSignal = "recordCompleted" + +func RecordProcessorWorkflow(ctx workflow.Context, recordID string, parentWorkflowID string) error { + ao := workflow.ActivityOptions{StartToCloseTimeout: 30 * time.Second} + ctx = workflow.WithActivityOptions(ctx, ao) + + if err := workflow.ExecuteActivity(ctx, ProcessRecord, recordID).Get(ctx, nil); err != nil { + return err + } + + // Ignore not-found — the parent's final run may have already completed. 
+ err := workflow.SignalExternalWorkflow(ctx, parentWorkflowID, "", CompletionSignal, recordID).Get(ctx, nil) + if err != nil && strings.Contains(err.Error(), "not found") { + return nil + } + return err +} + +func SlidingWindowWorkflow(ctx workflow.Context, input SlidingWindowInput) (int, error) { + windowSize := input.WindowSize + if windowSize <= 0 { + windowSize = WindowSize + } + recordIDs := input.RecordIDs + parentID := workflow.GetInfo(ctx).WorkflowExecution.ID + + completedCh := workflow.GetSignalChannel(ctx, CompletionSignal) + nextIndex := input.StartIndex + totalProcessed := input.TotalProcessed + dispatched := 0 + active := input.InFlight + + startChild := func(recordID string) error { + cwo := workflow.ChildWorkflowOptions{ + WorkflowID: parentID + "/record-" + recordID, + TaskQueue: TaskQueue, + ParentClosePolicy: enums.PARENT_CLOSE_POLICY_ABANDON, + } + future := workflow.ExecuteChildWorkflow(workflow.WithChildOptions(ctx, cwo), RecordProcessorWorkflow, recordID, parentID) + return future.GetChildWorkflowExecution().Get(ctx, nil) + } + + // drainCompleted counts completion signals that are already buffered. Signals left + // unreceived in the channel are not carried into the next run, so call this before + // every Continue-as-New and pass the resulting active count as InFlight. + drainCompleted := func() { + var completedID string + for completedCh.ReceiveAsync(&completedID) { + totalProcessed++ + active-- + } + } + + // Only start (windowSize - inFlight) new children. Carried-over in-flight + // children from the previous run will signal us when they complete. + newFill := len(recordIDs) - input.StartIndex + if newFill > windowSize-input.InFlight { + newFill = windowSize - input.InFlight + } + for i := 0; i < newFill; i++ { + if err := startChild(recordIDs[nextIndex]); err != nil { + return totalProcessed, err + } + nextIndex++ + dispatched++ + active++ + } + + // If the initial fill alone started windowSize children, continue-as-new immediately. + if dispatched >= windowSize { + drainCompleted() + return 0, workflow.NewContinueAsNewError(ctx, SlidingWindowWorkflow, SlidingWindowInput{ + RecordIDs: recordIDs, + WindowSize: windowSize, + StartIndex: nextIndex, + TotalProcessed: totalProcessed, + InFlight: active, + }) + } + + // Slide the window. 
+ for nextIndex < len(recordIDs) { + completedCh.Receive(ctx, nil) + totalProcessed++ + active-- + if err := startChild(recordIDs[nextIndex]); err != nil { + return totalProcessed, err + } + nextIndex++ + dispatched++ + active++ + + // Pass nextIndex (next unstarted record) and InFlight=active, the number of + // children still running once already-delivered completions are drained. + if dispatched >= windowSize { + drainCompleted() + return 0, workflow.NewContinueAsNewError(ctx, SlidingWindowWorkflow, SlidingWindowInput{ + RecordIDs: recordIDs, + WindowSize: windowSize, + StartIndex: nextIndex, + TotalProcessed: totalProcessed, + InFlight: active, + }) + } + } + + // Drain all remaining in-flight children. + for active > 0 { + completedCh.Receive(ctx, nil) + totalProcessed++ + active-- + } + return totalProcessed, nil +} +``` + + + + +```java +// SlidingWindowWorkflow.java +import io.temporal.workflow.*; +import java.util.List; + +@WorkflowInterface +public interface SlidingWindowWorkflow { + @WorkflowMethod + int run(Shared.SlidingWindowInput input); + + @SignalMethod + void recordCompleted(String recordId); +} + +// SlidingWindowWorkflowImpl.java +public class SlidingWindowWorkflowImpl implements SlidingWindowWorkflow { + private int pendingSignals = 0; + private int totalProcessed = 0; + + @Override + public void recordCompleted(String recordId) { + pendingSignals++; + totalProcessed++; + } + + @Override + public int run(Shared.SlidingWindowInput input) { + this.totalProcessed = input.totalProcessed; + int windowSize = input.windowSize > 0 ? input.windowSize : Shared.WINDOW_SIZE; + List recordIds = input.recordIds; + String parentId = Workflow.getInfo().getWorkflowId(); + int nextIndex = input.startIndex; + int dispatched = 0; + int active = input.inFlight; + + // Only start (windowSize - inFlight) new children. Carried-over in-flight + // children from the previous run will signal us when they complete. 
+ int newFill = Math.min(windowSize - input.inFlight, recordIds.size() - input.startIndex); + for (int i = 0; i < newFill; i++) { + startChild(recordIds.get(nextIndex), parentId); + nextIndex++; + dispatched++; + active++; + } + + // Slide the window. + while (nextIndex < recordIds.size()) { + Workflow.await(() -> pendingSignals > 0); + pendingSignals--; + active--; + startChild(recordIds.get(nextIndex), parentId); + nextIndex++; + dispatched++; + active++; + + // Pass nextIndex (next unstarted record) and the number of children still running. + // Signals received but not yet consumed are already in totalProcessed; subtract + // them so the next run does not wait for completions that will never signal again. + if (dispatched >= windowSize) { + Workflow.newContinueAsNewStub(SlidingWindowWorkflow.class) + .run(new Shared.SlidingWindowInput(recordIds, windowSize, nextIndex, this.totalProcessed, active - pendingSignals)); + } + } + + // Drain all remaining in-flight children. + final int remainingActive = active; + Workflow.await(() -> pendingSignals >= remainingActive); + return this.totalProcessed; + } + + private void startChild(String recordId, String parentId) { + ChildWorkflowOptions opts = ChildWorkflowOptions.newBuilder() + .setWorkflowId(parentId + "/record-" + recordId) + .setTaskQueue(Shared.TASK_QUEUE) + .setParentClosePolicy(ParentClosePolicy.PARENT_CLOSE_POLICY_ABANDON) + .build(); + RecordProcessorWorkflow child = Workflow.newChildWorkflowStub(RecordProcessorWorkflow.class, opts); + Async.procedure(child::run, recordId, parentId); + // Block until the child has actually started; otherwise a Continue-as-New or + // completion of the parent can close this run before the child is created. + Workflow.getWorkflowExecution(child).get(); + } +} +``` + + + + +## Best Practices + +- **Preserve the parent Workflow ID across Continue-as-New.** The parent's Workflow ID is stable across `continueAsNew` runs — do not generate a new one. Children use `signalExternalWorkflow` with that ID, so they always reach the current run. +- **Use `PARENT_CLOSE_POLICY_ABANDON` on child Workflows.** This lets children that were started by a previous run complete normally even after the parent has continued as new. +- **Size the window conservatively at first.** Each in-flight child counts toward the 2,000 unfinished-actions limit for the parent. 
A window of 50–200 is a reasonable starting point depending on child duration and downstream capacity. +- **Pass only IDs (not full records) to child Workflows.** Workflow inputs are stored in event history. Keep them small. +- **Carry minimal state into `continueAsNew`.** Only pass `windowSize`, `startIndex`, the `totalProcessed` and `inFlight` counters, and the record ID list (or a reference to it). Do not accumulate per-record results in the parent — collect them out-of-band if needed. + +## Common Pitfalls + +- **Losing signals across Continue-as-New.** If a child signals before the parent's new run has registered the signal handler, the signal is buffered and delivered once the handler is registered — Temporal buffers signals for existing Workflow IDs. However, ensure the signal handler is registered unconditionally and before the first await. +- **Race between CAN and remaining signal draining.** After `continueAsNew`, the new run must handle signals from children started by the previous run. Pass `nextIndex` (the next *unstarted* record) and `inFlight` (the number of carried-over children that have not yet signalled; this equals `windowSize` when the window is full) to the new run so it knows how many carried-over children to expect signals from, without re-starting them. +- **Thundering herd on startup.** Starting hundreds of children simultaneously causes a burst of Workflow Tasks and Activity Tasks. Ramp up the window gradually or use the [Batch Iterator](/design-patterns/batch-iterator) if rate limiting is more important than throughput. 
+ +## Related Resources + +- [Continue-as-New pattern](/design-patterns/continue-as-new) — history management fundamentals +- [Batch Iterator](/design-patterns/batch-iterator) — simpler alternative when sequential processing is acceptable +- [MapReduce Tree](/design-patterns/mapreduce-tree) — fully parallel alternative when rate limiting is not needed +- [Temporal limits reference](https://docs.temporal.io/cloud/limits) +- [Sliding window sample (Java)](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/batch/slidingwindow) diff --git a/docs/design-patterns/task-orchestration-patterns.mdx b/docs/design-patterns/task-orchestration-patterns.mdx new file mode 100644 index 0000000000..44510d5cc5 --- /dev/null +++ b/docs/design-patterns/task-orchestration-patterns.mdx @@ -0,0 +1,31 @@ +--- +id: task-orchestration-patterns +title: Task orchestration patterns +sidebar_label: Overview +description: Patterns for composing and coordinating multiple units of work within a Workflow. +--- + +import PatternCards from '@site/src/components/PatternCards'; + +Patterns for composing and coordinating multiple units of work within a Workflow. + + diff --git a/docs/design-patterns/updatable-timer.mdx b/docs/design-patterns/updatable-timer.mdx new file mode 100644 index 0000000000..7ed77e2a46 --- /dev/null +++ b/docs/design-patterns/updatable-timer.mdx @@ -0,0 +1,612 @@ +--- +id: updatable-timer +title: "Updatable / Debounced Timer Pattern" +sidebar_label: "Updatable Timer" +description: "Dynamically adjustable timers that respond to Signals or Updates. Extend, shorten, or cancel timers based on external events." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +The Updatable / Debounced Timer pattern implements a sleep operation that can be interrupted and dynamically adjusted via Signals. 
+It enables Workflows to wait for deadlines that can be extended or shortened based on external events, making it suitable for approval processes, SLA management, and time-sensitive business operations. + +## Problem + +In business processes, you often need Workflows that wait for a deadline (approval timeout, SLA expiration, grace period), allow the deadline to be extended or shortened dynamically, react immediately when the deadline changes, and continue waiting with the new deadline without restarting. + +Without an updatable timer, you must use fixed timeouts that cannot be adjusted, cancel and restart Workflows to change deadlines, poll frequently to check for deadline changes, or implement complex state machines to handle timing updates. + +## Solution + +The Updatable / Debounced Timer uses a blocking wait with both a time limit and an update condition. +When a Signal updates the wake-up time, the condition becomes true, the Workflow recalculates the sleep duration, and blocks again with the new deadline. + +Each SDK provides a different mechanism for this: +- **Java**: `Workflow.await(Duration, condition)` returns `false` when the duration expires, or `true` when the condition is met. +- **TypeScript**: `wf.condition(fn, timeout)` returns `false` when the timeout expires, or `true` when the function returns `true`. +- **Python**: `workflow.wait_condition(fn, timeout=duration)` returns normally when the condition is met, or raises `asyncio.TimeoutError` on timeout. +- **Go**: `workflow.NewTimer()` combined with `workflow.NewSelector()` to race a timer against a Signal channel. + +```mermaid +sequenceDiagram + participant Client + participant Workflow + participant Timer + + Client->>Workflow: Start with deadline + activate Workflow + Workflow->>Timer: sleepUntil(deadline) + activate Timer + Note over Timer: Waiting... 
+ + Client->>Workflow: Signal: extendDeadline(newTime) + Workflow->>Timer: Update wakeUpTime + Timer->>Timer: Recalculate duration + Note over Timer: Waiting with new deadline... + + Timer-->>Workflow: Timer expired + deactivate Timer + Workflow-->>Client: Complete + deactivate Workflow +``` + +The following describes each step in the diagram: + +1. The client starts the Workflow with an initial deadline. +2. The Workflow calls `sleepUntil(deadline)`, which blocks until the deadline. +3. The client sends a Signal to extend the deadline. +4. The timer recalculates the remaining duration based on the new deadline and continues waiting. +5. When the timer expires, the Workflow completes. + +The core of the pattern is a reusable timer helper that loops on a blocking wait, recalculating the sleep duration each time the wake-up time is updated: + + + + +```python +# updatable_timer.py +import asyncio +from datetime import timedelta +from temporalio import workflow + + +class UpdatableTimer: + def __init__(self, wake_up_time: float) -> None: + self._wake_up_time = wake_up_time + self._wake_up_time_updated = False + + async def sleep_until(self, wake_up_time: float) -> None: + self._wake_up_time = wake_up_time + while True: + self._wake_up_time_updated = False + sleep_secs = self._wake_up_time - workflow.time() + try: + await workflow.wait_condition( + lambda: self._wake_up_time_updated, + timeout=timedelta(seconds=max(sleep_secs, 0)), + ) + # Condition met: wake-up time was updated, loop to recalculate + except asyncio.TimeoutError: + break # Timer expired + + def update_wake_up_time(self, wake_up_time: float) -> None: + self._wake_up_time = wake_up_time + self._wake_up_time_updated = True # Unblocks wait_condition + + @property + def wake_up_time(self) -> float: + return self._wake_up_time +``` + + + + +```go +// updatable_timer.go +func sleepUntil(ctx workflow.Context, wakeUpTime time.Time, wakeUpChannel workflow.ReceiveChannel) error { + for { + timerCtx, cancelTimer 
:= workflow.WithCancel(ctx) + duration := wakeUpTime.Sub(workflow.Now(ctx)) + if duration <= 0 { + cancelTimer() + break + } + timer := workflow.NewTimer(timerCtx, duration) + + selector := workflow.NewSelector(ctx) + timerFired := false + + selector.AddFuture(timer, func(f workflow.Future) { + timerFired = true + }) + + selector.AddReceive(wakeUpChannel, func(c workflow.ReceiveChannel, more bool) { + c.Receive(ctx, &wakeUpTime) + // Cancel the current timer so it can be recreated with the new deadline + cancelTimer() + }) + + selector.Select(ctx) + + if timerFired { + break // Timer expired + } + // Signal received with new wakeUpTime, loop to recalculate + } + return nil +} +``` + + + + +```java +// UpdatableTimer.java +public class UpdatableTimer { + private long wakeUpTime; + private boolean wakeUpTimeUpdated; + + public void sleepUntil(long wakeUpTime) { + this.wakeUpTime = wakeUpTime; + while (true) { + wakeUpTimeUpdated = false; + Duration sleepInterval = Duration.ofMillis(this.wakeUpTime - Workflow.currentTimeMillis()); + if (!Workflow.await(sleepInterval, () -> wakeUpTimeUpdated)) { + break; // Timer expired + } + // Timer was updated, loop to recalculate + } + } + + public void updateWakeUpTime(long wakeUpTime) { + this.wakeUpTime = wakeUpTime; + this.wakeUpTimeUpdated = true; // Unblocks await + } +} +``` + + + + +```typescript +// updatable-timer.ts +import * as wf from '@temporalio/workflow'; + +export class UpdatableTimer implements PromiseLike { + deadlineUpdated = false; + #deadline: number; + + constructor(deadline: number) { + this.#deadline = deadline; + } + + private async run(): Promise { + while (true) { + this.deadlineUpdated = false; + if ( + !(await wf.condition( + () => this.deadlineUpdated, + this.#deadline - Date.now(), + )) + ) { + break; // Timer expired + } + // Timer was updated, loop to recalculate + } + } + + then( + onfulfilled?: (value: void) => TResult1 | PromiseLike, + onrejected?: (reason: any) => TResult2 | PromiseLike, + ): 
PromiseLike { + return this.run().then(onfulfilled, onrejected); + } + + set deadline(value: number) { + this.#deadline = value; + this.deadlineUpdated = true; + } + + get deadline(): number { + return this.#deadline; + } +} +``` + + + + +In Python, Java, and TypeScript, the timer helper calculates the sleep interval and calls a blocking wait with both a duration and a condition. +If the duration expires first, the wait returns `false` (Java/TypeScript) or raises `asyncio.TimeoutError` (Python), and the timer completes. +If the update flag is set via a Signal, the condition becomes true, the wait unblocks, and the loop recalculates the interval with the new deadline. +In Go, a `Selector` races a `Timer` against a Signal channel; when the Signal arrives, the current timer is cancelled and a new one is created with the updated deadline. + +## Implementation + +### Basic approval Workflow + +The following implementation combines a deadline-bounded wait with an approval flag. +The Workflow waits for either an approval Signal or the deadline to expire: + + + + +```python +# workflows.py +import asyncio +from datetime import timedelta +from temporalio import workflow + + +@workflow.defn +class ApprovalWorkflow: + def __init__(self) -> None: + self._approved = False + self._status = "PENDING" + + @workflow.run + async def run(self, approval_deadline: float) -> None: + timeout_secs = approval_deadline - workflow.time() + try: + await workflow.wait_condition( + lambda: self._approved, + timeout=timedelta(seconds=max(timeout_secs, 0)), + ) + self._status = "APPROVED" + except asyncio.TimeoutError: + self._status = "REJECTED" + + @workflow.signal + def approve(self) -> None: + self._approved = True + + @workflow.query + def get_status(self) -> str: + return self._status +``` + + + + +```go +// workflow.go +func ApprovalWorkflow(ctx workflow.Context, approvalDeadline time.Time) (string, error) { + logger := workflow.GetLogger(ctx) + status := "PENDING" + approved := false + + // 
Listen for the approve signal in a goroutine + workflow.Go(ctx, func(ctx workflow.Context) { + ch := workflow.GetSignalChannel(ctx, "approve") + ch.Receive(ctx, nil) + approved = true + }) + + // Wait for approval or timeout + duration := approvalDeadline.Sub(workflow.Now(ctx)) + ok, _ := workflow.AwaitWithTimeout(ctx, duration, func() bool { + return approved + }) + + if ok { + status = "APPROVED" + } else { + status = "REJECTED" + } + + logger.Info("Approval workflow completed", "status", status) + return status, nil +} +``` + + + + +```java +// ApprovalWorkflowImpl.java +@WorkflowInterface +public interface ApprovalWorkflow { + @WorkflowMethod + void execute(long approvalDeadline); + + @SignalMethod + void extendDeadline(long newDeadline); + + @SignalMethod + void approve(); + + @QueryMethod + String getStatus(); +} + +public class ApprovalWorkflowImpl implements ApprovalWorkflow { + private UpdatableTimer timer = new UpdatableTimer(); + private boolean approved = false; + private String status = "PENDING"; + + @Override + public void execute(long approvalDeadline) { + Workflow.await( + Duration.ofMillis(approvalDeadline - Workflow.currentTimeMillis()), + () -> approved); + + if (approved) { + status = "APPROVED"; + } else { + status = "REJECTED"; + } + } + + @Override + public void extendDeadline(long newDeadline) { + timer.updateWakeUpTime(newDeadline); + } + + @Override + public void approve() { + approved = true; + } + + @Override + public String getStatus() { + return status; + } +} +``` + + + + +```typescript +// workflows.ts +import * as wf from '@temporalio/workflow'; + +export const extendDeadlineSignal = wf.defineSignal<[number]>('extendDeadline'); +export const approveSignal = wf.defineSignal('approve'); +export const getStatusQuery = wf.defineQuery('getStatus'); + +export async function approvalWorkflow(approvalDeadline: number): Promise { + let approved = false; + let status = 'PENDING'; + + wf.setHandler(approveSignal, () => { + approved = true; + 
}); + + wf.setHandler(getStatusQuery, () => status); + + // Wait for approval or deadline expiration + const approvedBeforeDeadline = await wf.condition( + () => approved, + approvalDeadline - Date.now(), + ); + + status = approvedBeforeDeadline ? 'APPROVED' : 'REJECTED'; +} +``` + + + + +The Workflow waits with both a deadline duration and a condition that checks the `approved` flag. +If the `approve` Signal arrives before the deadline, the condition becomes true and the Workflow sets the status to APPROVED. +If the deadline expires first, the Workflow sets the status to REJECTED. + +### Multiple deadline extensions + +The following implementation uses the `UpdatableTimer` directly to support multiple deadline extensions. +The Workflow blocks on the timer helper and checks the approval flag after the timer completes: + + + + +```python +# workflows.py +from temporalio import workflow +from .updatable_timer import UpdatableTimer + + +@workflow.defn +class MultiExtensionApprovalWorkflow: + def __init__(self) -> None: + self._timer = UpdatableTimer(0) + self._approved = False + self._rejected = False + + @workflow.run + async def run(self, initial_deadline: float) -> None: + await self._timer.sleep_until(initial_deadline) + + if not self._approved: + self._rejected = True + + @workflow.signal + def extend_deadline(self, new_deadline: float) -> None: + if not self._approved and not self._rejected: + self._timer.update_wake_up_time(new_deadline) + + @workflow.signal + def approve(self) -> None: + self._approved = True +``` + + + + +```go +// workflow.go +func MultiExtensionApprovalWorkflow(ctx workflow.Context, initialDeadline time.Time) (string, error) { + approved := false + rejected := false + wakeUpTime := initialDeadline + wakeUpChannel := workflow.NewChannel(ctx) + + // Listen for approval signal + workflow.Go(ctx, func(ctx workflow.Context) { + ch := workflow.GetSignalChannel(ctx, "approve") + ch.Receive(ctx, nil) + approved = true + }) + + // Listen for 
deadline extension signals + workflow.Go(ctx, func(ctx workflow.Context) { + ch := workflow.GetSignalChannel(ctx, "extendDeadline") + for { + var newDeadline time.Time + ch.Receive(ctx, &newDeadline) + if !approved && !rejected { + wakeUpChannel.Send(ctx, newDeadline) + } + } + }) + + // Block on the updatable timer + _ = sleepUntil(ctx, wakeUpTime, wakeUpChannel) + + if !approved { + rejected = true + } + + if rejected { + return "REJECTED", nil + } + return "APPROVED", nil +} +``` + + + + +```java +// MultiExtensionApprovalWorkflowImpl.java +public class MultiExtensionApprovalWorkflowImpl implements ApprovalWorkflow { + private UpdatableTimer timer = new UpdatableTimer(); + private boolean approved = false; + private boolean rejected = false; + + @Override + public void execute(long initialDeadline) { + timer.sleepUntil(initialDeadline); + + if (!approved) { + rejected = true; + } + } + + @Override + public void extendDeadline(long newDeadline) { + if (!approved && !rejected) { + timer.updateWakeUpTime(newDeadline); + } + } + + @Override + public void approve() { + approved = true; + } +} +``` + + + + +```typescript +// workflows.ts +import * as wf from '@temporalio/workflow'; +import { UpdatableTimer } from './updatable-timer'; + +export const extendDeadlineSignal = wf.defineSignal<[number]>('extendDeadline'); +export const approveSignal = wf.defineSignal('approve'); + +export async function multiExtensionApprovalWorkflow( + initialDeadline: number, +): Promise { + let approved = false; + let rejected = false; + const timer = new UpdatableTimer(initialDeadline); + + wf.setHandler(extendDeadlineSignal, (newDeadline: number) => { + if (!approved && !rejected) { + timer.deadline = newDeadline; + } + }); + + wf.setHandler(approveSignal, () => { + approved = true; + }); + + await timer; // Blocks until the timer expires + + if (!approved) { + rejected = true; + } +} +``` + + + + +The `extendDeadline` Signal handler checks that the Workflow has not already been 
approved or rejected before updating the timer. +Each update unblocks the timer loop, which recalculates the remaining duration and blocks again. + +## When to use + +The Updatable Timer pattern is a good fit for approval Workflows with deadline extensions, SLA management with grace periods, time-based escalations that can be postponed, auction bidding with extended closing times, and payment grace periods that can be adjusted. + +It is not a good fit for fixed timeouts that never change (use a simple sleep), immediate cancellation (use cancellation scopes), or complex scheduling (use Temporal Schedules). + +## Benefits and trade-offs + +The pattern allows you to adjust deadlines without restarting Workflows. +Changes take effect instantly. +The timer helper is reusable across multiple Workflows. +All timing is based on Workflow time, ensuring replay consistency. +You can Query the current deadline at any time. + +The trade-offs to consider are that the pattern requires an external process to send update Signals. +Each timer instance manages one deadline. +Previous deadlines are not tracked (add tracking if needed). +You must calculate absolute timestamps rather than relative durations. + +## Comparison with alternatives + +| Approach | Dynamic updates | Complexity | Use case | +| :--- | :--- | :--- | :--- | +| Updatable / Debounced Timer | Yes | Medium | Adjustable deadlines | +| Simple sleep | No | Low | Fixed delays | +| Cancellation Scope | Yes (cancel only) | Medium | Abort operations | +| Polling Loop | Yes | High | Frequent checks | + +## Best practices + +- **Use absolute timestamps.** Store wake-up time as an absolute value (epoch millis in Java/TypeScript, epoch seconds in Python, `time.Time` in Go), not relative durations. +- **Validate updates.** Ensure new deadlines are in the future. +- **Add Queries.** Expose the current deadline via Query methods. +- **Handle edge cases.** Check if the timer already expired before updating. 
+- **Consider max extensions.** Limit how many times or how far deadlines can be extended. +- **Log changes.** Log each deadline update for observability. +- **Reuse the timer helper.** Extract to a helper class or function for use across Workflows. +- **Combine with conditions.** Use a blocking wait with both time and business conditions. + +## Common pitfalls + +- **Using time-based conditions without a duration.** A wait without a timeout does not create a timer. The condition is only re-evaluated on state changes (Signals, Activity completions). Always provide a timeout for time-based waits. +- **Expecting the wait to re-evaluate its duration.** The timer duration is set once when the wait is called. Changing the duration variable afterward has no effect. This is why the timer helper loops and recalculates. +- **Not validating new deadlines.** Accepting a deadline in the past causes the timer to expire immediately. Always check that the new deadline is in the future before updating. +- **Accumulating uncancelled timers in Java.** In the Java SDK, `Workflow.await(Duration, condition)` does not automatically cancel its internal timer when the condition is met. Repeated calls in a loop accumulate timers. Wrap in a `CancellationScope` if this is a concern. +- **Not cancelling timers in Go.** In the Go SDK, always cancel the previous timer (via `workflow.WithCancel`) before creating a new one. Uncancelled timers wake up the Workflow unnecessarily, creating extra Worker load. + +## Related patterns + +- **[Signal with Start](/design-patterns/signal-with-start)**: Receiving external events to modify behavior. +- **[Approval Pattern](/design-patterns/approval)**: Approval Workflows with adjustable deadlines. + +## Sample code + +- [Java](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/updatabletimer) -- Complete implementation with starter and updater. 
+- [TypeScript](https://github.com/temporalio/samples-typescript/tree/main/timer-examples) -- Updatable timer with `condition` and `UpdatableTimer` class. +- [Python](https://github.com/temporalio/samples-python/tree/main/updatable_timer) -- Updatable timer with `wait_condition` and helper class. +- [Go](https://github.com/temporalio/samples-go/tree/main/updatabletimer) -- Timer cancellation with Selector, Signal channel, and `WithCancel`. diff --git a/docs/design-patterns/worker-configuration-patterns.mdx b/docs/design-patterns/worker-configuration-patterns.mdx new file mode 100644 index 0000000000..57d731ff72 --- /dev/null +++ b/docs/design-patterns/worker-configuration-patterns.mdx @@ -0,0 +1,25 @@ +--- +id: worker-configuration-patterns +title: Worker configuration patterns +sidebar_label: Overview +description: Patterns for configuring how Workers are set up, how work is routed, and how Activities access external dependencies. +--- + +import PatternCards from '@site/src/components/PatternCards'; + +Patterns for configuring how Workers are set up, how work is routed, and how Activities access external dependencies. + + diff --git a/docs/design-patterns/worker-specific-taskqueue.mdx b/docs/design-patterns/worker-specific-taskqueue.mdx new file mode 100644 index 0000000000..1dd36fee87 --- /dev/null +++ b/docs/design-patterns/worker-specific-taskqueue.mdx @@ -0,0 +1,786 @@ +--- +id: worker-specific-taskqueue +title: "Worker-Specific Task Queues Pattern" +sidebar_label: "Worker-Specific Task Queues" +description: "Routes Activities to specific Workers using unique Task Queues for Worker affinity and host-specific processing." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Overview + +The Worker-Specific Task Queues pattern enables routing Activities to specific Worker hosts when Activities must execute on the same machine. 
+This is essential for Workflows where subsequent Activities depend on local state, files, or resources created by previous Activities on a particular host. + +## Problem + +In distributed systems, you often need Workflows that download a file to a Worker's local disk and then process and upload it from the same location, establish a connection or session that subsequent Activities must reuse, create temporary resources on one host that later Activities need to access, or maintain affinity to a specific Worker for performance or data locality. + +Without Worker-specific routing, Activities execute on different hosts and cannot access local files or state. +You must set up complex distributed file systems or shared storage, handle race conditions when multiple Workers access the same resources, and accept that you cannot guarantee Activity colocation. + +## Solution + +You use a two-tier Task Queue architecture: a default shared Task Queue for initial Activities, and dynamically-named host-specific Task Queues for Activities that must run on the same Worker. +The first Activity returns its host-specific Task Queue name, and subsequent Activities use that queue. + +```mermaid +sequenceDiagram + participant Workflow + participant Worker1 + participant Worker2 + participant Worker3 + + Note over Workflow: Default Task Queue: "FileProcessing" + Workflow->>Worker2: download() - any worker + activate Worker2 + Worker2-->>Workflow: {hostQueue: "FileProcessing-host2", file: "/tmp/data"} + deactivate Worker2 + + Note over Workflow: Switch to host-specific queue + Note over Workflow: Task Queue: "FileProcessing-host2" + + Workflow->>Worker2: process(file) - MUST be Worker2 + activate Worker2 + Worker2-->>Workflow: processed file + deactivate Worker2 + + Workflow->>Worker2: upload(file) - MUST be Worker2 + activate Worker2 + Worker2-->>Workflow: done + deactivate Worker2 + + Note over Worker1,Worker3: Workers 1 & 3 never see
host-specific activities +``` + +The following describes each step in the diagram: + +1. The Workflow dispatches the download Activity on the default Task Queue. Any available Worker picks it up. +2. Worker 2 downloads the file and returns both the local file path and its host-specific Task Queue name. +3. The Workflow creates new Activity options targeting Worker 2's host-specific Task Queue. +4. The process and upload Activities execute on Worker 2, where the file is already on disk. +5. Workers 1 and 3 never see the host-specific Activities. + +The following snippet shows how the Workflow switches from the default Task Queue to the host-specific queue: + + + + +```python +# workflows.py +downloaded = await workflow.execute_activity( + download, + source, + start_to_close_timeout=timedelta(seconds=20), +) + +processed = await workflow.execute_activity( + process, + downloaded.file_name, + task_queue=downloaded.host_task_queue, + schedule_to_start_timeout=timedelta(seconds=10), + start_to_close_timeout=timedelta(seconds=20), +) +await workflow.execute_activity( + upload, + UploadInput(processed, destination), + task_queue=downloaded.host_task_queue, + schedule_to_start_timeout=timedelta(seconds=10), + start_to_close_timeout=timedelta(seconds=20), +) +``` + + + + +```go +// workflow.go +var defaultActivities *StoreActivities +downloaded, err := defaultActivities.Download(ctx, source) +if err != nil { + return err +} + +hostOptions := workflow.ActivityOptions{ + TaskQueue: downloaded.HostTaskQueue, + ScheduleToStartTimeout: 10 * time.Second, + StartToCloseTimeout: 20 * time.Second, +} +hostCtx := workflow.WithActivityOptions(ctx, hostOptions) + +var hostActivities *StoreActivities +processed, err := hostActivities.Process(hostCtx, downloaded.FileName) +if err != nil { + return err +} +err = hostActivities.Upload(hostCtx, processed, destination) +``` + + + + +```java +// FileProcessingWorkflowImpl.java +TaskQueueFileNamePair downloaded = 
 defaultTaskQueueActivities.download(source); + +ActivityOptions hostOptions = ActivityOptions.newBuilder() + .setTaskQueue(downloaded.getHostTaskQueue()) + .setScheduleToStartTimeout(Duration.ofSeconds(10)) + .setStartToCloseTimeout(Duration.ofSeconds(20)) + .build(); +StoreActivities hostSpecificActivities = + Workflow.newActivityStub(StoreActivities.class, hostOptions); + +String processed = hostSpecificActivities.process(downloaded.getFileName()); +hostSpecificActivities.upload(processed, destination); +``` + + + + +```typescript +// workflows.ts +const { download } = proxyActivities({ + startToCloseTimeout: '20s', +}); + +const downloaded = await download(source); + +const hostSpecificActivities = proxyActivities({ + taskQueue: downloaded.hostTaskQueue, + scheduleToStartTimeout: '10s', + startToCloseTimeout: '20s', +}); + +const processed = await hostSpecificActivities.process(downloaded.fileName); +await hostSpecificActivities.upload(processed, destination); +``` + + + + +The `taskQueue` option (or `setTaskQueue()` in Java) routes subsequent Activities to the specific Worker that downloaded the file. +The `scheduleToStartTimeout` (or `setScheduleToStartTimeout()` in Java) is critical — if the specific Worker is unavailable, the Activity fails with a timeout instead of waiting indefinitely. +Schedule-To-Start timeouts are not retried by the Activity's Retry Policy, so the Workflow must handle the failure itself, for example by restarting the sequence on a different host. 
+ +## Implementation + +### Activity definition with host-specific return + +The download Activity returns both the file path and the host-specific Task Queue name: + + + + +```python +# activities.py +from dataclasses import dataclass +from temporalio import activity + +@dataclass +class TaskQueueFileNamePair: + host_task_queue: str + file_name: str + +@activity.defn +async def download(source: str) -> TaskQueueFileNamePair: + local_file = await download_to_local_disk(source) + return TaskQueueFileNamePair( + host_task_queue=host_specific_task_queue, + file_name=local_file, + ) + +@activity.defn +async def process(file_name: str) -> str: + return await process_local_file(file_name) + +@activity.defn +async def upload(file_name: str, destination: str) -> None: + await upload_from_local_disk(file_name, destination) +``` + + + + +```go +// activities.go +type TaskQueueFileNamePair struct { + HostTaskQueue string + FileName string +} + +type StoreActivities struct { + HostSpecificTaskQueue string +} + +func (a *StoreActivities) Download(ctx context.Context, source string) (*TaskQueueFileNamePair, error) { + localFile, err := downloadToLocalDisk(source) + if err != nil { + return nil, err + } + return &TaskQueueFileNamePair{ + HostTaskQueue: a.HostSpecificTaskQueue, + FileName: localFile, + }, nil +} + +func (a *StoreActivities) Process(ctx context.Context, fileName string) (string, error) { + return processLocalFile(fileName) +} + +func (a *StoreActivities) Upload(ctx context.Context, fileName string, destination string) error { + return uploadFromLocalDisk(fileName, destination) +} +``` + + + + +```java +// StoreActivities.java +public interface StoreActivities { + + class TaskQueueFileNamePair { + private final String hostTaskQueue; + private final String fileName; + + public TaskQueueFileNamePair(String hostTaskQueue, String fileName) { + this.hostTaskQueue = hostTaskQueue; + this.fileName = fileName; + } + + public String getHostTaskQueue() { return hostTaskQueue; 
} + public String getFileName() { return fileName; } + } + + TaskQueueFileNamePair download(URL source); + String process(String fileName); + void upload(String fileName, URL destination); +} +``` + + + + +```typescript +// activities.ts +export interface TaskQueueFileNamePair { + hostTaskQueue: string; + fileName: string; +} + +export async function download(source: string): Promise { + const localFile = await downloadToLocalDisk(source); + return { + hostTaskQueue: getHostSpecificTaskQueue(), + fileName: localFile, + }; +} + +export async function process(fileName: string): Promise { + return await processLocalFile(fileName); +} + +export async function upload(fileName: string, destination: string): Promise { + await uploadFromLocalDisk(fileName, destination); +} +``` + + + + +The download Activity bundles the local file path with the Task Queue name so the Workflow knows where to route subsequent Activities. + +### Activity implementation + +The Activity implementation receives the host-specific Task Queue name at construction time and includes it in the download result: + + + + +```python +# activities.py +# In Python, the host-specific Task Queue name is injected at Worker +# startup and captured by the activity closure or class instance. + +host_specific_task_queue: str = "" + +@activity.defn +async def download(source: str) -> TaskQueueFileNamePair: + local_file = await download_to_local_disk(source) + return TaskQueueFileNamePair( + host_task_queue=host_specific_task_queue, + file_name=local_file, + ) + +@activity.defn +async def process(file_name: str) -> str: + processed = await process_local_file(file_name) + return processed + +@activity.defn +async def upload(file_name: str, destination: str) -> None: + await upload_from_local_disk(file_name, destination) +``` + + + + +```go +// activities.go +// In Go, the host-specific Task Queue name is set on the struct +// at Worker startup and returned by the Download method. 
+ +func (a *StoreActivities) Download(ctx context.Context, source string) (*TaskQueueFileNamePair, error) { + localFile, err := downloadToLocalDisk(source) + if err != nil { + return nil, err + } + return &TaskQueueFileNamePair{ + HostTaskQueue: a.HostSpecificTaskQueue, + FileName: localFile, + }, nil +} + +func (a *StoreActivities) Process(ctx context.Context, fileName string) (string, error) { + return processLocalFile(fileName) +} + +func (a *StoreActivities) Upload(ctx context.Context, fileName string, destination string) error { + return uploadFromLocalDisk(fileName, destination) +} +``` + + + + +```java +// StoreActivitiesImpl.java +public class StoreActivitiesImpl implements StoreActivities { + private final String hostSpecificTaskQueue; + + public StoreActivitiesImpl(String hostSpecificTaskQueue) { + this.hostSpecificTaskQueue = hostSpecificTaskQueue; + } + + @Override + public TaskQueueFileNamePair download(URL source) { + File localFile = downloadToLocalDisk(source); + return new TaskQueueFileNamePair( + hostSpecificTaskQueue, + localFile.getAbsolutePath()); + } + + @Override + public String process(String fileName) { + File processed = processLocalFile(new File(fileName)); + return processed.getAbsolutePath(); + } + + @Override + public void upload(String fileName, URL destination) { + uploadFromLocalDisk(new File(fileName), destination); + } +} +``` + + + + +```typescript +// activities.ts +// In TypeScript, the host-specific Task Queue name is captured via +// closure when defining the activity functions. A common approach is +// to initialize it at Worker startup and reference it from activities. 
+ +let hostSpecificTaskQueue: string; + +export function initActivities(taskQueue: string) { + hostSpecificTaskQueue = taskQueue; +} + +function getHostSpecificTaskQueue(): string { + return hostSpecificTaskQueue; +} + +export async function download(source: string): Promise { + const localFile = await downloadToLocalDisk(source); + return { + hostTaskQueue: getHostSpecificTaskQueue(), + fileName: localFile, + }; +} + +export async function process(fileName: string): Promise { + return await processLocalFile(fileName); +} + +export async function upload(fileName: string, destination: string): Promise { + await uploadFromLocalDisk(fileName, destination); +} +``` + + + + +The `download` method returns the host-specific Task Queue name alongside the file path. +The `process` and `upload` methods operate on local files, which are guaranteed to exist because they run on the same host. + +### Workflow implementation + +The Workflow uses the default Task Queue for the initial download and switches to the host-specific queue for subsequent Activities: + + + + +```python +# workflows.py +from datetime import timedelta +from temporalio import workflow + +with workflow.unsafe.imports_passed_through(): + from activities import download, process, upload + +@workflow.defn +class FileProcessingWorkflow: + @workflow.run + async def run(self, source: str, destination: str) -> None: + downloaded = await workflow.execute_activity( + download, + source, + start_to_close_timeout=timedelta(seconds=20), + ) + + processed = await workflow.execute_activity( + process, + downloaded.file_name, + task_queue=downloaded.host_task_queue, + schedule_to_start_timeout=timedelta(seconds=10), + start_to_close_timeout=timedelta(seconds=20), + ) + + await workflow.execute_activity( + upload, + UploadInput(processed, destination), + task_queue=downloaded.host_task_queue, + schedule_to_start_timeout=timedelta(seconds=10), + start_to_close_timeout=timedelta(seconds=20), + ) +``` + + + + +```go +// 
workflow.go +func FileProcessingWorkflow(ctx workflow.Context, source string, destination string) error { + defaultOptions := workflow.ActivityOptions{ + StartToCloseTimeout: 20 * time.Second, + } + defaultCtx := workflow.WithActivityOptions(ctx, defaultOptions) + + var activities *StoreActivities + var downloaded TaskQueueFileNamePair + err := workflow.ExecuteActivity(defaultCtx, activities.Download, source).Get(ctx, &downloaded) + if err != nil { + return err + } + + hostOptions := workflow.ActivityOptions{ + TaskQueue: downloaded.HostTaskQueue, + ScheduleToStartTimeout: 10 * time.Second, + StartToCloseTimeout: 20 * time.Second, + } + hostCtx := workflow.WithActivityOptions(ctx, hostOptions) + + var processed string + err = workflow.ExecuteActivity(hostCtx, activities.Process, downloaded.FileName).Get(ctx, &processed) + if err != nil { + return err + } + + return workflow.ExecuteActivity(hostCtx, activities.Upload, processed, destination).Get(ctx, nil) +} +``` + + + + +```java +// FileProcessingWorkflowImpl.java +public class FileProcessingWorkflowImpl implements FileProcessingWorkflow { + private final StoreActivities defaultTaskQueueActivities; + + public FileProcessingWorkflowImpl() { + ActivityOptions defaultOptions = ActivityOptions.newBuilder() + .setStartToCloseTimeout(Duration.ofSeconds(20)) + .build(); + this.defaultTaskQueueActivities = + Workflow.newActivityStub(StoreActivities.class, defaultOptions); + } + + @Override + public void processFile(URL source, URL destination) { + TaskQueueFileNamePair downloaded = + defaultTaskQueueActivities.download(source); + + ActivityOptions hostOptions = ActivityOptions.newBuilder() + .setTaskQueue(downloaded.getHostTaskQueue()) + .setScheduleToStartTimeout(Duration.ofSeconds(10)) + .setStartToCloseTimeout(Duration.ofSeconds(20)) + .build(); + StoreActivities hostSpecificActivities = + Workflow.newActivityStub(StoreActivities.class, hostOptions); + + String processed = 
hostSpecificActivities.process(downloaded.getFileName()); + hostSpecificActivities.upload(processed, destination); + } +} +``` + + + + +```typescript +// workflows.ts +import { proxyActivities } from '@temporalio/workflow'; +import type * as activities from './activities'; + +const { download } = proxyActivities({ + startToCloseTimeout: '20s', +}); + +export async function fileProcessingWorkflow( + source: string, + destination: string, +): Promise { + const downloaded = await download(source); + + const hostSpecificActivities = proxyActivities({ + taskQueue: downloaded.hostTaskQueue, + scheduleToStartTimeout: '10s', + startToCloseTimeout: '20s', + }); + + const processed = await hostSpecificActivities.process(downloaded.fileName); + await hostSpecificActivities.upload(processed, destination); +} +``` + + + + +The Workflow creates two sets of Activity options: one for the default Task Queue and one for the host-specific queue returned by the download Activity. + +### Worker setup + +Each Worker registers with both the default Task Queue and its own host-specific Task Queue: + + + + +```python +# worker.py +import asyncio +import uuid +import socket +from temporalio.client import Client +from temporalio.worker import Worker +from workflows import FileProcessingWorkflow +from activities import download, process, upload, host_specific_task_queue +import activities as act_module + +async def main(): + client = await Client.connect("localhost:7233") + + default_task_queue = "FileProcessing" + host_task_queue = f"FileProcessing-{socket.gethostname()}-{uuid.uuid4()}" + + act_module.host_specific_task_queue = host_task_queue + + default_worker = Worker( + client, + task_queue=default_task_queue, + workflows=[FileProcessingWorkflow], + activities=[download, process, upload], + ) + host_worker = Worker( + client, + task_queue=host_task_queue, + activities=[download, process, upload], + ) + + await asyncio.gather(default_worker.run(), host_worker.run()) + +if __name__ == 
"__main__": + asyncio.run(main()) +``` + + + + +```go +// worker/main.go +func main() { + c, err := client.Dial(client.Options{}) + if err != nil { + log.Fatalln("Unable to create client", err) + } + defer c.Close() + + defaultTaskQueue := "FileProcessing" + hostTaskQueue := fmt.Sprintf("FileProcessing-%s-%s", getHostName(), uuid.New().String()) + + activities := &StoreActivities{HostSpecificTaskQueue: hostTaskQueue} + + defaultWorker := worker.New(c, defaultTaskQueue, worker.Options{}) + defaultWorker.RegisterWorkflow(FileProcessingWorkflow) + defaultWorker.RegisterActivity(activities) + + hostWorker := worker.New(c, hostTaskQueue, worker.Options{}) + hostWorker.RegisterActivity(activities) + + err = defaultWorker.Start() + if err != nil { + log.Fatalln("Unable to start default worker", err) + } + err = hostWorker.Start() + if err != nil { + log.Fatalln("Unable to start host worker", err) + } + + // Block until interrupted + select {} +} +``` + + + + +```java +// FileProcessingWorker.java +public class FileProcessingWorker { + public static void main(String[] args) { + WorkflowClient client = WorkflowClient.newInstance(service); + + String defaultTaskQueue = "FileProcessing"; + String hostTaskQueue = "FileProcessing-" + getHostName(); + + WorkerFactory factory = WorkerFactory.newInstance(client); + + Worker defaultWorker = factory.newWorker(defaultTaskQueue); + defaultWorker.registerWorkflowImplementationTypes( + FileProcessingWorkflowImpl.class); + defaultWorker.registerActivitiesImplementations( + new StoreActivitiesImpl(hostTaskQueue)); + + Worker hostWorker = factory.newWorker(hostTaskQueue); + hostWorker.registerActivitiesImplementations( + new StoreActivitiesImpl(hostTaskQueue)); + + factory.start(); + } +} +``` + + + + +```typescript +// worker.ts +import { Worker, NativeConnection } from '@temporalio/worker'; +import * as activities from './activities'; +import { v4 as uuid } from 'uuid'; +import os from 'os'; + +async function run() { + const 
defaultTaskQueue = 'FileProcessing'; + const hostTaskQueue = `FileProcessing-${os.hostname()}-${uuid()}`; + + activities.initActivities(hostTaskQueue); + + const defaultWorker = await Worker.create({ + workflowsPath: require.resolve('./workflows'), + activities, + taskQueue: defaultTaskQueue, + }); + + const hostWorker = await Worker.create({ + activities, + taskQueue: hostTaskQueue, + }); + + await Promise.all([defaultWorker.run(), hostWorker.run()]); +} + +run().catch((err) => { + console.error(err); + process.exit(1); +}); +``` + + + + +The default Worker handles Workflows and initial Activities. +The host-specific Worker handles only Activities that require Worker affinity. +Both Workers receive the same Activity implementation, but only the host-specific Worker receives Activities routed to its queue. + +## When to use + +The Worker-Specific Task Queues pattern is a good fit for file processing Workflows (download, process, upload on the same host), database connection pooling (maintain a connection across Activities), GPU-bound operations (route to Workers with specific hardware), session-based external API calls, and temporary resource management (cache, temp files, locks). + +It is not a good fit for stateless Activities that can run anywhere, Activities that use shared storage (S3, databases), high-availability requirements (host failure blocks the Workflow), or Workflows without local state dependencies. + +## Benefits and trade-offs + +Activities access local files and state without network overhead. +You do not need distributed file systems or state management. +Data transfer between Workers is eliminated. +The first Activity can run on any Worker; only subsequent ones are pinned. +Task Queue routing is recorded in Workflow history, ensuring deterministic behavior. + +The trade-offs to consider are that if the specific Worker crashes, Activities cannot proceed until the ScheduleToStartTimeout expires. 
+Host-specific queues may have uneven load distribution. +You must manage multiple Task Queues per Worker. +You must set ScheduleToStartTimeout to handle Worker unavailability. +You need to handle cleanup if the Workflow fails mid-process. + +## Comparison with alternatives + +| Approach | Locality | Complexity | Availability | +| :--- | :--- | :--- | :--- | +| Worker-Specific Queues | Guaranteed | Medium | Lower | +| Shared Storage (S3) | None | Low | Higher | +| Sticky Execution | Best effort | Low | Higher | +| Session Framework | Guaranteed | High | Lower | + +## Best practices + +- **Set ScheduleToStartTimeout.** Always configure this for host-specific queues to handle Worker failures. +- **Implement cleanup.** Use try-finally or cancellation scopes to clean up local resources. +- **Use unique queue names.** Use hostname, IP, or UUID to ensure unique Task Queue names. +- **Monitor queue depth.** Alert on growing host-specific queue backlogs. +- **Drain gracefully.** Drain host-specific queues before stopping Workers. +- **Retry the entire sequence.** Wrap the sequence in retry logic to restart on a different host if needed. +- **Limit concurrent Workflows.** Limit concurrent Workflows per Worker to prevent resource exhaustion. +- **Add health checks.** Verify Worker health before accepting work on host-specific queues. + +## Common pitfalls + +- **Missing ScheduleToStartTimeout on host-specific queues.** Without this timeout, if the target Worker is down, the Activity waits indefinitely. Always set `ScheduleToStartTimeout` so the Workflow can detect unavailability and retry on a different host. +- **Not registering the Worker on both queues.** Each Worker must listen on both the default shared Task Queue (for Workflows and initial Activities) and its own host-specific queue. Forgetting the host-specific queue means routed Activities are never picked up. +- **Assuming the host-specific Worker is always available.** The pinned Worker can crash or be restarted. 
Design the Workflow to retry the entire sequence on a different host when the `ScheduleToStartTimeout` expires. +- **Leaking temporary files on failure.** If the Workflow fails after downloading but before uploading, temporary files remain on disk. Use cleanup logic (defer, try-finally, or cancellation scopes) to remove local resources. +- **Using host-specific queues when shared storage suffices.** If all Workers can access the same storage (S3, NFS), Worker-specific routing adds unnecessary complexity and reduces availability. + +## Related patterns + +- **[Long-Running Activity](/design-patterns/long-running-activity)**: For very short operations that benefit from colocation. + +## Sample code + +- [Java Sample](https://github.com/temporalio/samples-java/tree/main/core/src/main/java/io/temporal/samples/fileprocessing) — Complete file processing implementation. +- [TypeScript Sample](https://github.com/temporalio/samples-typescript/tree/main/worker-specific-task-queues) — Worker-specific Task Queues with file processing. +- [Python Sample](https://github.com/temporalio/samples-python/tree/main/worker_specific_task_queues) — Worker-specific Task Queues with file processing. +- [Go Sample](https://github.com/temporalio/samples-go/tree/main/worker-specific-task-queues) — Worker-specific Task Queues with file processing. diff --git a/docs/design-patterns/workflow-messaging-patterns.mdx b/docs/design-patterns/workflow-messaging-patterns.mdx new file mode 100644 index 0000000000..f502c4f692 --- /dev/null +++ b/docs/design-patterns/workflow-messaging-patterns.mdx @@ -0,0 +1,25 @@ +--- +id: workflow-messaging-patterns +title: Workflow messaging patterns +sidebar_label: Overview +description: Patterns for sending data into running Workflows and receiving responses or triggering behavior changes. +--- + +import PatternCards from '@site/src/components/PatternCards'; + +Patterns for sending data into running Workflows and receiving responses or triggering behavior changes. 
+ + diff --git a/docusaurus.config.js b/docusaurus.config.js index ae5a372f38..5a697d6488 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -94,6 +94,12 @@ module.exports = async function createConfigAsync() { activeBasePath: 'ai-cookbook', position: 'left', }, + { + label: 'Design Patterns', + to: '/design-patterns', + activeBasePath: 'design-patterns', + position: 'left', + }, { label: 'Code Exchange', href: 'https://temporal.io/code-exchange', diff --git a/sidebars.js b/sidebars.js index d284364099..7388d35028 100644 --- a/sidebars.js +++ b/sidebars.js @@ -1700,6 +1700,114 @@ module.exports = { // dirName: "./dev-guide", // '.' means the current docs folder // }, ], + designPatterns: [ + 'design-patterns/index', + { + type: 'category', + label: 'Distributed Transaction Patterns', + collapsed: true, + link: { type: 'doc', id: 'design-patterns/distributed-transaction-patterns' }, + items: [ + 'design-patterns/saga-pattern', + 'design-patterns/early-return', + 'design-patterns/idempotent-distributed-transactions', + ], + }, + { + type: 'category', + label: 'Entity & Lifecycle Patterns', + collapsed: true, + link: { type: 'doc', id: 'design-patterns/entity-lifecycle-patterns' }, + items: [ + 'design-patterns/entity-workflow', + 'design-patterns/continue-as-new', + 'design-patterns/updatable-timer', + ], + }, + { + type: 'category', + label: 'Workflow Messaging Patterns', + collapsed: true, + link: { type: 'doc', id: 'design-patterns/workflow-messaging-patterns' }, + items: [ + 'design-patterns/signal-with-start', + 'design-patterns/request-response-via-updates', + 'design-patterns/event-accumulator', + ], + }, + { + type: 'category', + label: 'Task Orchestration Patterns', + collapsed: true, + link: { type: 'doc', id: 'design-patterns/task-orchestration-patterns' }, + items: [ + 'design-patterns/child-workflows', + 'design-patterns/parallel-execution', + 'design-patterns/pick-first', + ], + }, + { + type: 'category', + label: 'External Interaction 
Patterns', + collapsed: true, + link: { type: 'doc', id: 'design-patterns/external-interaction-patterns' }, + items: [ + 'design-patterns/polling', + 'design-patterns/long-running-activity', + 'design-patterns/approval', + 'design-patterns/delayed-start', + 'design-patterns/delayed-callback', + ], + }, + { + type: 'category', + label: 'Worker Configuration Patterns', + collapsed: true, + link: { type: 'doc', id: 'design-patterns/worker-configuration-patterns' }, + items: [ + 'design-patterns/worker-specific-taskqueue', + 'design-patterns/activity-dependency-injection', + ], + }, + { + type: 'category', + label: 'Error Handling & Retry Patterns', + collapsed: true, + link: { type: 'doc', id: 'design-patterns/error-handling-patterns' }, + items: [ + 'design-patterns/fixed-count-retries', + 'design-patterns/fixed-wall-time-retries', + 'design-patterns/non-retryable-errors', + 'design-patterns/delayed-retry', + 'design-patterns/fast-slow-retries', + 'design-patterns/retry-metrics', + 'design-patterns/resumable-activity', + ], + }, + { + type: 'category', + label: 'QoS & Throughput Patterns', + collapsed: true, + link: { type: 'doc', id: 'design-patterns/qos-throughput-patterns' }, + items: [ + 'design-patterns/downstream-rate-limiting', + 'design-patterns/priority-task-queues', + 'design-patterns/fairness', + ], + }, + { + type: 'category', + label: 'Batch Processing Patterns', + collapsed: true, + link: { type: 'doc', id: 'design-patterns/batch-processing-patterns' }, + items: [ + 'design-patterns/fanout-child-workflows', + 'design-patterns/batch-iterator', + 'design-patterns/sliding-window', + 'design-patterns/mapreduce-tree', + ], + }, + ], tctl: [ 'tctl-v1/index', 'tctl-v1/activity', diff --git a/src/components/PatternCards.tsx b/src/components/PatternCards.tsx index 8ed2298001..23512aa0c6 100644 --- a/src/components/PatternCards.tsx +++ b/src/components/PatternCards.tsx @@ -1,16 +1,25 @@ import React from 'react'; +import useBaseUrl from '@docusaurus/useBaseUrl'; 
type PatternCardItem = { href: string; title: string; description: string; external?: boolean; + icon?: string; }; type PatternCardsProps = { items: PatternCardItem[]; }; +const ICON_BASE = '/img/design-patterns/icons/'; + +function CardIcon({ icon, title }: { icon: string; title: string }) { + const src = useBaseUrl(icon.startsWith('/') ? icon : `${ICON_BASE}${icon}`); + return {title}; +} + export default function PatternCards({ items }: PatternCardsProps) { return (
@@ -22,7 +31,14 @@ export default function PatternCards({ items }: PatternCardsProps) { {...(item.external ? { target: '_blank', rel: 'noopener noreferrer' } : {})} >
-

{item.title}

+ {item.icon ? ( +
+ +

{item.title}

+
+ ) : ( +

{item.title}

+ )}

{item.description}

diff --git a/src/css/custom.css b/src/css/custom.css index 863c7ac513..65962513eb 100644 --- a/src/css/custom.css +++ b/src/css/custom.css @@ -1249,6 +1249,24 @@ code { color: var(--ifm-color-primary); } +.pattern-card-header { + display: flex; + align-items: center; + gap: 0.75rem; + margin-bottom: 0.75rem; +} + +.pattern-card-header img { + width: 48px; + height: 36px; + object-fit: contain; + flex-shrink: 0; +} + +.pattern-card-header h3 { + margin: 0; +} + .pattern-content h3 { font-size: 1.25rem; font-weight: 500; @@ -1278,6 +1296,126 @@ code { } } +/* + * Theme-aware semantic colors for Design Patterns mermaid diagrams. + * Diagrams register semantic classes (classDef ...) and these variables + * are applied to the nodes below, so colors adapt automatically to light/dark mode. + */ +:root { + --dp-mermaid-success-bg: #d8f5dd; + --dp-mermaid-success-border: #2e9e4f; + --dp-mermaid-success-text: #14361f; + --dp-mermaid-compensation-bg: #fcd9e2; + --dp-mermaid-compensation-border: #d6457f; + --dp-mermaid-compensation-text: #3f1020; + --dp-mermaid-wait-bg: #ffe8c2; + --dp-mermaid-wait-border: #e0941f; + --dp-mermaid-wait-text: #3d2906; + --dp-mermaid-complete-bg: #d7e2ff; + --dp-mermaid-complete-border: #3f63e0; + --dp-mermaid-complete-text: #11214f; + --dp-mermaid-fail-bg: #ffd7dc; + --dp-mermaid-fail-border: #d83a3a; + --dp-mermaid-fail-text: #470f13; + --dp-mermaid-highlight-bg: #fff1c2; + --dp-mermaid-highlight-border: #d6a90f; + --dp-mermaid-highlight-text: #3b3107; +} + +[data-theme='dark'] { + --dp-mermaid-success-bg: #173a27; + --dp-mermaid-success-border: #43c46a; + --dp-mermaid-success-text: #c5f3cf; + --dp-mermaid-compensation-bg: #3a1c2a; + --dp-mermaid-compensation-border: #f06ba8; + --dp-mermaid-compensation-text: #fbd2e2; + --dp-mermaid-wait-bg: #3a2c14; + --dp-mermaid-wait-border: #f0b02e; + --dp-mermaid-wait-text: #f8e3b6; + --dp-mermaid-complete-bg: #182a52; + --dp-mermaid-complete-border: #6f8cff; + --dp-mermaid-complete-text: #d3deff; + 
--dp-mermaid-fail-bg: #3a1a1e; + --dp-mermaid-fail-border: #f0595c; + --dp-mermaid-fail-text: #f8cccf; + --dp-mermaid-highlight-bg: #3a3414; + --dp-mermaid-highlight-border: #f3cb29; + --dp-mermaid-highlight-text: #f7edab; +} + +/* + * Mermaid classDef cannot use var() in its values, so diagrams only register + * semantic classes (classDef ...) and the actual colors are applied here. + * !important is required to beat Mermaid's theme-generated node styles. + */ +.docusaurus-mermaid-container g.node.success rect, +.docusaurus-mermaid-container g.node.success polygon, +.docusaurus-mermaid-container g.node.success path, +.docusaurus-mermaid-container g.node.success circle { + fill: var(--dp-mermaid-success-bg) !important; + stroke: var(--dp-mermaid-success-border) !important; +} +.docusaurus-mermaid-container g.node.compensation rect, +.docusaurus-mermaid-container g.node.compensation polygon, +.docusaurus-mermaid-container g.node.compensation path, +.docusaurus-mermaid-container g.node.compensation circle { + fill: var(--dp-mermaid-compensation-bg) !important; + stroke: var(--dp-mermaid-compensation-border) !important; +} +.docusaurus-mermaid-container g.node.wait rect, +.docusaurus-mermaid-container g.node.wait polygon, +.docusaurus-mermaid-container g.node.wait path, +.docusaurus-mermaid-container g.node.wait circle { + fill: var(--dp-mermaid-wait-bg) !important; + stroke: var(--dp-mermaid-wait-border) !important; +} +.docusaurus-mermaid-container g.node.complete rect, +.docusaurus-mermaid-container g.node.complete polygon, +.docusaurus-mermaid-container g.node.complete path, +.docusaurus-mermaid-container g.node.complete circle { + fill: var(--dp-mermaid-complete-bg) !important; + stroke: var(--dp-mermaid-complete-border) !important; +} +.docusaurus-mermaid-container g.node.fail rect, +.docusaurus-mermaid-container g.node.fail polygon, +.docusaurus-mermaid-container g.node.fail path, +.docusaurus-mermaid-container g.node.fail circle { + fill: 
var(--dp-mermaid-fail-bg) !important; + stroke: var(--dp-mermaid-fail-border) !important; +} +.docusaurus-mermaid-container g.node.highlight rect, +.docusaurus-mermaid-container g.node.highlight polygon, +.docusaurus-mermaid-container g.node.highlight path, +.docusaurus-mermaid-container g.node.highlight circle { + fill: var(--dp-mermaid-highlight-bg) !important; + stroke: var(--dp-mermaid-highlight-border) !important; +} + +.docusaurus-mermaid-container g.node.success .nodeLabel, +.docusaurus-mermaid-container g.node.success .nodeLabel p { + color: var(--dp-mermaid-success-text) !important; +} +.docusaurus-mermaid-container g.node.compensation .nodeLabel, +.docusaurus-mermaid-container g.node.compensation .nodeLabel p { + color: var(--dp-mermaid-compensation-text) !important; +} +.docusaurus-mermaid-container g.node.wait .nodeLabel, +.docusaurus-mermaid-container g.node.wait .nodeLabel p { + color: var(--dp-mermaid-wait-text) !important; +} +.docusaurus-mermaid-container g.node.complete .nodeLabel, +.docusaurus-mermaid-container g.node.complete .nodeLabel p { + color: var(--dp-mermaid-complete-text) !important; +} +.docusaurus-mermaid-container g.node.fail .nodeLabel, +.docusaurus-mermaid-container g.node.fail .nodeLabel p { + color: var(--dp-mermaid-fail-text) !important; +} +.docusaurus-mermaid-container g.node.highlight .nodeLabel, +.docusaurus-mermaid-container g.node.highlight .nodeLabel p { + color: var(--dp-mermaid-highlight-text) !important; +} + /* Ask AI Navbar Button */ .ask-ai-navbar-button { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); diff --git a/static/img/design-patterns/icons/activity-dependency-injection-icon.png b/static/img/design-patterns/icons/activity-dependency-injection-icon.png new file mode 100644 index 0000000000..6c8685fe32 Binary files /dev/null and b/static/img/design-patterns/icons/activity-dependency-injection-icon.png differ diff --git a/static/img/design-patterns/icons/approval-icon.png 
b/static/img/design-patterns/icons/approval-icon.png new file mode 100644 index 0000000000..5f5a8a2ace Binary files /dev/null and b/static/img/design-patterns/icons/approval-icon.png differ diff --git a/static/img/design-patterns/icons/child-workflows-icon.png b/static/img/design-patterns/icons/child-workflows-icon.png new file mode 100644 index 0000000000..ca7740d579 Binary files /dev/null and b/static/img/design-patterns/icons/child-workflows-icon.png differ diff --git a/static/img/design-patterns/icons/continue-as-new-icon.png b/static/img/design-patterns/icons/continue-as-new-icon.png new file mode 100644 index 0000000000..0bc6ace5bf Binary files /dev/null and b/static/img/design-patterns/icons/continue-as-new-icon.png differ diff --git a/static/img/design-patterns/icons/delayed-start-icon.png b/static/img/design-patterns/icons/delayed-start-icon.png new file mode 100644 index 0000000000..31575cb867 Binary files /dev/null and b/static/img/design-patterns/icons/delayed-start-icon.png differ diff --git a/static/img/design-patterns/icons/downstream-rate-limiting-icon.svg b/static/img/design-patterns/icons/downstream-rate-limiting-icon.svg new file mode 100644 index 0000000000..30160c70c3 --- /dev/null +++ b/static/img/design-patterns/icons/downstream-rate-limiting-icon.svg @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + diff --git a/static/img/design-patterns/icons/early-return-icon.png b/static/img/design-patterns/icons/early-return-icon.png new file mode 100644 index 0000000000..90ecdd0662 Binary files /dev/null and b/static/img/design-patterns/icons/early-return-icon.png differ diff --git a/static/img/design-patterns/icons/entity-workflow-icon.png b/static/img/design-patterns/icons/entity-workflow-icon.png new file mode 100644 index 0000000000..e9d1f93604 Binary files /dev/null and b/static/img/design-patterns/icons/entity-workflow-icon.png differ diff --git a/static/img/design-patterns/icons/event-accumulator-icon.png 
b/static/img/design-patterns/icons/event-accumulator-icon.png new file mode 100644 index 0000000000..130deb9ca1 Binary files /dev/null and b/static/img/design-patterns/icons/event-accumulator-icon.png differ diff --git a/static/img/design-patterns/icons/fairness-icon.svg b/static/img/design-patterns/icons/fairness-icon.svg new file mode 100644 index 0000000000..7815564200 --- /dev/null +++ b/static/img/design-patterns/icons/fairness-icon.svg @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/static/img/design-patterns/icons/long-running-activity-icon.png b/static/img/design-patterns/icons/long-running-activity-icon.png new file mode 100644 index 0000000000..177fc2ccfd Binary files /dev/null and b/static/img/design-patterns/icons/long-running-activity-icon.png differ diff --git a/static/img/design-patterns/icons/parallel-execution-icon.png b/static/img/design-patterns/icons/parallel-execution-icon.png new file mode 100644 index 0000000000..7f8ac4c774 Binary files /dev/null and b/static/img/design-patterns/icons/parallel-execution-icon.png differ diff --git a/static/img/design-patterns/icons/pick-first-icon.png b/static/img/design-patterns/icons/pick-first-icon.png new file mode 100644 index 0000000000..971f5ee115 Binary files /dev/null and b/static/img/design-patterns/icons/pick-first-icon.png differ diff --git a/static/img/design-patterns/icons/polling-icon.png b/static/img/design-patterns/icons/polling-icon.png new file mode 100644 index 0000000000..98eb184e44 Binary files /dev/null and b/static/img/design-patterns/icons/polling-icon.png differ diff --git a/static/img/design-patterns/icons/priority-task-queues-icon.svg b/static/img/design-patterns/icons/priority-task-queues-icon.svg new file mode 100644 index 0000000000..3d62decb95 --- /dev/null +++ b/static/img/design-patterns/icons/priority-task-queues-icon.svg @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git 
a/static/img/design-patterns/icons/request-response-icon.png b/static/img/design-patterns/icons/request-response-icon.png new file mode 100644 index 0000000000..e42eb21b80 Binary files /dev/null and b/static/img/design-patterns/icons/request-response-icon.png differ diff --git a/static/img/design-patterns/icons/saga-icon.png b/static/img/design-patterns/icons/saga-icon.png new file mode 100644 index 0000000000..689bb78b08 Binary files /dev/null and b/static/img/design-patterns/icons/saga-icon.png differ diff --git a/static/img/design-patterns/icons/signal-with-start-icon.png b/static/img/design-patterns/icons/signal-with-start-icon.png new file mode 100644 index 0000000000..130deb9ca1 Binary files /dev/null and b/static/img/design-patterns/icons/signal-with-start-icon.png differ diff --git a/static/img/design-patterns/icons/updatable-timer-icon.png b/static/img/design-patterns/icons/updatable-timer-icon.png new file mode 100644 index 0000000000..e42b48dc4b Binary files /dev/null and b/static/img/design-patterns/icons/updatable-timer-icon.png differ diff --git a/static/img/design-patterns/icons/webhooks-icon.png b/static/img/design-patterns/icons/webhooks-icon.png new file mode 100644 index 0000000000..554026d114 Binary files /dev/null and b/static/img/design-patterns/icons/webhooks-icon.png differ diff --git a/static/img/design-patterns/icons/worker-specific-taskqueue-icon.png b/static/img/design-patterns/icons/worker-specific-taskqueue-icon.png new file mode 100644 index 0000000000..b7008b1139 Binary files /dev/null and b/static/img/design-patterns/icons/worker-specific-taskqueue-icon.png differ