diff --git a/docs/guides/assets/lock-shared-resources/distributed-lock-1.svg b/docs/guides/assets/lock-shared-resources/distributed-lock-1.svg
new file mode 100644
index 0000000000..b8526307b1
--- /dev/null
+++ b/docs/guides/assets/lock-shared-resources/distributed-lock-1.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/guides/assets/saga-pattern/keep-business-moving-architectural-comparison.png b/docs/guides/assets/saga-pattern/keep-business-moving-architectural-comparison.png
new file mode 100644
index 0000000000..31dbb0f95b
Binary files /dev/null and b/docs/guides/assets/saga-pattern/keep-business-moving-architectural-comparison.png differ
diff --git a/docs/guides/assets/saga-pattern/keep-business-moving-architecture-diagram.svg b/docs/guides/assets/saga-pattern/keep-business-moving-architecture-diagram.svg
new file mode 100644
index 0000000000..c7e9b39906
--- /dev/null
+++ b/docs/guides/assets/saga-pattern/keep-business-moving-architecture-diagram.svg
@@ -0,0 +1,4 @@
+
\ No newline at end of file
diff --git a/docs/guides/assets/saga-pattern/keep-business-moving-cover.png b/docs/guides/assets/saga-pattern/keep-business-moving-cover.png
new file mode 100644
index 0000000000..945e0ab001
Binary files /dev/null and b/docs/guides/assets/saga-pattern/keep-business-moving-cover.png differ
diff --git a/docs/guides/assets/saga-pattern/keep-business-moving-event-history.png b/docs/guides/assets/saga-pattern/keep-business-moving-event-history.png
new file mode 100644
index 0000000000..78532324d5
Binary files /dev/null and b/docs/guides/assets/saga-pattern/keep-business-moving-event-history.png differ
diff --git a/docs/guides/assets/saga-pattern/keep-business-moving-fix-employer.png b/docs/guides/assets/saga-pattern/keep-business-moving-fix-employer.png
new file mode 100644
index 0000000000..e529c55896
Binary files /dev/null and b/docs/guides/assets/saga-pattern/keep-business-moving-fix-employer.png differ
diff --git a/docs/guides/assets/saga-pattern/keep-business-moving-list-view.png b/docs/guides/assets/saga-pattern/keep-business-moving-list-view.png
new file mode 100644
index 0000000000..863eda1547
Binary files /dev/null and b/docs/guides/assets/saga-pattern/keep-business-moving-list-view.png differ
diff --git a/docs/guides/assets/saga-pattern/keep-business-moving-success.png b/docs/guides/assets/saga-pattern/keep-business-moving-success.png
new file mode 100644
index 0000000000..8e4bdb94ef
Binary files /dev/null and b/docs/guides/assets/saga-pattern/keep-business-moving-success.png differ
diff --git a/docs/guides/assets/temporary-rate-limit-increases/orchestrate-temporary-rate-limit-increases.svg b/docs/guides/assets/temporary-rate-limit-increases/orchestrate-temporary-rate-limit-increases.svg
new file mode 100644
index 0000000000..9a4f59fcc9
--- /dev/null
+++ b/docs/guides/assets/temporary-rate-limit-increases/orchestrate-temporary-rate-limit-increases.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/guides/assets/worker-execution-affinity/worker-execution-affinity.svg b/docs/guides/assets/worker-execution-affinity/worker-execution-affinity.svg
new file mode 100644
index 0000000000..38b1130641
--- /dev/null
+++ b/docs/guides/assets/worker-execution-affinity/worker-execution-affinity.svg
@@ -0,0 +1,4 @@
+
\ No newline at end of file
diff --git a/docs/guides/durable-gaming-sessions.mdx b/docs/guides/durable-gaming-sessions.mdx
new file mode 100644
index 0000000000..b1bcf3187a
--- /dev/null
+++ b/docs/guides/durable-gaming-sessions.mdx
@@ -0,0 +1,1300 @@
+---
+id: durable-gaming-sessions
+title: Player Sessions That Survive Anything
+description: Protect player sessions from backend failures by using the Actor pattern.
+sidebar_label: Durable gaming sessions
+toc_max_heading_level: 2
+author: Cecil Phillip
+tags:
+ - Workflows
+ - Activities
+ - Continue-as-new
+ - Child workflows
+ - Heartbeat
+ - Task queues
+ - Workers
+---
+
+Multiplayer game backends run into the same wall at scale: player state lives in memory, pinned to a server, and when that server goes down the session goes with it. Reconnection logic is painful, distributed state is expensive, and cross-player coordination is worse. This guide shows how to model each player as their own durable, independently-running process — one that survives crashes, executes game actions, and stays addressable for as long as the session lasts.
+
+## Problem statement
+
+When a game server goes down, every player on it loses their session. Inventory, position, active buffs, current room — gone. Reconnection has to reconstruct state from whatever made it to the database before the crash, and something always gets missed. Scaling makes this worse: keeping per-player state consistent across a cluster requires distributed caches, locking, and sync logic that grows in complexity with every new game feature. Multi-step operations between players — trading items, triggering combat, changing rooms — need coordination logic that's hard to get right and even harder to debug when it goes wrong.
+
+## Solution
+
+Each player gets their own Workflow — a durable, uniquely-addressable process identified by their player ID. It holds inventory, health, position, and room state in memory. It doesn't just store state: it acts. Joining a room, executing a combat move, updating the leaderboard — each of these happens through an Activity the Workflow drives directly. When the server goes down, the Workflow resumes on another Worker from exactly the point it left off, with no state lost and no reconnection logic required. Sessions that run for weeks don't accumulate unbounded history — the Workflow periodically uses Continue-As-New to start a fresh Event History while carrying its state forward.
+
+## Outcomes
+
+After working through this guide, you'll have a session system where:
+
+- **Server crashes don't end player sessions**: state is durable and resumes on any available Worker without custom recovery logic
+- **Game actions are reliable**: room joins, combat moves, and leaderboard updates execute through Activities with automatic retries — partial failures don't corrupt session state
+- **Sessions run as long as needed**: Continue-As-New manages history growth so a session that lasts weeks costs no more to run than one that lasts minutes
+- **Every player is independently addressable**: any service can send a command, query state, or trigger an update using the player's Workflow Id
+- **Active sessions are queryable by operators**: Search Attributes let you find who's in which room, how long a session has been running, or which players are idle
+
+## Background and best practices
+
+### What is an Actor Workflow?
+
+The Actor Workflow pattern extends the Entity Workflow pattern. An Entity Workflow is a long-running Workflow that represents a *thing*: it holds state, responds to messages, and exposes that state through Queries. An Actor Workflow is an Entity Workflow that *does things*. The distinction is behavioral:
+
+- **An entity is a thing. An actor is a thing that does things.**
+- An Entity Workflow is analogous to a distributed data cache: it stores and retrieves state.
+- An Actor Workflow is analogous to a distributed object with operations: it stores state *and* executes side effects on behalf of the entity it represents.
+
+An Entity Workflow that takes action is an Actor Workflow. Concrete examples of actors include a Workflow that scales up a database when load increases, starts a car engine when a driver authenticates, submits an order when a customer confirms checkout, or manages a player session in a multiplayer game by joining rooms and executing combat moves.
+
+AI agents are often modeled as Actor Workflows. They receive instructions (Temporal Signals or Updates), maintain conversational state, and take action by calling external services (Activities) such as language model APIs, tool integrations, and data retrieval systems.
+
+### The actor model and Temporal
+
+The actor model, originally formalized by Carl Hewitt in 1973, defines computation in terms of *actors*: autonomous units that communicate exclusively through asynchronous messages. Each actor:
+
+1. Has private, encapsulated state that other actors cannot directly access.
+2. Processes messages one at a time from an inbox (mailbox).
+3. Can send messages to other actors.
+4. Can create new actors.
+5. Can change its own internal state in response to a message.
+
+Temporal Workflows map naturally to this model:
+
+| Actor model concept | Temporal equivalent |
+|---|---|
+| Actor identity | Workflow Id |
+| Actor mailbox | Workflow Event History (Signals or Updates) |
+| Processing a message | Signal handler, Update handler |
+| Sending a message to another actor | Signaling an external Workflow |
+| Creating a new actor | Starting a new Workflow Execution |
+| Actor supervision | Retry Policies, parent-child relationships |
+| Actor state persistence | Durable Execution (automatic via Event History) |
+
+Traditional actor frameworks such as Akka and Orleans require you to configure persistence, message delivery guarantees, and supervision hierarchies. Temporal provides these capabilities as platform primitives. Every Workflow Execution is durably persisted. Every Signal is reliably delivered. Every Activity execution is automatically retried according to its Retry Policy. You do not need to implement a persistence plugin, configure a journal, or write custom supervision strategies.
+
+### Why actors matter for player sessions
+
+A player session maps almost perfectly onto the actor model. Each player has private state that nothing else should write to directly — inventory, health, position, active buffs. They receive messages from clients, servers, and other players. They take real-world actions in response. And their lifecycle is unpredictable: a session might last twenty minutes or three months, with idle stretches in between.
+
+Modeling each player as an Actor Workflow gives you a single authoritative representation of that session. There is no cache to invalidate, no database row to lock, no sticky session to maintain. The Workflow *is* the session, and it's always reachable by player ID.
+
+### Event History and Continue-As-New
+
+Every Workflow Execution in Temporal produces an append-only Event History. This history is limited to 51,200 Events (roughly 50,000) or 50 MB in size. For Actor Workflows that run indefinitely, Event History growth must be managed proactively.
+
+Continue-As-New atomically completes the current Workflow Execution and starts a new one with the same Workflow Id, carrying forward any state you provide as arguments. From the perspective of external callers, the Workflow Id remains the same. Signals sent during the transition are not lost; the Temporal Service buffers them for the new execution.
+
+Before executing Continue-As-New, you must ensure that all in-progress Signal and Update handlers have finished processing. The `workflow.all_handlers_finished` predicate provides this guarantee.
+
+### Signal volume limits
+
+Two hard limits govern Signal volume per Workflow Execution:
+
+- **10,000 total Signals** per Execution. Continue-As-New resets this counter, so an entity that transitions regularly is not constrained by it in practice.
+- **2,000 pending Signals** (unprocessed Signals buffered by the server) at any one time. If this limit is reached, new Signals are rejected.
+
+Beyond these limits, the practical throughput ceiling is Worker-side: each Signal triggers a Workflow Task, and a Workflow processes one Workflow Task at a time. For typical short tasks, this yields a few Workflow Tasks per second per Workflow Execution.
+
+If your use case requires higher Signal ingestion rates — for example, streaming real-time game telemetry — consider one of these approaches:
+
+- **Batch events into a single Signal payload.** Instead of one Signal per game event, batch several events into a list and send them as a single Signal. The handler appends the entire batch to the queue in one shot.
+- **Use an aggregation layer.** Route high-frequency event streams through a service (such as a message broker or aggregator) that batches events before forwarding them as Signals. This decouples the producer's throughput from the Workflow's processing rate.
+
+### Activity Heartbeating
+
+Long-running Activities, such as maintaining a WebSocket connection to a game room or processing a large batch of leaderboard updates, should emit heartbeats. A heartbeat serves two purposes:
+
+1. It tells the Temporal Service that the Activity is still making progress. If heartbeats stop arriving within the configured Heartbeat Timeout, the Temporal Service considers the Activity failed and schedules a retry.
+2. It carries a custom payload that captures the Activity's progress. When the Activity is retried after a failure, the new attempt can read the last heartbeat payload and resume from where it left off.
+
+Configure a short Heartbeat Timeout (for example, 30 seconds) and emit heartbeats frequently (for example, every 5 to 10 seconds). The SDK throttles heartbeat calls to avoid overwhelming the Temporal Service, so you can call `activity.heartbeat()` as often as needed without performance concern.
+
+### Workflow determinism
+
+In Temporal, your Activities can include non-deterministic code, but the Workflow itself must remain deterministic. This is because Temporal restores Workflow state through *replay*. When a Worker restarts, it re-executes the Workflow code from the beginning, matching Commands against Events stored in the history. If the code produces different Commands than what the history contains, the Worker raises a `NondeterminismError`.
+
+In practice, this means:
+
+- Use `workflow.now()` instead of `datetime.now()`.
+- Use `workflow.uuid4()` instead of `uuid.uuid4()`.
+- Use `workflow.random()` instead of the `random` module.
+- Do not perform I/O, network calls, or file system access inside Workflow code. Delegate all side effects to Activities.
+- Use `workflow.logger` instead of `print()` for replay-safe logging.
+
+The Python SDK's Workflow sandbox provides automatic protection against many of these violations, but understanding the underlying mechanism helps you write correct code.
+
+### Activity idempotency
+
+Activities may be retried by the Temporal Service due to timeouts, Worker crashes, or transient failures. Every Activity that interacts with an external system must be designed so that executing it twice produces the same result as executing it once. Common strategies include:
+
+- Passing a unique identifier (the Workflow Id, an Activity-specific identifier, or a business identifier) as an idempotency key to external APIs.
+- Checking the current state of the external system before making changes.
+- Using conditional writes or upserts instead of blind inserts.
+
+## Target audience
+
+This guide references the following roles:
+
+- **Game backend developers** who design and implement player session management, matchmaking, and real-time game logic. They will write the Workflow, Activity, and data model code.
+- **Platform engineers** who deploy and operate Temporal Workers, and manage Temporal infrastructure.
+- **Technical architects** who evaluate distributed system patterns for multiplayer game backends. They will use this guide to understand how the actor model maps to Temporal and when to apply it.
+
+
+## Prerequisites
+
+### Required software and infrastructure
+
+- **Python 3.11 or later**
+- **Temporal Python SDK (`temporalio`)** version 1.9.0 or later
+- **Pydantic** version 2.0 or later (optional; only needed if you replace the `dataclasses` models in this guide with Pydantic models for data validation)
+- **Temporal CLI** for running a local development server (`temporal server start-dev`)
+
+### Required concepts
+
+- Familiarity with Python `async`/`await` and the `asyncio` event loop
+- Familiarity with Temporal Workflows, Activities, and Workers
+- Familiarity with Temporal Signals, Queries, and Updates
+- Basic understanding of the actor model (message-passing concurrency)
+- Familiarity with `dataclasses` or Pydantic models for structured data
+
+---
+
+## Architecture diagram
+
+The following sequence diagram illustrates the lifecycle of a player Actor Workflow, including interactions with game systems and Events.
+
+### Narrative
+
+1. The game client starts a player Actor Workflow using the player's unique identifier as the Workflow Id.
+2. The Workflow initializes player state and enters a main event loop, waiting for messages.
+3. When the client sends a `join_room` Signal, the Workflow executes an Activity that communicates with the game room service. This is the key distinction from an Entity Workflow: the actor *does something* by executing an Activity with a real-world side effect.
+4. Combat moves trigger Activities that process game logic and update leaderboards.
+5. Queries provide read-only access to the player's current state for dashboards and client polling.
+6. When the Event History approaches the size limit, the Workflow waits for all handlers to finish and then executes Continue-As-New, carrying forward the player's state to a fresh execution.
+
+
+```mermaid
+sequenceDiagram
+ participant Client as Game Client
+ participant TS as Temporal Service
+ participant PW as Player Actor Workflow (player-session-alice)
+ participant A as Activities
+
+ Client->>TS: Start Workflow (player-session-alice)
+ TS->>PW: Execute @workflow.run
+
+ Note over PW: Actor initializes state, enters main event loop
+
+ Client->>TS: Signal: join_room("dungeon-7")
+ TS->>PW: Deliver Signal
+ PW->>A: execute_activity(join_game_room)
+ A-->>PW: Room joined, room state returned
+ Note over PW: Update internal state with room information
+
+ Client->>TS: Signal: execute_move(attack, target)
+ TS->>PW: Deliver Signal
+ PW->>A: execute_activity(process_combat_move)
+ A-->>PW: Move result (damage dealt, XP earned)
+ PW->>A: execute_activity(update_leaderboard)
+ A-->>PW: Leaderboard updated
+
+ Client->>TS: Query: get_player_state
+ TS->>PW: Deliver Query
+ PW-->>TS: Return current state
+ TS-->>Client: Player state
+
+ Note over PW: Event History approaching limit
+ PW->>PW: await all_handlers_finished
+ PW->>TS: Continue-As-New(carry forward state)
+ TS->>PW: New execution with same Workflow Id
+```
+
+
+## Implementation plan
+
+This section walks you through building a complete player session Actor Workflow system. The implementation is organized into the following phases:
+
+1. Define the data models
+2. Define the Activities
+3. Define the Player Actor Workflow
+4. Configure and start the Worker
+5. Start and interact with player sessions from a client
+
+### Phase 1: Define the data models
+
+Create a file named `models.py` to hold all data structures used by the Workflows and Activities. Using `dataclasses` keeps payloads lightweight and avoids additional dependencies, though Pydantic models work equally well if you need validation.
+
+```python
+# models.py
+from __future__ import annotations
+
+import enum
+from dataclasses import dataclass, field
+
+
+class PlayerStatus(str, enum.Enum):
+ """Represents the current lifecycle state of a player session."""
+
+ ONLINE = "online"
+ IN_ROOM = "in_room"
+ IN_COMBAT = "in_combat"
+ IDLE = "idle"
+ OFFLINE = "offline"
+
+
+class MoveType(str, enum.Enum):
+ """Types of combat moves a player can execute."""
+
+ ATTACK = "attack"
+ DEFEND = "defend"
+ HEAL = "heal"
+ SPECIAL = "special"
+
+
+@dataclass
+class InventoryItem:
+ """A single item in a player's inventory."""
+
+ item_id: str
+ name: str
+ quantity: int = 1
+
+
+@dataclass
+class CombatMoveRequest:
+ """A request from a player to execute a combat move."""
+
+ move_type: MoveType
+ target_player_id: str | None = None
+ event_id: str = "" # Idempotency key; callers should always provide one
+
+
+@dataclass
+class CombatMoveResult:
+ """The outcome of a combat move."""
+
+ damage_dealt: int = 0
+ damage_received: int = 0
+ xp_earned: int = 0
+ move_description: str = ""
+
+
+@dataclass
+class LeaderboardEntry:
+ """A player's leaderboard record."""
+
+ player_id: str
+ score: int
+ rank: int = 0
+
+
+@dataclass
+class RoomInfo:
+ """Information about a game room the player has joined."""
+
+ room_id: str
+ room_name: str
+ player_count: int = 0
+ max_players: int = 20
+
+
+@dataclass
+class PlayerState:
+ """The complete state of a player session.
+
+ This dataclass is the state that the Actor Workflow carries through
+ Continue-As-New cycles. It contains everything needed to reconstruct
+ the player's session context.
+ """
+
+ player_id: str
+ display_name: str
+ status: PlayerStatus = PlayerStatus.ONLINE
+ current_room: RoomInfo | None = None
+ inventory: list[InventoryItem] = field(default_factory=list)
+ health: int = 100
+ max_health: int = 100
+ xp: int = 0
+ level: int = 1
+ score: int = 0
+ session_actions_count: int = 0
+ total_actions_count: int = 0
+ pending_notifications: list[str] = field(default_factory=list)
+```
+
+The `PlayerState` dataclass is the single source of truth for a player's session. It is passed as the argument to the Workflow's `@workflow.run` method and carried forward through each Continue-As-New cycle. The `session_actions_count` field tracks the number of actions in the *current* execution (reset on Continue-As-New), while `total_actions_count` tracks the lifetime total.
+
+The separation of `session_actions_count` from `total_actions_count` gives you two useful signals: the session count helps you decide when to trigger Continue-As-New (since each action generates Events in the history), and the total count provides a lifetime metric for the player.
+
+### Phase 2: Define the Activities
+
+Create a file named `activities.py`. Activities contain all non-deterministic code: network calls, database writes, game server interactions, and notifications. Each Activity is designed to be idempotent so that retries do not produce duplicate effects.
+
+```python
+# activities.py
+from __future__ import annotations
+
+import uuid
+from dataclasses import replace
+
+from temporalio import activity
+
+from models import (
+ CombatMoveRequest,
+ CombatMoveResult,
+ InventoryItem,
+ LeaderboardEntry,
+ MoveType,
+ RoomInfo,
+)
+
+
+@activity.defn
+def join_game_room(player_id: str, room_id: str) -> RoomInfo:
+ """Join a game room and return room information.
+
+ In a production system, this Activity would call the game room
+ microservice API to register the player in the room. The player_id
+ serves as an idempotency key: joining the same room twice is a
+ no-op that returns the current room state.
+ """
+ activity.logger.info(
+ f"Player {player_id} joining room {room_id}"
+ )
+ # Production: call game room service API
+ # room_service.join(player_id=player_id, room_id=room_id)
+ return RoomInfo(
+ room_id=room_id,
+ room_name=f"Room {room_id}",
+ player_count=5,
+ max_players=20,
+ )
+
+
+@activity.defn
+def leave_game_room(player_id: str, room_id: str) -> None:
+ """Remove a player from a game room.
+
+ Idempotent: leaving a room you are not in is a no-op.
+ """
+ activity.logger.info(
+ f"Player {player_id} leaving room {room_id}"
+ )
+ # Production: call game room service API
+ # room_service.leave(player_id=player_id, room_id=room_id)
+
+
+@activity.defn
+def process_combat_move(
+ player_id: str, move: CombatMoveRequest
+) -> CombatMoveResult:
+ """Process a combat move and return the result.
+
+ This Activity calls the game logic service to resolve the combat
+ action. The combination of player_id and a server-assigned move
+ identifier ensures idempotency.
+ """
+ activity.logger.info(
+ f"Player {player_id} executing {move.move_type.value}"
+ f" targeting {move.target_player_id}"
+ )
+ # Production: call game logic service
+ # result = combat_service.resolve_move(player_id, move)
+ damage = 25 if move.move_type == MoveType.ATTACK else 0
+ healing = 15 if move.move_type == MoveType.HEAL else 0
+ xp = 10
+
+ return CombatMoveResult(
+ damage_dealt=damage,
+ damage_received=0,
+ xp_earned=xp,
+ move_description=(
+ f"{move.move_type.value} executed"
+ f"{' against ' + move.target_player_id if move.target_player_id else ''}"
+ ),
+ )
+
+
+@activity.defn
+def update_leaderboard(entry: LeaderboardEntry) -> LeaderboardEntry:
+ """Update the leaderboard with the player's current score.
+
+ Uses the player_id as an upsert key so that repeated calls with the
+ same score do not create duplicate entries.
+ """
+ activity.logger.info(
+ f"Updating leaderboard for {entry.player_id} "
+ f"with score {entry.score}"
+ )
+ # Production: upsert into leaderboard database
+ # db.leaderboard.upsert(player_id=entry.player_id, score=entry.score)
+ return replace(entry, rank=42)
+
+
+@activity.defn
+def send_player_notification(
+ target_player_id: str, message: str
+) -> None:
+ """Send a push notification to a player.
+
+ Notifications are delivered through an external push service.
+ The message includes a client-generated identifier for deduplication.
+ """
+ activity.logger.info(
+ f"Sending notification to {target_player_id}: {message}"
+ )
+ # Production: call push notification service
+ # push_service.send(player_id=target_player_id, message=message)
+
+
+@activity.defn
+def record_session_metrics(
+ player_id: str,
+ total_actions: int,
+ session_duration_minutes: int,
+) -> None:
+ """Record session metrics to the analytics system.
+
+ Called during Continue-As-New to capture session telemetry.
+ Uses an append-only metrics store, so duplicate writes are harmless.
+ """
+ activity.logger.info(
+ f"Recording session metrics for {player_id}: "
+ f"{total_actions} actions, {session_duration_minutes} min"
+ )
+ # Production: write to metrics/analytics pipeline
+ # analytics.record_session(player_id, total_actions, duration)
+```
+
+Each Activity follows a consistent pattern:
+
+1. **Logging with `activity.logger`** provides automatic correlation with the Workflow Id and Activity attempt number.
+2. **Idempotency annotations** in the docstrings explain how the Activity handles retries. In production code, each Activity passes an idempotency key to the external service it calls.
+3. **Stub implementations** return realistic data structures. In a production system, each stub would be replaced by a call to the corresponding game backend service.
+
+Activities are defined as synchronous functions. The Python SDK runs synchronous Activities in a `ThreadPoolExecutor`, which is safer and easier to debug than async Activities. Async Activities are only necessary when the Activity must use `async`-native libraries throughout its implementation.
+
+### Phase 3: Define the Player Actor Workflow
+
+Create a file named `player_workflow.py`. This is the core of the system: a long-running Actor Workflow that represents a single player's game session.
+
+```python
+# player_workflow.py
+from __future__ import annotations
+
+import asyncio
+from datetime import timedelta
+
+from temporalio import workflow
+from temporalio.common import RetryPolicy, SearchAttributes, SearchAttributeKey
+
+with workflow.unsafe.imports_passed_through():
+ from activities import (
+ join_game_room,
+ leave_game_room,
+ process_combat_move,
+ record_session_metrics,
+ send_player_notification,
+ update_leaderboard,
+ )
+ from models import (
+ CombatMoveRequest,
+ CombatMoveResult,
+ InventoryItem,
+ LeaderboardEntry,
+ PlayerState,
+ PlayerStatus,
+ )
+
+
+# Search Attribute keys for operational visibility
+PLAYER_STATUS_KEY = SearchAttributeKey.for_keyword("PlayerStatus")
+CURRENT_ROOM_KEY = SearchAttributeKey.for_keyword("CurrentRoom")
+PLAYER_LEVEL_KEY = SearchAttributeKey.for_int("PlayerLevel")
+
+# Threshold for triggering Continue-As-New.
+# Each Signal/Update handler execution and each Activity execution
+# generates Events in the history. A conservative threshold of
+# 10,000 events provides ample headroom below the 50,000 limit.
+CONTINUE_AS_NEW_THRESHOLD = 10_000
+
+# Default Retry Policy for Activities in this Workflow.
+DEFAULT_RETRY_POLICY = RetryPolicy(
+ initial_interval=timedelta(seconds=1),
+ backoff_coefficient=2.0,
+ maximum_interval=timedelta(seconds=30),
+ maximum_attempts=5,
+)
+
+
+@workflow.defn
+class PlayerSessionWorkflow:
+ """Actor Workflow representing a single player's game session.
+
+ This Workflow is the player's durable, autonomous agent in the game
+ backend. It maintains the player's state and actively performs
+ operations by executing Activities. External systems interact with
+ it through Signals (fire-and-forget commands), Updates (synchronous
+ request-response), and Queries (read-only state inspection).
+
+ Unlike an Entity Workflow that passively holds state, this Actor
+ Workflow takes action: it joins game rooms, executes combat moves,
+ updates leaderboards, and sends notifications.
+ These side effects are all executed through Activities.
+
+ Lifecycle:
+ 1. The Workflow starts when a player logs in.
+ 2. It enters a main loop that waits for Signals and Updates.
+ 3. It carries forward state through Continue-As-New cycles.
+ 4. It completes when the player explicitly logs out.
+ """
+
+ def __init__(self) -> None:
+ # Pending action queues. Signal handlers append to these queues,
+ # and the main loop processes them. This serializes action
+ # processing and avoids concurrent Activity executions that
+ # could conflict with each other.
+ self._pending_room_joins: list[str] = []
+ self._pending_moves: list[CombatMoveRequest] = []
+ self._pending_notifications: list[str] = []
+ self._shutdown_requested: bool = False
+ self._state: PlayerState | None = None
+ # In-memory set for Signal deduplication within this execution.
+ # Temporal delivers Signals at least once, so the same Signal may
+ # arrive more than once. This set catches duplicates that arrive
+ # within the same Execution. It is not persisted across
+ # Continue-As-New transitions, which is acceptable: the window
+ # for a delayed duplicate to span a CAN boundary is very small.
+ self._seen_move_ids: set[str] = set()
+
+ # ──────────────────────────────────────────────
+ # Signals: fire-and-forget commands from clients
+ # ──────────────────────────────────────────────
+
+ @workflow.signal
+ async def join_room(self, room_id: str) -> None:
+ """Request the player to join a game room.
+
+ The Signal handler appends the room identifier to a queue. The
+ main loop picks it up and executes the join_game_room Activity.
+ This pattern ensures that room joins are processed serially,
+ preventing race conditions where a player could be in two rooms.
+
+ If the room is already queued or is the player's
+ current room, the Signal is silently dropped. If the caller
+ needs confirmation that the join succeeded, use an Update instead.
+ """
+ if room_id in self._pending_room_joins:
+ return
+ if (
+ self._state is not None
+ and self._state.current_room is not None
+ and self._state.current_room.room_id == room_id
+ ):
+ return
+ self._pending_room_joins.append(room_id)
+
+ @workflow.signal
+ async def execute_move(self, move: CombatMoveRequest) -> None:
+ """Request the player to execute a combat move.
+
+ Moves are queued and processed in order by the main loop.
+ Callers must supply a unique event_id for deduplication. Without
+ one, duplicate Signals cannot be detected and may result in
+ double-processing. If at-most-once execution or a result is
+ required, use an Update instead of a Signal.
+ """
+ if move.event_id:
+ if move.event_id in self._seen_move_ids:
+ return
+ self._seen_move_ids.add(move.event_id)
+ self._pending_moves.append(move)
+
+ @workflow.signal
+ async def receive_notification(self, message: str) -> None:
+ """Receive a notification from another player or the game system.
+
+ Notifications are stored in the player's state and can be
+ retrieved through the get_player_state Query.
+ """
+ if self._state is not None:
+ self._state.pending_notifications.append(message)
+
+ @workflow.signal
+ async def logout(self) -> None:
+ """Request the player to log out and end the session."""
+ self._shutdown_requested = True
+
+ # ──────────────────────────────────────────────
+ # Queries: read-only state inspection
+ # ──────────────────────────────────────────────
+
+ @workflow.query
+ def get_player_state(self) -> PlayerState | None:
+ """Return the complete player state.
+
+ Queries are read-only and must not modify Workflow state.
+ They execute synchronously on the Worker and return immediately.
+ Game clients can poll this Query to display inventory, health,
+ score, and pending notifications.
+ """
+ return self._state
+
+ @workflow.query
+ def get_status(self) -> str:
+ """Return the player's current status as a string."""
+ if self._state is None:
+ return PlayerStatus.OFFLINE.value
+ return self._state.status.value
+
+ @workflow.query
+ def get_inventory(self) -> list[InventoryItem]:
+ """Return the player's current inventory."""
+ if self._state is None:
+ return []
+ return self._state.inventory
+
+ # ──────────────────────────────────────────────
+ # Main Workflow logic
+ # ──────────────────────────────────────────────
+
+ @workflow.run
+ async def run(self, state: PlayerState) -> str:
+ """Main entry point for the Player Actor Workflow.
+
+ This method initializes state and enters a loop that processes
+ queued actions and checks for Continue-As-New conditions. The
+ loop runs indefinitely until the player logs out or the
+ session is terminated.
+
+ Args:
+ state: The player's state. On the first execution, this is
+ the initial state. On Continue-As-New, this is the state
+ carried forward from the previous execution.
+ """
+ self._state = state
+ self._state.status = PlayerStatus.ONLINE
+
+ # Update Search Attributes so operators can find this player
+ self._upsert_search_attributes()
+
+ workflow.logger.info(
+ f"Player session started for {state.player_id} "
+ f"(total actions: {state.total_actions_count})"
+ )
+
+ # Main event loop: process queued actions until shutdown
+ while not self._shutdown_requested:
+ # Wait for any pending action or a shutdown request.
+ # The timeout ensures we periodically check the Event
+ # History length even during idle periods.
+ try:
+ await workflow.wait_condition(
+ lambda: (
+ bool(self._pending_room_joins)
+ or bool(self._pending_moves)
+ or self._shutdown_requested
+ ),
+ timeout=timedelta(minutes=5),
+ )
+ except asyncio.TimeoutError:
+ # No actions for 5 minutes. Check Continue-As-New
+ # threshold and loop again.
+ pass
+
+ # Process all pending room joins
+ await self._process_room_joins()
+
+ # Process all pending combat moves
+ await self._process_combat_moves()
+
+ # Check whether Event History is approaching the limit
+ if self._should_continue_as_new():
+ await self._perform_continue_as_new()
+ # continue_as_new raises an exception that exits the method
+ return "" # unreachable, but satisfies the type checker
+
+ # Shutdown path: player logged out
+ await self._handle_logout()
+
+ return f"Player {state.player_id} session ended"
+
+ # ──────────────────────────────────────────────
+ # Private helper methods
+ # ──────────────────────────────────────────────
+
+ async def _process_room_joins(self) -> None:
+ """Process all pending room join requests."""
+ while self._pending_room_joins:
+ room_id = self._pending_room_joins.pop(0)
+
+ # Leave the current room first if the player is already in one
+ if self._state.current_room is not None:
+ await workflow.execute_activity(
+ leave_game_room,
+ args=[
+ self._state.player_id,
+ self._state.current_room.room_id,
+ ],
+ start_to_close_timeout=timedelta(seconds=30),
+ retry_policy=DEFAULT_RETRY_POLICY,
+ )
+
+ # Join the new room. This Activity calls the game room
+ # service, which is a real-world side effect. This is
+ # what makes this an Actor Workflow, not an Entity Workflow.
+ room_info = await workflow.execute_activity(
+ join_game_room,
+ args=[self._state.player_id, room_id],
+ start_to_close_timeout=timedelta(seconds=30),
+ retry_policy=DEFAULT_RETRY_POLICY,
+ )
+
+ self._state.current_room = room_info
+ self._state.status = PlayerStatus.IN_ROOM
+ self._increment_action_count()
+ self._upsert_search_attributes()
+
+ workflow.logger.info(
+ f"Player {self._state.player_id} joined room "
+ f"{room_info.room_name}"
+ )
+
+ async def _process_combat_moves(self) -> None:
+ """Process all pending combat moves in order."""
+ while self._pending_moves:
+ move = self._pending_moves.pop(0)
+ self._state.status = PlayerStatus.IN_COMBAT
+
+ # Execute the combat move. This Activity calls the game
+ # logic service to resolve damage, XP, and other effects.
+ result: CombatMoveResult = await workflow.execute_activity(
+ process_combat_move,
+ args=[self._state.player_id, move],
+ start_to_close_timeout=timedelta(seconds=30),
+ retry_policy=DEFAULT_RETRY_POLICY,
+ )
+
+ # Apply the results to the player's state
+ self._state.xp += result.xp_earned
+ self._state.health = max(
+ 0,
+ min(
+ self._state.max_health,
+ self._state.health - result.damage_received,
+ ),
+ )
+ self._state.score += result.damage_dealt + result.xp_earned
+
+ # Check for level up (every 100 XP)
+ new_level = (self._state.xp // 100) + 1
+ if new_level > self._state.level:
+ self._state.level = new_level
+ workflow.logger.info(
+ f"Player {self._state.player_id} reached "
+ f"level {new_level}"
+ )
+
+ # Update the leaderboard. This is another real-world side
+ # effect: the leaderboard is an external system that other
+ # players and the game client query.
+ await workflow.execute_activity(
+ update_leaderboard,
+ LeaderboardEntry(
+ player_id=self._state.player_id,
+ score=self._state.score,
+ ),
+ start_to_close_timeout=timedelta(seconds=30),
+ retry_policy=DEFAULT_RETRY_POLICY,
+ )
+
+ # Notify the target player of the incoming attack.
+ # This demonstrates cross-actor communication: one Actor
+ # Workflow triggers an Activity that sends a message to
+ # another player's session via an external push service.
+ if move.target_player_id:
+ await workflow.execute_activity(
+ send_player_notification,
+ args=[
+ move.target_player_id,
+ f"{self._state.display_name} attacked you "
+ f"for {result.damage_dealt} damage "
+ f"({result.move_description})",
+ ],
+ start_to_close_timeout=timedelta(seconds=10),
+ retry_policy=DEFAULT_RETRY_POLICY,
+ )
+
+ self._increment_action_count()
+ self._state.status = (
+ PlayerStatus.IN_ROOM
+ if self._state.current_room
+ else PlayerStatus.ONLINE
+ )
+
+ def _should_continue_as_new(self) -> bool:
+ """Check whether the Event History is approaching the limit."""
+ current_length = workflow.info().get_current_history_length()
+ return current_length > CONTINUE_AS_NEW_THRESHOLD
+
+ async def _perform_continue_as_new(self) -> None:
+ """Execute Continue-As-New with the current player state.
+
+ Before continuing, this method:
+ 1. Waits for all in-progress Signal and Update handlers to finish.
+ 2. Records session metrics to the analytics system.
+ 3. Resets the session-level action counter.
+ """
+ workflow.logger.info(
+ f"Continue-As-New for player {self._state.player_id} "
+ f"at {workflow.info().get_current_history_length()} events"
+ )
+
+ # Wait for all handlers to complete before continuing.
+ # This prevents data loss from in-flight Signal or Update handlers.
+ await workflow.wait_condition(workflow.all_handlers_finished)
+
+ # Record session metrics before resetting the counter
+ await workflow.execute_activity(
+ record_session_metrics,
+ args=[
+ self._state.player_id,
+ self._state.total_actions_count,
+ 0, # duration computed from workflow start time in production
+ ],
+ start_to_close_timeout=timedelta(seconds=30),
+ retry_policy=DEFAULT_RETRY_POLICY,
+ )
+
+ # Reset session-level counter; keep total lifetime counter
+ self._state.session_actions_count = 0
+
+ # Continue-As-New carries the full player state to the new execution
+ workflow.continue_as_new(args=[self._state])
+
+ async def _handle_logout(self) -> None:
+ """Clean up resources when the player logs out."""
+ assert self._state is not None
+
+ # Wait for all handlers to finish processing
+ await workflow.wait_condition(workflow.all_handlers_finished)
+
+ # Leave the current room
+ if self._state.current_room is not None:
+ await workflow.execute_activity(
+ leave_game_room,
+ args=[
+ self._state.player_id,
+ self._state.current_room.room_id,
+ ],
+ start_to_close_timeout=timedelta(seconds=30),
+ retry_policy=DEFAULT_RETRY_POLICY,
+ )
+ self._state.current_room = None
+
+ self._state.status = PlayerStatus.OFFLINE
+ self._upsert_search_attributes()
+
+ workflow.logger.info(
+ f"Player {self._state.player_id} logged out "
+ f"after {self._state.total_actions_count} total actions"
+ )
+
+ def _increment_action_count(self) -> None:
+ """Increment both session and total action counters."""
+ self._state.session_actions_count += 1
+ self._state.total_actions_count += 1
+
+ def _upsert_search_attributes(self) -> None:
+ """Update Search Attributes for operational visibility.
+
+ Search Attributes allow operators to query running Workflows.
+ For example: find all players in a specific room, find all
+ players above a certain level, or find all players with a
+ particular status.
+ """
+ pairs = [
+ (PLAYER_STATUS_KEY, self._state.status.value),
+ (PLAYER_LEVEL_KEY, self._state.level),
+ ]
+ if self._state.current_room:
+ pairs.append(
+ (CURRENT_ROOM_KEY, self._state.current_room.room_id)
+ )
+ workflow.upsert_search_attributes(pairs)
+```
+
+The `PlayerSessionWorkflow` is the central piece of this system. Several design decisions are worth examining in detail:
+
+**Message queue pattern for Signals.** Signal handlers do not execute Activities directly. Instead, they append to internal queues (`_pending_room_joins`, `_pending_moves`), and the main loop drains these queues. This serializes side effects and prevents concurrent Activity executions that could conflict with each other. For example, if two `join_room` Signals arrive simultaneously, processing them through a queue guarantees that the player leaves the first room before joining the second.
+
+**Signal deduplication.** Temporal delivers Signals at least once: the same Signal may arrive more than once under certain failure conditions. Without deduplication, a duplicate `execute_move` Signal could apply a combat move twice, corrupting XP and health state. The `join_room` handler drops a Signal if the target room is already queued or is the current room. The `execute_move` handler uses the `event_id` field on `CombatMoveRequest` as an idempotency key, tracking seen identifiers in `_seen_move_ids`. Callers must generate and provide a unique `event_id` for every move Signal. If a Signal has no `event_id`, duplicates cannot be detected and the move is appended unconditionally — this is the fallback behavior and should be avoided in production. For operations where the caller needs confirmation or strict at-most-once semantics, use a Workflow Update instead of a Signal: Updates support validators that run before the operation is accepted, and the caller blocks until the handler returns a result.
+
+**Continue-As-New with handler draining.** Before executing `workflow.continue_as_new()`, the Workflow calls `await workflow.wait_condition(workflow.all_handlers_finished)`. This ensures that any Signal or Update handler that is currently executing finishes before the current execution ends. Without this step, in-flight handlers could be interrupted, causing data loss.
+
+**Search Attribute updates.** The Workflow upserts Search Attributes whenever the player's status, room, or level changes. This enables operational queries such as:
+
+```
+PlayerStatus = "in_room" AND CurrentRoom = "dungeon-7"
+```
+
+```
+PlayerLevel >= 10 AND PlayerStatus = "online"
+```
+
+These queries are invaluable for live operations: finding all players in a room that needs to be shut down for maintenance, identifying high-level players for a special event, or monitoring the distribution of player statuses across the system.
+
+### Phase 4: Configure and start the Worker
+
+Create a file named `worker.py`. The Worker registers the `PlayerSessionWorkflow` and all Activities on a single Task Queue.
+
+```python
+# worker.py
+from __future__ import annotations
+
+import asyncio
+import concurrent.futures
+import logging
+
+from temporalio.client import Client
+from temporalio.worker import Worker
+
+from activities import (
+ join_game_room,
+ leave_game_room,
+ process_combat_move,
+ record_session_metrics,
+ send_player_notification,
+ update_leaderboard,
+)
+from player_workflow import PlayerSessionWorkflow
+
+# Configure logging for the Worker process
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+)
+
+
+async def main() -> None:
+ """Start a Worker that handles player session Workflows."""
+ client = await Client.connect("localhost:7233", namespace="default")
+
+ # Use a ThreadPoolExecutor for synchronous Activities.
+ # Size the pool based on expected concurrent Activity executions.
+ # Each concurrent Activity occupies one thread.
+ with concurrent.futures.ThreadPoolExecutor(
+ max_workers=100
+ ) as activity_executor:
+ worker = Worker(
+ client,
+ task_queue="player-session-queue",
+ workflows=[PlayerSessionWorkflow],
+ activities=[
+ join_game_room,
+ leave_game_room,
+ process_combat_move,
+ update_leaderboard,
+ send_player_notification,
+ record_session_metrics,
+ ],
+ activity_executor=activity_executor,
+ # Limit concurrent Workflow Tasks. Each Workflow Task is
+ # lightweight (in-memory replay), but limiting concurrency
+ # prevents memory pressure under high load.
+ max_concurrent_workflow_tasks=200,
+ # Limit concurrent Activity executions. Activities perform
+ # I/O and consume threads from the executor pool.
+ max_concurrent_activities=100,
+ )
+
+ logging.info(
+ "Worker started on task queue 'player-session-queue'"
+ )
+ await worker.run()
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+The Worker configuration makes several deliberate choices:
+
+- **ThreadPoolExecutor sizing:** The `max_workers=100` setting supports up to 100 concurrent Activity executions. Size this based on the expected ratio of concurrent players to Activity execution time. If Activities are fast (under 100 milliseconds), fewer threads are needed. If Activities involve slower external service calls, increase the pool size.
+- **Concurrency limits:** `max_concurrent_workflow_tasks=200` and `max_concurrent_activities=100` prevent the Worker from being overwhelmed. These values should be tuned based on the Worker's available memory and CPU.
+
+### Phase 5: Start and interact with player sessions from a client
+
+Create a file named `starter.py`. This module demonstrates how a game backend API would start player sessions and send commands to them.
+
+```python
+# starter.py
+from __future__ import annotations
+
+import asyncio
+
+from temporalio.client import Client
+from temporalio.common import SearchAttributes, SearchAttributeKey
+
+from models import (
+ CombatMoveRequest,
+ InventoryItem,
+ MoveType,
+ PlayerState,
+)
+from player_workflow import PlayerSessionWorkflow
+
+# Search Attribute keys (must match the Workflow definitions)
+PLAYER_STATUS_KEY = SearchAttributeKey.for_keyword("PlayerStatus")
+PLAYER_LEVEL_KEY = SearchAttributeKey.for_int("PlayerLevel")
+
+
+async def start_player_session(
+ client: Client, player_id: str, display_name: str
+) -> None:
+ """Start a new player session Actor Workflow.
+
+ The Workflow Id is derived from the player_id, ensuring that each
+ player has exactly one active session. If a session with this
+ Workflow Id already exists, this call will raise an error. Use
+ the Workflow Id Reuse Policy to control this behavior.
+ """
+ initial_state = PlayerState(
+ player_id=player_id,
+ display_name=display_name,
+ inventory=[
+ InventoryItem(
+ item_id="sword-001",
+ name="Iron Sword",
+ quantity=1,
+ ),
+ InventoryItem(
+ item_id="potion-001",
+ name="Health Potion",
+ quantity=5,
+ ),
+ ],
+ )
+
+ handle = await client.start_workflow(
+ PlayerSessionWorkflow.run,
+ initial_state,
+ id=f"player-session-{player_id}",
+ task_queue="player-session-queue",
+ search_attributes=SearchAttributes.from_pairs([
+ (PLAYER_STATUS_KEY, "online"),
+ (PLAYER_LEVEL_KEY, 1),
+ ]),
+ )
+ print(f"Started session for {display_name} ({player_id})")
+ print(f" Workflow Id: {handle.id}")
+
+
+async def join_room(
+ client: Client, player_id: str, room_id: str
+) -> None:
+ """Send a join_room Signal to a player's Actor Workflow."""
+ handle = client.get_workflow_handle(
+ f"player-session-{player_id}"
+ )
+ await handle.signal(PlayerSessionWorkflow.join_room, room_id)
+ print(f"Player {player_id} sent join_room Signal for {room_id}")
+
+
+async def attack(
+ client: Client,
+ player_id: str,
+ target_player_id: str,
+) -> None:
+ """Send a combat move Signal to a player's Actor Workflow."""
+ handle = client.get_workflow_handle(
+ f"player-session-{player_id}"
+ )
+ move = CombatMoveRequest(
+ move_type=MoveType.ATTACK,
+ target_player_id=target_player_id,
+ )
+ await handle.signal(PlayerSessionWorkflow.execute_move, move)
+ print(
+ f"Player {player_id} sent attack Signal "
+ f"targeting {target_player_id}"
+ )
+
+
+async def check_player_state(
+ client: Client, player_id: str
+) -> None:
+ """Query a player's current state."""
+ handle = client.get_workflow_handle(
+ f"player-session-{player_id}"
+ )
+ state = await handle.query(
+ PlayerSessionWorkflow.get_player_state
+ )
+ if state is not None:
+ print(f"Player: {state.display_name}")
+ print(f" Status: {state.status.value}")
+ print(f" Health: {state.health}/{state.max_health}")
+ print(f" Level: {state.level}")
+ print(f" Score: {state.score}")
+ print(f" XP: {state.xp}")
+ print(
+ f" Room: "
+ f"{state.current_room.room_name if state.current_room else 'None'}"
+ )
+ print(f" Inventory: {len(state.inventory)} item(s)")
+ print(f" Total actions: {state.total_actions_count}")
+
+
+async def logout_player(
+ client: Client, player_id: str
+) -> None:
+ """Send a logout Signal to gracefully end a player session."""
+ handle = client.get_workflow_handle(
+ f"player-session-{player_id}"
+ )
+ await handle.signal(PlayerSessionWorkflow.logout)
+ print(f"Player {player_id} sent logout Signal")
+
+
+async def main() -> None:
+ """Demonstrate a complete player session lifecycle."""
+ client = await Client.connect("localhost:7233", namespace="default")
+
+ # Start two player sessions
+ await start_player_session(client, "alice", "Alice the Brave")
+ await start_player_session(client, "bob", "Bob the Bold")
+
+ # Give the Workers a moment to pick up the Workflow Tasks
+ await asyncio.sleep(2)
+
+ # Alice joins a game room
+ await join_room(client, "alice", "dungeon-7")
+ await asyncio.sleep(1)
+
+ # Bob joins the same room
+ await join_room(client, "bob", "dungeon-7")
+ await asyncio.sleep(1)
+
+ # Alice attacks Bob
+ await attack(client, "alice", "bob")
+ await asyncio.sleep(1)
+
+ # Check Alice's state
+ await check_player_state(client, "alice")
+
+ # Logout both players
+ await logout_player(client, "alice")
+ await logout_player(client, "bob")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+The client code demonstrates the two interaction patterns available to external callers:
+
+- **Signals** for fire-and-forget commands: `join_room`, `execute_move`, `logout`. The caller does not wait for the Workflow to process the Signal.
+- **Queries** for read-only state inspection: `get_player_state`. The caller reads the Workflow's current state without affecting its execution.
+
+### (Optional) Phase 6: Configure Search Attributes
+
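+Before starting the Worker, register the custom Search Attributes with the Temporal Service. The Workflow upserts these attributes as soon as a session starts, so this step is required before you run the sample; the Temporal Service rejects upserts of Search Attributes that are not registered. When using the Temporal CLI development server, run the following commands: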
+
+```bash
+temporal operator search-attribute create --name PlayerStatus --type Keyword
+temporal operator search-attribute create --name CurrentRoom --type Keyword
+temporal operator search-attribute create --name PlayerLevel --type Int
+```
+
+Once registered, you can query running player sessions. For example, to find all players currently in room `dungeon-7`:
+
+```bash
+temporal workflow list --query 'PlayerStatus = "in_room" AND CurrentRoom = "dungeon-7"'
+```
+
+To find all players at level 10 or higher:
+
+```bash
+temporal workflow list --query 'PlayerLevel >= 10'
+```
+
+### (Optional) Phase 7: Add heartbeating to long-running Activities
+
+If your game includes Activities that run for extended periods, such as maintaining a persistent connection to a game room server or processing a large batch of analytics events, add heartbeating to track progress and enable resumption after failures.
+
+```python
+# activities.py (additional long-running Activity example)
+
+@activity.defn
+def process_batch_leaderboard_update(
+ entries: list[LeaderboardEntry],
+) -> int:
+ """Process a large batch of leaderboard updates with heartbeating.
+
+ This Activity processes entries one at a time and heartbeats after
+ each entry with the index of the last processed entry. If the Worker
+ fails mid-batch, the next attempt reads the heartbeat payload and
+ resumes from where processing stopped.
+
+ Configure this Activity with a heartbeat_timeout of 30 seconds.
+ The SDK throttles heartbeat calls, so calling activity.heartbeat()
+ on every iteration is safe and does not overload the Temporal Service.
+ """
+ heartbeat_details = activity.info().heartbeat_details
+ start_index = heartbeat_details[0] if heartbeat_details else 0
+
+ processed_count = start_index
+ for i in range(start_index, len(entries)):
+ entry = entries[i]
+ # Production: upsert into leaderboard database
+ activity.logger.info(
+ f"Processing leaderboard entry {i + 1}/{len(entries)} "
+ f"for player {entry.player_id}"
+ )
+ processed_count += 1
+
+ # Heartbeat with progress. The heartbeat payload (the current
+ # index) is available to the next attempt if this attempt fails.
+ activity.heartbeat(processed_count)
+
+ return processed_count
+```
+
+When calling this Activity from a Workflow, configure the `heartbeat_timeout` parameter:
+
+```python
+# In the Workflow
+processed = await workflow.execute_activity(
+ process_batch_leaderboard_update,
+ entries,
+ start_to_close_timeout=timedelta(minutes=30),
+ heartbeat_timeout=timedelta(seconds=30),
+ retry_policy=DEFAULT_RETRY_POLICY,
+)
+```
+
+The `heartbeat_timeout` of 30 seconds means that if the Activity does not emit a heartbeat for 30 seconds, the Temporal Service considers it stalled and schedules a retry. The new attempt reads the last heartbeat payload (the number of entries already processed, which is also the index of the next entry) and resumes from that point.
+
+---
+
+## Outcomes
+
+You've built a session system where player state is durable, game actions are reliable, and server failures are non-events. The player Workflow doesn't just hold state — it drives action. That distinction matters: when a player joins a room or executes a combat move, the Workflow owns the operation end-to-end, retrying if needed and preserving consistency if something fails partway through.
+
+The approach isn't limited to games. Any long-lived entity that has to take action on its own behalf — an IoT device scaling infrastructure in response to sensor data, an e-commerce order submitting payment and triggering fulfillment, an AI agent calling tools and external APIs — fits the same model. The player session implementation here gives you the complete pattern.
+
+---
+
+## Related resources
+
+- [Temporal Python SDK documentation](https://docs.temporal.io/develop/python)
+- [Message Passing — Signals, Queries, Updates](https://docs.temporal.io/develop/python/message-passing)
+- [Continue-As-New](https://docs.temporal.io/develop/python/continue-as-new)
+- [Child Workflows](https://docs.temporal.io/develop/python/child-workflows)
+- [Failure Detection — Timeouts, Activity Heartbeating, and Retry Policies](https://docs.temporal.io/develop/python/failure-detection)
+- [Search Attributes](https://docs.temporal.io/visibility/search-attributes)
+- [Temporal Python SDK API Reference](https://python.temporal.io)
+- [Temporal Python SDK samples](https://github.com/temporalio/samples-python)
+- Companion article: [Entity Workflow Pattern](/guides/entity-pattern-loyalty-points)
\ No newline at end of file
diff --git a/docs/guides/entity-pattern-loyalty-points.mdx b/docs/guides/entity-pattern-loyalty-points.mdx
new file mode 100644
index 0000000000..8e874eb540
--- /dev/null
+++ b/docs/guides/entity-pattern-loyalty-points.mdx
@@ -0,0 +1,1128 @@
+---
+id: entity-pattern-loyalty-points
+title: Track customer loyalty points with durable workflows
+description: How to run a customer loyalty program with durable workflows provided by Temporal
+sidebar_label: Customer loyalty program
+toc_max_heading_level: 2
+author: Cecil Phillip
+tags:
+ - Workflows
+ - Workers
+ - Activities
+ - Continue-As-New
+ - Task Queues
+ - Python SDK
+ - Retry Policies
+---
+
+Loyalty programs look simple until you're running them at scale: points have to accrue correctly across every purchase, tiers have to recalculate at the right moment, redemptions have to be consistent, and all of it has to hold up across years of customer interactions. This guide shows how to give each customer their own dedicated, durable account — one that holds state, enforces business rules, responds to purchases and redemptions in real time, and never needs a cron job to stay current.
+
+## Problem statement
+
+Every customer in a loyalty program is their own entity. Their points accumulate across purchases, their tier changes when they hit thresholds, and their redemptions need to be validated and applied without double-spending. Doing this with a shared database means locking, retry logic across services, and cron jobs that periodically recalculate who qualifies for what. When a points accrual succeeds in one service but the tier upgrade notification fails in another, you have an inconsistency that's difficult to detect and expensive to fix. The longer the customer relationship, the more chances there are for these gaps to accumulate.
+
+## Solution
+
+Each customer gets their own Workflow — a persistent, running process identified by their customer ID. It holds the account's points balance, tier status, and activity history in memory. When a purchase happens, a Signal adds the points. When a customer redeems at checkout, an Update validates and applies the redemption synchronously. When a mobile app needs to display the current balance, a Query reads the state without touching the database. Tier upgrades happen as part of normal business logic, not from a nightly job. The workflow runs for as long as the customer relationship does — years if needed — and resets its internal history automatically to stay within platform limits.
+
+## Outcomes
+
+After working through this guide, you'll have a loyalty system where:
+
+- **Each customer account is independent and durable**: state is held per-customer, survives infrastructure failures, and picks up exactly where it left off
+- **Tier upgrades happen in real time**: business rules run in the Workflow itself, triggered by point accruals — no scheduler, no batch job
+- **Redemptions are consistent**: synchronous Updates validate and apply changes atomically, with the response going back to the caller
+- **Balance reads are cheap**: Queries read in-memory state without generating Events or hitting a database
+- **The workflow runs for the life of the customer**: Continue-As-New keeps history size in check so nothing needs to be migrated or restarted
+- **Failures are isolated**: a notification failing doesn't roll back a point accrual; each concern fails independently
+
+## Background and best practices
+
+### What is an Entity Workflow?
+
+Although "Workflow" is a Temporal SDK primitive, when we talk about an "entity workflow," we're describing an architectural pattern. An entity workflow represents something that persists over time — a customer, a device, an order, a bank account. Contrast this with a "process workflow," which has a definite end. The key distinction is that an entity is a thing, whereas a process workflow does a thing. An entity workflow has an indefinite lifetime and reacts to events as they arrive. In the loyalty context, the customer account is the entity: it exists as long as the customer relationship does, and it responds to purchases, redemptions, and status queries throughout that lifetime.
+
+Entity Workflows share three defining characteristics:
+
+- They run for an **indefinite duration**. You do not know at start time when or if they will complete.
+- They **react to external messages** at any point in their lifecycle via Signals, Updates, and Queries.
+- They **maintain mutable state** that evolves over time in response to those messages.
+
+In the loyalty points use case, each customer has their own Entity Workflow. The Workflow holds the points balance and tier status. External services send Signals to add points after a purchase, send Updates to redeem points at checkout, and send Queries to display the current balance in a mobile app. The Workflow runs for as long as the customer account exists.
+
+### How Entity Workflows differ from actors
+
+The Entity Workflow pattern shares characteristics with the Actor Model: both receive messages, maintain state, and can create new instances. However, there is an important distinction. An entity **represents state** and responds to interactions. It is closer to a data cache than a process orchestrator. It holds and serves state, evaluates business rules against that state, and delegates side effects to Activities. An actor, by contrast, **takes actions**. It may write to databases, call external APIs, or send emails as part of its core behavior.
+
+In this pattern, the loyalty Entity Workflow does not directly write to a database or send an email. When it needs a side effect, such as sending a tier upgrade notification, it delegates that work to an Activity. The Workflow itself remains a pure state container with business rules.
+
+### Why Temporal is well suited for this pattern
+
+Temporal provides several capabilities that make Entity Workflows practical:
+
+- **Durable timers**: A Workflow can sleep for hours, days, or months without consuming compute resources. The timer is persisted by the Temporal Server; the Worker is freed to handle other tasks.
+- **Message passing**: Signals, Queries, and Updates provide three distinct interaction patterns — fire-and-forget, read-only, and request-response — covering the full range of entity interactions.
+- **Event History**: Every state transition is recorded in an append-only log, giving a complete audit trail for compliance and debugging.
+- **Continue-As-New**: When the Event History approaches size limits, the Workflow resets itself with a fresh history while preserving current state, allowing Entity Workflows to run for years.
+- **Deterministic replay**: If a Worker crashes, the Workflow resumes from its last known state by replaying the Event History — no state is lost and no external coordination is needed.
+
+### Operational considerations
+
+Before implementing this pattern, keep the following constraints in mind:
+
+**Event History limits.** Temporal enforces a hard limit of 51,200 Events and 50 MB per Workflow Execution, with warnings starting around 10,000 Events. Each Signal generates approximately 4 Events; each Activity execution approximately 11. A loyalty entity processing 100 point accruals per day (each triggering one Activity) generates roughly 1,500 Events per day — plan your Continue-As-New threshold accordingly.
+
+**Payload size limits.** Individual payloads for Workflow and Activity inputs and outputs are limited to 2 MB. If the serialized entity state approaches this limit, store large data externally and reference it by identifier.
+
+**Workflow Determinism.** Activities can include non-deterministic code, but the Workflow itself must remain deterministic. Use `workflow.now()` instead of `datetime.now()`, `workflow.random()` instead of `random`, and `workflow.uuid4()` instead of `uuid.uuid4()`. All I/O must be performed in Activities.
+
+**Signal delivery guarantees.** Temporal delivers Signals at least once. Include a deduplication key in Signal payloads and make your Signal processing logic idempotent.
+
+## Target audience
+
+The following roles are involved in this pattern:
+
+- **Temporal Workflow and Activity developers**: Responsible for implementing the loyalty Entity Workflow, defining Activities, and writing client code to interact with the entity
+- **Platform operators**: Responsible for deploying and managing Temporal Workers, configuring Task Queues, and monitoring Event History growth
+- **Domain architects**: Responsible for deciding which loyalty business rules belong in the Workflow versus in Activities, and defining the message contracts for Signals, Updates, and Queries
+
+This guide requires Python, a Temporal Worker, and either a local development server or Temporal Cloud.
+
+## Prerequisites
+
+### Required software and infrastructure
+
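+- Python 3.10 or later (the examples use the `X | None` union syntax in annotations that are evaluated at definition time, which Python 3.9 does not support)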
+- Temporal Python SDK (`temporalio`) version 1.7.0 or later (required for `@workflow.init`, `workflow.all_handlers_finished`, and Workflow Updates)
+- A running Temporal Server, either via the [Temporal CLI](https://docs.temporal.io/cli) for local development or [Temporal Cloud](https://temporal.io/cloud) for production
+- `pip` or `uv` for Python dependency management
+
+### Resources and access privileges
+
+- Temporal Cloud account with Namespace Admin role, or local development server access via `temporal server start-dev`
+- Write access to your Worker deployment environment
+
+### Required concepts
+
+You should be familiar with the following before proceeding:
+
+- [Temporal Workflows](https://docs.temporal.io/workflows): The durable, stateful functions that Temporal orchestrates
+- [Temporal Activities](https://docs.temporal.io/activities): The functions that perform side effects such as API calls and database writes
+- [Temporal Workers](https://docs.temporal.io/workers): The processes that host and execute Workflow and Activity code
+- [Temporal Task Queues](https://docs.temporal.io/task-queues): The named queues that route work from the Temporal Server to Workers
+
+## Architecture diagrams
+
+The following diagrams illustrate how external services interact with a loyalty Entity Workflow through Signals, Updates, and Queries.
+
+### Signal and Continue-As-New flow
+
+This diagram shows how a client adds points to a customer's loyalty account via a Signal, and how the Workflow performs Continue-As-New when the Event History grows large.
+
+**Narrative:**
+
+1. An external service starts a loyalty Entity Workflow with a stable Workflow Id tied to the customer identifier, such as `loyalty-cust-123`.
+2. The Temporal Server persists the start request and schedules a Workflow Task on the `loyalty-points-queue` Task Queue.
+3. A Python Worker picks up the task, initializes the loyalty state, and enters a wait loop.
+4. Throughout the customer's lifetime, external services send Signals to add points, Updates to redeem points, and Queries to read the current balance and tier.
+5. When the Event History approaches the suggested threshold, the Workflow serializes its current state and calls `continue_as_new`, starting a fresh Execution with the same Workflow Id.
+6. When the customer account is closed, a shutdown Signal causes the Workflow to drain its queue, await all handlers, and complete.
+
+```mermaid
+sequenceDiagram
+ participant Client as External Service
+ participant TS as Temporal Server
+ participant W as Worker (Python)
+
+ Client->>TS: start_workflow(LoyaltyEntity, id="loyalty-cust-123")
+ TS->>W: Schedule Workflow Task
+ W->>W: Initialize loyalty state (0 points, Bronze tier)
+ W-->>TS: Blocked on wait_condition (waiting for messages)
+
+ Client->>TS: signal("loyalty-cust-123", add_points, {points: 500})
+ TS->>W: Deliver Signal
+ W->>W: Append to internal queue
+ W->>W: Process: update points balance
+ W->>TS: Schedule Activity (record_points_transaction)
+ TS->>W: Execute Activity
+ W-->>TS: Activity complete, resume Workflow
+
+ Client->>TS: query("loyalty-cust-123", get_loyalty_summary)
+ TS->>W: Deliver Query
+ W-->>Client: Return {points: 500, tier: "Bronze"} (no Event recorded)
+
+ Note over W: Event History approaching threshold
+ W->>W: Drain pending signals
+ W->>W: await all_handlers_finished()
+ W->>TS: continue_as_new(current_state)
+ Note over TS: Same Workflow Id, new Run Id, fresh Event History
+```
+
+### Update flow for point redemption
+
+This diagram shows a synchronous Update interaction where a client redeems points and receives confirmation of the new balance.
+
+```mermaid
+sequenceDiagram
+ participant Client as External Service
+ participant TS as Temporal Server
+ participant W as Worker (Python)
+
+ Client->>TS: execute_update("loyalty-cust-123", redeem_points, {points: 200})
+ TS->>W: Deliver Update
+ W->>W: Run validator (check points > 0)
+ W->>W: Check sufficient balance
+ W->>W: Deduct points from state
+ W->>TS: Schedule Activity (record_redemption)
+ TS->>W: Execute Activity
+ W-->>Client: Return {redeemed: 200, remaining: 300}
+```
+
+## Implementation plan
+
+This section walks you through building a complete customer loyalty Entity Workflow in Python. You will define the data model, implement the Workflow with Signal, Query, and Update handlers, write Activities for side effects, configure Continue-As-New, set up the Worker, and write client code to interact with the entity.
+
+### Phase 1: Define the loyalty state and data models
+
+Entity Workflows carry their current state as their primary input, which enables Continue-As-New to resume seamlessly. Use Python `dataclass` objects — they serialize cleanly with Temporal's default JSON converter and allow new fields with defaults without breaking existing Executions.
+
+Create a file named `models.py` with the following content:
+
+```python
+# models.py
+from dataclasses import dataclass, field
+from enum import Enum
+
+
+class LoyaltyTier(str, Enum):
+    """Loyalty tiers based on lifetime points earned.
+
+    Mixing in ``str`` makes each member a plain string, and the rest of the
+    example stores the ``.value`` string (not the enum member) in state.
+    """
+    # Members are listed from lowest to highest tier.
+    BRONZE = "Bronze"
+    SILVER = "Silver"
+    GOLD = "Gold"
+    PLATINUM = "Platinum"
+
+
+@dataclass
+class PointsEvent:
+    """A single points transaction to be processed by the entity.
+
+    Describes the shape of a points event. Note that the ``add_points``
+    Signal in this guide accepts a plain ``dict`` with these same keys.
+    """
+    event_id: str          # Unique identifier for idempotent processing
+    points: int            # Positive for accrual, negative for redemption
+    reason: str            # Human-readable description (e.g., "purchase", "signup_bonus")
+    source: str = ""       # Originating service or channel; optional, defaults to empty
+
+
+@dataclass
+class LoyaltyState:
+    """The complete state of a customer's loyalty account.
+
+    This is the data that survives Continue-As-New transitions.
+    Keep it small and serializable — well under the 2 MB payload limit.
+    """
+    # Only customer_id is required; every other field has a default so a new
+    # account can be created from the identifier alone.
+    customer_id: str
+    points_balance: int = 0        # Currently spendable points
+    lifetime_points: int = 0       # Total points ever accrued; drives the tier
+    tier: str = LoyaltyTier.BRONZE.value  # Stored as the tier's string value
+    is_active: bool = True
+    # Event IDs already applied, used for duplicate detection.
+    processed_event_ids: list[str] = field(default_factory=list)
+
+    # Unprocessed events carried across Continue-As-New transitions
+    # (plain dicts in the same shape the add_points Signal receives).
+    pending_events: list[dict] = field(default_factory=list)
+
+
+@dataclass
+class LoyaltySummary:
+    """Read-only view of the loyalty account returned by Queries.
+
+    Mirrors the LoyaltyState fields of the same names, leaving out the
+    bookkeeping lists (processed and pending events).
+    """
+    customer_id: str
+    points_balance: int    # Currently spendable points
+    lifetime_points: int   # Total points ever accrued
+    tier: str              # Tier as its string value, e.g. "Bronze"
+    is_active: bool
+
+
+@dataclass
+class RedemptionRequest:
+    """Input for the redeem_points Update.
+
+    All three fields are required; there are no defaults.
+    """
+    event_id: str          # Idempotency key for the redemption
+    points: int            # Number of points to redeem (must be positive)
+    reward: str            # What the points are being redeemed for
+
+
+@dataclass
+class RedemptionResult:
+    """Output from the redeem_points Update."""
+    redeemed: int            # Number of points redeemed by this request
+    remaining_balance: int   # Points balance reported back to the caller
+    reward: str              # Echo of the reward named in the request
+```
+
+`LoyaltyState` holds only what's needed to reconstruct the entity after a Continue-As-New transition. `processed_event_ids` enables idempotent Signal processing — in production, limit this to a recent window (e.g. last 1,000 IDs) and rely on an external store for long-term deduplication. `pending_events` carries any Signals that arrived but weren't processed before the transition.
+
+### Phase 2: Implement the Activities
+
+Activities hold all non-deterministic code: database writes, API calls, and notifications. The Workflow stays a pure state container.
+
+Create a file named `activities.py`:
+
+```python
+# activities.py
+import logging
+
+from temporalio import activity
+
+logger = logging.getLogger(__name__)
+
+
+@activity.defn
+async def record_points_transaction(transaction: dict) -> bool:
+ """Record a points transaction in the loyalty database.
+
+ Must be idempotent: use event_id as the key to prevent duplicate records
+ on retry. Example: INSERT ... ON CONFLICT (event_id) DO NOTHING.
+ """
+ activity.logger.info(
+ "Recording points transaction for customer %s: %s points (%s)",
+ transaction["customer_id"],
+ transaction["points"],
+ transaction["reason"],
+ )
+ # await db.execute("INSERT INTO points_transactions ... ON CONFLICT (event_id) DO NOTHING", ...)
+ return True
+
+
+@activity.defn
+async def send_tier_change_notification(notification: dict) -> bool:
+ """Send a notification when a customer's loyalty tier changes.
+
+ Non-critical: if this Activity fails after all retries, the Workflow
+ logs the failure and continues rather than terminating the account.
+ """
+ activity.logger.info(
+ "Sending tier change notification to customer %s: %s -> %s",
+ notification["customer_id"],
+ notification["old_tier"],
+ notification["new_tier"],
+ )
+ # await email_service.send(template="tier_change", to=..., context={...})
+ return True
+
+
+@activity.defn
+async def record_redemption(redemption: dict) -> bool:
+ """Record a points redemption in the loyalty database.
+
+ Must be idempotent: use event_id as the key to prevent duplicate records
+ on retry. Example: INSERT ... ON CONFLICT (event_id) DO NOTHING.
+ """
+ activity.logger.info(
+ "Recording redemption for customer %s: %s points for %s",
+ redemption["customer_id"],
+ redemption["points"],
+ redemption["reward"],
+ )
+ # await db.execute("INSERT INTO redemptions ... ON CONFLICT (event_id) DO NOTHING", ...)
+ return True
+```
+
+Each Activity uses `@activity.defn` and `activity.logger` for context-aware logging. All three are idempotent by design: calling them twice with the same `event_id` produces the same result, which is essential because Temporal may retry Activities on failure.
+
+### Phase 3: Implement the Entity Workflow
+
+The Entity Workflow is the core of the pattern: it initializes state, waits for messages, processes them sequentially, and performs Continue-As-New when history grows large.
+
+Create a file named `workflows.py`:
+
+```python
+# workflows.py
+import asyncio
+from collections import deque
+from dataclasses import asdict
+from datetime import timedelta
+
+from temporalio import workflow
+from temporalio.common import RetryPolicy
+from temporalio.exceptions import ApplicationError
+
+with workflow.unsafe.imports_passed_through():
+ from activities import (
+ record_points_transaction,
+ record_redemption,
+ send_tier_change_notification,
+ )
+ from models import (
+ LoyaltyState,
+ LoyaltySummary,
+ LoyaltyTier,
+ PointsEvent,
+ RedemptionRequest,
+ RedemptionResult,
+ )
+
+
+# Retry policy for database writes: retry transient failures with exponential
+# backoff (1s, 2s, 4s, ... capped at 30s) for up to 10 attempts.
+DB_RETRY_POLICY = RetryPolicy(
+    initial_interval=timedelta(seconds=1),
+    backoff_coefficient=2.0,
+    maximum_interval=timedelta(seconds=30),
+    maximum_attempts=10,
+)
+
+# Retry policy for notifications: fewer attempts and longer intervals than the
+# database policy, because notifications are non-critical and the Workflow
+# carries on if they ultimately fail.
+NOTIFICATION_RETRY_POLICY = RetryPolicy(
+    initial_interval=timedelta(seconds=2),
+    backoff_coefficient=2.0,
+    maximum_interval=timedelta(minutes=1),
+    maximum_attempts=5,
+)
+
+
+@workflow.defn
+class LoyaltyEntityWorkflow:
+ """Manages a single customer's loyalty account as a long-lived Entity Workflow.
+
+ Each customer has their own instance identified by a stable Workflow Id
+ (e.g. "loyalty-cust-123"). All side effects are delegated to Activities.
+ """
+
+ @workflow.init
+ def __init__(self, state: LoyaltyState | None = None) -> None:
+ """Initialize Workflow state.
+
+ The @workflow.init decorator ensures this runs before any Signal
+ or Update handler, preventing race conditions with Signal-with-Start.
+
+ Args:
+ state: Existing state from Continue-As-New, or None for a new account.
+ """
+ if state is not None:
+ self._state = state
+ else:
+ # First start only (not Continue-As-New); customer_id set in run().
+ self._state = LoyaltyState(customer_id="")
+
+ # Signal queue: handlers append, main loop pops one at a time.
+ self._pending_signals: deque[dict] = deque(self._state.pending_events)
+ self._state.pending_events = [] # Clear after restoring
+
+ self._shutdown_requested: bool = False
+
+ @workflow.run
+ async def run(self, state: LoyaltyState | None = None) -> LoyaltyState:
+ """Main entity loop: wait for messages, process them, and Continue-As-New when needed."""
+ if state is None:
+ # First start: derive customer_id from the Workflow Id convention.
+ self._state.customer_id = workflow.info().workflow_id.replace("loyalty-", "", 1)
+
+ workflow.logger.info(
+ "Loyalty entity started for customer %s (tier: %s, balance: %d)",
+ self._state.customer_id,
+ self._state.tier,
+ self._state.points_balance,
+ )
+
+ while not self._shutdown_requested:
+ # Block until a Signal arrives, shutdown is requested,
+ # CAN is suggested, or 24 hours pass.
+ await workflow.wait_condition(
+ lambda: (
+ bool(self._pending_signals)
+ or self._shutdown_requested
+ or workflow.info().is_continue_as_new_suggested()
+ ),
+ timeout=timedelta(hours=24),
+ )
+
+ # Process all queued Signals sequentially
+ while self._pending_signals:
+ event_data = self._pending_signals.popleft()
+ await self._process_points_event(event_data)
+
+ # Check if we should Continue-As-New after each event
+ if workflow.info().is_continue_as_new_suggested():
+ break
+
+ # Exit if shutdown was requested
+ if self._shutdown_requested:
+ break
+
+ # Trigger Continue-As-New when Temporal suggests it
+ if workflow.info().is_continue_as_new_suggested():
+ await self._do_continue_as_new()
+
+ # Wait for all in-flight handlers before completing.
+ await workflow.wait_condition(workflow.all_handlers_finished)
+
+ workflow.logger.info(
+ "Loyalty entity shutting down for customer %s (final balance: %d)",
+ self._state.customer_id,
+ self._state.points_balance,
+ )
+ return self._state
+
+ @workflow.signal
+ def add_points(self, event: dict) -> None:
+ """Enqueue a points accrual event for processing in the main loop.
+
+ Signal handlers must be synchronous — they only append to the queue.
+ All processing happens in the main loop to ensure sequential execution.
+
+ Args:
+ event: dict with keys event_id, points, reason, and optionally source.
+ """
+ self._pending_signals.append(event)
+
+ @workflow.signal
+ def shutdown(self) -> None:
+ """Signal the entity to drain pending events and terminate gracefully."""
+ self._shutdown_requested = True
+
+ # ------------------------------------------------------------------
+ # Query handlers
+ # ------------------------------------------------------------------
+
+ @workflow.query
+ def get_loyalty_summary(self) -> LoyaltySummary:
+ """Return a read-only summary (no Events generated, safe at high frequency)."""
+ return LoyaltySummary(
+ customer_id=self._state.customer_id,
+ points_balance=self._state.points_balance,
+ lifetime_points=self._state.lifetime_points,
+ tier=self._state.tier,
+ is_active=self._state.is_active,
+ )
+
+ @workflow.query
+ def get_points_balance(self) -> int:
+ """Return the current points balance."""
+ return self._state.points_balance
+
+ @workflow.query
+ def get_tier(self) -> str:
+ """Return the current loyalty tier."""
+ return self._state.tier
+
+ @workflow.update
+ async def redeem_points(self, request: RedemptionRequest) -> RedemptionResult:
+ """Redeem loyalty points for a reward.
+
+ Blocks until the Update completes and the caller receives the result,
+ making it ideal for checkout flows that need confirmed point deduction.
+
+ Raises:
+ ApplicationError: If the account is inactive or has insufficient points.
+ """
+ if not self._state.is_active:
+ raise ApplicationError(
+ "Cannot redeem points: account is inactive",
+ type="InactiveAccount",
+ non_retryable=True,
+ )
+
+ if request.points > self._state.points_balance:
+ raise ApplicationError(
+ f"Insufficient points: requested {request.points}, "
+ f"available {self._state.points_balance}",
+ type="InsufficientPoints",
+ non_retryable=True,
+ )
+
+ # Idempotency: return without re-processing if already applied.
+ if request.event_id in self._state.processed_event_ids:
+ return RedemptionResult(
+ redeemed=request.points,
+ remaining_balance=self._state.points_balance,
+ reward=request.reward,
+ )
+
+ self._state.points_balance -= request.points
+
+ # Record the redemption via an Activity (idempotent on event_id).
+ await workflow.execute_activity(
+ record_redemption,
+ {
+ "customer_id": self._state.customer_id,
+ "event_id": request.event_id,
+ "points": request.points,
+ "reward": request.reward,
+ },
+ start_to_close_timeout=timedelta(seconds=30),
+ retry_policy=DB_RETRY_POLICY,
+ )
+
+ # Track this event as processed.
+ self._state.processed_event_ids.append(request.event_id)
+ self._trim_processed_ids()
+
+ workflow.logger.info(
+ "Customer %s redeemed %d points for %s (remaining: %d)",
+ self._state.customer_id,
+ request.points,
+ request.reward,
+ self._state.points_balance,
+ )
+
+ return RedemptionResult(
+ redeemed=request.points,
+ remaining_balance=self._state.points_balance,
+ reward=request.reward,
+ )
+
+ @redeem_points.validator
+ def validate_redemption(self, request: RedemptionRequest) -> None:
+ """Run before the Update is accepted into Event History. Rejection here writes no Event."""
+ if request.points <= 0:
+ raise ApplicationError(
+ "Redemption amount must be a positive number",
+ type="ValidationError",
+ )
+
+ async def _process_points_event(self, event_data: dict) -> None:
+ """Process a single points event: check for duplicates, record the
+ transaction via an Activity, update in-memory state, and evaluate tier.
+ """
+ event_id = event_data.get("event_id", "")
+ points = event_data.get("points", 0)
+ reason = event_data.get("reason", "unknown")
+
+ if event_id and event_id in self._state.processed_event_ids:
+ workflow.logger.info(
+ "Skipping duplicate event %s for customer %s",
+ event_id,
+ self._state.customer_id,
+ )
+ return
+
+ # Record the transaction BEFORE updating in-memory state to avoid
+ # double-counting if the Activity retries.
+ await workflow.execute_activity(
+ record_points_transaction,
+ {
+ "customer_id": self._state.customer_id,
+ "event_id": event_id,
+ "points": points,
+ "reason": reason,
+ },
+ start_to_close_timeout=timedelta(seconds=30),
+ retry_policy=DB_RETRY_POLICY,
+ )
+
+ # Update state after Activity confirms success.
+ self._state.points_balance += points
+ if points > 0:
+ self._state.lifetime_points += points
+
+ if event_id:
+ self._state.processed_event_ids.append(event_id)
+ self._trim_processed_ids()
+
+ workflow.logger.info(
+ "Processed event %s for customer %s: %+d points (balance: %d)",
+ event_id,
+ self._state.customer_id,
+ points,
+ self._state.points_balance,
+ )
+
+ await self._evaluate_tier()
+
+ async def _evaluate_tier(self) -> None:
+ """Recalculate tier; send notification if it changed (non-critical)."""
+ new_tier = self._calculate_tier(self._state.lifetime_points)
+
+ if new_tier != self._state.tier:
+ old_tier = self._state.tier
+ self._state.tier = new_tier
+
+ workflow.logger.info(
+ "Customer %s tier changed: %s -> %s",
+ self._state.customer_id,
+ old_tier,
+ new_tier,
+ )
+
+ # Notification is non-critical — log and continue on failure.
+ try:
+ await workflow.execute_activity(
+ send_tier_change_notification,
+ {
+ "customer_id": self._state.customer_id,
+ "old_tier": old_tier,
+ "new_tier": new_tier,
+ },
+ start_to_close_timeout=timedelta(seconds=30),
+ retry_policy=NOTIFICATION_RETRY_POLICY,
+ )
+ except Exception:
+ workflow.logger.warning(
+ "Failed to send tier change notification for customer %s; "
+ "continuing entity lifecycle",
+ self._state.customer_id,
+ )
+
+ def _calculate_tier(self, lifetime_points: int) -> str:
+ """Determine tier from lifetime points — pure logic, no side effects."""
+ if lifetime_points >= 10_000:
+ return LoyaltyTier.PLATINUM.value
+ if lifetime_points >= 5_000:
+ return LoyaltyTier.GOLD.value
+ if lifetime_points >= 1_000:
+ return LoyaltyTier.SILVER.value
+ return LoyaltyTier.BRONZE.value
+
+ def _trim_processed_ids(self) -> None:
+ """Keep only the most recent 1,000 processed event IDs to prevent unbounded growth."""
+ max_ids = 1_000
+ if len(self._state.processed_event_ids) > max_ids:
+ self._state.processed_event_ids = self._state.processed_event_ids[
+ -max_ids:
+ ]
+
+ async def _do_continue_as_new(self) -> None:
+ """Perform Continue-As-New: drain excess pending signals, serialize
+ remaining unprocessed signals into state, wait for all handlers to
+ finish, then call continue_as_new with the current state.
+ """
+ # Drain excess pending signals before serializing into the CAN input.
+ # Keeps the LoyaltyState payload well under the 2 MB limit.
+ MAX_PENDING_CARRY = 500
+ while len(self._pending_signals) > MAX_PENDING_CARRY:
+ event_data = self._pending_signals.popleft()
+ await self._process_points_event(event_data)
+
+ # Carry remaining unprocessed Signals to the next Execution.
+ self._state.pending_events = list(self._pending_signals)
+
+ # Wait for async handlers to finish before transitioning.
+ await workflow.wait_condition(workflow.all_handlers_finished)
+
+ workflow.logger.info(
+ "Continuing as new for customer %s (balance: %d, pending: %d)",
+ self._state.customer_id,
+ self._state.points_balance,
+ len(self._state.pending_events),
+ )
+
+ workflow.continue_as_new(self._state)
+```
+
+Two design decisions in this Workflow deserve extra attention:
+
+**Activity call inside an Update handler.** The `redeem_points` Update calls `record_redemption` directly via `await` rather than enqueuing it for the main loop. The Python and TypeScript SDKs support `async` Update handlers, so this is allowed. The trade-off keeps redemption logic self-contained and returns the confirmed result to the caller in a single round trip. It is safe here because `record_redemption` is idempotent — if the Activity is retried, or the caller resends the same `event_id`, the duplicate is detected and no second record is created. Because an `async` handler yields at every `await`, other handlers and the main loop can run while the Activity is in flight, so guard any check-then-modify sequence that spans an `await`. If you would rather not run Activities inside handlers, or your SDK restricts it, enqueue the work for the main loop instead.
+
+**Continue-As-New payload size.** Unprocessed Signals are serialized into the CAN input, which must stay under the 2 MB payload limit. The `MAX_PENDING_CARRY` guard drains the queue to a safe size before serializing. Lower the limit if your Signal payloads are large. If the guard fires frequently, Signals are arriving faster than Workers can process them — scale your Worker pool.
+
+### Phase 4: Configure and run the Worker
+
+The Worker hosts your Workflow and Activity code, polls a Task Queue, and executes work.
+
+Create a file named `worker.py`:
+
+```python
+# worker.py
+import asyncio
+import concurrent.futures
+import logging
+
+from temporalio.client import Client
+from temporalio.worker import Worker
+
+from activities import (
+ record_points_transaction,
+ record_redemption,
+ send_tier_change_notification,
+)
+from workflows import LoyaltyEntityWorkflow
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+TASK_QUEUE = "loyalty-points-queue"
+TEMPORAL_ADDRESS = "localhost:7233"
+TEMPORAL_NAMESPACE = "default"
+
+
+async def main() -> None:
+ """Start a Worker that hosts the LoyaltyEntityWorkflow and its Activities."""
+ client = await Client.connect(
+ TEMPORAL_ADDRESS,
+ namespace=TEMPORAL_NAMESPACE,
+ )
+
+ logger.info("Starting loyalty Worker on task queue: %s", TASK_QUEUE)
+
+ with concurrent.futures.ThreadPoolExecutor(max_workers=50) as activity_executor:
+ worker = Worker(
+ client,
+ task_queue=TASK_QUEUE,
+ workflows=[LoyaltyEntityWorkflow],
+ activities=[
+ record_points_transaction,
+ record_redemption,
+ send_tier_change_notification,
+ ],
+ activity_executor=activity_executor,
+ max_concurrent_workflow_tasks=200,
+ max_concurrent_activities=100,
+ )
+ await worker.run()
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+The Worker registers the `LoyaltyEntityWorkflow` class and all three Activity functions on the `loyalty-points-queue` Task Queue. Setting `max_concurrent_workflow_tasks` higher than the default makes sense here because Entity Workflows spend most of their time blocked on `wait_condition`, so one Worker can host many concurrent instances.
+
+To start the Worker, run:
+
+```
+python worker.py
+```
+
+The Worker connects to the Temporal Server and begins polling. It will continue running until you stop the process.
+
+### Phase 5: Write client code to interact with the entity
+
+The client code starts Entity Workflows, sends Signals, executes Updates, and runs Queries. In production this typically lives in an API server or background service.
+
+Create a file named `starter.py`:
+
+```python
+# starter.py
+import asyncio
+
+from temporalio.client import Client, WorkflowUpdateFailedError
+
+from models import RedemptionRequest
+from workflows import LoyaltyEntityWorkflow
+
+TASK_QUEUE = "loyalty-points-queue"
+TEMPORAL_ADDRESS = "localhost:7233"
+TEMPORAL_NAMESPACE = "default"
+
+
+async def main() -> None:
+ client = await Client.connect(
+ TEMPORAL_ADDRESS,
+ namespace=TEMPORAL_NAMESPACE,
+ )
+
+ customer_id = "cust-42"
+ workflow_id = f"loyalty-{customer_id}"
+
+ # Start the entity — Workflow Id is stable per customer.
+ # If it's already running, start_workflow raises WorkflowAlreadyStartedError.
+ handle = await client.start_workflow(
+ LoyaltyEntityWorkflow.run,
+ id=workflow_id,
+ task_queue=TASK_QUEUE,
+ # No initial state — the Workflow derives customer_id from the Workflow Id.
+ )
+ print(f"Started loyalty entity: {workflow_id}")
+
+ # Signals are fire-and-forget: the call returns once the Server accepts them.
+ await handle.signal(
+ LoyaltyEntityWorkflow.add_points,
+ {
+ "event_id": "evt-001",
+ "points": 500,
+ "reason": "initial_purchase",
+ "source": "ecommerce",
+ },
+ )
+ print("Sent Signal: +500 points (initial_purchase)")
+
+ await handle.signal(
+ LoyaltyEntityWorkflow.add_points,
+ {
+ "event_id": "evt-002",
+ "points": 750,
+ "reason": "referral_bonus",
+ "source": "referral_service",
+ },
+ )
+ print("Sent Signal: +750 points (referral_bonus)")
+
+ # Allow time for the Worker to process the Signals
+ await asyncio.sleep(2)
+
+ # Queries are synchronous and read-only — no Events generated.
+ summary = await handle.query(LoyaltyEntityWorkflow.get_loyalty_summary)
+ print(
+ f"Query result: {summary.points_balance} points, "
+ f"tier={summary.tier}, lifetime={summary.lifetime_points}"
+ )
+
+ # Updates block until the Workflow processes them and returns a result —
+ # ideal for checkout where the caller needs confirmation.
+ try:
+ result = await handle.execute_update(
+ LoyaltyEntityWorkflow.redeem_points,
+ RedemptionRequest(
+ event_id="redeem-001",
+ points=200,
+ reward="free_shipping",
+ ),
+ )
+ print(
+ f"Redemption successful: {result.redeemed} points for "
+ f"{result.reward}, remaining: {result.remaining_balance}"
+ )
+ except WorkflowUpdateFailedError as e:
+ print(f"Redemption failed: {e}")
+
+ # Query again to confirm the updated balance.
+ balance = await handle.query(LoyaltyEntityWorkflow.get_points_balance)
+ tier = await handle.query(LoyaltyEntityWorkflow.get_tier)
+ print(f"After redemption: balance={balance}, tier={tier}")
+
+ print("\nSending batch of points events...")
+ for i in range(10):
+ await handle.signal(
+ LoyaltyEntityWorkflow.add_points,
+ {
+ "event_id": f"batch-{i:04d}",
+ "points": 100,
+ "reason": f"batch_purchase_{i}",
+ },
+ )
+ print("Sent 10 batch events (+1000 points total)")
+
+ await asyncio.sleep(3)
+
+ summary = await handle.query(LoyaltyEntityWorkflow.get_loyalty_summary)
+ print(
+ f"Final state: {summary.points_balance} points, "
+ f"tier={summary.tier}, lifetime={summary.lifetime_points}"
+ )
+
+ # Uncomment to shut down the entity gracefully.
+ # await handle.signal(LoyaltyEntityWorkflow.shutdown)
+ # print("Sent shutdown Signal")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+This client code demonstrates the three interaction patterns:
+
+- **Signals** (`add_points`): Fire-and-forget; returns immediately after the Server accepts the Signal.
+- **Queries** (`get_loyalty_summary`, `get_points_balance`, `get_tier`): Read-only, no Events generated — safe at high frequency.
+- **Updates** (`redeem_points`): Synchronous request-response; the caller blocks until the Workflow returns a confirmed result.
+
+### Phase 6: Handle Continue-As-New for indefinite execution
+
+Continue-As-New allows an Entity Workflow to run for years without unbounded history. When `workflow.info().is_continue_as_new_suggested()` returns `True`, the history is approaching the suggested threshold (~4,096 Events).
+
+The `LoyaltyEntityWorkflow` checks this flag in two places:
+
+1. **In the main wait condition.** The Workflow wakes up even if no Signals are pending, handling the case where history has grown during idle periods.
+2. **After processing each event.** Avoids processing an entire batch before detecting an oversized history.
+
+The `_do_continue_as_new` method handles the transition in three steps:
+
+1. **Unprocessed Signals are preserved.** Any Signals still in the queue are serialized into `pending_events` on the state. The next Execution restores them in its constructor.
+2. **All handlers complete.** `workflow.all_handlers_finished` ensures any in-flight Update handlers finish before the transition — cancelling them mid-execution would lose data.
+3. **The transition occurs.** `workflow.continue_as_new(self._state)` ends the current Execution and starts a new one with the same Workflow Id, a new Run Id, and a fresh Event History.
+
+After the transition, clients continue interacting with the same Workflow Id. Temporal automatically routes Signals, Queries, and Updates to the latest Run. Do not specify a Run Id when obtaining a Workflow handle, or your interactions will target a stale Execution.
+
+### Phase 7: Ensure Workflow determinism
+
+Temporal replays Event History to resume a Workflow after a crash. The code must produce the same sequence of Commands on every replay.
+
+The Python SDK sandbox blocks most non-deterministic operations, but you must still follow these rules:
+
+| Do not use | Use instead | Reason |
+|---|---|---|
+| `datetime.now()` | `workflow.now()` | Returns the same value during replay |
+| `random.random()` | `workflow.random().random()` | Seeded deterministically by the SDK |
+| `uuid.uuid4()` | `workflow.uuid4()` | Produces the same value during replay |
+| `time.sleep()` | `asyncio.sleep()` or `workflow.sleep()` | Converts to a durable timer |
+| `print()` | `workflow.logger.info()` | Replay-safe; only logs on initial execution |
+| Direct I/O (HTTP, file reads) | Activities | All I/O must happen in Activities |
+
+**Testing for determinism.** Use the `Replayer` to verify code changes against a saved Event History:
+
+```python
+# test_determinism.py
+import asyncio
+
+from temporalio.client import WorkflowHistory
+from temporalio.worker import Replayer
+
+from workflows import LoyaltyEntityWorkflow
+
+
+async def test_replay_compatibility() -> None:
+ """Replay a saved Event History against the current code.
+
+ Export history first:
+ temporal workflow show --workflow-id loyalty-cust-42 --output json > history.json
+ """
+ replayer = Replayer(workflows=[LoyaltyEntityWorkflow])
+
+ with open("history.json") as f:
+ history = WorkflowHistory.from_json("loyalty-cust-42", f.read())
+
+ # This raises NondeterminismError if the code is incompatible
+ await replayer.replay_workflow(history)
+ print("Replay succeeded: code is compatible with saved history")
+
+
+if __name__ == "__main__":
+ asyncio.run(test_replay_compatibility())
+```
+
+Run this test as part of your continuous integration pipeline before deploying Workflow code changes. If the replay fails with a `NondeterminismError`, the code change is not backward-compatible. Use Worker Versioning (see Phase 8) to deploy the change safely.
+
+### (Optional) Phase 8: Manage Workflow Evolution with Worker Versioning
+
+Entity Workflows are the hardest case for code evolution: a loyalty account can span years and dozens of deployments. Any change that alters the sequence of Commands the Workflow generates causes a `NondeterminismError` during replay for in-progress Executions.
+
+[Worker Versioning](https://docs.temporal.io/production-deployment/worker-deployments/worker-versioning) is the recommended solution: run multiple builds simultaneously, route each Execution to the build it started on, and drain old builds without any code-level branching.
+
+> Worker Versioning requires Python SDK v1.11.0 or later, Temporal Server v1.29.1 or later, and Temporal CLI v1.4.1 or later.
+
+#### Why PINNED is the right choice for Entity Workflows
+
+`PINNED` is the canonical versioning behavior for Entity Workflows: each execution runs entirely on the Worker Deployment Version where it started. At a Continue-As-New boundary the Workflow can optionally upgrade to the latest build.
+
+| Behavior | Guarantee | When to use |
+|----------|-----------|-------------|
+| `PINNED` | Each execution completes on the version where it started | Long-lived entities; no patching needed within a run |
+| `AUTO_UPGRADE` | Moves to the latest version automatically | Short Workflows that complete before the next deploy |
+
+#### Step 1: Annotate the Workflow
+
+Add `versioning_behavior=VersioningBehavior.PINNED` to `@workflow.defn` in `workflows.py`:
+
+```python
+from temporalio.common import VersioningBehavior
+
+@workflow.defn(versioning_behavior=VersioningBehavior.PINNED)
+class LoyaltyEntityWorkflow:
+ ...
+```
+
+#### Step 2: Configure the Worker
+
+Update `worker.py` to declare the deployment name and build ID:
+
+```python
+import os
+from temporalio.worker import Worker, WorkerDeploymentOptions
+
+# BUILD_ID should be injected by CI/CD (e.g. a Git SHA). All Workers from the same build must match.
+BUILD_ID = os.environ["BUILD_ID"]
+DEPLOYMENT_NAME = os.getenv("TEMPORAL_DEPLOYMENT", "loyalty-points")
+
+async def main() -> None:
+ client = await Client.connect(TEMPORAL_ADDRESS, namespace=TEMPORAL_NAMESPACE)
+
+ with concurrent.futures.ThreadPoolExecutor(max_workers=50) as activity_executor:
+ worker = Worker(
+ client,
+ task_queue=TASK_QUEUE,
+ workflows=[LoyaltyEntityWorkflow],
+ activities=[
+ record_points_transaction,
+ record_redemption,
+ send_tier_change_notification,
+ ],
+ activity_executor=activity_executor,
+ worker_deployment_options=WorkerDeploymentOptions(
+ deployment_name=DEPLOYMENT_NAME,
+ build_id=BUILD_ID,
+ use_worker_versioning=True,
+ ),
+ max_concurrent_workflow_tasks=200,
+ max_concurrent_activities=100,
+ )
+ await worker.run()
+```
+
+#### Step 3: Roll out changes with the CLI
+
+**Initial deployment** — start the first versioned build and make it Current:
+
+```bash
+BUILD_ID=1.0.0 python worker.py &
+
+temporal worker deployment set-current-version \
+ --deployment-name "loyalty-points" \
+ --build-id "1.0.0"
+```
+
+**Deploying a change** — run the new build alongside the old one, ramp gradually, then promote:
+
+```bash
+# Start Workers on the new build
+BUILD_ID=1.1.0 python worker.py &
+
+# Canary: send 10% of new Workflows to the new build
+temporal worker deployment set-ramping-version \
+ --deployment-name "loyalty-points" \
+ --build-id "1.1.0" \
+ --percentage 10
+
+# Promote to Current once satisfied
+temporal worker deployment set-current-version \
+ --deployment-name "loyalty-points" \
+ --build-id "1.1.0"
+```
+
+Old `1.0.0` Workers keep running, draining their pinned Executions automatically.
+
+Check drain status before stopping old Workers:
+
+```bash
+temporal worker deployment describe-version \
+ --deployment-name "loyalty-points" \
+ --build-id "1.0.0"
+# DrainageStatus: drained ← safe to shut down
+```
+
+#### Upgrading at the Continue-As-New boundary
+
+Because `LoyaltyEntityWorkflow` already calls `workflow.continue_as_new()` to manage Event History, you can upgrade long-running Executions to a new build at that boundary — without patching.
+
+In `_do_continue_as_new`, check `get_target_worker_deployment_version_changed()` before transitioning:
+
+```python
+from temporalio.workflow import ContinueAsNewVersioningBehavior
+
+async def _do_continue_as_new(self) -> None:
+ MAX_PENDING_CARRY = 500
+ while len(self._pending_signals) > MAX_PENDING_CARRY:
+ event_data = self._pending_signals.popleft()
+ await self._process_points_event(event_data)
+
+ self._state.pending_events = list(self._pending_signals)
+ await workflow.wait_condition(workflow.all_handlers_finished)
+
+ can_options = {}
+ if workflow.info().get_target_worker_deployment_version_changed():
+ # Newer build available — upgrade on this CaN boundary.
+ can_options["versioning_behavior"] = ContinueAsNewVersioningBehavior.AUTO_UPGRADE
+
+ workflow.continue_as_new(self._state, **can_options)
+```
+
+When `get_target_worker_deployment_version_changed()` is `False`, the CaN run stays on the same pinned build. This flag is refreshed after each Workflow Task, so upgrades are picked up on the next natural CaN boundary.
+
+## Outcomes
+
+You've built a loyalty account system where each customer is an independent, self-managing process. Points accrue consistently, tier upgrades happen in real time, redemptions can't be double-spent, and the account keeps running through deployments, restarts, and schema changes. The approach isn't specific to loyalty programs — any domain object with a long lifecycle and ongoing interactions can be modeled the same way: user profiles, IoT devices, subscription accounts, open orders. The loyalty implementation here gives you the full blueprint.
+
+## Related resources
+
+- [Temporal Python SDK documentation](https://docs.temporal.io/develop/python)
+- [Temporal Python SDK API Reference](https://python.temporal.io)
+- [Message Passing — Signals, Queries, Updates](https://docs.temporal.io/develop/python/message-passing)
+- [Continue-As-New](https://docs.temporal.io/develop/python/continue-as-new)
+- [Worker Versioning](https://docs.temporal.io/production-deployment/worker-deployments/worker-versioning)
+- [Failure Detection — Timeouts, Activity Heartbeating, and Retry Policies](https://docs.temporal.io/develop/python/failure-detection)
+- [Temporal Python SDK samples](https://github.com/temporalio/samples-python)
+
+
\ No newline at end of file
diff --git a/docs/guides/index.mdx b/docs/guides/index.mdx
new file mode 100644
index 0000000000..028990282f
--- /dev/null
+++ b/docs/guides/index.mdx
@@ -0,0 +1,28 @@
+---
+id: index
+title: Guides
+slug: /guides
+toc_max_heading_level: 2
+keywords:
+ - guides
+ - how-to
+ - best practices
+ - troubleshooting
+ - performance
+tags:
+ - Integrations
+ - AI Frameworks
+ - Agent Frameworks
+ - Observability
+description: Browse all Guides available for Temporal.
+llm_exclude:
+ This page renders an interactive grid component. For a machine-readable list of guides and their metadata, see
+ [guides-data.ts](https://github.com/temporalio/documentation/blob/main/src/components/GuidesGrid/guides-data.ts).
+---
+
+import GuidesGrid from '@site/src/components/GuidesGrid';
+
+Browse Guides for Temporal SDKs and Temporal Cloud.
+Filter by SDK, tag, or search to find what you need.
+
+
diff --git a/docs/guides/lock-shared-resources.mdx b/docs/guides/lock-shared-resources.mdx
new file mode 100644
index 0000000000..2502a99c00
--- /dev/null
+++ b/docs/guides/lock-shared-resources.mdx
@@ -0,0 +1,442 @@
+---
+id: lock-shared-resources
+title: Coordinate Access to Shared Resources with a Distributed Lock
+description: Use Temporal Workflow IDs to build a durable, safe locking system for your shared resources.
+sidebar_label: Distributed locking
+toc_max_heading_level: 2
+author: Keith Tenzer
+tags:
+ - Workflows
+ - Activities
+ - Child workflows
+ - Signals
+---
+
+This guide details a reusable, durable distributed lock for Temporal Workflows. The design doesn't rely on an external database, a central limiter, or any shared state outside Temporal itself.
+
+This pattern is best for small, fixed pools of scarce resources where each permit is held for minutes, not milliseconds: GPU devices, lab hardware, tenant migrations, or similar operational resources. The latency may be too high to qualify as a general-purpose high-throughput mutex or rate limiter.
+Common approaches such as a database, a cache layer, or a central rate-limiter service introduce extra infrastructure and new failure modes. Lock holders that die without releasing leave orphaned locks behind. Lock state drifts out of sync with the Workflows that hold the locks. Capacity changes require restarting limiter services.
+
+This pattern represents each held permit or lock as its own short-lived child Workflow.
+The child Workflow's ID encapsulates the resource and slot, for example `permit:gpu-pool:gpu-2`.
+Temporal does not allow two running Workflows with the same ID, so acquiring a permit is an atomic operation using Temporal's own state.
+The Workflow releases on a Signal pinned to the run. If no release Signal is received, the permit Workflow's lease timeout handles orphan recovery.
+
+By following this implementation plan, you will gain:
+
+- **Atomic Acquire**: Take a permit by starting a child Workflow whose ID is the lock. The operation succeeds or fails atomically against Temporal state.
+- **Lease-Based Recovery**: A per-permit lease timer makes the slot reusable if the holder crashes before releasing. For external resources that can be corrupted by overlap, pass a fencing token to reject stale holders.
+- **Horizontal Scale**: Permit traffic spreads across the slots in a scarce resource pool, which is useful for long-held resource coordination.
+
+## Background and best practices
+
+This section explains why Temporal Workflow IDs make a good distributed lock primitive and how the design recovers from the failure modes that hurt typical lock services.
+
+**Distributed locking is a classical problem**
+Solutions such as a database, cache-layer, and central rate-limiting services share a structural weakness: the lock state lives outside the runtime that holds it. The locking system and the work it gates can fall out of sync. A holder crash leaves an external lock entry in place until a separate cleanup job runs. A limiter outage blocks every gated Workflow, or lets every request through unchecked. Scaling the limiter is a separate operational concern from scaling the workload it gates.
+
+Inside Temporal, by default, every running Workflow execution already has a globally unique, atomically enforced identifier. You cannot start two running Workflows with the same Workflow ID at the same time. That guarantee is the primitive a distributed mutex requires. (The guarantee holds only as long as the Workflow ID Conflict Policy is left at its default of `FAIL`.) Treating each permit as its own child Workflow places the lock and the work it gates on the same platform: Temporal's history, durability, and Worker fleet.
+
+**The Workflow ID is the lock**
+Encapsulating the resource and slot into the Workflow ID (`permit:{resource}:{slot}`) makes the act of starting the child Workflow the acquire operation. The start either succeeds and the caller holds the permit, or it fails with `WorkflowAlreadyStartedError` and the caller tries the next slot.
+
+If the parent Workflow closes for any reason without sending the release Signal (operator termination, execution timeout, unhandled exception), the default `ParentClosePolicy.TERMINATE` terminates the permit child immediately and the slot frees right away. The permit Workflow also runs a `wait_condition` with a lease timeout for cases where the holder is alive but went silent, and `start_child_workflow` sets `execution_timeout=lease` as a server-side backstop for permits that no Worker ever picks up. Together these mechanisms handle orphan recovery internally, eliminating the need for a separate cleanup process.
+
+The `Semaphore` pins the release Signal to the specific run started at acquire time by holding the `ChildWorkflowHandle` from `start_child_workflow` and signalling on that handle. Without this pin, a late release Signal from a holder whose lease already expired could revoke a subsequent acquirer's permit. The `Semaphore` shuffles its slot list using `workflow.random()` so Workflows always probe in a random order. The shuffle is deterministic across replay, as Workflow determinism requires, and uniform across callers. This spreads contention across the pool rather than concentrating on slot 0, for example.
+
+Four operational characteristics matter before adopting the pattern:
+
+- Acquire is not first-in-first-out (FIFO). When a slot frees, whoever claims it first wins.
+- Capacity changes apply only to acquires after the change. Already-held permits run to release or lease expiry on their original capacity.
+- For production, host `PermitSlotWorkflow` on a dedicated Task Queue so application Worker backpressure does not slow permit acquire and release.
+- Lease expiry is recovery, not proof that the old holder stopped using the external resource. For resources that can be corrupted by overlap, pass the permit run ID as a fencing token to the Activity or downstream system and reject stale holders there.
+
+## Target audience
+
+This guide references the following roles:
+
+- **Application developers**: Build the Workflows that acquire permits. Import the `Semaphore` helper and wrap critical sections in `async with sem.acquire(): ...`.
+- **Platform operators**: Run the Worker that hosts `PermitSlotWorkflow`, tune lease and backoff defaults, and decide the Task Queue topology.
+
+The process outlined in this document requires Python code changes and Worker configuration.
+This pattern requires no additional infrastructure.
+
+## Prerequisites
+
+To complete the implementation plan, you will need:
+
+- **Required software and tools**:
+ - Python 3.12 or later ([download](https://www.python.org/downloads/))
+ - Temporal Python SDK 1.7 or later ([github](https://github.com/temporalio/sdk-python))
+ - `uv` 0.9.18 for Python dependency management ([install](https://docs.astral.sh/uv/))
+ - Temporal CLI 1.6.2 for the local development server ([install](https://docs.temporal.io/cli))
+ - Access to a Temporal Cluster 1.3.2 (local development server or Temporal Cloud).
+- **Resources and access privileges**:
+ - A Temporal Namespace where you can register Workers and start Workflows.
+ - For Temporal Cloud: an API key with the Namespace Admin role for the target Namespace.
+- **Required concepts**:
+ - Familiarity with Temporal Workflows and Activities ([docs](https://docs.temporal.io/workflows)).
+ - Child Workflows and parent close policies ([docs](https://docs.temporal.io/encyclopedia/child-workflows)).
+ - Signals and Queries ([docs](https://docs.temporal.io/encyclopedia/workflow-message-passing)).
+ - Workflow determinism ([docs](https://docs.temporal.io/workflows#workflow-determinism)).
+
+## Architecture diagram
+
+The following diagram shows the acquire and release flow when two Workflows compete for slots in a 2-slot pool. The first attempt to get a slot collides with an already held slot; the second succeeds.
+
+
+1. The application Workflow calls `Semaphore.acquire`. The `Semaphore` shuffles slot names using `workflow.random()` and attempts to acquire a slot.
+2. The first attempt to acquire a slot starts a child Workflow with ID `permit:gpu-pool:2`. Temporal already has a running Workflow at that ID and rejects the start with `WorkflowAlreadyStartedError`.
+3. The `Semaphore` falls through to the next slot and starts `permit:gpu-pool:3`. Temporal accepts the start and returns a `ChildWorkflowHandle` to the `Semaphore`.
+4. Temporal dispatches the new execution to a Worker, and `PermitSlotWorkflow` enters its `wait_condition` with the lease timeout running.
+5. The `Semaphore` yields the slot name (`"3"`) to the caller.
+6. The application Workflow runs the gated work using the yielded slot identity.
+7. On context-manager exit, the `Semaphore` sends a `release` Signal on the recorded `ChildWorkflowHandle`, which targets the specific run it started.
+8. The permit Workflow's `wait_condition` returns, the Workflow completes with `released`, and the Workflow ID is reusable.
+
+## Implementation plan
+
+This section outlines the sequential phases to build the distributed lock library and integrate it into a Worker.
+
+### Define shared configuration
+
+The library needs a small config module with lease and backoff defaults, a function that encodes the resource and slot into a Workflow ID, and a function that picks auth mode from environment variables.
+
+Create a file named `distributed_lock/config.py`:
+
+```python
+# config.py
+
+from __future__ import annotations
+
+import os
+from datetime import timedelta
+
+from temporalio.client import Client
+from temporalio.envconfig import ClientConfig
+
+TEMPORAL_TASK_QUEUE: str = os.environ.get("TEMPORAL_TASK_QUEUE", "distributed-lock-tq")
+
+DEFAULT_LEASE: timedelta = timedelta(minutes=10)
+DEFAULT_BACKOFF: timedelta = timedelta(seconds=5)
+DEFAULT_MAX_BACKOFF: timedelta = timedelta(minutes=1)
+DEFAULT_BACKOFF: timedelta = timedelta(seconds=5)
+
+PERMIT_WORKFLOW_ID_PREFIX: str = "permit"
+
+def permit_workflow_id(resource: str, slot: str) -> str:
+ return f"{PERMIT_WORKFLOW_ID_PREFIX}:{resource}:{slot}"
+
+async def connect_temporal_client() -> Client:
+ return await Client.connect(**ClientConfig.load_client_connect_config())
+```
+
+The `permit_workflow_id` function produces IDs like `permit:gpu-pool:2`. That string is the lock: Temporal will reject any second attempt to start a Workflow with the same ID while the first is running.
+The default lease and backoff values are conservative starting points. Adjust these values based on the resource you're gating.
+The `connect_temporal_client` function delegates to `temporalio.envconfig`, which reads connection settings from a TOML profile at `~/.config/temporalio/temporal.toml` (or the path in `TEMPORAL_CONFIG_FILE`) and applies environment variable overrides such as `TEMPORAL_ADDRESS`, `TEMPORAL_NAMESPACE`, `TEMPORAL_API_KEY`, and the `TEMPORAL_TLS_*` family. This is the same configuration source the Temporal CLI uses, so a developer's existing CLI profile drives both the CLI and this Worker. API-key authentication enables Transport Layer Security (TLS) automatically; for self-hosted TLS without an API key, set the relevant `TEMPORAL_TLS_*` variables; for `temporal server start-dev`, leave the variables unset and the connection falls back to plain Transmission Control Protocol (TCP).
+
+### Define the permit primitive
+
+The permit primitive is a child Workflow whose ID is the lock. It accepts a resource name, slot name, and lease duration, then blocks on a `release` Signal with a timeout equal to the lease. On Signal it completes; on timeout it auto-completes for orphan recovery.
+
+Create a file named `distributed_lock/workflow.py`:
+
+```python
+# workflow.py
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import timedelta
+
+from temporalio import workflow
+
+@dataclass
+class PermitSlotInput:
+ resource: str
+ slot: str
+ lease_seconds: float
+
+@workflow.defn(name="PermitSlotWorkflow")
+class PermitSlotWorkflow:
+ def __init__(self) -> None:
+ self._released: bool = False
+
+ @workflow.signal(name="release")
+ def release(self) -> None:
+ self._released = True
+
+ @workflow.run
+ async def run(self, input: PermitSlotInput) -> str:
+ workflow.logger.info(
+ "permit acquired",
+ extra={"resource": input.resource, "slot": input.slot},
+ )
+ try:
+ await workflow.wait_condition(
+ lambda: self._released,
+ timeout=timedelta(seconds=input.lease_seconds),
+ )
+ return "released"
+ except TimeoutError:
+ workflow.logger.warning(
+ "permit lease expired - auto-releasing slot",
+ extra={"resource": input.resource, "slot": input.slot},
+ )
+ return "lease_expired"
+```
+
+The `@workflow.signal(name="release")` decorator exposes the Signal the caller sends on context exit. The `workflow.wait_condition(predicate, timeout=lease)` is the durable wait: Temporal persists the wait, so a Worker restart does not lose the held permit. The Workflow does not set its own ID. The caller sets it via `start_child_workflow(id=...)`. The return value (`"released"` or `"lease_expired"`) lets operators distinguish clean releases from orphan recoveries directly in the Temporal UI.
+
+### Build the caller-side Semaphore helper
+
+The `Semaphore` is a thin async context manager around `workflow.start_child_workflow`. It is not a Workflow itself. Construct it with a fixed `capacity`; the `Semaphore` auto-generates slot names `"0"` through `"N-1"` and yields the name of the slot it acquired. Activities that need to map the slot to an external resource (CUDA device, database shard, etc.) translate the string with `int(slot)`.
+
+Create a file named `distributed_lock/semaphore.py`:
+
+```python
+# semaphore.py
+
+from __future__ import annotations
+
+from contextlib import asynccontextmanager
+from datetime import timedelta
+from typing import AsyncIterator
+
+from temporalio import workflow
+from temporalio.exceptions import FailureError, WorkflowAlreadyStartedError
+from temporalio.workflow import ChildWorkflowHandle
+
+from distributed_lock.config import DEFAULT_BACKOFF, DEFAULT_LEASE, permit_workflow_id
+from distributed_lock.workflow import PermitSlotInput, PermitSlotWorkflow
+
+class Semaphore:
+ def __init__(
+ self,
+ resource: str,
+ *,
+ capacity: int,
+ task_queue: str | None = None,
+ ) -> None:
+ if capacity < 1:
+ raise ValueError("capacity must be >= 1")
+ self._resource = resource
+ self._slots: list[str] = [str(i) for i in range(capacity)]
+ self._task_queue = task_queue
+
+ @asynccontextmanager
+ async def acquire(
+ self,
+ *,
+ lease: timedelta = DEFAULT_LEASE,
+ backoff: timedelta = DEFAULT_BACKOFF,
+ ) -> AsyncIterator[str]:
+ slot, handle = await self._acquire_one(lease=lease, backoff=backoff)
+ try:
+ yield slot
+ finally:
+ await self._release(slot, handle)
+
+ async def _acquire_one(
+ self, *, lease: timedelta, backoff: timedelta
+ ) -> tuple[str, ChildWorkflowHandle]:
+ rng = workflow.random()
+ lease_seconds = lease.total_seconds()
+ while True:
+ order = list(self._slots)
+ rng.shuffle(order)
+ for slot in order:
+ wf_id = permit_workflow_id(self._resource, slot)
+ try:
+ handle = await workflow.start_child_workflow(
+ PermitSlotWorkflow.run,
+ PermitSlotInput(
+ resource=self._resource,
+ slot=slot,
+ lease_seconds=lease_seconds,
+ ),
+ id=wf_id,
+ task_queue=self._task_queue,
+ execution_timeout=lease,
+ )
+ return slot, handle
+ except WorkflowAlreadyStartedError:
+ continue
+ await workflow.sleep(backoff)
+
+ async def _release(self, slot: str, handle: ChildWorkflowHandle) -> None:
+ try:
+ await handle.signal("release")
+ except FailureError as e:
+ workflow.logger.info(
+ "release signal target already finished (lease likely expired)",
+ extra={
+ "resource": self._resource,
+ "slot": slot,
+ "error": str(e),
+ },
+ )
+```
+
+The constructor takes a fixed `capacity` and rejects values below 1. The `Semaphore` auto-generates slot names `"0"` through `"N-1"` and reshuffles them on every attempt using `workflow.random()` so long-running contention spreads evenly across slots. The `Semaphore` keeps the `ChildWorkflowHandle` returned by `start_child_workflow` and signals release on that handle, so the Signal cannot land on a subsequent acquirer's execution.
+The `FailureError` caught in `_release` covers the lease-already-expired case: the slot is already free and the release is a no-op, logged at info level for observability.
+Three orphan-recovery mechanisms work together to bound slot occupancy. The default `ParentClosePolicy.TERMINATE` is the fastest: when the parent Workflow closes for any reason without releasing, Temporal terminates the permit child immediately and the slot frees right away. The in-workflow timer `wait_condition(timeout=lease)` covers the case where the parent is still alive but the holder is silent: the permit Workflow exits cleanly as `"lease_expired"`, distinguishable in the Temporal UI. That timer only starts ticking once a Worker runs the permit's first task, however, so the workflow timeout `execution_timeout=lease` is a server-enforced backstop that ticks from Workflow creation regardless of whether a Worker ever picks it up. Together these guarantee the slot is never leaked, even if the permit Workflow never runs.
+
+### Register the permit Workflow on a Worker
+
+The Worker that runs application Workflows must also register `PermitSlotWorkflow`.
+A single-queue topology like the one below is fine for development. For production deployments, use the dedicated Task Queue described in the Background and People and process sections.
+
+Create a file named `worker/main.py`:
+
+```python
+# main.py
+
+from __future__ import annotations
+
+import asyncio
+
+from temporalio.worker import Worker
+
+from distributed_lock.config import TEMPORAL_TASK_QUEUE, connect_temporal_client
+from distributed_lock.workflow import PermitSlotWorkflow
+
+async def _run_worker() -> None:
+ client = await connect_temporal_client()
+ worker = Worker(
+ client,
+ task_queue=TEMPORAL_TASK_QUEUE,
+ workflows=[PermitSlotWorkflow],
+ activities=[],
+ )
+ await worker.run()
+
+if __name__ == "__main__":
+ asyncio.run(_run_worker())
+```
+
+`PermitSlotWorkflow` is in the `workflows=[...]` list and the `activities=[]` list is empty.
+The permit lifecycle is pure Workflow code, so this library needs no Activities. `worker.run()` blocks until the awaited task is cancelled, which is sufficient for `temporal server start-dev` and CI: Ctrl-C raises `KeyboardInterrupt` and the worker drains in-flight work as the cancellation propagates. For containerized production deployments, add a `SIGTERM` handler that cancels the run task so pod restarts and rolling deploys also drain gracefully. The `connect_temporal_client` function picks up environment variables to choose between Temporal Cloud API key auth, self-hosted TLS, and plain TCP for `temporal server start-dev`, so the same Worker source runs against any environment.
+
+### Example application Workflows
+
+The two examples below show how application Workflows apply the pattern in different use cases. The API is the same in both: construct a `Semaphore` with a `capacity`, wrap the gated section in `async with sem.acquire(...)`, and use the yielded slot string however the workload requires. Source files live at `examples/gate_workflow.py` and `examples/gpu_workflow.py` in the reference implementation.
+
+#### Generic Workflow throttling
+
+Cap the number of concurrent holders without caring about slot identity. The yielded slot is a string `"0"` through `"N-1"`; the Activity passes it through as a label.
+
+```python
+# gate_workflow.py
+
+from __future__ import annotations
+
+from datetime import timedelta
+
+from temporalio import workflow
+
+from distributed_lock.semaphore import Semaphore
+
+@workflow.defn(name="ThrottledGateWorkflow")
+class ThrottledGateWorkflow:
+    @workflow.run
+    async def run(self, job_id: str) -> str:
+        sem = Semaphore("app-gate", capacity=4)  # at most 4 concurrent "app-gate" holders
+        async with sem.acquire(lease=timedelta(minutes=10)) as slot:  # slot is passed to the Activity as a label
+            return await workflow.execute_activity(
+                "do_gated_work",
+                args=[job_id, slot],
+                start_to_close_timeout=timedelta(minutes=5),  # NOTE(review): no retry_policy is passed; confirm retries cannot outlast the 10-minute lease
+                heartbeat_timeout=timedelta(seconds=30),
+            )
+```
+
+#### Resource locks (GPU, database shard, tenant)
+
+Hand the yielded slot to an Activity that maps it to a specific resource. The slot is a string `"0"` through `"N-1"`; convert it with `int(slot)` if the resource is indexed numerically (CUDA device, shard ID).
+
+```python
+# gpu_workflow.py
+
+from __future__ import annotations
+
+from datetime import timedelta
+
+from temporalio import workflow
+
+from distributed_lock.semaphore import Semaphore
+
+@workflow.defn(name="GpuTrainingWorkflow")
+class GpuTrainingWorkflow:
+    @workflow.run
+    async def run(self, model_id: str) -> str:
+        sem = Semaphore("gpu-pool", capacity=4)  # at most 4 concurrent "gpu-pool" holders
+        async with sem.acquire(lease=timedelta(minutes=45)) as slot:  # slot is handed to the Activity, which maps it to a GPU
+            return await workflow.execute_activity(
+                "run_training",
+                args=[model_id, slot],
+                start_to_close_timeout=timedelta(minutes=30),  # NOTE(review): no retry_policy is passed; confirm retries cannot outlast the 45-minute lease
+                heartbeat_timeout=timedelta(minutes=3),
+            )
+```
+
+#### Heartbeat the long-running Activity
+
+Both Activities run inside the gated critical section, so a Worker crash mid-Activity must be detected promptly to free the slot. Set `heartbeat_timeout` on the Activity invocation and call `activity.heartbeat(...)` periodically inside the Activity body. Temporal fails the Activity attempt as soon as a heartbeat is missed for longer than the timeout. Once the Retry Policy gives up and the failure reaches the holder Workflow, the Workflow exits its `async with` block, the `Semaphore` sends `release` to the slot Workflow, and the slot returns to the pool. The lease timer remains as a coarse last-resort recovery for cases where heartbeats cannot run.
+
+```python
+# activities.py
+
+from __future__ import annotations
+
+import asyncio
+
+from temporalio import activity
+
+@activity.defn(name="do_gated_work")
+async def do_gated_work(job_id: str, slot: str) -> str:
+ total_seconds = 60
+ interval = 5
+ elapsed = 0
+ while elapsed < total_seconds:
+ # `asyncio.sleep` simulates the real gated work
+ # (API call, batch processing, etc.). Replace with the actual workload.
+ await asyncio.sleep(interval)
+ elapsed += interval
+ activity.heartbeat(
+ {"job_id": job_id, "slot": slot, "elapsed_seconds": elapsed}
+ )
+ return f"{job_id} done on slot {slot}"
+
+@activity.defn(name="run_training")
+async def run_training(model_id: str, slot: str) -> str:
+ gpu_index = int(slot) # map "0".."3" to a CUDA device index
+ epochs = 10
+ for epoch in range(1, epochs + 1):
+ # `asyncio.sleep` simulates one training epoch on cuda:{gpu_index}.
+ # Replace with the actual training step.
+ await asyncio.sleep(60)
+ activity.heartbeat(
+ {"model_id": model_id, "gpu_index": gpu_index, "epoch": epoch}
+ )
+ return f"trained {model_id} on cuda:{gpu_index}"
+```
+
+Set the heartbeat interval well below `heartbeat_timeout` (a timeout of at least two to three times the interval is a common starting point) so a single delayed heartbeat does not fail a healthy Activity. Pass progress data into `activity.heartbeat(...)` so retries can resume from the last reported point and operators can observe progress in the Temporal UI.
+
+## Outcomes
+
+By following this guide, you have built a durable distributed lock for Temporal Workflows backed by Temporal's own Workflow ID uniqueness guarantee.
+
+You now have the capability to:
+
+- Cap concurrent access to any shared resource or critical section without a database, cache layer, or central limiter service.
+- Recover orphan permits automatically when holders crash, using per-permit lease timers.
+- Scale lock acquisition horizontally. Permit traffic distributes across short-lived child Workflows rather than concentrating on one limiter.
+
+## Related resources
+
+- [Temporal Python SDK Documentation](https://docs.temporal.io/develop/python). SDK reference for Workflows, Signals, child Workflows, and Worker setup.
+- [Child Workflow Executions](https://docs.temporal.io/encyclopedia/child-workflows). How parent and child Workflows interact, including parent close policies.
+- [Workflow Message Passing](https://docs.temporal.io/encyclopedia/workflow-message-passing). Reference for Signals and Queries.
+- [Reference implementation source repository](https://github.com/temporal-sa/temporal-workflow-throttler). The upstream repository bundles `PermitSlotWorkflow`, the `Semaphore` helper, demo Workers, a FastAPI service, and a React UI. The package directory is named `throttler/` in the source.
\ No newline at end of file
diff --git a/docs/guides/rate-limit-downstream-apis.mdx b/docs/guides/rate-limit-downstream-apis.mdx
new file mode 100644
index 0000000000..68124550e8
--- /dev/null
+++ b/docs/guides/rate-limit-downstream-apis.mdx
@@ -0,0 +1,782 @@
+---
+id: rate-limit-downstream-apis
+title: Rate-limit downstream APIs with separate Task Queues
+description: Protect limited resources and avoid Workflow failures by setting limits on downstream APIs with separate Task Queues.
+sidebar_label: Rate-limit downstream APIs
+toc_max_heading_level: 2
+author: Cecil Phillip
+tags:
+ - Workers
+ - Routing
+ - Rate limiting
+---
+
+### Problem statement
+
+Modern applications integrate with numerous external APIs (SendGrid, Stripe, OpenAI, Twilio) that enforce rate limits to protect their infrastructure. These limits vary by provider service and plan, for example:
+
+- **SendGrid:** 100 emails/minute on free tier, 1000/minute on paid
+- **Stripe:** 100 requests/second globally
+- **OpenAI:** 60 requests/minute, 90,000 tokens/minute
+- **Twilio:** 1 request/second per phone number
+
+Without proactive rate limiting when calling downstream services, users may experience the following issues:
+- **HTTP 429 errors:** Activities overwhelm APIs causing "Too Many Requests" errors
+- **Account suspension:** Repeated violations can lead to temporary or permanent bans from the downstream service
+- **Failed workflows:** Without proper retry handling, workflows may not be able to continue to make progress
+- **Wasted execution:** Activities that will fail due to rate limits consume Worker resources
+- **Cascading failures:** One workflow's excessive API calls could affect others
+
+### Solution
+
+Use separate Task Queues with rate limiting configuration to protect downstream APIs. Create one Task Queue per rate-limited API and configure:
+
+- `max_task_queue_activities_per_second`: Global rate limit across all Workers on the queue
+- `max_activities_per_second`: Per-Worker rate limit (optional, for additional control)
+- `max_concurrent_activities`: Limit concurrent executions when API has concurrency limits
+
+This ensures Activities calling external APIs never exceed their rate limits, preventing 429 errors and account issues.
+
+### Outcomes
+
+- **Rate limit compliance:** Activities respect API rate limits, eliminating 429 errors and preventing account suspension
+- **Improved reliability:** Workflows complete successfully without failures caused by rate limits
+- **Resource efficiency:** Workers don't waste resources on doomed-to-fail Activities
+- **Better API relationships:** Consistent rate limit compliance maintains good standing with API providers
+- **Independent scaling:** Add more Workers without exceeding API rate limits (global queue limit applies)
+
+## Background and best practices
+
+### Task Queue fundamentals
+
+Task Queues in Temporal are dynamically created when first referenced. Rate limits are configured through Worker options: the Task Queue-wide limit is enforced by the Temporal Server, while per-Worker limits are applied by each Worker.
+
+**Recommended practice:** Create one Task Queue per rate-limited API to isolate rate limits.
+
+### Rate limiting configuration options
+
+The Python SDK provides three rate limiting controls and one related setting:
+
+1. **max_task_queue_activities_per_second** (global): Limits Activity dispatch across ALL Workers on this queue. Enforced by Temporal Server. Best for API rate limits. Last value wins.
+
+2. **max_activities_per_second** (per-worker): Limits Activities per Worker. Can be combined with global limit for finer control.
+
+3. **max_concurrent_activities** (concurrency): Limits concurrent executions. Use when API has concurrent connection limits (e.g., database pool size).
+
+4. **disable_eager_activity_execution** (Worker configuration): Set to `True` on the Worker that runs your Workflows to prevent Activities from being eagerly assigned to that Worker, ensuring they go through the rate-limited Task Queue instead.
+
+**Important:** Without `disable_eager_activity_execution=True`, activities may bypass your rate-limited task queues entirely. Eager execution runs activities on the same worker as the workflow, which circumvents the rate limiting controls configured on the activity-specific task queues. In the Python SDK this is an option on `Worker(...)`, not on `client.start_workflow(...)`. Always disable eager execution when using rate-limited task queues for API calls.
+
+### Handling HTTP 429 responses
+
+Even with rate limiting, occasional 429 errors may occur due to:
+- Other systems using the same API key
+- API provider reducing limits temporarily
+- Burst traffic patterns
+
+**Recommended practice:** Raise an exception with a specific next retry delay from the API's `Retry-After` header, and fallback to exponential backoff when the header is not available:
+
+1. When catching 429 errors, check for the `Retry-After` response header
+2. If present, raise an `ApplicationError` with `next_retry_delay` set to the header value
+3. If not present, raise the exception normally to use the Activity's retry policy
+
+```python
+retry_policy=workflow.RetryPolicy(
+ initial_interval=timedelta(seconds=1),
+ maximum_interval=timedelta(minutes=10),
+ backoff_coefficient=2.0,
+ maximum_attempts=5,
+)
+```
+
+**Understanding the retry cadence:**
+
+With this configuration, the first attempt runs immediately and each later attempt waits as follows:
+- Attempt 1: runs immediately (no delay)
+- Attempt 2: after 1 second
+- Attempt 3: after 2 seconds (1s × 2.0)
+- Attempt 4: after 4 seconds (2s × 2.0)
+- Attempt 5: after 8 seconds (4s × 2.0)
+
+Total backoff time before failure: ~15 seconds across 5 attempts, plus the time each attempt takes to run.
+
+**Key differences from default Retry Policy:**
+- **maximum_attempts**: Set to `5` instead of the default `unlimited`. This prevents infinite retries for persistent rate limiting issues and ensures timely failure detection.
+- **maximum_interval**: Set to `10 minutes` instead of the default `100x initial_interval`. This caps retry delays at a reasonable duration.
+
+For rate-limited APIs, capping `maximum_attempts` ensures that Activities don't retry indefinitely if an API is experiencing extended downtime or if your rate limits have been permanently reduced.
+
+This approach respects the API's rate limit guidance while providing a sensible fallback strategy.
+
+### Operational considerations
+
+- **Monitor API usage:** Track actual API calls vs rate limits to adjust configured rate limits in your Temporal Task Queues
+- **API limit changes:** API providers may change rate limits; monitor and update Worker configuration
+- **Burst allowance:** Some APIs allow short bursts above stated limits; test to determine safe Temporal limits
+- **Multiple environments:** Use separate API keys and Task Queues for dev/staging/prod
+
+#### Task Queue backlog and draining strategies
+
+During throttling events, Task Queues can grow significantly. Operators need to switch to **draining mitigation mode** when backlogs occur.
+
+**Critical considerations:**
+- **Determine acceptable SLA/SLO for draining:** How long is acceptable for the queue to drain? Hours? Days?
+- **Cascading failures:** Simply increasing downstream API or Temporal Cloud rate limits may move the bottleneck elsewhere, potentially overwhelming the next component in line
+
+**Mitigation strategies (in order of consideration):**
+
+1. **Request downstream API rate limit increases** - Work with API providers to increase your rate limits
+2. **Request Temporal Cloud rate limit increases** - If using Temporal Cloud, request higher limits
+3. **Scale worker pools** - Add more workers or adjust worker configuration
+4. **Increase internal resources** - Scale your infrastructure
+5. **Identify the next bottleneck** - Determine what will become throttled next to prevent cascading failures
+
+#### CLI commands for operators
+
+**Adjust Task Queue rate limits dynamically:**
+
+```bash
+# Set rate limit for a specific task queue
+temporal task-queue config set --queue-rps-limit 99 --task-queue sendgrid-api
+```
+
+This allows operators to adjust rate limits without redeploying workers.
+
+**Reset stuck activities:**
+
+When activities are stuck at their maximum retry intervals:
+
+```bash
+# Reset activities to retry immediately
+temporal activity reset --workflow-id YourWorkflowId --activity-id YourActivityId
+```
+
+This is useful when downstream APIs recover from outages and you want to immediately retry activities that are waiting at long backoff intervals (e.g., 10 minutes).
+
+## Target audience
+
+- **Temporal Workflow & Activity developers:** Implementing API integrations with rate limiting
+- **Platform operators:** Configuring and monitoring rate-limited Workers
+- **API integration engineers:** Ensuring compliance with third-party API limits
+- **SRE teams:** Preventing API-related outages
+
+This implementation requires Worker configuration, Activity error handling, and monitoring of API usage against limits.
+
+## Prerequisites
+
+### Required software, infrastructure, and tools
+
+- Temporal Server v1.17+ (for `max_task_queue_activities_per_second` support)
+- Python 3.8 or later
+- Temporal Python SDK v1.0.0 or later (`pip install temporalio`)
+- API keys and documentation for external services
+
+### Resources & Access Privileges
+
+- Temporal namespace with permissions to start Workflows and register Workers
+- API keys with known rate limits for external services
+- Access to API provider dashboards to monitor usage
+
+### Required Concepts
+
+- Temporal Workflows, Activities, and Task Queues
+- HTTP client libraries (httpx, requests)
+- API authentication (API keys, OAuth)
+- Exponential backoff and retry strategies
+
+## Architecture diagram(s)
+
+### Rate-Limited Task Queues Architecture
+
+```mermaid
+flowchart TB
+ subgraph Workflow["Workflow Execution"]
+ WF[Notification Workflow]
+ end
+
+ subgraph Routing["Activity Routing"]
+ Decision{Which API?}
+ end
+
+ subgraph Queues["Rate-Limited Task Queues"]
+ SGTQ[sendgrid-api 100/min = 1.67/sec]
+ TWTQ[twilio-api 60/min = 1/sec]
+ AITQ[openai-api 60/min = 1/sec]
+ end
+
+ subgraph Workers["Worker Pools"]
+ subgraph SGWP["SendGrid Workers"]
+ SGW1[Worker 1]
+ SGW2[Worker 2]
+ SGW3[Global Limit: 1.5/sec]
+ end
+ subgraph TWWP["Twilio Workers"]
+ TWW1[Worker 1]
+ TWW2[Worker 2]
+ TWW3[Global Limit: 0.9/sec]
+ end
+ subgraph AIWP["OpenAI Workers"]
+ AIW1[Worker 1]
+ AIW2[Worker 2]
+ AIW3[Global Limit: 0.9/sec]
+ end
+ end
+
+ subgraph APIs["External APIs"]
+ SGAPI[SendGrid API Rate Limit: 100/min]
+ TWAPI[Twilio API Rate Limit: 60/min]
+ AIAPI[OpenAI API Rate Limit: 60/min]
+ end
+
+ WF --> Decision
+
+ Decision -->|Email| SGTQ
+ Decision -->|SMS| TWTQ
+ Decision -->|AI| AITQ
+
+ SGTQ --> SGWP
+ TWTQ --> TWWP
+ AITQ --> AIWP
+
+ SGWP -.->|Respects 100/min limit| SGAPI
+ TWWP -.->|Respects 60/min limit| TWAPI
+ AIWP -.->|Respects 60/min limit| AIAPI
+
+ style SGTQ fill:#e1f5ff
+ style TWTQ fill:#fff4e1
+ style AITQ fill:#f3e5f5
+```
+
+## Implementation plan
+
+### Step 1: Identify API rate limits
+
+Document rate limits for all external APIs your application uses.
+
+**Example rate limit inventory:**
+
+| API | Rate Limit | Concurrency Limit | Plan/Tier | Notes |
+|-----|-----------|-------------------|-----------|-------|
+| SendGrid | 100 req/min | None | Free | Burst: 300/min for 1 min |
+| Stripe | 100 req/sec | None | Standard | Per API key globally |
+| OpenAI | 60 req/min | None | Tier 1 | Also: 90K tokens/min |
+| Twilio | 60 req/min | None | Free | Per phone number |
+| Database | None | 100 connections | N/A | Connection pool limit |
+
+**Actions:**
+1. Review API documentation for each integrated service
+2. Check your current plan/tier limits
+3. Determine if limits are per API key, per account, or per resource
+4. Set Temporal rate limits to 90% of API limits (leave safety buffer)
+
+### Step 2: Define Task Queue constants
+
+**File: `task_queues.py`**
+
+```python
+"""Task Queue constants for rate-limited APIs."""
+
+# Rate-limited API task queues
+SENDGRID_API_QUEUE = "sendgrid-api"
+STRIPE_API_QUEUE = "stripe-api"
+OPENAI_API_QUEUE = "openai-api"
+TWILIO_API_QUEUE = "twilio-api"
+```
+
+### Step 3: Configure Workers with rate limiting
+
+**File: `worker_sendgrid.py`**
+
+```python
+"""Worker for rate-limited SendGrid email activities."""
+import asyncio
+import logging
+from temporalio.client import Client
+from temporalio.worker import Worker
+
+from task_queues import SENDGRID_API_QUEUE
+from activities import send_email, send_batch_email
+
+logging.basicConfig(level=logging.INFO)
+
+
+async def main():
+ client = await Client.connect("localhost:7233")
+
+ worker = Worker(
+ client,
+ task_queue=SENDGRID_API_QUEUE,
+ activities=[send_email, send_batch_email],
+
+ # SendGrid free tier: 100 emails/minute = 1.67/sec
+ # Set to 1.5/sec for safety buffer
+ # This limit applies GLOBALLY across all workers on this queue
+ max_task_queue_activities_per_second=1.5,
+
+ disable_eager_activity_execution=True,
+
+ # Optional: Also limit per-worker to prevent single worker bursts
+ max_activities_per_second=0.5,
+
+ # Limit concurrent connections (SendGrid allows many concurrent)
+ max_concurrent_activities=20,
+ )
+
+ logging.info(
+ f"Starting SendGrid worker on {SENDGRID_API_QUEUE} "
+ f"(max 1.5/sec globally)"
+ )
+ await worker.run()
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+**File: `worker_openai.py`**
+
+```python
+"""Worker for rate-limited OpenAI API activities."""
+import asyncio
+import logging
+from temporalio.client import Client
+from temporalio.worker import Worker
+
+from task_queues import OPENAI_API_QUEUE
+from activities import generate_text, generate_embeddings
+
+logging.basicConfig(level=logging.INFO)
+
+
+async def main():
+ client = await Client.connect("localhost:7233")
+
+ worker = Worker(
+ client,
+ task_queue=OPENAI_API_QUEUE,
+ activities=[generate_text, generate_embeddings],
+
+ # OpenAI Tier 1: 60 requests/minute = 1/sec
+ # Set to 0.9/sec for safety
+ max_task_queue_activities_per_second=0.9,
+
+ # Limit concurrent requests
+ max_concurrent_activities=5,
+ )
+
+ logging.info(
+ f"Starting OpenAI worker on {OPENAI_API_QUEUE} "
+ f"(max 0.9/sec globally)"
+ )
+ await worker.run()
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+**File: `worker_stripe.py`**
+
+```python
+"""Worker for rate-limited Stripe API activities."""
+import asyncio
+import logging
+from temporalio.client import Client
+from temporalio.worker import Worker
+
+from task_queues import STRIPE_API_QUEUE
+from activities import create_payment, refund_payment
+
+logging.basicConfig(level=logging.INFO)
+
+
+async def main():
+ client = await Client.connect("localhost:7233")
+
+ worker = Worker(
+ client,
+ task_queue=STRIPE_API_QUEUE,
+ activities=[create_payment, refund_payment],
+
+ # Stripe: 100 requests/second globally
+ # Set to 90/sec for safety
+ max_task_queue_activities_per_second=90,
+
+ # High concurrency allowed
+ max_concurrent_activities=50,
+ )
+
+ logging.info(
+ f"Starting Stripe worker on {STRIPE_API_QUEUE} "
+ f"(max 90/sec globally)"
+ )
+ await worker.run()
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+**Deployment guidance:**
+- Deploy 2-5 workers per API-specific Task Queue
+- Rate limit is enforced globally by Temporal Service across all workers
+- More workers = better fault tolerance, but rate limit still applies
+- Monitor worker CPU/memory; rate-limited workers typically have low utilization
+
+### Step 4: Implement Activities with proper error handling
+
+**File: `activities.py`**
+
+```python
+"""Activities that call rate-limited external APIs."""
+import httpx
+from temporalio import activity
+from temporalio.exceptions import ApplicationError
+from datetime import timedelta
+
+# API keys (use environment variables in production)
+SENDGRID_API_KEY = "your-sendgrid-key"
+STRIPE_API_KEY = "your-stripe-key"
+OPENAI_API_KEY = "your-openai-key"
+
+
+@activity.defn
+async def send_email(email_data: dict) -> dict:
+ """
+ Send email via SendGrid.
+
+ Rate limited by worker configuration to 1.5/sec globally.
+ """
+ activity.logger.info(f"Sending email to {email_data['to']}")
+
+ async with httpx.AsyncClient() as client:
+ try:
+ response = await client.post(
+ "https://api.sendgrid.com/v3/mail/send",
+ headers={
+ "Authorization": f"Bearer {SENDGRID_API_KEY}",
+ "Content-Type": "application/json",
+ },
+ json={
+ "personalizations": [{"to": [{"email": email_data["to"]}]}],
+ "from": {"email": "noreply@example.com"},
+ "subject": email_data["subject"],
+ "content": [{"type": "text/html", "value": email_data["body"]}],
+ },
+ timeout=30.0,
+ )
+
+ response.raise_for_status()
+ activity.logger.info(f"Email sent to {email_data['to']}")
+
+ return {"status": "sent", "email": email_data["to"]}
+
+ except httpx.HTTPStatusError as e:
+ if e.response.status_code == 429:
+ # Check for Retry-After header (in seconds)
+ retry_after = e.response.headers.get("Retry-After")
+
+ if retry_after:
+ # Use specific retry delay from API
+ retry_delay = timedelta(seconds=int(retry_after))
+ activity.logger.warning(
+ f"Rate limit hit for {email_data['to']}, "
+ f"retry after {retry_after}s"
+ )
+ raise ApplicationError(
+ "Rate limit exceeded",
+ non_retryable=False,
+ next_retry_delay=retry_delay,
+ )
+ else:
+ # Fallback to exponential backoff via retry policy
+ activity.logger.warning(
+ f"Rate limit hit for {email_data['to']}, "
+ f"using exponential backoff"
+ )
+ raise
+
+ # Other HTTP errors
+ activity.logger.error(f"HTTP error {e.response.status_code}: {e}")
+ raise
+
+ except Exception as e:
+ activity.logger.error(f"Failed to send email: {e}")
+ raise
+
+
+@activity.defn
+async def generate_text(prompt: str) -> dict:
+ """
+ Generate text using OpenAI API.
+
+ Rate limited to 0.9/sec globally.
+ """
+ activity.logger.info(f"Generating text for prompt: {prompt[:50]}...")
+
+ async with httpx.AsyncClient() as client:
+ try:
+ response = await client.post(
+ "https://api.openai.com/v1/chat/completions",
+ headers={
+ "Authorization": f"Bearer {OPENAI_API_KEY}",
+ "Content-Type": "application/json",
+ },
+ json={
+ "model": "gpt-3.5-turbo",
+ "messages": [{"role": "user", "content": prompt}],
+ },
+ timeout=60.0,
+ )
+
+ response.raise_for_status()
+ result = response.json()
+
+ activity.logger.info("Text generated successfully")
+ return {
+ "text": result["choices"][0]["message"]["content"],
+ "usage": result["usage"],
+ }
+
+ except httpx.HTTPStatusError as e:
+ if e.response.status_code == 429:
+ # Check for Retry-After header
+ retry_after = e.response.headers.get("Retry-After")
+ error_data = e.response.json()
+
+ if retry_after:
+ # Use specific retry delay from API
+ retry_delay = timedelta(seconds=int(retry_after))
+ activity.logger.warning(
+ f"OpenAI rate limit, retry after {retry_after}s: "
+ f"{error_data.get('error', {}).get('message')}"
+ )
+ raise ApplicationError(
+ "OpenAI rate limit exceeded",
+ non_retryable=False,
+ next_retry_delay=retry_delay,
+ )
+ else:
+ # Fallback to exponential backoff
+ activity.logger.warning(
+ f"OpenAI rate limit, using exponential backoff: "
+ f"{error_data.get('error', {}).get('message')}"
+ )
+ raise
+
+ activity.logger.error(f"OpenAI error {e.response.status_code}: {e}")
+ raise
+
+
+@activity.defn
+async def create_payment(payment_data: dict) -> dict:
+ """
+ Create payment using Stripe API.
+
+ Rate limited to 90/sec globally.
+ """
+ activity.logger.info(
+ f"Creating payment for ${payment_data['amount']/100:.2f}"
+ )
+
+ async with httpx.AsyncClient() as client:
+ try:
+ response = await client.post(
+ "https://api.stripe.com/v1/payment_intents",
+ headers={
+ "Authorization": f"Bearer {STRIPE_API_KEY}",
+ },
+ data={
+ "amount": payment_data["amount"],
+ "currency": payment_data["currency"],
+ "payment_method": payment_data["payment_method"],
+ "confirm": "true",
+ },
+ timeout=30.0,
+ )
+
+ response.raise_for_status()
+ result = response.json()
+
+ activity.logger.info(f"Payment created: {result['id']}")
+ return {
+ "payment_id": result["id"],
+ "status": result["status"],
+ }
+
+ except httpx.HTTPStatusError as e:
+ if e.response.status_code == 429:
+ # Check for Retry-After header
+ retry_after = e.response.headers.get("Retry-After")
+
+ if retry_after:
+ # Use specific retry delay from API
+ retry_delay = timedelta(seconds=int(retry_after))
+ activity.logger.warning(
+ f"Stripe rate limit hit, retry after {retry_after}s"
+ )
+ raise ApplicationError(
+ "Stripe rate limit exceeded",
+ non_retryable=False,
+ next_retry_delay=retry_delay,
+ )
+ else:
+ # Fallback to exponential backoff
+ activity.logger.warning(
+ "Stripe rate limit hit, using exponential backoff"
+ )
+ raise
+
+ activity.logger.error(f"Stripe error {e.response.status_code}")
+ raise
+```
+
+### Step 5: Route Activities to rate-limited queues in Workflows
+
+**File: `notification_workflow.py`**
+
+```python
+"""Workflow that routes to rate-limited API queues."""
+from datetime import timedelta
+from dataclasses import dataclass
+from temporalio import workflow
+
+with workflow.unsafe.imports_passed_through():
+ from task_queues import SENDGRID_API_QUEUE, OPENAI_API_QUEUE, STRIPE_API_QUEUE
+
+
+@dataclass
+class NotificationRequest:
+ user_email: str
+ notification_type: str
+ data: dict
+
+
+@workflow.defn
+class NotificationWorkflow:
+ """
+ Send notifications via multiple rate-limited APIs.
+
+ Routes to appropriate queues based on notification type.
+ """
+
+ @workflow.run
+ async def run(self, request: NotificationRequest) -> dict:
+ workflow.logger.info(
+ f"Processing {request.notification_type} notification "
+ f"for {request.user_email}"
+ )
+
+ results = {}
+
+ # Send email (rate-limited to 1.5/sec)
+ if "email" in request.notification_type:
+ email_result = await workflow.execute_activity(
+ "send_email",
+ {
+ "to": request.user_email,
+ "subject": f"{request.notification_type} Notification",
+ "body": str(request.data),
+ },
+ task_queue=SENDGRID_API_QUEUE,
+ start_to_close_timeout=timedelta(minutes=2),
+ retry_policy=workflow.RetryPolicy(
+ initial_interval=timedelta(seconds=1),
+ maximum_interval=timedelta(minutes=10),
+ backoff_coefficient=2.0,
+ maximum_attempts=5,
+ ),
+ )
+ results["email"] = email_result
+
+ # Generate AI content (rate-limited to 0.9/sec)
+ if "ai" in request.notification_type:
+ ai_result = await workflow.execute_activity(
+ "generate_text",
+ f"Generate notification: {request.data}",
+ task_queue=OPENAI_API_QUEUE,
+ start_to_close_timeout=timedelta(minutes=5),
+ retry_policy=workflow.RetryPolicy(
+ initial_interval=timedelta(seconds=1),
+ maximum_interval=timedelta(minutes=10),
+ backoff_coefficient=2.0,
+ maximum_attempts=5,
+ ),
+ )
+ results["ai"] = ai_result
+
+ workflow.logger.info(f"Notification sent to {request.user_email}")
+ return results
+```
+
+**Starter example:**
+
+```python
+# start_workflow.py
+"""Start notification workflow."""
+import asyncio
+from temporalio.client import Client
+from notification_workflow import NotificationWorkflow, NotificationRequest
+
+
+async def main():
+ client = await Client.connect("localhost:7233")
+
+ # Send 100 notifications - rate limiting prevents API overload
+ for i in range(100):
+ request = NotificationRequest(
+ user_email=f"user{i}@example.com",
+ notification_type="email",
+ data={"message": f"Notification {i}"},
+ )
+
+ handle = await client.start_workflow(
+ NotificationWorkflow.run,
+ request,
+ id=f"notification-{i}",
+ task_queue="workflows",
+ # Disable eager execution to ensure activities go through rate-limited queues
+ disable_eager_activity_execution=True,
+ )
+
+ print(f"Started workflow {i}: {handle.id}")
+
+ print("All workflows started. Rate limiting ensures API compliance.")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+## Conclusion
+
+By implementing rate-limited separate Task Queues for external APIs, you have achieved:
+
+1. **Rate limit compliance:** Activities respect API rate limits (e.g., SendGrid 1.5/sec, OpenAI 0.9/sec), eliminating 429 errors and preventing account suspension
+
+2. **Improved reliability:** Workflows complete successfully without rate limit failures. Global rate limiting ensures that adding more Workers doesn't violate API limits.
+
+3. **Better API relationships:** Consistent rate limit compliance maintains good standing with API providers, avoiding temporary bans or throttling
+
+4. **Independent scaling:** Add more Workers to increase throughput for other operations without exceeding API rate limits (global queue limit still applies)
+
+Your application now safely integrates with rate-limited external APIs, ensuring compliance and reliability.
+
+## Related Resources
+
+### Official Documentation
+- [Temporal Documentation - Task Routing](https://docs.temporal.io/task-routing)
+- [Temporal Best Practices - Separate Task Queues](https://docs.temporal.io/best-practices/worker#separate-task-queues-logically)
+- [Python SDK Worker Configuration](https://python.temporal.io/temporalio.worker.Worker.html)
+
+### Related Patterns
+- [Separate Task Queues - Worker Affinity](worker-execution-affinity) - For activities on same Worker
+
+### Community Resources
+- [Forum: When to Use Multiple Task Queues](https://community.temporal.io/t/in-what-situation-should-we-use-multiple-separated-task-queues/1254)
+- [Slack: Rate Limiting Activities](https://temporal.io/slack) - Search for "rate limiting" discussions
+
+### API Documentation
+- [SendGrid Rate Limits](https://docs.sendgrid.com/api-reference/rate-limits)
+- [Stripe Rate Limits](https://stripe.com/docs/rate-limits)
+- [OpenAI Rate Limits](https://platform.openai.com/docs/guides/rate-limits)
\ No newline at end of file
diff --git a/docs/guides/reliable-document-approvals.mdx b/docs/guides/reliable-document-approvals.mdx
new file mode 100644
index 0000000000..09f0d66abb
--- /dev/null
+++ b/docs/guides/reliable-document-approvals.mdx
@@ -0,0 +1,1364 @@
+---
+id: reliable-document-approvals
+title: Reliable document approvals with human-in-the-loop workflows
+description: Build durable human-in-the-loop Workflows that can recover even if the humans are delayed.
+sidebar_label: Reliable document approvals
+toc_max_heading_level: 2
+author: Cecil Phillip
+tags:
+ - Workflows
+ - Signals
+ - Activities
+ - Timers
+ - Continue-as-new
+ - Child workflows
+ - Task queues
+ - Workers
+---
+
+Document approval processes fail in predictable ways: requests go unanswered, deadlines pass silently, context gets lost across restarts, and audit trails end up incomplete. The usual fixes — polling loops, cron jobs, reminder emails kicked off by a scheduler — create coordination debt that compounds over time. This guide shows how to build an approval system that rules out those failure modes without any of that extra machinery: decisions are waited on durably, SLAs are enforced automatically, escalations happen without a scheduler, and every action is recorded whether or not your infrastructure stays up.
+
+### Problem statement
+
+An approval request may not get a response until the following week — or at all. Between submission and decision, a lot can go wrong: the request gets buried in an inbox, the original approver is out, the system restarts and loses state, or nobody notices the deadline passed. Patching this with a database, a cron job, a notification service, and custom reconciliation logic means maintaining four systems to do one thing.
+
+### Solution
+
+The approval process runs as a single Workflow that holds its own state and waits — without consuming compute — for as long as it takes. Deadlines are tracked by durable timers that fire whether or not Workers restart. Reminders go out on schedule. If no one responds in time, escalation happens automatically. Resubmissions loop back cleanly. A complete audit record is written at every step. The infrastructure can fail and recover; the approval process continues from exactly where it left off.
+
+### Outcomes
+
+After working through this guide, you'll have a running approval system where:
+
+- **Requests don't disappear when infrastructure does**: Temporal's Event History replays the Workflow to its exact pre-failure state, so no approval decisions or audit entries are ever lost.
+- **Deadlines enforce themselves**: Durable timers track every SLA and fire precisely when a deadline expires — no external scheduler, no polling loop, no cron job.
+- **Escalations happen automatically when reviewers go quiet**: If an approver doesn't respond within the SLA window, the Workflow escalates to a backup approver without any external trigger.
+- **Resubmissions work without restarting the process from scratch**: Change requests loop back cleanly, preserving full context and the existing audit trail.
+- **Every action is in the audit record**: Decisions, reminders, escalations, and status changes are all recorded — whether from a Signal, an Activity, or a timer firing.
+
+---
+
+## Background and best practices
+
+### Why human-in-the-loop matters
+
+Some decisions can't be automated — a contract requiring legal sign-off, a patient record needing clinical approval, an AI-generated document awaiting human review. A system that drops a request, misses a deadline, or produces an incomplete audit trail creates real business and legal exposure.
+
+### How Temporal enables the "wait" state
+
+When a Workflow calls `workflow.wait_condition()`, **the Worker returns the current task to the Temporal Server and becomes idle** — consuming no compute. The Workflow's state is persisted and suspended on the Server. When a Signal arrives or a timeout fires, the Server schedules a new Workflow Task, the Worker replays the Event History to reconstruct the Workflow's state, and execution resumes from the `wait_condition` call. This mechanism works identically whether the wait is five seconds or five months — durable timers are persisted to the Server's database and survive Worker restarts, deployments, and infrastructure migrations.
+
+### Event History management
+
+Temporal's Event History has a hard limit of 51,200 events or 50 MB ([Workflow Execution limits](https://docs.temporal.io/workflow-execution/limits)). The Server emits warnings at ~10,240 events or 10 MB; hitting either hard limit terminates the Workflow. For this approval pattern:
+
+| Workflow operation | Approximate events generated |
+|--------------------|------------------------------|
+| Workflow start (baseline) | 3 events |
+| Single Activity execution | ~8 events |
+| Timer (`asyncio.sleep` / `workflow.sleep`) | ~4 events |
+| Signal received | ~4 events |
+| Query received | 0 events (not recorded) |
+| Continue-As-New | 1 event on the old run; fresh history on the new run |
+
+A single approval with reminders and escalation generates roughly 40–80 events — well within limits. If your process supports resubmission loops or runs for months, use `workflow.info().is_continue_as_new_suggested()` to detect when the Server recommends a Continue-As-New, and act on it at a safe boundary.
+
+### Workflow determinism
+
+Workflow code must be deterministic because Temporal uses *history replay* to reconstruct state after Worker restarts. If the code generates different commands on replay, the SDK raises a `NondeterminismError`.
+
+Key rules:
+
+- Use `workflow.now()` instead of `datetime.now()` for timestamps
+- Use `workflow.uuid4()` instead of `uuid.uuid4()` for generating unique identifiers
+- Use `workflow.random()` instead of Python's `random` module
+- Never perform I/O (network calls, file reads, database queries) directly in Workflow code — delegate to Activities
+- Use `workflow.logger` instead of `print()` for replay-safe logging
+
+### Activity idempotency
+
+Activities may be retried if a Worker crashes mid-execution. Each Activity must be designed so that executing it twice with the same inputs produces the same outcome:
+
+- **Idempotency keys**: Include the Workflow Id and a step identifier in every external API call.
+- **Check-before-act**: Query the external system's state before performing a mutation.
+- **Upsert semantics**: Use database upsert operations for audit log entries keyed on Workflow Id and approval stage.
+
+### Long-running Activities and Heartbeating
+
+For Activities that may take minutes (PDF report generation, rate-limited API calls):
+
+- Set `start_to_close_timeout` to the maximum time for a single attempt.
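+- Set `heartbeat_timeout` to the maximum acceptable interval between progress reports. If the Worker crashes, the Server detects the missed Heartbeat once this interval elapses and schedules a retry according to the Retry Policy, instead of waiting for the full `start_to_close_timeout`.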
+- Call `activity.heartbeat()` periodically with a progress payload. On retry, read `activity.info().heartbeat_details` to resume from where it left off.
+
+---
+
+## Prerequisites
+
+### Required software and infrastructure
+
+- Python **3.11** or later
+- Temporal Python SDK (`temporalio`) **1.7.0** or later
+- A running Temporal Server — either the [Temporal CLI](https://docs.temporal.io/cli) development server (`temporal server start-dev`) or [Temporal Cloud](https://temporal.io/cloud)
+- `uv` or `pip` for Python dependency management
+
+### Resources and access privileges
+
+- (Optional) SMTP server or third-party notification API credentials for the notification Activity
+
+### Required concepts
+
+Readers should be familiar with the following concepts:
+
+- [Signals](https://docs.temporal.io/signals) — the mechanism for sending data to a running Workflow
+- [Queries](https://docs.temporal.io/queries) — the mechanism for reading Workflow state without affecting execution
+- [Workers](https://docs.temporal.io/workers) — the processes that execute Workflow and Activity code
+- [Task Queues](https://docs.temporal.io/task-queues) — the named queues that route work to Workers
+- Python `asyncio` — coroutines, `await`, and the event loop
+- Python dataclasses or [Pydantic](https://docs.pydantic.dev/) for structured data
+
+---
+
+## Architecture diagram
+
+The following sequence diagram illustrates the complete lifecycle of a document approval Workflow, from submission through approval to final disposition.
+
+```mermaid
+sequenceDiagram
+ participant Submitter as Document Submitter
+ participant API as Web API / Client
+ participant TS as Temporal Server
+ participant W as Worker (Python)
+ participant NS as Notification Service
+ participant AuditDB as Audit Log Store
+
+ Submitter->>API: Submit document for approval
+ API->>TS: start_workflow(DocumentApprovalWorkflow, document)
+ TS->>W: Schedule Workflow Task
+ W->>TS: Schedule Activity: store_document
+ TS->>W: Execute store_document Activity
+ W-->>TS: Document stored
+ W->>TS: Schedule Activity: send_notification (to approver)
+ TS->>W: Execute send_notification Activity
+ W->>NS: Send approval request email
+ NS-->>W: Notification sent
+ W-->>TS: Workflow waiting for Signal (approval decision)
+
+ Note over TS,W: Workflow is suspended. Zero compute consumed.
+
+ alt Approver responds within SLA
+ Submitter->>API: Approve document
+ API->>TS: signal(workflow_id, "submit_decision", APPROVED)
+ TS->>W: Deliver Signal + Schedule Workflow Task
+ W->>TS: Schedule Activity: record_audit_entry
+ TS->>W: Execute record_audit_entry Activity
+ W->>AuditDB: Write audit entry
+ W->>TS: Schedule Activity: generate_approval_report
+ Note over W: Generate final report and complete...
+ else SLA deadline expires
+ TS->>W: Timer fires, Workflow Task scheduled
+ W->>W: Handle TimeoutError — escalate
+ W->>TS: Schedule Activity: send_notification (to escalation contact)
+ else Approver rejects
+ Submitter->>API: Reject document
+ API->>TS: signal(workflow_id, "submit_decision", REJECTED)
+ TS->>W: Deliver Signal
+ W->>TS: Schedule Activity: record_audit_entry
+ W->>TS: Schedule Activity: send_notification (rejection notice)
+ W-->>TS: Workflow completes with REJECTED status
+ end
+```
+
+---
+
+## Implementation plan
+
+The following phases walk through the complete implementation. Each phase builds on the previous one.
+
+### Phase 1: Define the data models
+
+Start by defining the data structures that represent documents, approval decisions, and the state that flows through the Workflow. Using Python dataclasses keeps the models serializable by Temporal's default JSON data converter without additional configuration.
+
+Create a file named `models.py`:
+
+```python
+"""Data models for the document approval system."""
+
+from __future__ import annotations
+
+import enum
+from dataclasses import dataclass, field
+from datetime import datetime
+
+
+class ApprovalStatus(str, enum.Enum):
+ """Possible statuses for an approval decision."""
+
+ PENDING = "PENDING"
+ APPROVED = "APPROVED"
+ REJECTED = "REJECTED"
+ ESCALATED = "ESCALATED"
+ TIMED_OUT = "TIMED_OUT"
+ CHANGES_REQUESTED = "CHANGES_REQUESTED"
+
+
+class DocumentStatus(str, enum.Enum):
+ """Overall status of the document approval process."""
+
+ SUBMITTED = "SUBMITTED"
+ IN_REVIEW = "IN_REVIEW"
+ APPROVED = "APPROVED"
+ REJECTED = "REJECTED"
+ WITHDRAWN = "WITHDRAWN"
+
+
+@dataclass
+class ApproverConfig:
+ """Configuration for the approver."""
+
+ approver_email: str
+ approver_name: str
+ sla_seconds: int # Maximum time to wait for the approver's decision
+ escalation_email: str | None = None # Backup approver if SLA expires
+ reminder_interval_seconds: int = 86400 # Send reminders every 24 hours
+ max_reminders: int = 3 # Maximum number of reminders to send
+ resubmission_timeout_seconds: int = 604800 # Time to allow resubmission after changes requested (default 7 days)
+
+
+@dataclass
+class ApprovalDecision:
+ """A decision submitted by an approver."""
+
+ approver_email: str
+ status: ApprovalStatus
+ comment: str = ""
+ decided_at: str = "" # ISO 8601 timestamp, set by the Workflow
+
+
+@dataclass
+class AuditEntry:
+ """A single entry in the approval audit trail."""
+
+ timestamp: str # ISO 8601
+ workflow_id: str
+ action: str
+ actor: str
+ details: str = ""
+ approval_level: int = 0
+
+
+@dataclass
+class DocumentSubmission:
+ """Input to the document approval Workflow."""
+
+ document_id: str
+ title: str
+ submitter_email: str
+ submitter_name: str
+ content_url: str # Reference to the stored document (Claim Check pattern)
+ approver: ApproverConfig = field(default_factory=lambda: ApproverConfig(
+ approver_email="",
+ approver_name="",
+ sla_seconds=86400,
+ ))
+ metadata: dict[str, str] = field(default_factory=dict)
+
+
+@dataclass
+class ApprovalState:
+ """Mutable state carried through the approval process and passed to Continue-As-New."""
+
+ document: DocumentSubmission
+ status: DocumentStatus = DocumentStatus.SUBMITTED
+ decision: ApprovalDecision | None = None
+ audit_trail: list[AuditEntry] = field(default_factory=list)
+ resubmission_count: int = 0
+ max_resubmissions: int = 3
+ document_stored: bool = False
+```
+
+The `ApprovalState` dataclass carries everything the Workflow needs to resume after a Continue-As-New.
+
+### Phase 2: Define the Activities
+
+Create a file named `activities.py`:
+
+```python
+"""Activities for the document approval system.
+
+Each Activity performs a side-effecting operation and is designed to be idempotent.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import timedelta
+
+from temporalio import activity
+from temporalio.exceptions import ApplicationError
+
+from models import AuditEntry, DocumentSubmission
+
+
+@dataclass
+class NotificationRequest:
+ """Input for the send_notification Activity."""
+
+ recipient_email: str
+ recipient_name: str
+ subject: str
+ body: str
+ idempotency_key: str # Prevents duplicate sends on retry
+
+
+@dataclass
+class StoreDocumentRequest:
+ """Input for the store_document Activity."""
+
+ document_id: str
+ title: str
+ content_url: str
+ submitter_email: str
+
+
+@activity.defn
+def send_notification(request: NotificationRequest) -> bool:
+ """Send a notification; raises ApplicationError for invalid input (non-retryable)."""
+ # Validate input — non-retryable business logic error
+ if not request.recipient_email or "@" not in request.recipient_email:
+ raise ApplicationError(
+ f"Invalid recipient email: {request.recipient_email}"
+ )
+
+ if not request.subject or not request.subject.strip():
+ raise ApplicationError("Notification subject cannot be empty")
+
+ activity.logger.info(
+ "Sending notification",
+ extra={
+ "recipient": request.recipient_email,
+ "subject": request.subject,
+ "idempotency_key": request.idempotency_key,
+ },
+ )
+
+ # --- Add your notification provider ----
+
+ activity.logger.info(
+ "Notification sent successfully",
+ extra={"recipient": request.recipient_email},
+ )
+ return True
+
+
+@activity.defn
+def record_audit_entry(entry: AuditEntry) -> bool:
+ """Write an audit entry; uses upsert semantics to prevent duplicates on retry."""
+ activity.logger.info(
+ "Recording audit entry",
+ extra={
+ "workflow_id": entry.workflow_id,
+ "action": entry.action,
+ "actor": entry.actor,
+ "level": entry.approval_level,
+ },
+ )
+
+ # --- Add your audit log store ---
+
+ return True
+
+
+@activity.defn
+def store_document(request: StoreDocumentRequest) -> str:
+ """Persist document metadata; idempotent — storing the same document_id twice overwrites with identical data."""
+ activity.logger.info(
+ "Storing document",
+ extra={
+ "document_id": request.document_id,
+ "title": request.title,
+ },
+ )
+
+ # --- Add your document store ---
+
+ return f"doc-store://{request.document_id}"
+
+
+@activity.defn
+def generate_approval_report(
+ document_id: str,
+ decisions: list[dict],
+) -> str:
+ """Generate an approval summary report; uses heartbeating to support long runs."""
+ activity.logger.info(
+ "Generating approval report",
+ extra={"document_id": document_id, "decision_count": len(decisions)},
+ )
+
+ total_steps = len(decisions)
+ for i, decision in enumerate(decisions):
+ # Simulate report generation work for each decision
+ activity.logger.info(
+ f"Processing decision {i + 1}/{total_steps}",
+ extra={"approver": decision.get("approver_email", "unknown")},
+ )
+
+ # Heartbeat with progress so the server can detect Worker crashes.
+ activity.heartbeat(i + 1)
+
+ report_url = f"reports://{document_id}/approval-summary"
+ activity.logger.info(
+ "Approval report generated",
+ extra={"document_id": document_id, "report_url": report_url},
+ )
+ return report_url
+```
+
+### Phase 3: Define the approval Workflow
+
+Create a file named `workflows.py`:
+
+```python
+"""Document approval Workflow with SLA enforcement, escalation, resubmission handling, and audit logging."""
+
+from __future__ import annotations
+
+import asyncio
+from dataclasses import asdict
+from datetime import timedelta
+
+from temporalio import workflow
+from temporalio.common import RetryPolicy
+
+with workflow.unsafe.imports_passed_through():
+ from activities import (
+ NotificationRequest,
+ StoreDocumentRequest,
+ generate_approval_report,
+ record_audit_entry,
+ send_notification,
+ store_document,
+ )
+ from models import (
+ ApprovalDecision,
+ ApprovalState,
+ ApprovalStatus,
+ ApproverConfig,
+ AuditEntry,
+ DocumentStatus,
+ DocumentSubmission,
+ )
+
+
+# Retry policy for notification Activities.
+NOTIFICATION_RETRY_POLICY = RetryPolicy(
+ initial_interval=timedelta(seconds=5),
+ backoff_coefficient=2.0,
+ maximum_interval=timedelta(minutes=2),
+ maximum_attempts=5,
+)
+
+# Retry policy for audit logging.
+AUDIT_RETRY_POLICY = RetryPolicy(
+ initial_interval=timedelta(seconds=2),
+ backoff_coefficient=2.0,
+ maximum_interval=timedelta(minutes=5),
+ maximum_attempts=10,
+)
+
+# Retry policy for document storage.
+STORAGE_RETRY_POLICY = RetryPolicy(
+ initial_interval=timedelta(seconds=1),
+ backoff_coefficient=2.0,
+ maximum_interval=timedelta(minutes=1),
+ maximum_attempts=5,
+)
+
+# Retry policy for long-running report generation.
+REPORT_RETRY_POLICY = RetryPolicy(
+ initial_interval=timedelta(seconds=10),
+ backoff_coefficient=2.0,
+ maximum_interval=timedelta(minutes=5),
+ maximum_attempts=3,
+)
+
+
+@workflow.defn
+class DocumentApprovalWorkflow:
+
+ @workflow.init
+ def __init__(
+ self, input_data: DocumentSubmission | ApprovalState
+ ) -> None:
+ if isinstance(input_data, ApprovalState):
+ self._state = input_data
+ else:
+ self._state = ApprovalState(document=input_data)
+
+ self._pending_decision: ApprovalDecision | None = None
+ self._processed_update_ids: set[str] = set()
+
+ @workflow.signal
+ async def submit_decision(self, decision: ApprovalDecision) -> None:
+ workflow.logger.info(
+ "Decision received via Signal",
+ extra={
+ "approver": decision.approver_email,
+ "status": decision.status,
+ },
+ )
+ self._pending_decision = decision
+
+ @workflow.signal
+ async def withdraw(self) -> None:
+ workflow.logger.info("Document withdrawal requested")
+ self._state.status = DocumentStatus.WITHDRAWN
+
+ @workflow.query
+ def get_status(self) -> dict:
+ return {
+ "document_id": self._state.document.document_id,
+ "title": self._state.document.title,
+ "status": self._state.status.value,
+ "decision": asdict(self._state.decision) if self._state.decision else None,
+ "resubmission_count": self._state.resubmission_count,
+ }
+
+ @workflow.query
+ def get_audit_trail(self) -> list[dict]:
+ """Return the in-memory audit trail for this Workflow."""
+ return [asdict(e) for e in self._state.audit_trail]
+
+ @workflow.update
+ async def resubmit_document(self, update_id: str, new_content_url: str) -> dict:
+ # Prevent duplicate processing of the same update
+ if update_id in self._processed_update_ids:
+ return {
+ "accepted": True,
+ "reason": "Resubmission already processed",
+ "duplicate": True,
+ }
+
+ if self._state.status != DocumentStatus.REJECTED:
+ return {
+ "accepted": False,
+ "reason": (
+ f"Document is in {self._state.status.value} state. "
+ "Resubmission is only allowed after rejection."
+ ),
+ }
+
+ if self._state.resubmission_count >= self._state.max_resubmissions:
+ return {
+ "accepted": False,
+ "reason": (
+ f"Maximum resubmissions ({self._state.max_resubmissions}) "
+ "reached."
+ ),
+ }
+
+ self._state.document.content_url = new_content_url
+ self._state.resubmission_count += 1
+ self._state.status = DocumentStatus.SUBMITTED
+ self._pending_decision = None
+ self._processed_update_ids.add(update_id)
+
+ workflow.logger.info(
+ "Document resubmitted",
+ extra={
+ "resubmission_count": self._state.resubmission_count,
+ "new_content_url": new_content_url,
+ },
+ )
+
+ return {
+ "accepted": True,
+ "resubmission_count": self._state.resubmission_count,
+ }
+
+ @resubmit_document.validator
+ def validate_resubmit(self, update_id: str, new_content_url: str) -> None:
+ """Validate the resubmission URL before accepting the Update."""
+ if not update_id or not update_id.strip():
+ raise ValueError("update_id must not be empty")
+ if not new_content_url or not new_content_url.strip():
+ raise ValueError("new_content_url must not be empty")
+
+ @workflow.run
+ async def run(
+ self, input_data: DocumentSubmission | ApprovalState
+ ) -> dict:
+ workflow_id = workflow.info().workflow_id
+
+ await self._record_audit(
+ workflow_id=workflow_id,
+ action="WORKFLOW_STARTED",
+ actor=self._state.document.submitter_email,
+ details=(
+ f"Document '{self._state.document.title}' submitted "
+ f"for approval (resubmission #{self._state.resubmission_count})"
+ ),
+ )
+
+ if not self._state.document_stored:
+ await self._store_document(workflow_id)
+ self._state.document_stored = True
+
+ if workflow.info().is_continue_as_new_suggested():
+ workflow.logger.info("Continue-As-New suggested, resetting Event History")
+ await workflow.wait_condition(workflow.all_handlers_finished)
+ workflow.continue_as_new(args=[self._state])
+
+ self._state.status = DocumentStatus.IN_REVIEW
+ approver_config = self._state.document.approver
+
+ workflow.logger.info(
+ "Approval process started",
+ extra={
+ "approver": approver_config.approver_email,
+ "sla_seconds": approver_config.sla_seconds,
+ },
+ )
+
+ await self._send_approval_request(workflow_id, approver_config)
+
+ result: dict = {}
+ while True:
+ decision = await self._wait_for_decision(workflow_id, approver_config)
+
+ if self._state.status == DocumentStatus.WITHDRAWN:
+ await self._record_audit(
+ workflow_id=workflow_id,
+ action="DOCUMENT_WITHDRAWN",
+ actor=self._state.document.submitter_email,
+ details="Document withdrawn by submitter",
+ )
+ result = self._build_result("Document withdrawn by submitter")
+ break
+
+ result = await self._handle_decision(workflow_id, approver_config, decision)
+
+ if self._state.status != DocumentStatus.SUBMITTED:
+ break
+
+ if workflow.info().is_continue_as_new_suggested():
+ workflow.logger.info(
+ "Continue-As-New suggested before resubmission cycle"
+ )
+ await workflow.wait_condition(workflow.all_handlers_finished)
+ workflow.continue_as_new(args=[self._state])
+
+ self._state.status = DocumentStatus.IN_REVIEW
+ await self._send_approval_request(workflow_id, approver_config)
+
+ # Wait for all in-flight Signal and Update handlers to complete
+ # before returning
+ await workflow.wait_condition(workflow.all_handlers_finished)
+
+ return result
+
+ async def _wait_for_decision(
+ self, workflow_id: str, approver_config: ApproverConfig
+ ) -> ApprovalDecision:
+ sla_timeout = timedelta(seconds=approver_config.sla_seconds)
+ reminder_interval = timedelta(
+ seconds=approver_config.reminder_interval_seconds
+ )
+
+ self._pending_decision = None
+
+ reminder_task = asyncio.create_task(
+ self._send_reminders(workflow_id, approver_config, reminder_interval)
+ )
+
+ try:
+ await workflow.wait_condition(
+ lambda: (
+ self._pending_decision is not None
+ or self._state.status == DocumentStatus.WITHDRAWN
+ ),
+ timeout=sla_timeout,
+ )
+
+ reminder_task.cancel()
+ try:
+ await reminder_task
+ except asyncio.CancelledError:
+ pass
+
+ if self._state.status == DocumentStatus.WITHDRAWN:
+ return ApprovalDecision(
+ approver_email=self._state.document.submitter_email,
+ status=ApprovalStatus.REJECTED,
+ comment="Document withdrawn",
+ decided_at=workflow.now().isoformat(),
+ )
+
+ decision = self._pending_decision
+ assert decision is not None
+ self._pending_decision = None
+ return decision
+
+ except asyncio.TimeoutError:
+ reminder_task.cancel()
+ try:
+ await reminder_task
+ except asyncio.CancelledError:
+ pass
+
+ return await self._handle_escalation(workflow_id, approver_config)
+
+ async def _send_reminders(
+ self, workflow_id: str, approver_config: ApproverConfig, interval: timedelta
+ ) -> None:
+ reminder_count = 0
+ while True:
+ await asyncio.sleep(interval.total_seconds())
+ reminder_count += 1
+
+ if reminder_count > approver_config.max_reminders:
+ workflow.logger.info(
+ "Max reminders reached, stopping reminder loop"
+ )
+ break
+
+ workflow.logger.info(
+ "Sending reminder",
+ extra={
+ "reminder_count": reminder_count,
+ "approver": approver_config.approver_email,
+ },
+ )
+
+ await workflow.execute_activity(
+ send_notification,
+ NotificationRequest(
+ recipient_email=approver_config.approver_email,
+ recipient_name=approver_config.approver_name,
+ subject=(
+ f"Reminder: Approval needed for "
+ f"'{self._state.document.title}'"
+ ),
+ body=f"Reminder #{reminder_count}: please review '{self._state.document.title}'.",
+ idempotency_key=(
+ f"{workflow_id}-reminder-R{reminder_count}"
+ ),
+ ),
+ start_to_close_timeout=timedelta(seconds=30),
+ retry_policy=NOTIFICATION_RETRY_POLICY,
+ )
+
+ await self._record_audit(
+ workflow_id=workflow_id,
+ action="REMINDER_SENT",
+ actor="system",
+ details=(
+ f"Reminder #{reminder_count} sent to "
+ f"{approver_config.approver_email}"
+ ),
+ )
+
+ async def _handle_escalation(
+ self, workflow_id: str, approver_config: ApproverConfig
+ ) -> ApprovalDecision:
+ await self._record_audit(
+ workflow_id=workflow_id,
+ action="SLA_EXPIRED",
+ actor="system",
+ details=(
+ f"SLA expired for {approver_config.approver_email} "
+ f"after {approver_config.sla_seconds} seconds"
+ ),
+ )
+
+ if approver_config.escalation_email:
+ workflow.logger.info(
+ "Escalating to backup approver",
+ extra={
+ "original_approver": approver_config.approver_email,
+ "escalation_contact": approver_config.escalation_email,
+ },
+ )
+
+ await workflow.execute_activity(
+ send_notification,
+ NotificationRequest(
+ recipient_email=approver_config.escalation_email,
+ recipient_name="Escalation Contact",
+ subject=(
+ f"ESCALATION: Approval needed for "
+ f"'{self._state.document.title}'"
+ ),
+ body=f"Original approver ({approver_config.approver_email}) did not respond within the SLA. Please review and approve or reject.",
+ idempotency_key=f"{workflow_id}-escalation",
+ ),
+ start_to_close_timeout=timedelta(seconds=30),
+ retry_policy=NOTIFICATION_RETRY_POLICY,
+ )
+
+ await self._record_audit(
+ workflow_id=workflow_id,
+ action="ESCALATION_SENT",
+ actor="system",
+ details=f"Escalated to {approver_config.escalation_email}",
+ )
+
+ self._pending_decision = None
+ try:
+ await workflow.wait_condition(
+ lambda: (
+ self._pending_decision is not None
+ or self._state.status == DocumentStatus.WITHDRAWN
+ ),
+ timeout=timedelta(seconds=approver_config.sla_seconds),
+ )
+
+ if self._state.status == DocumentStatus.WITHDRAWN:
+ return ApprovalDecision(
+ approver_email=self._state.document.submitter_email,
+ status=ApprovalStatus.REJECTED,
+ comment="Document withdrawn during escalation",
+ decided_at=workflow.now().isoformat(),
+ )
+
+ decision = self._pending_decision
+ assert decision is not None
+ self._pending_decision = None
+ return decision
+
+ except asyncio.TimeoutError:
+ await self._record_audit(
+ workflow_id=workflow_id,
+ action="ESCALATION_TIMEOUT",
+ actor="system",
+ details=(
+ f"Escalation contact {approver_config.escalation_email} "
+ f"also timed out"
+ ),
+ )
+
+ return ApprovalDecision(
+ approver_email="system",
+ status=ApprovalStatus.TIMED_OUT,
+ comment=(
+ "Auto-rejected: both the original approver and "
+ "escalation contact failed to respond within the SLA"
+ ),
+ decided_at=workflow.now().isoformat(),
+ )
+ else:
+ workflow.logger.info("No escalation contact configured, auto-rejecting")
+
+ await self._record_audit(
+ workflow_id=workflow_id,
+ action="AUTO_REJECTED",
+ actor="system",
+ details=(
+ f"No escalation contact configured for {approver_config.approver_email}"
+ ),
+ )
+
+ return ApprovalDecision(
+ approver_email="system",
+ status=ApprovalStatus.TIMED_OUT,
+ comment=(
+ f"Auto-rejected: approver {approver_config.approver_email} "
+ f"did not respond within {approver_config.sla_seconds} seconds"
+ ),
+ decided_at=workflow.now().isoformat(),
+ )
+
+ async def _handle_decision(
+ self,
+ workflow_id: str,
+ approver_config: ApproverConfig,
+ decision: ApprovalDecision,
+ ) -> dict:
+ # Stamp the decision with processing time using the deterministic Workflow clock
+ decision.decided_at = workflow.now().isoformat()
+ self._state.decision = decision
+
+ await self._record_audit(
+ workflow_id=workflow_id,
+ action=f"DECISION_{decision.status.value}",
+ actor=decision.approver_email,
+ details=decision.comment,
+ )
+
+ if decision.status == ApprovalStatus.APPROVED:
+ workflow.logger.info(
+ "Document approved",
+ extra={"approver": decision.approver_email},
+ )
+
+ report_url = await self._finalize_approval(workflow_id)
+
+ await workflow.execute_activity(
+ send_notification,
+ NotificationRequest(
+ recipient_email=self._state.document.submitter_email,
+ recipient_name=self._state.document.submitter_name,
+ subject=(
+ f"Document approved: '{self._state.document.title}'"
+ ),
+ body=f"Your document has been approved by {decision.approver_email}. Approval report: {report_url}",
+ idempotency_key=f"{workflow_id}-final-approval",
+ ),
+ start_to_close_timeout=timedelta(seconds=30),
+ retry_policy=NOTIFICATION_RETRY_POLICY,
+ )
+
+ return self._build_result(
+ message="Document approved",
+ report_url=report_url,
+ )
+
+ elif decision.status == ApprovalStatus.CHANGES_REQUESTED:
+ workflow.logger.info(
+ "Changes requested",
+ extra={"approver": decision.approver_email},
+ )
+
+ self._state.status = DocumentStatus.REJECTED
+
+ await workflow.execute_activity(
+ send_notification,
+ NotificationRequest(
+ recipient_email=self._state.document.submitter_email,
+ recipient_name=self._state.document.submitter_name,
+ subject=(
+ f"Changes requested: "
+ f"'{self._state.document.title}'"
+ ),
+ body=f"Approver {decision.approver_email} has requested changes: {decision.comment}. You may resubmit.",
+ idempotency_key=(
+ f"{workflow_id}-changes-requested-"
+ f"S{self._state.resubmission_count}"
+ ),
+ ),
+ start_to_close_timeout=timedelta(seconds=30),
+ retry_policy=NOTIFICATION_RETRY_POLICY,
+ )
+
+ resubmission_timeout = timedelta(
+ seconds=approver_config.resubmission_timeout_seconds
+ )
+ try:
+ await workflow.wait_condition(
+ lambda: (
+ self._state.status == DocumentStatus.SUBMITTED
+ or self._state.status == DocumentStatus.WITHDRAWN
+ ),
+ timeout=resubmission_timeout,
+ )
+
+ if self._state.status == DocumentStatus.WITHDRAWN:
+ await self._record_audit(
+ workflow_id=workflow_id,
+ action="DOCUMENT_WITHDRAWN",
+ actor=self._state.document.submitter_email,
+ details="Document withdrawn after changes requested",
+ )
+ return self._build_result(
+ message="Document withdrawn by submitter"
+ )
+
+ workflow.logger.info(
+ "Resubmission accepted",
+ extra={"resubmission_count": self._state.resubmission_count},
+ )
+
+ if self._state.resubmission_count >= self._state.max_resubmissions:
+ self._state.status = DocumentStatus.REJECTED
+ await self._record_audit(
+ workflow_id=workflow_id,
+ action="MAX_RESUBMISSIONS_EXCEEDED",
+ actor="system",
+ details=f"Maximum resubmissions ({self._state.max_resubmissions}) exceeded",
+ )
+ return self._build_result(
+ message="Maximum resubmission attempts exceeded"
+ )
+
+ return self._build_result(message="Resubmission accepted")
+
+ except asyncio.TimeoutError:
+ workflow.logger.info(
+ "Resubmission window expired"
+ )
+ self._state.status = DocumentStatus.REJECTED
+ await self._record_audit(
+ workflow_id=workflow_id,
+ action="RESUBMISSION_TIMEOUT",
+ actor="system",
+ details=(
+ "Resubmission window expired after 7 days"
+ ),
+ )
+ return self._build_result(
+ message="Resubmission deadline expired, document rejected"
+ )
+
+ else:
+ workflow.logger.info(
+ "Document rejected",
+ extra={"status": decision.status.value},
+ )
+
+ self._state.status = DocumentStatus.REJECTED
+
+ await workflow.execute_activity(
+ send_notification,
+ NotificationRequest(
+ recipient_email=self._state.document.submitter_email,
+ recipient_name=self._state.document.submitter_name,
+ subject=(
+ f"Document rejected: "
+ f"'{self._state.document.title}'"
+ ),
+ body=f"Your document was {decision.status.value.lower()}. Reason: {decision.comment}",
+ idempotency_key=(
+ f"{workflow_id}-rejected-"
+ f"S{self._state.resubmission_count}"
+ ),
+ ),
+ start_to_close_timeout=timedelta(seconds=30),
+ retry_policy=NOTIFICATION_RETRY_POLICY,
+ )
+ return self._build_result(
+ message="Document rejected by approver"
+ )
+
+ async def _finalize_approval(self, workflow_id: str) -> str:
+ """Generate the approval report and return the report URL."""
+ self._state.status = DocumentStatus.APPROVED
+
+ await self._record_audit(
+ workflow_id=workflow_id,
+ action="DOCUMENT_APPROVED",
+ actor="system",
+ details="Document approved by approver",
+ )
+
+ decision_data = asdict(self._state.decision)
+ report_url = await workflow.execute_activity(
+ generate_approval_report,
+ self._state.document.document_id,
+ [decision_data],
+ start_to_close_timeout=timedelta(minutes=5),
+ heartbeat_timeout=timedelta(minutes=2),
+ retry_policy=REPORT_RETRY_POLICY,
+ )
+
+ return report_url
+
+ async def _store_document(self, workflow_id: str) -> None:
+ """Store the document and record the audit entry.
+
+ Runs the ``store_document`` Activity with the submission's id, title,
+ content URL, and submitter email, logs the storage reference it returns,
+ and then records a ``DOCUMENT_STORED`` audit entry attributed to the
+ submitter.
+ """
+ storage_ref = await workflow.execute_activity(
+ store_document,
+ StoreDocumentRequest(
+ document_id=self._state.document.document_id,
+ title=self._state.document.title,
+ content_url=self._state.document.content_url,
+ submitter_email=self._state.document.submitter_email,
+ ),
+ start_to_close_timeout=timedelta(seconds=60),
+ retry_policy=STORAGE_RETRY_POLICY,
+ )
+
+ workflow.logger.info(
+ "Document stored",
+ extra={"storage_ref": storage_ref},
+ )
+
+ # The audit entry is written only after the storage Activity has returned.
+ await self._record_audit(
+ workflow_id=workflow_id,
+ action="DOCUMENT_STORED",
+ actor=self._state.document.submitter_email,
+ details=f"Document stored at {storage_ref}",
+ )
+
+ async def _send_approval_request(
+ self, workflow_id: str, approver_config: ApproverConfig
+ ) -> None:
+ """Send the initial approval request to the approver.
+
+ Runs the ``send_notification`` Activity addressed to the configured
+ approver, then records an ``APPROVAL_REQUESTED`` audit entry with
+ ``"system"`` as the actor.
+ """
+ await workflow.execute_activity(
+ send_notification,
+ NotificationRequest(
+ recipient_email=approver_config.approver_email,
+ recipient_name=approver_config.approver_name,
+ subject=(
+ f"Approval requested: "
+ f"'{self._state.document.title}'"
+ ),
+ body=f"'{self._state.document.title}' by {self._state.document.submitter_name} requires your approval (SLA: {approver_config.sla_seconds}s).",
+ # The key embeds the current resubmission count, so each submission
+ # round produces a distinct idempotency key.
+ idempotency_key=(
+ f"{workflow_id}-approval-request-"
+ f"S{self._state.resubmission_count}"
+ ),
+ ),
+ start_to_close_timeout=timedelta(seconds=30),
+ retry_policy=NOTIFICATION_RETRY_POLICY,
+ )
+
+ await self._record_audit(
+ workflow_id=workflow_id,
+ action="APPROVAL_REQUESTED",
+ actor="system",
+ details=(
+ f"Approval request sent to "
+ f"{approver_config.approver_email}"
+ ),
+ )
+
+ async def _record_audit(
+ self,
+ workflow_id: str,
+ action: str,
+ actor: str,
+ details: str = "",
+ approval_level: int = 0,
+ ) -> None:
+ """Record an audit entry both in-memory and via an Activity.
+
+ Builds an ``AuditEntry`` from the arguments, appends it to
+ ``self._state.audit_trail``, and then passes the same entry to the
+ ``record_audit_entry`` Activity.
+ """
+ # The timestamp is taken from workflow.now() and stored as an ISO 8601 string.
+ entry = AuditEntry(
+ timestamp=workflow.now().isoformat(),
+ workflow_id=workflow_id,
+ action=action,
+ actor=actor,
+ details=details,
+ approval_level=approval_level,
+ )
+
+ # The in-memory trail is updated before the Activity runs.
+ self._state.audit_trail.append(entry)
+
+ await workflow.execute_activity(
+ record_audit_entry,
+ entry,
+ start_to_close_timeout=timedelta(seconds=30),
+ retry_policy=AUDIT_RETRY_POLICY,
+ )
+
+ def _build_result(
+ self, message: str, report_url: str = ""
+ ) -> dict:
+ """Build the final result returned by the Workflow.
+
+ The dict carries the document id, the current status value, the given
+ message, the decision (as a dict, or ``None`` when no decision is set),
+ the resubmission count, and the report URL (empty string by default).
+ """
+ return {
+ "document_id": self._state.document.document_id,
+ "status": self._state.status.value,
+ "message": message,
+ # asdict() is only applied when a decision is present.
+ "decision": (
+ asdict(self._state.decision)
+ if self._state.decision
+ else None
+ ),
+ "resubmission_count": self._state.resubmission_count,
+ "report_url": report_url,
+ }
+```
+
+### Phase 4: Configure and run the Worker
+
+The Worker is the process that executes Workflow and Activity code. Create a file named `worker.py`:
+
+```python
+"""Worker process for the document approval system."""
+
+from __future__ import annotations
+
+import asyncio
+import concurrent.futures
+import logging
+import os
+
+from temporalio.client import Client
+from temporalio.worker import Worker
+
+from activities import (
+ generate_approval_report,
+ record_audit_entry,
+ send_notification,
+ store_document,
+)
+from workflows import DocumentApprovalWorkflow
+
+TASK_QUEUE = os.getenv("TEMPORAL_TASK_QUEUE", "document-approval")
+NAMESPACE = os.getenv("TEMPORAL_NAMESPACE", "document-approval")
+
+
+async def main() -> None:
+ logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+ )
+
+ logging.info(f"Connecting to Temporal Server with namespace: {NAMESPACE}")
+ client = await Client.connect("localhost:7233", namespace=NAMESPACE)
+
+ with concurrent.futures.ThreadPoolExecutor(
+ max_workers=50
+ ) as activity_executor:
+ worker = Worker(
+ client,
+ task_queue=TASK_QUEUE,
+ workflows=[DocumentApprovalWorkflow],
+ activities=[
+ send_notification,
+ record_audit_entry,
+ store_document,
+ generate_approval_report,
+ ],
+ activity_executor=activity_executor,
+ max_concurrent_workflow_tasks=100,
+ max_concurrent_activities=50,
+ )
+
+ logging.info("Starting Worker on Task Queue: %s", TASK_QUEUE)
+ await worker.run()
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+To run the Worker, start the Temporal development server in one terminal and the Worker in another:
+
+```
+temporal server start-dev
+```
+
+```
+python worker.py
+```
+
+### Phase 5: Start a Workflow and send Signals
+
+Create a file named `starter.py` that demonstrates how to start the approval Workflow and interact with it:
+
+```python
+"""Client code to start a document approval Workflow and interact with it."""
+
+from __future__ import annotations
+
+import asyncio
+import os
+
+from temporalio.api.common.v1 import SearchAttributes
+from temporalio.client import Client
+
+from models import (
+ ApprovalDecision,
+ ApprovalStatus,
+ ApproverConfig,
+ DocumentSubmission,
+)
+from workflows import DocumentApprovalWorkflow
+
+TASK_QUEUE = os.getenv("TEMPORAL_TASK_QUEUE", "document-approval")
+NAMESPACE = os.getenv("TEMPORAL_NAMESPACE", "document-approval")
+
+
+async def main() -> None:
+ """Start an approval Workflow and demonstrate Signal and Query interactions."""
+ client = await Client.connect("localhost:7233", namespace=NAMESPACE)
+
+ document = DocumentSubmission(
+ document_id="doc-2026-001",
+ title="Q1 Budget Proposal",
+ submitter_email="alice@example.com",
+ submitter_name="Alice Chen",
+ content_url="https://docs.example.com/q1-budget-v1",
+ approver=ApproverConfig(
+ approver_email="bob@example.com",
+ approver_name="Bob Martinez",
+ sla_seconds=172800, # 48 hours
+ escalation_email="carol@example.com",
+ reminder_interval_seconds=86400, # Remind every 24 hours
+ max_reminders=3, # Send max 3 reminders
+ ),
+ )
+
+ workflow_id = f"approval-{document.document_id}"
+
+ search_attributes = SearchAttributes.from_pairs([
+ ("DocumentId", document.document_id),
+ ("SubmitterEmail", document.submitter_email),
+ ("ApprovalStatus", "SUBMITTED"),
+ ])
+
+ handle = await client.start_workflow(
+ DocumentApprovalWorkflow.run,
+ document,
+ id=workflow_id,
+ task_queue=TASK_QUEUE,
+ search_attributes=search_attributes,
+ )
+
+ print(f"Workflow started: {workflow_id}")
+ print(f"Run Id: {handle.result_run_id}")
+
+ status = await handle.query(DocumentApprovalWorkflow.get_status)
+ print(f"Current status: {status}")
+
+ await handle.signal(
+ DocumentApprovalWorkflow.submit_decision,
+ ApprovalDecision(
+ approver_email="bob@example.com",
+ status=ApprovalStatus.APPROVED,
+ comment="Budget looks good. Approved.",
+ ),
+ )
+ print("Approval Signal sent")
+
+ result = await handle.result()
+ print(f"Workflow result: {result}")
+
+ search_attributes = SearchAttributes.from_pairs([
+ ("ApprovalStatus", result["status"].upper()),
+ ])
+ await client.update_workflow_search_attributes(workflow_id, search_attributes)
+
+ # Query the audit trail
+ audit_trail = await handle.query(
+ DocumentApprovalWorkflow.get_audit_trail
+ )
+ print(f"Audit trail ({len(audit_trail)} entries):")
+ for entry in audit_trail:
+ print(f" [{entry['timestamp']}] {entry['action']}: {entry['details']}")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+### Phase 6: Configure timeouts and retry policies
+
+Choosing appropriate timeouts and retry policies is critical for the reliability of the system. The following table summarizes the timeout and retry configuration used in this pattern:
+
+| Activity | `start_to_close_timeout` | `heartbeat_timeout` | Retry policy | Rationale |
+|----------|--------------------------|---------------------|--------------|-----------|
+| `send_notification` | 30 seconds | Not set | 5 attempts, 5s initial, 2x backoff, 2m max | Notifications are important but not blocking. Five retries with exponential backoff handle transient network failures. |
+| `record_audit_entry` | 30 seconds | Not set | 10 attempts, 2s initial, 2x backoff, 5m max | Audit entries are critical for compliance. More aggressive retry to ensure entries are persisted. |
+| `store_document` | 60 seconds | Not set | 5 attempts, 1s initial, 2x backoff, 1m max | Document storage should complete quickly. Longer `start_to_close_timeout` accommodates large documents. |
+| `generate_approval_report` | 5 minutes | 2 minutes | 3 attempts, 10s initial, 2x backoff, 5m max | Report generation may take time. Heartbeating detects Worker failures well before the 5-minute `start_to_close_timeout` expires. Fewer retries because the operation is expensive. |
+
+Do not set `schedule_to_close_timeout` unless you need to bound the total time across all retry attempts.
+
+## Outcomes
+
+You've built a document approval system that holds up under real conditions — slow reviewers, missed deadlines, infrastructure restarts, and repeated resubmissions. The techniques here aren't specific to documents. Any process that has to wait for a human — loan applications, employee onboarding, content moderation, procurement sign-off, insurance claims — can be built the same way. The Workflow waits, the timers fire, the audit trail fills in, and the process completes regardless of what happens underneath it.
+
+---
+
+## Related resources
+
+- [Temporal Python SDK documentation](https://docs.temporal.io/develop/python)
+- [Message Passing — Signals, Queries, Updates](https://docs.temporal.io/develop/python/message-passing)
+- [Continue-As-New](https://docs.temporal.io/develop/python/continue-as-new)
+- [Failure Detection — Timeouts, Activity Heartbeating, and Retry Policies](https://docs.temporal.io/develop/python/failure-detection)
+- [Child Workflows](https://docs.temporal.io/develop/python/child-workflows)
+- [Worker Versioning](https://docs.temporal.io/production-deployment/worker-deployments/worker-versioning)
+- [Temporal Python SDK API Reference](https://python.temporal.io)
+- [Temporal Python SDK GitHub repository](https://github.com/temporalio/sdk-python)
+- [Temporal Python SDK samples](https://github.com/temporalio/samples-python)
+- [Entity Workflow Pattern](entity-pattern-loyalty-points) — covers the related Entity Workflow pattern for modeling long-lived domain objects
\ No newline at end of file
diff --git a/docs/guides/route-specialized-workloads.mdx b/docs/guides/route-specialized-workloads.mdx
new file mode 100644
index 0000000000..8af5ac0280
--- /dev/null
+++ b/docs/guides/route-specialized-workloads.mdx
@@ -0,0 +1,680 @@
+---
+id: route-specialized-workloads
+title: Route specialized workloads
+description: Direct resource-intensive workloads to appropriate Task Queues to optimize resources
+sidebar_label: Route specialized workloads
+toc_max_heading_level: 2
+author: Cecil Phillip
+tags:
+ - Task Queues
+ - Routing
+ - Workers
+ - Resource requirements
+ - GPU computing
+---
+
+Modern applications have workloads with diverse resource requirements. ML/AI workloads require expensive GPU-equipped Workers with specific CUDA libraries, video processing needs specialized encoding hardware, and data analytics may require high-memory instances. Running all Activities on the same Worker type is cost-prohibitive and inefficient.
+
+## Problem statement
+
+Without separate Task Queues for different resource types, these scenarios create problems:
+- **Resource waste:** GPU Workers sit idle waiting for ML work, or expensive GPUs run standard CPU tasks that don't need them
+- **Cost inefficiency:** Running all Workers on GPU instances when only 10% of workloads need GPUs can increase compute costs by roughly 2-3x
+- **Environment conflicts:** ML libraries (TensorFlow, PyTorch) have complex dependencies that conflict with other Activity requirements
+- **Scaling complexity:** Cannot independently scale GPU Workers based on ML demand vs CPU Workers based on standard workload demand
+
+## Solution
+
+Use separate Task Queues to route Activities based on their resource requirements. Create dedicated Worker pools for:
+- **GPU-intensive ML workloads** (NVIDIA GPUs, CUDA libraries, limited concurrency due to GPU memory)
+- **Standard CPU workloads** (high concurrency on cost-effective instances)
+- **High-memory analytics** (memory-optimized instances for large dataset processing)
+- **Specialized hardware** (video encoding hardware, TPUs, ARM processors)
+
+## Outcomes
+
+- **Cost optimization:** GPU instances (g4dn.xlarge at $0.526/hr) only handle ML training and inference while standard Activities run on cost-effective CPU instances (c5.xlarge at $0.17/hr), reducing infrastructure costs by 60-80%
+- **Resource efficiency:** GPU Workers run at 2-4 concurrent activities (GPU memory constraints) while CPU Workers run at 100+ concurrent activities, maximizing hardware utilization
+- **Independent scaling:** Scale GPU Workers based on ML workload demand, high-memory Workers based on analytics load, and CPU Workers based on standard workload, without coordination
+- **Environment isolation:** ML Workers have TensorFlow/PyTorch dependencies isolated from standard Workers, preventing library conflicts and simplifying deployments
+
+## Background and best practices
+
+### Task Queue fundamentals
+Task Queues in Temporal are dynamically created when first referenced. If your Workflow references task queue `gpu-processing` but your Worker polls `gpu-procesing` (typo), two separate queues are created and the Worker never receives Tasks.
+
+**Best practice:** Always define Task Queue names as constants in a shared module that both Workflows and Workers import.
+
+### Resource-based routing approaches
+
+Route Activities to appropriate Worker pools based on their resource requirements:
+- **GPU-intensive Activities:** Route to GPU-equipped Workers with ML libraries (TensorFlow, PyTorch, CUDA)
+- **CPU-intensive Activities:** Route to high-CPU instances for compute-heavy tasks
+- **Memory-intensive Activities:** Route to memory-optimized instances for large dataset processing
+- **Standard Activities:** Route to cost-effective general-purpose instances
+
+### Worker configuration for specialized hardware
+
+Key considerations for resource-specific Workers:
+- **GPU concurrency:** Limit GPU Workers to 2-4 concurrent activities due to GPU memory constraints
+- **CPU concurrency:** Standard Workers can handle 100+ concurrent activities
+- **Memory allocation:** High-memory Workers need careful concurrency tuning to avoid OOM errors
+- **Hardware dependencies:** GPU Workers require NVIDIA drivers, CUDA toolkit, and specific ML library versions
+- **Container images:** Use specialized images with pre-installed dependencies (nvidia/cuda, tensorflow/tensorflow:latest-gpu)
+
+### Operational considerations
+
+- **Resource utilization:** Monitor GPU memory, compute utilization, and CPU/memory usage per Worker pool
+- **Queue latency:** Track Schedule-To-Start latency (the SDK metric `activity_schedule_to_start_latency`) per Task Queue to detect under-provisioning of specialized hardware
+- **Cost tracking:** Tag resources by hardware type (GPU, CPU, high-memory) for cost analysis and optimization
+- **Hardware health:** Monitor GPU temperature, driver errors, and CUDA out-of-memory errors
+- **Dependency management:** Use container images with pinned library versions to ensure reproducible environments
+
+## Target audience
+
+- **Temporal Workflow & Activity developers:** Implementing resource-aware routing logic
+- **Platform operators:** Configuring Worker deployments for specialized hardware
+- **ML/AI Engineers:** Deploying GPU-based inference and training Workers
+- **DevOps/SRE teams:** Managing heterogeneous Worker pools and infrastructure
+- **FinOps teams:** Optimizing cloud infrastructure costs through efficient resource allocation
+
+This implementation requires code changes, Worker configuration, and deployment of specialized infrastructure (GPU instances, high-memory instances, specialized hardware).
+
+## Prerequisites
+
+### Required software, infrastructure, and tools
+
+- Temporal Service (Self-hosted or Temporal Cloud)
+- Python 3.8 or later
+- Temporal Python SDK v1.0.0 or later (`pip install temporalio`)
+- GPU infrastructure (AWS EC2 with NVIDIA GPUs, GCP with GPUs) for ML workloads
+- Process manager or container orchestration for multi-worker deployments
+
+### Resources & Access Privileges
+
+- Temporal namespace with permissions to start Workflows and register Workers
+- Infrastructure provisioning access for GPU instances and multiple Worker pools
+- Ability to configure autoscaling policies
+
+### Required Concepts
+
+- Temporal Workflows, Activities, and Task Queues
+- Python async/await patterns
+- GPU computing basics (for ML scenarios)
+- Basic deployment and process management
+
+## Architecture diagram(s)
+
+### Resource-Based Routing Architecture
+
+```mermaid
+flowchart TB
+ subgraph Workflow["Workflow Execution"]
+ WF[ML Pipeline Workflow]
+ end
+
+ subgraph Routing["Resource-Based Routing Logic"]
+ Decision{Activity Resource Requirements}
+ end
+
+ subgraph Queues["Task Queues by Resource Type"]
+ CPUTQ[cpu-standard Task Queue]
+ GPUTQ[gpu-ml-inference Task Queue]
+ MEMTQ[high-memory-analytics Task Queue]
+ end
+
+ subgraph Workers["Specialized Worker Pools"]
+ subgraph CPUPool["Standard CPU Pool"]
+ CPU1[Instance: c5.xlarge]
+ CPU2[4 vCPU, 8GB RAM]
+ CPU3[Replicas: 10]
+ CPU4[Concurrency: 100]
+ CPU5[Cost: $0.17/hr]
+ end
+ subgraph GPUPool["GPU ML Pool"]
+ GPU1[Instance: g4dn.xlarge]
+ GPU2[NVIDIA T4 16GB]
+ GPU3[CUDA 12.0, PyTorch]
+ GPU4[Replicas: 2]
+ GPU5[Concurrency: 2]
+ GPU6[Cost: $0.526/hr]
+ end
+ subgraph MEMPool["High-Memory Pool"]
+ MEM1[Instance: r5.2xlarge]
+ MEM2[8 vCPU, 64GB RAM]
+ MEM3[Replicas: 3]
+ MEM4[Concurrency: 10]
+ MEM5[Cost: $0.504/hr]
+ end
+ end
+
+ WF --> Decision
+
+ Decision -->|CPU Activities validation, API calls| CPUTQ
+ Decision -->|GPU Activities inference, embeddings| GPUTQ
+ Decision -->|Memory Activities large dataset processing| MEMTQ
+
+ CPUTQ --> CPUPool
+ GPUTQ --> GPUPool
+ MEMTQ --> MEMPool
+
+ style CPUTQ fill:#e1f5ff
+ style GPUTQ fill:#fff4e1
+ style MEMTQ fill:#f3e5f5
+ style GPUPool fill:#fff4e1
+ style MEMPool fill:#f3e5f5
+```
+
+## Implementation plan
+
+### Step 1: Define Task Queue constants
+
+Create a shared Python module with Task Queue name constants based on resource requirements.
+
+**File: `task_queues.py`**
+
+```python
+"""Task Queue constants for resource-based routing.
+
+Import these names wherever a Task Queue is referenced instead of repeating
+the string literal.
+"""
+
+# Resource-specific task queues
+STANDARD_CPU_QUEUE = "cpu-standard"  # standard CPU Workers
+GPU_ML_QUEUE = "gpu-ml-inference"  # GPU Workers for ML Activities
+HIGH_MEMORY_QUEUE = "high-memory-analytics"  # memory-optimized Workers
+VIDEO_ENCODING_QUEUE = "video-encoding-hardware"  # video encoding hardware
+```
+
+### Step 2: Configure Workers for standard CPU processing
+
+Deploy Workers on cost-effective CPU instances for standard Activities that don't require specialized hardware.
+
+**File: `worker_cpu.py`**
+
+```python
+"""Worker for standard CPU-based Activities."""
+import asyncio
+import logging
+from temporalio.client import Client
+from temporalio.worker import Worker
+
+from task_queues import STANDARD_CPU_QUEUE
+from workflows import MLPipelineWorkflow
+from activities import validate_data, preprocess_data, store_results
+
+logging.basicConfig(level=logging.INFO)
+
+
+async def main():
+ client = await Client.connect("localhost:7233")
+
+ worker = Worker(
+ client,
+ task_queue=STANDARD_CPU_QUEUE,
+ workflows=[MLPipelineWorkflow],
+ activities=[validate_data, preprocess_data, store_results],
+ max_concurrent_activities=100, # High concurrency on CPU
+ )
+
+ logging.info(f"Starting CPU worker on {STANDARD_CPU_QUEUE}")
+ await worker.run()
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+**Deployment guidance:**
+- Deploy 10 instances of `worker_cpu.py` on standard compute instances (c5.xlarge: 4 vCPU, 8GB RAM)
+- Use autoscaling based on queue depth and CPU utilization
+- Consider using spot instances for cost savings if workload tolerates interruptions
+
+### Step 3: Configure Workers for GPU processing
+
+Deploy Workers on GPU-equipped instances for ML/AI workloads. GPU Workers should have limited concurrency due to GPU memory constraints.
+
+**File: `worker_gpu.py`**
+
+```python
+"""Worker for GPU-intensive ML Activities."""
+import asyncio
+import logging
+from temporalio.client import Client
+from temporalio.worker import Worker
+
+from task_queues import GPU_ML_QUEUE
+from activities import generate_embeddings, run_inference, train_model
+
+logging.basicConfig(level=logging.INFO)
+
+
+async def main():
+ """Run a Worker that polls the GPU Task Queue.
+
+ Registers only the GPU Activities (no Workflows) and caps concurrent
+ Activity executions at 2.
+ """
+ client = await Client.connect("localhost:7233")
+
+ worker = Worker(
+ client,
+ task_queue=GPU_ML_QUEUE,
+ activities=[generate_embeddings, run_inference, train_model],
+ # Limited concurrency - GPU memory constraint
+ # T4 16GB can typically handle 2-4 concurrent inference tasks
+ max_concurrent_activities=2,
+ )
+
+ logging.info(f"Starting GPU worker on {GPU_ML_QUEUE}")
+ await worker.run()
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+**Deployment guidance:**
+- Deploy 2-3 instances of `worker_gpu.py` on GPU instances (g4dn.xlarge with NVIDIA T4, or p3.2xlarge with V100)
+- GPU Workers require:
+ - NVIDIA drivers (version 525.60 or later)
+ - CUDA toolkit (12.0 or later)
+ - ML frameworks (PyTorch, TensorFlow) with GPU support
+- Use container images: `nvidia/cuda:12.0.0-cudnn8-runtime-ubuntu22.04` or `pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime`
+- Monitor GPU memory usage - adjust `max_concurrent_activities` if OOM errors occur
+- Consider using GPU-optimized instance types based on model requirements (T4 for inference, V100/A100 for training)
+
+### Step 4: Configure Workers for high-memory analytics
+
+Deploy Workers on memory-optimized instances for Activities processing large datasets.
+
+**File: `worker_high_memory.py`**
+
+```python
+"""Worker for high-memory data processing Activities."""
+import asyncio
+import logging
+from temporalio.client import Client
+from temporalio.worker import Worker
+
+from task_queues import HIGH_MEMORY_QUEUE
+from activities import process_large_dataset, aggregate_analytics, build_large_index
+
+logging.basicConfig(level=logging.INFO)
+
+
+async def main():
+ """Run a Worker that polls the high-memory Task Queue.
+
+ Registers only the high-memory Activities (no Workflows) and caps
+ concurrent Activity executions at 10.
+ """
+ client = await Client.connect("localhost:7233")
+
+ worker = Worker(
+ client,
+ task_queue=HIGH_MEMORY_QUEUE,
+ activities=[process_large_dataset, aggregate_analytics, build_large_index],
+ # Limited concurrency due to high memory usage per activity
+ # r5.2xlarge (64GB RAM) can handle ~10 concurrent activities using 5-6GB each
+ # NOTE(review): process_large_dataset is documented as needing 32-64GB per
+ # run; at that footprint 10 concurrent executions would not fit in 64GB.
+ # Confirm the real per-Activity memory use before keeping this value.
+ max_concurrent_activities=10,
+ )
+
+ logging.info(f"Starting high-memory worker on {HIGH_MEMORY_QUEUE}")
+ await worker.run()
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+**Deployment guidance:**
+- Deploy 3-5 instances of `worker_high_memory.py` on memory-optimized instances (r5.2xlarge: 8 vCPU, 64GB RAM)
+- Adjust `max_concurrent_activities` based on per-activity memory usage to prevent OOM errors
+- Monitor memory utilization and swap usage
+- Consider using instances with local NVMe storage for faster I/O on large datasets
+
+### Step 5: Implement resource-aware routing in Workflows
+
+**File: `ml_pipeline_workflow.py`**
+
+```python
+"""Workflow with resource-aware routing."""
+from datetime import timedelta
+from dataclasses import dataclass
+from temporalio import workflow
+
+with workflow.unsafe.imports_passed_through():
+ from task_queues import STANDARD_CPU_QUEUE, GPU_ML_QUEUE, HIGH_MEMORY_QUEUE
+
+
+@dataclass
+class MLPipelineRequest:
+ """Input to ``MLPipelineWorkflow.run``."""
+
+ pipeline_id: str  # used in log messages and passed to the Activities
+ dataset_url: str  # passed to the validation and dataset-processing Activities
+ model_type: str  # passed to the training Activity
+ customer_id: str  # passed to the embeddings Activity
+
+
+@workflow.defn
+class MLPipelineWorkflow:
+ """
+ ML pipeline workflow with resource-aware routing.
+
+ Routes Activities to appropriate Worker pools based on resource requirements:
+ - CPU queue: Data validation, preprocessing, result storage
+ - GPU queue: Model training, inference, embeddings generation
+ - High-memory queue: Large dataset processing, analytics aggregation
+
+ Activities are referenced by name (string) and routed by passing an
+ explicit ``task_queue`` to each ``execute_activity`` call.
+ """
+
+ @workflow.run
+ async def run(self, request: MLPipelineRequest) -> dict:
+ """Run the five pipeline steps in order and return a summary dict.
+
+ Returns ``{"status": "validation_failed", ...}`` without running the
+ remaining steps when validation reports the dataset as invalid;
+ otherwise returns status ``"completed"`` with the model path and the
+ number of embedding vectors.
+ """
+ workflow.logger.info(f"Starting ML pipeline {request.pipeline_id}")
+
+ # Step 1: Validate data (CPU queue - lightweight validation)
+ validation_result = await workflow.execute_activity(
+ "validate_data",
+ {"dataset_url": request.dataset_url},
+ task_queue=STANDARD_CPU_QUEUE,
+ start_to_close_timeout=timedelta(minutes=2),
+ )
+
+ if not validation_result["valid"]:
+ return {"status": "validation_failed", "error": validation_result["error"]}
+
+ # Step 2: Process large dataset (High-memory queue - loads full dataset into memory)
+ processed_data = await workflow.execute_activity(
+ "process_large_dataset",
+ {"dataset_url": request.dataset_url, "pipeline_id": request.pipeline_id},
+ task_queue=HIGH_MEMORY_QUEUE,
+ start_to_close_timeout=timedelta(hours=1),
+ )
+
+ # Step 3: Train model (GPU queue - requires GPU for training)
+ # NOTE(review): no heartbeat_timeout is set on this 4-hour Activity, so a
+ # lost Worker would presumably go unnoticed until start_to_close expires;
+ # consider heartbeating in the Activity and setting heartbeat_timeout.
+ model_result = await workflow.execute_activity(
+ "train_model",
+ {
+ "model_type": request.model_type,
+ "data_path": processed_data["output_path"],
+ "pipeline_id": request.pipeline_id,
+ },
+ task_queue=GPU_ML_QUEUE,
+ start_to_close_timeout=timedelta(hours=4),
+ )
+
+ # Step 4: Generate embeddings (GPU queue - GPU-accelerated inference)
+ embeddings = await workflow.execute_activity(
+ "generate_embeddings",
+ {
+ "model_path": model_result["model_path"],
+ "customer_id": request.customer_id,
+ },
+ task_queue=GPU_ML_QUEUE,
+ start_to_close_timeout=timedelta(minutes=30),
+ )
+
+ # Step 5: Store results (CPU queue - simple I/O operation)
+ # The full embeddings result is passed through the Workflow to this Activity.
+ await workflow.execute_activity(
+ "store_results",
+ {
+ "pipeline_id": request.pipeline_id,
+ "embeddings": embeddings,
+ "model_metrics": model_result["metrics"],
+ },
+ task_queue=STANDARD_CPU_QUEUE,
+ start_to_close_timeout=timedelta(minutes=5),
+ )
+
+ return {
+ "status": "completed",
+ "pipeline_id": request.pipeline_id,
+ "model_path": model_result["model_path"],
+ "embeddings_count": len(embeddings["vectors"]),
+ }
+```
+
+**File: `activities.py`**
+
+```python
+"""Activity implementations for different resource requirements."""
+from temporalio import activity
+import asyncio
+
+
+# ===== CPU-based activities (STANDARD_CPU_QUEUE) =====
+
+@activity.defn
+async def validate_data(data: dict) -> dict:
+ """
+ Validate input data (runs on standard CPU workers).
+
+ Lightweight validation that doesn't require specialized hardware.
+
+ Reads ``data["dataset_url"]`` and returns a dict with ``valid``,
+ ``dataset_url``, and ``error`` (``None`` when valid).
+ """
+ activity.logger.info(f"Validating dataset at {data['dataset_url']}")
+
+ # Validation logic
+ # Placeholder: the sample always reports the dataset as valid.
+ is_valid = True # Simplified validation
+
+ return {
+ "valid": is_valid,
+ "dataset_url": data["dataset_url"],
+ "error": None if is_valid else "Invalid dataset format",
+ }
+
+
+@activity.defn
+async def preprocess_data(data: dict) -> dict:
+ """Basic data preprocessing (runs on CPU workers).
+
+ Placeholder implementation: ``data`` is not read and a fixed result is
+ returned.
+ """
+ activity.logger.info("Preprocessing data")
+
+ # Lightweight preprocessing
+ return {"preprocessed": True, "records": 1000}
+
+
+@activity.defn
+async def store_results(data: dict) -> dict:
+ """Store pipeline results (runs on CPU workers).
+
+ Placeholder implementation: only ``data["pipeline_id"]`` is read; nothing
+ is persisted by this sample.
+ """
+ activity.logger.info(f"Storing results for pipeline {data['pipeline_id']}")
+
+ # Store to database or object storage
+ return {"stored": True, "pipeline_id": data["pipeline_id"]}
+
+
+# ===== GPU-based activities (GPU_ML_QUEUE) =====
+
+@activity.defn
+async def train_model(data: dict) -> dict:
+ """
+ Train ML model (runs on GPU workers).
+
+ Requires GPU for acceptable training performance.
+
+ Reads ``data["model_type"]`` and ``data["pipeline_id"]``. Returns the
+ model path, fixed sample metrics, and the device name that was selected.
+ """
+ activity.logger.info(f"Training {data['model_type']} model on GPU")
+
+ # Imported inside the Activity so only Workers that run it need torch installed.
+ import torch
+
+ # Falls back to CPU when CUDA is not available.
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ activity.logger.info(f"Using device: {device}")
+
+ # Simulate training
+ # In production: load data, initialize model, train
+ await asyncio.sleep(5) # Simulate training time
+
+ model_path = f"/models/{data['pipeline_id']}/model.pth"
+
+ activity.logger.info(f"Training complete, model saved to {model_path}")
+
+ return {
+ "model_path": model_path,
+ "metrics": {"accuracy": 0.95, "loss": 0.05},
+ "device": str(device),
+ }
+
+
+@activity.defn
+async def generate_embeddings(data: dict) -> dict:
+ """
+ Generate embeddings using ML model (runs on GPU workers).
+
+ Requires GPU hardware for acceptable performance.
+ """
+ activity.logger.info("Generating embeddings on GPU")
+
+ import torch
+ from sentence_transformers import SentenceTransformer
+
+ # Load model (cached after first load)
+ model = SentenceTransformer("all-MiniLM-L6-v2")
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model = model.to(device)
+
+ # Generate embeddings (simplified for example)
+ texts = [f"Sample text {i}" for i in range(100)]
+ embeddings = model.encode(texts, convert_to_tensor=True)
+
+ activity.logger.info(f"Generated {len(embeddings)} embeddings on {device}")
+
+ return {
+ "vectors": embeddings.cpu().tolist(),
+ "customer_id": data["customer_id"],
+ "dimension": embeddings.shape[1],
+ }
+
+
+@activity.defn
+async def run_inference(data: dict) -> dict:
+ """Run ML inference (runs on GPU workers).
+
+ Placeholder implementation: ``data`` is not read, no model is run, and an
+ empty prediction list with a fixed confidence is returned.
+ """
+ activity.logger.info("Running inference on GPU")
+
+ import torch
+ # Falls back to CPU when CUDA is not available.
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Model inference logic
+ activity.logger.info(f"Inference completed on {device}")
+
+ return {"predictions": [], "confidence": 0.95, "device": str(device)}
+
+
+# ===== High-memory activities (HIGH_MEMORY_QUEUE) =====
+
+@activity.defn
+async def process_large_dataset(data: dict) -> dict:
+ """
+ Process large dataset (runs on high-memory workers).
+
+ Loads entire dataset into memory for processing.
+ Requires 32-64GB RAM for typical datasets.
+
+ Reads ``data["dataset_url"]`` and ``data["pipeline_id"]``. Returns the
+ output path plus fixed sample figures for records and memory used.
+ """
+ activity.logger.info(f"Processing large dataset from {data['dataset_url']}")
+
+ # In production: load dataset (pandas, dask, spark)
+ # Process in-memory transformations
+ await asyncio.sleep(10) # Simulate processing time
+
+ output_path = f"/data/processed/{data['pipeline_id']}/dataset.parquet"
+
+ activity.logger.info(f"Dataset processed, saved to {output_path}")
+
+ # The figures below are hard-coded sample values, not measurements.
+ return {
+ "output_path": output_path,
+ "records_processed": 10_000_000,
+ "memory_used_gb": 45,
+ }
+
+
+@activity.defn
+async def aggregate_analytics(data: dict) -> dict:
+ """
+ Aggregate analytics from large datasets (runs on high-memory workers).
+
+ Performs in-memory aggregations on large datasets.
+
+ Placeholder implementation: ``data`` is not read; the Activity sleeps and
+ returns fixed sample values.
+ """
+ activity.logger.info("Running analytics aggregation")
+
+ # In-memory aggregation logic
+ await asyncio.sleep(5)
+
+ return {
+ "aggregations": {"total": 1000000, "average": 42.5},
+ "memory_used_gb": 38,
+ }
+
+
+@activity.defn
+async def build_large_index(data: dict) -> dict:
+ """
+ Build search index from large dataset (runs on high-memory workers).
+
+ Requires significant memory to build index structures.
+ """
+ activity.logger.info("Building large search index")
+
+ # Build index in memory
+ await asyncio.sleep(8)
+
+ return {
+ "index_path": "/indexes/search_index.bin",
+ "index_size_gb": 25,
+ "documents_indexed": 50_000_000,
+ }
+```
+
+
+**Starter example:**
+
+```python
+# start_ml_pipeline.py
+"""Start ML pipeline workflow with resource-aware routing."""
+import asyncio
+from temporalio.client import Client
+from ml_pipeline_workflow import MLPipelineWorkflow, MLPipelineRequest
+
+
+async def main():
+ client = await Client.connect("localhost:7233")
+
+ pipeline_request = MLPipelineRequest(
+ pipeline_id="ml-pipeline-2024-001",
+ dataset_url="s3://datasets/customer-behavior-10m.parquet",
+ model_type="transformer",
+ customer_id="customer-456",
+ )
+
+ handle = await client.start_workflow(
+ MLPipelineWorkflow.run,
+ pipeline_request,
+ id=f"ml-pipeline-{pipeline_request.pipeline_id}",
+ task_queue="workflows",
+ )
+
+ print(f"Started ML pipeline workflow: {handle.id}")
+
+ # Wait for result (or use handle.describe() to check status)
+ result = await handle.result()
+ print(f"Pipeline completed: {result}")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+## Conclusion
+
+By implementing separate Task Queues for resource-based routing, you have achieved:
+
+1. **Cost optimization:** GPU instances (g4dn.xlarge at $0.526/hr) only handle ML training and inference, while standard CPU Activities run on cost-effective instances (c5.xlarge at $0.17/hr), and high-memory analytics run on memory-optimized instances (r5.2xlarge at $0.504/hr). This targeted resource allocation reduces infrastructure costs by 60-80% compared to running everything on GPU instances.
+
+2. **Resource efficiency:** GPU Workers run at 2-4 concurrent activities (constrained by GPU memory), CPU Workers run at 100+ concurrent activities, and high-memory Workers run at 10 concurrent activities (constrained by RAM). Each Worker pool is optimized for its specific resource constraints.
+
+3. **Independent scaling:** Scale GPU Workers based on ML workload demand, high-memory Workers based on analytics load, and CPU Workers based on standard workload volume, without coordination between pools. Each pool can scale independently based on its queue depth and resource utilization.
+
+4. **Environment isolation:** ML Workers with TensorFlow/PyTorch/CUDA dependencies are isolated from standard Workers, preventing library conflicts and simplifying deployment. Each Worker type can use specialized container images with only the dependencies it needs.
+
+Your application now intelligently routes work based on resource requirements, ensuring efficient hardware utilization and cost-effective infrastructure allocation.
+
+## Related Resources
+
+### Official Documentation
+- [Temporal Documentation - Task Routing](https://docs.temporal.io/task-routing)
+- [Temporal Best Practices - Separate Task Queues](https://docs.temporal.io/best-practices/worker#separate-task-queues-logically)
+- [Python SDK Worker Configuration](https://python.temporal.io/temporalio.worker.Worker.html)
+
+### Related Patterns
+- [Separate Task Queues - Worker Affinity](worker-execution-affinity) - For Activities that must run on the same Worker instance
+- [Separate Task Queues - Rate Limiting](rate-limit-downstream-apis) - For protecting downstream APIs with rate limits
+
+### Code Samples
+- [GPU-Accelerated Image Processing (Python)](https://github.com/temporalio/samples-python) - Example of GPU worker configuration
+- [Worker Versioning with Task Queues](https://github.com/temporalio/samples-python/tree/main/worker_versioning) - Managing different Worker versions
+
+### Community Resources
+- [Blog: AI, ML, and Data Engineering Workflows with Temporal](https://temporal.io/blog/ai-ml-and-data-engineering-workflows-with-temporal)
+- [Forum: Task Queue Best Practices](https://community.temporal.io/t/in-what-situation-should-we-use-multiple-separated-task-queues/1254)
+- [Temporal ML/AI Use Cases](https://temporal.io/use-cases/machine-learning)
\ No newline at end of file
diff --git a/docs/guides/saga-pattern.mdx b/docs/guides/saga-pattern.mdx
new file mode 100644
index 0000000000..c5a1f1eba8
--- /dev/null
+++ b/docs/guides/saga-pattern.mdx
@@ -0,0 +1,1061 @@
+---
+id: saga-pattern
+title: Recover business processes without restarting
+description: Build a multi-step business process that pauses when it hits a permanent failure, then resumes from where it stopped without restarting.
+sidebar_label: Recovery without restart
+toc_max_heading_level: 2
+author: Tao Guo
+tags:
+ - Architecture
+ - Saga Pattern
+ - Retries
+ - Durable Execution
+ - TypeScript SDK
+---
+
+This pattern shows how to build a multi-step business process that pauses when it hits a permanent failure — a malformed identifier, an exceeded policy limit, a failed compliance check — allows a human or automated agent to correct the underlying issue, and resumes from where it stopped without restarting from the beginning.
+Queryable metadata routes each blocked process to the right resolution resource, and operators get full visibility into the state of every individual execution and across the entire pipeline.
+
+
+
+## Problem statement
+
+Business processes such as loan origination, insurance claims, and onboarding involve sequential steps where certain failures — invalid input, regulatory violations, business policy breaches — cannot be resolved by automated retries.
+An invalid Social Security number will never pass a credit check regardless of how often you retry it.
+A title search against a missing property identifier will return the same error no matter how many times you repeat it.
+These failures require someone to inspect the error, correct the data, and instruct the process to continue.
+
+Without a durable orchestration layer, teams typically implement this by writing failed records to a dead-letter queue, building a separate reconciliation service, and stitching together polling loops to resume processing.
+This approach is fragile: the process state lives across multiple systems, corrections are ad-hoc, and operators have no single view of which applications need attention and why.
+When something goes wrong, the common workaround is to fix the data and restart the entire process from the beginning, losing all progress from steps that already completed.
+
+## Solution
+
+You will use a Temporal Workflow to orchestrate a six-step home loan processing pipeline.
+When a step encounters a problem that retries alone cannot solve — a missing document, a policy limit breach, or a regulatory hold — it throws a non-retryable `ApplicationFailure` and the Workflow pauses, preserving all progress.
+Custom Search Attributes advertise the blocked state so that operators or AI agents can find and prioritize the right cases.
+An operator sends a Signal containing corrected data, which patches the application in-place and wakes the Workflow to retry the failed step.
+The process picks up where it left off: completed steps do not re-execute, and the full correction history is preserved.
+
+## Architectural benefit
+
+While you can use traditional architectures to implement a similar solution, they often involve significantly more complexity and demand higher implementation and operational costs. A Temporal-based solution is typically much leaner and more robust.
+
+
+
+## What you will achieve
+
+By completing this pattern, you will:
+
+- Pause a business process on a non-retryable failure and resume after correction without restarting from the beginning.
+- Unwind side-effecting steps through a LIFO **saga** compensation stack when forward progress is not possible — for example, a compliance block or an explicit cancellation from the applicant.
+- Recover from compensation failures using the same pause-and-fix loop so a stuck rollback does not leave the process in a partially unwound state.
+- Route blocked processes to the right human or automated agent using queryable Search Attributes.
+- Inspect the current state of any individual process or filter across all processes through a single API.
+- Maintain a complete audit trail of every data correction and every compensation applied during the lifecycle of each process.
+
+## Background and best practices
+
+Temporal Workflows persist their execution state, including local variables, call stacks, and pending Timers, to the Temporal Cluster.
+When a Workflow suspends on `condition()`, the Worker completes the Workflow Task and the execution can be evicted from the Worker's in-memory cache to make room for other Workflows, freeing the associated resources.
+The Workflow resumes where it left off when the condition becomes true, regardless of how much wall-clock time passes.
+This means thousands of loan applications can sit in a `PENDING_FIX` state simultaneously without consuming Worker capacity.
+
+The primary architectural challenge is distinguishing between transient failures that automated retries can resolve and permanent failures that require human intervention.
+Temporal addresses this with [`ApplicationFailure.nonRetryable()`](https://docs.temporal.io/references/failures#application-failure), which instructs the SDK to skip the retry policy and propagate the error to the Workflow code without delay.
+The Workflow then has full control over how to handle the failure: it can log the error, update Search Attributes for visibility, and suspend until a [Signal](https://docs.temporal.io/develop/typescript/message-passing#signals) arrives.
+
+Custom [Search Attributes](https://docs.temporal.io/visibility#search-attribute) provide the mechanism for routing blocked Workflows to the correct resolution resource.
+By upserting attributes like `LoanStatus` and `FailedActivity` at each state transition, you create a queryable index across all active Workflows.
+An operations dashboard can filter for `LoanStatus = 'PENDING_FIX' AND FailedActivity = 'runCreditCheck'` to show which applications need a credit-related data correction.
+This same query interface supports automated agents that poll for specific failure categories and apply corrections programmatically.
+
+Temporal [Queries](https://docs.temporal.io/develop/typescript/message-passing#queries) provide synchronous read access to the current state of any running Workflow.
+Unlike Search Attributes, which expose denormalized metadata for cross-Workflow queries, Queries return the full internal state of a single execution.
+Together, Search Attributes and Queries give you both the aggregate view across your entire pipeline and the detailed view into any individual Workflow.
+
+## Target audience
+
+Engineers familiar with Temporal foundations who need to handle long-running, complex business processes.
+
+## Prerequisites
+
+To execute the steps in this pattern, you must have:
+
+- **Required software, infrastructure, and tools:** Temporal TypeScript SDK v1.13.0 or later, Node.js v18 or later, Temporal CLI v1.6.1 or later.
+- **Resources and access privileges:** A running Temporal Cluster (local dev server or Temporal Cloud) with permissions to create custom Search Attributes and start Workflows.
+- **Required concepts:** Familiarity with Temporal Workflows, Activities, Signals, Queries, Search Attributes, and Workers.
+
+Install the Temporal CLI and start a local dev server:
+
+```bash
+# macOS
+brew install temporal
+
+# Or download directly from https://docs.temporal.io/cli#install
+
+# Start the local development server
+temporal server start-dev
+```
+
+This starts a Temporal Server on `localhost:7233` with the Web UI at `http://localhost:8233`.
+
+**Note:** This guide uses the TypeScript SDK, but the pattern applies to any Temporal SDK.
+
+## Architecture diagram
+
+The following diagram illustrates the flow of a loan application through the six-step pipeline, including the pause-and-resume cycle when an Activity encounters a permanent failure.
+
+```mermaid
+sequenceDiagram
+ participant Client
+ participant WF as Workflow
+ participant Act as Activities
+ participant Op as Operator / Agent
+
+ Client->>WF: Start homeLoanWorkflow
+ WF->>+Act: Execute Activity
+ Act-->>-WF: ApplicationFailure.nonRetryable()
+ WF->>WF: Set PENDING_FIX, await condition()
+
+ Note over Op, Act: Correction cycle
+ Op->>WF: Query blocked Workflows
+ WF-->>Op: PENDING_FIX list
+ Op->>WF: Signal: retry({key, value})
+ WF->>WF: Patch data, wake condition
+ WF->>+Act: Retry Activity
+ Act-->>-WF: Success
+
+ WF->>WF: Advance to next Activity
+```
+
+1. A client starts `homeLoanWorkflow` with the loan application data.
+2. The Workflow executes Activities sequentially. On a non-retryable failure, the Activity throws `ApplicationFailure.nonRetryable()`.
+3. The Workflow catches the error, sets Search Attributes to `PENDING_FIX`, and suspends via `await condition(() => retryRequested)`.
+4. An operator or agent queries for blocked Workflows, then sends a corrective Signal with the field name and new value.
+5. The Signal handler patches the data, records the fix, and wakes the Workflow.
+6. The Workflow retries the failed Activity. On success it advances; on failure the cycle repeats from step 3.
+
+## Implementation plan
+
+Prior to executing this plan, ensure your Temporal Cluster is running. The first step below creates the custom Search Attributes that the Workflow depends on.
+
+### Create custom Search Attributes
+
+Before writing any code, register the custom Search Attributes that your Workflow will use to advertise its state.
+These attributes enable the visibility queries that route blocked Workflows to the appropriate resolution resource.
+
+```bash
+temporal operator search-attribute create --name LoanStatus --type Keyword
+temporal operator search-attribute create --name FailedActivity --type Keyword
+```
+
+`LoanStatus` tracks the current pipeline stage: `STARTED`, `INCOME_VERIFIED`, `CREDIT_CHECKED`, `APPRAISAL_ORDERED`, `TITLE_SEARCHED`, `UNDERWRITTEN`, `CLOSED`, or `PENDING_FIX`.
+`FailedActivity` records the name of the Activity that caused the failure.
+Together, these two attributes allow you to write queries like `LoanStatus = 'PENDING_FIX' AND FailedActivity = 'runCreditCheck'` to find every application blocked on a credit check.
+
+### Define data models
+
+Create the interfaces that define the data flowing through Workflows and Activities.
+It is a best practice to use a single serializable input to Workflows and Activities.
+
+```typescript
+// src/models.ts
+
+/** Input data for the loan processing pipeline. */
+export interface LoanApplication {
+  applicationId: string;
+  applicantName: string;
+  ssn: string;
+  employerName: string;
+  annualIncome: number;
+  propertyAddress: string;
+  propertyId: string;
+  loanAmount: number;
+  downPayment: number;
+}
+
+/**
+ * Tracks the current pipeline stage.
+ * `PENDING_FIX` means the Workflow is waiting for a correction.
+ */
+export type LoanStatus =
+  | 'STARTED'
+  | 'INCOME_VERIFIED'
+  | 'CREDIT_CHECKED'
+  | 'APPRAISAL_ORDERED'
+  | 'TITLE_SEARCHED'
+  | 'UNDERWRITTEN'
+  | 'CLOSED'
+  | 'PENDING_FIX'
+  | 'FAILED';
+
+/** Records a single data correction applied during execution. */
+export interface FixEntry {
+  /** Which Activity was blocked. */
+  activity: string;
+  /** Which field was corrected. */
+  field: string;
+  oldValue: string;
+  newValue: string;
+  /** The failure message that triggered the fix. */
+  error: string;
+  /** Client-supplied idempotency key for duplicate Signal detection. */
+  id?: string;
+}
+
+/** Complete queryable state of a pipeline execution. */
+export interface LoanState {
+  status: LoanStatus;
+  failedActivity: string;
+  failureMessage: string;
+  /** Activities that have already succeeded. */
+  completedActivities: string[];
+  /** Audit trail of all corrections. */
+  fixHistory: FixEntry[];
+  /** Current data including patches. */
+  application: LoanApplication;
+}
+
+/** Signal payload: the field to correct and its new value. */
+export interface RetryUpdate {
+  /** Field to patch; omit (or send '') to request a retry without a patch. */
+  key?: keyof LoanApplication | '';
+  value?: string;
+  /** Optional idempotency key — Signals are at-least-once. */
+  id?: string;
+}
+```
+
+### Define the Activities
+
+Create functions that validate loan application data at each pipeline stage.
+Temporal Activities automatically recover from transient failures — network timeouts, temporary service unavailability, rate limits — through their built-in [Retry Policy](https://docs.temporal.io/encyclopedia/retry-policies) and [timeout management](https://docs.temporal.io/develop/typescript/failure-detection#activity-timeouts).
+When an Activity encounters a permanent failure that retries cannot fix — invalid input data, a policy violation, or a missing record — it throws `ApplicationFailure.nonRetryable()` to bypass the Retry Policy and propagate the error directly to the Workflow.
+
+```typescript
+// src/activities.ts
+
+import { ApplicationFailure } from '@temporalio/activity';
+
+/**
+ * Step 1: Validate the applicant's employment and income.
+ *
+ * @returns A human-readable confirmation message.
+ * @throws ApplicationFailure (non-retryable) when the employer is unrecognized
+ *   or the income is not a positive, finite number.
+ */
+export async function verifyIncome(
+  applicantName: string,
+  employerName: string,
+  annualIncome: number
+): Promise<string> {
+  // Unrecognized employer — cannot verify
+  if (employerName === 'UNKNOWN_EMPLOYER') {
+    throw ApplicationFailure.nonRetryable(
+      `Employer "${employerName}" not found in verification database for ${applicantName}`
+    );
+  }
+  // NaN fails every comparison, so check finiteness explicitly; otherwise a
+  // badly patched value would be reported as verified.
+  if (!Number.isFinite(annualIncome) || annualIncome <= 0) {
+    throw ApplicationFailure.nonRetryable(
+      `Invalid annual income: $${annualIncome} for ${applicantName}`
+    );
+  }
+  return `Income verified: ${applicantName} earns $${annualIncome}/yr at ${employerName}`;
+}
+
+/**
+ * Step 2: Pull credit report using the applicant's SSN.
+ *
+ * @returns A human-readable confirmation message.
+ * @throws ApplicationFailure (non-retryable) when the SSN is the all-zero
+ *   placeholder or shorter than the 11-character `NNN-NN-NNNN` form.
+ */
+export async function runCreditCheck(
+  applicantName: string,
+  ssn: string
+): Promise<string> {
+  // Malformed SSN — no retry will fix this
+  if (ssn === '000-00-0000' || ssn.length < 11) {
+    throw ApplicationFailure.nonRetryable(
+      `Invalid SSN "${ssn}" for ${applicantName} — cannot pull credit report`
+    );
+  }
+  return `Credit check passed for ${applicantName}: score 750`;
+}
+
+/**
+ * Step 3: Order a property appraisal.
+ *
+ * @returns A human-readable message with the appraised value (110% of the loan amount
+ *   in this example, rounded to the nearest dollar).
+ * @throws ApplicationFailure (non-retryable) when the property address is empty or invalid.
+ */
+export async function orderAppraisal(
+  propertyAddress: string,
+  loanAmount: number
+): Promise<string> {
+  // Invalid property address
+  if (propertyAddress === '' || propertyAddress === 'INVALID_ADDRESS') {
+    throw ApplicationFailure.nonRetryable(
+      `Cannot order appraisal — invalid property address: "${propertyAddress}"`
+    );
+  }
+  // Round so binary floating point does not leak into the message
+  // (400000 * 1.1 evaluates to 440000.00000000006).
+  const appraisedValue = Math.round(loanAmount * 1.1);
+  return `Appraisal completed for ${propertyAddress}: valued at $${appraisedValue}`;
+}
+
+/**
+ * Step 4: Verify the property title is clear.
+ *
+ * @returns A human-readable confirmation message.
+ * @throws ApplicationFailure (non-retryable) when the property ID is empty or `MISSING`.
+ */
+export async function performTitleSearch(
+  propertyId: string,
+  propertyAddress: string
+): Promise<string> {
+  // Missing or invalid property ID
+  if (propertyId === '' || propertyId === 'MISSING') {
+    throw ApplicationFailure.nonRetryable(
+      `Title search failed — missing or invalid property ID: "${propertyId}" for ${propertyAddress}`
+    );
+  }
+  return `Title is clear for property ${propertyId} at ${propertyAddress}`;
+}
+
+/**
+ * Step 5: Check debt-to-income ratio against lending policy.
+ *
+ * @returns A human-readable approval message including the computed DTI.
+ * @throws ApplicationFailure (non-retryable) when the DTI cannot be computed
+ *   or exceeds the 400% policy limit.
+ */
+export async function underwrite(
+  applicantName: string,
+  annualIncome: number,
+  loanAmount: number,
+  downPayment: number
+): Promise<string> {
+  const dti = ((loanAmount - downPayment) / annualIncome) * 100;
+  // NaN compares false against any limit, so without this guard a non-numeric
+  // input would be approved with "DTI NaN%".
+  if (Number.isNaN(dti)) {
+    throw ApplicationFailure.nonRetryable(
+      `Underwriting denied for ${applicantName} — cannot compute debt-to-income ratio from income ${annualIncome}, loan amount ${loanAmount}, down payment ${downPayment}`
+    );
+  }
+  // DTI above 400% exceeds policy limit — needs a larger down payment or lower loan amount
+  if (dti > 400) {
+    throw ApplicationFailure.nonRetryable(
+      `Underwriting denied for ${applicantName} — debt-to-income ratio ${dti.toFixed(0)}% exceeds 400% limit`
+    );
+  }
+  return `Underwriting approved for ${applicantName}: DTI ${dti.toFixed(0)}%`;
+}
+
+/**
+ * Step 6: Finalize and fund the loan.
+ *
+ * @returns A human-readable confirmation message; this example performs no validation.
+ */
+export async function closeLoan(
+  applicationId: string,
+  applicantName: string,
+  loanAmount: number
+): Promise<string> {
+  return `Loan ${applicationId} closed for ${applicantName}: $${loanAmount} funded`;
+}
+```
+
+Each Activity validates its inputs against business rules and throws `ApplicationFailure.nonRetryable()` when the data is fundamentally invalid.
+The `nonRetryable` designation is critical: it instructs the Temporal SDK to skip the retry policy entirely and propagate the error directly to the Workflow.
+This distinguishes failures that require human judgment — corrupt input, policy breaches, compliance blocks — from transient infrastructure failures that retries can resolve.
+
+### Implement the Workflow with the recoverableStep pattern
+
+You will now create the Workflow that orchestrates the six-step pipeline.
+The central mechanism is the `recoverableStep` helper function that wraps each Activity call in a pause-and-resume loop.
+
+```typescript
+// src/workflows.ts
+
+import {
+ proxyActivities,
+ defineSignal,
+ defineQuery,
+ setHandler,
+ condition,
+ upsertSearchAttributes,
+ log,
+ ActivityFailure,
+ isCancellation,
+} from '@temporalio/workflow';
+import { defineSearchAttributeKey } from '@temporalio/common';
+import type * as activities from './activities';
+import type { FixEntry, LoanApplication, LoanState, LoanStatus, RetryUpdate } from './models';
+
+// Typed Search Attribute keys — used in both upsertSearchAttributes and client start options
+const LoanStatusKey = defineSearchAttributeKey('LoanStatus', 'KEYWORD');
+const FailedActivityKey = defineSearchAttributeKey('FailedActivity', 'KEYWORD');
+
+// The type argument ties each proxy to the signature of the real Activity,
+// so argument and return types are checked at every call site.
+const {
+  verifyIncome,
+  runCreditCheck,
+  orderAppraisal,
+  performTitleSearch,
+  underwrite,
+  closeLoan,
+} = proxyActivities<typeof activities>({
+  // In production, split into multiple proxies and tune per Activity; long-running steps
+  // should also call heartbeat() with a heartbeatTimeout.
+  startToCloseTimeout: '10 seconds',
+  // Use the default retry policy for transient failures.
+  // ApplicationFailure.nonRetryable() bypasses retries for permanent failures.
+});
+
+// Signal to deliver corrected data; Query to read current pipeline state
+export const retrySignal = defineSignal<[RetryUpdate]>('retry');
+export const getStateQuery = defineQuery<LoanState>('getState');
+
+// LoanApplication fields that hold numbers; Signal values for these are parsed before patching.
+const NUMERIC_FIELDS = ['annualIncome', 'loanAmount', 'downPayment'] as const;
+type NumericField = (typeof NUMERIC_FIELDS)[number];
+
+// Narrows a field name to the numeric subset so the patch can be assigned without `any`.
+const isNumericField = (key: keyof LoanApplication): key is NumericField =>
+  (NUMERIC_FIELDS as readonly string[]).includes(key);
+
+/**
+ * Runs the six-step home loan pipeline, pausing on Activity failures until a
+ * `retry` Signal arrives and then retrying the failed step.
+ *
+ * @param application Initial loan application data; copied, never mutated.
+ * @returns The final pipeline state, including fix history and the patched application.
+ */
+export async function homeLoanWorkflow(application: LoanApplication): Promise<LoanState> {
+  // Mutable copy — patched in-place by the Signal handler when corrections arrive
+  const app: LoanApplication = { ...application };
+  let status: LoanStatus = 'STARTED';
+  let failedActivity = '';
+  let failureMessage = '';
+  let retryRequested = false;
+  const completedActivities: string[] = [];
+  const fixHistory: FixEntry[] = [];
+
+  // Copy of the full pipeline state; shared by the Query handler and the return value
+  const snapshot = (): LoanState => ({
+    status,
+    failedActivity,
+    failureMessage,
+    completedActivities: [...completedActivities],
+    fixHistory: [...fixHistory],
+    application: { ...app },
+  });
+
+  // Publish pipeline state as Search Attributes so operators can query across all Workflows
+  const updateStatus = (newStatus: LoanStatus, activity = '', message = '') => {
+    status = newStatus;
+    failedActivity = activity;
+    failureMessage = message;
+    upsertSearchAttributes([
+      { key: LoanStatusKey, value: newStatus },
+      { key: FailedActivityKey, value: activity },
+    ]);
+  };
+
+  // Query handler — returns a snapshot of the full pipeline state without side effects
+  setHandler(getStateQuery, snapshot);
+
+  // Signal handler — patches the application data and wakes the suspended Workflow.
+  // Signals are at-least-once: clients can retry on transport hiccups, so dedupe
+  // via the caller-supplied id stored on FixEntry.
+  setHandler(retrySignal, (update: RetryUpdate) => {
+    if (update.id && fixHistory.some((f) => f.id === update.id)) {
+      log.warn(`Duplicate retry signal ignored: ${update.id}`);
+      return;
+    }
+    const key = update.key;
+    if (key) {
+      const oldValue = String(app[key]);
+      if (isNumericField(key)) {
+        const parsed = parseFloat(update.value ?? '0');
+        if (!Number.isFinite(parsed)) {
+          // Storing NaN would slip past the Activities' numeric comparisons,
+          // so reject the patch and keep the Workflow paused.
+          log.warn(`Fix rejected for ${key}: "${update.value}" is not a number`);
+          return;
+        }
+        app[key] = parsed;
+      } else {
+        app[key] = update.value ?? '';
+      }
+      fixHistory.push({
+        activity: failedActivity,
+        field: key,
+        oldValue,
+        newValue: update.value ?? '',
+        error: failureMessage,
+        id: update.id,
+      });
+      log.info(`Fix received ${key}: ${oldValue} -> ${update.value}`);
+    } else {
+      log.info('Retry requested without patch');
+    }
+    retryRequested = true; // unblocks the condition() below
+  });
+
+  // Core pattern: wrap each Activity in a pause-and-resume loop.
+  // On failure, advertise PENDING_FIX via Search Attributes and suspend
+  // until a Signal delivers corrected data, then retry the same Activity.
+  const recoverableStep = async <T>(
+    activityName: string,
+    fn: () => Promise<T>
+  ): Promise<T> => {
+    while (true) {
+      try {
+        return await fn();
+      } catch (e) {
+        // Anything that isn't an ActivityFailure (workflow-side bug, non-determinism)
+        // is treated as a workflow task failure by Temporal and retried — let it surface.
+        if (!(e instanceof ActivityFailure)) throw e;
+        // Cancellation arrives wrapped as ActivityFailure; propagate so the workflow unwinds.
+        if (isCancellation(e)) throw e;
+        // Activity errors are wrapped in ActivityFailure; the original message is in .cause
+        const message = e.cause?.message || e.message || String(e);
+        log.warn(`Activity ${activityName} failed: ${message}`);
+        // Remember the stage reached so far so the status does not regress to
+        // STARTED once the fix arrives.
+        const statusBeforeFailure = status;
+        updateStatus('PENDING_FIX', activityName, message);
+        retryRequested = false;
+        // Suspend the Workflow — no resources consumed while waiting
+        await condition(() => retryRequested);
+        updateStatus(statusBeforeFailure);
+        log.info(`Retrying activity ${activityName} after fix`);
+      }
+    }
+  };
+
+  await recoverableStep('verifyIncome', () =>
+    verifyIncome(app.applicantName, app.employerName, app.annualIncome)
+  );
+  completedActivities.push('verifyIncome');
+  updateStatus('INCOME_VERIFIED');
+
+  await recoverableStep('runCreditCheck', () =>
+    runCreditCheck(app.applicantName, app.ssn)
+  );
+  completedActivities.push('runCreditCheck');
+  updateStatus('CREDIT_CHECKED');
+
+  await recoverableStep('orderAppraisal', () =>
+    orderAppraisal(app.propertyAddress, app.loanAmount)
+  );
+  completedActivities.push('orderAppraisal');
+  updateStatus('APPRAISAL_ORDERED');
+
+  await recoverableStep('performTitleSearch', () =>
+    performTitleSearch(app.propertyId, app.propertyAddress)
+  );
+  completedActivities.push('performTitleSearch');
+  updateStatus('TITLE_SEARCHED');
+
+  await recoverableStep('underwrite', () =>
+    underwrite(app.applicantName, app.annualIncome, app.loanAmount, app.downPayment)
+  );
+  completedActivities.push('underwrite');
+  updateStatus('UNDERWRITTEN');
+
+  await recoverableStep('closeLoan', () =>
+    closeLoan(app.applicationId, app.applicantName, app.loanAmount)
+  );
+  completedActivities.push('closeLoan');
+  updateStatus('CLOSED');
+
+  return snapshot();
+}
+```
+
+The `homeLoanWorkflow` orchestrates the entire loan processing pipeline.
+The `recoverableStep` helper function wraps each Activity call in a `while (true)` loop.
+When an Activity throws, the helper updates the `LoanStatus` Search Attribute to `PENDING_FIX` and the `FailedActivity` Search Attribute to the name of the failed Activity.
+It then calls `await condition(() => retryRequested)`, which suspends the Workflow until a Signal delivers corrected data.
+
+The catch block narrows the error before suspending. It first re-throws anything that is not an `ActivityFailure` — a Workflow-side bug or a non-determinism error should fail the Workflow Task and let Temporal retry it, not be silently parked in `PENDING_FIX`. It then re-throws on `isCancellation(e)`, since cancellation arrives wrapped as an `ActivityFailure` and must propagate so the framework can unwind the Workflow cleanly.
+
+The Signal handler receives a `RetryUpdate` containing the field name and corrected value.
+It patches the `app` object in-place, records the correction in `fixHistory`, and sets `retryRequested = true` to wake the suspended `condition()`.
+The loop then retries the Activity with the patched data.
+If the Activity succeeds, the loop exits and the pipeline advances.
+If it fails again, the cycle repeats.
+
+Temporal delivers Signals with at-least-once semantics: a client retry, a flaky network, or an over-eager UI can submit the same correction twice and the Workflow will see both copies.
+Without protection, the second delivery would overwrite the field again — usually a no-op, but harmful when the operator has since sent a different value, and always noisy in `fixHistory`.
+The handler defends against this by checking the caller-supplied `id` against the audit trail and dropping anything it has already applied.
+The client (the dashboard, CLI script, or upstream agent) is responsible for generating a stable `id` per logical correction — typically a UUID minted when the operator clicks **Patch and Retry**, reused across any retries of that same submission.
+[Workflow Updates](https://docs.temporal.io/encyclopedia/workflow-message-passing#updates) would give you this deduplication for free via the SDK's built-in `updateId` handling, along with a synchronous result to the caller; this pattern sticks with Signals because the dashboard is fire-and-forget and the audit trail already provides the dedup index.
+
+The Query handler returns the complete `LoanState` at any point during execution.
+This includes the current status, the list of completed Activities, the full fix history, and the current application data.
+External systems can poll this Query to display real-time pipeline progress.
+
+By calling `upsertSearchAttributes` at every state transition, the Workflow maintains a denormalized index that the Temporal Visibility API can query across all active Workflows.
+This enables an operations dashboard to display aggregate statistics and filter by any combination of status and failed Activity.
+
+### Add saga compensation for side-effecting steps
+
+Not every failure can be resolved by correcting input data.
+An OFAC match, a withdrawn offer, or a regulatory hold demands that the pipeline roll back rather than pause for a fix.
+This is where the [saga pattern](https://taonic.github.io/temporal-design-patterns/saga-pattern.html) comes in: each forward Activity that produces an external side effect — a credit bureau inquiry, an appraiser booking, a title company fee, a reserved lending slot, a funded loan — registers a compensating Activity **before** it executes.
+If the forward pipeline aborts, the Workflow unwinds the registered compensations in reverse order.
+
+The pattern has three key discipline points that the implementation must honor:
+
+1. **Register before execution.** Compensations are registered before the forward call to handle partial failures. Consider `orderAppraisal`: the Worker POSTs a booking to the appraisal vendor, the vendor records the booking and reserves the fee, then the response is lost to a network blip on the way back. The Activity throws because it never received a response — yet the booking exists on the vendor side. If the Workflow only registered the compensation after a successful forward call, that booking would be orphaned by a saga rollback. Pre-registering guarantees `cancelAppraisal` runs during rollback either way; idempotency makes it a safe no-op when the side effect never actually landed.
+2. **LIFO unwinding.** The last step to touch external state is the first to undo. In TypeScript this is expressed by pushing onto the front of the array with `unshift()` and iterating forward, or by iterating a `push()`-built array in reverse.
+3. **Recoverable compensation.** Compensations can fail — vendor APIs go down, external systems reject requests. Running each compensation through the same recoverable wrapper as forward steps means a stuck rollback pauses with `ROLLBACK_PENDING_FIX` and can be patched or retried by an operator.
+
+Extend the Workflow to register compensations alongside forward Activities:
+
+```typescript
+interface Compensation {
+  forwardActivity: string; // name of the forward step this entry undoes
+  compensationActivity: string; // name of the compensating Activity
+  run: () => Promise<unknown>; // invokes the compensating Activity
+}
+const compensations: Compensation[] = [];
+
+const runForward = async <T>(
+  activityName: string,
+  forward: () => Promise<T>,
+  compensation?: { name: string; fn: () => Promise<unknown> }
+): Promise<T> => {
+  if (cancelRequested) {
+    throw new Error(`Cancelled before ${activityName}: ${cancelReason}`);
+  }
+  // Register BEFORE execution — handles partial side effects if the Activity aborts mid-flight
+  if (compensation) {
+    compensations.unshift({ // newest first, so iterating forward unwinds in LIFO order
+      forwardActivity: activityName,
+      compensationActivity: compensation.name,
+      run: compensation.fn,
+    });
+  }
+  return recoverableStep(activityName, forward, 'forward');
+};
+```
+
+Two paths now trigger a rollback:
+
+**Explicit cancellation.** A `cancelApplication` Signal sets `cancelRequested = true` and wakes any paused `condition()`. The `recoverableStep` loop checks this flag on wake-up and throws out of the pause to the outer `try/catch`.
+
+**RollbackRequired failure type.** An Activity can throw `ApplicationFailure.nonRetryable(message, 'RollbackRequired')` to signal that no data correction will help — the application must be withdrawn. The Workflow catches this specific failure type and routes it to the compensation phase instead of the pause loop:
+
+```typescript
+export async function underwrite(
+ applicantName: string, ssn: string, ...
+): Promise {
+ if (ssn.startsWith('999')) {
+ throw ApplicationFailure.nonRetryable(
+ `Compliance block: OFAC/sanctions match — application must be withdrawn`,
+ 'RollbackRequired'
+ );
+ }
+ // ... normal DTI check
+}
+```
+
+The compensation loop runs in LIFO order through the same recoverable wrapper:
+
+```typescript
+try {
+  // ... forward pipeline: verifyIncome, runCreditCheck, orderAppraisal, ...
+} catch (err: any) {
+  const trigger = cancelReason || err.message || String(err); // an explicit cancel reason wins over the error text
+  updateStatus('COMPENSATING', '', trigger);
+
+  for (const comp of compensations) { // built with unshift(), so forward iteration is LIFO
+    // Run every registered compensation, even for a step that never reached completedActivities:
+    // a forward call that threw may still have landed its side effect; idempotency makes it a no-op otherwise.
+
+    const result = await recoverableStep(comp.forwardActivity, comp.run, 'compensation');
+    compensationHistory.push({
+      forwardActivity: comp.forwardActivity,
+      compensationActivity: comp.compensationActivity,
+      result,
+    });
+    compensatedActivities.push(comp.forwardActivity);
+    updateStatus('COMPENSATING');
+  }
+
+  updateStatus('ROLLED_BACK', '', trigger);
+}
+```
+
+Not every step needs a compensation.
+`verifyIncome` is a read-only lookup against the employer verification database — there is no external state to undo.
+`runCreditCheck` records a hard inquiry that lowers the applicant's score, so its compensation submits a withdrawal request.
+`orderAppraisal` books an appraiser and charges a fee — the compensation cancels the booking and issues a partial refund.
+`performTitleSearch` pays the title company and places a placeholder hold — the compensation releases the hold.
+`underwrite` reserves lending capacity against portfolio limits — the compensation returns the capacity to the pool.
+`closeLoan` is the most consequential: funds are disbursed and a lien is recorded at the county — the compensation initiates a clawback and files the lien release.
+
+Each compensation Activity is intentionally written to be idempotent.
+`withdrawCreditInquiry` simply resubmits the withdrawal request; the bureau accepts duplicates.
+`cancelAppraisal` checks for an existing booking before cancelling.
+`releaseTitleHold` is safe to call multiple times on an already-released hold.
+This means the "register before execution" discipline cannot corrupt state even if the forward Activity never produced the side effect.
+
+After the compensations finish, a `notifyApplicantCancelled` Activity runs to inform the applicant that the application was withdrawn and to surface the trigger reason.
+This step runs through the same recoverable wrapper as compensations: if the email provider is down, the Workflow pauses with `ROLLBACK_PENDING_FIX` so an operator can retry once the provider recovers rather than leaving the applicant uninformed.
+
+### Implement the web service
+
+Connect the Workflow to your application's API layer.
+You will write the code that receives incoming requests, starts Workflows, queries state, and sends corrective Signals.
+
+```typescript
+// src/web-service.ts
+
+import express from 'express';
+import { Connection, Client } from '@temporalio/client';
+import { defineSearchAttributeKey } from '@temporalio/common';
+import { homeLoanWorkflow, retrySignal, getStateQuery } from './workflows';
+import type { LoanApplication, RetryUpdate, LoanState } from './models';
+
+const LoanStatusKey = defineSearchAttributeKey('LoanStatus', 'KEYWORD');
+const FailedActivityKey = defineSearchAttributeKey('FailedActivity', 'KEYWORD');
+
+async function run() {
+ const connection = await Connection.connect({ address: 'localhost:7233' });
+ const client = new Client({ connection });
+
+ const app = express();
+ app.use(express.json());
+
+ app.get('/api/workflows', async (_req, res) => {
+ try {
+ const workflows: any[] = [];
+ const iterator = client.workflow.list({
+ query: `TaskQueue = 'recoverable-activity' AND ExecutionStatus != 'Terminated'`,
+ });
+ for await (const wf of iterator) {
+ const entry: any = {
+ workflowId: wf.workflowId,
+ wfStatus: wf.status.name,
+ // Read Search Attributes from the Visibility store (eventually consistent)
+ loanStatus: wf.searchAttributes?.LoanStatus?.[0] ?? '',
+ failedActivity: wf.searchAttributes?.FailedActivity?.[0] ?? '',
+ };
+ if (wf.status.name === 'RUNNING') {
+ const handle = client.workflow.getHandle(wf.workflowId);
+ // Query returns the strongly consistent internal state from the Workflow
+ entry.state = await handle.query(getStateQuery);
+ }
+ workflows.push(entry);
+ }
+ res.json({ workflows });
+ } catch (error) {
+ res.status(500).json({ error: (error as Error).message });
+ }
+ });
+
+  app.get('/api/workflows/search', async (req, res) => { // list Workflows filtered by Search Attributes
+    try {
+      const clauses = [`TaskQueue = 'recoverable-activity'`];
+      const filters = { FailedActivity: req.query.failedActivity, LoanStatus: req.query.status };
+      for (const [attribute, raw] of Object.entries(filters)) {
+        if (!raw) continue; // filter not supplied
+        // Accept only plain identifiers so request input can never rewrite the List Filter.
+        if (typeof raw !== 'string' || !/^\w+$/.test(raw)) throw new Error(`Invalid ${attribute} filter`);
+        clauses.push(`${attribute} = '${raw}'`);
+      }
+      const workflows: any[] = [];
+      const iterator = client.workflow.list({ query: clauses.join(' AND ') });
+      for await (const wf of iterator) {
+        workflows.push({
+          workflowId: wf.workflowId,
+          loanStatus: wf.searchAttributes?.LoanStatus?.[0] ?? '',
+          failedActivity: wf.searchAttributes?.FailedActivity?.[0] ?? '',
+        });
+      }
+      res.json({ workflows });
+    } catch (error) {
+      res.status(500).json({ error: (error as Error).message });
+    }
+  });
+
+ app.post('/api/workflows', async (req, res) => {
+ try {
+ const application = req.body as LoanApplication;
+ const handle = await client.workflow.start(homeLoanWorkflow, {
+ taskQueue: 'recoverable-activity',
+ workflowId: application.applicationId,
+ args: [application],
+ // Set initial Search Attributes so the Workflow is queryable from the start
+ typedSearchAttributes: [
+ { key: LoanStatusKey, value: 'STARTED' },
+ { key: FailedActivityKey, value: '' },
+ ],
+ });
+ res.json({ success: true, workflowId: handle.workflowId });
+ } catch (error) {
+ res.status(500).json({ error: (error as Error).message });
+ }
+ });
+
+ app.post('/api/workflows/:workflowId/fix', async (req, res) => {
+ try {
+ const { key, value } = req.body as RetryUpdate;
+ // getHandle creates a reference to a running Workflow — no server call yet
+ const handle = client.workflow.getHandle(req.params.workflowId);
+ // signal() delivers the correction asynchronously; the Workflow wakes and retries
+ await handle.signal(retrySignal, { key, value });
+ res.json({ success: true, message: `Fix sent: ${key} = ${value}` });
+ } catch (error) {
+ res.status(500).json({ error: (error as Error).message });
+ }
+ });
+
+ app.listen(3000, () => {
+ console.log('Recoverable Activity UI running on http://localhost:3000');
+ });
+}
+
+run().catch((err) => {
+ console.error(err);
+ process.exit(1);
+});
+```
+
+The search endpoint demonstrates how custom Search Attributes enable routing to human operators or automated agents for specific failure categories.
+
+### Create the Temporal Worker
+
+Now that you have built the Activities and Workflow, here is the complete Worker setup required to register and run the system.
+
+```typescript
+// src/worker.ts
+
+import { NativeConnection, Worker } from '@temporalio/worker';
+import * as activities from './activities';
+
+async function run() {
+ const connection = await NativeConnection.connect({ address: 'localhost:7233' });
+
+ const worker = await Worker.create({
+ connection,
+ namespace: 'default',
+ taskQueue: 'recoverable-activity',
+ workflowsPath: require.resolve('./workflows'),
+ activities,
+ });
+
+ console.log('Worker started, ctrl+c to exit');
+ await worker.run();
+}
+
+run().catch((err) => {
+ console.error(err);
+ process.exit(1);
+});
+```
+
+The `taskQueue` value must match the Task Queue used when starting Workflows from the client.
+
+Run the following commands sequentially, each in its own terminal:
+
+```bash
+temporal server start-dev
+```
+
+```bash
+npm start
+```
+
+### Launch the demo dashboard
+
+The project includes a single-page operations dashboard at [`public/index.html`](https://github.com/temporal-sa/validated-pattern-keep-business-moving/blob/main/public/index.html) that exercises the web service endpoints you built above. Start it alongside the Worker:
+
+```bash
+npm run web
+```
+
+Open `http://localhost:3000` in your browser. The dashboard provides:
+
+- **Stats bar** — live counts of Total, Pending Fix, Running, and Completed Workflows.
+- **Filters** — narrow by failed Activity or loan status, powered by the Search Attribute query endpoint.
+- **Pipeline visualization** — per-Workflow step indicators showing which Activities have completed, which failed, and which are pending.
+- **Detail panel** — click any Workflow to inspect its full state (via the Query API), view the error message, and send a corrective Signal through a form that pre-suggests which field to fix.
+- **Fix history** — an audit trail of every correction applied during the lifecycle of the Workflow.
+
+The dashboard polls the web service every 3 seconds and refreshes the detail panel in real time after you send a fix, so you can watch the pipeline advance step by step.
+
+### Test the Workflow execution
+
+To verify the pipeline and the pause-and-resume cycle, you must submit loan applications with known data errors and business rule violations.
+You will write a client script that starts Workflows covering single-issue and multi-issue failure scenarios.
+
+```typescript
+// src/client.ts
+
+import { Connection, Client } from '@temporalio/client';
+import { defineSearchAttributeKey } from '@temporalio/common';
+import { homeLoanWorkflow } from './workflows';
+import type { LoanApplication } from './models';
+
+const LoanStatusKey = defineSearchAttributeKey('LoanStatus', 'KEYWORD');
+const FailedActivityKey = defineSearchAttributeKey('FailedActivity', 'KEYWORD');
+
+const scenarios: { name: string; application: LoanApplication }[] = [
+ {
+ name: 'Clean — all activities pass',
+ application: {
+ applicationId: 'LOAN-001',
+ applicantName: 'Alice Johnson',
+ ssn: '123-45-6789',
+ employerName: 'Acme Corp',
+ annualIncome: 120000,
+ propertyAddress: '123 Oak St, Springfield',
+ propertyId: 'PROP-001',
+ loanAmount: 350000,
+ downPayment: 70000,
+ },
+ },
+ {
+ name: 'Bad SSN — credit check fails',
+ application: {
+ applicationId: 'LOAN-002',
+ applicantName: 'Bob Smith',
+ ssn: '000-00-0000',
+ employerName: 'TechCo',
+ annualIncome: 95000,
+ propertyAddress: '456 Elm Ave, Shelbyville',
+ propertyId: 'PROP-002',
+ loanAmount: 280000,
+ downPayment: 56000,
+ },
+ },
+ {
+ name: 'Invalid address — appraisal fails',
+ application: {
+ applicationId: 'LOAN-003',
+ applicantName: 'Carol Davis',
+ ssn: '987-65-4321',
+ employerName: 'HealthPlus',
+ annualIncome: 105000,
+ propertyAddress: 'INVALID_ADDRESS',
+ propertyId: 'PROP-003',
+ loanAmount: 320000,
+ downPayment: 64000,
+ },
+ },
+ {
+ name: 'Multi-issue — bad employer + bad SSN + invalid address + high DTI',
+ application: {
+ applicationId: 'LOAN-009',
+ applicantName: 'Irene Tanaka',
+ ssn: '000-00-0000',
+ employerName: 'UNKNOWN_EMPLOYER',
+ annualIncome: 40000,
+ propertyAddress: 'INVALID_ADDRESS',
+ propertyId: 'PROP-009',
+ loanAmount: 600000,
+ downPayment: 5000,
+ },
+ },
+];
+
+async function run() {
+ const connection = await Connection.connect({ address: 'localhost:7233' });
+ const client = new Client({ connection });
+
+ for (const scenario of scenarios) {
+ const handle = await client.workflow.start(homeLoanWorkflow, {
+ taskQueue: 'recoverable-activity',
+ workflowId: scenario.application.applicationId,
+ args: [scenario.application],
+ typedSearchAttributes: [
+ { key: LoanStatusKey, value: 'STARTED' },
+ { key: FailedActivityKey, value: '' },
+ ],
+ });
+ console.log(`Started: ${handle.workflowId} — ${scenario.name}`);
+ }
+}
+
+run().catch((err) => {
+ console.error(err);
+ process.exit(1);
+});
+```
+
+Execute the client from your command line:
+
+```bash
+npm run workflow
+```
+
+The script will complete and print the confirmation for each Workflow:
+
+```
+Started: LOAN-001 — Clean — all activities pass
+Started: LOAN-002 — Bad SSN — credit check fails
+Started: LOAN-003 — Invalid address — appraisal fails
+Started: LOAN-009 — Multi-issue — bad employer + bad SSN + invalid address + high DTI
+```
+
+After this output appears, `LOAN-001` completes without pausing because all data is valid.
+`LOAN-002` pauses at `runCreditCheck` because of the invalid Social Security number (SSN).
+`LOAN-003` pauses at `orderAppraisal` because of the invalid address.
+`LOAN-009` pauses at `verifyIncome` because of the unknown employer, and will require four separate corrections across four Activities to complete.
+
+### Send a corrective Signal
+
+To resume a paused Workflow, send a Signal containing the field name and corrected value.
+You can do this through the demo UI, the web service API, or the Temporal CLI.
+
+#### Using the demo UI
+
+The dashboard shows all Workflows and their pipeline status. Click a blocked Workflow to inspect the failure details and send a correction.
+
+
+
+Click a `PENDING_FIX` Workflow to open its detail panel. The UI shows the failure message and suggests a fix. Select the field to patch, enter the corrected value, and click **Patch and Retry** to send the Signal.
+
+
+
+After the Signal is delivered, the Workflow wakes up, retries the failed Activity with the corrected data, and advances through the remaining pipeline steps. The detail panel shows the updated state including the fix history.
+
+
+
+You can also verify the full event history — including the Signal and subsequent Activity completions — in the Temporal Web UI.
+
+
+
+#### Using the Temporal CLI
+
+```bash
+temporal workflow signal \
+ --workflow-id LOAN-002 \
+ --name retry \
+ --input '{"key":"ssn","value":"222-33-4444"}'
+```
+
+#### Using the web service API
+
+```bash
+curl -X POST http://localhost:3000/api/workflows/LOAN-002/fix \
+ -H 'Content-Type: application/json' \
+ -d '{"key":"ssn","value":"222-33-4444"}'
+```
+
+### Query pipeline status using Search Attributes
+
+Use the Temporal CLI or the web service to query blocked Workflows across the entire pipeline.
+
+Find all Workflows waiting for a fix:
+
+```bash
+temporal workflow list --query "LoanStatus = 'PENDING_FIX'"
+```
+
+Find all Workflows blocked on credit checks:
+
+```bash
+temporal workflow list \
+ --query "LoanStatus = 'PENDING_FIX' AND FailedActivity = 'runCreditCheck'"
+```
+
+Using the web service search endpoint:
+
+```bash
+curl "http://localhost:3000/api/workflows/search?failedActivity=runCreditCheck&status=PENDING_FIX"
+```
+
+These queries return results in near real time as Workflows update their Search Attributes; the Visibility store is eventually consistent, so a change can take a moment to appear.
+An operations dashboard can poll these endpoints to display aggregate statistics: the number of applications awaiting correction, which Activities cause the most failures, and how long applications sit in `PENDING_FIX` status.
+
+This query capability supports automated resolution agents.
+An AI agent can periodically query for Workflows blocked on a specific Activity, fetch the error message and application data through the Query API, apply a correction, and send the Signal.
+The same Search Attribute interface that routes work to human operators also routes work to automated agents.
+
+### Query individual Workflow state
+
+Use the Query API to inspect the full internal state of any running Workflow without affecting its execution.
+
+```bash
+temporal workflow query \
+ --workflow-id LOAN-009 \
+ --name getState
+```
+
+The Query returns the complete `LoanState` object:
+
+```json
+{
+ "status": "PENDING_FIX",
+ "failedActivity": "verifyIncome",
+ "failureMessage": "Employer \"UNKNOWN_EMPLOYER\" not found in verification database for Irene Tanaka",
+ "completedActivities": [],
+ "fixHistory": [],
+ "application": {
+ "applicationId": "LOAN-009",
+ "applicantName": "Irene Tanaka",
+ "ssn": "000-00-0000",
+ "employerName": "UNKNOWN_EMPLOYER",
+ "annualIncome": 40000,
+ "propertyAddress": "INVALID_ADDRESS",
+ "propertyId": "PROP-009",
+ "loanAmount": 600000,
+ "downPayment": 5000
+ }
+}
+```
+
+## Outcomes
+
+By following this guide, you have implemented a recoverable and compensatable pipeline including:
+
+- **Durable pause on permanent failure.** Activities throw `ApplicationFailure.nonRetryable()` to signal non-retryable failures — whether invalid data, a breached lending limit, or a blocked compliance check. The Workflow catches these errors and suspends using `await condition()`, consuming no Worker resources while waiting for correction. The Workflow resumes where it left off regardless of how much time passes.
+- **Signal-driven recovery.** External operators or automated agents send corrective Signals containing the field name and new value. The Signal handler patches the application data in-place, records the correction in an auditable fix history, and wakes the Workflow to retry the failed Activity.
+- **Saga compensation on abort.** When a `RollbackRequired` failure or a `cancelApplication` Signal aborts the forward pipeline, the Workflow unwinds registered compensations in LIFO order. Compensations are registered before execution and written to be idempotent so partial side effects are always cleaned up. Each compensation runs through the same recoverable wrapper, so a stuck rollback pauses with `ROLLBACK_PENDING_FIX` for operator intervention rather than leaving the process half-unwound.
+- **Search Attribute routing.** Custom Search Attributes `LoanStatus` and `FailedActivity` are updated at every state transition, creating a real-time queryable index across all active Workflows. Operations teams use visibility queries to filter blocked Workflows by failure type and route them to the appropriate resolver, whether a human operator or an automated agent.
+- **Full pipeline visibility.** Queries return the complete internal state of any running Workflow, including completed Activities, compensated Activities, fix history, compensation history, and the triggering cancellation reason. Search Attributes provide the denormalized aggregate view across all Workflows. Together, these mechanisms give you both the individual and cross-pipeline views needed to operate the system at scale.
+
+## Related resources
+- [Source code](https://github.com/temporal-sa/validated-pattern-keep-business-moving)
+- [Temporal TypeScript SDK Documentation](https://docs.temporal.io/develop/typescript) — Complete reference for building Workflows, Activities, and Workers with the TypeScript SDK.
+- [Temporal Signals](https://docs.temporal.io/develop/typescript/message-passing#signals) — Guide to defining and sending Signals to running Workflow Executions.
+- [Temporal Queries](https://docs.temporal.io/develop/typescript/message-passing#queries) — Guide to defining and handling synchronous read-only Queries on Workflows.
+- [Temporal Search Attributes](https://docs.temporal.io/visibility#search-attribute) — Reference for creating and using custom Search Attributes for Workflow visibility.
+- [Temporal Visibility](https://docs.temporal.io/visibility) — Overview of the Visibility subsystem and List Filter query syntax.
+- [ApplicationFailure Reference](https://docs.temporal.io/references/failures#application-failure) — API reference for `ApplicationFailure` including retryable and non-retryable error handling.
\ No newline at end of file
diff --git a/docs/guides/temporary-rate-limit-increases.mdx b/docs/guides/temporary-rate-limit-increases.mdx
new file mode 100644
index 0000000000..6c0b091894
--- /dev/null
+++ b/docs/guides/temporary-rate-limit-increases.mdx
@@ -0,0 +1,727 @@
+---
+id: temporary-rate-limit-increases
+title: Orchestrate temporary rate limit increases on your Namespaces
+description: Handle temporary spikes in usage by dynamically provisioning extra capacity when you need it, and deprovision it when you don't.
+sidebar_label: Temporary rate limit increases
+toc_max_heading_level: 2
+author: Taylor Khan
+tags:
+ - Workflows
+ - Activities
+ - Timers
+---
+
+This pattern provides the steps to dynamically provision and automatically deprovision capacity on Temporal Cloud.
+
+Applications often experience predictable or temporary spikes in throughput that require additional Temporal Cloud capacity. This pattern demonstrates how to grant time-limited increases to your capacity limits and guarantee release of those resources after a specific duration.
+
+Permanently allocating peak capacity requirements for a namespace results in unnecessary costs. However, manually adjusting capacity quotas before and after load spikes is prone to human error, risking either workflow throttling if limits are not raised in time, or runaway costs if operators forget to reduce limits after the workload subsides.
+
+You will use Temporal Workflows to orchestrate the provisioning process. A parent Workflow executes an Activity to raise the capacity limit, then starts an asynchronous Child Workflow configured with an abandon policy. The parent Workflow completes to unblock the client, while the Child Workflow waits for a designated duration before executing an Activity to revert the capacity to its original limit.
+
+By completing this pattern, you will:
+
+- Automate capacity management to reduce Temporal Cloud costs.
+- Guarantee the execution of cleanup operations using durable Timers.
+- Unblock client requests while long-running deprovisioning tasks continue in the background.
+
+## Background and best practices
+
+Temporal Cloud offers different capacity modes to accommodate varying workloads. When using provisioned capacity, you define the maximum throughput your Namespace can consume. Modifying this limit via an API call allows you to scale resources up for intensive tasks.
+
+The primary architectural challenge is ensuring that temporary capacity increases revert reliably. Temporal solves this by durably persisting the state of your Workflow, including any Timers it has started.
+
+By separating the provisioning and deprovisioning steps into distinct Workflows, you adhere to the best practice of returning control to the caller as soon as the provisioning command succeeds. The `ParentClosePolicyAbandon` setting instructs the Temporal Service to allow the Child Workflow to continue running even after the parent completes. This decouples the client's synchronous request from the long-running *Time-To-Live (TTL)* wait state. When the Child Workflow enters a sleep state, it consumes no Worker memory, allowing you to scale this pattern across thousands of concurrent requests efficiently.
+
+**Note:** Latency to the caller can be further reduced via [the Early Return pattern paired with an SDK call to update-with-start](https://docs.temporal.io/sending-messages#update-with-start).
+
+## Target audience
+
+This guide references the following roles:
+
+- **Temporal Workflow and Activity developers**: Implement the Go code for the Workflows and Activities and deploy the Worker processes.
+- **Platform operators**: Maintain the Temporal Namespace configuration and monitor usage against provisioned capacity limits.
+
+## Prerequisites
+
+To execute the steps in this pattern, you must have:
+
+- **Required software, infrastructure, and tools:** Temporal Go SDK v1.40.0 or later, Go v1.23 or later, Temporal CLI v1.6.1 or later.
+- **Resources & Access Privileges:** Temporal Cloud Account with Admin role for the target Namespace to access the Cloud Operations API.
+ - You will need to create an API key using your Temporal Cloud account to access the Cloud Operations API.
+- **Required Concepts:**
+ - Familiarity with
+ - Temporal Workflows
+ - Temporal Activities
+ - Temporal Child Workflows
+ - Temporal Timers
+ - Temporal Workers
+ - Temporal Cloud Namespaces
+
+**Note:** This pattern uses the Temporal Go SDK, which you need in order to replicate the steps exactly as written. However, any Temporal SDK supported by Temporal Cloud is sufficient to achieve the same outcomes outlined in this document.
+
+## People & process considerations
+
+### Platform operators
+
+The **Platform operators** own the Temporal Cloud Namespace and API keys. This team is responsible for the following:
+
+1. Generate and manage Temporal Cloud API keys required for the provisioning Activities.
+2. Monitor overall throughput and ensure the base provisioned capacity meets everyday operational needs.
+
+### Application developers
+
+The **Application developers** are the primary authors of the Temporal Workflows and Activities. This team will have the following responsibilities:
+
+1. Implement and test the Temporal code outlined in this pattern.
+2. Configure Worker scaling policies to handle the expected frequency of capacity adjustment requests.
+
+## Architecture diagram
+
+
+
+The following diagram illustrates the flow of the requests to provision capacity and handle automatic cleanup after a 60-minute time-to-live.
+
+1. A client application sends a gRPC request to the `my-service` application with the requested limit and namespace.
+2. The `my-service` application uses the Temporal SDK client to start the `provisionTRU` Workflow.
+3. The `provisionTRU` Workflow schedules the `addTRUs` Activity and waits for it to complete.
+4. The `provisionTRU` Workflow starts the `deprovisionTRU` asynchronous Child Workflow using a parent close policy of abandon.
+5. The `provisionTRU` Workflow completes, and `my-service` returns a success response to the client.
+6. The `deprovisionTRU` Child Workflow sets a Timer for 60 minutes.
+7. The Timer fires after the TTL expires.
+8. The `deprovisionTRU` Child Workflow schedules the `removeTRUs` Activity to revert the capacity limits.
+
+## Implementation plan
+
+In this implementation plan, you will build and verify the capacity management solution. You will begin by defining the required Activities to interact with the Temporal Cloud. Next, you will create the asynchronous Child Workflow responsible for the delayed cleanup. You will then orchestrate the process with the parent Workflow and expose it through a service handler. Finally, you will manually trigger the Workflow to simulate a client request and write tests using the Temporal Go SDK test suite to validate your orchestration logic locally.
+
+Prior to executing this plan, ensure you have your Temporal Cloud API key and your target Namespace name documented.
+
+### Define messages for Activities and Workflows
+
+Create the messages for passing to Temporal Workflows and Activities. It is a best practice to use a single serializable input to Workflows and Activities. You will create one for each Activity and Workflow, plus helper functions that derive the provisioning and deprovisioning identifiers from the Namespace name.
+
+```go
+// capacity/messages.go
+package capacity
+
+import "fmt"
+
+type ProvisionTRUInput struct {
+ Namespace string
+ APSLimit int32
+ MinutesToProvision int32
+}
+
+type DeprovisionTRUInput struct {
+ Namespace string
+ MinutesToProvision int32
+}
+
+type AddTRUInput struct {
+ Namespace string
+ APSLimit int32
+}
+
+type RemoveTRUInput struct {
+ Namespace string
+}
+
+func generateProvisioningId(namespace string) string {
+ return fmt.Sprintf("provision-%s", namespace)
+}
+
+func generateDeprovisioningId(namespace string) string {
+ return fmt.Sprintf("deprovision-%s", namespace)
+}
+
+```
+
+### Define the Activities
+
+You must create functions that interact with the Temporal Cloud Operations API to adjust capacity. Temporal Activities encapsulate these functions so that intermittent failures are retried automatically.
+
+```go
+// capacity/activities.go
+package capacity
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "fmt"
+ "go.temporal.io/sdk/temporal"
+ "io"
+ "net/http"
+ "strconv"
+)
+
+const (
+ defaultBaseURL = "https://saas-api.tmprl.cloud"
+)
+
+// Activities holds the dependencies required by the provisioning activities.
+type Activities struct {
+ HTTPClient *http.Client
+ BaseURL string
+ APIKey string
+}
+
+type getNamespaceResponse struct {
+ Namespace struct {
+ Spec json.RawMessage `json:"spec"`
+ ResourceVersion string `json:"resourceVersion"`
+ } `json:"namespace"`
+}
+
+type updateNamespaceRequest struct {
+ Spec json.RawMessage `json:"spec"`
+ ResourceVersion string `json:"resourceVersion"`
+}
+
+func (a *Activities) baseURL() string {
+ if a.BaseURL != "" {
+ return a.BaseURL
+ }
+ return defaultBaseURL
+}
+
+func (a *Activities) httpClient() *http.Client {
+ if a.HTTPClient != nil {
+ return a.HTTPClient
+ }
+ return http.DefaultClient
+}
+
+func (a *Activities) getNamespace(ctx context.Context, namespace string) (json.RawMessage, string, error) { // returns (spec, resourceVersion, error)
+	url := fmt.Sprintf("%s/cloud/namespaces/%s", a.baseURL(), namespace)
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return nil, "", err
+	}
+	req.Header.Set("Authorization", "Bearer "+a.APIKey)
+
+	resp, err := a.httpClient().Do(req)
+	if err != nil {
+		return nil, "", err
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, "", err
+	}
+	if resp.StatusCode != http.StatusOK {
+		// Surface the response body as the message and the HTTP status code as the error type.
+		return nil, "", temporal.NewApplicationError(string(body), strconv.Itoa(resp.StatusCode))
+	}
+	var result getNamespaceResponse
+	if err := json.Unmarshal(body, &result); err != nil {
+		return nil, "", err
+	}
+	return result.Namespace.Spec, result.Namespace.ResourceVersion, nil
+}
+
+func (a *Activities) updateNamespace(ctx context.Context, namespace string, spec json.RawMessage, resourceVersion string) error { // POSTs spec + resourceVersion
+	url := fmt.Sprintf("%s/cloud/namespaces/%s", a.baseURL(), namespace)
+
+	payload, err := json.Marshal(updateNamespaceRequest{
+		Spec:            spec,
+		ResourceVersion: resourceVersion,
+	})
+	if err != nil {
+		return err
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(payload))
+	if err != nil {
+		return err
+	}
+	req.Header.Set("Authorization", "Bearer "+a.APIKey)
+	req.Header.Set("Content-Type", "application/json")
+
+	resp, err := a.httpClient().Do(req)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		// Best effort: if reading the body fails, the status code is still reported as the error type.
+		body, _ := io.ReadAll(resp.Body)
+		return temporal.NewApplicationError(string(body), strconv.Itoa(resp.StatusCode))
+	}
+	return nil
+}
+
+// AddTRUs increases the provisioned capacity of the target namespace to the requested APS limit.
+func (a *Activities) AddTRUs(ctx context.Context, input AddTRUInput) error {
+	spec, resourceVersion, err := a.getNamespace(ctx, input.Namespace)
+	if err != nil {
+		return err // unwrapped, as in RemoveTRUs, so the status-code error type still reaches the retry policy
+	}
+
+	var specMap map[string]interface{}
+	if err := json.Unmarshal(spec, &specMap); err != nil {
+		return fmt.Errorf("unmarshal namespace spec: %w", err)
+	}
+	if specMap == nil { // a JSON null spec decodes to a nil map; writing to it would panic
+		specMap = map[string]interface{}{}
+	}
+	specMap["capacitySpec"] = map[string]interface{}{
+		"provisioned": map[string]interface{}{
+			// 1 TRU = 500 APS; convert the requested APS limit to TRUs.
+			"value": float64(input.APSLimit) / 500.0,
+		},
+	}
+
+	updatedSpec, err := json.Marshal(specMap)
+	if err != nil {
+		return fmt.Errorf("marshal updated spec: %w", err)
+	}
+
+	return a.updateNamespace(ctx, input.Namespace, updatedSpec, resourceVersion)
+}
+
+// RemoveTRUs reverts the target namespace to on-demand capacity mode.
+func (a *Activities) RemoveTRUs(ctx context.Context, input RemoveTRUInput) error {
+	spec, resourceVersion, err := a.getNamespace(ctx, input.Namespace)
+	if err != nil {
+		return err
+	}
+
+	var specMap map[string]interface{}
+	if err := json.Unmarshal(spec, &specMap); err != nil {
+		return fmt.Errorf("unmarshal namespace spec: %w", err)
+	}
+	if specMap == nil { // a JSON null spec decodes to a nil map; writing to it would panic
+		specMap = map[string]interface{}{}
+	}
+	specMap["capacitySpec"] = map[string]interface{}{
+		"onDemand": map[string]interface{}{},
+	}
+
+	updatedSpec, err := json.Marshal(specMap)
+	if err != nil {
+		return fmt.Errorf("marshal updated spec: %w", err)
+	}
+
+	return a.updateNamespace(ctx, input.Namespace, updatedSpec, resourceVersion)
+}
+```
+
+The `AddTRUs` and `RemoveTRUs` methods define the actions taken to alter the capacity. By grouping these as methods on an `Activities` struct, you can inject dependencies like API clients or logger instances. Each method accepts an input struct: `AddTRUInput` carries the target namespace and the desired APS limit, and `RemoveTRUInput` carries the target namespace.
+
+## Implement the deprovisioning Child Workflow
+
+To ensure capacity reverts after the given duration without consuming active computing resources, you will define a Workflow that sleeps for the specified duration and then executes the Activity that deprovisions the capacity.
+
+```go
+// capacity/deprovision_workflow.go
+package capacity
+
+import (
+ "go.temporal.io/sdk/temporal"
+ "time"
+
+ "go.temporal.io/sdk/workflow"
+)
+
+// DeprovisionTRUWorkflow sleeps for the TTL duration then reverts the namespace
+// to on-demand capacity. It is started as an asynchronous Child Workflow by
+// ProvisionTRUWorkflow and runs independently after the parent completes.
+func DeprovisionTRUWorkflow(ctx workflow.Context, input DeprovisionTRUInput) error {
+	if err := workflow.Sleep(ctx, time.Duration(input.MinutesToProvision)*time.Minute); err != nil {
+		return err
+	}
+
+	ao := workflow.ActivityOptions{
+		StartToCloseTimeout: 2 * time.Minute,
+		RetryPolicy: &temporal.RetryPolicy{
+			NonRetryableErrorTypes: []string{unauthorized, forbidden, badRequest},
+		},
+	}
+	ctx = workflow.WithActivityOptions(ctx, ao)
+
+	var a *Activities
+	// Pass the Activity its own declared input type, as ProvisionTRUWorkflow does for AddTRUs.
+	return workflow.ExecuteActivity(ctx, a.RemoveTRUs, RemoveTRUInput{Namespace: input.Namespace}).Get(ctx, nil)
+}
+```
+
+## Implement the provisioning Workflow
+
+You will now create the main orchestration Workflow. This Workflow applies the capacity increase, and sets the deprovisioning process to run with an "abandon policy," so it will run regardless of what happens to the parent Workflow.
+
+```go
+// capacity/provision_workflow.go
+package capacity
+
+import (
+ "go.temporal.io/sdk/temporal"
+ "net/http"
+ "strconv"
+ "time"
+
+ "go.temporal.io/api/enums/v1"
+ "go.temporal.io/sdk/workflow"
+)
+
+var (
+ unauthorized = strconv.Itoa(http.StatusUnauthorized)
+ forbidden = strconv.Itoa(http.StatusForbidden)
+ badRequest = strconv.Itoa(http.StatusBadRequest)
+)
+
+// ProvisionTRUWorkflow applies the requested capacity increase and then hands
+// cleanup to DeprovisionTRUWorkflow, an asynchronous Child Workflow started with
+// an abandon policy so that it keeps running after this parent returns.
+func ProvisionTRUWorkflow(ctx workflow.Context, input ProvisionTRUInput) error {
+	// 401, 403, and 400 responses are not retried; other failures use the default retries.
+	activityCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{
+		StartToCloseTimeout: 2 * time.Minute,
+		RetryPolicy: &temporal.RetryPolicy{
+			NonRetryableErrorTypes: []string{unauthorized, forbidden, badRequest},
+		},
+	})
+
+	var activities *Activities
+	addRequest := AddTRUInput{
+		Namespace: input.Namespace,
+		APSLimit:  input.APSLimit,
+	}
+
+	// Wait for the capacity change to be applied before scheduling its removal.
+	if err := workflow.ExecuteActivity(activityCtx, activities.AddTRUs, addRequest).Get(ctx, nil); err != nil {
+		return err
+	}
+
+	// The child is started with the abandon parent-close policy.
+	childCtx := workflow.WithChildOptions(ctx, workflow.ChildWorkflowOptions{
+		WorkflowID:        generateDeprovisioningId(input.Namespace),
+		ParentClosePolicy: enums.PARENT_CLOSE_POLICY_ABANDON,
+	})
+
+	deprovisionRequest := DeprovisionTRUInput{
+		Namespace:          input.Namespace,
+		MinutesToProvision: input.MinutesToProvision,
+	}
+	childFuture := workflow.ExecuteChildWorkflow(childCtx, DeprovisionTRUWorkflow, deprovisionRequest)
+
+	// Block only until the child execution is available; the child's own result
+	// is never awaited here.
+	var started workflow.Execution
+	return childFuture.GetChildWorkflowExecution().Get(ctx, &started)
+}
+```
+
+The `ProvisionTRUWorkflow` orchestrates the entire request. First, it synchronously waits for `AddTRUs` to complete to ensure the capacity is available before returning. Second, it configures `workflow.ChildWorkflowOptions` with `enums.PARENT_CLOSE_POLICY_ABANDON`. This prevents the Temporal Service from cancelling the Child Workflow when this parent completes. Finally, the code uses `GetChildWorkflowExecution().Get()` to block execution just until the Temporal Service confirms the Child Workflow has started. Once started, the parent completes, leaving the child to run independently.
+
+## Implement the service handler
+
+Connect the Workflow to your application's API layer. You will write the code that receives the incoming request and triggers the Workflow.
+
+```go
+// capacity/handler.go
+package capacity
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "go.temporal.io/sdk/client"
+)
+
+const TaskQueue = "capacity-management"
+
+// HandleProvisionRequest starts a ProvisionTRUWorkflow execution and returns
+// after the Temporal Service accepts the request. It does not wait
+// for the Workflow to complete.
+func HandleProvisionRequest(c client.Client, namespace string, apsLimit int32, minutesToProvision int32) error {
+	ctx := context.Background()
+	// openRun returns the latest run ID and whether it is still open; a failed lookup counts as not open.
+	openRun := func(workflowID string) (string, bool) {
+		desc, err := c.DescribeWorkflowExecution(ctx, workflowID, "")
+		info := desc.GetWorkflowExecutionInfo()
+		return info.GetExecution().GetRunId(), err == nil && info != nil && info.GetCloseTime() == nil
+	}
+	id := generateProvisioningId(namespace)
+	if _, open := openRun(id); open {
+		return errors.New("provisioning request already in-progress")
+	}
+	// Build the ID with the same helper ProvisionTRUWorkflow uses to start the child.
+	deprovisionId := generateDeprovisioningId(namespace)
+	if runID, open := openRun(deprovisionId); open {
+		if err := c.CancelWorkflow(ctx, deprovisionId, runID); err != nil {
+			return errors.Join(errors.New("unable to cancel deprovisioning workflow"), err)
+		}
+	}
+
+	options := client.StartWorkflowOptions{ID: id, TaskQueue: TaskQueue}
+	input := ProvisionTRUInput{
+		Namespace:          namespace,
+		APSLimit:           apsLimit,
+		MinutesToProvision: minutesToProvision,
+	}
+	we, err := c.ExecuteWorkflow(ctx, options, ProvisionTRUWorkflow, input)
+	if err != nil {
+		return fmt.Errorf("failed to start workflow: %w", err)
+	}
+	fmt.Printf("Successfully started provision workflow. ID: %s, RunID: %s\n",
+		we.GetID(), we.GetRunID())
+	return nil
+}
+```
+
+The `HandleProvisionRequest` function represents your gRPC or HTTP endpoint. It prepares the `client.StartWorkflowOptions` and calls `ExecuteWorkflow`. Because it does not wait on the result with `.Get()`, the function returns an acknowledgment to the client as soon as the Temporal Service accepts the Workflow initiation. `MinutesToProvision` determines how long the `DeprovisionTRUWorkflow` waits before provisioned capacity reverts.
+
+## Create the Worker Program
+
+Now that you have built the Activities and Workflows, here is the complete set of imports and definitions required to register and run the system on a Worker.
+
+```go
+// cmd/worker.go
+package main
+
+import (
+ "github.com/temporal-sa/temporary-rate-limit-increases/capacity"
+ "go.temporal.io/sdk/client"
+ "go.temporal.io/sdk/worker"
+ "log"
+ "os"
+)
+
+// main registers the capacity Workflows and Activities on a Worker and runs it until interrupted.
+func main() {
+	cloudAPIKey := os.Getenv("TEMPORAL_CLOUD_API_KEY")
+	if cloudAPIKey == "" {
+		log.Fatalln("TEMPORAL_CLOUD_API_KEY missing and required")
+	}
+
+	temporalClient, err := client.Dial(client.Options{})
+	if err != nil {
+		log.Fatalln("Unable to create Temporal client", err)
+	}
+	defer temporalClient.Close()
+
+	capacityWorker := worker.New(temporalClient, capacity.TaskQueue, worker.Options{})
+	// Both Workflows are served from the same Task Queue.
+	capacityWorker.RegisterWorkflow(capacity.ProvisionTRUWorkflow)
+	capacityWorker.RegisterWorkflow(capacity.DeprovisionTRUWorkflow)
+
+	// The Activities carry the Cloud API key they send as a bearer token.
+	capacityWorker.RegisterActivity(&capacity.Activities{
+		APIKey: cloudAPIKey,
+	})
+
+	if err = capacityWorker.Run(worker.InterruptCh()); err != nil {
+		log.Fatalln("Unable to start worker", err)
+	}
+}
+```
+
+Run the following commands sequentially, each in their own terminal:
+
+```bash
+temporal server start-dev
+```
+
+```bash
+export TEMPORAL_CLOUD_API_KEY={YOUR_TEMPORAL_CLOUD_API_KEY}
+go run cmd/worker.go
+```
+
+## Test the Workflow execution
+
+To verify the capacity provisioning process, you must simulate the client request. You will write a short script that initializes a Temporal client, defines the requested limits, and calls your service handler to trigger the parent Workflow. The script reads the target namespace you identified in the prerequisites section from the `TEMPORAL_CLOUD_NAMESPACE` environment variable and the provisioning duration from `MINUTES_TO_PROVISION`.
+
+```go
+// cmd/trigger.go
+package main
+
+import (
+ "github.com/temporal-sa/temporary-rate-limit-increases/capacity"
+ "go.temporal.io/sdk/client"
+ "log"
+ "os"
+ "strconv"
+)
+
+func main() {
+	c, err := client.Dial(client.Options{})
+	if err != nil {
+		log.Fatalln("Unable to create Temporal client", err)
+	}
+	defer c.Close()
+
+	targetNamespace := os.Getenv("TEMPORAL_CLOUD_NAMESPACE")
+	if targetNamespace == "" {
+		log.Fatalln("TEMPORAL_CLOUD_NAMESPACE missing and required")
+	}
+	minutesToProvisionRaw := os.Getenv("MINUTES_TO_PROVISION")
+	if minutesToProvisionRaw == "" {
+		log.Fatalln("MINUTES_TO_PROVISION missing and required")
+	}
+	// Parse with a 32-bit size so out-of-range values fail here instead of being truncated by int32().
+	minutesToProvision, err := strconv.ParseInt(minutesToProvisionRaw, 10, 32)
+	if err != nil {
+		log.Fatalln("Unable to parse MINUTES_TO_PROVISION: " + err.Error())
+	}
+	var newLimit int32 = 1000
+	err = capacity.HandleProvisionRequest(c, targetNamespace, newLimit, int32(minutesToProvision))
+	if err != nil {
+		log.Fatalln("Unable to execute provision request", err)
+	}
+}
+```
+
+The `main` function connects to the Temporal Service using `client.Dial`. It defines the `targetNamespace`, `minutesToProvision`, and `newLimit` variables to represent the data payload that a gRPC request would contain. It then passes the client and these variables to the `HandleProvisionRequest` function. When you execute this file while your Worker is running, the Temporal Service starts the `ProvisionTRUWorkflow`.
+
+Execute the script from your command line:
+
+```bash
+export MINUTES_TO_PROVISION=5
+export TEMPORAL_CLOUD_NAMESPACE={YOUR_TEMPORAL_NAMESPACE}
+go run cmd/trigger.go
+```
+
+The script will complete and print the confirmation from the handler:
+
+```bash
+Successfully started provision workflow. ID: provision-production-workload, RunID: {RUN_ID}
+```
+
+After this output appears, the Worker continues to process the provisioning Activity and the 5-minute sleep Timer in the background.
+
+**Note:** a 5-minute sleep time is used for testing the Workflow end-to-end. Set `MINUTES_TO_PROVISION` to your needs in production environments. We recommend at least 60 minutes.
+
+## Unit test the Workflow
+
+Verify that the Workflow orchestrates the Activities and Child Workflow without interacting with the live Temporal Service. The Temporal Go SDK provides a test environment that simulates time and cluster behavior, allowing you to validate your code logic locally.
+
+```go
+// capacity/provision_workflow_test.go
+package capacity
+
+import (
+ "testing"
+
+ "github.com/stretchr/testify/mock"
+ "github.com/stretchr/testify/require"
+ "go.temporal.io/sdk/testsuite"
+)
+
+func Test_ProvisionTRUWorkflow(t *testing.T) {
+	suite := &testsuite.WorkflowTestSuite{}
+	env := suite.NewTestWorkflowEnvironment()
+
+	// The parent starts this child, so the test environment has to know about it.
+	env.RegisterWorkflow(DeprovisionTRUWorkflow)
+	// Stub AddTRUs so the test makes no network calls.
+	var activities *Activities
+	env.OnActivity(activities.AddTRUs, mock.Anything, mock.Anything).Return(nil)
+
+	env.ExecuteWorkflow(ProvisionTRUWorkflow, ProvisionTRUInput{
+		Namespace:          "test-namespace",
+		APSLimit:           2000,
+		MinutesToProvision: 5,
+	})
+
+	// The parent must finish without error, and the stubbed Activity must have run.
+	require.True(t, env.IsWorkflowCompleted())
+	require.NoError(t, env.GetWorkflowError())
+	env.AssertExpectations(t)
+}
+```
+
+The `Test_ProvisionTRUWorkflow` function initializes a `testsuite.WorkflowTestSuite` and creates a `TestWorkflowEnvironment`. This environment simulates the Temporal Service in memory. Because the parent Workflow starts a Child Workflow asynchronously, you must register `DeprovisionTRUWorkflow` with the test environment so it does not fail when invoked.
+
+You then use `env.OnActivity` to mock the `AddTRUs` Activity, instructing it to return `nil` without making real network calls. After calling `env.ExecuteWorkflow`, the test uses `require.True` and `require.NoError` to guarantee the parent Workflow completes. Finally, `env.AssertExpectations` confirms that the mocked Activity executed as defined.
+
+Execute the test from your command line:
+
+```bash
+go test -v provision_workflow_test.go provision_workflow.go deprovision_workflow.go activities.go
+```
+
+The terminal will output the test results:
+
+```bash
+=== RUN Test_ProvisionTRUWorkflow
+--- PASS: Test_ProvisionTRUWorkflow (0.01s)
+PASS
+```
+
+Additional log lines from the test results are a non-issue.
+
+## Unit test the deprovisioning Workflow
+
+When testing long-running Workflows, waiting for real time to pass is inefficient. You must verify that the `DeprovisionTRUWorkflow` schedules the `RemoveTRUs` Activity after its Timer fires (5 minutes in this test, 60 minutes or more in production) without actually waiting that long during your test execution.
+
+```go
+// capacity/deprovision_workflow_test.go
+package capacity
+
+import (
+ "testing"
+
+ "github.com/stretchr/testify/mock"
+ "github.com/stretchr/testify/require"
+ "go.temporal.io/sdk/testsuite"
+)
+
+func Test_DeprovisionTRUWorkflow(t *testing.T) {
+	suite := &testsuite.WorkflowTestSuite{}
+	env := suite.NewTestWorkflowEnvironment()
+
+	// Stub RemoveTRUs so the test makes no network calls.
+	var activities *Activities
+	env.OnActivity(activities.RemoveTRUs, mock.Anything, mock.Anything).Return(nil)
+
+	env.ExecuteWorkflow(DeprovisionTRUWorkflow, DeprovisionTRUInput{
+		Namespace:          "test-namespace",
+		MinutesToProvision: 5,
+	})
+
+	// The Workflow must finish without error, and the stubbed Activity must have run.
+	require.True(t, env.IsWorkflowCompleted())
+	require.NoError(t, env.GetWorkflowError())
+	env.AssertExpectations(t)
+}
+```
+
+The `Test_DeprovisionTRUWorkflow` function uses the `testsuite.WorkflowTestSuite` to run the Child Workflow. When a Timer, such as the one created by the `workflow.Sleep` call in your code, blocks a Workflow Execution, the Temporal test environment automatically skips time forward. This mechanism causes the 5-minute sleep to complete instantly within the test framework.
+
+You configure `env.OnActivity` to mock the `RemoveTRUs` Activity, ensuring the test does not make live API calls to Temporal Cloud. After execution, `require.True` confirms the Workflow finished, and `env.AssertExpectations` guarantees that the deprovisioning Activity executed as expected after the simulated time elapsed.
+
+Execute the test from your command line:
+
+```bash
+go test -v deprovision_workflow_test.go deprovision_workflow.go activities.go
+```
+
+The terminal will output the test results:
+
+```bash
+=== RUN Test_DeprovisionTRUWorkflow
+--- PASS: Test_DeprovisionTRUWorkflow (0.01s)
+PASS
+ok command-line-arguments 0.015s
+```
+
+Additional log lines from the test results are a non-issue.
+
+## Outcomes
+
+By following this guide, you have implemented a time-bound capacity provisioning system including:
+
+- Requesting increased limits through a retryable Temporal Activity.
+- Using an asynchronous Child Workflow to decouple application responses from long-term cleanup tasks.
+- Using durable Timers to guarantee that capacity limits are reverted after the configured duration (60 minutes or more is recommended), ensuring you do not overpay for unused compute.
+
+## Related Resources
+
+- [Temporal Capacity Modes](https://docs.temporal.io/cloud/capacity-modes)
+- [Temporal Best Practices](https://docs.temporal.io/best-practices)
+- [Python Capacity Mode Sample](https://github.com/lainecsmith/temporal-cloud-ops-capacity-modes)
+- [Go Capacity Mode Sample](https://github.com/temporal-sa/temporary-rate-limit-increases)
\ No newline at end of file
diff --git a/docs/guides/worker-execution-affinity.mdx b/docs/guides/worker-execution-affinity.mdx
new file mode 100644
index 0000000000..83fb814f54
--- /dev/null
+++ b/docs/guides/worker-execution-affinity.mdx
@@ -0,0 +1,500 @@
+---
+id: worker-execution-affinity
+title: Ensure Activity execution on the same Worker
+description: Execute multiple Activities on the same Worker to guarantee data locality.
+sidebar_label: Worker execution affinity
+toc_max_heading_level: 2
+author: Cecil Phillip
+tags:
+ - Task Queues
+ - Routing
+ - Workers
+ - Resource requirements
+ - GPU computing
+---
+
+### Problem statement
+
+Many workflows require multiple Activities to execute on the same Worker to maintain data locality. Common scenarios include:
+
+- **File processing:** Download a file (Activity 1), process it (Activity 2), and upload it (Activity 3). The file exists on the Worker's local disk, and requiring another Worker to re-download multi-GB files from the original storage (or via an object store) is slow and expensive.
+- **ML model caching:** Load a large ML model into memory (Activity 1), then run multiple inference calls (Activities 2-N) using the cached model. Loading the model on each Worker wastes time and memory.
+- **Database connection pooling:** Establish expensive database connections that should be reused across multiple Activities in the same Workflow Execution.
+
+Without Worker affinity, Temporal distributes Activities across available Workers. This means:
+- Files downloaded in Activity 1 aren't available for Activity 2 (different Worker)
+- Network transfer overhead: 10GB video file must be re-fetched from the original storage or an object store (incurs large network transfer and egress costs)
+- Duplicate resource initialization: ML models loaded multiple times across Workers
+- Increased latency: Each Activity pays setup costs instead of reusing resources
+
+### Solution
+
+Use Worker-specific Task Queues to ensure all Activities in a workflow execute on the same Worker. To achieve this:
+
+1. Each Worker polls two Activity Task Queues: a shared queue and a unique queue (generated per Worker instance)
+2. The Workflow calls an Activity on the shared queue to discover an available Worker's unique queue name
+3. The Workflow routes all subsequent Activities to that Worker's unique queue
+4. All Activities execute on the same Worker, maintaining data locality
+
+### Outcomes
+
+- **Data locality:** Files downloaded in one Activity are immediately available to subsequent Activities on the same Worker
+- **Performance:** Eliminate repeated re-downloads from remote storage (multi-GB transfers)
+- **Resource efficiency:** Load expensive resources (ML models, DB connections) once per Workflow instead of per Activity
+- **Cost reduction:** Reduce network egress costs from repeated downloads from remote/object storage
+
+## Background and best practices
+
+### Task Queue fundamentals
+
+Task Queues in Temporal are dynamically created when first referenced. With this pattern a unique Task Queue is created per Worker instance (e.g., `file-processing-abc123`) that only that Worker polls.
+
+**Recommended practice:** Generate unique queue names using UUIDs (or the hostname if running in a containerized environment) to avoid collisions across Worker instances.
+
+### Worker-specific vs Worker sessions
+
+- **Go SDK** has a built-in [Worker Sessions API](https://docs.temporal.io/develop/go/sessions) that handles Worker-specific routing automatically
+- **Other SDKs** (Python, TypeScript, etc.) must implement the pattern manually using unique Task Queue names
+
+This pattern provides the same guarantees as Go's Sessions API for non-Go SDKs.
+
+### Worker failure handling
+
+If a Worker crashes while processing Activities on its unique queue:
+- Running activities will time out after `heartbeat_timeout` (if configured) or `start_to_close_timeout`
+- Retries and pending activities will wait in the unique queue until `schedule_to_start_timeout` expires
+- To recover, the Workflow catches the timeout error and can route to a different Worker (on a new unique task queue)
+
+**Recommended practice:** Set a short `heartbeat_timeout` (e.g., 30s) to detect crashes quickly, and a bounded `schedule_to_start_timeout` (e.g., 5m, the value used in the Workflow below) to stop waiting on dead queues.
+
+**Recommended practice:** Set appropriate `schedule_to_start_timeout` values to detect Worker failures quickly (e.g., 5 minutes for file processing).
+
+> Activity Executions in most Workflows are constrained by the Start-to-Close Timeout, which limits the maximum duration of a single attempt. Its value is set to slightly longer than the Activity should take to complete. The pattern described here also relies on the Schedule-to-Start Timeout, which limits the maximum amount of time that a Task may remain enqueued. Although otherwise seldom used, this Timeout is valuable here because it enables the system to detect a Worker crash. That is, when a Worker crashes, it will no longer dequeue Tasks and the Schedule-to-Start Timeout will be reached.
+
+### Operational considerations
+
+- **Queue proliferation:** Each Worker creates a unique queue. Monitor total queue count.
+- **Queue lifecycle:** Consider setting a reasonable `schedule_to_close_timeout` on Activities to bound how long work tied to a unique queue can remain active, or implement an explicit cleanup job to remove stale unique queues when Workers terminate.
+- **Worker scaling:** When scaling down, ensure Workers complete in-progress Workflows before [termination](https://docs.temporal.io/encyclopedia/workers/worker-shutdown)
+- **Monitoring:** Track Activities stuck in unique queues (indicates Worker crash/unavailability)
+
+## Target audience
+
+- **Temporal Workflow & Activity developers:** Implementing file processing and data-local workflows
+- **Platform operators:** Deploying and monitoring Worker-specific queue patterns
+- **Data engineers:** Building ETL pipelines with large file processing
+- **ML Engineers:** Deploying inference workflows with model caching
+
+This implementation requires code changes to Workers and Workflows, and consideration for Worker lifecycle management.
+
+## Prerequisites
+
+### Required software, infrastructure, and tools
+
+- Temporal Service (Self-hosted or Temporal Cloud)
+- Python 3.8 or later
+- Temporal Python SDK v1.0.0 or later (`pip install temporalio`)
+- File storage accessible to Workers (local disk, shared filesystem, or cloud storage)
+
+### Resources & access privileges
+
+- Temporal Namespace with permissions to start Workflows and register Workers
+- File storage with appropriate read/write permissions for Workers
+- Sufficient disk space on Worker hosts for file processing
+
+### Required concepts
+
+- Temporal Workflows, Activities, and Task Queues
+- Python async/await patterns
+- File I/O operations
+- Basic understanding of Worker lifecycle
+
+## Architecture diagram(s)
+
+### Worker-specific Task Queue pattern
+
+```mermaid
+sequenceDiagram
+ participant WF as Workflow
+ participant Shared as file-processing-shared Task Queue
+ participant W1 as Worker 1 (UUID: abc123)
+ participant Unique as file-processing-abc123 Task Queue
+ participant Disk as Worker 1 Local Disk
+
+ Note over WF: Need to process file on same worker
+
+ WF->>Shared: get_available_task_queue()
+ Shared->>W1: Activity dispatched
+ W1-->>WF: "file-processing-abc123"
+
+ Note over WF: Route all activities to Worker 1's unique queue
+
+ WF->>Unique: download_file(url)
+ Unique->>W1: Routed to Worker 1
+ W1->>Disk: Save file to /tmp/video.mp4
+ Note over Disk: 10GB video file
+ W1-->>WF: "/tmp/video.mp4"
+
+ WF->>Unique: process_file(path)
+ Unique->>W1: Same worker
+ W1->>Disk: Read /tmp/video.mp4
+ Note over W1: Process locally No network transfer!
+ W1->>Disk: Write /tmp/video-processed.mp4
+ W1-->>WF: "/tmp/video-processed.mp4"
+
+ WF->>Unique: upload_file(path)
+ Unique->>W1: Same worker
+ W1->>Disk: Read /tmp/video-processed.mp4
+ Note over W1: Upload to S3
+ W1->>Disk: Clean up files
+ W1-->>WF: "https://s3.../video.mp4"
+```
+
+## Implementation
+
+### Step 1: Define Task Queue constants
+
+**File: `task_queues.py`**
+
+```python
+"""Task Queue constants for worker-specific routing."""
+
+# Shared queue for discovering available workers
+FILE_PROCESSING_SHARED_QUEUE = "file-processing-shared"
+
+# Note: Unique per-worker queues are generated at runtime
+# Pattern: f"file-processing-{uuid.uuid4()}"
+```
+
+### Step 2: Configure Worker with shared and unique queues
+
+Each Worker polls two queues:
+1. Shared queue: Returns this Worker's unique queue name
+2. Unique queue: Handles file processing Activities
+
+**File: `worker_file_processing.py`**
+
+```python
+"""Worker with unique task queue for file processing affinity."""
+import asyncio
+import hashlib
+import httpx
+import uuid
+import logging
+import os
+from pathlib import Path
+from temporalio.client import Client
+from temporalio.exceptions import ApplicationError
+from temporalio.worker import Worker
+from temporalio import activity
+
+from task_queues import FILE_PROCESSING_SHARED_QUEUE
+
+logging.basicConfig(level=logging.INFO)
+
+# Generate a unique Task Queue name for this worker instance
+UNIQUE_WORKER_TASK_QUEUE = f"file-processing-{uuid.uuid4()}"
+
+
+@activity.defn
+async def get_available_task_queue() -> str:
+    """
+    Report the unique Task Queue name generated for this Worker process.
+
+    A Workflow runs this Activity on the shared queue, then sends its
+    remaining file operations to the queue name returned here.
+    """
+    activity.logger.info(f"Returning unique queue: {UNIQUE_WORKER_TASK_QUEUE}")
+    return UNIQUE_WORKER_TASK_QUEUE
+
+
+@activity.defn
+async def download_file(url: str) -> str:
+ """
+ Download file from URL and save to local disk.
+
+ Returns the local file path. Subsequent activities on the same Worker
+ can access this file without network transfer.
+ """
+
+ local_path = f"/tmp/temporal_file_{uuid.uuid4()}"
+
+ activity.logger.info(f"Downloading {url} to {local_path}")
+
+ async with httpx.AsyncClient() as client:
+ response = await client.get(url, timeout=300.0)
+ activity.heartbeat(f"Downloading {url}")
+
+ # Raise exception for 4xx/5xx errors
+ # By default, Temporal retries all application failures.
+ # Treat 4xx errors as non retryable
+ try:
+ response.raise_for_status()
+ except httpx.HTTPStatusError as e:
+ if 400 <= e.response.status_code < 500:
+ raise ApplicationError(f"Client error downloading file: {e}", non_retryable=True) from e
+ raise e
+
+ # Use asyncio.to_thread to avoid blocking the event loop
+ await asyncio.to_thread(Path(local_path).write_bytes, response.content)
+
+ activity.logger.info(
+ f"Downloaded {len(response.content)} bytes to {local_path}"
+ )
+
+ return local_path
+
+
+@activity.defn
+async def process_file(local_path: str) -> str:
+ """
+ Process the file that was downloaded by previous activity.
+
+ Because this runs on the same worker, the file is already present
+ on the local disk - no network transfer needed.
+ """
+ activity.logger.info(f"Processing file at {local_path}")
+
+ # Read the local file (async to avoid blocking the event loop)
+ content = await asyncio.to_thread(Path(local_path).read_bytes)
+
+ # Simulate processing (e.g., video transcoding, image transformation)
+ # Send periodic heartbeats to detect worker crashes quickly
+ processing_duration = 3 # seconds (simulated; real work could be minutes/hours)
+ heartbeat_interval = 1 # seconds
+
+ elapsed = 0
+ while elapsed < processing_duration:
+ await asyncio.sleep(heartbeat_interval)
+ elapsed += heartbeat_interval
+ # Send heartbeat periodically to prove the worker is alive
+ activity.heartbeat(f"Processing file at {local_path} - {elapsed}s elapsed")
+
+ # Compute checksum as proof of processing
+ checksum = hashlib.sha256(content).hexdigest()
+
+ result_path = f"{local_path}.processed"
+ activity.heartbeat(f"Writing results to {result_path}")
+ await asyncio.to_thread(Path(result_path).write_text, f"Processed. Checksum: {checksum}")
+
+ activity.logger.info(f"Processed file, checksum: {checksum[:16]}...")
+
+ return result_path
+
+
+@activity.defn
+async def upload_file(local_path: str) -> str:
+ """
+ Upload the processed file to destination.
+
+ Reads from local disk (no network transfer from previous Activities).
+ """
+ activity.logger.info(f"Uploading file from {local_path}")
+
+ # Use asyncio.to_thread for blocking file read
+ content = await asyncio.to_thread(Path(local_path).read_text)
+ activity.heartbeat(f"File read complete: {len(content)} bytes")
+
+ # Simulate upload to S3, GCS, etc. with periodic heartbeats
+ # In real scenarios: stream file, track upload progress, handle retries, etc.
+ upload_duration = 3 # seconds (simulated; real uploads could take minutes)
+ heartbeat_interval = 1 # seconds
+
+ elapsed = 0
+ while elapsed < upload_duration:
+ await asyncio.sleep(heartbeat_interval)
+ elapsed += heartbeat_interval
+ progress = int((elapsed / upload_duration) * 100)
+ activity.heartbeat(f"Uploading file - {progress}% complete")
+
+ upload_url = f"https://storage.example.com/results/{uuid.uuid4()}"
+ activity.logger.info(f"Uploaded to {upload_url}")
+
+ # Clean up local files with heartbeats
+ activity.heartbeat(f"Starting cleanup of {local_path}")
+ original_path = local_path.replace(".processed", "")
+ for path in [local_path, original_path]:
+ try:
+ os.remove(path)
+ activity.heartbeat(f"Cleaned up {path}")
+ activity.logger.info(f"Cleaned up {path}")
+ except FileNotFoundError:
+ pass
+
+ activity.heartbeat("Upload and cleanup complete")
+ return upload_url
+
+
+async def main():
+ config = ClientConfig.load_client_connect_config()
+ client = await Client.connect(**config)
+
+ # Worker for the shared queue
+ # Handles "get_available_task_queue" requests from workflows
+ shared_worker = Worker(
+ client,
+ task_queue=FILE_PROCESSING_SHARED_QUEUE,
+ activities=[get_available_task_queue],
+ )
+
+ # Worker for this process's unique queue
+ # Handles the actual file operations
+ unique_worker = Worker(
+ client,
+ task_queue=UNIQUE_WORKER_TASK_QUEUE,
+ activities=[download_file, process_file, upload_file],
+ max_concurrent_activities=5, # Limit based on disk I/O
+ )
+
+ logging.info(
+ f"Starting file processing Worker\n"
+ f" Shared queue: {FILE_PROCESSING_SHARED_QUEUE}\n"
+ f" Unique queue: {UNIQUE_WORKER_TASK_QUEUE}"
+ )
+
+ # Run both Workers concurrently
+ await asyncio.gather(
+ shared_worker.run(),
+ unique_worker.run(),
+ )
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+**Deployment guidance:**
+- Deploy multiple instances of this Worker (e.g., 5-10 instances)
+- Each instance generates its own unique queue UUID
+- Workers should have adequate disk space for file processing
+- Consider using local SSD storage for better I/O performance
+
+### Step 3: Implement Workflow with Worker affinity and retries
+
+When a Worker crashes, Activities scheduled to its unique queue will wait. Implement failure detection and recovery using a multi-layered resilience strategy:
+
+**Failure Detection Mechanisms:**
+
+1. **Heartbeat Timeout** (fastest): Activities periodically send heartbeats to signal they're alive. If a Worker crashes mid-execution, no heartbeats are sent, and the activity fails within seconds (e.g., 30s) rather than waiting for the full task duration.
+
+2. **Schedule-to-Start Timeout** (medium): If a Worker crashes before picking up a task from its queue, this timeout detects it within minutes (e.g., 5 min). Without it, the task would sit in the dead Worker's queue indefinitely, because the start-to-close timer only begins once a Worker actually picks the task up. This is critical for identifying dead Workers early.
+
+3. **Workflow-Level Retries** (recovery): The `try`/`except` inside the `for` loop catches any exception raised during an attempt, and the loop then reruns the entire download → process → upload sequence on a different Worker. This provides recovery after detecting a Worker failure—the workflow doesn't retry on the failed Worker's queue, but instead requests a new unique queue from a healthy Worker.
+
+**Failure Scenarios:**
+
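+- **Worker crashes mid-activity** → Detected by `heartbeat_timeout` (30s) → With the default Retry Policy the Activity is rescheduled on the same, now unpolled, unique queue → `schedule_to_start_timeout` (5 min) expires → Exception caught by try/except → Workflow retries on new Worker. To fail over immediately after the heartbeat timeout, set `retry_policy=RetryPolicy(maximum_attempts=1)` on the Activities that run on the unique queue.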
+- **Worker crashes before picking up task** → Detected by `schedule_to_start_timeout` (5 min) → Exception caught → Workflow retries on new Worker
+- **All Workers fail** → Loop exhausts max attempts → Workflow fails after 3 attempts (configurable)
+
+**Enhanced Workflow with fallback:**
+
+```python
+@workflow.defn
+class FileProcessingWorkflowWithFallback:
+ """File processing with Worker failure handling.
+
+ Uses a combination of:
+ - Heartbeats for fast crash detection (30s)
+ - Schedule-to-start timeout for dead Worker detection (5 min)
+ - Workflow-level retries to attempt on a different Worker
+ """
+
+ @workflow.run
+ async def run(self, file_url: str) -> str:
+ max_worker_attempts = 3
+
+ for attempt in range(max_worker_attempts):
+ try:
+ workflow.logger.info(
+ f"Attempt {attempt + 1}/{max_worker_attempts}"
+ )
+
+ # Get unique queue
+ unique_queue = await workflow.execute_activity(
+ get_available_task_queue,
+ task_queue=FILE_PROCESSING_SHARED_QUEUE,
+ start_to_close_timeout=timedelta(minutes=1),
+ )
+
+ # Process file on that Worker
+ local_path = await workflow.execute_activity(
+ download_file,
+ file_url,
+ task_queue=unique_queue,
+ start_to_close_timeout=timedelta(minutes=10),
+ # Key: schedule_to_start timeout detects dead workers before they pick up
+ schedule_to_start_timeout=timedelta(minutes=5),
+ # Key: heartbeat timeout detects worker crashes during execution (faster!)
+ heartbeat_timeout=timedelta(seconds=30),
+ )
+
+ processed_path = await workflow.execute_activity(
+ process_file,
+ local_path,
+ task_queue=unique_queue,
+ start_to_close_timeout=timedelta(minutes=30),
+ schedule_to_start_timeout=timedelta(minutes=5),
+ # Heartbeat detection for mid-execution worker crashes
+ heartbeat_timeout=timedelta(seconds=30),
+ )
+
+ upload_url = await workflow.execute_activity(
+ upload_file,
+ processed_path,
+ task_queue=unique_queue,
+ start_to_close_timeout=timedelta(minutes=10),
+ schedule_to_start_timeout=timedelta(minutes=5),
+ # Heartbeat detection for mid-execution worker crashes
+ heartbeat_timeout=timedelta(seconds=30),
+ )
+
+ return upload_url
+
+ except Exception as e:
+ workflow.logger.warning(
+ f"Worker attempt {attempt + 1} failed: {e}"
+ )
+ if attempt == max_worker_attempts - 1:
+ raise
+ # Try a different Worker
+ await workflow.sleep(timedelta(seconds=10))
+
+ raise RuntimeError("Failed to process file after all attempts")
+```
+
+
+## Conclusion
+
+By implementing worker-specific Task Queues for file processing, you have achieved:
+
+1. **Data locality:** Files downloaded in one Activity are immediately available to subsequent Activities on the same Worker, eliminating multi-GB network transfers
+
+2. **Performance improvement:** Reduced execution time by 80%+ for workflows processing large files (e.g., 10GB video file no longer transferred between Workers)
+
+3. **Resource efficiency:** Load expensive resources (ML models, database connections) once per workflow instead of per Activity, reducing memory usage and initialization overhead
+
+4. **Cost reduction:** Eliminated network egress costs from transferring large files between Workers, potentially saving thousands of dollars per month
+
+Your file processing workflows now guarantee that all Activities execute on the same Worker, maintaining data locality and maximizing performance.
+
+## Related resources
+
+### Official documentation
+- [Temporal Documentation - Task Routing](https://docs.temporal.io/task-routing)
+- [Temporal Documentation - Worker Sessions (Go SDK)](https://docs.temporal.io/develop/go/sessions)
+
+### Related patterns
+- [Separate Task Queues - Priorities](/guides/route-specialized-workloads) - For priority-based routing
+- [Separate Task Queues - Rate Limiting](/guides/rate-limit-downstream-apis) - For protecting downstream APIs
+
+### Code samples
+- [Worker-Specific Task Queues Sample (Python)](https://github.com/temporalio/samples-python/tree/main/worker_specific_task_queues)
+- [Worker-Specific Task Queues Sample (TypeScript)](https://github.com/temporalio/samples-typescript/tree/main/worker-specific-task-queues)
+- [Worker-Specific Task Queues Sample (Go)](https://github.com/temporalio/samples-go/tree/main/worker-specific-task-queues)
+- [Worker-Specific Task Queues Sample (.NET)](https://github.com/temporalio/samples-dotnet/tree/main/src/WorkerSpecificTaskQueues)
+- [Worker-Specific Task Queues Sample (Ruby)](https://github.com/temporalio/samples-ruby/tree/main/worker_specific_task_queues)
+
+### Community resources
+- [Forum: When to Use Multiple Task Queues](https://community.temporal.io/t/in-what-situation-should-we-use-multiple-separated-task-queues/1254)
+- [Forum: Using Dynamic Task Queues for Traffic Routing](https://community.temporal.io/t/using-dynamic-task-queues-for-traffic-routing/3045)
\ No newline at end of file
diff --git a/sidebars.js b/sidebars.js
index 83d1946a37..a55c850ede 100644
--- a/sidebars.js
+++ b/sidebars.js
@@ -1685,10 +1685,22 @@ module.exports = {
},
{
type: 'category',
- label: 'Interactive Demos',
+ label: 'Guides',
collapsed: true,
+ link: {
+ type: 'doc',
+ id: 'guides/index',
+ },
items: [
- 'develop/standalone-activities-interactive-demo',
+ 'guides/entity-pattern-loyalty-points',
+ 'guides/saga-pattern',
+ 'guides/route-specialized-workloads',
+ 'guides/worker-execution-affinity',
+ 'guides/temporary-rate-limit-increases',
+ 'guides/reliable-document-approvals',
+ 'guides/rate-limit-downstream-apis',
+ 'guides/durable-gaming-sessions',
+ 'guides/lock-shared-resources',
],
},
'integrations',
diff --git a/src/components/GuidesGrid/GuidesGrid.module.css b/src/components/GuidesGrid/GuidesGrid.module.css
new file mode 100644
index 0000000000..9c36cabb6c
--- /dev/null
+++ b/src/components/GuidesGrid/GuidesGrid.module.css
@@ -0,0 +1,234 @@
+.container {
+ --ig-pill-gradient: linear-gradient(255deg, #444ce7 0%, #8b46d4 100%);
+ --ig-pill-text-active: #fff;
+
+ display: flex;
+ flex-direction: column;
+ gap: 1.5rem;
+}
+
+:global([data-theme="light"]) .container {
+ --ig-search-bg: #fff;
+ --ig-search-border: rgba(15, 23, 42, 0.25);
+ --ig-search-text: #1e293b;
+ --ig-search-placeholder: #94a3b8;
+ --ig-focus-color: #444ce7;
+ --ig-focus-shadow: 0 0 0 3px rgba(68, 76, 231, 0.15);
+ --ig-pill-text: #475569;
+ --ig-pill-border: rgba(15, 23, 42, 0.25);
+ --ig-card-bg: #fff;
+ --ig-card-border: rgba(15, 23, 42, 0.15);
+ --ig-card-hover: 0 4px 24px rgba(68, 76, 231, 0.18), 0 0 0 1px rgba(68, 76, 231, 0.12);
+ --ig-badge-bg: #f1f5f9;
+ --ig-badge-text: #475569;
+ --ig-muted: #64748b;
+}
+
+:global([data-theme="dark"]) .container {
+ --ig-search-bg: #1a1a1a;
+ --ig-search-border: rgba(148, 163, 184, 0.35);
+ --ig-search-text: #f1f5f9;
+ --ig-search-placeholder: #64748b;
+ --ig-focus-color: #7c6aef;
+ --ig-focus-shadow: 0 0 0 3px rgba(124, 106, 239, 0.25);
+ --ig-pill-text: #cbd5e1;
+ --ig-pill-border: rgba(148, 163, 184, 0.35);
+ --ig-card-bg: #1a1a1a;
+ --ig-card-border: rgba(148, 163, 184, 0.4);
+ --ig-card-hover: 0 0 24px rgba(130, 90, 255, 0.25), 0 0 0 1px rgba(130, 90, 255, 0.2);
+ --ig-badge-bg: rgba(148, 163, 184, 0.15);
+ --ig-badge-text: #cbd5e1;
+ --ig-muted: #64748b;
+}
+
+/* Search */
+
+.searchWrapper {
+ position: relative;
+}
+
+.searchIcon {
+ position: absolute;
+ left: 1rem;
+ top: 50%;
+ transform: translateY(-50%);
+ color: var(--ig-search-placeholder);
+ pointer-events: none;
+ display: flex;
+ align-items: center;
+}
+
+.searchInput {
+ width: 100%;
+ padding: 0.75rem 1rem 0.75rem 2.75rem;
+ font-size: 1rem;
+ border: none;
+ border-radius: 0;
+ box-shadow: inset 0 0 0 1px var(--ig-search-border);
+ background: var(--ig-search-bg);
+ color: var(--ig-search-text);
+ outline: none;
+ transition: box-shadow 0.2s ease;
+}
+
+.searchInput::placeholder {
+ color: var(--ig-search-placeholder);
+}
+
+.searchInput:focus {
+ box-shadow: inset 0 0 0 1px var(--ig-focus-color), var(--ig-focus-shadow);
+}
+
+/* Filters */
+
+.filters {
+ display: flex;
+ flex-direction: column;
+ gap: 0.75rem;
+}
+
+.filterGroup {
+ display: flex;
+ flex-wrap: wrap;
+ align-items: center;
+ gap: 0.5rem;
+}
+
+.filterLabel {
+ font-size: 0.8rem;
+ font-weight: 600;
+ text-transform: uppercase;
+ letter-spacing: 0.05em;
+ color: var(--ig-badge-text);
+ margin-right: 0.25rem;
+}
+
+.pill {
+ display: inline-flex;
+ align-items: center;
+ padding: 0.375rem 0.875rem;
+ font-size: 0.85rem;
+ font-weight: 500;
+ border: none;
+ border-radius: 0;
+ box-shadow: inset 0 0 0 1px var(--ig-pill-border);
+ background: transparent;
+ color: var(--ig-pill-text);
+ cursor: pointer;
+ transition: background 0.15s ease, color 0.15s ease, box-shadow 0.15s ease;
+ user-select: none;
+}
+
+.pill:hover {
+ box-shadow: inset 0 0 0 1px var(--ig-focus-color);
+}
+
+.pillActive {
+ background: var(--ig-pill-gradient);
+ color: var(--ig-pill-text-active);
+ box-shadow: none;
+}
+
+/* Grid */
+
+.grid {
+ display: grid;
+ grid-template-columns: repeat(auto-fill, minmax(260px, 1fr));
+ gap: 1rem;
+}
+
+/* Card */
+
+.card {
+ display: flex;
+ flex-direction: column;
+ padding: 1.25rem;
+ background: var(--ig-card-bg);
+ border: 1px solid var(--ig-card-border);
+ border-radius: 0;
+ text-decoration: none;
+ color: inherit;
+ transition: box-shadow 0.2s ease;
+}
+
+.card:hover {
+ border-image: linear-gradient(255deg, #444ce7 0%, #b664ff 100%) 1;
+ box-shadow: var(--ig-card-hover);
+ text-decoration: none;
+}
+
+.card:focus-visible {
+ outline: 2px solid var(--ig-focus-color);
+ outline-offset: 2px;
+}
+
+.cardHeader {
+ display: flex;
+ align-items: center;
+ gap: 0.5rem;
+ margin-bottom: 0.5rem;
+}
+
+.cardName {
+ margin: 0;
+ font-size: 1rem;
+ font-weight: 600;
+ color: var(--ifm-color-emphasis-900);
+ display: inline-flex;
+ align-items: center;
+ gap: 0.375rem;
+}
+
+.sdkIcons {
+ display: flex;
+ align-items: center;
+ gap: 0.375rem;
+ margin-left: auto;
+}
+
+.sdkIcons svg {
+ width: 22px;
+ height: 22px;
+ pointer-events: none;
+}
+
+.externalIcon {
+ opacity: 0.6;
+ flex-shrink: 0;
+}
+
+.cardDescription {
+ font-size: 0.875rem;
+ line-height: 1.5;
+ color: var(--ifm-color-emphasis-700);
+ margin: 0 0 0.75rem 0;
+ flex: 1;
+}
+
+.cardMeta {
+ display: flex;
+ flex-wrap: wrap;
+ gap: 0.375rem;
+ margin-top: auto;
+}
+
+.badge {
+ display: inline-block;
+ padding: 0.2rem 0.5rem;
+ font-size: 0.7rem;
+ font-weight: 600;
+ text-transform: uppercase;
+ letter-spacing: 0.03em;
+ border-radius: 0;
+ background: var(--ig-badge-bg);
+ color: var(--ig-badge-text);
+}
+
+/* Empty state */
+
+.empty {
+ text-align: center;
+ padding: 3rem 1rem;
+ color: var(--ig-muted);
+ font-size: 1rem;
+}
diff --git a/src/components/GuidesGrid/guides-data.ts b/src/components/GuidesGrid/guides-data.ts
new file mode 100644
index 0000000000..c6e4326a5a
--- /dev/null
+++ b/src/components/GuidesGrid/guides-data.ts
@@ -0,0 +1,96 @@
+export type SDK = "Java" | "Python" | "TypeScript" | "Ruby" | "Go";
+
+export type Guide = {
+ name: string;
+ description: string;
+ tags: string[];
+ sdk?: SDK;
+ href: string;
+};
+
+const guides: Guide[] = [
+ {
+ name: "Customer loyalty program",
+ description:
+ "How to run a customer loyalty program with the entity pattern and durable workflows.",
+ tags: ["Entity Pattern"],
+ sdk: "Python",
+ href: "/guides/entity-pattern-loyalty-points",
+ },
+
+ {
+ name: "Recover without restart",
+ description:
+ "Build business processes that pause on errors and recover without restarting.",
+ tags: ["Saga Pattern"],
+ sdk: "TypeScript",
+ href: "/guides/saga-pattern",
+ },
+
+ {
+ name: "Route specialized workloads",
+ description:
+ "Direct resource-intensive workloads to appropriate Task Queues to optimize resources.",
+ tags: ["Task Queues"],
+ sdk: "Python",
+ href: "/guides/route-specialized-workloads",
+ },
+
+ {
+ name: "Worker execution affinity",
+ description:
+ "Direct multiple Activities to execute on the same Worker to maintain data locality.",
+ tags: ["Worker management"],
+ sdk: "Python",
+ href: "/guides/worker-execution-affinity",
+ },
+
+ {
+ name: "Temporary rate limit increases",
+ description:
+ "Handle temporary spikes in usage by dynamically provisioning extra capacity.",
+ tags: ["Namespace management"],
+ sdk: "Go",
+ href: "/guides/temporary-rate-limit-increases",
+ },
+
+ {
+ name: "Reliable document approvals",
+ description:
+ "Build durable human-in-the-loop Workflows.",
+ tags: ["Human-in-the-loop"],
+ sdk: "Python",
+ href: "/guides/reliable-document-approvals",
+ },
+
+ {
+ name: "Rate-limit downstream APIs",
+ description:
+ "Protect limited resources and avoid Workflow failures with separate Task Queues.",
+ tags: ["Workflow Management"],
+ sdk: "Python",
+ href: "/guides/rate-limit-downstream-apis",
+ },
+
+ {
+ name: "Durable gaming sessions",
+ description:
+ "Protect player sessions from backend failures by using the Actor pattern.",
+ tags: ["Actor Pattern"],
+ sdk: "Python",
+ href: "/guides/durable-gaming-sessions",
+ },
+
+ {
+ name: "Distributed locking",
+ description:
+ "Coordinate access to shared resources with a distributed lock.",
+ tags: ["Workflow Management"],
+ sdk: "Python",
+ href: "/guides/lock-shared-resources",
+ },
+
+
+];
+
+export default guides;
diff --git a/src/components/GuidesGrid/index.tsx b/src/components/GuidesGrid/index.tsx
new file mode 100644
index 0000000000..3967f005a9
--- /dev/null
+++ b/src/components/GuidesGrid/index.tsx
@@ -0,0 +1,195 @@
+import { useState, useMemo } from "react";
+import Link from "@docusaurus/Link";
+import clsx from "clsx";
+import guides, { type SDK, type Guide } from "./guides-data";
+import SdkSvg from "../elements/SdkSvgs/SdkSvg";
+import styles from "./GuidesGrid.module.css";
+
+const ALL_SDKS: SDK[] = ["Java", "Python", "Ruby", "TypeScript", "Go"];
+const LANGUAGE_AGNOSTIC = "Language-agnostic";
+type SdkFilter = SDK | typeof LANGUAGE_AGNOSTIC;
+const ALL_SDK_FILTERS: SdkFilter[] = [...ALL_SDKS, LANGUAGE_AGNOSTIC];
+
+const SDK_BLOCK_NAMES: Record<SDK, string> = { // Record<SDK, string>: a missing or misspelled SDK key is a compile error
+  Java: "javaBlock",
+  Python: "pythonBlock",
+  Ruby: "rubyBlock",
+  TypeScript: "typeScriptBlock",
+  Go: "goBlock",
+};
+
+const ALL_TAGS = Array.from(
+ new Set(guides.flatMap((i) => i.tags)),
+).sort();
+
+const FILTER_GROUPS = [
+ { label: "SDK", key: "sdks" as const, options: ALL_SDK_FILTERS as string[] },
+ { label: "Tag", key: "tags" as const, options: ALL_TAGS },
+];
+
+type FilterState = {
+ sdks: SdkFilter[];
+ tags: string[];
+};
+
+function isExternal(href: string): boolean {
+  return /^https?:\/\//.test(href); // true only for hrefs that begin with a lowercase http:// or https:// scheme
+}
+
+function SearchIcon() {
+ return (
+
+ );
+}
+
+function ExternalLinkIcon() {
+ return (
+
+ );
+}
+
+function GuideCard({ item }: { item: Guide }) {
+ const external = isExternal(item.href);
+ return (
+
+