From 05ed306d07cd0b385fdf0633da9e7157fbfe7c17 Mon Sep 17 00:00:00 2001
From: Wenting Wu <wentingwu@microsoft.com>
Date: Mon, 20 Apr 2026 14:17:13 -0400
Subject: [PATCH 1/2] docs: add long haul test design document

Add design document for the canary-based long haul test infrastructure
per issue #220. The design covers:

- 4-component harness (data plane workload, control plane operations,
  health monitor, event journal)
- Run-until-failure canary model on persistent AKS cluster
- Data integrity oracle with per-writer sequence tracking
- Per-operation outage policies and failure tiers
- Phased implementation plan

Based on research of Strimzi, CloudNative-PG, CockroachDB, and Vitess
long haul/soak test patterns.

Closes #220

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Wenting Wu <wentingwu@microsoft.com>
---
 docs/designs/long-haul-test-design.md | 415 ++++++++++++++++++++++++++
 1 file changed, 415 insertions(+)
 create mode 100644 docs/designs/long-haul-test-design.md

diff --git a/docs/designs/long-haul-test-design.md b/docs/designs/long-haul-test-design.md
new file mode 100644
index 00000000..e1ec4a01
--- /dev/null
+++ b/docs/designs/long-haul-test-design.md
@@ -0,0 +1,415 @@
+# Long Haul Test Design — DocumentDB Kubernetes Operator
+
+**Issue:** [#220](https://github.com/documentdb/documentdb-kubernetes-operator/issues/220)  
+**Status:** Design phase
+
+## Problem Statement
+
+The operator lacks continuous, long-running test coverage. Issue #220 requires:
+1. Constant writes/reads — ensure no data is lost
+2. Constant management operations (add/remove region, HA toggle, scale, backup/restore)
+3. Operator and cluster updates under load
+
+## Why Long Haul Testing?
+
+Problems that only surface over extended continuous operation:
+- **Memory/resource leaks** — need hours of reconciliation loops to see growth trends
+- **WAL accumulation / disk fill** — cleanup bugs take time to manifest
+- **Connection pool exhaustion** — gradual leak over many connect/disconnect cycles
+- **Reconciliation drift** — operator state slowly diverges after many operations
+- **Certificate rotation** — certs don't expire during 60-min CI runs
+- **Backup retention cleanup** — need to exceed retention period to verify pruning
+- **Pod restart cascades** — subtle race conditions under repeated scale/failover cycles
+- **Upgrade correctness under load** — data corruption from rolling restarts
+
+Existing 60-min E2E tests verify correctness of individual operations. Long haul tests verify **sustained reliability** — that the operator doesn't degrade over time.
+
+## Design Overview
+
+The design is based on research of Strimzi, CloudNative-PG, CockroachDB (roachtest), and Vitess soak test patterns. The common architecture across all projects: **separate workload generation from disruption injection, run them concurrently, verify correctness post-hoc**.
+
+We adopt the **run-until-failure (canary)** model inspired by Strimzi: the cluster runs indefinitely with continuous workload and operations. When something breaks — data loss, unrecoverable state, resource exhaustion — the test captures the failure, collects artifacts, and alerts the team. This answers the real question: **"what breaks first, and after how long?"**
+
+---
+
+## Architecture: 4 Components
+
+```
+┌─────────────────────────────────────────────────────────┐
+│                    Long Haul Test (Go/Ginkgo)             │
+│                                                          │
+│  ┌──────────────┐  ┌──────────────┐  ┌───────────────┐  │
+│  │ Data Plane   │  │ Control Plane│  │ Health Monitor │  │
+│  │ Workload     │  │ Operations   │  │ & Metrics      │  │
+│  │              │  │              │  │                │  │
+│  │ • Writers    │  │ • Scale      │  │ • Pod status   │  │
+│  │ • Readers    │  │ • Replication│  │ • CR conditions│  │
+│  │ • Verifiers  │  │ • Backup     │  │ • OTel metrics │  │
+│  │              │  │ • Upgrade    │  │ • Leak detect  │  │
+│  └──────┬───────┘  └──────┬───────┘  └───────┬───────┘  │
+│         │                 │                   │          │
+│         └─────────┬───────┴───────────────────┘          │
+│                   ▼                                      │
+│          ┌────────────────┐                              │
+│          │ Event Journal  │                              │
+│          │                │                              │
+│          │ • Op start/end │                              │
+│          │ • State changes│                              │
+│          │ • Error windows│                              │
+│          │ • Disruption   │                              │
+│          │   budgets      │                              │
+│          └────────────────┘                              │
+└─────────────────────────────────────────────────────────┘
+```
+
+### Component 1: Data Plane Workload
+
+**Purpose:** Continuous read/write traffic to detect data loss, corruption, and availability gaps.
+
+**Implementation:** Go with the official MongoDB driver (`go.mongodb.org/mongo-driver`), NOT shelling out to mongosh. This gives better cancellation/retry/context control over 24h+ runs.
+
+**Writer Model (Durability Oracle):**
+- Multiple writer goroutines, each with a unique `writer_id`
+- Each write: `{writer_id, seq, payload, checksum(payload), timestamp}`
+- Unique index on `(writer_id, seq)` to detect duplicates
+- Track three states per write: **attempted**, **acknowledged**, **verified**
+- Use `writeConcern: majority` for durability claims
+- Small percentage of **upserts/updates** (not just inserts) for broader coverage
+
+**Reader/Verifier Model:**
+- Periodic full-scan verification: no gaps in acknowledged sequences per writer
+- Checksum validation on read-back
+- Separate counters for: missing acknowledged writes, duplicates, stale reads, checksum mismatches
+- Use `readConcern: majority` to avoid false data-loss alarms caused by replica lag
+- Lag-aware: don't flag replication delay as data loss
+
+**Metrics Emitted:**
+- `longhaul_writes_attempted`, `longhaul_writes_acknowledged`, `longhaul_writes_failed`
+- `longhaul_reads_total`, `longhaul_reads_stale`, `longhaul_verification_failures`
+- `longhaul_write_latency_ms`, `longhaul_read_latency_ms`
+
+### Component 2: Control Plane Operations
+
+**Purpose:** Exercise management operations under continuous load.
+
+**Operation Categories:**
+
+| Operation | Type | Expected Disruption | Validation |
+|-----------|------|-------------------|------------|
+| Scale up (nodeCount++) | Topology | None | New pods ready, data accessible |
+| Scale down (nodeCount--) | Topology | Brief write pause | Remaining pods healthy, no data loss |
+| Enable replication | Replication | None | Replicas created, WAL streaming |
+| Disable replication | Replication | Brief | Standalone healthy |
+| Add region | Multi-region | None | New region catches up, data synced |
+| Remove region | Multi-region | Brief | Remaining regions healthy |
+| Toggle HA (localHA) | HA | Brief failover | Primary switches, writes resume |
+| On-demand backup | Backup | None | Backup CR reaches Completed |
+| Restore to new cluster | Backup | N/A (new cluster) | Restored data matches backup watermark |
+| Scheduled backup verify | Backup | None | Backups created on schedule |
+| Operator upgrade | Update | None (DB pods should NOT restart) | Operator pod rolls, cluster unaffected |
+| Cluster binary upgrade | Update | Rolling restart | Pods restart one-by-one, workload continues |
+| Schema upgrade | Update | Varies | Pre-backup, post-upgrade reads/writes OK |
+| Operator restart/leader failover | Chaos | Brief reconcile gap | Reconciliation resumes |
+| Pod eviction (simulating node drain) | Chaos | Brief | Pod rescheduled, workload resumes |
+
+**Sequencing Rules:**
+- Operations are NOT fully random — use **preconditions and cooldowns**
+- Cannot remove region if only 1 region exists
+- Cannot scale below minimum node count
+- Cooldown between disruptive ops (configurable, default 5 min)
+- Must reach steady state before next operation
+- Backup/restore is a **separate flow** (restore creates a NEW cluster, verifies, then cleans up)
+
+**Per-Operation Outage Policy:**
+```go
+type OutagePolicy struct {
+    AllowedDowntime     time.Duration  // e.g., 60s for failover
+    AllowedWriteFailures int           // tolerated write errors during window
+    MustRecoverWithin   time.Duration  // e.g., 5min to return to steady state
+}
+```
+
+### Component 3: Health Monitor & Metrics
+
+**Purpose:** Continuous cluster health observation + resource leak detection.
+
+**What to Monitor:**
+- **Kubernetes layer:** Pod readiness, restart counts, OOMKills, events
+- **CR layer:** DocumentDB status conditions, backup phase transitions
+- **Operator layer:** Operator logs/errors, reconciliation count, reconcile duration
+- **Database layer:** Connection count, WAL lag, replication status
+- **Resource layer:** Memory/CPU usage trends (via OTel/cAdvisor), PVC usage
+
+**Leak Detection:**
+- Sample memory/CPU at fixed intervals
+- Linear regression over last N samples
+- Alert if slope exceeds threshold (configurable)
+- 48-72h runs recommended for reliable leak detection
+
+**Steady State Definition:**
+```
+- All pods in Ready state
+- DocumentDB CR conditions: all True
+- Replication lag < threshold (if replicated)
+- No new pod restarts in last 5 min
+- Workload success rate > 99.9%
+- No unresolved backup failures
+```
+
+### Component 4: Event Journal
+
+**Purpose:** Central log correlating operations, disruptions, and errors for post-mortem analysis.
+
+**Every entry records:**
+- Timestamp
+- Event type (op_start, op_end, disruption_window_open, disruption_window_close, health_change, workload_error, verification_failure)
+- Operation ID
+- Cluster state snapshot (topology, pod count, primary node)
+- Associated errors (if any)
+
+**Key use case:** When a write failure occurs, the journal shows whether it happened during an expected disruption window (tolerable) or during steady state (bug).
+
+---
+
+## Canary Model
+
+The long-haul test is a **single persistent canary** running on a dedicated AKS cluster. Existing Kind-based integration tests (45-60 min, PR-gated) already cover short-lived validation — there is no need for a separate smoke mode.
+
+**Canary Cluster:**
+- 5 writers, 2 verifiers
+- Full operation cycle (scale, HA, replication, backup/restore, upgrades, chaos)
+- Runs indefinitely until a fatal failure occurs
+- On failure: collect artifacts, preserve cluster state for investigation
+- Key output: **MTTF** (mean time to failure) and failure classification
+- During development: test locally with `--max-duration=30m` against Kind
+
+### Failure Tiers
+
+| Tier | Example | Action |
+|------|---------|--------|
+| **Fatal** (stop test) | Acknowledged write lost, checksum mismatch, cluster unrecoverable >10min | Artifact dump + preserve cluster + exit non-zero |
+| **Degraded** (log + continue) | Operator pod restarted, brief write timeout during expected disruption | Log to journal, continue if recovery within budget |
+| **Warning** (monitor) | Memory trending up, reconcile latency increasing | Log warning, no stop |
+
+### Auto-Recovery Before Fatal Declaration
+- Operator crash → wait for K8s restart → continue if healthy within 5 min
+- Pod eviction → wait for reschedule → continue
+- Data loss or corruption → **immediate stop**, preserve cluster state for investigation
+
+### Future: Multi-Region Canary
+- Add/remove region operations, cross-region replication verification
+- AKS Fleet integration
+- Separate canary cluster or extension of single-cluster canary
+
+---
+
+## Directory Structure
+
+```
+operator/src/test/longhaul/
+├── main_test.go           # Ginkgo suite entry, profile selection
+├── config.go              # Configuration (duration, intervals, cluster, profile)
+├── workload/
+│   ├── writer.go          # Multi-writer with durability tracking
+│   ├── reader.go          # Reader + verifier
+│   └── oracle.go          # Data integrity oracle (acknowledged write tracking)
+├── operations/
+│   ├── scheduler.go       # Operation sequencer with preconditions/cooldowns
+│   ├── scale.go           # Scale up/down operations
+│   ├── replication.go     # Replication enable/disable, add/remove region
+│   ├── backup.go          # Backup create + restore-to-new-cluster verification
+│   ├── upgrade.go         # Operator, cluster binary, schema upgrades
+│   └── chaos.go           # Pod eviction, operator restart
+├── monitor/
+│   ├── health.go          # Cluster health checks
+│   ├── metrics.go         # OTel/Prometheus metric collection
+│   └── leakdetect.go      # Resource trend analysis
+├── journal/
+│   ├── journal.go         # Event journal with disruption window tracking
+│   └── policy.go          # Per-operation outage policies
+└── report/
+    ├── report.go          # Summary report generation
+    └── templates/         # Report templates (markdown/HTML)
+```
+
+---
+
+## Configuration
+
+```go
+type Config struct {
+    // Canary runs until failure; MaxDuration=0 means infinite.
+    // Use --max-duration=30m for local dev testing against Kind.
+    MaxDuration time.Duration
+
+    // Workload tuning
+    NumWriters   int           // default: 5
+    NumVerifiers int           // default: 2
+
+    // Operation scheduling
+    OpCooldown  time.Duration // min interval between disruptive ops
+    OpEnabled   []string      // which operations to enable
+
+    // Failure handling
+    RecoveryTimeout time.Duration // max time to wait for auto-recovery before fatal
+}
+```
+
+---
+
+## Deployment & Visibility
+
+### Approach
+
+The long haul test code is fully open source in the repository — anyone can run it. There is no requirement for a public-facing dashboard or scheduled CI workflow for the canary. This matches the pattern of most early-stage OSS projects; public dashboards (like Strimzi's Jenkins or CockroachDB's TeamCity) can be added later as the project matures.
+
+### Running the Canary
+
+**Local development (anyone):**
+```bash
+cd operator/src
+go test ./test/longhaul/ -v --max-duration=30m
+```
+Runs against whatever cluster your kubeconfig points to (Kind, Minikube, etc.).
+
+**Persistent canary (internal):**
+- Dedicated AKS cluster provisioned once (manually or via IaC)
+- Long haul test deployed as a Kubernetes Job on the same cluster (separate `longhaul` namespace)
+- On new operator release: re-deploy operator via Helm + restart longhaul Job
+- Internal Grafana/OTel dashboard for monitoring (optional)
+- Cluster preserved on failure for investigation
+
+### When Bugs Are Found
+
+Bugs discovered by the canary are filed as regular GitHub issues — no special process needed. The long haul test collects enough context (event journal, cluster state snapshot, failure details) to make issues actionable.
+
+### Auto-Upgrade
+
+A GitHub Actions workflow handles upgrading the operator on the canary AKS cluster automatically. It triggers on new operator releases and can also be triggered manually.
+
+```yaml
+on:
+  workflow_dispatch:        # manual trigger
+  release:
+    types: [published]      # auto-trigger on new operator release
+
+jobs:
+  upgrade-canary:
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write       # for Azure federated identity (OIDC)
+    steps:
+      - uses: actions/checkout@v4
+      - uses: azure/login@v2
+        with:
+          client-id: ${{ secrets.AZURE_CLIENT_ID }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+      - run: az aks get-credentials --resource-group $RG --name $CLUSTER
+      - run: helm upgrade documentdb-operator ./operator/documentdb-helm-chart
+      - run: |
+          kubectl delete job longhaul -n longhaul --ignore-not-found
+          kubectl apply -f test/longhaul/deploy/job.yaml
+          kubectl wait --for=condition=ready pod -l job-name=longhaul -n longhaul --timeout=120s
+```
+
+**Key points:**
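+- **AKS auth**: Azure federated identity (OIDC) — no client secret or password is stored; the GitHub secrets hold only identifiers (client, tenant, and subscription IDs), and access relies on a trust relationship between GitHub and Azure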
+- **Operator release** → workflow auto-triggers → Helm upgrade → restart longhaul Job
+- **Test code change** → rebuild longhaul image, trigger workflow manually via `workflow_dispatch`
+- **Audit trail**: Every upgrade is visible in GitHub Actions history
+
+---
+
+## Learnings from Other Projects
+
+| Project | Key Pattern We Adopt | Key Pattern We Skip |
+|---------|---------------------|-------------------|
+| **Strimzi** | Run-until-failure loops; metrics collection; CI profiles | JUnit (we use Ginkgo) |
+| **CloudNative-PG** | Ginkgo framework; failover via pod delete + SIGSTOP; LSN verification | Single-sequence failover (we need continuous concurrent workload) |
+| **CockroachDB** | Chaos runner (periodic kill/restart); separate workload from disruption; roachstress repeated runs | Custom roachtest framework (too heavy for our needs) |
+| **Vitess** | Background stress goroutine; per-query tracking; Go native driver | No fault injection (we need disruptive ops) |
+
+**Universal pattern adopted:** Separate workload generators from disruption injectors, run concurrently, verify correctness against an acknowledged-write oracle, use per-operation disruption budgets. Run-until-failure (Strimzi model) rather than time-bounded.
+
+---
+
+## Implementation Phases
+
+Each phase is a self-contained, demoable increment (~1-2 PRs each).
+
+### Phase 1a: Project Skeleton + Config
+- `test/longhaul/` directory structure, Ginkgo suite entry point
+- Config loading (`--max-duration`, writer count, cooldowns, operation list)
+- Can run against a cluster (does nothing yet)
+
+### Phase 1b: Data Plane Workload
+- Multi-writer goroutines with durability oracle
+- Reader/verifier with gap, duplicate, and checksum detection
+- Metrics counters (writes attempted/acknowledged/failed, reads, verification failures)
+
+### Phase 1c: Event Journal
+- Central event log (op_start, op_end, health_change, workload_error, etc.)
+- Disruption window tracking (expected vs unexpected errors)
+- In-memory + file-backed for post-mortem
+
+### Phase 1d: Health Monitor
+- Pod readiness, restart counts, OOMKills
+- DocumentDB CR status conditions
+- Steady-state detection (all healthy, no recent restarts, workload success rate OK)
+
+### Phase 1e: Scale Operations
+- Scale up/down with precondition checks
+- Per-operation outage policy enforcement
+- First control plane operation — validates the operation scheduler pattern
+
+### Phase 1f: Summary Report
+- Markdown report on exit (pass/fail, duration, stats, operation timeline)
+- Event journal dump
+- Testable locally: `go test ./test/longhaul/ -v --max-duration=30m` against Kind
+
+### Phase 2a: Backup & Restore Operations
+- On-demand backup creation + wait for completion
+- Restore to new cluster + data verification against backup watermark
+- Cleanup of restored cluster
+
+### Phase 2b: HA & Replication Operations
+- Toggle HA (localHA)
+- Enable/disable replication
+- Precondition checks (e.g., cannot disable replication if the DocumentDB cluster is already standalone)
+
+### Phase 2c: Upgrade Operations
+- Operator upgrade (Helm)
+- Cluster binary upgrade (documentDBVersion)
+- Schema upgrade (schemaVersion)
+- Each tested separately with outage policy
+
+### Phase 2d: Chaos Operations
+- Pod eviction (simulating node drain)
+- Operator restart / leader failover
+
+### Phase 2e: Failure Tiers + Auto-Recovery
+- Fatal / degraded / warning classification
+- Auto-recovery logic (wait for K8s restart before declaring fatal)
+- Cluster state preservation on fatal failure
+
+### Phase 2f: AKS Deployment
+- Dockerfile for longhaul test image
+- Kubernetes Job manifest, RBAC (ServiceAccount, ClusterRole, Binding)
+- ConfigMap for tuning parameters
+- Deploy script / instructions
+
+### Phase 2g: Auto-Upgrade Workflow
+- GitHub Actions workflow (triggered on release + manual dispatch)
+- Azure OIDC auth, Helm upgrade, Job restart
+
+### Phase 3: Multi-Region Canary
+- Add/remove region operations
+- Cross-region replication verification
+- AKS Fleet integration
+
+---
+
+## Open Questions
+1. What AKS cluster/subscription should be used for the dedicated canary cluster?
+2. Desired SLO targets (e.g., 99.9% write success during steady state)?

From 158a393563aeba01b2d335ccad4bbc3ab766554e Mon Sep 17 00:00:00 2001
From: Wenting Wu <wentingwu@microsoft.com>
Date: Wed, 22 Apr 2026 11:05:54 -0400
Subject: [PATCH 2/2] feat: add long-haul test skeleton and configuration
 (Phase 1a)

Add the project skeleton for long-haul (canary) tests:

- config/config.go: Config struct with env var loading, defaults, and validation
- config/config_test.go: Comprehensive tests for all config options (15 specs)
- config/suite_test.go: Ginkgo suite entry for config unit tests
- suite_test.go: Ginkgo suite entry point for the canary
- longhaul_test.go: BeforeSuite with LONGHAUL_ENABLED skip gate + placeholder
- README.md: Usage guide for running locally and in CI

Config is in a sub-package so config unit tests run independently of the
long-running canary. Tests are gated behind LONGHAUL_ENABLED=true env var,
so go test ./... safely skips them.

Part of #220

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Wenting Wu <wentingwu@microsoft.com>
---
 docs/designs/long-haul-test-design.md | 362 +++++++++++++++++++++-----
 test/longhaul/README.md               | 105 ++++++++
 test/longhaul/config/config.go        |  87 +++++++
 test/longhaul/config/config_test.go   | 157 +++++++++++
 test/longhaul/config/suite_test.go    |  16 ++
 test/longhaul/go.mod                  |  23 ++
 test/longhaul/go.sum                  |  69 +++++
 test/longhaul/longhaul_test.go        |  40 +++
 test/longhaul/suite_test.go           |  16 ++
 9 files changed, 803 insertions(+), 72 deletions(-)
 create mode 100644 test/longhaul/README.md
 create mode 100644 test/longhaul/config/config.go
 create mode 100644 test/longhaul/config/config_test.go
 create mode 100644 test/longhaul/config/suite_test.go
 create mode 100644 test/longhaul/go.mod
 create mode 100644 test/longhaul/go.sum
 create mode 100644 test/longhaul/longhaul_test.go
 create mode 100644 test/longhaul/suite_test.go

diff --git a/docs/designs/long-haul-test-design.md b/docs/designs/long-haul-test-design.md
index e1ec4a01..7e916549 100644
--- a/docs/designs/long-haul-test-design.md
+++ b/docs/designs/long-haul-test-design.md
@@ -1,14 +1,23 @@
 # Long Haul Test Design — DocumentDB Kubernetes Operator
 
 **Issue:** [#220](https://github.com/documentdb/documentdb-kubernetes-operator/issues/220)  
-**Status:** Design phase
+**Status:** In progress (Phase 1a complete)
+
+## Terminology
+
+This document refers to two kinds of cluster:
+
+- **DocumentDB cluster** — the database cluster managed by the operator (the `DocumentDB` Custom Resource and its pods).
+- **Kubernetes cluster** (or **AKS cluster**, **Kind cluster**) — the infrastructure cluster where the operator and DocumentDB run.
+
+When unqualified, "cluster" in the context of operations, health, and state refers to the **DocumentDB cluster**. Infrastructure clusters are always qualified (AKS, Kind, etc.).
 
 ## Problem Statement
 
 The operator lacks continuous, long-running test coverage. Issue #220 requires:
 1. Constant writes/reads — ensure no data is lost
 2. Constant management operations (add/remove region, HA toggle, scale, backup/restore)
-3. Operator and cluster updates under load
+3. Operator and DocumentDB cluster updates under load
 
 ## Why Long Haul Testing?
 
@@ -28,7 +37,7 @@ Existing 60-min E2E tests verify correctness of individual operations. Long haul
 
 The design is based on research of Strimzi, CloudNative-PG, CockroachDB (roachtest), and Vitess soak test patterns. The common architecture across all projects: **separate workload generation from disruption injection, run them concurrently, verify correctness post-hoc**.
 
-We adopt the **run-until-failure (canary)** model inspired by Strimzi: the cluster runs indefinitely with continuous workload and operations. When something breaks — data loss, unrecoverable state, resource exhaustion — the test captures the failure, collects artifacts, and alerts the team. This answers the real question: **"what breaks first, and after how long?"**
+We adopt the **run-until-failure (canary)** model inspired by Strimzi: the DocumentDB cluster runs indefinitely with continuous workload and operations. When something breaks — data loss, unrecoverable state, resource exhaustion — the test captures the failure, collects artifacts, and alerts the team. This answers the real question: **"what breaks first, and after how long?"**
 
 ---
 
@@ -104,10 +113,10 @@ We adopt the **run-until-failure (canary)** model inspired by Strimzi: the clust
 | Remove region | Multi-region | Brief | Remaining regions healthy |
 | Toggle HA (localHA) | HA | Brief failover | Primary switches, writes resume |
 | On-demand backup | Backup | None | Backup CR reaches Completed |
-| Restore to new cluster | Backup | N/A (new cluster) | Restored data matches backup watermark |
+| Restore to new DocumentDB cluster | Backup | N/A (new cluster) | Restored data matches backup watermark |
 | Scheduled backup verify | Backup | None | Backups created on schedule |
-| Operator upgrade | Update | None (DB pods should NOT restart) | Operator pod rolls, cluster unaffected |
-| Cluster binary upgrade | Update | Rolling restart | Pods restart one-by-one, workload continues |
+| Operator upgrade | Update | None (DB pods should NOT restart) | Operator pod rolls, DocumentDB cluster unaffected |
+| DocumentDB binary upgrade | Update | Rolling restart | Pods restart one-by-one, workload continues |
 | Schema upgrade | Update | Varies | Pre-backup, post-upgrade reads/writes OK |
 | Operator restart/leader failover | Chaos | Brief reconcile gap | Reconciliation resumes |
 | Pod eviction (simulating node drain) | Chaos | Brief | Pod rescheduled, workload resumes |
@@ -118,7 +127,7 @@ We adopt the **run-until-failure (canary)** model inspired by Strimzi: the clust
 - Cannot scale below minimum node count
 - Cooldown between disruptive ops (configurable, default 5 min)
 - Must reach steady state before next operation
-- Backup/restore is a **separate flow** (restore creates a NEW cluster, verifies, then cleans up)
+- Backup/restore is a **separate flow** (restore creates a NEW DocumentDB cluster, verifies, then cleans up)
 
 **Per-Operation Outage Policy:**
 ```go
@@ -131,7 +140,7 @@ type OutagePolicy struct {
 
 ### Component 3: Health Monitor & Metrics
 
-**Purpose:** Continuous cluster health observation + resource leak detection.
+**Purpose:** Continuous DocumentDB cluster health observation + resource leak detection.
 
 **What to Monitor:**
 - **Kubernetes layer:** Pod readiness, restart counts, OOMKills, events
@@ -164,7 +173,7 @@ type OutagePolicy struct {
 - Timestamp
 - Event type (op_start, op_end, disruption_window_open, disruption_window_close, health_change, workload_error, verification_failure)
 - Operation ID
-- Cluster state snapshot (topology, pod count, primary node)
+- Cluster state snapshot (DocumentDB topology, pod count, primary node)
 - Associated errors (if any)
 
 **Key use case:** When a write failure occurs, the journal shows whether it happened during an expected disruption window (tolerable) or during steady state (bug).
@@ -175,11 +184,11 @@ type OutagePolicy struct {
 
 The long-haul test is a **single persistent canary** running on a dedicated AKS cluster. Existing Kind-based integration tests (45-60 min, PR-gated) already cover short-lived validation — there is no need for a separate smoke mode.
 
-**Canary Cluster:**
+**Canary Configuration:**
 - 5 writers, 2 verifiers
 - Full operation cycle (scale, HA, replication, backup/restore, upgrades, chaos)
 - Runs indefinitely until a fatal failure occurs
-- On failure: collect artifacts, preserve cluster state for investigation
+- On failure: collect artifacts, preserve DocumentDB cluster state for investigation
 - Key output: **MTTF** (mean time to failure) and failure classification
 - During development: test locally with `--max-duration=30m` against Kind
 
@@ -187,73 +196,160 @@ The long-haul test is a **single persistent canary** running on a dedicated AKS
 
 | Tier | Example | Action |
 |------|---------|--------|
-| **Fatal** (stop test) | Acknowledged write lost, checksum mismatch, cluster unrecoverable >10min | Artifact dump + preserve cluster + exit non-zero |
+| **Fatal** (stop test) | Acknowledged write lost, checksum mismatch, DocumentDB cluster unrecoverable >10min | Artifact dump + preserve cluster + exit non-zero |
 | **Degraded** (log + continue) | Operator pod restarted, brief write timeout during expected disruption | Log to journal, continue if recovery within budget |
 | **Warning** (monitor) | Memory trending up, reconcile latency increasing | Log warning, no stop |
 
 ### Auto-Recovery Before Fatal Declaration
 - Operator crash → wait for K8s restart → continue if healthy within 5 min
 - Pod eviction → wait for reschedule → continue
-- Data loss or corruption → **immediate stop**, preserve cluster state for investigation
+- Data loss or corruption → **immediate stop**, preserve DocumentDB cluster state for investigation
 
 ### Future: Multi-Region Canary
 - Add/remove region operations, cross-region replication verification
 - AKS Fleet integration
-- Separate canary cluster or extension of single-cluster canary
+- Separate canary AKS cluster or extension of single-cluster canary
 
 ---
 
 ## Directory Structure
 
+The test infrastructure follows a **three-directory layout** at the repo root:
+
 ```
-operator/src/test/longhaul/
-├── main_test.go           # Ginkgo suite entry, profile selection
-├── config.go              # Configuration (duration, intervals, cluster, profile)
-├── workload/
-│   ├── writer.go          # Multi-writer with durability tracking
-│   ├── reader.go          # Reader + verifier
-│   └── oracle.go          # Data integrity oracle (acknowledged write tracking)
-├── operations/
-│   ├── scheduler.go       # Operation sequencer with preconditions/cooldowns
-│   ├── scale.go           # Scale up/down operations
-│   ├── replication.go     # Replication enable/disable, add/remove region
-│   ├── backup.go          # Backup create + restore-to-new-cluster verification
-│   ├── upgrade.go         # Operator, cluster binary, schema upgrades
-│   └── chaos.go           # Pod eviction, operator restart
-├── monitor/
-│   ├── health.go          # Cluster health checks
-│   ├── metrics.go         # OTel/Prometheus metric collection
-│   └── leakdetect.go      # Resource trend analysis
-├── journal/
-│   ├── journal.go         # Event journal with disruption window tracking
-│   └── policy.go          # Per-operation outage policies
-└── report/
-    ├── report.go          # Summary report generation
-    └── templates/         # Report templates (markdown/HTML)
+test/
+├── utils/                     # Shared test utilities (used by BOTH e2e and longhaul)
+│   ├── go.mod                 # Separate module: github.com/.../test/utils
+│   ├── mongo/                 # Mongo client, Seed, Count, Ping, Handle
+│   ├── assertions/            # Gomega-compatible checkers (DocumentDBReady, InstanceCount, …)
+│   ├── documentdb/            # DocumentDB CR CRUD (Create, WaitHealthy, Delete, PatchSpec, …)
+│   ├── operatorhealth/        # Operator-churn gate (pod UID/restart tracking)
+│   ├── portforward/           # Gateway port-forward (wraps CNPG forwardconnection)
+│   ├── fixtures/              # Namespace/secret/label helpers, teardown-by-label
+│   ├── timeouts/              # Centralised Eventually durations (reuses CNPG timeouts)
+│   ├── clusterprobe/          # Runtime capability checks (VolumeSnapshot CRD, StorageClass)
+│   ├── seed/                  # Deterministic datasets (SmallDataset, MediumDataset, …)
+│   └── testenv/               # Shared environment config (kubeconfig, client setup)
+│
+├── e2e/                       # E2E test suite (PR #346)
+│   ├── go.mod                 # Imports test/utils + operator API types
+│   ├── tests/
+│   │   ├── lifecycle/         # Deploy, delete, image update, log level
+│   │   ├── scale/             # Instance scaling
+│   │   ├── data/              # CRUD, aggregation, sort/limit
+│   │   ├── backup/            # Backup & restore
+│   │   ├── tls/               # TLS certificate modes
+│   │   ├── upgrade/           # Operator & binary upgrades
+│   │   └── ...
+│   └── README.md
+│
+└── longhaul/                  # Long-haul canary test suite
+    ├── go.mod                 # Imports test/utils + operator API types
+    ├── README.md              # Usage guide (running locally, CI safety, configuration)
+    ├── suite_test.go          # Ginkgo suite entry point for the canary
+    ├── longhaul_test.go       # BeforeSuite (skip gate + config) + long-running test specs
+    ├── config/
+    │   ├── config.go          # Config struct, env var loading, validation, IsEnabled gate
+    │   ├── suite_test.go      # Ginkgo suite entry for config unit tests
+    │   └── config_test.go     # Config unit tests (23 specs, fast, no Kubernetes cluster needed)
+    ├── workload/              # (Phase 1b)
+    │   ├── writer.go          # Multi-writer with durability tracking
+    │   ├── reader.go          # Reader + verifier (reuses test/utils/mongo)
+    │   └── oracle.go          # Data integrity oracle (acknowledged write tracking)
+    ├── operations/            # (Phase 1e-2d)
+    │   ├── scheduler.go       # Operation sequencer with preconditions/cooldowns
+    │   ├── scale.go           # Scale (reuses test/utils/documentdb.PatchInstances)
+    │   ├── replication.go     # Replication enable/disable, add/remove region
+    │   ├── backup.go          # Backup create + restore (reuses test/utils/clusterprobe)
+    │   ├── upgrade.go         # Operator, DocumentDB binary, schema upgrades
+    │   └── chaos.go           # Pod eviction, operator restart
+    ├── monitor/               # (Phase 1d)
+    │   ├── health.go          # Reuses test/utils/assertions + test/utils/operatorhealth
+    │   ├── metrics.go         # OTel/Prometheus metric collection
+    │   └── leakdetect.go      # Resource trend analysis
+    ├── journal/               # (Phase 1c)
+    │   ├── journal.go         # Event journal with disruption window tracking
+    │   └── policy.go          # Per-operation outage policies
+    └── report/                # (Phase 1f)
+        ├── report.go          # Summary report generation
+        └── templates/         # Report templates (markdown/HTML)
 ```
 
+### Shared Utilities: `test/utils/`
+
+The `test/utils/` module provides reusable test infrastructure for **both** E2E and long-haul tests. This avoids duplicating ~2000 lines of proven utilities. The packages originate from PR #346's `test/e2e/pkg/e2eutils/` and are promoted to the shared location.
+
+**Key packages and how long-haul uses them:**
+
+| Package | What it provides | Long-haul use |
+|---------|-----------------|---------------|
+| `mongo/` | Client, Seed, Count, Ping, Handle, port-forward connect | Writers + Verifiers connect to DocumentDB gateway |
+| `assertions/` | AssertDocumentDBReady, AssertInstanceCount, AssertPrimaryUnchanged | Health monitor polls cluster health continuously |
+| `documentdb/` | Create, WaitHealthy, Delete, PatchInstances, PatchSpec | Operation executor (scale, upgrade, backup/restore) |
+| `operatorhealth/` | Gate (pod UID/restart tracking), Check, MarkChurned | Health monitor detects operator churn under load |
+| `portforward/` | OpenWithErr for gateway service | Writers open port-forward to DocumentDB gateway |
+| `timeouts/` | For(op), PollInterval(op) — standardised wait durations | All waiters use consistent, CNPG-aligned timeouts |
+| `fixtures/` | ensureNamespace, ensureCredentialSecret, ownershipLabels, teardownByLabels | Canary setup creates namespace + credentials; teardown by label on abort |
+| `clusterprobe/` | HasVolumeSnapshotCRD, StorageClassAllowsExpansion | Backup operations skip when CSI snapshots unavailable |
+| `seed/` | SmallDataset, MediumDataset (deterministic bson.M generators) | Writer seed data for baseline verification |
+
+**Module structure:**
+
+```
+test/utils/go.mod    → github.com/documentdb/documentdb-operator/test/utils
+test/e2e/go.mod      → github.com/documentdb/documentdb-operator/test/e2e
+test/longhaul/go.mod → github.com/documentdb/documentdb-operator/test/longhaul
+operator/src/go.mod  → github.com/documentdb/documentdb-operator (unchanged)
+```
+
+Each test module uses a `replace` directive to point at the local operator source and `test/utils`:
+
+```go
+// test/longhaul/go.mod
+module github.com/documentdb/documentdb-operator/test/longhaul
+
+require (
+    github.com/documentdb/documentdb-operator/test/utils v0.0.0
+    github.com/documentdb/documentdb-operator              v0.0.0
+)
+
+replace (
+    github.com/documentdb/documentdb-operator/test/utils => ../utils
+    github.com/documentdb/documentdb-operator              => ../../operator/src
+)
+```
+
+> **Migration note:** PR #346 currently has utilities under `test/e2e/pkg/e2eutils/`. Extracting them to
+> `test/utils/` is a follow-up task that should be coordinated with xgerman. Until extraction happens,
+> long-haul tests can vendor the needed types locally and swap to imports once `test/utils/` exists.
+
 ---
 
 ## Configuration
 
-```go
-type Config struct {
-    // Canary runs until failure; MaxDuration=0 means infinite.
-    // Use --max-duration=30m for local dev testing against Kind.
-    MaxDuration time.Duration
+All configuration is via environment variables. Tests are **gated** behind `LONGHAUL_ENABLED` — they are safely skipped in regular CI runs (`go test ./...`).
 
-    // Workload tuning
-    NumWriters   int           // default: 5
-    NumVerifiers int           // default: 2
+**Current (Phase 1a):**
 
-    // Operation scheduling
-    OpCooldown  time.Duration // min interval between disruptive ops
-    OpEnabled   []string      // which operations to enable
+| Variable | Required | Default | Description |
+|----------|----------|---------|-------------|
+| `LONGHAUL_ENABLED` | Yes | — | Must be `true`, `1`, or `yes` to run. Otherwise all tests skip. |
+| `LONGHAUL_CLUSTER_NAME` | Yes | — | Name of the target DocumentDB cluster CR. |
+| `LONGHAUL_NAMESPACE` | No | `default` | Kubernetes namespace of the target DocumentDB cluster. |
+| `LONGHAUL_MAX_DURATION` | No | `30m` | Max test duration (`0s` = run until failure). |
 
-    // Failure handling
-    RecoveryTimeout time.Duration // max time to wait for auto-recovery before fatal
-}
-```
+> **Note:** The default 30m timeout is a safety net for local development. The persistent canary
+> Job manifest explicitly sets `LONGHAUL_MAX_DURATION=0s` to enable run-until-failure mode.
+
+**Planned (future phases):**
+
+| Variable | Default | Phase | Description |
+|----------|---------|-------|-------------|
+| `LONGHAUL_NUM_WRITERS` | `5` | 1b | Number of concurrent writer goroutines |
+| `LONGHAUL_NUM_VERIFIERS` | `2` | 1b | Number of concurrent verifier goroutines |
+| `LONGHAUL_OP_COOLDOWN` | `5m` | 1e | Min interval between disruptive operations |
+| `LONGHAUL_OP_ENABLED` | all | 1e | Comma-separated list of enabled operations |
+| `LONGHAUL_RECOVERY_TIMEOUT` | `5m` | 2e | Max time to wait for auto-recovery before fatal |
 
 ---
 
@@ -267,25 +363,118 @@ The long haul test code is fully open source in the repository — anyone can ru
 
 **Local development (anyone):**
 ```bash
-cd operator/src
-go test ./test/longhaul/ -v --max-duration=30m
+cd test/longhaul
+
+# Run config unit tests (fast, no Kubernetes cluster needed)
+go test ./config/ -v
+
+# Run the canary against a local Kind cluster
+LONGHAUL_ENABLED=true \
+LONGHAUL_CLUSTER_NAME=documentdb-sample \
+LONGHAUL_NAMESPACE=default \
+LONGHAUL_MAX_DURATION=10m \
+go test ./... -v -timeout 0
+
+# Or build a standalone binary
+go test -c -o longhaul.test ./
+LONGHAUL_ENABLED=true \
+LONGHAUL_CLUSTER_NAME=documentdb-sample \
+./longhaul.test -test.v -test.timeout 0
 ```
-Runs against whatever cluster your kubeconfig points to (Kind, Minikube, etc.).
+Runs against whatever Kubernetes cluster your kubeconfig points to (Kind, Minikube, etc.).
 
 **Persistent canary (internal):**
 - Dedicated AKS cluster provisioned once (manually or via IaC)
-- Long haul test deployed as a Kubernetes Job on the same cluster (separate `longhaul` namespace)
+- Long haul test deployed as a Kubernetes Job on the same AKS cluster (separate `longhaul` namespace)
 - On new operator release: re-deploy operator via Helm + restart longhaul Job
 - Internal Grafana/OTel dashboard for monitoring (optional)
-- Cluster preserved on failure for investigation
+- DocumentDB cluster preserved on failure for investigation
+
+> **Note:** The canary runs on a team-managed AKS cluster. Contributors do not need cluster access —
+> test results are made public via GitHub Issues (on failure) and an optional status badge in README.
+> This is standard practice for open-source projects (CockroachDB, Strimzi, Kubernetes itself all
+> run long-running tests on private infrastructure with public results).
+
+### Alerting
+
+The alerting system uses a **two-layer architecture** to avoid managing long-lived tokens on the AKS cluster:
+
+**Layer 1: AKS cluster (always running)**
+- Long-haul canary runs as a Kubernetes Job — continuous workload
+- Writes status to a well-known ConfigMap (`longhaul-status` in `longhaul` namespace)
+- Updates include: current state (running/failed/passed), last heartbeat, failure details, journal excerpt
+- No GitHub token needed on the AKS cluster
+
+**Layer 2: GitHub Actions (periodic health check)**
+- Scheduled workflow runs every hour (`cron: '0 * * * *'`)
+- Connects to AKS cluster via Azure federated identity (OIDC, same as auto-upgrade workflow)
+- Checks canary health: pod status, status ConfigMap, recent pod logs
+- If failure detected → creates a GitHub Issue with:
+  - Title: `[Long Haul Failure] {failure type} — {timestamp}`
+  - Body: DocumentDB cluster name, uptime, error details, journal excerpt, pod logs
+  - Labels: `long-haul-failure`
+- Uses `GITHUB_TOKEN` (issued by GitHub Actions for each job and expired when the job ends — nothing to store or rotate)
+- Maintainers receive email automatically via GitHub's issue notification system
+- Deduplication: skips issue creation if an open `long-haul-failure` issue already exists
 
-### When Bugs Are Found
+```yaml
+on:
+  schedule:
+    - cron: '0 * * * *'       # every hour
+  workflow_dispatch:            # manual trigger
 
-Bugs discovered by the canary are filed as regular GitHub issues — no special process needed. The long haul test collects enough context (event journal, cluster state snapshot, failure details) to make issues actionable.
+jobs:
+  check-canary:
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write           # Azure OIDC
+      issues: write             # create GitHub Issues
+    steps:
+      - uses: actions/checkout@v4
+      - uses: azure/login@v2
+        with:
+          client-id: ${{ secrets.AZURE_CLIENT_ID }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+      - run: az aks get-credentials --resource-group $RG --name $CLUSTER
+      - name: Check canary status
+        id: status
+        run: |
+          # Check pod health
+          POD_STATUS=$(kubectl get pods -l job-name=longhaul -n longhaul -o jsonpath='{.items[0].status.phase}')
+          # Read status ConfigMap
+          CANARY_STATUS=$(kubectl get configmap longhaul-status -n longhaul -o jsonpath='{.data.status}')
+          echo "pod_status=$POD_STATUS" >> $GITHUB_OUTPUT
+          echo "canary_status=$CANARY_STATUS" >> $GITHUB_OUTPUT
+      - name: Create issue on failure
+        if: steps.status.outputs.canary_status == 'failed' || steps.status.outputs.pod_status != 'Running'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            // Deduplicate: skip if open issue exists
+            const { data: issues } = await github.rest.issues.listForRepo({
+              owner: context.repo.owner, repo: context.repo.repo,
+              labels: 'long-haul-failure', state: 'open'
+            });
+            if (issues.length > 0) return;
+            await github.rest.issues.create({
+              owner: context.repo.owner, repo: context.repo.repo,
+              title: `[Long Haul Failure] ${new Date().toISOString()}`,
+              body: `Canary status: ${{ steps.status.outputs.canary_status }}\nPod: ${{ steps.status.outputs.pod_status }}`,
+              labels: ['long-haul-failure']
+            });
+```
+
+**Benefits:**
+- No long-lived GitHub tokens on the AKS cluster
+- `GITHUB_TOKEN` in Actions is auto-managed — a fresh short-lived token per job, so there is no secret to store or rotate
+- Maintainers get email through GitHub's built-in notification system
+- All failures are publicly visible as GitHub Issues — contributors can see and comment
+- Easy to extend: add Slack webhook, Teams notification, or status badge in future
 
 ### Auto-Upgrade
 
-A GitHub Actions workflow handles upgrading the canary cluster automatically. It triggers on new releases and can also be triggered manually.
+A GitHub Actions workflow handles upgrading the canary AKS cluster automatically. It triggers on new releases and can also be triggered manually.
 
 ```yaml
 on:
@@ -309,7 +498,7 @@ jobs:
       - run: helm upgrade documentdb-operator ./operator/documentdb-helm-chart
       - run: |
           kubectl delete job longhaul -n longhaul --ignore-not-found
-          kubectl apply -f test/longhaul/deploy/job.yaml
+          kubectl apply -f test/longhaul/deploy/job.yaml
           kubectl wait --for=condition=ready pod -l job-name=longhaul -n longhaul --timeout=120s
 ```
 
@@ -338,14 +527,17 @@ jobs:
 
 Each phase is a self-contained, demoable increment (~1-2 PRs each).
 
-### Phase 1a: Project Skeleton + Config
-- `test/longhaul/` directory structure, Ginkgo suite entry point
-- Config loading (`--max-duration`, writer count, cooldowns, operation list)
-- Can run against a cluster (does nothing yet)
+### Phase 1a: Project Skeleton + Config ✅
+- `test/longhaul/` directory with Ginkgo suite, BeforeSuite skip gate, placeholder test
+- `test/longhaul/config/` sub-package with Config struct, env var loading, validation, IsEnabled
+- Config unit tests (23 specs) in separate suite — fast, no Kubernetes cluster needed
+- README with usage guide, config reference, CI safety explanation
+- CI-safe: `LONGHAUL_ENABLED` gate skips tests in `go test ./...`
 
 ### Phase 1b: Data Plane Workload
 - Multi-writer goroutines with durability oracle
 - Reader/verifier with gap, duplicate, and checksum detection
+- Reuses `test/utils/mongo` for gateway connections and `test/utils/seed` patterns for data generation
 - Metrics counters (writes attempted/acknowledged/failed, reads, verification failures)
 
 ### Phase 1c: Event Journal
@@ -356,22 +548,23 @@ Each phase is a self-contained, demoable increment (~1-2 PRs each).
 ### Phase 1d: Health Monitor
 - Pod readiness, restart counts, OOMKills
 - DocumentDB CR status conditions
+- Reuses `test/utils/assertions` (AssertDocumentDBReady) and `test/utils/operatorhealth` (Gate)
 - Steady-state detection (all healthy, no recent restarts, workload success rate OK)
 
 ### Phase 1e: Scale Operations
-- Scale up/down with precondition checks
+- Scale up/down with precondition checks (reuses `test/utils/documentdb.PatchInstances`)
 - Per-operation outage policy enforcement
 - First control plane operation — validates the operation scheduler pattern
 
 ### Phase 1f: Summary Report
 - Markdown report on exit (pass/fail, duration, stats, operation timeline)
 - Event journal dump
-- Testable locally: `go test ./test/longhaul/ -v --max-duration=30m` against Kind
+- Testable locally: `cd test/longhaul && LONGHAUL_ENABLED=true LONGHAUL_CLUSTER_NAME=documentdb-sample LONGHAUL_MAX_DURATION=30m go test ./... -v -timeout 0` against Kind
 
 ### Phase 2a: Backup & Restore Operations
 - On-demand backup creation + wait for completion
-- Restore to new cluster + data verification against backup watermark
-- Cleanup of restored cluster
+- Restore to new DocumentDB cluster + data verification against backup watermark
+- Cleanup of restored DocumentDB cluster
 
 ### Phase 2b: HA & Replication Operations
 - Toggle HA (localHA)
@@ -380,7 +573,7 @@ Each phase is a self-contained, demoable increment (~1-2 PRs each).
 
 ### Phase 2c: Upgrade Operations
 - Operator upgrade (Helm)
-- Cluster binary upgrade (documentDBVersion)
+- DocumentDB binary upgrade (documentDBVersion)
 - Schema upgrade (schemaVersion)
 - Each tested separately with outage policy
 
@@ -391,7 +584,7 @@ Each phase is a self-contained, demoable increment (~1-2 PRs each).
 ### Phase 2e: Failure Tiers + Auto-Recovery
 - Fatal / degraded / warning classification
 - Auto-recovery logic (wait for K8s restart before declaring fatal)
-- Cluster state preservation on fatal failure
+- DocumentDB cluster state preservation on fatal failure
 
 ### Phase 2f: AKS Deployment
 - Dockerfile for longhaul test image
@@ -403,6 +596,13 @@ Each phase is a self-contained, demoable increment (~1-2 PRs each).
 - GitHub Actions workflow (triggered on release + manual dispatch)
 - Azure OIDC auth, Helm upgrade, Job restart
 
+### Phase 2h: Alerting Workflow
+- GitHub Actions scheduled workflow (hourly cron)
+- Checks canary pod status + status ConfigMap
+- Creates GitHub Issue on failure (with deduplication)
+- Labels: `long-haul-failure`
+- Maintainers receive email via GitHub notification system
+
 ### Phase 3: Multi-Region Canary
 - Add/remove region operations
 - Cross-region replication verification
@@ -413,3 +613,21 @@ Each phase is a self-contained, demoable increment (~1-2 PRs each).
 ## Open Questions
 1. What AKS cluster/subscription should be used for the dedicated canary cluster?
 2. Desired SLO targets (e.g., 99.9% write success during steady state)?
+3. **Module placement:** Long-haul tests live in `test/longhaul/` as a separate Go module (`test/longhaul/go.mod`). Shared test infrastructure lives in `test/utils/` and is imported by both `test/e2e/` and `test/longhaul/` via `replace` directives. This keeps test dependencies (Ginkgo, mongo-driver, CNPG test utils) out of the operator's runtime `go.mod`.
+4. **Shared utility extraction:** PR #346 currently places reusable utilities under `test/e2e/pkg/e2eutils/`. A follow-up task will extract them to `test/utils/` so long-haul tests can import without depending on the E2E module. Until extraction, long-haul can vendor needed helpers locally.
+
+## Design Decisions (Provisional)
+
+The following decisions shape future Phase interfaces. They are provisional — details will be refined when each Phase begins, but the approach is locked.
+
+### Journal Durability (Phase 1c)
+The event journal will use a PVC-backed file for persistence across pod restarts. The journal appends structured JSON lines (`{timestamp, event_type, op_id, cluster_state, error}`). On startup, the journal reader scans the existing file to reconstruct in-memory state. The PVC is mounted at `/data/journal/` in the canary Job manifest.
+
+### Writer Sequence Resumption (Phase 1b)
+On restart, each writer bootstraps its sequence number from `max(seq)` for its `writer_id` in the database. The oracle tolerates gaps between a crash and resume — gaps are logged as expected (crash-recovery gap) rather than flagged as data loss. The `(writer_id, seq)` unique index guarantees no duplicate sequence numbers.
+
+### Teardown on Abort (Phase 1b)
+The harness registers a signal handler for SIGTERM and SIGINT. On signal: (1) cancel all writer/reader contexts, (2) flush journal to disk, (3) write final status to ConfigMap, (4) exit with appropriate code. On startup, the harness checks for a leftover run (stale ConfigMap with state=running but no matching pod) and logs a warning before proceeding.
+
+### Latency-Regression Baseline (Phase 1d)
+During the first 30 minutes of a canary run, the monitor establishes P50/P99 write and read latency baselines. After warmup, sustained P99 regression >2× baseline for >5 minutes triggers a warning-level alert. The exact thresholds are configurable via environment variables (`LONGHAUL_LATENCY_P99_MULTIPLIER`, `LONGHAUL_LATENCY_WINDOW`).
diff --git a/test/longhaul/README.md b/test/longhaul/README.md
new file mode 100644
index 00000000..6ca2ab7d
--- /dev/null
+++ b/test/longhaul/README.md
@@ -0,0 +1,105 @@
+# Long Haul Tests
+
+Long haul tests validate that DocumentDB Kubernetes Operator clusters remain healthy under
+continuous load over extended periods. They run a canary workload that writes and reads data,
+performs management operations, and checks for data integrity.
+
+> **Status:** Phase 1a (skeleton). The canary workload and management operations will be added
+> in subsequent phases. See [design document](../../docs/designs/long-haul-test-design.md)
+> for the full plan.
+
+## Project Structure
+
+```
+test/longhaul/
+├── go.mod                # Separate Go module (imports test/utils when available)
+├── README.md             # This file
+├── suite_test.go         # Ginkgo suite entry point (the canary)
+├── longhaul_test.go      # BeforeSuite + long-running test specs
+└── config/
+    ├── config.go          # Config struct, env var loading, validation
+    ├── suite_test.go      # Config unit test suite entry
+    └── config_test.go     # Config unit tests
+```
+
+- **`test/longhaul/`** — The actual long-running canary. Designed to run for hours/days.
+- **`test/longhaul/config/`** — Config parsing and validation. Fast unit tests, safe for CI.
+
+## Quick Start
+
+### Prerequisites
+
+- A running Kubernetes cluster with DocumentDB deployed
+- `kubectl` configured to access the cluster
+- Go 1.25+
+
+### Run the Config Unit Tests
+
+These are fast and require no Kubernetes cluster:
+
+```bash
+cd test/longhaul
+go test ./config/ -v
+```
+
+### Run the Long Haul Canary Locally
+
+Against a local Kind cluster (see [development environment guide](../../docs/developer-guides/development-environment.md)):
+
+```bash
+cd test/longhaul
+
+LONGHAUL_ENABLED=true \
+LONGHAUL_CLUSTER_NAME=documentdb-sample \
+LONGHAUL_NAMESPACE=default \
+LONGHAUL_MAX_DURATION=10m \
+go test ./... -v -timeout 0
+```
+
+> **Note:** Use `-timeout 0` to disable Go's default 10-minute test timeout for long runs.
+
+### Build a Standalone Binary
+
+For containerized deployment (Phase 2+):
+
+```bash
+cd test/longhaul
+go test -c -o longhaul.test ./
+
+# Run the compiled binary
+LONGHAUL_ENABLED=true \
+LONGHAUL_CLUSTER_NAME=documentdb-sample \
+LONGHAUL_NAMESPACE=default \
+./longhaul.test -test.v -test.timeout 0
+```
+
+## Configuration
+
+All configuration is via environment variables. Tests are **gated** behind `LONGHAUL_ENABLED` —
+they are safely skipped in regular CI runs (`go test ./...`).
+
+| Variable | Required | Default | Description |
+|----------|----------|---------|-------------|
+| `LONGHAUL_ENABLED` | Yes | — | Must be `true`, `1`, or `yes` to run. Otherwise all tests skip. |
+| `LONGHAUL_CLUSTER_NAME` | Yes | — | Name of the target DocumentDB cluster CR. |
+| `LONGHAUL_NAMESPACE` | No | `default` | Kubernetes namespace of the target DocumentDB cluster. |
+| `LONGHAUL_MAX_DURATION` | No | `30m` | Max test duration. Use `0s` for run-until-failure. |
+
+> Additional configuration (writer count, operation cooldown, etc.) will be added in later phases
+> as the corresponding features are implemented.
+
+## CI Safety
+
+The long haul tests are gated behind `LONGHAUL_ENABLED`. No CI workflow currently sets this
+variable — do not add it to any PR-gated workflow. In a regular CI run:
+
+1. `LONGHAUL_ENABLED` is not set in any CI workflow
+2. The `BeforeSuite` calls `Skip()` when disabled
+3. CI output shows `Suite skipped in BeforeSuite -- 0 Passed | 0 Failed | 1 Skipped`
+
+> **Note:** For persistent canary deployment, the Job manifest explicitly sets
+> `LONGHAUL_MAX_DURATION=0s` to enable run-until-failure mode. The default 30m timeout
+> is only a safety net for local development.
+
+The config unit tests (`test/longhaul/config/`) run unconditionally and are included in normal
+CI test runs — they are fast (~0.002s) and require no Kubernetes cluster.
diff --git a/test/longhaul/config/config.go b/test/longhaul/config/config.go
new file mode 100644
index 00000000..70672548
--- /dev/null
+++ b/test/longhaul/config/config.go
@@ -0,0 +1,87 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package config
+
+import (
+	"fmt"
+	"os"
+	"strings"
+	"time"
+)
+
+const (
+	// Environment variables read by IsEnabled (EnvEnabled) and LoadFromEnv (the rest).
+	EnvEnabled     = "LONGHAUL_ENABLED"
+	EnvMaxDuration = "LONGHAUL_MAX_DURATION"
+	EnvNamespace   = "LONGHAUL_NAMESPACE"
+	EnvClusterName = "LONGHAUL_CLUSTER_NAME"
+)
+
+// Config holds the settings for one long haul test run; see LoadFromEnv and Validate.
+type Config struct {
+	// MaxDuration is the maximum test duration. Zero means run until failure,
+	// which must be requested explicitly with LONGHAUL_MAX_DURATION=0s.
+	// DefaultConfig sets 30m (safe for local development); Validate rejects negatives.
+	MaxDuration time.Duration
+
+	// Namespace is the Kubernetes namespace of the target DocumentDB cluster; must be non-empty.
+	Namespace string
+
+	// ClusterName is the name of the target DocumentDB cluster CR; must be non-empty.
+	ClusterName string
+}
+
+// DefaultConfig returns the baseline Config: 30m MaxDuration, "default" Namespace, empty ClusterName.
+func DefaultConfig() Config {
+	return Config{
+		MaxDuration: 30 * time.Minute,
+		Namespace:   "default",
+		ClusterName: "",
+	}
+}
+
+// LoadFromEnv overlays the non-empty LONGHAUL_MAX_DURATION, LONGHAUL_NAMESPACE and LONGHAUL_CLUSTER_NAME
+// values on DefaultConfig. It fails only on an unparsable duration and does not call Validate.
+func LoadFromEnv() (Config, error) {
+	cfg := DefaultConfig()
+
+	if v := os.Getenv(EnvMaxDuration); v != "" {
+		d, err := time.ParseDuration(v)
+		if err != nil {
+			return cfg, fmt.Errorf("invalid %s=%q: %w", EnvMaxDuration, v, err)
+		}
+		cfg.MaxDuration = d
+	}
+
+	if v := os.Getenv(EnvNamespace); v != "" {
+		cfg.Namespace = v
+	}
+
+	if v := os.Getenv(EnvClusterName); v != "" {
+		cfg.ClusterName = v
+	}
+
+	return cfg, nil
+}
+
+// Validate returns an error if MaxDuration is negative or if Namespace or ClusterName is empty.
+func (c *Config) Validate() error {
+	if c.MaxDuration < 0 {
+		return fmt.Errorf("max duration must not be negative, got %s", c.MaxDuration)
+	}
+	if c.Namespace == "" {
+		return fmt.Errorf("namespace must not be empty")
+	}
+	if c.ClusterName == "" {
+		return fmt.Errorf("cluster name must not be empty")
+	}
+	return nil
+}
+
+// IsEnabled reports whether LONGHAUL_ENABLED is "true", "1", or "yes",
+// compared case-insensitively after trimming surrounding whitespace.
+func IsEnabled() bool {
+	v := strings.TrimSpace(strings.ToLower(os.Getenv(EnvEnabled)))
+	return v == "true" || v == "1" || v == "yes"
+}
diff --git a/test/longhaul/config/config_test.go b/test/longhaul/config/config_test.go
new file mode 100644
index 00000000..af07d63c
--- /dev/null
+++ b/test/longhaul/config/config_test.go
@@ -0,0 +1,157 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package config
+
+import (
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Config", func() {
+	Describe("DefaultConfig", func() {
+		It("returns safe defaults", func() {
+			cfg := DefaultConfig()
+			Expect(cfg.MaxDuration).To(Equal(30 * time.Minute))
+			Expect(cfg.Namespace).To(Equal("default"))
+			Expect(cfg.ClusterName).To(BeEmpty())
+		})
+	})
+
+	Describe("LoadFromEnv", func() {
+		It("uses defaults when no env vars set", func() {
+			GinkgoT().Setenv(EnvMaxDuration, "")
+			GinkgoT().Setenv(EnvNamespace, "")
+			GinkgoT().Setenv(EnvClusterName, "")
+			cfg, err := LoadFromEnv()
+			Expect(err).NotTo(HaveOccurred())
+			Expect(cfg).To(Equal(DefaultConfig()))
+		})
+
+		It("parses MaxDuration from env", func() {
+			GinkgoT().Setenv(EnvMaxDuration, "1h")
+			cfg, err := LoadFromEnv()
+			Expect(err).NotTo(HaveOccurred())
+			Expect(cfg.MaxDuration).To(Equal(1 * time.Hour))
+		})
+
+		It("parses zero MaxDuration for infinite runs", func() {
+			GinkgoT().Setenv(EnvMaxDuration, "0s")
+			cfg, err := LoadFromEnv()
+			Expect(err).NotTo(HaveOccurred())
+			Expect(cfg.MaxDuration).To(Equal(time.Duration(0)))
+		})
+
+		It("parses Namespace and ClusterName from env", func() {
+			GinkgoT().Setenv(EnvNamespace, "test-ns")
+			GinkgoT().Setenv(EnvClusterName, "my-cluster")
+			cfg, err := LoadFromEnv()
+			Expect(err).NotTo(HaveOccurred())
+			Expect(cfg.Namespace).To(Equal("test-ns"))
+			Expect(cfg.ClusterName).To(Equal("my-cluster"))
+		})
+
+		It("returns error for invalid MaxDuration", func() {
+			GinkgoT().Setenv(EnvMaxDuration, "not-a-duration")
+			_, err := LoadFromEnv()
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring(EnvMaxDuration))
+		})
+	})
+
+	Describe("Validate", func() {
+		It("passes for valid config", func() {
+			cfg := DefaultConfig()
+			cfg.ClusterName = "test-cluster"
+			Expect(cfg.Validate()).To(Succeed())
+		})
+
+		It("fails when Namespace is empty", func() {
+			cfg := DefaultConfig()
+			cfg.ClusterName = "test"
+			cfg.Namespace = ""
+			Expect(cfg.Validate()).To(MatchError(ContainSubstring("namespace")))
+		})
+
+		It("fails when ClusterName is empty", func() {
+			cfg := DefaultConfig()
+			Expect(cfg.Validate()).To(MatchError(ContainSubstring("cluster name")))
+		})
+
+		It("fails when MaxDuration is negative", func() {
+			cfg := DefaultConfig()
+			cfg.ClusterName = "test"
+			cfg.MaxDuration = -1 * time.Second
+			Expect(cfg.Validate()).To(MatchError(ContainSubstring("max duration must not be negative")))
+		})
+	})
+
+	Describe("IsEnabled", func() {
+		It("returns false when env not set", func() {
+			GinkgoT().Setenv(EnvEnabled, "")
+			Expect(IsEnabled()).To(BeFalse())
+		})
+
+		It("returns true for 'true'", func() {
+			GinkgoT().Setenv(EnvEnabled, "true")
+			Expect(IsEnabled()).To(BeTrue())
+		})
+
+		It("returns true for '1'", func() {
+			GinkgoT().Setenv(EnvEnabled, "1")
+			Expect(IsEnabled()).To(BeTrue())
+		})
+
+		It("returns true for 'yes'", func() {
+			GinkgoT().Setenv(EnvEnabled, "yes")
+			Expect(IsEnabled()).To(BeTrue())
+		})
+
+		It("returns true case-insensitively", func() {
+			GinkgoT().Setenv(EnvEnabled, "TRUE")
+			Expect(IsEnabled()).To(BeTrue())
+		})
+
+		It("returns true for mixed case 'True'", func() {
+			GinkgoT().Setenv(EnvEnabled, "True")
+			Expect(IsEnabled()).To(BeTrue())
+		})
+
+		It("returns true for mixed case 'YES'", func() {
+			GinkgoT().Setenv(EnvEnabled, "YES")
+			Expect(IsEnabled()).To(BeTrue())
+		})
+
+		It("returns true with surrounding whitespace", func() {
+			GinkgoT().Setenv(EnvEnabled, " true ")
+			Expect(IsEnabled()).To(BeTrue())
+		})
+
+		It("returns true for ' yes ' with whitespace", func() {
+			GinkgoT().Setenv(EnvEnabled, " yes ")
+			Expect(IsEnabled()).To(BeTrue())
+		})
+
+		It("returns false for whitespace-only", func() {
+			GinkgoT().Setenv(EnvEnabled, "   ")
+			Expect(IsEnabled()).To(BeFalse())
+		})
+
+		It("returns false for 'false'", func() {
+			GinkgoT().Setenv(EnvEnabled, "false")
+			Expect(IsEnabled()).To(BeFalse())
+		})
+
+		It("returns false for '0'", func() {
+			GinkgoT().Setenv(EnvEnabled, "0")
+			Expect(IsEnabled()).To(BeFalse())
+		})
+
+		It("returns false for 'no'", func() {
+			GinkgoT().Setenv(EnvEnabled, "no")
+			Expect(IsEnabled()).To(BeFalse())
+		})
+	})
+})
diff --git a/test/longhaul/config/suite_test.go b/test/longhaul/config/suite_test.go
new file mode 100644
index 00000000..c12c6a89
--- /dev/null
+++ b/test/longhaul/config/suite_test.go
@@ -0,0 +1,17 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package config
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// TestConfig registers Fail as the assertion fail handler and runs the "Long Haul Config Suite" specs.
+func TestConfig(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "Long Haul Config Suite")
+}
diff --git a/test/longhaul/go.mod b/test/longhaul/go.mod
new file mode 100644
index 00000000..54b92254
--- /dev/null
+++ b/test/longhaul/go.mod
@@ -0,0 +1,23 @@
+module github.com/documentdb/documentdb-operator/test/longhaul
+
+go 1.25.9
+
+require (
+	github.com/onsi/ginkgo/v2 v2.28.1
+	github.com/onsi/gomega v1.39.1
+)
+
+require (
+	github.com/Masterminds/semver/v3 v3.4.0 // indirect
+	github.com/go-logr/logr v1.4.3 // indirect
+	github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
+	github.com/google/go-cmp v0.7.0 // indirect
+	github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83 // indirect
+	go.yaml.in/yaml/v3 v3.0.4 // indirect
+	golang.org/x/mod v0.32.0 // indirect
+	golang.org/x/net v0.49.0 // indirect
+	golang.org/x/sync v0.19.0 // indirect
+	golang.org/x/sys v0.40.0 // indirect
+	golang.org/x/text v0.33.0 // indirect
+	golang.org/x/tools v0.41.0 // indirect
+)
diff --git a/test/longhaul/go.sum b/test/longhaul/go.sum
new file mode 100644
index 00000000..b1521ced
--- /dev/null
+++ b/test/longhaul/go.sum
@@ -0,0 +1,69 @@
+github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0=
+github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs=
+github.com/gkampitakis/ciinfo v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo=
+github.com/gkampitakis/go-diff v1.3.2 h1:Qyn0J9XJSDTgnsgHRdz9Zp24RaJeKMUHg2+PDZZdC4M=
+github.com/gkampitakis/go-diff v1.3.2/go.mod h1:LLgOrpqleQe26cte8s36HTWcTmMEur6OPYerdAAS9tk=
+github.com/gkampitakis/go-snaps v0.5.15 h1:amyJrvM1D33cPHwVrjo9jQxX8g/7E2wYdZ+01KS3zGE=
+github.com/gkampitakis/go-snaps v0.5.15/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc=
+github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
+github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
+github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
+github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
+github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw=
+github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA=
+github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
+github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
+github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83 h1:z2ogiKUYzX5Is6zr/vP9vJGqPwcdqsWjOt+V8J7+bTc=
+github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83/go.mod h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI=
+github.com/joshdk/go-junit v1.0.0 h1:S86cUKIdwBHWwA6xCmFlf3RTLfVXYQfvanM5Uh+K6GE=
+github.com/joshdk/go-junit v1.0.0/go.mod h1:TiiV0PqkaNfFXjEiyjWM3XXrhVyCa1K4Zfga6W52ung=
+github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
+github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
+github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
+github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo=
+github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg=
+github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3vE=
+github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A=
+github.com/onsi/ginkgo/v2 v2.28.1 h1:S4hj+HbZp40fNKuLUQOYLDgZLwNUVn19N3Atb98NCyI=
+github.com/onsi/ginkgo/v2 v2.28.1/go.mod h1:CLtbVInNckU3/+gC8LzkGUb9oF+e8W8TdUsxPwvdOgE=
+github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28=
+github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
+github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
+github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
+github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
+github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
+github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
+github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
+github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
+github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
+github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
+github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY=
+github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28=
+go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
+go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
+golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c=
+golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU=
+golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o=
+golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8=
+golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
+golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
+golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
+golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
+golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE=
+golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8=
+golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc=
+golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg=
+google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A=
+google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
+gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/test/longhaul/longhaul_test.go b/test/longhaul/longhaul_test.go
new file mode 100644
index 00000000..80553609
--- /dev/null
+++ b/test/longhaul/longhaul_test.go
@@ -0,0 +1,40 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package longhaul
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/documentdb/documentdb-operator/test/longhaul/config"
+)
+
+// testConfig holds the configuration that BeforeSuite loads for the specs.
+var testConfig config.Config
+
+// Calls Skip unless opted in; otherwise loads, validates and logs the config.
+var _ = BeforeSuite(func() {
+	if enabled := config.IsEnabled(); !enabled {
+		Skip("Long haul tests are disabled. Set LONGHAUL_ENABLED=true to run.")
+	}
+
+	var loadErr error
+	testConfig, loadErr = config.LoadFromEnv()
+	Expect(loadErr).NotTo(HaveOccurred(), "Failed to load long haul config from environment")
+	Expect(testConfig.Validate()).NotTo(HaveOccurred(), "Invalid long haul config")
+
+	GinkgoWriter.Printf("Long haul test config:\n")
+	GinkgoWriter.Printf("  MaxDuration:  %s\n", testConfig.MaxDuration)
+	GinkgoWriter.Printf("  Namespace:    %s\n", testConfig.Namespace)
+	GinkgoWriter.Printf("  ClusterName:  %s\n", testConfig.ClusterName)
+})
+
+var _ = Describe("Long Haul Test", func() {
+	It("should run the long haul canary", func() {
+		// Placeholder spec: the workload, operations, and monitoring arrive in Phase 1b+.
+		// Until then it only checks that the suite config carries a cluster name.
+		GinkgoWriter.Println("Long haul test skeleton is running")
+		Expect(testConfig.ClusterName).NotTo(BeEmpty())
+	})
+})
diff --git a/test/longhaul/suite_test.go b/test/longhaul/suite_test.go
new file mode 100644
index 00000000..ca024859
--- /dev/null
+++ b/test/longhaul/suite_test.go
@@ -0,0 +1,17 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package longhaul
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// TestLongHaul registers Fail as the assertion fail handler and runs the "Long Haul Suite" specs.
+func TestLongHaul(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "Long Haul Suite")
+}