From 05ed306d07cd0b385fdf0633da9e7157fbfe7c17 Mon Sep 17 00:00:00 2001 From: Wenting Wu Date: Mon, 20 Apr 2026 14:17:13 -0400 Subject: [PATCH 1/2] docs: add long haul test design document Add design document for the canary-based long haul test infrastructure per issue #220. The design covers: - 4-component harness (data plane workload, control plane operations, health monitor, event journal) - Run-until-failure canary model on persistent AKS cluster - Data integrity oracle with per-writer sequence tracking - Per-operation outage policies and failure tiers - Phased implementation plan Based on research of Strimzi, CloudNative-PG, CockroachDB, and Vitess long haul/soak test patterns. Closes #220 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Wenting Wu --- docs/designs/long-haul-test-design.md | 415 ++++++++++++++++++++++++++ 1 file changed, 415 insertions(+) create mode 100644 docs/designs/long-haul-test-design.md diff --git a/docs/designs/long-haul-test-design.md b/docs/designs/long-haul-test-design.md new file mode 100644 index 00000000..e1ec4a01 --- /dev/null +++ b/docs/designs/long-haul-test-design.md @@ -0,0 +1,415 @@ +# Long Haul Test Design — DocumentDB Kubernetes Operator + +**Issue:** [#220](https://github.com/documentdb/documentdb-kubernetes-operator/issues/220) +**Status:** Design phase + +## Problem Statement + +The operator lacks continuous, long-running test coverage. Issue #220 requires: +1. Constant writes/reads — ensure no data is lost +2. Constant management operations (add/remove region, HA toggle, scale, backup/restore) +3. Operator and cluster updates under load + +## Why Long Haul Testing? 
+ +Problems that only surface over extended continuous operation: +- **Memory/resource leaks** — need hours of reconciliation loops to see growth trends +- **WAL accumulation / disk fill** — cleanup bugs take time to manifest +- **Connection pool exhaustion** — gradual leak over many connect/disconnect cycles +- **Reconciliation drift** — operator state slowly diverges after many operations +- **Certificate rotation** — certs don't expire during 60-min CI runs +- **Backup retention cleanup** — need to exceed retention period to verify pruning +- **Pod restart cascades** — subtle race conditions under repeated scale/failover cycles +- **Upgrade correctness under load** — data corruption from rolling restarts + +Existing 60-min E2E tests verify correctness of individual operations. Long haul tests verify **sustained reliability** — that the operator doesn't degrade over time. + +## Design Overview + +The design is based on research of Strimzi, CloudNative-PG, CockroachDB (roachtest), and Vitess soak test patterns. The common architecture across all projects: **separate workload generation from disruption injection, run them concurrently, verify correctness post-hoc**. + +We adopt the **run-until-failure (canary)** model inspired by Strimzi: the cluster runs indefinitely with continuous workload and operations. When something breaks — data loss, unrecoverable state, resource exhaustion — the test captures the failure, collects artifacts, and alerts the team. 
This answers the real question: **"what breaks first, and after how long?"** + +--- + +## Architecture: 4 Components + +``` +┌─────────────────────────────────────────────────────────┐ +│ Long Haul Test (Go/Ginkgo) │ +│ │ +│ ┌──────────────┐ ┌──────────────┐ ┌───────────────┐ │ +│ │ Data Plane │ │ Control Plane│ │ Health Monitor │ │ +│ │ Workload │ │ Operations │ │ & Metrics │ │ +│ │ │ │ │ │ │ │ +│ │ • Writers │ │ • Scale │ │ • Pod status │ │ +│ │ • Readers │ │ • Replication│ │ • CR conditions│ │ +│ │ • Verifiers │ │ • Backup │ │ • OTel metrics │ │ +│ │ │ │ • Upgrade │ │ • Leak detect │ │ +│ └──────┬───────┘ └──────┬───────┘ └───────┬───────┘ │ +│ │ │ │ │ +│ └─────────┬───────┴───────────────────┘ │ +│ ▼ │ +│ ┌────────────────┐ │ +│ │ Event Journal │ │ +│ │ │ │ +│ │ • Op start/end │ │ +│ │ • State changes│ │ +│ │ • Error windows│ │ +│ │ • Disruption │ │ +│ │ budgets │ │ +│ └────────────────┘ │ +└─────────────────────────────────────────────────────────┘ +``` + +### Component 1: Data Plane Workload + +**Purpose:** Continuous read/write traffic to detect data loss, corruption, and availability gaps. + +**Implementation:** Go with the official MongoDB driver (`go.mongodb.org/mongo-driver`), NOT shelling out to mongosh. This gives better cancellation/retry/context control over 24h+ runs. 
+ +**Writer Model (Durability Oracle):** +- Multiple writer goroutines, each with a unique `writer_id` +- Each write: `{writer_id, seq, payload, checksum(payload), timestamp}` +- Unique index on `(writer_id, seq)` to detect duplicates +- Track three states per write: **attempted**, **acknowledged**, **verified** +- Use `writeConcern: majority` for durability claims +- Small percentage of **upserts/updates** (not just inserts) for broader coverage + +**Reader/Verifier Model:** +- Periodic full-scan verification: no gaps in acknowledged sequences per writer +- Checksum validation on read-back +- Separate counters for: missing acknowledged writes, duplicates, stale reads, checksum mismatches +- Use `readConcern: majority` to avoid false data-loss alarms caused by replica lag +- Lag-aware: don't flag replication delay as data loss + +**Metrics Emitted:** +- `longhaul_writes_attempted`, `longhaul_writes_acknowledged`, `longhaul_writes_failed` +- `longhaul_reads_total`, `longhaul_reads_stale`, `longhaul_verification_failures` +- `longhaul_write_latency_ms`, `longhaul_read_latency_ms` + +### Component 2: Control Plane Operations + +**Purpose:** Exercise management operations under continuous load. 
+ +**Operation Categories:** + +| Operation | Type | Expected Disruption | Validation | +|-----------|------|-------------------|------------| +| Scale up (nodeCount++) | Topology | None | New pods ready, data accessible | +| Scale down (nodeCount--) | Topology | Brief write pause | Remaining pods healthy, no data loss | +| Enable replication | Replication | None | Replicas created, WAL streaming | +| Disable replication | Replication | Brief | Standalone healthy | +| Add region | Multi-region | None | New region catches up, data synced | +| Remove region | Multi-region | Brief | Remaining regions healthy | +| Toggle HA (localHA) | HA | Brief failover | Primary switches, writes resume | +| On-demand backup | Backup | None | Backup CR reaches Completed | +| Restore to new cluster | Backup | N/A (new cluster) | Restored data matches backup watermark | +| Scheduled backup verify | Backup | None | Backups created on schedule | +| Operator upgrade | Update | None (DB pods should NOT restart) | Operator pod rolls, cluster unaffected | +| Cluster binary upgrade | Update | Rolling restart | Pods restart one-by-one, workload continues | +| Schema upgrade | Update | Varies | Pre-backup, post-upgrade reads/writes OK | +| Operator restart/leader failover | Chaos | Brief reconcile gap | Reconciliation resumes | +| Pod eviction (simulating node drain) | Chaos | Brief | Pod rescheduled, workload resumes | + +**Sequencing Rules:** +- Operations are NOT fully random — use **preconditions and cooldowns** +- Cannot remove region if only 1 region exists +- Cannot scale below minimum node count +- Cooldown between disruptive ops (configurable, default 5 min) +- Must reach steady state before next operation +- Backup/restore is a **separate flow** (restore creates a NEW cluster, verifies, then cleans up) + +**Per-Operation Outage Policy:** +```go +type OutagePolicy struct { + AllowedDowntime time.Duration // e.g., 60s for failover + AllowedWriteFailures int // tolerated write errors 
during window + MustRecoverWithin time.Duration // e.g., 5min to return to steady state +} +``` + +### Component 3: Health Monitor & Metrics + +**Purpose:** Continuous cluster health observation + resource leak detection. + +**What to Monitor:** +- **Kubernetes layer:** Pod readiness, restart counts, OOMKills, events +- **CR layer:** DocumentDB status conditions, backup phase transitions +- **Operator layer:** Operator logs/errors, reconciliation count, reconcile duration +- **Database layer:** Connection count, WAL lag, replication status +- **Resource layer:** Memory/CPU usage trends (via OTel/cAdvisor), PVC usage + +**Leak Detection:** +- Sample memory/CPU at fixed intervals +- Linear regression over last N samples +- Alert if slope exceeds threshold (configurable) +- 48-72h runs recommended for reliable leak detection + +**Steady State Definition:** +``` +- All pods in Ready state +- DocumentDB CR conditions: all True +- Replication lag < threshold (if replicated) +- No new pod restarts in last 5 min +- Workload success rate > 99.9% +- No unresolved backup failures +``` + +### Component 4: Event Journal + +**Purpose:** Central log correlating operations, disruptions, and errors for post-mortem analysis. + +**Every entry records:** +- Timestamp +- Event type (op_start, op_end, disruption_window_open, disruption_window_close, health_change, workload_error, verification_failure) +- Operation ID +- Cluster state snapshot (topology, pod count, primary node) +- Associated errors (if any) + +**Key use case:** When a write failure occurs, the journal shows whether it happened during an expected disruption window (tolerable) or during steady state (bug). + +--- + +## Canary Model + +The long-haul test is a **single persistent canary** running on a dedicated AKS cluster. Existing Kind-based integration tests (45-60 min, PR-gated) already cover short-lived validation — there is no need for a separate smoke mode. 
+ +**Canary Cluster:** +- 5 writers, 2 verifiers +- Full operation cycle (scale, HA, replication, backup/restore, upgrades, chaos) +- Runs indefinitely until a fatal failure occurs +- On failure: collect artifacts, preserve cluster state for investigation +- Key output: **MTTF** (mean time to failure) and failure classification +- During development: test locally with `--max-duration=30m` against Kind + +### Failure Tiers + +| Tier | Example | Action | +|------|---------|--------| +| **Fatal** (stop test) | Acknowledged write lost, checksum mismatch, cluster unrecoverable >10min | Artifact dump + preserve cluster + exit non-zero | +| **Degraded** (log + continue) | Operator pod restarted, brief write timeout during expected disruption | Log to journal, continue if recovery within budget | +| **Warning** (monitor) | Memory trending up, reconcile latency increasing | Log warning, no stop | + +### Auto-Recovery Before Fatal Declaration +- Operator crash → wait for K8s restart → continue if healthy within 5 min +- Pod eviction → wait for reschedule → continue +- Data loss or corruption → **immediate stop**, preserve cluster state for investigation + +### Future: Multi-Region Canary +- Add/remove region operations, cross-region replication verification +- AKS Fleet integration +- Separate canary cluster or extension of single-cluster canary + +--- + +## Directory Structure + +``` +operator/src/test/longhaul/ +├── main_test.go # Ginkgo suite entry, profile selection +├── config.go # Configuration (duration, intervals, cluster, profile) +├── workload/ +│ ├── writer.go # Multi-writer with durability tracking +│ ├── reader.go # Reader + verifier +│ └── oracle.go # Data integrity oracle (acknowledged write tracking) +├── operations/ +│ ├── scheduler.go # Operation sequencer with preconditions/cooldowns +│ ├── scale.go # Scale up/down operations +│ ├── replication.go # Replication enable/disable, add/remove region +│ ├── backup.go # Backup create + restore-to-new-cluster 
verification +│ ├── upgrade.go # Operator, cluster binary, schema upgrades +│ └── chaos.go # Pod eviction, operator restart +├── monitor/ +│ ├── health.go # Cluster health checks +│ ├── metrics.go # OTel/Prometheus metric collection +│ └── leakdetect.go # Resource trend analysis +├── journal/ +│ ├── journal.go # Event journal with disruption window tracking +│ └── policy.go # Per-operation outage policies +└── report/ + ├── report.go # Summary report generation + └── templates/ # Report templates (markdown/HTML) +``` + +--- + +## Configuration + +```go +type Config struct { + // Canary runs until failure; MaxDuration=0 means infinite. + // Use --max-duration=30m for local dev testing against Kind. + MaxDuration time.Duration + + // Workload tuning + NumWriters int // default: 5 + NumVerifiers int // default: 2 + + // Operation scheduling + OpCooldown time.Duration // min interval between disruptive ops + OpEnabled []string // which operations to enable + + // Failure handling + RecoveryTimeout time.Duration // max time to wait for auto-recovery before fatal +} +``` + +--- + +## Deployment & Visibility + +### Approach + +The long haul test code is fully open source in the repository — anyone can run it. There is no requirement for a public-facing dashboard or scheduled CI workflow for the canary. This matches the pattern of most early-stage OSS projects; public dashboards (like Strimzi's Jenkins or CockroachDB's TeamCity) can be added later as the project matures. + +### Running the Canary + +**Local development (anyone):** +```bash +cd operator/src +go test ./test/longhaul/ -v --max-duration=30m +``` +Runs against whatever cluster your kubeconfig points to (Kind, Minikube, etc.). 
+ +**Persistent canary (internal):** +- Dedicated AKS cluster provisioned once (manually or via IaC) +- Long haul test deployed as a Kubernetes Job on the same cluster (separate `longhaul` namespace) +- On new operator release: re-deploy operator via Helm + restart longhaul Job +- Internal Grafana/OTel dashboard for monitoring (optional) +- Cluster preserved on failure for investigation + +### When Bugs Are Found + +Bugs discovered by the canary are filed as regular GitHub issues — no special process needed. The long haul test collects enough context (event journal, cluster state snapshot, failure details) to make issues actionable. + +### Auto-Upgrade + +A GitHub Actions workflow handles upgrading the canary cluster automatically. It triggers on new releases and can also be triggered manually. + +```yaml +on: + workflow_dispatch: # manual trigger + release: + types: [published] # auto-trigger on new operator release + +jobs: + upgrade-canary: + runs-on: ubuntu-latest + permissions: + id-token: write # for Azure federated identity (OIDC) + steps: + - uses: actions/checkout@v4 + - uses: azure/login@v2 + with: + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + - run: az aks get-credentials --resource-group $RG --name $CLUSTER + - run: helm upgrade documentdb-operator ./operator/documentdb-helm-chart + - run: | + kubectl delete job longhaul -n longhaul --ignore-not-found + kubectl apply -f test/longhaul/deploy/job.yaml + kubectl wait --for=condition=ready pod -l job-name=longhaul -n longhaul --timeout=120s +``` + +**Key points:** +- **AKS auth**: Azure federated identity (OIDC) — no stored secrets, just a trust relationship between GitHub and Azure +- **Operator release** → workflow auto-triggers → Helm upgrade → restart longhaul Job +- **Test code change** → rebuild longhaul image, trigger workflow manually via `workflow_dispatch` +- **Audit trail**: Every upgrade is 
visible in GitHub Actions history + +--- + +## Learnings from Other Projects + +| Project | Key Pattern We Adopt | Key Pattern We Skip | +|---------|---------------------|-------------------| +| **Strimzi** | Run-until-failure loops; metrics collection; CI profiles | JUnit (we use Ginkgo) | +| **CloudNative-PG** | Ginkgo framework; failover via pod delete + SIGSTOP; LSN verification | Single-sequence failover (we need continuous concurrent workload) | +| **CockroachDB** | Chaos runner (periodic kill/restart); separate workload from disruption; roachstress repeated runs | Custom roachtest framework (too heavy for our needs) | +| **Vitess** | Background stress goroutine; per-query tracking; Go native driver | No fault injection (we need disruptive ops) | + +**Universal pattern adopted:** Separate workload generators from disruption injectors, run concurrently, verify correctness against an acknowledged-write oracle, use per-operation disruption budgets. Run-until-failure (Strimzi model) rather than time-bounded. + +--- + +## Implementation Phases + +Each phase is a self-contained, demoable increment (~1-2 PRs each). + +### Phase 1a: Project Skeleton + Config +- `test/longhaul/` directory structure, Ginkgo suite entry point +- Config loading (`--max-duration`, writer count, cooldowns, operation list) +- Can run against a cluster (does nothing yet) + +### Phase 1b: Data Plane Workload +- Multi-writer goroutines with durability oracle +- Reader/verifier with gap, duplicate, and checksum detection +- Metrics counters (writes attempted/acknowledged/failed, reads, verification failures) + +### Phase 1c: Event Journal +- Central event log (op_start, op_end, health_change, workload_error, etc.) 
+- Disruption window tracking (expected vs unexpected errors) +- In-memory + file-backed for post-mortem + +### Phase 1d: Health Monitor +- Pod readiness, restart counts, OOMKills +- DocumentDB CR status conditions +- Steady-state detection (all healthy, no recent restarts, workload success rate OK) + +### Phase 1e: Scale Operations +- Scale up/down with precondition checks +- Per-operation outage policy enforcement +- First control plane operation — validates the operation scheduler pattern + +### Phase 1f: Summary Report +- Markdown report on exit (pass/fail, duration, stats, operation timeline) +- Event journal dump +- Testable locally: `go test ./test/longhaul/ -v --max-duration=30m` against Kind + +### Phase 2a: Backup & Restore Operations +- On-demand backup creation + wait for completion +- Restore to new cluster + data verification against backup watermark +- Cleanup of restored cluster + +### Phase 2b: HA & Replication Operations +- Toggle HA (localHA) +- Enable/disable replication +- Precondition checks (e.g., cannot disable if already standalone) + +### Phase 2c: Upgrade Operations +- Operator upgrade (Helm) +- Cluster binary upgrade (documentDBVersion) +- Schema upgrade (schemaVersion) +- Each tested separately with outage policy + +### Phase 2d: Chaos Operations +- Pod eviction (simulating node drain) +- Operator restart / leader failover + +### Phase 2e: Failure Tiers + Auto-Recovery +- Fatal / degraded / warning classification +- Auto-recovery logic (wait for K8s restart before declaring fatal) +- Cluster state preservation on fatal failure + +### Phase 2f: AKS Deployment +- Dockerfile for longhaul test image +- Kubernetes Job manifest, RBAC (ServiceAccount, ClusterRole, Binding) +- ConfigMap for tuning parameters +- Deploy script / instructions + +### Phase 2g: Auto-Upgrade Workflow +- GitHub Actions workflow (triggered on release + manual dispatch) +- Azure OIDC auth, Helm upgrade, Job restart + +### Phase 3: Multi-Region Canary +- Add/remove 
region operations +- Cross-region replication verification +- AKS Fleet integration + +--- + +## Open Questions +1. What AKS cluster/subscription should be used for the dedicated canary cluster? +2. Desired SLO targets (e.g., 99.9% write success during steady state)? From 158a393563aeba01b2d335ccad4bbc3ab766554e Mon Sep 17 00:00:00 2001 From: Wenting Wu Date: Wed, 22 Apr 2026 11:05:54 -0400 Subject: [PATCH 2/2] feat: add long-haul test skeleton and configuration (Phase 1a) Add the project skeleton for long-haul (canary) tests: - config/config.go: Config struct with env var loading, defaults, and validation - config/config_test.go: Comprehensive tests for all config options (15 specs) - config/suite_test.go: Ginkgo suite entry for config unit tests - suite_test.go: Ginkgo suite entry point for the canary - longhaul_test.go: BeforeSuite with LONGHAUL_ENABLED skip gate + placeholder - README.md: Usage guide for running locally and in CI Config is in a sub-package so config unit tests run independently of the long-running canary. Tests are gated behind LONGHAUL_ENABLED=true env var, so go test ./... safely skips them. 
Part of #220 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Wenting Wu --- docs/designs/long-haul-test-design.md | 362 +++++++++++++++++++++----- test/longhaul/README.md | 105 ++++++++ test/longhaul/config/config.go | 87 +++++++ test/longhaul/config/config_test.go | 157 +++++++++++ test/longhaul/config/suite_test.go | 16 ++ test/longhaul/go.mod | 23 ++ test/longhaul/go.sum | 69 +++++ test/longhaul/longhaul_test.go | 40 +++ test/longhaul/suite_test.go | 16 ++ 9 files changed, 803 insertions(+), 72 deletions(-) create mode 100644 test/longhaul/README.md create mode 100644 test/longhaul/config/config.go create mode 100644 test/longhaul/config/config_test.go create mode 100644 test/longhaul/config/suite_test.go create mode 100644 test/longhaul/go.mod create mode 100644 test/longhaul/go.sum create mode 100644 test/longhaul/longhaul_test.go create mode 100644 test/longhaul/suite_test.go diff --git a/docs/designs/long-haul-test-design.md b/docs/designs/long-haul-test-design.md index e1ec4a01..7e916549 100644 --- a/docs/designs/long-haul-test-design.md +++ b/docs/designs/long-haul-test-design.md @@ -1,14 +1,23 @@ # Long Haul Test Design — DocumentDB Kubernetes Operator **Issue:** [#220](https://github.com/documentdb/documentdb-kubernetes-operator/issues/220) -**Status:** Design phase +**Status:** In progress (Phase 1a complete) + +## Terminology + +This document refers to two kinds of cluster: + +- **DocumentDB cluster** — the database cluster managed by the operator (the `DocumentDB` Custom Resource and its pods). +- **Kubernetes cluster** (or **AKS cluster**, **Kind cluster**) — the infrastructure cluster where the operator and DocumentDB run. + +When unqualified, "cluster" in the context of operations, health, and state refers to the **DocumentDB cluster**. Infrastructure clusters are always qualified (AKS, Kind, etc.). ## Problem Statement The operator lacks continuous, long-running test coverage. Issue #220 requires: 1. 
Constant writes/reads — ensure no data is lost 2. Constant management operations (add/remove region, HA toggle, scale, backup/restore) -3. Operator and cluster updates under load +3. Operator and DocumentDB cluster updates under load ## Why Long Haul Testing? @@ -28,7 +37,7 @@ Existing 60-min E2E tests verify correctness of individual operations. Long haul The design is based on research of Strimzi, CloudNative-PG, CockroachDB (roachtest), and Vitess soak test patterns. The common architecture across all projects: **separate workload generation from disruption injection, run them concurrently, verify correctness post-hoc**. -We adopt the **run-until-failure (canary)** model inspired by Strimzi: the cluster runs indefinitely with continuous workload and operations. When something breaks — data loss, unrecoverable state, resource exhaustion — the test captures the failure, collects artifacts, and alerts the team. This answers the real question: **"what breaks first, and after how long?"** +We adopt the **run-until-failure (canary)** model inspired by Strimzi: the DocumentDB cluster runs indefinitely with continuous workload and operations. When something breaks — data loss, unrecoverable state, resource exhaustion — the test captures the failure, collects artifacts, and alerts the team. 
This answers the real question: **"what breaks first, and after how long?"** --- @@ -104,10 +113,10 @@ We adopt the **run-until-failure (canary)** model inspired by Strimzi: the clust | Remove region | Multi-region | Brief | Remaining regions healthy | | Toggle HA (localHA) | HA | Brief failover | Primary switches, writes resume | | On-demand backup | Backup | None | Backup CR reaches Completed | -| Restore to new cluster | Backup | N/A (new cluster) | Restored data matches backup watermark | +| Restore to new DocumentDB cluster | Backup | N/A (new cluster) | Restored data matches backup watermark | | Scheduled backup verify | Backup | None | Backups created on schedule | -| Operator upgrade | Update | None (DB pods should NOT restart) | Operator pod rolls, cluster unaffected | -| Cluster binary upgrade | Update | Rolling restart | Pods restart one-by-one, workload continues | +| Operator upgrade | Update | None (DB pods should NOT restart) | Operator pod rolls, DocumentDB cluster unaffected | +| DocumentDB binary upgrade | Update | Rolling restart | Pods restart one-by-one, workload continues | | Schema upgrade | Update | Varies | Pre-backup, post-upgrade reads/writes OK | | Operator restart/leader failover | Chaos | Brief reconcile gap | Reconciliation resumes | | Pod eviction (simulating node drain) | Chaos | Brief | Pod rescheduled, workload resumes | @@ -118,7 +127,7 @@ We adopt the **run-until-failure (canary)** model inspired by Strimzi: the clust - Cannot scale below minimum node count - Cooldown between disruptive ops (configurable, default 5 min) - Must reach steady state before next operation -- Backup/restore is a **separate flow** (restore creates a NEW cluster, verifies, then cleans up) +- Backup/restore is a **separate flow** (restore creates a NEW DocumentDB cluster, verifies, then cleans up) **Per-Operation Outage Policy:** ```go @@ -131,7 +140,7 @@ type OutagePolicy struct { ### Component 3: Health Monitor & Metrics -**Purpose:** Continuous 
cluster health observation + resource leak detection. +**Purpose:** Continuous DocumentDB cluster health observation + resource leak detection. **What to Monitor:** - **Kubernetes layer:** Pod readiness, restart counts, OOMKills, events @@ -164,7 +173,7 @@ type OutagePolicy struct { - Timestamp - Event type (op_start, op_end, disruption_window_open, disruption_window_close, health_change, workload_error, verification_failure) - Operation ID -- Cluster state snapshot (topology, pod count, primary node) +- Cluster state snapshot (DocumentDB topology, pod count, primary node) - Associated errors (if any) **Key use case:** When a write failure occurs, the journal shows whether it happened during an expected disruption window (tolerable) or during steady state (bug). @@ -175,11 +184,11 @@ type OutagePolicy struct { The long-haul test is a **single persistent canary** running on a dedicated AKS cluster. Existing Kind-based integration tests (45-60 min, PR-gated) already cover short-lived validation — there is no need for a separate smoke mode. 
-**Canary Cluster:** +**Canary Configuration:** - 5 writers, 2 verifiers - Full operation cycle (scale, HA, replication, backup/restore, upgrades, chaos) - Runs indefinitely until a fatal failure occurs -- On failure: collect artifacts, preserve cluster state for investigation +- On failure: collect artifacts, preserve DocumentDB cluster state for investigation - Key output: **MTTF** (mean time to failure) and failure classification - During development: test locally with `--max-duration=30m` against Kind @@ -187,73 +196,160 @@ The long-haul test is a **single persistent canary** running on a dedicated AKS | Tier | Example | Action | |------|---------|--------| -| **Fatal** (stop test) | Acknowledged write lost, checksum mismatch, cluster unrecoverable >10min | Artifact dump + preserve cluster + exit non-zero | +| **Fatal** (stop test) | Acknowledged write lost, checksum mismatch, DocumentDB cluster unrecoverable >10min | Artifact dump + preserve cluster + exit non-zero | | **Degraded** (log + continue) | Operator pod restarted, brief write timeout during expected disruption | Log to journal, continue if recovery within budget | | **Warning** (monitor) | Memory trending up, reconcile latency increasing | Log warning, no stop | ### Auto-Recovery Before Fatal Declaration - Operator crash → wait for K8s restart → continue if healthy within 5 min - Pod eviction → wait for reschedule → continue -- Data loss or corruption → **immediate stop**, preserve cluster state for investigation +- Data loss or corruption → **immediate stop**, preserve DocumentDB cluster state for investigation ### Future: Multi-Region Canary - Add/remove region operations, cross-region replication verification - AKS Fleet integration -- Separate canary cluster or extension of single-cluster canary +- Separate canary AKS cluster or extension of single-cluster canary --- ## Directory Structure +The test infrastructure follows a **three-directory layout** at the repo root: + ``` 
-operator/src/test/longhaul/ -├── main_test.go # Ginkgo suite entry, profile selection -├── config.go # Configuration (duration, intervals, cluster, profile) -├── workload/ -│ ├── writer.go # Multi-writer with durability tracking -│ ├── reader.go # Reader + verifier -│ └── oracle.go # Data integrity oracle (acknowledged write tracking) -├── operations/ -│ ├── scheduler.go # Operation sequencer with preconditions/cooldowns -│ ├── scale.go # Scale up/down operations -│ ├── replication.go # Replication enable/disable, add/remove region -│ ├── backup.go # Backup create + restore-to-new-cluster verification -│ ├── upgrade.go # Operator, cluster binary, schema upgrades -│ └── chaos.go # Pod eviction, operator restart -├── monitor/ -│ ├── health.go # Cluster health checks -│ ├── metrics.go # OTel/Prometheus metric collection -│ └── leakdetect.go # Resource trend analysis -├── journal/ -│ ├── journal.go # Event journal with disruption window tracking -│ └── policy.go # Per-operation outage policies -└── report/ - ├── report.go # Summary report generation - └── templates/ # Report templates (markdown/HTML) +test/ +├── utils/ # Shared test utilities (used by BOTH e2e and longhaul) +│ ├── go.mod # Separate module: github.com/.../test/utils +│ ├── mongo/ # Mongo client, Seed, Count, Ping, Handle +│ ├── assertions/ # Gomega-compatible checkers (DocumentDBReady, InstanceCount, …) +│ ├── documentdb/ # DocumentDB CR CRUD (Create, WaitHealthy, Delete, PatchSpec, …) +│ ├── operatorhealth/ # Operator-churn gate (pod UID/restart tracking) +│ ├── portforward/ # Gateway port-forward (wraps CNPG forwardconnection) +│ ├── fixtures/ # Namespace/secret/label helpers, teardown-by-label +│ ├── timeouts/ # Centralised Eventually durations (reuses CNPG timeouts) +│ ├── clusterprobe/ # Runtime capability checks (VolumeSnapshot CRD, StorageClass) +│ ├── seed/ # Deterministic datasets (SmallDataset, MediumDataset, …) +│ └── testenv/ # Shared environment config (kubeconfig, client setup) +│ +├── 
e2e/ # E2E test suite (PR #346) +│ ├── go.mod # Imports test/utils + operator API types +│ ├── tests/ +│ │ ├── lifecycle/ # Deploy, delete, image update, log level +│ │ ├── scale/ # Instance scaling +│ │ ├── data/ # CRUD, aggregation, sort/limit +│ │ ├── backup/ # Backup & restore +│ │ ├── tls/ # TLS certificate modes +│ │ ├── upgrade/ # Operator & binary upgrades +│ │ └── ... +│ └── README.md +│ +└── longhaul/ # Long-haul canary test suite + ├── go.mod # Imports test/utils + operator API types + ├── README.md # Usage guide (running locally, CI safety, configuration) + ├── suite_test.go # Ginkgo suite entry point for the canary + ├── longhaul_test.go # BeforeSuite (skip gate + config) + long-running test specs + ├── config/ + │ ├── config.go # Config struct, env var loading, validation, IsEnabled gate + │ ├── suite_test.go # Ginkgo suite entry for config unit tests + │ └── config_test.go # Config unit tests (23 specs, fast, no Kubernetes cluster needed) + ├── workload/ # (Phase 1b) + │ ├── writer.go # Multi-writer with durability tracking + │ ├── reader.go # Reader + verifier (reuses test/utils/mongo) + │ └── oracle.go # Data integrity oracle (acknowledged write tracking) + ├── operations/ # (Phase 1e-2d) + │ ├── scheduler.go # Operation sequencer with preconditions/cooldowns + │ ├── scale.go # Scale (reuses test/utils/documentdb.PatchInstances) + │ ├── replication.go # Replication enable/disable, add/remove region + │ ├── backup.go # Backup create + restore (reuses test/utils/clusterprobe) + │ ├── upgrade.go # Operator, DocumentDB binary, schema upgrades + │ └── chaos.go # Pod eviction, operator restart + ├── monitor/ # (Phase 1d) + │ ├── health.go # Reuses test/utils/assertions + test/utils/operatorhealth + │ ├── metrics.go # OTel/Prometheus metric collection + │ └── leakdetect.go # Resource trend analysis + ├── journal/ # (Phase 1c) + │ ├── journal.go # Event journal with disruption window tracking + │ └── policy.go # Per-operation outage policies + └── report/ 
# (Phase 1f) + ├── report.go # Summary report generation + └── templates/ # Report templates (markdown/HTML) ``` +### Shared Utilities: `test/utils/` + +The `test/utils/` module provides reusable test infrastructure for **both** E2E and long-haul tests. This avoids duplicating ~2000 lines of proven utilities. The packages originate from PR #346's `test/e2e/pkg/e2eutils/` and are promoted to the shared location. + +**Key packages and how long-haul uses them:** + +| Package | What it provides | Long-haul use | +|---------|-----------------|---------------| +| `mongo/` | Client, Seed, Count, Ping, Handle, port-forward connect | Writers + Verifiers connect to DocumentDB gateway | +| `assertions/` | AssertDocumentDBReady, AssertInstanceCount, AssertPrimaryUnchanged | Health monitor polls cluster health continuously | +| `documentdb/` | Create, WaitHealthy, Delete, PatchInstances, PatchSpec | Operation executor (scale, upgrade, backup/restore) | +| `operatorhealth/` | Gate (pod UID/restart tracking), Check, MarkChurned | Health monitor detects operator churn under load | +| `portforward/` | OpenWithErr for gateway service | Writers open port-forward to DocumentDB gateway | +| `timeouts/` | For(op), PollInterval(op) — standardised wait durations | All waiters use consistent, CNPG-aligned timeouts | +| `fixtures/` | ensureNamespace, ensureCredentialSecret, ownershipLabels, teardownByLabels | Canary setup creates namespace + credentials; teardown by label on abort | +| `clusterprobe/` | HasVolumeSnapshotCRD, StorageClassAllowsExpansion | Backup operations skip when CSI snapshots unavailable | +| `seed/` | SmallDataset, MediumDataset (deterministic bson.M generators) | Writer seed data for baseline verification | + +**Module structure:** + +``` +test/utils/go.mod → github.com/documentdb/documentdb-operator/test/utils +test/e2e/go.mod → github.com/documentdb/documentdb-operator/test/e2e +test/longhaul/go.mod → github.com/documentdb/documentdb-operator/test/longhaul 
+operator/src/go.mod → github.com/documentdb/documentdb-operator (unchanged) +``` + +Each test module uses a `replace` directive to point at the local operator source and `test/utils`: + +```go +// test/longhaul/go.mod +module github.com/documentdb/documentdb-operator/test/longhaul + +require ( + github.com/documentdb/documentdb-operator/test/utils v0.0.0 + github.com/documentdb/documentdb-operator v0.0.0 +) + +replace ( + github.com/documentdb/documentdb-operator/test/utils => ../utils + github.com/documentdb/documentdb-operator => ../../operator/src +) +``` + +> **Migration note:** PR #346 currently has utilities under `test/e2e/pkg/e2eutils/`. Extracting them to +> `test/utils/` is a follow-up task that should be coordinated with xgerman. Until extraction happens, +> long-haul tests can vendor the needed types locally and swap to imports once `test/utils/` exists. + --- ## Configuration -```go -type Config struct { - // Canary runs until failure; MaxDuration=0 means infinite. - // Use --max-duration=30m for local dev testing against Kind. - MaxDuration time.Duration +All configuration is via environment variables. Tests are **gated** behind `LONGHAUL_ENABLED` — they are safely skipped in regular CI runs (`go test ./...`). - // Workload tuning - NumWriters int // default: 5 - NumVerifiers int // default: 2 +**Current (Phase 1a):** - // Operation scheduling - OpCooldown time.Duration // min interval between disruptive ops - OpEnabled []string // which operations to enable +| Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| `LONGHAUL_ENABLED` | Yes | — | Must be `true`, `1`, or `yes` to run. Otherwise all tests skip. | +| `LONGHAUL_CLUSTER_NAME` | Yes | — | Name of the target DocumentDB cluster CR. | +| `LONGHAUL_NAMESPACE` | No | `default` | Kubernetes namespace of the target DocumentDB cluster. | +| `LONGHAUL_MAX_DURATION` | No | `30m` | Max test duration (`0s` = run until failure). 
| - // Failure handling - RecoveryTimeout time.Duration // max time to wait for auto-recovery before fatal -} -``` +> **Note:** The default 30m timeout is a safety net for local development. The persistent canary +> Job manifest explicitly sets `LONGHAUL_MAX_DURATION=0s` to enable run-until-failure mode. + +**Planned (future phases):** + +| Variable | Default | Phase | Description | +|----------|---------|-------|-------------| +| `LONGHAUL_NUM_WRITERS` | `5` | 1b | Number of concurrent writer goroutines | +| `LONGHAUL_NUM_VERIFIERS` | `2` | 1b | Number of concurrent verifier goroutines | +| `LONGHAUL_OP_COOLDOWN` | `5m` | 1e | Min interval between disruptive operations | +| `LONGHAUL_OP_ENABLED` | all | 1e | Comma-separated list of enabled operations | +| `LONGHAUL_RECOVERY_TIMEOUT` | `5m` | 2e | Max time to wait for auto-recovery before fatal | --- @@ -267,25 +363,118 @@ The long haul test code is fully open source in the repository — anyone can ru **Local development (anyone):** ```bash -cd operator/src -go test ./test/longhaul/ -v --max-duration=30m +cd test/longhaul + +# Run config unit tests (fast, no Kubernetes cluster needed) +go test ./config/ -v + +# Run the canary against a local Kind cluster +LONGHAUL_ENABLED=true \ +LONGHAUL_CLUSTER_NAME=documentdb-sample \ +LONGHAUL_NAMESPACE=default \ +LONGHAUL_MAX_DURATION=10m \ +go test ./... -v -timeout 0 + +# Or build a standalone binary +go test -c -o longhaul.test ./ +LONGHAUL_ENABLED=true \ +LONGHAUL_CLUSTER_NAME=documentdb-sample \ +./longhaul.test -test.v -test.timeout 0 ``` -Runs against whatever cluster your kubeconfig points to (Kind, Minikube, etc.). +Runs against whatever Kubernetes cluster your kubeconfig points to (Kind, Minikube, etc.). 
**Persistent canary (internal):** - Dedicated AKS cluster provisioned once (manually or via IaC) -- Long haul test deployed as a Kubernetes Job on the same cluster (separate `longhaul` namespace) +- Long haul test deployed as a Kubernetes Job on the same AKS cluster (separate `longhaul` namespace) - On new operator release: re-deploy operator via Helm + restart longhaul Job - Internal Grafana/OTel dashboard for monitoring (optional) -- Cluster preserved on failure for investigation +- DocumentDB cluster preserved on failure for investigation + +> **Note:** The canary runs on a team-managed AKS cluster. Contributors do not need cluster access — +> test results are made public via GitHub Issues (on failure) and an optional status badge in README. +> This is standard practice for open-source projects (CockroachDB, Strimzi, Kubernetes itself all +> run long-running tests on private infrastructure with public results). + +### Alerting + +The alerting system uses a **two-layer architecture** to avoid managing long-lived tokens on the AKS cluster: + +**Layer 1: AKS cluster (always running)** +- Long-haul canary runs as a Kubernetes Job — continuous workload +- Writes status to a well-known ConfigMap (`longhaul-status` in `longhaul` namespace) +- Updates include: current state (running/failed/passed), last heartbeat, failure details, journal excerpt +- No GitHub token needed on the AKS cluster + +**Layer 2: GitHub Actions (periodic health check)** +- Scheduled workflow runs every hour (`cron: '0 * * * *'`) +- Connects to AKS cluster via Azure federated identity (OIDC, same as auto-upgrade workflow) +- Checks canary health: pod status, status ConfigMap, recent pod logs +- If failure detected → creates a GitHub Issue with: + - Title: `[Long Haul Failure] {failure type} — {timestamp}` + - Body: DocumentDB cluster name, uptime, error details, journal excerpt, pod logs + - Labels: `long-haul-failure` +- Uses `GITHUB_TOKEN` (auto-managed by GitHub Actions: issued per job, no manual 
rotation) +- Maintainers receive email automatically via GitHub's issue notification system +- Deduplication: skips issue creation if an open `long-haul-failure` issue already exists -### When Bugs Are Found +```yaml +on: + schedule: + - cron: '0 * * * *' # every hour + workflow_dispatch: # manual trigger -Bugs discovered by the canary are filed as regular GitHub issues — no special process needed. The long haul test collects enough context (event journal, cluster state snapshot, failure details) to make issues actionable. +jobs: + check-canary: + runs-on: ubuntu-latest + permissions: + id-token: write # Azure OIDC + issues: write # create GitHub Issues + steps: + - uses: actions/checkout@v4 + - uses: azure/login@v2 + with: + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + - run: az aks get-credentials --resource-group $RG --name $CLUSTER + - name: Check canary status + id: status + run: | + # Check pod health + POD_STATUS=$(kubectl get pods -l job-name=longhaul -n longhaul -o jsonpath='{.items[0].status.phase}') + # Read status ConfigMap + CANARY_STATUS=$(kubectl get configmap longhaul-status -n longhaul -o jsonpath='{.data.status}') + echo "pod_status=$POD_STATUS" >> $GITHUB_OUTPUT + echo "canary_status=$CANARY_STATUS" >> $GITHUB_OUTPUT + - name: Create issue on failure + if: steps.status.outputs.canary_status == 'failed' || steps.status.outputs.pod_status != 'Running' + uses: actions/github-script@v7 + with: + script: | + // Deduplicate: skip if open issue exists + const { data: issues } = await github.rest.issues.listForRepo({ + owner: context.repo.owner, repo: context.repo.repo, + labels: 'long-haul-failure', state: 'open' + }); + if (issues.length > 0) return; + await github.rest.issues.create({ + owner: context.repo.owner, repo: context.repo.repo, + title: `[Long Haul Failure] ${new Date().toISOString()}`, + body: `Canary status: ${{ 
steps.status.outputs.canary_status }}\nPod: ${{ steps.status.outputs.pod_status }}`, + labels: ['long-haul-failure'] + }); +``` + +**Benefits:** +- No long-lived GitHub tokens on the AKS cluster +- `GITHUB_TOKEN` in Actions is auto-managed — issued fresh for each job and expired when the job ends, so there is nothing to store or rotate +- Maintainers get email through GitHub's built-in notification system +- All failures are publicly visible as GitHub Issues — contributors can see and comment +- Easy to extend: add Slack webhook, Teams notification, or status badge in future ### Auto-Upgrade -A GitHub Actions workflow handles upgrading the canary cluster automatically. It triggers on new releases and can also be triggered manually. +A GitHub Actions workflow handles upgrading the canary AKS cluster automatically. It triggers on new releases and can also be triggered manually. ```yaml on: @@ -309,7 +498,7 @@ jobs: - run: helm upgrade documentdb-operator ./operator/documentdb-helm-chart - run: | kubectl delete job longhaul -n longhaul --ignore-not-found - kubectl apply -f test/longhaul/deploy/job.yaml + kubectl apply -f operator/src/test/longhaul/deploy/job.yaml kubectl wait --for=condition=ready pod -l job-name=longhaul -n longhaul --timeout=120s ``` @@ -338,14 +527,17 @@ jobs: Each phase is a self-contained, demoable increment (~1-2 PRs each). 
-### Phase 1a: Project Skeleton + Config -- `test/longhaul/` directory structure, Ginkgo suite entry point -- Config loading (`--max-duration`, writer count, cooldowns, operation list) -- Can run against a cluster (does nothing yet) +### Phase 1a: Project Skeleton + Config ✅ +- `test/longhaul/` directory with Ginkgo suite, BeforeSuite skip gate, placeholder test +- `test/longhaul/config/` sub-package with Config struct, env var loading, validation, IsEnabled +- Config unit tests (23 specs) in separate suite — fast, no Kubernetes cluster needed +- README with usage guide, config reference, CI safety explanation +- CI-safe: `LONGHAUL_ENABLED` gate skips tests in `go test ./...` ### Phase 1b: Data Plane Workload - Multi-writer goroutines with durability oracle - Reader/verifier with gap, duplicate, and checksum detection +- Reuses `test/utils/mongo` for gateway connections and `test/utils/seed` patterns for data generation - Metrics counters (writes attempted/acknowledged/failed, reads, verification failures) ### Phase 1c: Event Journal @@ -356,22 +548,23 @@ Each phase is a self-contained, demoable increment (~1-2 PRs each). ### Phase 1d: Health Monitor - Pod readiness, restart counts, OOMKills - DocumentDB CR status conditions +- Reuses `test/utils/assertions` (AssertDocumentDBReady) and `test/utils/operatorhealth` (Gate) - Steady-state detection (all healthy, no recent restarts, workload success rate OK) ### Phase 1e: Scale Operations -- Scale up/down with precondition checks +- Scale up/down with precondition checks (reuses `test/utils/documentdb.PatchInstances`) - Per-operation outage policy enforcement - First control plane operation — validates the operation scheduler pattern ### Phase 1f: Summary Report - Markdown report on exit (pass/fail, duration, stats, operation timeline) - Event journal dump -- Testable locally: `go test ./test/longhaul/ -v --max-duration=30m` against Kind +- Testable locally: `cd test/longhaul && LONGHAUL_ENABLED=true LONGHAUL_CLUSTER_NAME=documentdb-sample LONGHAUL_MAX_DURATION=30m go test ./... 
-v -timeout 0` against Kind ### Phase 2a: Backup & Restore Operations - On-demand backup creation + wait for completion -- Restore to new cluster + data verification against backup watermark -- Cleanup of restored cluster +- Restore to new DocumentDB cluster + data verification against backup watermark +- Cleanup of restored DocumentDB cluster ### Phase 2b: HA & Replication Operations - Toggle HA (localHA) @@ -380,7 +573,7 @@ Each phase is a self-contained, demoable increment (~1-2 PRs each). ### Phase 2c: Upgrade Operations - Operator upgrade (Helm) -- Cluster binary upgrade (documentDBVersion) +- DocumentDB binary upgrade (documentDBVersion) - Schema upgrade (schemaVersion) - Each tested separately with outage policy @@ -391,7 +584,7 @@ Each phase is a self-contained, demoable increment (~1-2 PRs each). ### Phase 2e: Failure Tiers + Auto-Recovery - Fatal / degraded / warning classification - Auto-recovery logic (wait for K8s restart before declaring fatal) -- Cluster state preservation on fatal failure +- DocumentDB cluster state preservation on fatal failure ### Phase 2f: AKS Deployment - Dockerfile for longhaul test image @@ -403,6 +596,13 @@ Each phase is a self-contained, demoable increment (~1-2 PRs each). - GitHub Actions workflow (triggered on release + manual dispatch) - Azure OIDC auth, Helm upgrade, Job restart +### Phase 2h: Alerting Workflow +- GitHub Actions scheduled workflow (hourly cron) +- Checks canary pod status + status ConfigMap +- Creates GitHub Issue on failure (with deduplication) +- Labels: `long-haul-failure` +- Maintainers receive email via GitHub notification system + ### Phase 3: Multi-Region Canary - Add/remove region operations - Cross-region replication verification @@ -413,3 +613,21 @@ Each phase is a self-contained, demoable increment (~1-2 PRs each). ## Open Questions 1. What AKS cluster/subscription should be used for the dedicated canary cluster? 2. Desired SLO targets (e.g., 99.9% write success during steady state)? +3. 
**Module placement:** Long-haul tests live in `test/longhaul/` as a separate Go module (`test/longhaul/go.mod`). Shared test infrastructure lives in `test/utils/` and is imported by both `test/e2e/` and `test/longhaul/` via `replace` directives. This keeps test dependencies (Ginkgo, mongo-driver, CNPG test utils) out of the operator's runtime `go.mod`. +4. **Shared utility extraction:** PR #346 currently places reusable utilities under `test/e2e/pkg/e2eutils/`. A follow-up task will extract them to `test/utils/` so long-haul tests can import without depending on the E2E module. Until extraction, long-haul can vendor needed helpers locally. + +## Design Decisions (Provisional) + +The following decisions shape future Phase interfaces. They are provisional — details will be refined when each Phase begins, but the approach is locked. + +### Journal Durability (Phase 1c) +The event journal will use a PVC-backed file for persistence across pod restarts. The journal appends structured JSON lines (`{timestamp, event_type, op_id, cluster_state, error}`). On startup, the journal reader scans the existing file to reconstruct in-memory state. The PVC is mounted at `/data/journal/` in the canary Job manifest. + +### Writer Sequence Resumption (Phase 1b) +On restart, each writer bootstraps its sequence number from `max(seq)` for its `writer_id` in the database. The oracle tolerates gaps between a crash and resume — gaps are logged as expected (crash-recovery gap) rather than flagged as data loss. The `(writer_id, seq)` unique index guarantees no duplicate sequence numbers. + +### Teardown on Abort (Phase 1b) +The harness registers a signal handler for SIGTERM and SIGINT. On signal: (1) cancel all writer/reader contexts, (2) flush journal to disk, (3) write final status to ConfigMap, (4) exit with appropriate code. On startup, the harness checks for a leftover run (stale ConfigMap with state=running but no matching pod) and logs a warning before proceeding. 
+ +### Latency-Regression Baseline (Phase 1d) +During the first 30 minutes of a canary run, the monitor establishes P50/P99 write and read latency baselines. After warmup, sustained P99 regression >2× baseline for >5 minutes triggers a warning-level alert. The exact thresholds are configurable via environment variables (`LONGHAUL_LATENCY_P99_MULTIPLIER`, `LONGHAUL_LATENCY_WINDOW`). diff --git a/test/longhaul/README.md b/test/longhaul/README.md new file mode 100644 index 00000000..6ca2ab7d --- /dev/null +++ b/test/longhaul/README.md @@ -0,0 +1,105 @@ +# Long Haul Tests + +Long haul tests validate that DocumentDB Kubernetes Operator clusters remain healthy under +continuous load over extended periods. They run a canary workload that writes and reads data, +performs management operations, and checks for data integrity. + +> **Status:** Phase 1a (skeleton). The canary workload and management operations will be added +> in subsequent phases. See [design document](../../docs/designs/long-haul-test-design.md) +> for the full plan. + +## Project Structure + +``` +test/longhaul/ +├── go.mod # Separate Go module (imports test/utils when available) +├── README.md # This file +├── suite_test.go # Ginkgo suite entry point (the canary) +├── longhaul_test.go # BeforeSuite + long-running test specs +└── config/ + ├── config.go # Config struct, env var loading, validation + ├── suite_test.go # Config unit test suite entry + └── config_test.go # Config unit tests +``` + +- **`test/longhaul/`** — The actual long-running canary. Designed to run for hours/days. +- **`test/longhaul/config/`** — Config parsing and validation. Fast unit tests, safe for CI. 
+ +## Quick Start + +### Prerequisites + +- A running Kubernetes cluster with DocumentDB deployed +- `kubectl` configured to access the cluster +- Go 1.25+ + +### Run the Config Unit Tests + +These are fast and require no cluster: + +```bash +cd test/longhaul +go test ./config/ -v +``` + +### Run the Long Haul Canary Locally + +Against a local Kind cluster (see [development environment guide](../../docs/developer-guides/development-environment.md)): + +```bash +cd test/longhaul + +LONGHAUL_ENABLED=true \ +LONGHAUL_CLUSTER_NAME=documentdb-sample \ +LONGHAUL_NAMESPACE=default \ +LONGHAUL_MAX_DURATION=10m \ +go test ./... -v -timeout 0 +``` + +> **Note:** Use `-timeout 0` to disable Go's default 10-minute test timeout for long runs. + +### Build a Standalone Binary + +For containerized deployment (Phase 2+): + +```bash +cd test/longhaul +go test -c -o longhaul.test ./ + +# Run the compiled binary +LONGHAUL_ENABLED=true \ +LONGHAUL_CLUSTER_NAME=documentdb-sample \ +LONGHAUL_NAMESPACE=default \ +./longhaul.test -test.v -test.timeout 0 +``` + +## Configuration + +All configuration is via environment variables. Tests are **gated** behind `LONGHAUL_ENABLED` — +they are safely skipped in regular CI runs (`go test ./...`). + +| Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| `LONGHAUL_ENABLED` | Yes | — | Must be `true`, `1`, or `yes` to run. Otherwise all tests skip. | +| `LONGHAUL_CLUSTER_NAME` | Yes | — | Name of the target DocumentDB cluster CR. | +| `LONGHAUL_NAMESPACE` | No | `default` | Kubernetes namespace of the target cluster. | +| `LONGHAUL_MAX_DURATION` | No | `30m` | Max test duration. Use `0s` for run-until-failure. | + +> Additional configuration (writer count, operation cooldown, etc.) will be added in later phases +> as the corresponding features are implemented. + +## CI Safety + +The long haul tests are gated behind `LONGHAUL_ENABLED`. 
No CI workflow currently sets this +variable — do not add it to any PR-gated workflow. + +1. `LONGHAUL_ENABLED` is not set in any CI workflow +2. The `BeforeSuite` calls `Skip()` when disabled +3. CI output shows `Suite skipped in BeforeSuite -- 0 Passed | 0 Failed | 1 Skipped` + +> **Note:** For persistent canary deployment, the Job manifest explicitly sets +> `LONGHAUL_MAX_DURATION=0s` to enable run-until-failure mode. The default 30m timeout +> is only a safety net for local development. + +The config unit tests (`test/longhaul/config/`) run unconditionally and are included in normal +CI test runs — they are fast (~0.002s) and require no cluster. diff --git a/test/longhaul/config/config.go b/test/longhaul/config/config.go new file mode 100644 index 00000000..70672548 --- /dev/null +++ b/test/longhaul/config/config.go @@ -0,0 +1,87 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package config + +import ( + "fmt" + "os" + "strings" + "time" +) + +const ( + // Environment variable names for long haul test configuration. + EnvEnabled = "LONGHAUL_ENABLED" + EnvMaxDuration = "LONGHAUL_MAX_DURATION" + EnvNamespace = "LONGHAUL_NAMESPACE" + EnvClusterName = "LONGHAUL_CLUSTER_NAME" +) + +// Config holds all configuration for a long haul test run. +type Config struct { + // MaxDuration is the maximum test duration. Zero means run until failure. + // Requires explicit LONGHAUL_MAX_DURATION=0s to enable infinite runs. + // Default: 30m (safe for local development). + MaxDuration time.Duration + + // Namespace is the Kubernetes namespace of the target DocumentDB cluster. + Namespace string + + // ClusterName is the name of the target DocumentDB cluster CR. + ClusterName string +} + +// DefaultConfig returns a Config with safe defaults for local development. 
+func DefaultConfig() Config { + return Config{ + MaxDuration: 30 * time.Minute, + Namespace: "default", + ClusterName: "", + } +} + +// LoadFromEnv loads configuration from environment variables, +// falling back to defaults for any unset variable. +func LoadFromEnv() (Config, error) { + cfg := DefaultConfig() + + if v := os.Getenv(EnvMaxDuration); v != "" { + d, err := time.ParseDuration(v) + if err != nil { + return cfg, fmt.Errorf("invalid %s=%q: %w", EnvMaxDuration, v, err) + } + cfg.MaxDuration = d + } + + if v := os.Getenv(EnvNamespace); v != "" { + cfg.Namespace = v + } + + if v := os.Getenv(EnvClusterName); v != "" { + cfg.ClusterName = v + } + + return cfg, nil +} + +// Validate checks that the configuration is valid. +func (c *Config) Validate() error { + if c.MaxDuration < 0 { + return fmt.Errorf("max duration must not be negative, got %s", c.MaxDuration) + } + if c.Namespace == "" { + return fmt.Errorf("namespace must not be empty") + } + if c.ClusterName == "" { + return fmt.Errorf("cluster name must not be empty") + } + return nil +} + +// IsEnabled returns true if the long haul test is explicitly enabled +// via the LONGHAUL_ENABLED environment variable. +func IsEnabled() bool { + v := strings.TrimSpace(strings.ToLower(os.Getenv(EnvEnabled))) + return v == "true" || v == "1" || v == "yes" +} diff --git a/test/longhaul/config/config_test.go b/test/longhaul/config/config_test.go new file mode 100644 index 00000000..af07d63c --- /dev/null +++ b/test/longhaul/config/config_test.go @@ -0,0 +1,157 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package config + +import ( + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("Config", func() { + Describe("DefaultConfig", func() { + It("returns safe defaults", func() { + cfg := DefaultConfig() + Expect(cfg.MaxDuration).To(Equal(30 * time.Minute)) + Expect(cfg.Namespace).To(Equal("default")) + Expect(cfg.ClusterName).To(BeEmpty()) + }) + }) + + Describe("LoadFromEnv", func() { + It("uses defaults when no env vars set", func() { + GinkgoT().Setenv(EnvMaxDuration, "") + GinkgoT().Setenv(EnvNamespace, "") + GinkgoT().Setenv(EnvClusterName, "") + cfg, err := LoadFromEnv() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg.MaxDuration).To(Equal(30 * time.Minute)) + }) + + It("parses MaxDuration from env", func() { + GinkgoT().Setenv(EnvMaxDuration, "1h") + cfg, err := LoadFromEnv() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg.MaxDuration).To(Equal(1 * time.Hour)) + }) + + It("parses zero MaxDuration for infinite runs", func() { + GinkgoT().Setenv(EnvMaxDuration, "0s") + cfg, err := LoadFromEnv() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg.MaxDuration).To(Equal(time.Duration(0))) + }) + + It("parses Namespace and ClusterName from env", func() { + GinkgoT().Setenv(EnvNamespace, "test-ns") + GinkgoT().Setenv(EnvClusterName, "my-cluster") + cfg, err := LoadFromEnv() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg.Namespace).To(Equal("test-ns")) + Expect(cfg.ClusterName).To(Equal("my-cluster")) + }) + + It("returns error for invalid MaxDuration", func() { + GinkgoT().Setenv(EnvMaxDuration, "not-a-duration") + _, err := LoadFromEnv() + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring(EnvMaxDuration)) + }) + }) + + Describe("Validate", func() { + It("passes for valid config", func() { + cfg := DefaultConfig() + cfg.ClusterName = "test-cluster" + Expect(cfg.Validate()).To(Succeed()) + }) + + It("fails when Namespace is empty", func() { + cfg := DefaultConfig() + cfg.ClusterName = "test" + cfg.Namespace = "" + 
Expect(cfg.Validate()).To(MatchError(ContainSubstring("namespace"))) + }) + + It("fails when ClusterName is empty", func() { + cfg := DefaultConfig() + Expect(cfg.Validate()).To(MatchError(ContainSubstring("cluster name"))) + }) + + It("fails when MaxDuration is negative", func() { + cfg := DefaultConfig() + cfg.ClusterName = "test" + cfg.MaxDuration = -1 * time.Second + Expect(cfg.Validate()).To(MatchError(ContainSubstring("max duration must not be negative"))) + }) + }) + + Describe("IsEnabled", func() { + It("returns false when env not set", func() { + GinkgoT().Setenv(EnvEnabled, "") + Expect(IsEnabled()).To(BeFalse()) + }) + + It("returns true for 'true'", func() { + GinkgoT().Setenv(EnvEnabled, "true") + Expect(IsEnabled()).To(BeTrue()) + }) + + It("returns true for '1'", func() { + GinkgoT().Setenv(EnvEnabled, "1") + Expect(IsEnabled()).To(BeTrue()) + }) + + It("returns true for 'yes'", func() { + GinkgoT().Setenv(EnvEnabled, "yes") + Expect(IsEnabled()).To(BeTrue()) + }) + + It("returns true case-insensitively", func() { + GinkgoT().Setenv(EnvEnabled, "TRUE") + Expect(IsEnabled()).To(BeTrue()) + }) + + It("returns true for mixed case 'True'", func() { + GinkgoT().Setenv(EnvEnabled, "True") + Expect(IsEnabled()).To(BeTrue()) + }) + + It("returns true for mixed case 'YES'", func() { + GinkgoT().Setenv(EnvEnabled, "YES") + Expect(IsEnabled()).To(BeTrue()) + }) + + It("returns true with surrounding whitespace", func() { + GinkgoT().Setenv(EnvEnabled, " true ") + Expect(IsEnabled()).To(BeTrue()) + }) + + It("returns true for ' yes ' with whitespace", func() { + GinkgoT().Setenv(EnvEnabled, " yes ") + Expect(IsEnabled()).To(BeTrue()) + }) + + It("returns false for whitespace-only", func() { + GinkgoT().Setenv(EnvEnabled, " ") + Expect(IsEnabled()).To(BeFalse()) + }) + + It("returns false for 'false'", func() { + GinkgoT().Setenv(EnvEnabled, "false") + Expect(IsEnabled()).To(BeFalse()) + }) + + It("returns false for '0'", func() { + GinkgoT().Setenv(EnvEnabled, 
"0") + Expect(IsEnabled()).To(BeFalse()) + }) + + It("returns false for 'no'", func() { + GinkgoT().Setenv(EnvEnabled, "no") + Expect(IsEnabled()).To(BeFalse()) + }) + }) +}) diff --git a/test/longhaul/config/suite_test.go b/test/longhaul/config/suite_test.go new file mode 100644 index 00000000..c12c6a89 --- /dev/null +++ b/test/longhaul/config/suite_test.go @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package config + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestConfig(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Long Haul Config Suite") +} diff --git a/test/longhaul/go.mod b/test/longhaul/go.mod new file mode 100644 index 00000000..54b92254 --- /dev/null +++ b/test/longhaul/go.mod @@ -0,0 +1,23 @@ +module github.com/documentdb/documentdb-operator/test/longhaul + +go 1.25.9 + +require ( + github.com/onsi/ginkgo/v2 v2.28.1 + github.com/onsi/gomega v1.39.1 +) + +require ( + github.com/Masterminds/semver/v3 v3.4.0 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-task/slim-sprig/v3 v3.0.0 // indirect + github.com/google/go-cmp v0.7.0 // indirect + github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/mod v0.32.0 // indirect + golang.org/x/net v0.49.0 // indirect + golang.org/x/sync v0.19.0 // indirect + golang.org/x/sys v0.40.0 // indirect + golang.org/x/text v0.33.0 // indirect + golang.org/x/tools v0.41.0 // indirect +) diff --git a/test/longhaul/go.sum b/test/longhaul/go.sum new file mode 100644 index 00000000..b1521ced --- /dev/null +++ b/test/longhaul/go.sum @@ -0,0 +1,69 @@ +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs= +github.com/gkampitakis/ciinfo v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= +github.com/gkampitakis/go-diff v1.3.2 h1:Qyn0J9XJSDTgnsgHRdz9Zp24RaJeKMUHg2+PDZZdC4M= +github.com/gkampitakis/go-diff v1.3.2/go.mod h1:LLgOrpqleQe26cte8s36HTWcTmMEur6OPYerdAAS9tk= +github.com/gkampitakis/go-snaps v0.5.15 h1:amyJrvM1D33cPHwVrjo9jQxX8g/7E2wYdZ+01KS3zGE= +github.com/gkampitakis/go-snaps v0.5.15/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= +github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83 h1:z2ogiKUYzX5Is6zr/vP9vJGqPwcdqsWjOt+V8J7+bTc= +github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83/go.mod h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI= +github.com/joshdk/go-junit v1.0.0 h1:S86cUKIdwBHWwA6xCmFlf3RTLfVXYQfvanM5Uh+K6GE= +github.com/joshdk/go-junit v1.0.0/go.mod h1:TiiV0PqkaNfFXjEiyjWM3XXrhVyCa1K4Zfga6W52ung= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod 
h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= +github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= +github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3vE= +github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A= +github.com/onsi/ginkgo/v2 v2.28.1 h1:S4hj+HbZp40fNKuLUQOYLDgZLwNUVn19N3Atb98NCyI= +github.com/onsi/ginkgo/v2 v2.28.1/go.mod h1:CLtbVInNckU3/+gC8LzkGUb9oF+e8W8TdUsxPwvdOgE= +github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28= +github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= +go.yaml.in/yaml/v3 v3.0.4 
h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c= +golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU= +golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= +golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= +golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= +golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= +golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= +golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= +golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc= +golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg= +google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A= +google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/test/longhaul/longhaul_test.go b/test/longhaul/longhaul_test.go new file mode 100644 index 00000000..80553609 --- /dev/null +++ b/test/longhaul/longhaul_test.go @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+
+package longhaul
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/documentdb/documentdb-operator/test/longhaul/config"
+)
+
+// testConfig is the suite-wide configuration; BeforeSuite assigns it once.
+var testConfig config.Config
+
+// Suite setup: skip when disabled, otherwise load, validate, and log the config.
+var _ = BeforeSuite(func() {
+	if enabled := config.IsEnabled(); !enabled {
+		Skip("Long haul tests are disabled. Set LONGHAUL_ENABLED=true to run.")
+	}
+
+	loaded, loadErr := config.LoadFromEnv()
+	testConfig = loaded
+	Expect(loadErr).NotTo(HaveOccurred(), "Failed to load long haul config from environment")
+	Expect(testConfig.Validate()).NotTo(HaveOccurred(), "Invalid long haul config")
+
+	GinkgoWriter.Printf("Long haul test config:\n")
+	GinkgoWriter.Printf(" MaxDuration: %s\n", testConfig.MaxDuration)
+	GinkgoWriter.Printf(" Namespace: %s\n", testConfig.Namespace)
+	GinkgoWriter.Printf(" ClusterName: %s\n", testConfig.ClusterName)
+})
+
+var _ = Describe("Long Haul Test", func() {
+	It("should run the long haul canary", func() {
+		// Skeleton only: the workload, operations, and monitoring are
+		// planned for Phase 1b+. Until then, check that the suite wiring works.
+		GinkgoWriter.Println("Long haul test skeleton is running")
+		Expect(testConfig.ClusterName).NotTo(BeEmpty())
+	})
+})
diff --git a/test/longhaul/suite_test.go b/test/longhaul/suite_test.go
new file mode 100644
index 00000000..ca024859
--- /dev/null
+++ b/test/longhaul/suite_test.go
@@ -0,0 +1,18 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package longhaul
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// TestLongHaul is the go test entry point: it registers Ginkgo's Fail as the
+// Gomega fail handler and then runs the specs as "Long Haul Suite".
+func TestLongHaul(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "Long Haul Suite")
+}