diff --git a/.github/actions/setup-port-forwarding/action.yml b/.github/actions/setup-port-forwarding/action.yml deleted file mode 100644 index c587daf6..00000000 --- a/.github/actions/setup-port-forwarding/action.yml +++ /dev/null @@ -1,305 +0,0 @@ -name: 'Setup Port Forwarding' -description: 'Sets up robust port forwarding with retry logic for DocumentDB testing' -inputs: - namespace: - description: 'Kubernetes namespace where the DocumentDB cluster is running' - required: true - cluster-name: - description: 'Name of the DocumentDB cluster' - required: true - port: - description: 'Port to forward' - required: true - architecture: - description: 'Target architecture for logging purposes' - required: true - test-type: - description: 'Type of test (comprehensive or performance)' - required: false - default: 'comprehensive' -runs: - using: 'composite' - steps: - - name: Setup port forwarding with enhanced retries - shell: bash - run: | - echo "Setting up port forwarding for ${{ inputs.test-type }} tests on ${{ inputs.architecture }}..." - - # Function to setup port forwarding with enhanced retries - setup_port_forward() { - local max_attempts=5 - local attempt=1 - local base_sleep=5 - local test_type="${{ inputs.test-type }}" - - # Adjust retry parameters based on test type - if [[ "$test_type" == "performance" ]]; then - max_attempts=4 - base_sleep=3 - fi - - while [ $attempt -le $max_attempts ]; do - echo "Port forwarding attempt $attempt/$max_attempts..." - - # Exponential/Progressive backoff for retry delays - local retry_delay=$((base_sleep * attempt)) - if [ $attempt -gt 1 ]; then - echo "Waiting ${retry_delay}s before retry attempt..." - sleep $retry_delay - fi - - # Get the actual pod name and ensure it's ready - POD_NAME=$(kubectl get pods -n ${{ inputs.namespace }} -l cnpg.io/cluster=${{ inputs.cluster-name }} -o jsonpath='{.items[0].metadata.name}') - if [ -z "$POD_NAME" ]; then - echo "❌ No DocumentDB pod found" - kubectl get pods -n ${{ inputs.namespace }} - ((attempt++)) - continue - fi - echo "Using pod: $POD_NAME" - - # Comprehensive pod readiness check with retries - pod_ready=false - local readiness_checks=3 - if [[ "$test_type" == "performance" ]]; then - readiness_checks=2 - fi - - for ready_check in $(seq 1 $readiness_checks); do - pod_phase=$(kubectl get pod $POD_NAME -n ${{ inputs.namespace }} -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown") - echo "Pod phase: $pod_phase (readiness check $ready_check/$readiness_checks)" - - if [[ "$pod_phase" == "Running" ]]; then - # Wait for pod to be ready - local timeout=60 - if [[ "$test_type" == "performance" ]]; then - timeout=45 - fi - - if kubectl wait --for=condition=Ready pod/$POD_NAME -n ${{ inputs.namespace }} --timeout=${timeout}s 2>/dev/null; then - echo "✓ Pod is ready" - pod_ready=true - break - else - echo "❌ Pod readiness check failed, retrying..." - sleep 10 - fi - else - echo "Pod is not running, waiting..." - local timeout=90 - if [[ "$test_type" == "performance" ]]; then - timeout=45 - fi - - if kubectl wait --for=condition=Ready pod/$POD_NAME -n ${{ inputs.namespace }} --timeout=${timeout}s 2>/dev/null; then - echo "✓ Pod became ready" - pod_ready=true - break - else - echo "❌ Pod failed to become ready, retrying..." - local sleep_time=15 - if [[ "$test_type" == "performance" ]]; then - sleep_time=8 - fi - sleep $sleep_time - fi - fi - done - - if [ "$pod_ready" = false ]; then - echo "❌ Pod $POD_NAME is not ready after multiple checks (attempt $attempt)" - if [[ "$test_type" == "comprehensive" ]]; then - kubectl describe pod/$POD_NAME -n ${{ inputs.namespace }} | head -30 - fi - ((attempt++)) - continue - fi - - # Clean up any existing port forwarding with more thorough cleanup - cleanup_port_forward() { - # Kill any existing port forwarding processes - pkill -f "kubectl port-forward.*${{ inputs.port }}" 2>/dev/null || true - - # Determine file prefix based on test type - local file_prefix="/tmp/pf" - if [[ "$test_type" == "performance" ]]; then - file_prefix="/tmp/perf_pf" - fi - - # Clean up PID files and logs - for file in "${file_prefix}_pid" "${file_prefix}_output.log"; do - if [ -f "$file" ]; then - [ -f "${file_prefix}_pid" ] && kill $(cat "${file_prefix}_pid") 2>/dev/null || true - rm -f "$file" - fi - done - - # Wait for port to be released - local cleanup_sleep=3 - if [[ "$test_type" == "performance" ]]; then - cleanup_sleep=2 - fi - sleep $cleanup_sleep - - # Check if port is still in use and force kill if needed - local port_users=$(lsof -ti:${{ inputs.port }} 2>/dev/null || true) - if [ -n "$port_users" ]; then - echo "Force killing processes using port ${{ inputs.port }}: $port_users" - echo "$port_users" | xargs -r kill -9 2>/dev/null || true - local force_sleep=2 - if [[ "$test_type" == "performance" ]]; then - force_sleep=1 - fi - sleep $force_sleep - fi - } - - cleanup_port_forward - - # Start port-forward with enhanced error handling - echo "Starting $test_type port forwarding from pod $POD_NAME..." - local file_prefix="/tmp/pf" - if [[ "$test_type" == "performance" ]]; then - file_prefix="/tmp/perf_pf" - fi - - kubectl port-forward pod/$POD_NAME ${{ inputs.port }}:${{ inputs.port }} -n ${{ inputs.namespace }} > "${file_prefix}_output.log" 2>&1 & - PF_PID=$! - echo $PF_PID > "${file_prefix}_pid" - - # Wait for port-forward to establish - echo "Waiting for $test_type port forwarding to establish..." - local establish_sleep=20 - if [[ "$test_type" == "performance" ]]; then - establish_sleep=15 - fi - sleep $establish_sleep - - # Check if port-forward process is still running - if ! kill -0 $PF_PID 2>/dev/null; then - echo "❌ $test_type port forwarding process died immediately (attempt $attempt)" - if [ -f "${file_prefix}_output.log" ]; then - echo "$test_type port forwarding output:" - cat "${file_prefix}_output.log" - fi - ((attempt++)) - continue - fi - - # Enhanced connection testing with progressive delays - connection_success=false - local connection_tests=8 - local connection_sleep=3 - if [[ "$test_type" == "performance" ]]; then - connection_tests=6 - connection_sleep=2 - fi - - for i in $(seq 1 $connection_tests); do - sleep $connection_sleep - if nc -z 127.0.0.1 ${{ inputs.port }} 2>/dev/null; then - echo "✓ $test_type port forwarding connection test passed (attempt $i/$connection_tests)" - connection_success=true - break - else - echo "❌ $test_type port forwarding connection test failed (attempt $i/$connection_tests)" - # Check if port-forward is still alive - if ! kill -0 $PF_PID 2>/dev/null; then - echo "❌ $test_type port forwarding process died during connection testing" - break - fi - fi - done - - if [ "$connection_success" = false ]; then - echo "❌ All $test_type connection tests failed (attempt $attempt)" - kill $PF_PID 2>/dev/null || true - ((attempt++)) - continue - fi - - # Extended stability check with more thorough validation - echo "Running $test_type stability check..." - stable=true - local stability_checks=5 - local stability_sleep=8 - if [[ "$test_type" == "performance" ]]; then - stability_checks=3 - stability_sleep=6 - fi - - for check in $(seq 1 $stability_checks); do - sleep $stability_sleep - - # Check if process is still alive - if ! kill -0 $PF_PID 2>/dev/null; then - echo "❌ $test_type port forwarding process died during stability check $check/$stability_checks (attempt $attempt)" - stable=false - break - fi - - # Check connection stability - if ! nc -z 127.0.0.1 ${{ inputs.port }} 2>/dev/null; then - echo "❌ $test_type connection lost during stability check $check/$stability_checks (attempt $attempt)" - stable=false - break - fi - - # Additional validation: try to establish a brief connection - local tcp_timeout=5 - if [[ "$test_type" == "performance" ]]; then - tcp_timeout=3 - fi - - if timeout $tcp_timeout bash -c "/dev/null; then - echo "✓ $test_type stability check $check/$stability_checks passed (TCP connection verified)" - else - echo "❌ $test_type TCP connection verification failed during stability check $check/$stability_checks (attempt $attempt)" - stable=false - break - fi - done - - if [ "$stable" = true ]; then - echo "✓ $test_type port forwarding established and stable on ${{ inputs.architecture }} (attempt $attempt)" - if [[ "$test_type" == "comprehensive" ]]; then - echo "✓ Final validation: Port forwarding is ready for use" - fi - return 0 - else - echo "❌ $test_type stability check failed (attempt $attempt)" - if [ -f "${file_prefix}_output.log" ]; then - echo "$test_type port forwarding output:" - local tail_lines=30 - if [[ "$test_type" == "performance" ]]; then - tail_lines=20 - fi - tail -$tail_lines "${file_prefix}_output.log" - fi - kill $PF_PID 2>/dev/null || true - cleanup_port_forward - ((attempt++)) - fi - done - - echo "❌ Failed to establish stable $test_type port forwarding after $max_attempts attempts" - return 1 - } - - # Call the function with enhanced error handling - if ! setup_port_forward; then - echo "=== Final ${{ inputs.test-type }} diagnostics ===" - kubectl get pods -n ${{ inputs.namespace }} -o wide - kubectl describe pods -n ${{ inputs.namespace }} - kubectl get events -n ${{ inputs.namespace }} --sort-by='.lastTimestamp' | tail -15 - kubectl logs -n ${{ inputs.namespace }} -l cnpg.io/cluster=${{ inputs.cluster-name }} --tail=50 - - # Check for any system-level issues - echo "=== ${{ inputs.test-type }} system diagnostics ===" - lsof -i:${{ inputs.port }} || echo "No processes using port ${{ inputs.port }}" - if [[ "${{ inputs.test-type }}" == "comprehensive" ]]; then - netstat -tuln | grep ${{ inputs.port }} || echo "Port ${{ inputs.port }} not in use" - fi - - exit 1 - fi diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 006c6f96..7bb069de 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -66,6 +66,33 @@ The code review agent will: - Emit events for significant state changes - Use finalizers for cleanup operations +## Issue Triage & Priority + +**Priority is tracked via GitHub Projects, not labels.** Do not create `P0`/`P1`/`P2` labels; the repo intentionally doesn't use them. + +- Planning board: [DocumentDB k8s operator planning board](https://github.com/orgs/documentdb/projects/6) (project number `6`, owner `documentdb`) +- Issue tracking board: [DocumentDB issue tracking](https://github.com/orgs/documentdb/projects/4) (project number `4`) +- Both boards have a single-select `Priority` field with values `P0`, `P1`, `P2`. + +### Setting priority on a new issue + +1. Add the issue to the relevant project: + ```bash + gh project item-add 6 --owner documentdb --url + ``` +2. Set the Priority field using `gh project item-edit` with the project + field + option IDs (obtainable via `gh project field-list 6 --owner documentdb --format json` and the GraphQL `options` query). Example: + ```bash + gh api graphql -f query=' + mutation($project:ID!,$item:ID!,$field:ID!,$opt:String!){ + updateProjectV2ItemFieldValue(input:{projectId:$project,itemId:$item,fieldId:$field,value:{singleSelectOptionId:$opt}}){projectV2Item{id}} + }' -F project=PVT_kwDODDbYls4BIeDc -F item= -F field=PVTSSF_lADODDbYls4BIeDczg4658Q -F opt= + ``` + +### Assignment + +- Reviewers / maintainers are listed in `CODEOWNERS` and `MAINTAINERS.md`. Rayhan Hossain's GitHub handle is `hossain-rayhan`. +- Use `gh issue edit --repo documentdb/documentdb-kubernetes-operator --add-assignee ` rather than editing through the UI so the change is auditable. + ## Commit Messages Follow conventional commits format: @@ -75,3 +102,21 @@ Follow conventional commits format: - `test:` for test additions/changes - `refactor:` for code refactoring - `chore:` for maintenance tasks + +### DCO Sign-off (Required) + +Every commit **must** carry a `Signed-off-by:` trailer — the repo enforces the +[Developer Certificate of Origin](../contribute/developer-certificate-of-origin) +via a DCO check on PRs, and unsigned commits block the merge. + +- Use `git commit -s` (or `git commit --signoff`) for new commits. +- To retrofit sign-offs onto commits you already made on the current branch: + ```bash + GIT_SEQUENCE_EDITOR=: git rebase -i \ + --exec 'git commit --amend --no-edit --signoff' + ``` + (Plain `git rebase --signoff` is a no-op when commits don't need to be replayed.) +- Verify before pushing: `git log -n --format='%(trailers:key=Signed-off-by)'` + must print a trailer for every commit. +- The sign-off is in addition to the `Co-authored-by: Copilot …` trailer, not a + replacement for it. diff --git a/.github/workflows/release_documentdb_images.yml b/.github/workflows/release_documentdb_images.yml index b25d48c2..cf3747e2 100644 --- a/.github/workflows/release_documentdb_images.yml +++ b/.github/workflows/release_documentdb_images.yml @@ -110,31 +110,28 @@ jobs: sed -i "s|documentDbVersion: \"${OLD_VERSION}\"|documentDbVersion: \"${NEW_VERSION}\"|" \ operator/documentdb-helm-chart/values.yaml - # 4. Update test workflow fallback images - sed -i "s|documentdb:${OLD_VERSION}|documentdb:${NEW_VERSION}|g" \ - .github/workflows/test-backup-and-restore.yml - sed -i "s|gateway:${OLD_VERSION}|gateway:${NEW_VERSION}|g" \ - .github/workflows/test-backup-and-restore.yml - - # 5. Update the released database baseline used by upgrade tests - sed -i "s|RELEASED_DATABASE_VERSION: ${OLD_VERSION}|RELEASED_DATABASE_VERSION: ${NEW_VERSION}|" \ - .github/workflows/test-upgrade-and-rollback.yml - - # 6. Update sidecar plugin config test (hardcoded expected gateway image) + # 4. (Removed) Test workflow fallback images — the legacy + # test-backup-and-restore.yml and test-upgrade-and-rollback.yml + # workflows have been consolidated into test-e2e.yml. Database + # image versions for e2e are resolved from the operator's + # built-in defaults (constants.go) rather than per-workflow + # fallback tags, so no sed is required here. + + # 5. Update sidecar plugin config test (hardcoded expected gateway image) sed -i "s|:${OLD_VERSION}\"|:${NEW_VERSION}\"|g" \ operator/cnpg-plugins/sidecar-injector/internal/config/config_test.go - # 7. Update build workflow defaults + # 6. Update build workflow defaults sed -i "s|DEFAULT_DOCUMENTDB_VERSION: '${OLD_VERSION}'|DEFAULT_DOCUMENTDB_VERSION: '${NEW_VERSION}'|" \ .github/workflows/build_documentdb_images.yml sed -i "s|default: '${OLD_VERSION}'|default: '${NEW_VERSION}'|g" \ .github/workflows/build_documentdb_images.yml - # 8. Update release workflow default version + # 7. Update release workflow default version sed -i "s|default: '${OLD_VERSION}'|default: '${NEW_VERSION}'|g" \ .github/workflows/release_documentdb_images.yml - # 9. Update gateway Dockerfile default source image ARG + # 8. Update gateway Dockerfile default source image ARG sed -i "s|pg17-${OLD_VERSION}|pg17-${NEW_VERSION}|" \ .github/dockerfiles/Dockerfile_gateway_public_image @@ -159,7 +156,6 @@ jobs: - Updated `DEFAULT_DOCUMENTDB_IMAGE` and `DEFAULT_GATEWAY_IMAGE` in `constants.go` - Updated sidecar plugin default gateway image in `config.go` and `config_test.go` - Updated `documentDbVersion` in Helm chart `values.yaml` - - Updated fallback images in `test-backup-and-restore.yml` - Updated build/release workflow defaults in `build_documentdb_images.yml` and `release_documentdb_images.yml` - Updated gateway Dockerfile default source image in `Dockerfile_gateway_public_image` diff --git a/.github/workflows/release_images.yml b/.github/workflows/release_images.yml index 8ca23db9..e1e603c4 100644 --- a/.github/workflows/release_images.yml +++ b/.github/workflows/release_images.yml @@ -25,11 +25,6 @@ on: source_ref: description: 'Git ref to package the Helm chart from (tag or commit recommended to avoid drift)' required: true - run_tests: - description: 'Run tests before releasing' - required: false - default: true - type: boolean permissions: contents: read @@ -38,36 +33,16 @@ permissions: id-token: write jobs: - # Optional test jobs - run both E2E and integration tests in parallel if enabled - test-e2e: - name: E2E Test Images Before Release - if: ${{ inputs.run_tests == true }} - uses: ./.github/workflows/test-E2E.yml - with: - image_tag: ${{ inputs.candidate_version }} - secrets: inherit - - test-integration: - name: Integration Test Images Before Release - if: ${{ inputs.run_tests == true }} - uses: ./.github/workflows/test-integration.yml - with: - image_tag: ${{ inputs.candidate_version }} - secrets: inherit - - test-backup-and-restore: - name: Test Backup and Restore - if: ${{ inputs.run_tests == true }} - uses: ./.github/workflows/test-backup-and-restore.yml - with: - image_tag: ${{ inputs.candidate_version }} - secrets: inherit - + # NOTE: Pre-release E2E/integration/backup gates were removed when the + # legacy test-E2E.yml / test-integration.yml / test-backup-and-restore.yml + # workflows were consolidated into the unified test-e2e.yml workflow + # (see docs/designs/e2e-test-suite.md). That workflow is triggered + # on pull_request to main and guards merges to the source branch. + # This release workflow now assumes the candidate artifact has + # already passed PR-level CI. copy-and-push-manifest: name: Release Images runs-on: ubuntu-latest - needs: [test-e2e, test-integration, test-backup-and-restore] - if: ${{ always() && (needs.test-e2e.result == 'success' || needs.test-e2e.result == 'skipped') && (needs.test-integration.result == 'success' || needs.test-integration.result == 'skipped') && (needs.test-backup-and-restore.result == 'success' || needs.test-backup-and-restore.result == 'skipped') }} strategy: matrix: image: [operator, sidecar, documentdb, gateway] diff --git a/.github/workflows/release_operator.yml b/.github/workflows/release_operator.yml index 013b0b8b..5fbbbe88 100644 --- a/.github/workflows/release_operator.yml +++ b/.github/workflows/release_operator.yml @@ -18,11 +18,6 @@ on: source_ref: description: 'Git ref to package the Helm chart from (tag or commit recommended to avoid drift)' required: true - run_tests: - description: 'Run tests before releasing' - required: false - default: true - type: boolean permissions: contents: read @@ -32,31 +27,13 @@ permissions: jobs: # --------------------------------------------------------------------------- - # Optional test gate — run E2E, integration, and backup tests in parallel - # --------------------------------------------------------------------------- - test-e2e: - name: E2E Test Images Before Release - if: ${{ inputs.run_tests == true }} - uses: ./.github/workflows/test-E2E.yml - with: - image_tag: ${{ inputs.candidate_version }} - secrets: inherit - - test-integration: - name: Integration Test Images Before Release - if: ${{ inputs.run_tests == true }} - uses: ./.github/workflows/test-integration.yml - with: - image_tag: ${{ inputs.candidate_version }} - secrets: inherit - - test-backup-and-restore: - name: Test Backup and Restore - if: ${{ inputs.run_tests == true }} - uses: ./.github/workflows/test-backup-and-restore.yml - with: - image_tag: ${{ inputs.candidate_version }} - secrets: inherit + # NOTE: Pre-release E2E/integration/backup gates were removed when the + # legacy test-E2E.yml / test-integration.yml / test-backup-and-restore.yml + # workflows were consolidated into the unified test-e2e.yml workflow + # (see docs/designs/e2e-test-suite.md). That workflow is triggered + # on pull_request to main and guards merges to the source branch. + # This release workflow now assumes the candidate artifact has + # already passed PR-level CI. # --------------------------------------------------------------------------- # Promote operator and sidecar images (retag candidate → release) @@ -64,8 +41,6 @@ jobs: promote-operator-images: name: Promote ${{ matrix.image }} runs-on: ubuntu-latest - needs: [test-e2e, test-integration, test-backup-and-restore] - if: ${{ always() && (needs.test-e2e.result == 'success' || needs.test-e2e.result == 'skipped') && (needs.test-integration.result == 'success' || needs.test-integration.result == 'skipped') && (needs.test-backup-and-restore.result == 'success' || needs.test-backup-and-restore.result == 'skipped') }} strategy: matrix: # NOTE: wal-replica excluded until its Dockerfile is created (feature-flagged, disabled by default). diff --git a/.github/workflows/test-E2E.yml b/.github/workflows/test-E2E.yml deleted file mode 100644 index d18985bf..00000000 --- a/.github/workflows/test-E2E.yml +++ /dev/null @@ -1,493 +0,0 @@ -name: TEST - E2E Test with mongosh - -on: - push: - branches: [ main, develop ] - pull_request: - branches: [ main, develop ] - schedule: - # Run daily at 2 AM UTC - - cron: '0 2 * * *' - workflow_dispatch: - inputs: - node_count: - description: 'Number of DocumentDB nodes' - required: false - default: '1' - test_level: - description: 'Test level to run' - required: false - default: 'full' - type: choice - options: - - quick - - integration - - full - image_tag: - description: 'Optional: Use existing image tag instead of building locally' - required: false - type: string - workflow_call: - inputs: - image_tag: - description: 'Optional: Use existing image tag instead of building locally' - required: false - type: string - node_count: - description: 'Number of DocumentDB nodes' - required: false - default: '1' - type: string - test_level: - description: 'Test level to run' - required: false - default: 'full' - type: string - -permissions: - contents: read - actions: read - packages: read - -env: - CERT_MANAGER_NS: cert-manager - OPERATOR_NS: documentdb-operator - DB_NS: documentdb-e2e-test - DB_NAME: documentdb-e2e - DB_USERNAME: k8s_secret_user - DB_PASSWORD: K8sSecret100 - DB_PORT: 10260 - -jobs: - # Conditional build workflow - only run if image_tag is not provided or on pull_request - build: - name: Build Images and Charts - if: ${{ (inputs.image_tag == '' || inputs.image_tag == null) || github.event_name == 'pull_request' }} - uses: ./.github/workflows/test-build-and-package.yml - with: - version: '0.2.0' - secrets: inherit - - e2e-test: - name: E2E (${{ matrix.architecture }}, K8s ${{ matrix.kubernetes_version }}) - runs-on: ${{ matrix.runner }} - timeout-minutes: 60 - needs: build - if: always() && (needs.build.result == 'success' || needs.build.result == 'skipped') - - strategy: - matrix: - include: - # ImageVolume mode (K8s >= 1.35) - uses separate PostgreSQL + extension images - - architecture: amd64 - runner: ubuntu-22.04 - test_scenario_name: "single-node" - node_count: 1 - instances_per_node: 1 - kubernetes_version: "v1.35.0" - - architecture: arm64 - runner: ubuntu-22.04-arm - test_scenario_name: "single-node" - node_count: 1 - instances_per_node: 1 - kubernetes_version: "v1.35.0" - - env: - # Use built image tag on PR or when no external tag provided - IMAGE_TAG: ${{ (github.event_name == 'pull_request' || inputs.image_tag == '' || inputs.image_tag == null) && needs.build.outputs.image_tag || inputs.image_tag }} - EXT_IMAGE_TAG: ${{ needs.build.outputs.ext_image_tag || '' }} - CHART_VERSION: ${{ needs.build.outputs.chart_version || '0.1.0' }} - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Download artifacts - if: ${{ (inputs.image_tag == '' || inputs.image_tag == null) || github.event_name == 'pull_request' }} - uses: actions/download-artifact@v4 - with: - pattern: 'build-*' - path: ./artifacts - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - - name: Log test configuration - run: | - echo "## E2E Test Configuration" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - if [[ -n "${{ inputs.image_tag }}" ]]; then - echo "- **Mode**: Using provided image tag" >> $GITHUB_STEP_SUMMARY - echo "- **Image Tag**: \`${{ inputs.image_tag }}\`" >> $GITHUB_STEP_SUMMARY - echo "- **Source**: External (no local build)" >> $GITHUB_STEP_SUMMARY - else - echo "- **Mode**: Using locally built images" >> $GITHUB_STEP_SUMMARY - echo "- **Image Tag**: \`${{ env.IMAGE_TAG }}\`" >> $GITHUB_STEP_SUMMARY - echo "- **Source**: Local build pipeline" >> $GITHUB_STEP_SUMMARY - fi - echo "- **Architecture**: \`${{ matrix.architecture }}\`" >> $GITHUB_STEP_SUMMARY - echo "- **Kubernetes Version**: \`${{ matrix.kubernetes_version }}\`" >> $GITHUB_STEP_SUMMARY - - - name: Setup test environment - uses: ./.github/actions/setup-test-environment - with: - test-type: 'e2e' - architecture: ${{ matrix.architecture }} - runner: ${{ matrix.runner }} - test-scenario-name: ${{ matrix.test_scenario_name }} - node-count: '${{ matrix.node_count }}' - instances-per-node: '${{ matrix.instances_per_node }}' - cert-manager-namespace: ${{ env.CERT_MANAGER_NS }} - operator-namespace: ${{ env.OPERATOR_NS }} - db-namespace: ${{ env.DB_NS }} - db-cluster-name: ${{ env.DB_NAME }} - db-username: ${{ env.DB_USERNAME }} - db-password: ${{ env.DB_PASSWORD }} - db-port: ${{ env.DB_PORT }} - image-tag: ${{ env.IMAGE_TAG }} - documentdb-image-tag: ${{ env.EXT_IMAGE_TAG }} - chart-version: ${{ env.CHART_VERSION }} - use-external-images: ${{ github.event_name != 'pull_request' && inputs.image_tag != '' && inputs.image_tag != null }} - github-token: ${{ secrets.GITHUB_TOKEN }} - repository-owner: ${{ github.repository_owner }} - kubernetes-version: ${{ matrix.kubernetes_version }} - - - name: Setup port forwarding for comprehensive tests - uses: ./.github/actions/setup-port-forwarding - with: - namespace: ${{ env.DB_NS }} - cluster-name: ${{ env.DB_NAME }} - port: ${{ env.DB_PORT }} - architecture: ${{ matrix.architecture }} - test-type: 'comprehensive' - - - name: Execute comprehensive mongosh tests - run: | - echo "Running comprehensive mongosh validation tests on ${{ matrix.architecture }}..." - - # Run comprehensive tests with validation using external script - if mongosh 127.0.0.1:$DB_PORT \ - -u $DB_USERNAME \ - -p $DB_PASSWORD \ - --authenticationMechanism SCRAM-SHA-256 \ - --tls \ - --tlsAllowInvalidCertificates \ - --file operator/src/scripts/test-scripts/comprehensive_mongosh_tests.js; then - echo "✓ Comprehensive mongosh tests completed successfully on ${{ matrix.architecture }}" - else - echo "❌ Comprehensive mongosh tests failed on ${{ matrix.architecture }}" - exit 1 - fi - - - name: Cleanup comprehensive test port forwarding - if: always() - run: | - # Stop port-forward if it exists - if [ -f /tmp/pf_pid ]; then - PF_PID=$(cat /tmp/pf_pid) - kill $PF_PID 2>/dev/null || true - rm -f /tmp/pf_pid - fi - - # Clean up output log - rm -f /tmp/pf_output.log - - # Clean up output log - rm -f /tmp/pf_output.log - - - name: Setup port forwarding for performance tests - uses: ./.github/actions/setup-port-forwarding - with: - namespace: ${{ env.DB_NS }} - cluster-name: ${{ env.DB_NAME }} - port: ${{ env.DB_PORT }} - architecture: ${{ matrix.architecture }} - test-type: 'performance' - - - name: Execute performance tests - run: | - echo "Running performance validation tests on ${{ matrix.architecture }}..." - - # Run performance tests using external script - if mongosh 127.0.0.1:$DB_PORT \ - -u $DB_USERNAME \ - -p $DB_PASSWORD \ - --authenticationMechanism SCRAM-SHA-256 \ - --tls \ - --tlsAllowInvalidCertificates \ - --file operator/src/scripts/test-scripts/performance_test.js; then - echo "✓ Performance tests completed successfully on ${{ matrix.architecture }}" - else - echo "❌ Performance tests failed on ${{ matrix.architecture }}" - exit 1 - fi - - - name: Cleanup performance testing - if: always() - run: | - # Stop performance test port-forward - if [ -f /tmp/perf_pf_pid ]; then - PF_PID=$(cat /tmp/perf_pf_pid) - kill $PF_PID 2>/dev/null || true - rm -f /tmp/perf_pf_pid - fi - - # Clean up output log - rm -f /tmp/perf_pf_output.log - - # Clean up output log - rm -f /tmp/perf_pf_output.log - - - name: Verify DocumentDB Status and Connection String - run: | - echo "Verifying DocumentDB status fields on ${{ matrix.architecture }}..." - - # Get the DocumentDB resource status - DB_STATUS=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.status}') - CONNECTION_STRING=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.connectionString}') - - echo "DocumentDB Status: $DB_STATUS" - echo "Connection String: ${CONNECTION_STRING:0:50}..." # Print first 50 chars only - - # Verify status is "Cluster in healthy state" - if [[ "$DB_STATUS" == "Cluster in healthy state" ]]; then - echo "✓ DocumentDB status is healthy" - else - echo "❌ DocumentDB status is not healthy: $DB_STATUS" - kubectl get documentdb $DB_NAME -n $DB_NS -o yaml - exit 1 - fi - - # Verify connection string is not empty - if [[ -n "$CONNECTION_STRING" ]]; then - echo "✓ Connection string is populated" - else - echo "❌ Connection string is empty" - kubectl get documentdb $DB_NAME -n $DB_NS -o yaml - exit 1 - fi - - echo "✅ DocumentDB status validation passed" - - - name: Test OTel monitoring sidecar enable/disable - run: | - echo "Testing OTel Collector sidecar injection on ${{ matrix.architecture }}..." - chmod +x operator/src/scripts/test-scripts/test-otel-monitoring.sh - operator/src/scripts/test-scripts/test-otel-monitoring.sh - - - name: Test cluster health and monitoring - run: | - echo "Testing cluster health and monitoring on ${{ matrix.architecture }}..." - - # Check DocumentDB resource status - kubectl get documentdb $DB_NAME -n $DB_NS -o yaml - - # Check pod resources and health - kubectl top pods -n $DB_NS --containers || echo "Metrics server not available" - - # Check logs for any errors - kubectl logs -n $DB_NS -l cnpg.io/cluster=$DB_NAME --tail=50 - - # Check events - kubectl get events -n $DB_NS --sort-by='.lastTimestamp' - - - name: Verify mount options are set by PV controller - run: | - echo "Verifying PV mount options are set by the PV controller..." - - # Find PV directly using documentdb.io labels set by the PV controller - pv_name=$(kubectl get pv -l documentdb.io/cluster=${{ env.DB_NAME }},documentdb.io/namespace=${{ env.DB_NS }} -o jsonpath='{.items[0].metadata.name}') - echo "PV name: $pv_name" - - if [ -z "$pv_name" ]; then - echo "❌ Failed to find PV with documentdb.io/cluster=${{ env.DB_NAME }} and documentdb.io/namespace=${{ env.DB_NS }}" - exit 1 - fi - - # Get mount options from PV - mount_options=$(kubectl get pv $pv_name -o jsonpath='{.spec.mountOptions}') - echo "PV mount options: $mount_options" - - # Check for security mount options (nodev, nosuid, noexec) - if echo "$mount_options" | grep -q "nodev" && \ - echo "$mount_options" | grep -q "nosuid" && \ - echo "$mount_options" | grep -q "noexec"; then - echo "✓ PV mount options (nodev, nosuid, noexec) are set correctly" - else - echo "❌ PV mount options are missing. Expected nodev, nosuid, noexec" - exit 1 - fi - - - name: Test PV reclaim policy default and explicit Delete - shell: bash - run: | - echo "Testing PV reclaim policy - default (Retain) and explicit Delete..." - - # Test 1: Verify default policy is Retain on the existing cluster - echo "=== Test 1: Verify default PV reclaim policy is Retain ===" - - # Find PV directly using documentdb.io labels set by the PV controller - pv_name=$(kubectl get pv -l documentdb.io/cluster=${{ env.DB_NAME }},documentdb.io/namespace=${{ env.DB_NS }} -o jsonpath='{.items[0].metadata.name}') - echo "PV name: $pv_name" - - if [ -z "$pv_name" ]; then - echo "❌ Failed to find PV with documentdb.io/cluster=${{ env.DB_NAME }} and documentdb.io/namespace=${{ env.DB_NS }}" - exit 1 - fi - - # Verify default PV reclaim policy is Retain - current_policy=$(kubectl get pv $pv_name -o jsonpath='{.spec.persistentVolumeReclaimPolicy}') - echo "Current PV reclaim policy: $current_policy" - - if [ "$current_policy" != "Retain" ]; then - echo "❌ Expected default PV reclaim policy to be 'Retain', but got '$current_policy'" - exit 1 - fi - echo "✓ Default PV reclaim policy is correctly set to Retain" - - # Test 2: Change policy to Delete and verify PV is deleted with cluster - echo "" - echo "=== Test 2: Change policy to Delete and verify PV cleanup ===" - - # Patch the existing DocumentDB to set persistentVolumeReclaimPolicy to Delete - echo "Patching DocumentDB to set persistentVolumeReclaimPolicy to Delete..." - kubectl -n ${{ env.DB_NS }} patch documentdb ${{ env.DB_NAME }} --type=merge \ - -p '{"spec":{"resource":{"storage":{"persistentVolumeReclaimPolicy":"Delete"}}}}' - - # Wait for PV controller to update the PV reclaim policy - echo "Waiting for PV reclaim policy to be updated to Delete..." - MAX_RETRIES=30 - SLEEP_INTERVAL=5 - ITER=0 - while [ $ITER -lt $MAX_RETRIES ]; do - new_policy=$(kubectl get pv $pv_name -o jsonpath='{.spec.persistentVolumeReclaimPolicy}') - if [ "$new_policy" == "Delete" ]; then - echo "✓ PV reclaim policy updated to Delete" - break - else - echo "PV reclaim policy is still '$new_policy'. Waiting..." - sleep $SLEEP_INTERVAL - fi - ((++ITER)) - done - - if [ "$new_policy" != "Delete" ]; then - echo "❌ PV reclaim policy was not updated to Delete within expected time" - exit 1 - fi - - # Delete the DocumentDB cluster - echo "Deleting DocumentDB cluster to test PV cleanup with Delete policy..." - kubectl -n ${{ env.DB_NS }} delete documentdb ${{ env.DB_NAME }} --wait=false - - # Wait for DocumentDB to be deleted - echo "Waiting for DocumentDB to be deleted..." - MAX_RETRIES=30 - SLEEP_INTERVAL=10 - ITER=0 - while [ $ITER -lt $MAX_RETRIES ]; do - db_exists=$(kubectl -n ${{ env.DB_NS }} get documentdb ${{ env.DB_NAME }} --ignore-not-found) - if [ -z "$db_exists" ]; then - echo "✓ DocumentDB deleted successfully." - break - else - echo "DocumentDB still exists. Waiting..." - sleep $SLEEP_INTERVAL - fi - ((++ITER)) - done - - # Verify no PVsRetained warning event was emitted (since policy is Delete) - events=$(kubectl -n ${{ env.DB_NS }} get events --field-selector reason=PVsRetained,involvedObject.name=${{ env.DB_NAME }} --ignore-not-found -o jsonpath='{.items}') - if [ -z "$events" ] || [ "$events" == "[]" ]; then - echo "✓ No PVsRetained warning event emitted (expected for Delete policy)" - else - echo "⚠️ Unexpected PVsRetained event found for Delete policy cluster" - fi - - # Wait a bit for PV to be deleted (the storage class handles actual deletion) - echo "Waiting for PV to be deleted..." - sleep 30 - - # Verify PV was deleted (because reclaim policy is Delete) - pv_exists=$(kubectl get pv $pv_name --ignore-not-found 2>/dev/null) - if [ -z "$pv_exists" ]; then - echo "✓ PV $pv_name was deleted as expected (Delete policy)" - else - pv_status=$(kubectl get pv $pv_name -o jsonpath='{.status.phase}') - echo "⚠️ PV $pv_name still exists with status: $pv_status" - echo "Note: PV deletion depends on the storage provisioner. The reclaim policy was correctly set to Delete." - fi - - echo "" - echo "✓ PV reclaim policy test completed successfully" - - - name: Collect comprehensive logs on failure - if: failure() - uses: ./.github/actions/collect-logs - with: - architecture: ${{ matrix.architecture }} - operator-namespace: ${{ env.OPERATOR_NS }} - db-namespace: ${{ env.DB_NS }} - db-name: ${{ env.DB_NAME }} - - - name: Test completion summary - if: always() - run: | - echo "## E2E Test Summary for ${{ matrix.architecture }} (K8s ${{ matrix.kubernetes_version }})" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "- **Architecture**: ${{ matrix.architecture }}" >> $GITHUB_STEP_SUMMARY - echo "- **Runner**: ${{ matrix.runner }}" >> $GITHUB_STEP_SUMMARY - echo "- **Kubernetes Version**: ${{ matrix.kubernetes_version }}" >> $GITHUB_STEP_SUMMARY - echo "- **Test Scenario**: ${{ matrix.test_scenario_name }}" >> $GITHUB_STEP_SUMMARY - echo "- **Node Count**: ${{ matrix.node_count }}" >> $GITHUB_STEP_SUMMARY - echo "- **Image Tag**: ${{ env.IMAGE_TAG }}" >> $GITHUB_STEP_SUMMARY - echo "- **Chart Version**: ${{ env.CHART_VERSION }}" >> $GITHUB_STEP_SUMMARY - if [[ -n "${{ inputs.image_tag }}" ]]; then - echo "- **Using External Images**: true" >> $GITHUB_STEP_SUMMARY - else - echo "- **Using External Images**: false" >> $GITHUB_STEP_SUMMARY - fi - - if [[ "${{ job.status }}" == "success" ]]; then - echo "- **Status**: ✅ PASSED" >> $GITHUB_STEP_SUMMARY - else - echo "- **Status**: ❌ FAILED" >> $GITHUB_STEP_SUMMARY - fi - - test-summary: - name: E2E Test Summary - runs-on: ubuntu-latest - if: always() - needs: [build, e2e-test] - steps: - - name: Generate overall test summary - run: | - echo "## E2E Test Results Summary" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Test Configuration:" >> $GITHUB_STEP_SUMMARY - echo "- **Build Step**: ${{ inputs.image_tag && 'Skipped (using external images)' || 'Executed' }}" >> $GITHUB_STEP_SUMMARY - echo "- **External Images**: ${{ inputs.image_tag && 'true' || 'false' }}" >> $GITHUB_STEP_SUMMARY - echo "- **Image Tag**: ${{ inputs.image_tag || 'Built from source' }}" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Parallel Architecture Testing:" >> $GITHUB_STEP_SUMMARY - echo "- **AMD64**: Tested in parallel on ubuntu-latest" >> $GITHUB_STEP_SUMMARY - echo "- **ARM64**: Tested in parallel on ubuntu-22.04-arm" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "Both architectures run simultaneously for faster feedback!" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Job Results:" >> $GITHUB_STEP_SUMMARY - echo "- **Build**: ${{ needs.build.result }}" >> $GITHUB_STEP_SUMMARY - echo "- **E2E Tests**: ${{ needs.e2e-test.result }}" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - # Overall status - if [[ "${{ needs.e2e-test.result }}" == "success" ]]; then - echo "### Overall Status: ✅ ALL TESTS PASSED" >> $GITHUB_STEP_SUMMARY - echo "Both AMD64 and ARM64 architectures tested successfully in parallel!" >> $GITHUB_STEP_SUMMARY - else - echo "### Overall Status: ❌ SOME TESTS FAILED" >> $GITHUB_STEP_SUMMARY - echo "Check individual job results above for details." >> $GITHUB_STEP_SUMMARY - fi diff --git a/.github/workflows/test-backup-and-restore.yml b/.github/workflows/test-backup-and-restore.yml deleted file mode 100644 index e1dfb899..00000000 --- a/.github/workflows/test-backup-and-restore.yml +++ /dev/null @@ -1,580 +0,0 @@ -name: Test - Backup and Restore - -on: - push: - branches: [ main, develop ] - pull_request: - branches: [ main, develop ] - schedule: - - cron: '0 2 * * *' - workflow_dispatch: - inputs: - node_count: - description: 'Number of DocumentDB nodes' - required: false - default: '1' - image_tag: - description: 'Optional: Use existing image tag instead of building locally' - required: false - type: string - workflow_call: - inputs: - image_tag: - description: 'Optional: Use existing image tag instead of building locally' - required: false - type: string - node_count: - description: 'Number of DocumentDB nodes' - required: false - default: '1' - type: string - -permissions: - contents: read - actions: read - packages: read - -env: - CERT_MANAGER_NS: cert-manager - OPERATOR_NS: documentdb-operator - DB_NS: documentdb-backup-and-restore-test - DB_NAME: documentdb-backup-and-restore - DB_RESTORE_NAME: documentdb-restore-from-backup - DB_USERNAME: k8s_secret_user - DB_PASSWORD: K8sSecret100 - DB_PORT: 10260 - DOCUMENTDB_IMAGE: "" - GATEWAY_IMAGE: "" - -jobs: - # Conditional build workflow - only run if image_tag is not provided or on pull_request - build: - name: Build Images and Charts - if: ${{ (inputs.image_tag == '' || inputs.image_tag == null) || github.event_name == 'pull_request' }} - uses: ./.github/workflows/test-build-and-package.yml - with: - version: '0.2.0' - secrets: inherit - - backup-and-restore-test: - name: Run Backup and Restore Tests - runs-on: ${{ matrix.runner }} - timeout-minutes: 60 - needs: build - if: always() && (needs.build.result == 'success' || needs.build.result == 'skipped') - - strategy: - matrix: - include: - - architecture: amd64 - runner: ubuntu-22.04 - test_scenario_name: "single-node" - node_count: 1 - instances_per_node: 1 - - architecture: arm64 - runner: ubuntu-22.04-arm - test_scenario_name: "single-node" - node_count: 1 - instances_per_node: 1 - env: - # Use built image tag on PR or when no external tag provided - IMAGE_TAG: ${{ (github.event_name == 'pull_request' || inputs.image_tag == '' || inputs.image_tag == null) && needs.build.outputs.image_tag || inputs.image_tag }} - EXT_IMAGE_TAG: ${{ needs.build.outputs.ext_image_tag || '' }} - CHART_VERSION: ${{ needs.build.outputs.chart_version || '0.1.0' }} - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Download artifacts - if: ${{ (inputs.image_tag == '' || inputs.image_tag == null) || github.event_name == 'pull_request' }} - uses: actions/download-artifact@v4 - with: - pattern: 'build-*' - path: ./artifacts - - - name: Log test configuration - run: | - echo "## Backup and Restore Test Configuration" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - if [[ -n "${{ inputs.image_tag }}" ]]; then - echo "- **Mode**: Using provided image tag" >> $GITHUB_STEP_SUMMARY - echo "- **Image Tag**: \`${{ inputs.image_tag }}\`" >> $GITHUB_STEP_SUMMARY - echo "- **Source**: External (no local build)" >> $GITHUB_STEP_SUMMARY - else - echo "- **Mode**: Using locally built images" >> $GITHUB_STEP_SUMMARY - echo "- **Image Tag**: \`${{ env.IMAGE_TAG }}\`" >> $GITHUB_STEP_SUMMARY - echo "- **Source**: Local build pipeline" >> $GITHUB_STEP_SUMMARY - fi - echo "- **Architecture**: \`${{ matrix.architecture }}\`" >> $GITHUB_STEP_SUMMARY - - - name: Determine DocumentDB and Gateway image references - run: | - # For local builds, use the locally built images; for external, read documentDbVersion from values.yaml. - # Database images use a separate version track from operator images. - if [[ "${{ github.event_name }}" == "pull_request" || -z "${{ inputs.image_tag }}" ]]; then - DOCDB_IMAGE="ghcr.io/${{ github.repository_owner }}/documentdb-kubernetes-operator/documentdb:${{ env.EXT_IMAGE_TAG }}-${{ matrix.architecture }}" - GW_IMAGE="ghcr.io/${{ github.repository_owner }}/documentdb-kubernetes-operator/gateway:${{ env.EXT_IMAGE_TAG }}-${{ matrix.architecture }}" - else - DOCDB_TAG=$(grep 'documentDbVersion:' operator/documentdb-helm-chart/values.yaml | sed 's/.*"\(.*\)".*/\1/') - echo "Using documentDbVersion from values.yaml: $DOCDB_TAG" - DOCDB_IMAGE="ghcr.io/${{ github.repository_owner }}/documentdb-kubernetes-operator/documentdb:${DOCDB_TAG}" - GW_IMAGE="ghcr.io/${{ github.repository_owner }}/documentdb-kubernetes-operator/gateway:${DOCDB_TAG}" - fi - echo "DOCUMENTDB_IMAGE_REF=$DOCDB_IMAGE" >> $GITHUB_ENV - echo "GATEWAY_IMAGE_REF=$GW_IMAGE" >> $GITHUB_ENV - echo "DocumentDB image: $DOCDB_IMAGE" - echo "Gateway image: $GW_IMAGE" - - - name: Setup test environment - uses: ./.github/actions/setup-test-environment - with: - architecture: ${{ matrix.architecture }} - runner: ${{ matrix.runner }} - test-scenario-name: ${{ matrix.test_scenario_name }} - node-count: '${{ matrix.node_count }}' - instances-per-node: '${{ matrix.instances_per_node }}' - cert-manager-namespace: ${{ env.CERT_MANAGER_NS }} - operator-namespace: ${{ env.OPERATOR_NS }} - db-namespace: ${{ env.DB_NS }} - db-cluster-name: ${{ env.DB_NAME }} - db-username: ${{ env.DB_USERNAME }} - db-password: ${{ env.DB_PASSWORD }} - db-port: ${{ env.DB_PORT }} - image-tag: ${{ env.IMAGE_TAG }} - documentdb-image-tag: ${{ env.EXT_IMAGE_TAG }} - chart-version: ${{ env.CHART_VERSION }} - documentdb-image: ${{ env.DOCUMENTDB_IMAGE_REF }} - gateway-image: ${{ env.GATEWAY_IMAGE_REF }} - use-external-images: ${{ github.event_name != 'pull_request' && inputs.image_tag != '' && inputs.image_tag != null }} - github-token: ${{ secrets.GITHUB_TOKEN }} - repository-owner: ${{ github.repository_owner }} - - name: Setup port forwarding for comprehensive tests - uses: ./.github/actions/setup-port-forwarding - with: - namespace: ${{ env.DB_NS }} - cluster-name: ${{ env.DB_NAME }} - port: ${{ env.DB_PORT }} - architecture: ${{ matrix.architecture }} - test-type: 'comprehensive' - - - name: Insert test data using mongosh - run: | - echo "Inserting test data into DocumentDB cluster..." - if mongosh 127.0.0.1:$DB_PORT \ - -u $DB_USERNAME \ - -p $DB_PASSWORD \ - --authenticationMechanism SCRAM-SHA-256 \ - --tls \ - --tlsAllowInvalidCertificates \ - --eval "for (let i = 1; i <= 100; i++) { db.testCollection.insertOne({ index: i, message: 'This is document ' + i }); }" ; then - echo "✓ Test data insertion completed successfully on ${{ matrix.architecture }}" - else - echo "❌ Test data insertion failed on ${{ matrix.architecture }}" - exit 1 - fi - - echo "Verifying inserted test data..." - count=$(mongosh 127.0.0.1:$DB_PORT --quiet --eval "db.testCollection.countDocuments({})" -u $DB_USERNAME -p $DB_PASSWORD --authenticationMechanism SCRAM-SHA-256 --tls --tlsAllowInvalidCertificates) - if [[ "$count" -eq 100 ]]; then - echo "✓ Test data verification completed successfully on ${{ matrix.architecture }}" - else - echo "❌ Test data verification failed on ${{ matrix.architecture }}" - exit 1 - fi - - - name: Create ScheduledBackup to trigger backups - shell: bash - run: | - cat </dev/null || true - rm -f /tmp/pf_pid - fi - - # Clean up output log - rm -f /tmp/pf_output.log - - # Clean up output log - rm -f /tmp/pf_output.log - - - name: Restore from backup - shell: bash - run: | - # Get the latest backup name - backup_name=$(kubectl -n ${{ env.DB_NS }} get backups -o jsonpath='{.items[?(@.status.phase=="completed")].metadata.name}' | tr ' ' '\n' | sort | tail -n 1) - - # Create DocumentDB resource - cat </dev/null || echo "") - if [ "$status" == "Cluster in healthy state" ]; then - echo "✓ DocumentDB cluster ${{ env.DB_RESTORE_NAME }} is healthy." - exit 0 - else - echo "Current status: $status. Waiting..." - kubectl -n ${{ env.DB_NS }} get documentdb ${{ env.DB_RESTORE_NAME }} || true - sleep $SLEEP_INTERVAL - fi - ((++ITER)) - done - echo "❌ DocumentDB cluster ${{ env.DB_RESTORE_NAME }} did not become healthy within expected time." - kubectl -n ${{ env.DB_NS }} describe documentdb ${{ env.DB_RESTORE_NAME }} || true - kubectl -n ${{ env.DB_NS }} get pods -l cnpg.io/cluster=${{ env.DB_RESTORE_NAME }} || true - exit 1 - - - name: Setup port forwarding for comprehensive tests - uses: ./.github/actions/setup-port-forwarding - with: - namespace: ${{ env.DB_NS }} - cluster-name: ${{ env.DB_RESTORE_NAME }} - port: ${{ env.DB_PORT }} - architecture: ${{ matrix.architecture }} - test-type: 'comprehensive' - - - name: Validate restored data - run: | - # Validate that the restored cluster has the expected data - count=$(mongosh 127.0.0.1:$DB_PORT --quiet --eval "db.testCollection.countDocuments({})" -u $DB_USERNAME -p $DB_PASSWORD --authenticationMechanism SCRAM-SHA-256 --tls --tlsAllowInvalidCertificates) - if [ "$count" -eq 100 ]; then - echo "✓ Data validation completed successfully on ${{ matrix.architecture }}" - else - echo "❌ Data validation failed on ${{ matrix.architecture }}" - exit 1 - fi - - - name: Cleanup comprehensive test port forwarding - if: always() - run: | - # Stop port-forward if it exists - if [ -f /tmp/pf_pid ]; then - PF_PID=$(cat /tmp/pf_pid) - kill $PF_PID 2>/dev/null || true - rm -f /tmp/pf_pid - fi - - # Clean up output log - rm -f /tmp/pf_output.log - - # Clean up output log - rm -f /tmp/pf_output.log - - - name: Test if expired backups are cleaned up - shell: bash - run: | - echo "Verifying expired backups are cleaned up..." - # pick up one backup name - backup_name=$(kubectl -n $DB_NS get backups -o jsonpath='{.items[0].metadata.name}') - # set expiration time to past - kubectl -n $DB_NS patch backup $backup_name --type='json' --type=merge -p='{"status":{"expiredAt":"2000-01-01T00:00:00Z"}}' --subresource=status - # wait for cleanup - MAX_RETRIES=10 - SLEEP_INTERVAL=15 - ITER=0 - while [ $ITER -lt $MAX_RETRIES ]; do - backup_status=$(kubectl -n $DB_NS get backup $backup_name --ignore-not-found) - if [ -z "$backup_status" ]; then - echo "✓ Expired backup cleaned up successfully." - exit 0 - else - echo "Backup $backup_name still exists. Retrying in $SLEEP_INTERVAL seconds..." - kubectl -n $DB_NS get backup $backup_name - sleep $SLEEP_INTERVAL - fi - ((++ITER)) - done - echo "❌ Expired backup was not cleaned up within expected time." - exit 1 - - - name: Test PV retention after DocumentDB deletion - id: test-pv-retention - shell: bash - run: | - echo "Testing PV retention after DocumentDB deletion..." - - # Find PV directly using documentdb.io labels set by the PV controller - # PVs are cluster-scoped and labeled with documentdb.io/cluster and documentdb.io/namespace - pv_name=$(kubectl get pv -l documentdb.io/cluster=${{ env.DB_RESTORE_NAME }},documentdb.io/namespace=${{ env.DB_NS }} -o jsonpath='{.items[0].metadata.name}') - echo "PV name: $pv_name" - - if [ -z "$pv_name" ]; then - echo "❌ Failed to find PV with documentdb.io/cluster=${{ env.DB_RESTORE_NAME }} and documentdb.io/namespace=${{ env.DB_NS }}" - exit 1 - fi - - # Check current PV reclaim policy - should be Retain by default - current_policy=$(kubectl get pv $pv_name -o jsonpath='{.spec.persistentVolumeReclaimPolicy}') - echo "Current PV reclaim policy: $current_policy" - - if [ "$current_policy" != "Retain" ]; then - echo "❌ Expected PV reclaim policy to be 'Retain' (default), but got '$current_policy'" - exit 1 - fi - echo "✓ PV reclaim policy is correctly set to Retain (default)" - - # Delete the restored DocumentDB cluster - kubectl -n ${{ env.DB_NS }} delete documentdb ${{ env.DB_RESTORE_NAME }} --wait=false - - # Wait for DocumentDB to be deleted - echo "Waiting for DocumentDB to be deleted..." - MAX_RETRIES=30 - SLEEP_INTERVAL=10 - ITER=0 - while [ $ITER -lt $MAX_RETRIES ]; do - db_exists=$(kubectl -n ${{ env.DB_NS }} get documentdb ${{ env.DB_RESTORE_NAME }} --ignore-not-found) - if [ -z "$db_exists" ]; then - echo "✓ DocumentDB deleted successfully." - break - else - echo "DocumentDB still exists. Waiting..." - sleep $SLEEP_INTERVAL - fi - ((++ITER)) - done - - # Verify PV still exists (because reclaim policy is Retain) - pv_exists=$(kubectl get pv $pv_name --ignore-not-found) - if [ -n "$pv_exists" ]; then - echo "✓ PV $pv_name retained after DocumentDB deletion" - else - echo "❌ PV $pv_name was deleted unexpectedly" - exit 1 - fi - - # Verify pv_name is not empty before writing to GITHUB_OUTPUT - if [ -z "$pv_name" ]; then - echo "❌ Error: PV name is empty, cannot proceed with recovery test" - exit 1 - fi - - # Store PV name for later steps using GitHub Actions output (more robust than temp files) - echo "pv_name=$pv_name" >> $GITHUB_OUTPUT - - - name: Restore DocumentDB from retained PV - shell: bash - run: | - pv_name="${{ steps.test-pv-retention.outputs.pv_name }}" - echo "Restoring DocumentDB from retained PV: $pv_name" - - # Create DocumentDB resource with PV recovery - echo "Creating DocumentDB with PV recovery from $pv_name" - cat </dev/null || echo "") - if [ "$status" == "Cluster in healthy state" ]; then - echo "✓ DocumentDB cluster ${{ env.DB_RESTORE_NAME }}-from-pv is healthy." - exit 0 - else - echo "Current status: $status. Waiting..." - kubectl -n ${{ env.DB_NS }} get documentdb ${{ env.DB_RESTORE_NAME }}-from-pv || true - sleep $SLEEP_INTERVAL - fi - ((++ITER)) - done - echo "❌ DocumentDB cluster ${{ env.DB_RESTORE_NAME }}-from-pv did not become healthy within expected time." - kubectl -n ${{ env.DB_NS }} describe documentdb ${{ env.DB_RESTORE_NAME }}-from-pv || true - kubectl -n ${{ env.DB_NS }} get pods -l cnpg.io/cluster=${{ env.DB_RESTORE_NAME }}-from-pv || true - exit 1 - - - name: Setup port forwarding for PV restored cluster - uses: ./.github/actions/setup-port-forwarding - with: - namespace: ${{ env.DB_NS }} - cluster-name: ${{ env.DB_RESTORE_NAME }}-from-pv - port: ${{ env.DB_PORT }} - architecture: ${{ matrix.architecture }} - test-type: 'comprehensive' - - - name: Validate data exists after PV restoration - run: | - echo "Validating data exists after PV restoration..." - - # Validate that the restored cluster has the expected data - count=$(mongosh 127.0.0.1:$DB_PORT --quiet --eval "db.testCollection.countDocuments({})" -u $DB_USERNAME -p $DB_PASSWORD --authenticationMechanism SCRAM-SHA-256 --tls --tlsAllowInvalidCertificates) - if [ "$count" -eq 100 ]; then - echo "✓ Data validation completed successfully after PV restoration on ${{ matrix.architecture }}" - else - echo "❌ Data validation failed after PV restoration on ${{ matrix.architecture }}. Count: $count" - exit 1 - fi - - - name: Verify temporary recovery PVC is cleaned up - shell: bash - run: | - echo "Verifying temporary recovery PVC is cleaned up after cluster is healthy..." - - # The temp PVC name follows the pattern: -pv-recovery-temp - temp_pvc_name="${{ env.DB_RESTORE_NAME }}-from-pv-pv-recovery-temp" - - # Wait a bit for cleanup to happen - sleep 10 - - # Check if temporary PVC still exists - temp_pvc_exists=$(kubectl -n ${{ env.DB_NS }} get pvc $temp_pvc_name --ignore-not-found -o name) - if [ -z "$temp_pvc_exists" ]; then - echo "✓ Temporary recovery PVC $temp_pvc_name was cleaned up successfully" - else - echo "❌ Temporary recovery PVC $temp_pvc_name still exists after cluster is healthy" - kubectl -n ${{ env.DB_NS }} get pvc $temp_pvc_name - exit 1 - fi - - - name: Cleanup PV restored cluster port forwarding - if: always() - run: | - # Stop port-forward if it exists - if [ -f /tmp/pf_pid ]; then - PF_PID=$(cat /tmp/pf_pid) - kill $PF_PID 2>/dev/null || true - rm -f /tmp/pf_pid - fi - - # Clean up output log - rm -f /tmp/pf_output.log - - - name: Collect logs on failure - if: failure() - uses: ./.github/actions/collect-logs - with: - architecture: ${{ matrix.architecture }} - operator-namespace: ${{ env.OPERATOR_NS }} - db-namespace: ${{ env.DB_NS }} - db-cluster-name: ${{ env.DB_NAME }} - cert-manager-namespace: ${{ env.CERT_MANAGER_NS }} diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml new file mode 100644 index 00000000..8a0633c9 --- /dev/null +++ b/.github/workflows/test-e2e.yml @@ -0,0 +1,274 @@ +# Unified DocumentDB E2E test workflow. +# +# Replaces the legacy test-E2E.yml / test-integration.yml / +# test-backup-and-restore.yml / test-upgrade-and-rollback.yml quartet. +# See docs/designs/e2e-test-suite.md ("CI Workflow" section) for the +# design rationale. +# +# Each matrix job runs a single Ginkgo label-filtered slice of the suite +# under test/e2e/. The composite action .github/actions/setup-test-environment +# provisions a Kind cluster, installs cert-manager + the operator, and +# deploys any scenario-specific prerequisites. + +name: TEST - E2E + +on: + push: + branches: [main] + pull_request: + branches: [main] + paths: + - 'test/e2e/**' + - 'operator/src/**' + - 'operator/documentdb-helm-chart/**' + - '.github/workflows/test-e2e.yml' + - '.github/actions/**' + workflow_dispatch: + inputs: + label: + description: 'Ginkgo --label-filter override (empty = use per-job default)' + required: false + type: string + default: '' + depth: + description: 'Test depth tier (maps to TEST_DEPTH / E2E_DEPTH)' + required: false + type: choice + options: + - Low + - Medium + - High + default: Medium + keep_clusters: + description: 'Keep Kind clusters running after tests (for debugging)' + required: false + type: boolean + default: false + +permissions: + contents: read + actions: read + packages: read + +env: + # Namespaces / identity used by the composite setup action. Held here + # so every job inherits them without duplication. + CERT_MANAGER_NS: cert-manager + OPERATOR_NS: documentdb-operator + DB_NS: documentdb-e2e + DB_NAME: documentdb-e2e + DB_USERNAME: k8s_secret_user + DB_PASSWORD: K8sSecret100 + DB_PORT: 10260 + +jobs: + # --------------------------------------------------------------------------- + # Build operator + gateway images and the helm chart once per workflow run. + # Each E2E job below downloads the resulting artifacts into ./artifacts so + # setup-test-environment can load them into its Kind cluster. + # --------------------------------------------------------------------------- + build: + name: Build Images and Charts + uses: ./.github/workflows/test-build-and-package.yml + with: + version: '0.2.0' + secrets: inherit + + # --------------------------------------------------------------------------- + # E2E matrix. + # + # Rows = (label-group, architecture). The label-group carries the + # Ginkgo --label-filter, Ginkgo --procs setting, and a human-readable + # scenario name used for artifact naming and kind cluster isolation. + # + # arm64 rows target `ubuntu-22.04-arm` — the same GitHub-hosted runner + # the legacy workflows use. If that SKU becomes unavailable in the + # future, the arm64 rows can be removed or gated on a feature flag; + # do not silently drop them. + # --------------------------------------------------------------------------- + e2e: + name: E2E ${{ matrix.group }} (${{ matrix.architecture }}) + needs: build + if: | + always() + && needs.build.result == 'success' + && ( + matrix.group != 'performance' + || github.event_name == 'workflow_dispatch' + || contains(github.event.pull_request.labels.*.name, 'run-perf') + ) + runs-on: ${{ matrix.runner }} + timeout-minutes: 90 + strategy: + fail-fast: false + matrix: + architecture: [amd64, arm64] + group: + - smoke + - lifecycle + - scale + - data + - performance + - backup + - tls + - feature + - upgrade + include: + # Per-group defaults. `default_filter` is used when the + # workflow_dispatch `label` input is empty. + # + # NOTE: the design doc's "feature || exposure || status" filter + # uses the code-level label name `feature-gates` (see + # test/e2e/labels.go FeatureLabel). The design-doc name + # `feature` is a shorthand; we honour code as source of truth. + - group: smoke + default_filter: 'smoke' + procs: 'auto' + - group: lifecycle + default_filter: 'lifecycle' + procs: 'auto' + - group: scale + default_filter: 'scale' + procs: '2' + - group: data + default_filter: 'data' + procs: 'auto' + - group: performance + default_filter: 'performance' + procs: '1' + - group: backup + default_filter: 'backup' + procs: '2' + - group: tls + default_filter: 'tls' + procs: 'auto' + - group: feature + default_filter: 'feature-gates || exposure || status' + procs: 'auto' + - group: upgrade + default_filter: 'upgrade' + procs: '1' + # Per-architecture runner mapping. + - architecture: amd64 + runner: ubuntu-22.04 + - architecture: arm64 + runner: ubuntu-22.04-arm + env: + E2E_RUN_ID: ${{ github.run_id }}-${{ github.run_attempt }} + E2E_DEPTH: ${{ inputs.depth || 'Medium' }} + TEST_DEPTH: ${{ inputs.depth || 'Medium' }} + GINKGO_LABEL_FILTER: ${{ inputs.label != '' && inputs.label || matrix.default_filter }} + E2E_KEEP_CLUSTERS: ${{ inputs.keep_clusters && '1' || '0' }} + IMAGE_TAG: ${{ needs.build.outputs.image_tag }} + CHART_VERSION: ${{ needs.build.outputs.chart_version || '0.1.0' }} + # Upgrade job knobs. Defaults point at the public OCI chart; + # repository admins may override via `vars`/`secrets` for private + # scenarios. If any of the *_IMAGE values are left unset the + # upgrade specs Skip at runtime (see test/e2e/tests/upgrade/). + E2E_UPGRADE: ${{ matrix.group == 'upgrade' && '1' || '' }} + E2E_UPGRADE_PREVIOUS_CHART: ${{ vars.E2E_UPGRADE_PREVIOUS_CHART || 'oci://ghcr.io/documentdb/charts/documentdb-operator' }} + E2E_UPGRADE_PREVIOUS_VERSION: ${{ vars.E2E_UPGRADE_PREVIOUS_VERSION || '' }} + E2E_UPGRADE_CURRENT_CHART: ${{ vars.E2E_UPGRADE_CURRENT_CHART || '' }} + E2E_UPGRADE_OLD_DOCUMENTDB_IMAGE: ${{ vars.E2E_UPGRADE_OLD_DOCUMENTDB_IMAGE || '' }} + E2E_UPGRADE_NEW_DOCUMENTDB_IMAGE: ${{ vars.E2E_UPGRADE_NEW_DOCUMENTDB_IMAGE || '' }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: test/e2e/go.mod + cache-dependency-path: test/e2e/go.sum + + - name: Download build artifacts + uses: actions/download-artifact@v4 + with: + pattern: 'build-*' + path: ./artifacts + + - name: Log test configuration + run: | + { + echo "## E2E — ${{ matrix.group }} (${{ matrix.architecture }})" + echo "" + echo "- **Label filter**: \`${GINKGO_LABEL_FILTER}\`" + echo "- **Procs**: \`${{ matrix.procs }}\`" + echo "- **Depth**: \`${E2E_DEPTH}\`" + echo "- **Run ID**: \`${E2E_RUN_ID}\`" + echo "- **Image tag**: \`${IMAGE_TAG}\`" + echo "- **Chart version**: \`${CHART_VERSION}\`" + } >> "$GITHUB_STEP_SUMMARY" + + - name: Setup test environment + uses: ./.github/actions/setup-test-environment + with: + test-type: 'e2e' + architecture: ${{ matrix.architecture }} + runner: ${{ matrix.runner }} + test-scenario-name: ${{ matrix.group }} + node-count: '1' + instances-per-node: '1' + cert-manager-namespace: ${{ env.CERT_MANAGER_NS }} + operator-namespace: ${{ env.OPERATOR_NS }} + db-namespace: ${{ env.DB_NS }} + db-cluster-name: ${{ env.DB_NAME }} + db-username: ${{ env.DB_USERNAME }} + db-password: ${{ env.DB_PASSWORD }} + db-port: ${{ env.DB_PORT }} + image-tag: ${{ env.IMAGE_TAG }} + chart-version: ${{ env.CHART_VERSION }} + use-external-images: 'false' + github-token: ${{ secrets.GITHUB_TOKEN }} + repository-owner: ${{ github.repository_owner }} + + - name: Install Ginkgo CLI + working-directory: test/e2e + run: | + go install github.com/onsi/ginkgo/v2/ginkgo + ginkgo version + + - name: Run E2E specs + working-directory: test/e2e + run: | + mkdir -p artifacts + set -o pipefail + ginkgo run \ + -r \ + --label-filter="${GINKGO_LABEL_FILTER}" \ + --procs=${{ matrix.procs }} \ + --timeout=75m \ + --keep-going \ + --junit-report=junit.xml \ + --output-dir=artifacts \ + ./tests/... 2>&1 | tee artifacts/ginkgo.log + + - name: Collect cluster diagnostics + if: failure() + uses: ./.github/actions/collect-logs + with: + architecture: ${{ matrix.architecture }} + operator-namespace: ${{ env.OPERATOR_NS }} + db-namespace: ${{ env.DB_NS }} + db-name: ${{ env.DB_NAME }} + + - name: Upload JUnit report + if: always() + uses: actions/upload-artifact@v4 + with: + name: e2e-junit-${{ matrix.group }}-${{ matrix.architecture }}-${{ github.run_attempt }} + path: test/e2e/artifacts/junit.xml + if-no-files-found: warn + retention-days: 14 + + - name: Upload E2E logs and diagnostics + if: failure() + uses: actions/upload-artifact@v4 + with: + name: e2e-logs-${{ matrix.group }}-${{ matrix.architecture }}-${{ github.run_attempt }} + path: | + test/e2e/artifacts/ + /tmp/cluster-logs/ + if-no-files-found: ignore + retention-days: 14 diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml deleted file mode 100644 index 1f718d4e..00000000 --- a/.github/workflows/test-integration.yml +++ /dev/null @@ -1,163 +0,0 @@ -name: TEST - Integration with python - -on: - push: - branches: [ main, develop ] - pull_request: - branches: [ main, develop ] - workflow_dispatch: - inputs: - image_tag: - description: 'Optional: Use existing image tag instead of building locally' - required: false - type: string - workflow_call: - inputs: - image_tag: - description: 'Optional: Use existing image tag instead of building locally' - required: false - type: string - -permissions: - packages: write - contents: read - id-token: write - -env: - # Cluster configuration - CERT_MANAGER_NS: cert-manager - OPERATOR_NS: documentdb-operator - DB_NS: documentdb-preview-ns - DB_NAME: documentdb-preview - # Connection parameters - DB_USERNAME: default_user - DB_PASSWORD: Admin100 - DB_PORT: 10260 - -jobs: - # Use the reusable build workflow - only if no image tag is provided or on pull_request - build: - name: Build Images and Charts - if: ${{ (github.event.inputs.image_tag == '' || github.event.inputs.image_tag == null) || github.event_name == 'pull_request' }} - uses: ./.github/workflows/test-build-and-package.yml - with: - version: '0.2.0' - secrets: inherit - - integration-test: - name: Run Integration Tests - runs-on: ${{ matrix.runner }} - timeout-minutes: 45 - needs: build - if: always() && (needs.build.result == 'success' || needs.build.result == 'skipped') - - strategy: - matrix: - include: - - architecture: amd64 - runner: ubuntu-22.04 - test_scenario_name: "single-node" - node_count: 1 - instances_per_node: 1 - - architecture: arm64 - runner: ubuntu-22.04-arm - test_scenario_name: "single-node" - node_count: 1 - instances_per_node: 1 - - env: - # Use built image tag on PR or when no external tag provided - IMAGE_TAG: ${{ (github.event_name == 'pull_request' || github.event.inputs.image_tag == '' || github.event.inputs.image_tag == null) && needs.build.outputs.image_tag || github.event.inputs.image_tag }} - EXT_IMAGE_TAG: ${{ needs.build.outputs.ext_image_tag || '' }} - CHART_VERSION: ${{ needs.build.outputs.chart_version || '0.1.0' }} - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Download artifacts - if: ${{ (github.event.inputs.image_tag == '' || github.event.inputs.image_tag == null) || github.event_name == 'pull_request' }} - uses: actions/download-artifact@v4 - with: - pattern: 'build-*' - path: ./artifacts - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - - name: Log test configuration - run: | - echo "## Integration Test Configuration" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - if [[ -n "${{ github.event.inputs.image_tag }}" ]]; then - echo "- **Mode**: Using provided image tag" >> $GITHUB_STEP_SUMMARY - echo "- **Image Tag**: \`${{ github.event.inputs.image_tag }}\`" >> $GITHUB_STEP_SUMMARY - echo "- **Source**: External (no local build)" >> $GITHUB_STEP_SUMMARY - else - echo "- **Mode**: Using locally built images" >> $GITHUB_STEP_SUMMARY - echo "- **Image Tag**: \`${{ env.IMAGE_TAG }}\`" >> $GITHUB_STEP_SUMMARY - echo "- **Source**: Local build pipeline" >> $GITHUB_STEP_SUMMARY - fi - echo "- **Architecture**: \`${{ matrix.architecture }}\`" >> $GITHUB_STEP_SUMMARY - - - name: Setup test environment - uses: ./.github/actions/setup-test-environment - with: - test-type: 'integration' - architecture: ${{ matrix.architecture }} - runner: ${{ matrix.runner }} - test-scenario-name: ${{ matrix.test_scenario_name }} - node-count: '${{ matrix.node_count }}' - instances-per-node: '${{ matrix.instances_per_node }}' - cert-manager-namespace: ${{ env.CERT_MANAGER_NS }} - operator-namespace: ${{ env.OPERATOR_NS }} - db-namespace: ${{ env.DB_NS }} - db-cluster-name: ${{ env.DB_NAME }} - db-username: ${{ env.DB_USERNAME }} - db-password: ${{ env.DB_PASSWORD }} - db-port: ${{ env.DB_PORT }} - image-tag: ${{ env.IMAGE_TAG }} - documentdb-image-tag: ${{ env.EXT_IMAGE_TAG }} - chart-version: ${{ env.CHART_VERSION }} - use-external-images: ${{ github.event_name != 'pull_request' && github.event.inputs.image_tag != '' && github.event.inputs.image_tag != null }} - github-token: ${{ secrets.GITHUB_TOKEN }} - repository-owner: ${{ github.repository_owner }} - - - name: Test connection with mongosh - run: | - echo "Testing connection with mongosh on ${{ matrix.architecture }} architecture..." - chmod +x operator/src/scripts/test-scripts/test-mongodb-connection.sh - ./operator/src/scripts/test-scripts/test-mongodb-connection.sh \ - --architecture "${{ matrix.architecture }}" \ - --namespace "${{ env.DB_NS }}" \ - --cluster-name "${{ env.DB_NAME }}" \ - --pod-name "${{ env.DB_NAME }}-1" \ - --port "${{ env.DB_PORT }}" \ - --username "${{ env.DB_USERNAME }}" \ - --password "${{ env.DB_PASSWORD }}" \ - --test-type 'basic' - - - name: Test with Python PyMongo client - run: | - echo "Testing with Python PyMongo client on ${{ matrix.architecture }} architecture..." - chmod +x operator/src/scripts/test-scripts/test-python-pymongo.sh - ./operator/src/scripts/test-scripts/test-python-pymongo.sh \ - --architecture "${{ matrix.architecture }}" \ - --namespace "${{ env.DB_NS }}" \ - --cluster-name "${{ env.DB_NAME }}" \ - --pod-name "${{ env.DB_NAME }}-1" \ - --port "${{ env.DB_PORT }}" \ - --username "${{ env.DB_USERNAME }}" \ - --password "${{ env.DB_PASSWORD }}" - - - name: Collect logs on failure - if: failure() - uses: ./.github/actions/collect-logs - with: - architecture: ${{ matrix.architecture }} - operator-namespace: ${{ env.OPERATOR_NS }} - db-namespace: ${{ env.DB_NS }} - db-cluster-name: ${{ env.DB_NAME }} - cert-manager-namespace: ${{ env.CERT_MANAGER_NS }} diff --git a/.github/workflows/test-upgrade-and-rollback.yml b/.github/workflows/test-upgrade-and-rollback.yml deleted file mode 100644 index d1e9c05f..00000000 --- a/.github/workflows/test-upgrade-and-rollback.yml +++ /dev/null @@ -1,1337 +0,0 @@ -name: TEST - Upgrade and Rollback - -on: - push: - branches: [ main, develop ] - pull_request: - branches: [ main, develop ] - schedule: - - cron: '0 2 * * *' - workflow_dispatch: - inputs: - image_tag: - description: 'Optional: Use existing image tag instead of building locally' - required: false - type: string - released_chart_version: - description: 'Released chart version to upgrade from (default: latest)' - required: false - type: string - default: 'latest' - workflow_call: - inputs: - image_tag: - description: 'Optional: Use existing image tag instead of building locally' - required: false - type: string - released_chart_version: - description: 'Released chart version to upgrade from (default: latest)' - required: false - type: string - default: 'latest' - -permissions: - contents: read - actions: read - packages: read - -env: - CERT_MANAGER_NS: cert-manager - OPERATOR_NS: documentdb-operator - DB_NS: documentdb-upgrade-test - DB_NAME: documentdb-upgrade - DB_USERNAME: k8s_secret_user - DB_PASSWORD: K8sSecret100 - DB_PORT: 10260 - RELEASED_DATABASE_VERSION: 0.109.0 - # Always resolve released baseline images from the canonical org, not the fork owner. - RELEASED_DATABASE_OWNER: documentdb - -jobs: - build: - name: Build Images and Charts - if: ${{ (inputs.image_tag == '' || inputs.image_tag == null) || github.event_name == 'pull_request' }} - uses: ./.github/workflows/test-build-and-package.yml - with: - version: '0.2.0' - secrets: inherit - - upgrade-and-rollback-test: - name: Upgrade & Rollback (${{ matrix.architecture }}) - runs-on: ${{ matrix.runner }} - timeout-minutes: 60 - needs: build - if: always() && (needs.build.result == 'success' || needs.build.result == 'skipped') - - strategy: - matrix: - include: - - architecture: amd64 - runner: ubuntu-22.04 - test_scenario_name: "single-node" - node_count: 1 - instances_per_node: 1 - - architecture: arm64 - runner: ubuntu-22.04-arm - test_scenario_name: "single-node" - node_count: 1 - instances_per_node: 1 - - env: - IMAGE_TAG: ${{ (github.event_name == 'pull_request' || inputs.image_tag == '' || inputs.image_tag == null) && needs.build.outputs.image_tag || inputs.image_tag }} - EXT_IMAGE_TAG: ${{ needs.build.outputs.ext_image_tag || '' }} - CHART_VERSION: ${{ needs.build.outputs.chart_version || '0.1.0' }} - DOCUMENTDB_COMBINED_IMAGE: ghcr.io/microsoft/documentdb/documentdb-local:16 - RELEASED_CHART_VERSION: ${{ inputs.released_chart_version || 'latest' }} - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Download artifacts - if: ${{ (inputs.image_tag == '' || inputs.image_tag == null) || github.event_name == 'pull_request' }} - uses: actions/download-artifact@v4 - with: - pattern: 'build-*' - path: ./artifacts - - - name: Determine new DocumentDB and Gateway image references - run: | - # Old images should always come from the latest released database baseline for this repository owner. - OLD_DOCDB="ghcr.io/${{ env.RELEASED_DATABASE_OWNER }}/documentdb-kubernetes-operator/documentdb:${{ env.RELEASED_DATABASE_VERSION }}" - OLD_GW="ghcr.io/${{ env.RELEASED_DATABASE_OWNER }}/documentdb-kubernetes-operator/gateway:${{ env.RELEASED_DATABASE_VERSION }}" - - # New images come from this workflow run (self-built) or from the provided candidate tag. - # Database images use a separate version track from operator images. - # CI-built images are now tagged with the extension version (e.g., 0.109.0-test-RUNID-arch) - # by test-build-and-package.yml, so no re-tagging is needed. - if [[ "${{ github.event_name }}" == "pull_request" || -z "${{ inputs.image_tag }}" ]]; then - NEW_DOCDB="ghcr.io/${{ github.repository_owner }}/documentdb-kubernetes-operator/documentdb:${{ env.EXT_IMAGE_TAG }}-${{ matrix.architecture }}" - NEW_GW="ghcr.io/${{ github.repository_owner }}/documentdb-kubernetes-operator/gateway:${{ env.EXT_IMAGE_TAG }}-${{ matrix.architecture }}" - else - # External images: read documentDbVersion from values.yaml (separate version track from operator). - DOCDB_TAG=$(grep 'documentDbVersion:' operator/documentdb-helm-chart/values.yaml | sed 's/.*"\(.*\)".*/\1/') - echo "Using documentDbVersion from values.yaml: $DOCDB_TAG" - NEW_DOCDB="ghcr.io/${{ github.repository_owner }}/documentdb-kubernetes-operator/documentdb:${DOCDB_TAG}" - NEW_GW="ghcr.io/${{ github.repository_owner }}/documentdb-kubernetes-operator/gateway:${DOCDB_TAG}" - fi - echo "DOCUMENTDB_IMAGE=$NEW_DOCDB" >> $GITHUB_ENV - echo "GATEWAY_IMAGE=$NEW_GW" >> $GITHUB_ENV - echo "DOCUMENTDB_OLD_IMAGE=$OLD_DOCDB" >> $GITHUB_ENV - echo "GATEWAY_OLD_IMAGE=$OLD_GW" >> $GITHUB_ENV - echo "Old DocumentDB image: $OLD_DOCDB" - echo "Old Gateway image: $OLD_GW" - echo "New DocumentDB image: $NEW_DOCDB" - echo "New Gateway image: $NEW_GW" - - - name: Log test configuration - run: | - echo "## Upgrade & Rollback Test Configuration" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - if [[ -n "${{ inputs.image_tag }}" ]]; then - echo "- **Mode**: Using provided image tag" >> $GITHUB_STEP_SUMMARY - echo "- **Image Tag**: \`${{ inputs.image_tag }}\`" >> $GITHUB_STEP_SUMMARY - else - echo "- **Mode**: Using locally built images" >> $GITHUB_STEP_SUMMARY - echo "- **Image Tag**: \`${{ env.IMAGE_TAG }}\`" >> $GITHUB_STEP_SUMMARY - fi - echo "- **Architecture**: \`${{ matrix.architecture }}\`" >> $GITHUB_STEP_SUMMARY - echo "- **Released Database Baseline**: \`${{ env.RELEASED_DATABASE_VERSION }}\`" >> $GITHUB_STEP_SUMMARY - echo "- **Old Extension Image**: \`${{ env.DOCUMENTDB_OLD_IMAGE }}\`" >> $GITHUB_STEP_SUMMARY - echo "- **New Extension Image**: \`${{ env.DOCUMENTDB_IMAGE }}\`" >> $GITHUB_STEP_SUMMARY - echo "- **Combined Image**: \`${{ env.DOCUMENTDB_COMBINED_IMAGE }}\`" >> $GITHUB_STEP_SUMMARY - echo "- **Old Gateway Image**: \`${{ env.GATEWAY_OLD_IMAGE }}\`" >> $GITHUB_STEP_SUMMARY - echo "- **New Gateway Image**: \`${{ env.GATEWAY_IMAGE }}\`" >> $GITHUB_STEP_SUMMARY - - # TODO: Remove this step once release versions > 0.1.3 - - name: Determine initial DocumentDB image - run: | - echo "=== Determining DocumentDB image for initial deployment ===" - - # Add the public DocumentDB Helm repository - helm repo add documentdb https://documentdb.github.io/documentdb-kubernetes-operator 2>/dev/null || true - helm repo update - - # Resolve the released chart version - CHART_VERSION="${{ env.RELEASED_CHART_VERSION }}" - if [[ "$CHART_VERSION" == "latest" ]]; then - RESOLVED_VERSION=$(helm search repo documentdb/documentdb-operator -o json | jq -r '.[0].version' 2>/dev/null || echo "") - if [[ -z "$RESOLVED_VERSION" || "$RESOLVED_VERSION" == "null" ]]; then - echo "⚠️ Failed to resolve chart version from Helm repo, defaulting to threshold" - RESOLVED_VERSION="0.1.3" - fi - else - RESOLVED_VERSION="$CHART_VERSION" - fi - echo "Resolved released chart version: $RESOLVED_VERSION" - - # Determine image mode based on release version - # Versions <= 0.1.3 use combined image (no ImageVolume support) - # Versions > 0.1.3 use extension image (ImageVolume mode) - THRESHOLD="0.1.3" - # Strip any pre-release suffix (e.g., 0.1.3-rc1 → 0.1.3) for clean semver comparison - CLEAN_VERSION=$(echo "$RESOLVED_VERSION" | sed 's/-.*//') - if [[ "$(printf '%s\n' "$THRESHOLD" "$CLEAN_VERSION" | sort -V | head -n1)" == "$CLEAN_VERSION" ]]; then - echo "Released version $RESOLVED_VERSION <= $THRESHOLD → combined image required" - USE_COMBINED=true - else - echo "Released version $RESOLVED_VERSION > $THRESHOLD → extension image supported" - USE_COMBINED=false - fi - - # Persist USE_COMBINED for later steps - # TODO: Remove once we deprecate combined mode - echo "USE_COMBINED=$USE_COMBINED" >> $GITHUB_ENV - - # Set the initial image based on determination - COMBINED_IMAGE="${{ env.DOCUMENTDB_COMBINED_IMAGE }}" - EXTENSION_IMAGE="${{ env.DOCUMENTDB_OLD_IMAGE }}" - if [[ "$USE_COMBINED" == "true" ]]; then - echo "DOCUMENTDB_INITIAL_IMAGE=$COMBINED_IMAGE" >> $GITHUB_ENV - # In combined mode, the gateway is part of the combined image - echo "GATEWAY_OLD_IMAGE=$COMBINED_IMAGE" >> $GITHUB_ENV - echo "✓ Using combined image for initial deployment: $COMBINED_IMAGE" - else - echo "DOCUMENTDB_INITIAL_IMAGE=$EXTENSION_IMAGE" >> $GITHUB_ENV - echo "✓ Using extension image for initial deployment: $EXTENSION_IMAGE" - fi - - - name: Setup test environment - uses: ./.github/actions/setup-test-environment - with: - test-type: 'e2e' - architecture: ${{ matrix.architecture }} - runner: ${{ matrix.runner }} - test-scenario-name: ${{ matrix.test_scenario_name }} - node-count: '${{ matrix.node_count }}' - instances-per-node: '${{ matrix.instances_per_node }}' - cert-manager-namespace: ${{ env.CERT_MANAGER_NS }} - operator-namespace: ${{ env.OPERATOR_NS }} - db-namespace: ${{ env.DB_NS }} - db-cluster-name: ${{ env.DB_NAME }} - db-username: ${{ env.DB_USERNAME }} - db-password: ${{ env.DB_PASSWORD }} - db-port: ${{ env.DB_PORT }} - image-tag: ${{ env.IMAGE_TAG }} - documentdb-image-tag: ${{ env.EXT_IMAGE_TAG }} - chart-version: ${{ env.CHART_VERSION }} - documentdb-image: ${{ env.DOCUMENTDB_INITIAL_IMAGE }} - gateway-image: ${{ env.GATEWAY_OLD_IMAGE }} - use-external-images: ${{ github.event_name != 'pull_request' && inputs.image_tag != '' && inputs.image_tag != null }} - released-chart-version: ${{ env.RELEASED_CHART_VERSION }} - github-token: ${{ secrets.GITHUB_TOKEN }} - repository-owner: ${{ github.repository_owner }} - - - name: Setup port forwarding for data seeding - uses: ./.github/actions/setup-port-forwarding - with: - namespace: ${{ env.DB_NS }} - cluster-name: ${{ env.DB_NAME }} - port: ${{ env.DB_PORT }} - architecture: ${{ matrix.architecture }} - test-type: 'comprehensive' - - - name: Seed test data before upgrade - run: | - echo "=== Data Persistence: Writing seed data before upgrade ===" - mongosh 127.0.0.1:$DB_PORT \ - -u $DB_USERNAME \ - -p $DB_PASSWORD \ - --authenticationMechanism SCRAM-SHA-256 \ - --tls \ - --tlsAllowInvalidCertificates \ - --eval ' - db = db.getSiblingDB("upgrade_test_db"); - db.test_collection.insertOne({ _id: "upgrade_marker", step: "pre-upgrade", timestamp: new Date().toISOString() }); - db.test_collection.insertOne({ _id: "persistence_check", data: "this_must_survive_rollback", count: 42 }); - var count = db.test_collection.countDocuments(); - print("✓ Seed data written: " + count + " documents"); - assert(count === 2, "Expected 2 documents but found " + count); - ' - echo "✓ Seed data written successfully on old version" - - - name: Cleanup port forwarding after data seeding - if: always() - run: | - if [ -f /tmp/pf_pid ]; then - PF_PID=$(cat /tmp/pf_pid) - kill $PF_PID 2>/dev/null || true - rm -f /tmp/pf_pid - fi - rm -f /tmp/pf_output.log - - - name: "Step 1: Operator Control Plane Upgrade (released → built)" - run: | - echo "=== Step 1: Operator Control Plane Upgrade ===" - echo "Upgrading operator from released chart to locally built version on ${{ matrix.architecture }}..." - - ARCH="${{ matrix.architecture }}" - - # --- Baseline from Released Operator --- - echo "" - echo "--- Baseline (Released Operator) ---" - echo "Helm release info:" - helm list -n $OPERATOR_NS - - RELEASED_OPERATOR_IMAGE=$(kubectl get deployment documentdb-operator -n $OPERATOR_NS -o jsonpath='{.spec.template.spec.containers[0].image}') - echo "Released operator image: $RELEASED_OPERATOR_IMAGE" - - # Record DB pod state before operator upgrade - echo "" - echo "DB pods before operator upgrade:" - kubectl get pods -n $DB_NS -l cnpg.io/cluster=$DB_NAME -o wide - PRE_UPGRADE_UIDS=$(kubectl get pods -n $DB_NS -l cnpg.io/cluster=$DB_NAME -o jsonpath='{.items[*].metadata.uid}') - echo "Pod UIDs: $PRE_UPGRADE_UIDS" - - # --- Prepare Built Chart --- - echo "" - echo "--- Preparing Built Chart ---" - CHART_ARTIFACT_DIR="./artifacts/build-helm-chart-${ARCH}" - EXPECTED_CHART_FILE="$CHART_ARTIFACT_DIR/documentdb-chart-${{ env.CHART_VERSION }}-${ARCH}.tgz" - - if [ ! -f "$EXPECTED_CHART_FILE" ]; then - echo "❌ Built Helm chart not found: $EXPECTED_CHART_FILE" - ls -la "$CHART_ARTIFACT_DIR/" || echo "Chart artifact directory not found" - exit 1 - fi - - echo "Extracting built chart: $EXPECTED_CHART_FILE" - rm -rf ./documentdb-chart - tar -xzf "$EXPECTED_CHART_FILE" - - echo "Built chart version:" - cat ./documentdb-chart/Chart.yaml | grep -E "^(version|appVersion):" - - # --- Apply CRDs before Helm Upgrade --- - # Helm does not upgrade CRDs on `helm upgrade` (only on `helm install`). - # We must apply them separately so new fields (e.g. spec.schemaVersion) are - # recognised by the API server and passed to the validating webhook. - echo "" - echo "--- Applying CRDs from new chart ---" - kubectl apply --server-side --force-conflicts -f ./documentdb-chart/crds/ - echo "CRDs applied." - - # --- Perform Helm Upgrade --- - echo "" - echo "--- Performing Helm Upgrade ---" - LOCAL_IMAGE_TAG="${{ env.IMAGE_TAG }}-${ARCH}" - echo "Upgrading with image tag: $LOCAL_IMAGE_TAG" - - helm upgrade documentdb-operator ./documentdb-chart \ - --namespace $OPERATOR_NS \ - --set documentDbVersion="$LOCAL_IMAGE_TAG" \ - --set image.documentdbk8soperator.tag="$LOCAL_IMAGE_TAG" \ - --set image.documentdbk8soperator.pullPolicy=IfNotPresent \ - --set image.sidecarinjector.tag="$LOCAL_IMAGE_TAG" \ - --set image.sidecarinjector.pullPolicy=IfNotPresent \ - --wait --timeout=15m - - echo "Helm upgrade completed. Release info:" - helm list -n $OPERATOR_NS - - # --- Verify Upgraded Operator --- - echo "" - echo "--- Verifying Upgraded Operator ---" - kubectl wait --for=condition=Available deployment/documentdb-operator -n $OPERATOR_NS --timeout=300s - - UPGRADED_OPERATOR_IMAGE=$(kubectl get deployment documentdb-operator -n $OPERATOR_NS -o jsonpath='{.spec.template.spec.containers[0].image}') - echo "Upgraded operator image: $UPGRADED_OPERATOR_IMAGE" - - if [[ "$UPGRADED_OPERATOR_IMAGE" == "$RELEASED_OPERATOR_IMAGE" ]]; then - echo "❌ Operator image did not change after upgrade" - exit 1 - fi - echo "✓ Operator image changed: $RELEASED_OPERATOR_IMAGE → $UPGRADED_OPERATOR_IMAGE" - - # --- Verify DB Pod Stability --- - echo "" - echo "--- Verifying DB Pod Stability ---" - kubectl get pods -n $DB_NS -l cnpg.io/cluster=$DB_NAME -o wide - POST_UPGRADE_UIDS=$(kubectl get pods -n $DB_NS -l cnpg.io/cluster=$DB_NAME -o jsonpath='{.items[*].metadata.uid}') - echo "Pod UIDs after upgrade: $POST_UPGRADE_UIDS" - - if [[ "$PRE_UPGRADE_UIDS" == "$POST_UPGRADE_UIDS" ]]; then - echo "✓ DB pod UIDs unchanged — operator upgrade did not restart DB pods" - else - echo "⚠️ DB pod UIDs changed — pods may have been restarted during operator upgrade" - echo " Before: $PRE_UPGRADE_UIDS" - echo " After: $POST_UPGRADE_UIDS" - fi - - # --- Verify Cluster Health --- - echo "" - echo "--- Verifying Cluster Health ---" - timeout 300 bash -c ' - while true; do - DB_STATUS=$(kubectl get documentdb "$1" -n "$2" -o jsonpath="{.status.status}" 2>/dev/null) - CLUSTER_STATUS=$(kubectl get cluster "$1" -n "$2" -o jsonpath="{.status.phase}" 2>/dev/null) - echo "DocumentDB status: $DB_STATUS, CNPG phase: $CLUSTER_STATUS" - if [[ "$DB_STATUS" == "Cluster in healthy state" && "$CLUSTER_STATUS" == "Cluster in healthy state" ]]; then - echo "✓ Cluster is healthy after operator upgrade" - break - fi - sleep 10 - done - ' -- "$DB_NAME" "$DB_NS" - - echo "" - echo "✅ Step 1 passed: Operator control plane upgraded successfully" - echo " Operator: $RELEASED_OPERATOR_IMAGE → $UPGRADED_OPERATOR_IMAGE" - - - name: Setup port forwarding for operator upgrade verification - uses: ./.github/actions/setup-port-forwarding - with: - namespace: ${{ env.DB_NS }} - cluster-name: ${{ env.DB_NAME }} - port: ${{ env.DB_PORT }} - architecture: ${{ matrix.architecture }} - test-type: 'comprehensive' - - - name: Verify data persistence after operator upgrade - run: | - echo "=== Data Persistence: Verifying after operator upgrade ===" - mongosh 127.0.0.1:$DB_PORT \ - -u $DB_USERNAME \ - -p $DB_PASSWORD \ - --authenticationMechanism SCRAM-SHA-256 \ - --tls \ - --tlsAllowInvalidCertificates \ - --eval ' - db = db.getSiblingDB("upgrade_test_db"); - var count = db.test_collection.countDocuments(); - assert(count === 2, "Expected 2 documents but found " + count + " after operator upgrade"); - print("✓ All " + count + " documents persisted through operator upgrade"); - ' - echo "✓ Data persistence verified after operator upgrade" - - - name: Cleanup port forwarding after operator upgrade verification - if: always() - run: | - if [ -f /tmp/pf_pid ]; then - PF_PID=$(cat /tmp/pf_pid) - kill $PF_PID 2>/dev/null || true - rm -f /tmp/pf_pid - fi - rm -f /tmp/pf_output.log - - # ============================================================ - # TODO: Remove the following 4 steps once released version > 0.1.3 - # When the released operator uses combined mode, the cluster must be - # recreated under the upgraded operator to switch to ImageVolume mode. - # ============================================================ - - - name: "Recreate cluster for ImageVolume mode (combined → extension)" - if: env.USE_COMBINED == 'true' - run: | - echo "=== Recreating cluster: combined mode → ImageVolume mode ===" - echo "The released operator deployed in combined mode. After operator upgrade," - echo "we must recreate the cluster so the new operator deploys it in ImageVolume mode." - - # Delete the combined-mode cluster - echo "" - echo "Deleting combined-mode cluster..." - kubectl delete documentdb $DB_NAME -n $DB_NS --wait=false - - echo "Waiting for DocumentDB to be deleted..." - timeout 300 bash -c ' - while true; do - db_exists=$(kubectl -n "$1" get documentdb "$2" --ignore-not-found -o name) - if [[ -z "$db_exists" ]]; then - echo "✓ DocumentDB deleted successfully." - break - fi - echo "DocumentDB still exists. Waiting..." - sleep 10 - done - ' -- "$DB_NS" "$DB_NAME" - - echo "Waiting for cluster pods to be cleaned up..." - timeout 120 bash -c ' - while true; do - pod_count=$(kubectl get pods -n "$1" -l cnpg.io/cluster="$2" --no-headers 2>/dev/null | wc -l) - if [[ "$pod_count" -eq 0 ]]; then - echo "✓ All cluster pods cleaned up." - break - fi - echo "Still $pod_count pods remaining. Waiting..." - sleep 5 - done - ' -- "$DB_NS" "$DB_NAME" - - echo "Cleaning up old PVCs..." - kubectl delete pvc -n $DB_NS -l cnpg.io/cluster=$DB_NAME --wait=true --timeout=60s || true - - # Create a fresh cluster with extension image under the upgraded operator - OLD_EXTENSION="${{ env.DOCUMENTDB_OLD_IMAGE }}" - OLD_GATEWAY="${{ env.GATEWAY_OLD_IMAGE }}" - echo "" - echo "Creating new cluster with ImageVolume mode..." - echo " Extension image: $OLD_EXTENSION" - echo " Gateway image: $OLD_GATEWAY" - cat </dev/null) - CLUSTER_STATUS=$(kubectl get cluster "$1" -n "$2" -o jsonpath="{.status.phase}" 2>/dev/null) - echo "DocumentDB status: $DB_STATUS, CNPG phase: $CLUSTER_STATUS" - if [[ "$DB_STATUS" == "Cluster in healthy state" && "$CLUSTER_STATUS" == "Cluster in healthy state" ]]; then - echo "✓ Recreated cluster is healthy" - break - fi - sleep 10 - done - ' -- "$DB_NAME" "$DB_NS" - - # Update DOCUMENTDB_INITIAL_IMAGE so Step 2 baseline check uses the correct image - echo "DOCUMENTDB_INITIAL_IMAGE=$OLD_EXTENSION" >> $GITHUB_ENV - echo "" - echo "✅ Cluster recreated in ImageVolume mode" - echo " DOCUMENTDB_INITIAL_IMAGE updated to: $OLD_EXTENSION" - - - name: Setup port forwarding for re-seeding after recreation - if: env.USE_COMBINED == 'true' - uses: ./.github/actions/setup-port-forwarding - with: - namespace: ${{ env.DB_NS }} - cluster-name: ${{ env.DB_NAME }} - port: ${{ env.DB_PORT }} - architecture: ${{ matrix.architecture }} - test-type: 'comprehensive' - - - name: Re-seed test data after cluster recreation - if: env.USE_COMBINED == 'true' - run: | - echo "=== Re-seeding test data after cluster recreation ===" - mongosh 127.0.0.1:$DB_PORT \ - -u $DB_USERNAME \ - -p $DB_PASSWORD \ - --authenticationMechanism SCRAM-SHA-256 \ - --tls \ - --tlsAllowInvalidCertificates \ - --eval ' - db = db.getSiblingDB("upgrade_test_db"); - db.test_collection.insertOne({ _id: "upgrade_marker", step: "pre-upgrade", timestamp: new Date().toISOString() }); - db.test_collection.insertOne({ _id: "persistence_check", data: "this_must_survive_rollback", count: 42 }); - var count = db.test_collection.countDocuments(); - print("✓ Seed data written: " + count + " documents"); - assert(count === 2, "Expected 2 documents but found " + count); - ' - echo "✓ Seed data re-written after cluster recreation" - - - name: Cleanup port forwarding after re-seeding - if: always() && env.USE_COMBINED == 'true' - run: | - if [ -f /tmp/pf_pid ]; then - PF_PID=$(cat /tmp/pf_pid) - kill $PF_PID 2>/dev/null || true - rm -f /tmp/pf_pid - fi - rm -f /tmp/pf_output.log - - # ============================================================ - # END TODO: Remove the above 4 steps once released version > 0.1.3 - # ============================================================ - - - name: "Step 2: Upgrade Both Extension and Gateway Images" - run: | - echo "=== Step 2: Upgrade Both Extension and Gateway Images ===" - echo "Testing simultaneous extension + gateway upgrade on ${{ matrix.architecture }}..." - - OLD_EXTENSION="${{ env.DOCUMENTDB_OLD_IMAGE }}" - NEW_EXTENSION="${{ env.DOCUMENTDB_IMAGE }}" - OLD_GATEWAY="${{ env.GATEWAY_OLD_IMAGE }}" - NEW_GATEWAY="${{ env.GATEWAY_IMAGE }}" - - # Verify baseline: cluster deployed with old images - CURRENT_EXTENSION=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.spec.documentDBImage}') - echo "Current extension image: $CURRENT_EXTENSION" - if [[ "$CURRENT_EXTENSION" != "$OLD_EXTENSION" ]]; then - echo "❌ Expected old extension image $OLD_EXTENSION but found $CURRENT_EXTENSION" - exit 1 - fi - - CURRENT_GATEWAY=$(kubectl get cluster $DB_NAME -n $DB_NS -o jsonpath='{.spec.plugins[0].parameters.gatewayImage}') - echo "Current gateway image: $CURRENT_GATEWAY" - if [[ "$CURRENT_GATEWAY" != "$OLD_GATEWAY" ]]; then - echo "❌ Expected old gateway image $OLD_GATEWAY but found $CURRENT_GATEWAY" - exit 1 - fi - echo "✓ Cluster deployed with old images" - - # Record and verify version before upgrade - VERSION_BEFORE=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.schemaVersion}') - echo "DocumentDB schema version before upgrade: $VERSION_BEFORE" - - if [[ -z "$VERSION_BEFORE" ]]; then - echo "❌ status.schemaVersion is empty before upgrade" - exit 1 - fi - echo "✓ DocumentDB schema version is populated before upgrade" - - # Patch both images simultaneously - echo "" - echo "Upgrading both images..." - echo " Extension: $OLD_EXTENSION → $NEW_EXTENSION" - echo " Gateway: $OLD_GATEWAY → $NEW_GATEWAY" - kubectl patch documentdb $DB_NAME -n $DB_NS --type='merge' \ - -p "{\"spec\":{\"documentDBImage\":\"$NEW_EXTENSION\",\"gatewayImage\":\"$NEW_GATEWAY\"}}" - - echo "Waiting for cluster to be healthy with new images..." - timeout 600 bash -c ' - while true; do - DB_STATUS=$(kubectl get documentdb "$1" -n "$2" -o jsonpath="{.status.status}" 2>/dev/null) - CLUSTER_STATUS=$(kubectl get cluster "$1" -n "$2" -o jsonpath="{.status.phase}" 2>/dev/null) - SCHEMA_VERSION=$(kubectl get documentdb "$1" -n "$2" -o jsonpath="{.status.schemaVersion}" 2>/dev/null || echo "N/A") - echo "DocumentDB status: $DB_STATUS, CNPG phase: $CLUSTER_STATUS, schemaVersion: $SCHEMA_VERSION" - if [[ "$DB_STATUS" == "Cluster in healthy state" && "$CLUSTER_STATUS" == "Cluster in healthy state" ]]; then - HEALTHY_PODS=$(kubectl get cluster "$1" -n "$2" -o jsonpath="{.status.instancesStatus.healthy}" 2>/dev/null | jq length 2>/dev/null || echo "0") - if [[ "$HEALTHY_PODS" -ge "1" ]]; then - # Verify pods are actually running the new extension image - # With ImageVolume (K8s >= 1.35), the extension image is mounted as a volume, not an init container - POD_IMAGES=$(kubectl get pods -n "$2" -l cnpg.io/cluster="$1" -o jsonpath="{.items[*].spec.volumes[*].image.reference}" 2>/dev/null) - if echo "$POD_IMAGES" | grep -q "$3"; then - echo "✓ Cluster healthy with $HEALTHY_PODS pods running new images" - break - else - echo "Pods not yet running new extension image, waiting..." - fi - fi - fi - sleep 10 - done - ' -- "$DB_NAME" "$DB_NS" "$NEW_EXTENSION" - - # Verify extension image - FINAL_EXTENSION=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.spec.documentDBImage}') - if [[ "$FINAL_EXTENSION" != "$NEW_EXTENSION" ]]; then - echo "❌ Extension image not applied: expected $NEW_EXTENSION, got $FINAL_EXTENSION" - exit 1 - fi - echo "✓ Extension image upgraded to $NEW_EXTENSION" - - # Verify gateway image in CNPG cluster - FINAL_GATEWAY=$(kubectl get cluster $DB_NAME -n $DB_NS -o jsonpath='{.spec.plugins[0].parameters.gatewayImage}') - if [[ "$FINAL_GATEWAY" != "$NEW_GATEWAY" ]]; then - echo "❌ Gateway image not applied: expected $NEW_GATEWAY, got $FINAL_GATEWAY" - exit 1 - fi - echo "✓ Gateway image upgraded to $NEW_GATEWAY" - - # Verify DocumentDB schema version unchanged (two-phase default: schema stays at old version) - VERSION_AFTER=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.schemaVersion}') - echo "DocumentDB schema version after upgrade: $VERSION_AFTER" - if [[ -z "$VERSION_AFTER" ]]; then - echo "❌ status.schemaVersion is empty after upgrade" - exit 1 - fi - if [[ "$VERSION_AFTER" != "$VERSION_BEFORE" ]]; then - echo "❌ Schema version changed from $VERSION_BEFORE to $VERSION_AFTER — expected unchanged (two-phase default)" - exit 1 - fi - echo "✓ Schema version unchanged after binary upgrade: $VERSION_AFTER (two-phase default validated)" - - # Verify status fields - echo "" - echo "=== Status Field Verification ===" - STATUS_DB_IMAGE=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.documentDBImage}') - echo "status.documentDBImage: $STATUS_DB_IMAGE" - if [[ "$STATUS_DB_IMAGE" == "$NEW_EXTENSION" ]]; then - echo "✓ status.documentDBImage matches new extension image" - else - echo "⚠️ status.documentDBImage ($STATUS_DB_IMAGE) does not match expected ($NEW_EXTENSION)" - fi - - STATUS_GATEWAY_IMAGE=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.gatewayImage}') - echo "status.gatewayImage: $STATUS_GATEWAY_IMAGE" - if [[ "$STATUS_GATEWAY_IMAGE" == "$NEW_GATEWAY" ]]; then - echo "✓ status.gatewayImage matches new gateway image" - else - echo "⚠️ status.gatewayImage ($STATUS_GATEWAY_IMAGE) does not match expected ($NEW_GATEWAY)" - fi - - echo "" - echo "✅ Step 2 passed: Both images upgraded successfully" - echo " Extension: $OLD_EXTENSION → $NEW_EXTENSION" - echo " Gateway: $OLD_GATEWAY → $NEW_GATEWAY" - - - name: Setup port forwarding for upgrade verification - uses: ./.github/actions/setup-port-forwarding - with: - namespace: ${{ env.DB_NS }} - cluster-name: ${{ env.DB_NAME }} - port: ${{ env.DB_PORT }} - architecture: ${{ matrix.architecture }} - test-type: 'comprehensive' - - - name: Verify data persistence after upgrade - run: | - echo "=== Data Persistence: Verifying after upgrade ===" - mongosh 127.0.0.1:$DB_PORT \ - -u $DB_USERNAME \ - -p $DB_PASSWORD \ - --authenticationMechanism SCRAM-SHA-256 \ - --tls \ - --tlsAllowInvalidCertificates \ - --eval ' - db = db.getSiblingDB("upgrade_test_db"); - var count = db.test_collection.countDocuments(); - assert(count === 2, "Expected 2 documents but found " + count + " after upgrade"); - print("✓ All " + count + " documents persisted through upgrade"); - ' - echo "✓ Data persistence verified after upgrade" - - - name: Cleanup port forwarding after upgrade verification - if: always() - run: | - if [ -f /tmp/pf_pid ]; then - PF_PID=$(cat /tmp/pf_pid) - kill $PF_PID 2>/dev/null || true - rm -f /tmp/pf_pid - fi - rm -f /tmp/pf_output.log - - - name: "Step 3: Rollback Extension Image (gateway stays at new version)" - run: | - echo "=== Step 3: Rollback Extension Image ===" - echo "Rolling back extension image while keeping gateway at new version..." - - OLD_EXTENSION="${{ env.DOCUMENTDB_OLD_IMAGE }}" - NEW_EXTENSION="${{ env.DOCUMENTDB_IMAGE }}" - NEW_GATEWAY="${{ env.GATEWAY_IMAGE }}" - - # Record state before rollback - VERSION_BEFORE=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.schemaVersion}') - echo "DocumentDB schema version before extension rollback: $VERSION_BEFORE" - - EVENTS_BEFORE=$(kubectl get events -n $DB_NS --field-selector reason=ExtensionRollback --no-headers 2>/dev/null | wc -l || echo "0") - echo "ExtensionRollback events before: $EVENTS_BEFORE" - - # Rollback only extension image - echo "" - echo "Patching spec.documentDBImage: $NEW_EXTENSION → $OLD_EXTENSION" - kubectl patch documentdb $DB_NAME -n $DB_NS --type='merge' \ - -p "{\"spec\":{\"documentDBImage\":\"$OLD_EXTENSION\"}}" - - echo "Waiting for cluster to stabilize after extension rollback..." - timeout 600 bash -c ' - while true; do - DB_STATUS=$(kubectl get documentdb "$1" -n "$2" -o jsonpath="{.status.status}" 2>/dev/null) - CLUSTER_STATUS=$(kubectl get cluster "$1" -n "$2" -o jsonpath="{.status.phase}" 2>/dev/null) - echo "DocumentDB status: $DB_STATUS, CNPG phase: $CLUSTER_STATUS" - if [[ "$DB_STATUS" == "Cluster in healthy state" && "$CLUSTER_STATUS" == "Cluster in healthy state" ]]; then - HEALTHY_PODS=$(kubectl get cluster "$1" -n "$2" -o jsonpath="{.status.instancesStatus.healthy}" 2>/dev/null | jq length 2>/dev/null || echo "0") - if [[ "$HEALTHY_PODS" -ge "1" ]]; then - # Verify pods are running the rolled-back extension image - # With ImageVolume (K8s >= 1.35), the extension image is mounted as a volume, not an init container - POD_IMAGES=$(kubectl get pods -n "$2" -l cnpg.io/cluster="$1" -o jsonpath="{.items[*].spec.volumes[*].image.reference}" 2>/dev/null) - if echo "$POD_IMAGES" | grep -q "$3"; then - echo "✓ Cluster healthy with $HEALTHY_PODS pods running rolled-back extension image" - break - else - echo "Pods not yet running rolled-back extension image, waiting..." - fi - fi - fi - sleep 10 - done - ' -- "$DB_NAME" "$DB_NS" "$OLD_EXTENSION" - - echo "" - echo "=== Extension Rollback Verification ===" - - # Verify extension image rolled back - CURRENT_EXTENSION=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.spec.documentDBImage}') - if [[ "$CURRENT_EXTENSION" == "$OLD_EXTENSION" ]]; then - echo "✓ spec.documentDBImage correctly rolled back to $OLD_EXTENSION" - else - echo "❌ spec.documentDBImage should be $OLD_EXTENSION but is $CURRENT_EXTENSION" - exit 1 - fi - - # Verify schema version preserved (ALTER EXTENSION UPDATE skipped) - VERSION_AFTER=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.schemaVersion}') - echo "DocumentDB schema version before rollback: $VERSION_BEFORE" - echo "DocumentDB schema version after rollback: $VERSION_AFTER" - if [[ "$VERSION_AFTER" == "$VERSION_BEFORE" ]]; then - echo "✓ Schema version preserved — ALTER EXTENSION UPDATE correctly skipped" - else - echo "⚠️ Schema version changed from $VERSION_BEFORE to $VERSION_AFTER" - fi - - # Verify ExtensionRollback warning event (poll up to 60s instead of hardcoded sleep) - echo "Waiting for ExtensionRollback event..." - EVENTS_AFTER=$EVENTS_BEFORE - for i in $(seq 1 12); do - EVENTS_AFTER=$(kubectl get events -n $DB_NS --field-selector reason=ExtensionRollback --no-headers 2>/dev/null | wc -l || echo "0") - if [[ "$EVENTS_AFTER" -gt "$EVENTS_BEFORE" ]]; then - break - fi - sleep 5 - done - echo "ExtensionRollback events after: $EVENTS_AFTER" - if [[ "$EVENTS_AFTER" -gt "$EVENTS_BEFORE" ]]; then - echo "✓ ExtensionRollback warning event detected" - kubectl get events -n $DB_NS --field-selector reason=ExtensionRollback - else - echo "⚠️ No new ExtensionRollback event detected within 60s" - kubectl get events -n $DB_NS --sort-by='.lastTimestamp' | tail -20 - fi - - # Verify gateway image UNCHANGED at new version - CURRENT_GATEWAY=$(kubectl get cluster $DB_NAME -n $DB_NS -o jsonpath='{.spec.plugins[0].parameters.gatewayImage}') - echo "" - echo "Gateway image after extension rollback: $CURRENT_GATEWAY" - if [[ "$CURRENT_GATEWAY" == "$NEW_GATEWAY" ]]; then - echo "✓ Gateway image unchanged at $NEW_GATEWAY (extension rollback did not affect gateway)" - else - echo "❌ Gateway image changed unexpectedly: expected $NEW_GATEWAY, got $CURRENT_GATEWAY" - exit 1 - fi - - # Verify status fields - STATUS_DB_IMAGE=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.documentDBImage}') - echo "status.documentDBImage: $STATUS_DB_IMAGE" - if [[ "$STATUS_DB_IMAGE" == "$OLD_EXTENSION" ]]; then - echo "✓ status.documentDBImage reflects rolled-back extension" - else - echo "⚠️ status.documentDBImage ($STATUS_DB_IMAGE) does not match $OLD_EXTENSION" - fi - - STATUS_GATEWAY_IMAGE=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.gatewayImage}') - echo "status.gatewayImage: $STATUS_GATEWAY_IMAGE" - if [[ "$STATUS_GATEWAY_IMAGE" == "$NEW_GATEWAY" ]]; then - echo "✓ status.gatewayImage still at new gateway version" - else - echo "⚠️ status.gatewayImage ($STATUS_GATEWAY_IMAGE) does not match $NEW_GATEWAY" - fi - - echo "" - echo "✅ Step 3 passed: Extension rolled back, gateway unchanged" - echo " Extension: $NEW_EXTENSION → $OLD_EXTENSION (rolled back)" - echo " Gateway: $NEW_GATEWAY (unchanged)" - - - name: Setup port forwarding for extension rollback verification - uses: ./.github/actions/setup-port-forwarding - with: - namespace: ${{ env.DB_NS }} - cluster-name: ${{ env.DB_NAME }} - port: ${{ env.DB_PORT }} - architecture: ${{ matrix.architecture }} - test-type: 'comprehensive' - - - name: Verify data persistence after extension rollback - run: | - echo "=== Data Persistence: Verifying after extension rollback ===" - mongosh 127.0.0.1:$DB_PORT \ - -u $DB_USERNAME \ - -p $DB_PASSWORD \ - --authenticationMechanism SCRAM-SHA-256 \ - --tls \ - --tlsAllowInvalidCertificates \ - --eval ' - db = db.getSiblingDB("upgrade_test_db"); - var count = db.test_collection.countDocuments(); - assert(count === 2, "Expected 2 documents but found " + count + " after extension rollback"); - print("✓ All " + count + " documents persisted through extension rollback"); - ' - echo "✓ Data persistence verified after extension rollback" - - - name: Cleanup port forwarding after extension rollback verification - if: always() - run: | - if [ -f /tmp/pf_pid ]; then - PF_PID=$(cat /tmp/pf_pid) - kill $PF_PID 2>/dev/null || true - rm -f /tmp/pf_pid - fi - rm -f /tmp/pf_output.log - - - name: "Step 4: Rollback Gateway Image (extension stays at old version)" - run: | - echo "=== Step 4: Rollback Gateway Image ===" - echo "Rolling back gateway image while keeping extension at old version..." - - OLD_EXTENSION="${{ env.DOCUMENTDB_OLD_IMAGE }}" - OLD_GATEWAY="${{ env.GATEWAY_OLD_IMAGE }}" - NEW_GATEWAY="${{ env.GATEWAY_IMAGE }}" - - # Record state before gateway rollback - VERSION_BEFORE=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.schemaVersion}') - echo "DocumentDB schema version before gateway rollback: $VERSION_BEFORE" - - # Rollback only gateway image - echo "" - echo "Patching spec.gatewayImage: $NEW_GATEWAY → $OLD_GATEWAY" - kubectl patch documentdb $DB_NAME -n $DB_NS --type='merge' \ - -p "{\"spec\":{\"gatewayImage\":\"$OLD_GATEWAY\"}}" - - echo "Waiting for cluster to stabilize after gateway rollback..." - timeout 600 bash -c ' - while true; do - DB_STATUS=$(kubectl get documentdb "$1" -n "$2" -o jsonpath="{.status.status}" 2>/dev/null) - CLUSTER_STATUS=$(kubectl get cluster "$1" -n "$2" -o jsonpath="{.status.phase}" 2>/dev/null) - echo "DocumentDB status: $DB_STATUS, CNPG phase: $CLUSTER_STATUS" - if [[ "$DB_STATUS" == "Cluster in healthy state" && "$CLUSTER_STATUS" == "Cluster in healthy state" ]]; then - HEALTHY_PODS=$(kubectl get cluster "$1" -n "$2" -o jsonpath="{.status.instancesStatus.healthy}" 2>/dev/null | jq length 2>/dev/null || echo "0") - if [[ "$HEALTHY_PODS" -ge "1" ]]; then - # Verify gateway plugin parameter reflects the rolled-back image - CURRENT_GW_PARAM=$(kubectl get cluster "$1" -n "$2" -o jsonpath="{.spec.plugins[0].parameters.gatewayImage}" 2>/dev/null) - if [[ "$CURRENT_GW_PARAM" == "$3" ]]; then - echo "✓ Cluster healthy with $HEALTHY_PODS pods and gateway image rolled back" - break - else - echo "Gateway image not yet rolled back in cluster spec, waiting..." - fi - fi - fi - sleep 10 - done - ' -- "$DB_NAME" "$DB_NS" "$OLD_GATEWAY" - - echo "" - echo "=== Gateway Rollback Verification ===" - - # Verify gateway image rolled back in CNPG cluster - CURRENT_GATEWAY=$(kubectl get cluster $DB_NAME -n $DB_NS -o jsonpath='{.spec.plugins[0].parameters.gatewayImage}') - if [[ "$CURRENT_GATEWAY" == "$OLD_GATEWAY" ]]; then - echo "✓ Gateway image rolled back to $OLD_GATEWAY" - else - echo "❌ Gateway image should be $OLD_GATEWAY but is $CURRENT_GATEWAY" - exit 1 - fi - - # Verify extension image UNCHANGED at old version - CURRENT_EXTENSION=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.spec.documentDBImage}') - echo "Extension image after gateway rollback: $CURRENT_EXTENSION" - if [[ "$CURRENT_EXTENSION" == "$OLD_EXTENSION" ]]; then - echo "✓ Extension image unchanged at $OLD_EXTENSION (gateway rollback did not affect extension)" - else - echo "❌ Extension image changed unexpectedly: expected $OLD_EXTENSION, got $CURRENT_EXTENSION" - exit 1 - fi - - # Verify schema version unchanged (gateway is stateless, no schema impact) - VERSION_AFTER=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.schemaVersion}') - echo "DocumentDB schema version before gateway rollback: $VERSION_BEFORE" - echo "DocumentDB schema version after gateway rollback: $VERSION_AFTER" - if [[ "$VERSION_AFTER" == "$VERSION_BEFORE" ]]; then - echo "✓ Schema version unchanged — gateway rollback has no schema impact" - else - echo "⚠️ Schema version changed unexpectedly from $VERSION_BEFORE to $VERSION_AFTER" - fi - - # Verify status fields - STATUS_DB_IMAGE=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.documentDBImage}') - echo "status.documentDBImage: $STATUS_DB_IMAGE" - if [[ "$STATUS_DB_IMAGE" == "$OLD_EXTENSION" ]]; then - echo "✓ status.documentDBImage still at old extension" - else - echo "⚠️ status.documentDBImage ($STATUS_DB_IMAGE) does not match $OLD_EXTENSION" - fi - - STATUS_GATEWAY_IMAGE=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.gatewayImage}') - echo "status.gatewayImage: $STATUS_GATEWAY_IMAGE" - if [[ "$STATUS_GATEWAY_IMAGE" == "$OLD_GATEWAY" ]]; then - echo "✓ status.gatewayImage reflects rolled-back gateway" - else - echo "⚠️ status.gatewayImage ($STATUS_GATEWAY_IMAGE) does not match $OLD_GATEWAY" - fi - - echo "" - echo "✅ Step 4 passed: Gateway rolled back, extension unchanged" - echo " Extension: $OLD_EXTENSION (unchanged)" - echo " Gateway: $NEW_GATEWAY → $OLD_GATEWAY (rolled back)" - - - name: Setup port forwarding for gateway rollback verification - uses: ./.github/actions/setup-port-forwarding - with: - namespace: ${{ env.DB_NS }} - cluster-name: ${{ env.DB_NAME }} - port: ${{ env.DB_PORT }} - architecture: ${{ matrix.architecture }} - test-type: 'comprehensive' - - - name: Verify data persistence after gateway rollback - run: | - echo "=== Data Persistence: Verifying after gateway rollback ===" - mongosh 127.0.0.1:$DB_PORT \ - -u $DB_USERNAME \ - -p $DB_PASSWORD \ - --authenticationMechanism SCRAM-SHA-256 \ - --tls \ - --tlsAllowInvalidCertificates \ - --eval ' - db = db.getSiblingDB("upgrade_test_db"); - var count = db.test_collection.countDocuments(); - assert(count === 2, "Expected 2 documents but found " + count + " after gateway rollback"); - print("✓ All " + count + " documents persisted through full upgrade/rollback cycle"); - ' - echo "✓ Data persistence verified after gateway rollback" - - - name: Cleanup port forwarding after gateway rollback verification - if: always() - run: | - if [ -f /tmp/pf_pid ]; then - PF_PID=$(cat /tmp/pf_pid) - kill $PF_PID 2>/dev/null || true - rm -f /tmp/pf_pid - fi - rm -f /tmp/pf_output.log - - # ============================================================ - # Steps 5-8: Two-phase schema upgrade and webhook validation - # ============================================================ - - - name: "Step 5: Re-upgrade binary (setup for schema tests)" - run: | - echo "=== Step 5: Re-upgrade Binary ===" - echo "Re-upgrading extension and gateway images to new version for schema tests..." - - NEW_EXTENSION="${{ env.DOCUMENTDB_IMAGE }}" - NEW_GATEWAY="${{ env.GATEWAY_IMAGE }}" - - # Patch both images back to new version - echo "Patching images to new versions..." - echo " Extension: → $NEW_EXTENSION" - echo " Gateway: → $NEW_GATEWAY" - kubectl patch documentdb $DB_NAME -n $DB_NS --type='merge' \ - -p "{\"spec\":{\"documentDBImage\":\"$NEW_EXTENSION\",\"gatewayImage\":\"$NEW_GATEWAY\"}}" - - echo "Waiting for cluster to be healthy with new images..." - timeout 600 bash -c ' - while true; do - DB_STATUS=$(kubectl get documentdb "$1" -n "$2" -o jsonpath="{.status.status}" 2>/dev/null) - CLUSTER_STATUS=$(kubectl get cluster "$1" -n "$2" -o jsonpath="{.status.phase}" 2>/dev/null) - echo "DocumentDB status: $DB_STATUS, CNPG phase: $CLUSTER_STATUS" - if [[ "$DB_STATUS" == "Cluster in healthy state" && "$CLUSTER_STATUS" == "Cluster in healthy state" ]]; then - HEALTHY_PODS=$(kubectl get cluster "$1" -n "$2" -o jsonpath="{.status.instancesStatus.healthy}" 2>/dev/null | jq length 2>/dev/null || echo "0") - if [[ "$HEALTHY_PODS" -ge "1" ]]; then - POD_IMAGES=$(kubectl get pods -n "$2" -l cnpg.io/cluster="$1" -o jsonpath="{.items[*].spec.volumes[*].image.reference}" 2>/dev/null) - if echo "$POD_IMAGES" | grep -q "$3"; then - echo "✓ Cluster healthy with $HEALTHY_PODS pods running new images" - break - else - echo "Pods not yet running new extension image, waiting..." - fi - fi - fi - sleep 10 - done - ' -- "$DB_NAME" "$DB_NS" "$NEW_EXTENSION" - - # Verify schema version is still at baseline - VERSION_CURRENT=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.schemaVersion}') - echo "Schema version after re-upgrade: $VERSION_CURRENT" - echo "SCHEMA_BASELINE=$VERSION_CURRENT" >> $GITHUB_ENV - - echo "" - echo "✅ Step 5 passed: Binary re-upgraded for schema tests" - echo " Extension: $NEW_EXTENSION" - echo " Gateway: $NEW_GATEWAY" - echo " Schema: $VERSION_CURRENT (baseline)" - - - name: Setup port forwarding for re-upgrade verification - uses: ./.github/actions/setup-port-forwarding - with: - namespace: ${{ env.DB_NS }} - cluster-name: ${{ env.DB_NAME }} - port: ${{ env.DB_PORT }} - architecture: ${{ matrix.architecture }} - test-type: 'comprehensive' - - - name: Verify data persistence after re-upgrade - run: | - echo "=== Data Persistence: Verifying after re-upgrade ===" - mongosh 127.0.0.1:$DB_PORT \ - -u $DB_USERNAME \ - -p $DB_PASSWORD \ - --authenticationMechanism SCRAM-SHA-256 \ - --tls \ - --tlsAllowInvalidCertificates \ - --eval ' - db = db.getSiblingDB("upgrade_test_db"); - var count = db.test_collection.countDocuments(); - assert(count === 2, "Expected 2 documents but found " + count + " after re-upgrade"); - print("✓ All " + count + " documents persisted through re-upgrade"); - ' - echo "✓ Data persistence verified after re-upgrade" - - - name: Cleanup port forwarding after re-upgrade verification - if: always() - run: | - if [ -f /tmp/pf_pid ]; then - PF_PID=$(cat /tmp/pf_pid) - kill $PF_PID 2>/dev/null || true - rm -f /tmp/pf_pid - fi - rm -f /tmp/pf_output.log - - - name: "Step 6: Schema Finalization (two-phase commit)" - run: | - echo "=== Step 6: Schema Finalization ===" - echo "Setting spec.schemaVersion to finalize the schema migration..." - - NEW_EXTENSION="${{ env.DOCUMENTDB_IMAGE }}" - SCHEMA_BASELINE="${{ env.SCHEMA_BASELINE }}" - - # Determine the new schema version from the new extension image tag - # Strip any architecture suffix (e.g., "0.112.0-amd64" → "0.112.0") - RAW_TAG=$(echo "$NEW_EXTENSION" | sed 's/.*://') - NEW_SCHEMA_VERSION=$(echo "$RAW_TAG" | grep -oP '^\d+\.\d+\.\d+') - if [[ -z "$NEW_SCHEMA_VERSION" ]]; then - echo "✗ Could not extract semver from image tag: $RAW_TAG" - exit 1 - fi - echo "Baseline schema version: $SCHEMA_BASELINE" - echo "Target schema version: $NEW_SCHEMA_VERSION" - - # Ensure we're actually testing a version upgrade, not a no-op - if [[ "$SCHEMA_BASELINE" == "$NEW_SCHEMA_VERSION" ]]; then - echo "❌ Baseline and target schema versions are identical ($SCHEMA_BASELINE)." - echo " The upgrade test requires different versions to validate the two-phase commit." - exit 1 - fi - - # Set spec.schemaVersion to trigger ALTER EXTENSION UPDATE - kubectl patch documentdb $DB_NAME -n $DB_NS --type='merge' \ - -p "{\"spec\":{\"schemaVersion\":\"$NEW_SCHEMA_VERSION\"}}" - - echo "Waiting for schema version to update..." - timeout 300 bash -c ' - while true; do - STATUS_SCHEMA=$(kubectl get documentdb "$1" -n "$2" -o jsonpath="{.status.schemaVersion}" 2>/dev/null) - DB_STATUS=$(kubectl get documentdb "$1" -n "$2" -o jsonpath="{.status.status}" 2>/dev/null) - echo "status.schemaVersion: $STATUS_SCHEMA, status: $DB_STATUS" - if [[ "$STATUS_SCHEMA" == "$3" && "$DB_STATUS" == "Cluster in healthy state" ]]; then - echo "✓ Schema version updated to $STATUS_SCHEMA" - break - fi - sleep 10 - done - ' -- "$DB_NAME" "$DB_NS" "$NEW_SCHEMA_VERSION" - - # Verify schema version changed - FINAL_SCHEMA=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.schemaVersion}') - if [[ "$FINAL_SCHEMA" == "$NEW_SCHEMA_VERSION" ]]; then - echo "✓ Schema finalized: $SCHEMA_BASELINE → $FINAL_SCHEMA" - else - echo "❌ Schema version should be $NEW_SCHEMA_VERSION but is $FINAL_SCHEMA" - exit 1 - fi - - echo "NEW_SCHEMA_VERSION=$NEW_SCHEMA_VERSION" >> $GITHUB_ENV - echo "" - echo "✅ Step 6 passed: Schema finalized to $NEW_SCHEMA_VERSION" - - - name: Setup port forwarding for schema finalization verification - uses: ./.github/actions/setup-port-forwarding - with: - namespace: ${{ env.DB_NS }} - cluster-name: ${{ env.DB_NAME }} - port: ${{ env.DB_PORT }} - architecture: ${{ matrix.architecture }} - test-type: 'comprehensive' - - - name: Verify data persistence after schema finalization - run: | - echo "=== Data Persistence: Verifying after schema finalization ===" - mongosh 127.0.0.1:$DB_PORT \ - -u $DB_USERNAME \ - -p $DB_PASSWORD \ - --authenticationMechanism SCRAM-SHA-256 \ - --tls \ - --tlsAllowInvalidCertificates \ - --eval ' - db = db.getSiblingDB("upgrade_test_db"); - var count = db.test_collection.countDocuments(); - assert(count === 2, "Expected 2 documents but found " + count + " after schema finalization"); - print("✓ All " + count + " documents persisted through schema finalization"); - ' - echo "✓ Data persistence verified after schema finalization" - - - name: Cleanup port forwarding after schema finalization verification - if: always() - run: | - if [ -f /tmp/pf_pid ]; then - PF_PID=$(cat /tmp/pf_pid) - kill $PF_PID 2>/dev/null || true - rm -f /tmp/pf_pid - fi - rm -f /tmp/pf_output.log - - - name: "Step 7: Webhook — Reject Rollback Below Schema" - run: | - echo "=== Step 7: Webhook — Reject Rollback Below Schema ===" - echo "Attempting to roll back documentDBImage below status.schemaVersion..." - - CURRENT_SCHEMA="${{ env.NEW_SCHEMA_VERSION }}" - # Use a synthetic image reference with a version guaranteed to be below - # the finalized schema. The webhook extracts the semver from the tag and - # rejects the patch before any pod changes, so the image needn't exist. - ROLLBACK_IMAGE="ghcr.io/${{ github.repository_owner }}/documentdb-kubernetes-operator/documentdb:0.1.0-rollback-test" - - echo "Current schema version: $CURRENT_SCHEMA" - echo "Attempting rollback to: $ROLLBACK_IMAGE (synthetic lower version)" - - # This SHOULD fail — the webhook must reject rollback below schema version. - # Patch both documentDBImage and schemaVersion to exercise both validation paths. - PATCH_OUTPUT=$(kubectl patch documentdb $DB_NAME -n $DB_NS --type='merge' \ - -p "{\"spec\":{\"documentDBImage\":\"$ROLLBACK_IMAGE\",\"schemaVersion\":\"$CURRENT_SCHEMA\"}}" 2>&1) && { - echo "❌ Webhook did NOT reject the rollback — patch succeeded unexpectedly" - echo "Output: $PATCH_OUTPUT" - exit 1 - } - - echo "Patch rejected (expected). Output:" - echo "$PATCH_OUTPUT" - - # Verify the error message mentions rollback blocking - if echo "$PATCH_OUTPUT" | grep -qi "rollback blocked\|older than installed schema"; then - echo "✓ Webhook correctly rejected rollback with expected error message" - else - echo "⚠️ Patch was rejected but error message doesn't match expected pattern" - echo " (Still passing — the important thing is the rejection)" - fi - - # Verify cluster state is unchanged - CURRENT_IMAGE=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.spec.documentDBImage}') - NEW_EXTENSION="${{ env.DOCUMENTDB_IMAGE }}" - if [[ "$CURRENT_IMAGE" == "$NEW_EXTENSION" ]]; then - echo "✓ Cluster state unchanged — documentDBImage still at $CURRENT_IMAGE" - else - echo "❌ documentDBImage changed unexpectedly to $CURRENT_IMAGE" - exit 1 - fi - - echo "" - echo "✅ Step 7 passed: Webhook correctly blocked rollback below schema version" - - - name: "Step 8: Webhook — Reject Schema Exceeds Binary" - run: | - echo "=== Step 8: Webhook — Reject Schema Exceeds Binary ===" - echo "Attempting to set schemaVersion higher than binary version..." - - # Use an artificially high version that exceeds any binary - INVALID_SCHEMA="99.999.0" - CURRENT_IMAGE=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.spec.documentDBImage}') - echo "Attempting schemaVersion: $INVALID_SCHEMA" - echo "Current documentDBImage: $CURRENT_IMAGE" - - # This SHOULD fail — the webhook must reject schema > binary. - # Patch both schemaVersion and documentDBImage to exercise both validation paths. - PATCH_OUTPUT=$(kubectl patch documentdb $DB_NAME -n $DB_NS --type='merge' \ - -p "{\"spec\":{\"schemaVersion\":\"$INVALID_SCHEMA\",\"documentDBImage\":\"$CURRENT_IMAGE\"}}" 2>&1) && { - echo "❌ Webhook did NOT reject the invalid schema version — patch succeeded unexpectedly" - echo "Output: $PATCH_OUTPUT" - exit 1 - } - - echo "Patch rejected (expected). Output:" - echo "$PATCH_OUTPUT" - - # Verify the error message mentions schema exceeding binary - if echo "$PATCH_OUTPUT" | grep -qi "exceeds.*binary"; then - echo "✓ Webhook correctly rejected schema version with expected error message" - else - echo "⚠️ Patch was rejected but error message doesn't match expected pattern" - echo " (Still passing — the important thing is the rejection)" - fi - - # Verify schema version is unchanged - CURRENT_SCHEMA=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.schemaVersion}') - EXPECTED_SCHEMA="${{ env.NEW_SCHEMA_VERSION }}" - if [[ "$CURRENT_SCHEMA" == "$EXPECTED_SCHEMA" ]]; then - echo "✓ Schema version unchanged: $CURRENT_SCHEMA" - else - echo "❌ Schema version changed unexpectedly to $CURRENT_SCHEMA" - exit 1 - fi - - echo "" - echo "✅ Step 8 passed: Webhook correctly blocked schema version exceeding binary" - - - name: Collect logs on failure - if: failure() - uses: ./.github/actions/collect-logs - with: - architecture: ${{ matrix.architecture }} - operator-namespace: ${{ env.OPERATOR_NS }} - db-namespace: ${{ env.DB_NS }} - db-name: ${{ env.DB_NAME }} - - - name: Test completion summary - if: always() - run: | - echo "## Upgrade & Rollback Test Summary for ${{ matrix.architecture }}" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "- **Architecture**: ${{ matrix.architecture }}" >> $GITHUB_STEP_SUMMARY - echo "- **Initial Image**: ${{ env.DOCUMENTDB_INITIAL_IMAGE }}" >> $GITHUB_STEP_SUMMARY - echo "- **Old Extension Image**: ${{ env.DOCUMENTDB_OLD_IMAGE }}" >> $GITHUB_STEP_SUMMARY - echo "- **New Extension Image**: ${{ env.DOCUMENTDB_IMAGE }}" >> $GITHUB_STEP_SUMMARY - echo "- **Old Gateway Image**: ${{ env.GATEWAY_OLD_IMAGE }}" >> $GITHUB_STEP_SUMMARY - echo "- **New Gateway Image**: ${{ env.GATEWAY_IMAGE }}" >> $GITHUB_STEP_SUMMARY - echo "- **Image Tag**: ${{ env.IMAGE_TAG }}" >> $GITHUB_STEP_SUMMARY - echo "- **Chart Version**: ${{ env.CHART_VERSION }}" >> $GITHUB_STEP_SUMMARY - echo "- **Released Chart Version**: ${{ env.RELEASED_CHART_VERSION }}" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Test Steps:" >> $GITHUB_STEP_SUMMARY - echo "- Step 1: Operator control plane upgrade (released → built)" >> $GITHUB_STEP_SUMMARY - echo "- Step 2: Upgrade both extension and gateway images" >> $GITHUB_STEP_SUMMARY - echo "- Step 3: Rollback extension image" >> $GITHUB_STEP_SUMMARY - echo "- Step 4: Rollback gateway image" >> $GITHUB_STEP_SUMMARY - echo "- Step 5: Re-upgrade binary (setup for schema tests)" >> $GITHUB_STEP_SUMMARY - echo "- Step 6: Schema finalization (two-phase commit)" >> $GITHUB_STEP_SUMMARY - echo "- Step 7: Webhook — reject rollback below schema" >> $GITHUB_STEP_SUMMARY - echo "- Step 8: Webhook — reject schema exceeds binary" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - if [[ "${{ job.status }}" == "success" ]]; then - echo "- **Status**: ✅ PASSED" >> $GITHUB_STEP_SUMMARY - else - echo "- **Status**: ❌ FAILED" >> $GITHUB_STEP_SUMMARY - fi - - test-summary: - name: Upgrade & Rollback Test Summary - runs-on: ubuntu-latest - if: always() - needs: [build, upgrade-and-rollback-test] - steps: - - name: Generate overall test summary - run: | - echo "## Upgrade & Rollback Test Results Summary" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Test Configuration:" >> $GITHUB_STEP_SUMMARY - echo "- **Build Step**: ${{ inputs.image_tag && 'Skipped (using external images)' || 'Executed' }}" >> $GITHUB_STEP_SUMMARY - echo "- **Image Tag**: ${{ inputs.image_tag || 'Built from source' }}" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Job Results:" >> $GITHUB_STEP_SUMMARY - echo "- **Build**: ${{ needs.build.result }}" >> $GITHUB_STEP_SUMMARY - echo "- **Upgrade & Rollback Tests**: ${{ needs.upgrade-and-rollback-test.result }}" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - if [[ "${{ needs.upgrade-and-rollback-test.result }}" == "success" ]]; then - echo "### Overall Status: ✅ ALL TESTS PASSED" >> $GITHUB_STEP_SUMMARY - else - echo "### Overall Status: ❌ SOME TESTS FAILED" >> $GITHUB_STEP_SUMMARY - echo "Check individual job results above for details." >> $GITHUB_STEP_SUMMARY - fi diff --git a/AGENTS.md b/AGENTS.md index 2f81b08b..1516952c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -371,6 +371,39 @@ Types: - Mock external dependencies appropriately - Ensure tests are idempotent and isolated +### Running E2E tests + +The end-to-end suite lives in [`test/e2e/`](test/e2e/) as a separate Go module +and replaces the legacy `test-integration.yml`, `test-E2E.yml`, +`test-backup-and-restore.yml`, and `test-upgrade-and-rollback.yml` workflows +(and their bash / JavaScript / Python glue). It is a Go / Ginkgo v2 / Gomega +suite that drives the operator end-to-end and speaks the Mongo wire protocol +via `go.mongodb.org/mongo-driver/v2`. + +**Prereqs:** kind + the DocumentDB operator already installed in the target +cluster. In CI, the `.github/actions/setup-test-environment` composite action +handles cluster creation and operator install (via `make deploy`). Locally, +`operator/src/scripts/development/deploy.sh` is the equivalent entry point. + +**Running:** + +```bash +cd test/e2e +ginkgo -r --label-filter=smoke ./tests/... # smoke +ginkgo -r --label-filter=lifecycle ./tests/... # single area +TEST_DEPTH=4 ginkgo -r --procs=4 ./tests/... # full sweep (Lowest depth) +``` + +Labels are defined in `test/e2e/labels.go` (areas: `lifecycle`, `scale`, +`data`, `performance`, `backup`, `recovery`, `tls`, `feature-gates`, +`exposure`, `status`, `upgrade`; plus cross-cutting `smoke`/`basic`/ +`destructive`/`disruptive`/`slow` and capability `needs-*` labels). Depth is +controlled by `TEST_DEPTH` (0=Highest … 4=Lowest, default 2=Medium). + +See [`test/e2e/README.md`](test/e2e/README.md) for the full env-var table +(including `E2E_RUN_ID` and the `E2E_UPGRADE_*` upgrade-suite variables), +helper-package index, troubleshooting, and CNPG dependency policy. + ### Code Review For thorough code reviews, reference the code review agent: diff --git a/CHANGELOG.md b/CHANGELOG.md index b52b8f9f..92a27c8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,18 @@ ### Breaking Changes - **Validating webhook added**: A new `ValidatingWebhookConfiguration` enforces that `spec.schemaVersion` never exceeds the binary version and blocks `spec.documentDBVersion` rollbacks below the committed schema version. This requires [cert-manager](https://cert-manager.io/) to be installed in the cluster (it is already a prerequisite for the sidecar injector). Existing clusters upgrading to this release will have the webhook activated automatically via `helm upgrade`. +### Testing infrastructure +- **Unified E2E test suite ([#346](https://github.com/documentdb/documentdb-kubernetes-operator/pull/346))**: The four legacy end-to-end workflows (`test-integration.yml`, `test-E2E.yml`, `test-backup-and-restore.yml`, `test-upgrade-and-rollback.yml`) and their bash / JavaScript (mongosh) / Python (pymongo) glue have been replaced by a single Go / Ginkgo v2 / Gomega suite under `test/e2e/`. Specs are organised by CRD operation (lifecycle, scale, data, performance, backup, tls, feature gates, exposure, status, upgrade), reuse CloudNative-PG's `tests/utils` packages as a library, and speak the Mongo wire protocol via `go.mongodb.org/mongo-driver/v2`. + +### Breaking changes for contributors +- **Local E2E invocation changed.** Tests are now run via `ginkgo` against an already-provisioned cluster, not via `npm test` / bash scripts. Typical invocation: + ```bash + cd test/e2e + ginkgo -r --label-filter=smoke ./tests/... + ``` + Label selection replaces per-workflow entry points; depth is controlled by `TEST_DEPTH` (0=Highest … 4=Lowest). See [`test/e2e/README.md`](test/e2e/README.md) for prereqs, the full env-var table (including `E2E_RUN_ID` and the `E2E_UPGRADE_*` upgrade-suite variables), and troubleshooting. +- **Design rationale** for the migration — scope, fixture tiers, parallelism model, CNPG reuse strategy — is documented in [`docs/designs/e2e-test-suite.md`](docs/designs/e2e-test-suite.md). + ## [0.2.0] - 2026-03-25 ### Major Features diff --git a/docs/designs/e2e-test-suite.md b/docs/designs/e2e-test-suite.md new file mode 100644 index 00000000..229a9c82 --- /dev/null +++ b/docs/designs/e2e-test-suite.md @@ -0,0 +1,497 @@ +# Plan: Unified Go/Ginkgo E2E Suite for DocumentDB Operator + +## Problem + +Four independent black-box test workflows exercise overlapping parts of the operator, each +with its own bash glue, port-forward logic, and inline mongosh/Python assertions: + +| Workflow | What it covers today | +|---|---| +| `test-integration.yml` | Port-forward + mongosh comprehensive JS + pymongo heredoc | +| `test-E2E.yml` | Port-forward + `comprehensive_mongosh_tests.js` + `performance_test.js` + status/PV/mount checks | +| `test-backup-and-restore.yml` | Seed data → ScheduledBackup → wait → delete data → restore CR → validate | +| `test-upgrade-and-rollback.yml` | Install released operator → seed → Helm upgrade to built → verify → recreate → verify again | + +Pain points: port-forward lifecycle re-implemented everywhere; assertion logic is JS +(`throw new Error`) or homegrown Python — no JUnit; no coverage for update/scale/delete-reclaim/ +TLS modes/ChangeStreams/service exposure/PV recovery; heavy operations (cluster creation +~60–120 s) are repeated per workflow and per test; two toolchains (bash + JS + Python) for +contributors to navigate on top of the Go operator code. + +## Proposed Approach + +Build **one unified Go + Ginkgo v2 + Gomega E2E suite** that drives the operator +end-to-end, reusing CNPG's `tests/utils/` Go packages wherever possible. Tests are grouped +by CRD operation in per-area Go packages; the data plane is validated via +`go.mongodb.org/mongo-driver/v2`. The suite fully replaces the four workflows. + +### Why Go over Python + +Spike result (see `Spike Findings` below): ~20 CNPG util packages are directly reusable +because DocumentDB wraps the same `apiv1.Cluster` / `apiv1.Backup` CRs CNPG defines. +Reusing them deletes a large fraction of the infrastructure we were about to rebuild +(MinIO deploy, namespace management, envsubst, stern log streaming, CNPG Cluster +introspection, backup CR helpers, timeouts map). + +### Design principles + +1. **Amortize heavy lifting.** Cluster creation (~60–120 s per 1-instance cluster) is the + single biggest cost. Classify every spec as *read-only* or *mutating*. Read-only specs + share a session-scoped cluster and isolate via per-spec Mongo database names; only + mutating specs pay for a fresh cluster. +2. **Small, single-purpose tests.** Each `It(...)` asserts one behavior. Porting + `comprehensive_mongosh_tests.js` produces ~10 small specs, not one monolith. +3. **Parallelize safely.** Ginkgo `-p` (process-per-package) + worker-aware namespace + naming. Marker/label-grouped CI jobs add a second parallelism layer. +4. **Structure for growth.** Per-area Go packages + shared `pkg/e2eutils/` + composable + manifest fragments. Adding a new CRD field = one new package, not sprawl. + +### Stack + +- **Ginkgo v2 + Gomega** — BDD runner + matchers. Same framework the operator already + uses for `envtest`, so contributors share patterns and caches. +- **`sigs.k8s.io/controller-runtime/pkg/client`** — typed CR access via our `api/preview` + types (no dynamic client / unstructured dicts). +- **`go.mongodb.org/mongo-driver/v2`** — data-plane assertions. +- **CNPG `tests/utils/`** — imported as a library (Apache-2.0, compatible with our MIT). + Pin version in `go.mod`. +- **`github.com/cloudnative-pg/cloudnative-pg/tests/labels`** + `tests/levels` — import the + depth/label plumbing rather than re-implementing. + +### Layout + +``` +test/e2e/ # new top-level Go test tree +├── go.mod # separate module; pins CNPG utils version +├── README.md # local run instructions +├── suite_test.go # SynchronizedBeforeSuite, global fixtures +├── labels.go # our label taxonomy (wraps CNPG's) +├── levels.go # thin re-export of CNPG's levels +├── pkg/e2eutils/ # our helpers; each file <300 LOC +│ ├── testenv/ +│ │ └── env.go # wraps CNPG's TestingEnvironment with dummy PG values +│ ├── documentdb/ +│ │ └── documentdb.go # CR verbs: Create, PatchSpec, WaitHealthy, Delete +│ ├── mongo/ +│ │ └── client.go # MongoClient builder, Seed, Probe, Count +│ ├── assertions/ +│ │ └── assertions.go # AssertDocumentDBReady, AssertPrimaryUnchanged, AssertWalLevel, … +│ ├── timeouts/ +│ │ └── timeouts.go # DocumentDB-specific overrides atop CNPG's map +│ ├── seed/ +│ │ └── datasets.go # canonical datasets (Small, Medium, Sort, Agg) +│ ├── portforward/ +│ │ └── portforward.go # wraps CNPG's forwardconnection for Mongo port +│ ├── operatorhealth/ +│ │ └── gate.go # adapted from CNPG's operator/ for documentdb-operator ns +│ └── fixtures/ # shared cluster fixtures (session scope) +│ ├── shared_ro.go # 1-instance cluster, per-spec DB names +│ ├── shared_scale.go # 2-instance cluster; tests reset to 2 on teardown +│ └── minio.go # lazy, label-gated (wraps CNPG minio.Deploy) +├── manifests/ # .yaml.template files; CNPG envsubst expands +│ ├── base/ +│ │ └── documentdb.yaml.template +│ ├── mixins/ # concatenated into base; simple sh envsubst pipeline +│ │ ├── tls_disabled.yaml.template +│ │ ├── tls_selfsigned.yaml.template +│ │ ├── tls_certmanager.yaml.template +│ │ ├── tls_provided.yaml.template +│ │ ├── feature_changestreams.yaml.template +│ │ ├── exposure_loadbalancer.yaml.template +│ │ ├── exposure_clusterip.yaml.template +│ │ ├── storage_custom.yaml.template +│ │ └── reclaim_retain.yaml.template +│ └── backup/ +│ ├── backup.yaml.template +│ ├── scheduled_backup.yaml.template +│ ├── recovery_from_backup.yaml.template +│ └── recovery_from_pv.yaml.template +└── tests/ # per-area Go packages; Ginkgo `-p` = 1 proc/pkg + ├── lifecycle/ + │ ├── lifecycle_suite_test.go + │ ├── deploy_test.go + │ ├── update_image_test.go + │ ├── update_loglevel_test.go + │ ├── update_storage_test.go + │ └── delete_reclaim_test.go + ├── scale/ + │ ├── scale_suite_test.go # spins up shared_scale_cluster + │ ├── scale_up_test.go # 1→2, 2→3 + │ └── scale_down_test.go # 3→2, 2→1; primary re-election + ├── data/ # all read-only; shares ro cluster + │ ├── data_suite_test.go # spins up shared_ro_cluster + │ ├── crud_test.go + │ ├── query_test.go + │ ├── aggregation_test.go + │ ├── sort_limit_skip_test.go + │ ├── update_ops_test.go + │ ├── delete_ops_test.go + │ └── pipeline_test.go + ├── performance/ # read-only; shares ro cluster; serial (-procs=1) + │ ├── performance_suite_test.go + │ ├── perf_insert_test.go + │ ├── perf_count_range_test.go + │ ├── perf_aggregation_test.go + │ ├── perf_sort_test.go + │ ├── perf_update_test.go + │ └── perf_delete_drop_test.go + ├── backup/ + │ ├── backup_suite_test.go # spins up minio + │ ├── backup_ondemand_test.go + │ ├── backup_scheduled_test.go + │ ├── restore_from_backup_test.go + │ └── restore_from_pv_test.go + ├── tls/ + │ ├── tls_suite_test.go + │ ├── tls_disabled_test.go + │ ├── tls_selfsigned_test.go + │ ├── tls_certmanager_test.go # skipped via Label("needs-certmanager") + │ └── tls_provided_test.go + ├── feature_gates/ + │ ├── feature_gates_suite_test.go + │ └── changestreams_test.go # table-driven over (enabled/disabled) + ├── exposure/ + │ ├── exposure_suite_test.go + │ ├── clusterip_test.go + │ └── loadbalancer_test.go # Label("needs-metallb") + ├── status/ + │ ├── status_suite_test.go # shared_ro_cluster + │ ├── connection_string_test.go + │ ├── pv_name_test.go + │ └── mount_options_test.go + └── upgrade/ + ├── upgrade_suite_test.go # owns its own operator install; Label("disruptive") + ├── upgrade_control_plane_test.go # released chart → built chart, verify data + ├── upgrade_images_test.go # extension + gateway image bump + └── rollback_test.go # optional — if rollback is supported +``` + +### Fixture tiers (dedup heavy lifting) + +All fixtures read config from env + CLI options (`--kube-context`, `--operator-ns`, +`--image-tag`, `--chart-version`, `--test-depth`, `--keep-clusters`). Ginkgo uses `flag` +registration; env vars mirror flags. + +**Session-scoped (most expensive, created once per `go test` invocation of a package):** + +- `Env` — wraps `environment.TestingEnvironment` from CNPG, constructed in + `SynchronizedBeforeSuite`. `POSTGRES_IMG` set to a dummy value because we don't use + the `postgres/` helpers that read it. +- `OperatorReady` — one-time check the documentdb-operator Deployment is Available + + CRDs installed. +- `SharedROCluster` — 1-instance DocumentDB, created once per package that imports it. + Consumed by `data/`, `performance/`, `status/`. **Read-only-by-convention**: each spec + uses its own Mongo database `db_`. The fixture wraps + the CR handle in a read-only proxy that panics on `PatchSpec`/`Delete`. +- `SharedScaleCluster` — 2-instance cluster used as starting state for `scale/`. Tests + reset instance count to 2 in `AfterEach` so the cluster is reusable. +- `Minio` — lazy session fixture in `backup/backup_suite_test.go`; calls + `cnpgminio.Deploy` only if the package is selected. + +**Per-spec (cheap or mutating), constructed in `BeforeEach`:** + +- `FreshDocumentDB(spec *apiv1preview.DocumentDB)` — factory used by lifecycle/tls/ + feature/exposure/backup/upgrade. Unique namespace, wait healthy, register cleanup via + `DeferCleanup`. +- `MongoClient(documentdb)` — mongo-driver client bound to the CR's service via a + Ginkgo-owned port-forward. +- `TmpNamespace()` — `e2e--`, auto-deleted. + +**Auto-applied:** + +- `operatorhealth.Gate` — invoked from `BeforeEach` and `AfterEach` of a top-level + `Describe` in `suite_test.go`. Snapshots operator pod UID + restart count; if it + churned, all subsequent non-`disruptive`/`upgrade` specs are **skipped** via a + package-global sentinel. Adapted from CNPG's `operator/` package, retargeted to our + `documentdb-operator` namespace and image. + +**Dedup summary:** + +| Test area | Cluster source | Wall-time saving vs all-fresh | +|---|---|---| +| `data/` (7 specs) | `SharedROCluster` | ~10 min | +| `performance/` (6 specs) | `SharedROCluster` | ~9 min | +| `status/` (3 specs) | `SharedROCluster` | ~5 min | +| `scale/` (4 specs) | `SharedScaleCluster` | ~5 min | +| `lifecycle/`, `tls/`, `feature_gates/`, `exposure/`, `backup/`, `upgrade/` | `FreshDocumentDB` | N/A (need isolation) | + +### Parallelism + +- `ginkgo -p ./tests/...` — one process per package. `SharedROCluster` is created once + per `data/` / `performance/` / `status/` process (acceptable, Ginkgo cannot share + across processes without external coordination). +- Within a package: Ginkgo defaults to serial within a process. For `data/` we enable + `--procs=N` and use `BeforeAll` (ordered container) so the cluster is created once per + process while specs run in parallel against their own DBs. +- Per-process naming: namespaces `e2e--`, DBs + `db__`, cluster names `ro-`. +- CI: marker-grouped GitHub Actions jobs run in parallel; within each job, Ginkgo + parallelizes at the process level. +- Performance job forces `--procs=1` so timing thresholds aren't noisy. +- Upgrade job forces `--procs=1` (disruptive; owns its own operator install). + +### Level/depth control + +- Import CNPG's `tests/levels` package. Every top-level `Describe`/`Context` adds a + level tag via `Label(levels.Medium.String())` (or Highest/High/Low/Lowest). +- `TEST_DEPTH=N` env var — reused as-is from CNPG's plumbing. +- Default depth = Medium. Smoke CI job uses Highest; nightly uses Lowest. + +### Labels (replaces "markers") + +Ginkgo labels, applied via `Label("…")` on `Describe`/`Context`/`It` and filtered via +`--label-filter`. We wrap CNPG's `tests/labels.go` and add DocumentDB-specific ones: + +```go +// labels.go +const ( + // Functional area (one per package via suite_test Describe label) + LifecycleLabel = "lifecycle" + ScaleLabel = "scale" + DataLabel = "data" + PerformanceLabel = "performance" + BackupLabel = "backup" + RecoveryLabel = "recovery" + TLSLabel = "tls" + FeatureLabel = "feature" + ExposureLabel = "exposure" + StatusLabel = "status" + UpgradeLabel = "upgrade" + + // Cross-cutting + SmokeLabel = "smoke" + BasicLabel = "basic" + DestructiveLabel = "destructive" // mutates cluster data + DisruptiveLabel = "disruptive" // may break operator; exempt from health gate + SlowLabel = "slow" // >5 min + + // Prereqs — tests with these labels Skip() if env missing + NeedsMinioLabel = "needs-minio" + NeedsCertManagerLabel = "needs-certmanager" + NeedsMetalLBLabel = "needs-metallb" +) +``` + +### Manifests — base + mixin templates + +Plain text files expanded by `cnpgenvsubst.Envsubst` (from `tests/utils/envsubst`). +Composition is done in Go: + +```go +// pkg/e2eutils/documentdb/documentdb.go +func RenderCR(name, ns string, mixins []string, vars map[string]string) ([]byte, error) { + parts := []string{"manifests/base/documentdb.yaml.template"} + for _, m := range mixins { + parts = append(parts, "manifests/mixins/"+m+".yaml.template") + } + return envsubst.Expand(concatFiles(parts), vars) +} +``` + +No Jinja2; `envsubst` is enough for our CRs, and it matches what CNPG uses so mental +model is shared. + +### Assertions & timeouts + +- `pkg/e2eutils/assertions/assertions.go` — Gomega-wrapped verbs: + `AssertDocumentDBReady`, `AssertInstanceCount`, `AssertPrimaryUnchanged`, + `AssertPVCCount`, `AssertTLSSecretReady`, `AssertWalLevel`, `AssertServiceType`, + `AssertConnectionStringMatches`. Each returns `func()` suitable for + `Eventually(...).Should(Succeed())`. +- `pkg/e2eutils/timeouts/timeouts.go` — starts from + `cnpgtimeouts.Timeouts()`, overrides/adds DocumentDB-specific ops: + ```go + type Op string + const ( + DocumentDBReady Op = "documentdb-ready" + DocumentDBUpgrade Op = "documentdb-upgrade" + InstanceScale Op = "instance-scale" + PVCResize Op = "pvc-resize" + ) + func For(op Op) time.Duration { … } + ``` + +### CI Workflow + +One workflow `test-e2e.yml` with amd64+arm64 matrix. Within each matrix row, marker-grouped +jobs in parallel: + +| CI job | `--label-filter` | `ginkgo --procs` | Runner | +|---|---|---|---| +| `smoke` | `smoke` | auto | ubuntu-latest | +| `lifecycle` | `lifecycle` | auto | ubuntu-latest | +| `scale` | `scale` | 2 | ubuntu-latest | +| `data` | `data` | auto | ubuntu-latest | +| `performance` | `performance` | 1 | ubuntu-latest (dedicated) | +| `backup` | `backup` | 2 | ubuntu-latest | +| `tls` | `tls` | auto | ubuntu-latest | +| `feature` | `feature \|\| exposure \|\| status` | auto | ubuntu-latest | +| `upgrade` | `upgrade` | 1 | ubuntu-latest | + +Each job: setup kind → install operator (existing `setup-test-environment` action) → +`ginkgo -r --label-filter="…" --procs=N --junit-report=junit.xml ./tests/...` → upload +JUnit + logs. `workflow_dispatch` inputs: `label`, `depth`, `keep_clusters`. + +### Fate of Existing Artifacts + +**Delete** after the new suite is green in CI for one full run: +- `.github/workflows/{test-integration,test-E2E,test-backup-and-restore,test-upgrade-and-rollback}.yml` +- `.github/actions/setup-port-forwarding/` +- `operator/src/scripts/test-scripts/{test-mongodb-connection.sh,test-python-pymongo.sh,mongo-python-data-pusher.py,comprehensive_mongosh_tests.js,performance_test.js}` + +**Keep:** +- `.github/actions/setup-test-environment/`, `.github/actions/collect-logs/` +- `operator/src/scripts/test-scripts/deploy-csi-driver.sh` (infra prep) +- Go unit/envtest suite — out of scope. + +### Scope Boundaries + +- In scope: single-cluster operations on kind; all CRD spec fields + CRs. +- Out of scope: cross-cluster replication, multi-cloud, AKS/EKS-specific LB annotations, + Azure Fleet — stays in `documentdb-playground/`. +- Operator install/uninstall is in `setup-test-environment`; the suite assumes a running + operator. `tests/upgrade/` owns its two-phase install. + +### Module layout (go.mod placement) + +`test/e2e/` is a **separate Go module** (own `go.mod`). Reasons: +- Pulls in CNPG test utils + Ginkgo + mongo-driver without polluting the operator's + runtime dependencies. +- Lets us iterate on test deps without triggering operator builds. +- Matches how CNPG itself is organized (`tests/e2e/`). + +## Spike findings + +**Repo investigated:** `github.com/cloudnative-pg/cloudnative-pg` @ main, `tests/utils/`. +**License:** Apache-2.0 (compatible with our MIT; no NOTICE file). +**API stability:** `tests/utils/*` is public (not `internal/`) but has no stability +contract — expect occasional churn at CNPG version bumps; pin version in `go.mod`. + +Reusability tally of the 29 `tests/utils/*` packages: + +| Status | Packages | Count | +|---|---|---| +| ✅ Direct reuse | `clusterutils`, `minio`, `backups`, `timeouts`, `namespaces`, `pods`, `services`, `storage`, `secrets`, `yaml`, `envsubst`, `exec`, `run`, `logs`, `objects`, `sternmultitailer`, `forwardconnection`, `nodes`, `endpoints`, `deployments` | ~20 | +| ⚠️ Adapt | `environment.TestingEnvironment` (PG-coupled; construct with dummy POSTGRES_IMG), `operator` (retarget to `documentdb-operator` namespace) | 2 | +| ❌ Skip | `postgres`, `replicationslot`, `fencing`, `importdb`, `cloudvendors`, `openshift`, `proxy`, `azurite` | ~7 | + +Key enabling fact: DocumentDB's operator **wraps CNPG's `apiv1.Cluster` and +`apiv1.Backup`** — so `clusterutils.GetPrimary`, `clusterutils.GetReplicas`, +`backups.Create`, `backups.AssertBackupConditionInClusterStatus` work on our resources as-is. + +## Todos + +### Phase 0 — Spike verification (new) + +1. `cnpg-utils-probe` — Write 30-line `cmd/probe/main.go` that constructs + `environment.TestingEnvironment` with dummy PG env vars, calls + `clusterutils.GetPrimary` on a live DocumentDB cluster in kind, confirms compile + run. + Gate for the rest of Phase 1. + +### Phase 1 — Scaffolding & helpers + +2. `scaffold` — `test/e2e/` tree, separate `go.mod` (pinning CNPG utils version), Ginkgo + suite boilerplate, `labels.go`, re-export of CNPG `levels.go`, CLI flag plumbing, + area-package skeleton with empty `*_suite_test.go` in each. +3. `testenv` — `pkg/e2eutils/testenv/env.go`: constructor that wraps + `environment.NewTestingEnvironment()` with dummy `POSTGRES_IMG`; exposes our typed + `client.Client` with `api/preview` scheme registered. +4. `helpers-documentdb` — `documentdb.go`: `Create`, `PatchSpec`, `WaitHealthy`, + `Delete`, `List`, `RenderCR` (base+mixin envsubst pipeline). +5. `helpers-mongo` — `mongo/client.go`: `NewClient(host, port, user, pw, tls)`, + `Seed(ctx, db, n)`, `Ping`, `Count`. +6. `helpers-portforward` — `portforward.go`: thin wrapper over CNPG's + `forwardconnection` targeting the DocumentDB gateway port. +7. `helpers-assertions` — `assertions.go`: `AssertDocumentDBReady`, + `AssertInstanceCount`, `AssertPrimaryUnchanged`, `AssertPVCCount`, + `AssertTLSSecretReady`, `AssertWalLevel`, `AssertServiceType`, + `AssertConnectionStringMatches`. Each returns `func() error` for `Eventually`. +8. `helpers-timeouts` — `timeouts.go`: extends CNPG's map with DocumentDB ops. +9. `helpers-seed` — `seed/datasets.go`: `SmallDataset(10)`, `MediumDataset(1000)`, + `SortDataset`, `AggDataset` — reused by data/performance/backup/upgrade. +10. `operator-health-gate` — `operatorhealth/gate.go`: adapted from CNPG's `operator/` + package for `documentdb-operator` ns; `BeforeEach`/`AfterEach` hooks + package + sentinel to skip subsequent specs on churn. +11. `shared-fixtures` — `pkg/e2eutils/fixtures/`: `shared_ro.go`, `shared_scale.go`, + `minio.go` (wraps CNPG `minio.Deploy`, lazy-constructed). +12. `manifests-base` — `manifests/base/documentdb.yaml.template` + all mixins under + `manifests/mixins/` and `manifests/backup/`. +13. `suite-root` — `suite_test.go`: `SynchronizedBeforeSuite` builds `Env`, installs + lazy MinIO hook, starts stern log tailer, registers operator-health gate. + +### Phase 2 — Test packages (one per area) + +14. `tests-data` — `data_suite_test.go` spins up `SharedROCluster`; port + `comprehensive_mongosh_tests.js` + pymongo heredoc, **split** into 7 spec files. + Package label `DataLabel`. +15. `tests-performance` — 6 spec files, one per timed op; shares `SharedROCluster`; + forced serial in CI. Thresholds preserved. +16. `tests-status` — 3 spec files; shares `SharedROCluster`. +17. `tests-lifecycle` — 5 spec files; each owns its own `FreshDocumentDB`. +18. `tests-scale` — `scale_suite_test.go` with `SharedScaleCluster`; up + down spec + files; each `AfterEach` resets to 2 instances. +19. `tests-backup` — `backup_suite_test.go` owns `Minio`; 4 spec files. +20. `tests-tls` — 4 spec files, one per mode. CertManager file uses + `NeedsCertManagerLabel`. +21. `tests-feature-gates` — `changestreams_test.go` table-driven over (enabled, disabled). +22. `tests-exposure` — ClusterIP + LoadBalancer spec files; LB uses `NeedsMetalLBLabel`. +23. `tests-upgrade` — `upgrade_suite_test.go` with multi-phase install helpers; **split** + into 2–3 spec files so failures pinpoint the phase. + +### Phase 3 — Integration + +24. `local-run` — Full suite green locally on kind at `TEST_DEPTH=Medium` with `ginkgo -p`. +25. `ci-workflow` — `.github/workflows/test-e2e.yml`: amd64+arm64 matrix, label-grouped + jobs per table above, `workflow_dispatch` inputs. +26. `cleanup-workflows` — Delete the 4 old workflows + `setup-port-forwarding` composite. +27. `cleanup-scripts` — Delete old bash/JS/Python test scripts. +28. `docs` — Update `docs/developer-guides/` + AGENTS.md: tree, local run (`ginkgo -p + ./tests/...`), labels, levels, how to add a new area / mixin / assertion; CHANGELOG + migration note; document the CNPG utils dependency + pin policy. + +## Comparison: Our Plan vs CloudNative-PG E2E Suite + +| Aspect | CNPG | Our plan | Decision | +|---|---|---|---| +| Language | Go (Ginkgo+Gomega) | Go (Ginkgo+Gomega) | **Aligned.** | +| Test selection | 28 labels + TEST_DEPTH | Our labels + **imported** `tests/levels` | Aligned; we re-export CNPG's levels. | +| Matrix (K8s×PG×engine) | full 3-D | amd64/arm64 only | Defer to GA. | +| Cluster bring-up | `hack/setup-cluster.sh` | existing `setup-test-environment` action | Keep ours. | +| Session-scoped MinIO | yes (`minio.Deploy`) | **imported as-is** from CNPG | Adopted verbatim. | +| Operator health gate | yes (`BeforeEach` pod check) | `operatorhealth/gate.go` — adapted from CNPG `operator/` | Adapted (ns retargeted). | +| Shared cluster for read-only | implicit per-namespace | explicit `SharedROCluster` + read-only proxy | **We go further.** | +| Assertion composables | `AssertClusterIsReady`, etc. | `pkg/e2eutils/assertions` | Aligned. | +| Manifest templating | `envsubst` over `.yaml.template` | `envsubst` over `.yaml.template` | **Imported.** | +| Per-op timeouts | `Timeouts()` map | extends CNPG's map | **Imported + extended.** | +| Parallelism | `ginkgo -p` + within-pkg procs | `ginkgo -p` + within-pkg procs + label-grouped CI | Two-layer. | +| Stern log streaming | yes | **imported** (`sternmultitailer`) | Adopted. | +| Label filter (`/test` comment) | yes | `workflow_dispatch` inputs | Defer. | + +### Not copying, with rationale + +- **Multi-engine** (k3d/EKS/AKS/GKE/OpenShift) — defer to GA. +- **Branch-snapshot operator install** from artifacts repo — we build in the same workflow. +- **postgres/** helpers — we speak Mongo, not libpq. + +## Open Questions / Risks + +- **CNPG utils API churn**: pinned version mitigates but doesn't eliminate. Budget ~½ day + per CNPG bump for test-util compat fixes. Document in contribute guide. +- **Dummy `POSTGRES_IMG`** in `testenv.Env` feels brittle; if CNPG starts *eagerly* + validating the image in `NewTestingEnvironment`, we'd need to fork. Check on first + probe; fallback plan is to copy the constructor (~100 LOC). +- **Read-only proxy enforcement**: making sure tests can't accidentally call + `PatchSpec` on `SharedROCluster`. The proxy panics at runtime — acceptable; maybe add + a linter later. +- **Backup object store**: confirm `test-backup-and-restore.yml` uses MinIO (likely) so + CNPG's `minio.Deploy` is a drop-in. Verify during Phase 0 probe. +- **MetalLB / SC expansion / cert-manager**: label-gated skips; document the env + contract in README. +- **Ginkgo parallelism across processes** can't share `SharedROCluster`; acceptable + cost (we pay for one cluster per Go process in `data/`+`performance/`+`status/` = + 3 clusters max per CI job instead of 1). Lower than the N-per-spec baseline we're + replacing. +- **Total CI wallclock**: budget review after first full run. +- **Rubber-duck review**: after Phase 0 (probe) + Phase 1 (scaffold + helpers + + suite_test) + one populated area (e.g. `tests/data/`), review shape before building + the rest. diff --git a/operator/src/scripts/test-scripts/comprehensive_mongosh_tests.js b/operator/src/scripts/test-scripts/comprehensive_mongosh_tests.js deleted file mode 100644 index c4d4bb33..00000000 --- a/operator/src/scripts/test-scripts/comprehensive_mongosh_tests.js +++ /dev/null @@ -1,497 +0,0 @@ -// Comprehensive DocumentDB test suite with validation -print("=== Starting Comprehensive DocumentDB Tests with Validation ==="); - -// Validation helper function -function validate(condition, message) { - if (!condition) { - print("DEBUG: Validation failed for: " + message); - print("DEBUG: Condition was:", condition); - throw new Error("VALIDATION FAILED: " + message); - } - print("✓ " + message); -} - -// Helper function to handle Long objects returned by some MongoDB drivers -function getLongValue(val) { - if (typeof val === 'object' && val !== null && 'low' in val) { - return val.low; // Extract the actual number from Long object - } - return val; -} - -// Test 1: Basic Connection and Database Operations -print("\n--- Test 1: Basic Database Operations ---"); -db = db.getSiblingDB('testdb'); - -// Verify database connection -print("DEBUG: Current database:", db.getName()); -print("DEBUG: Database connection test:", db.runCommand({ping: 1})); - -// Test collection creation -db.createCollection("users"); -db.createCollection("products"); -db.createCollection("orders"); - -// Validate collections were created -var collections = db.getCollectionNames(); -validate(collections.includes("users"), "Users collection created"); -validate(collections.includes("products"), "Products collection created"); -validate(collections.includes("orders"), "Orders collection created"); - -// Insert sample data -var users = [ - { _id: 1, name: "John Doe", email: "john@example.com", age: 30, city: "New York" }, - { _id: 2, name: "Jane Smith", email: "jane@example.com", age: 25, city: "San Francisco" }, - { _id: 3, name: "Bob Johnson", email: "bob@example.com", age: 35, city: "Chicago" }, - { _id: 4, name: "Alice Brown", email: "alice@example.com", age: 28, city: "Seattle" } -]; - -var products = [ - { _id: 1, name: "Laptop", price: 999.99, category: "Electronics", stock: 50 }, - { _id: 2, name: "Phone", price: 699.99, category: "Electronics", stock: 100 }, - { _id: 3, name: "Book", price: 19.99, category: "Education", stock: 200 }, - { _id: 4, name: "Desk", price: 299.99, category: "Furniture", stock: 25 } -]; - -var orders = [ - { _id: 1, userId: 1, productId: 1, quantity: 1, total: 999.99, date: new Date() }, - { _id: 2, userId: 2, productId: 2, quantity: 2, total: 1399.98, date: new Date() }, - { _id: 3, userId: 3, productId: 3, quantity: 3, total: 59.97, date: new Date() } -]; - -var userResult = db.users.insertMany(users); -var productResult = db.products.insertMany(products); -var orderResult = db.orders.insertMany(orders); - -// Debug the insert results -print("DEBUG: userResult:", JSON.stringify(userResult)); -print("DEBUG: productResult:", JSON.stringify(productResult)); -print("DEBUG: orderResult:", JSON.stringify(orderResult)); - -// Helper function to get insertedIds count (handles both array and object formats) -function getInsertedCount(result) { - if (result.insertedIds) { - if (Array.isArray(result.insertedIds)) { - return result.insertedIds.length; - } else if (typeof result.insertedIds === 'object') { - return Object.keys(result.insertedIds).length; - } - } - return 0; -} - -// Validate insertions -validate(userResult.acknowledged === true, "User insertion was acknowledged"); -validate(getInsertedCount(userResult) === 4, "Inserted exactly 4 users"); -validate(productResult.acknowledged === true, "Product insertion was acknowledged"); -validate(getInsertedCount(productResult) === 4, "Inserted exactly 4 products"); -validate(orderResult.acknowledged === true, "Order insertion was acknowledged"); -validate(getInsertedCount(orderResult) === 3, "Inserted exactly 3 orders"); - -print("Inserted", getInsertedCount(userResult), "users"); -print("Inserted", getInsertedCount(productResult), "products"); -print("Inserted", getInsertedCount(orderResult), "orders"); - -// Verify the documents actually exist in the database -var actualUserCount = db.users.countDocuments(); -var actualProductCount = db.products.countDocuments(); -var actualOrderCount = db.orders.countDocuments(); - -print("DEBUG: Actual document counts - Users:", actualUserCount, "Products:", actualProductCount, "Orders:", actualOrderCount); -validate(actualUserCount === 4, "Database contains exactly 4 users"); -validate(actualProductCount === 4, "Database contains exactly 4 products"); -validate(actualOrderCount === 3, "Database contains exactly 3 orders"); - -// Verify specific users exist -var johnExists = db.users.findOne({ name: "John Doe" }); -var janeExists = db.users.findOne({ name: "Jane Smith" }); -print("DEBUG: John Doe exists:", johnExists !== null); -print("DEBUG: Jane Smith exists:", janeExists !== null); -validate(johnExists !== null, "John Doe document exists after insertion"); -validate(janeExists !== null, "Jane Smith document exists after insertion"); - -// Test 2: Query Operations -print("\n--- Test 2: Query Operations ---"); - -// Simple queries with validation -var youngUsers = db.users.find({ age: { $lt: 30 } }).toArray(); -validate(youngUsers.length === 2, "Found exactly 2 users under 30 (Jane: 25, Alice: 28)"); -validate(youngUsers.some(u => u.name === "Jane Smith"), "Jane Smith found in young users"); -validate(youngUsers.some(u => u.name === "Alice Brown"), "Alice Brown found in young users"); - -var expensiveProducts = db.products.find({ price: { $gt: 500 } }).toArray(); -validate(expensiveProducts.length === 2, "Found exactly 2 expensive products (Laptop, Phone)"); -validate(expensiveProducts.some(p => p.name === "Laptop"), "Laptop found in expensive products"); -validate(expensiveProducts.some(p => p.name === "Phone"), "Phone found in expensive products"); - -// Complex queries with sorting -var sortedUsers = db.users.find().sort({ age: -1 }).toArray(); -validate(sortedUsers.length === 4, "Sorted query returned all 4 users"); -validate(sortedUsers[0].name === "Bob Johnson" && sortedUsers[0].age === 35, "First user is Bob (35)"); -validate(sortedUsers[1].name === "John Doe" && sortedUsers[1].age === 30, "Second user is John (30)"); -validate(sortedUsers[2].name === "Alice Brown" && sortedUsers[2].age === 28, "Third user is Alice (28)"); -validate(sortedUsers[3].name === "Jane Smith" && sortedUsers[3].age === 25, "Fourth user is Jane (25)"); - -print("Users sorted by age (desc):", sortedUsers.map(u => u.name + " (" + u.age + ")")); - -// Test 3: Aggregation Pipeline -print("\n--- Test 3: Aggregation Operations ---"); - -// Average age with validation -var avgAge = db.users.aggregate([ - { $group: { _id: null, avgAge: { $avg: "$age" }, count: { $sum: 1 } } } -]).toArray(); - -var expectedAvgAge = (30 + 25 + 35 + 28) / 4; // 29.5 -validate(avgAge.length === 1, "Aggregation returned exactly 1 result"); -validate(Math.abs(avgAge[0].avgAge - expectedAvgAge) < 0.01, "Average age is correct: " + expectedAvgAge); -validate(avgAge[0].count === 4, "Count is correct: 4 users"); - -print("Average user age:", avgAge[0].avgAge, "from", avgAge[0].count, "users"); - -// Group by city with validation -var cityGroups = db.users.aggregate([ - { $group: { _id: "$city", count: { $sum: 1 }, avgAge: { $avg: "$age" } } }, - { $sort: { count: -1 } } -]).toArray(); - -validate(cityGroups.length === 4, "Grouped by 4 different cities"); -var cities = cityGroups.map(g => g._id); -validate(cities.includes("New York"), "New York city group found"); -validate(cities.includes("San Francisco"), "San Francisco city group found"); -validate(cities.includes("Chicago"), "Chicago city group found"); -validate(cities.includes("Seattle"), "Seattle city group found"); - -print("Users by city:", cityGroups); - -// Product statistics with validation -var productStats = db.products.aggregate([ - { $group: { - _id: "$category", - count: { $sum: 1 }, - avgPrice: { $avg: "$price" }, - totalStock: { $sum: "$stock" } - }}, - { $sort: { avgPrice: -1 } } -]).toArray(); - -validate(productStats.length === 3, "Grouped by 3 categories"); -var electronicsStats = productStats.find(s => s._id === "Electronics"); -validate(electronicsStats && electronicsStats.count === 2, "Electronics category has 2 products"); -validate(electronicsStats && electronicsStats.totalStock === 150, "Electronics total stock is 150"); - -print("Product statistics by category:", productStats); - -// Test 4: Update Operations -print("\n--- Test 4: Update Operations ---"); - -// Small delay to ensure inserts are fully committed -print("DEBUG: Waiting for inserts to be committed..."); -sleep(1000); // 1 second delay - -// First, verify the user exists before attempting update -var johnBefore = db.users.findOne({ name: "John Doe" }); -print("DEBUG: John Doe before update:", JSON.stringify(johnBefore)); -validate(johnBefore !== null, "John Doe document exists before update"); -validate(johnBefore.name === "John Doe", "John Doe has correct name"); -validate(johnBefore.age === 30, "John Doe has initial age of 30"); - -// Update single document with validation -var updateResult = db.users.updateOne( - { name: "John Doe" }, - { $set: { age: 31, lastUpdated: new Date() } } -); - -print("DEBUG: updateResult:", JSON.stringify(updateResult)); - -var matchedCount = getLongValue(updateResult.matchedCount); -var modifiedCount = getLongValue(updateResult.modifiedCount); - -print("DEBUG: Extracted counts - matched:", matchedCount, "modified:", modifiedCount); - -// If first update fails, try with exact field matching -if (matchedCount !== 1) { - print("DEBUG: First update failed, trying exact match..."); - var allUsers = db.users.find().toArray(); - print("DEBUG: All users in database:", JSON.stringify(allUsers)); - - // Try to find John with different criteria - var johnVariants = [ - db.users.findOne({ name: "John Doe" }), - db.users.findOne({ _id: 1 }), - db.users.findOne({ email: "john@example.com" }) - ]; - print("DEBUG: John search variants:", JSON.stringify(johnVariants)); - - // Try update by _id instead - updateResult = db.users.updateOne( - { _id: 1 }, - { $set: { age: 31, lastUpdated: new Date() } } - ); - print("DEBUG: updateResult by _id:", JSON.stringify(updateResult)); - - matchedCount = getLongValue(updateResult.matchedCount); - modifiedCount = getLongValue(updateResult.modifiedCount); -} - -validate(matchedCount === 1, "Update matched exactly 1 document"); -validate(modifiedCount === 1, "Update modified exactly 1 document"); - -// Verify the update -var updatedJohn = db.users.findOne({ name: "John Doe" }); -validate(updatedJohn.age === 31, "John's age updated to 31"); -validate(updatedJohn.lastUpdated !== undefined, "John has lastUpdated field"); - -print("Updated", modifiedCount, "user document"); - -// Update multiple documents with validation -var electronicsBeforeUpdate = db.products.find({ category: "Electronics" }).toArray(); -print("DEBUG: Electronics products before bulk update:", JSON.stringify(electronicsBeforeUpdate)); -validate(electronicsBeforeUpdate.length === 2, "Found exactly 2 Electronics products before update"); - -var bulkUpdate = db.products.updateMany( - { category: "Electronics" }, - { $inc: { stock: -5 }, $set: { lastSold: new Date() } } -); - -print("DEBUG: bulkUpdate result:", JSON.stringify(bulkUpdate)); - -var bulkMatchedCount = getLongValue(bulkUpdate.matchedCount); -var bulkModifiedCount = getLongValue(bulkUpdate.modifiedCount); - -print("DEBUG: Extracted bulk counts - matched:", bulkMatchedCount, "modified:", bulkModifiedCount); -validate(bulkMatchedCount === 2, "Bulk update matched 2 Electronics products"); -validate(bulkModifiedCount === 2, "Bulk update modified 2 products"); - -// Verify bulk update -var updatedElectronics = db.products.find({ category: "Electronics" }).toArray(); -validate(updatedElectronics.every(p => p.lastSold !== undefined), "All electronics have lastSold field"); -var laptop = updatedElectronics.find(p => p.name === "Laptop"); -var phone = updatedElectronics.find(p => p.name === "Phone"); -validate(laptop.stock === 45, "Laptop stock reduced to 45"); -validate(phone.stock === 95, "Phone stock reduced to 95"); - -print("Updated", bulkModifiedCount, "product documents"); - -// Upsert operation with validation -var existingUser = db.users.findOne({ email: "new@example.com" }); -print("DEBUG: Existing user with new@example.com:", JSON.stringify(existingUser)); - -var upsertResult = db.users.updateOne( - { email: "new@example.com" }, - { $set: { name: "New User", age: 22, city: "Boston" } }, - { upsert: true } -); - -print("DEBUG: upsertResult:", JSON.stringify(upsertResult)); - -var upsertMatchedCount = getLongValue(upsertResult.matchedCount); -var upsertModifiedCount = getLongValue(upsertResult.modifiedCount); -var upsertedCount = getLongValue(upsertResult.upsertedCount); - -print("DEBUG: Extracted upsert counts - matched:", upsertMatchedCount, "modified:", upsertModifiedCount, "upserted:", upsertedCount); -validate(upsertMatchedCount === 0, "Upsert matched 0 existing documents"); -validate(upsertModifiedCount === 0, "Upsert modified 0 existing documents"); -validate(upsertedCount === 1, "Upsert created 1 new document"); - -// Verify upsert -var newUser = db.users.findOne({ email: "new@example.com" }); -validate(newUser && newUser.name === "New User", "New user created with correct name"); -validate(newUser && newUser.age === 22, "New user has correct age"); - -print("Upsert operation - matched:", upsertMatchedCount, "modified:", upsertModifiedCount, "upserted:", upsertedCount); - -// Test 5: Text Search -print("\n--- Test 5: Text Search ---"); - -// Simple text search without text index -var laptopProducts = db.products.find({ name: /laptop/i }).toArray(); -validate(laptopProducts.length === 1, "Text search found exactly 1 laptop"); -validate(laptopProducts[0].name === "Laptop", "Found product is the Laptop"); - -print("Text search for 'laptop' found:", laptopProducts.length, "products"); - -// Test 6: Array Operations -print("\n--- Test 6: Array Operations ---"); - -// Count users before adding hobbies array -var userCountBefore = db.users.countDocuments(); -print("DEBUG: User count before adding hobbies:", userCountBefore); - -// Add array field to users -var arrayUpdateResult = db.users.updateMany( - {}, - { $set: { hobbies: [] } } -); -print("DEBUG: arrayUpdateResult:", JSON.stringify(arrayUpdateResult)); - -var arrayMatchedCount = getLongValue(arrayUpdateResult.matchedCount); -var arrayModifiedCount = getLongValue(arrayUpdateResult.modifiedCount); - -print("DEBUG: Extracted array counts - matched:", arrayMatchedCount, "modified:", arrayModifiedCount); -validate(arrayMatchedCount === userCountBefore, "Array update matched all " + userCountBefore + " users"); -validate(arrayModifiedCount === userCountBefore, "Added hobbies array to all " + userCountBefore + " users"); - -// Verify hobbies field was added -var usersWithHobbiesField = db.users.find({ hobbies: { $exists: true } }).toArray(); -validate(usersWithHobbiesField.length === userCountBefore, "All users now have hobbies field"); - -// Update with array operations -var johnBeforeHobbies = db.users.findOne({ name: "John Doe" }); -print("DEBUG: John before adding hobbies:", JSON.stringify(johnBeforeHobbies)); -validate(johnBeforeHobbies !== null, "John Doe exists before adding hobbies"); -validate(Array.isArray(johnBeforeHobbies.hobbies), "John has hobbies array field"); - -var johnHobbiesResult = db.users.updateOne( - { name: "John Doe" }, - { $push: { hobbies: { $each: ["reading", "gaming", "cooking"] } } } -); -print("DEBUG: johnHobbiesResult:", JSON.stringify(johnHobbiesResult)); - -var johnHobbiesMatched = getLongValue(johnHobbiesResult.matchedCount); -var johnHobbiesModified = getLongValue(johnHobbiesResult.modifiedCount); - -validate(johnHobbiesMatched === 1, "John hobbies update matched 1 document"); -validate(johnHobbiesModified === 1, "Added hobbies to John"); - -var janeHobbiesResult = db.users.updateOne( - { name: "Jane Smith" }, - { $push: { hobbies: { $each: ["traveling", "photography"] } } } -); -print("DEBUG: janeHobbiesResult:", JSON.stringify(janeHobbiesResult)); - -var janeHobbiesMatched = getLongValue(janeHobbiesResult.matchedCount); -var janeHobbiesModified = getLongValue(janeHobbiesResult.modifiedCount); - -validate(janeHobbiesMatched === 1, "Jane hobbies update matched 1 document"); -validate(janeHobbiesModified === 1, "Added hobbies to Jane"); - -var usersWithHobbies = db.users.find({ hobbies: { $exists: true, $ne: [] } }).toArray(); -validate(usersWithHobbies.length === 2, "Found exactly 2 users with hobbies"); - -// Array query operations -var readingUsers = db.users.find({ hobbies: "reading" }).toArray(); -validate(readingUsers.length === 1, "Found exactly 1 user who likes reading"); -validate(readingUsers[0].name === "John Doe", "John Doe likes reading"); - -print("Users with hobbies:", usersWithHobbies.length); -print("Users who like reading:", readingUsers.length); - -// Test 7: Date Operations -print("\n--- Test 7: Date Operations ---"); - -var today = new Date(); -var yesterday = new Date(today.getTime() - 24 * 60 * 60 * 1000); - -var recentOrders = db.orders.find({ date: { $gte: yesterday } }).toArray(); -validate(recentOrders.length === 3, "All 3 orders are recent (created today)"); - -// Date aggregation -var dailyStats = db.orders.aggregate([ - { $group: { - _id: { $dateToString: { format: "%Y-%m-%d", date: "$date" } }, - totalOrders: { $sum: 1 }, - totalAmount: { $sum: "$total" } - }} -]).toArray(); - -validate(dailyStats.length === 1, "Orders grouped into 1 day"); -validate(dailyStats[0].totalOrders === 3, "Total orders for today is 3"); -var expectedTotal = 999.99 + 1399.98 + 59.97; -validate(Math.abs(dailyStats[0].totalAmount - expectedTotal) < 0.01, "Total amount is correct"); - -print("Recent orders:", recentOrders.length); -print("Daily order statistics:", dailyStats); - -// Test 8: Batch Operations -print("\n--- Test 8: Batch Operations ---"); - -// Debug: Check current products before bulk ops -var allProducts = db.products.find().toArray(); -print("DEBUG: All products before bulk ops:", JSON.stringify(allProducts)); - -var electronicsProducts = db.products.find({ category: "Electronics" }).toArray(); -var cheapProducts = db.products.find({ price: { $lt: 100 } }).toArray(); - -print("DEBUG: Electronics products:", electronicsProducts.length); -print("DEBUG: Products < $100:", cheapProducts.length); -print("DEBUG: Expected total matches:", electronicsProducts.length + cheapProducts.length); - -var bulkOps = db.products.initializeUnorderedBulkOp(); -bulkOps.find({ category: "Electronics" }).update({ $inc: { views: 1 } }); -bulkOps.find({ price: { $lt: 100 } }).update({ $set: { featured: true } }); -bulkOps.insert({ name: "New Product", price: 49.99, category: "Test", stock: 10 }); - -var bulkResult = bulkOps.execute(); - -print("DEBUG: Bulk result:", JSON.stringify(bulkResult)); - -// Handle different property names between MongoDB and DocumentDB -var nMatched = bulkResult.nMatched || bulkResult.matchedCount || 0; -var nModified = bulkResult.nModified || bulkResult.modifiedCount || 0; -var nInserted = bulkResult.nInserted || bulkResult.insertedCount || 0; - -print("DEBUG: nMatched:", nMatched, "nModified:", nModified, "nInserted:", nInserted); - -// Use more flexible validation based on actual data -var expectedMatches = electronicsProducts.length + cheapProducts.length; -validate(nMatched >= expectedMatches - 1, "Bulk operations matched at least " + (expectedMatches - 1) + " documents"); // Allow for slight variance -validate(nModified >= expectedMatches - 1, "Bulk operations modified at least " + (expectedMatches - 1) + " documents"); -validate(nInserted === 1, "Bulk operations inserted 1 document"); - -// Verify bulk operations -var electronicsWithViews = db.products.find({ category: "Electronics", views: { $exists: true } }).toArray(); -validate(electronicsWithViews.length === 2, "Both electronics products have views field"); - -var featuredProducts = db.products.find({ featured: true }).toArray(); -validate(featuredProducts.length >= 1, "At least 1 product is featured"); // Book should be featured - -var newProduct = db.products.findOne({ name: "New Product" }); -validate(newProduct !== null, "New product was inserted"); -validate(newProduct.price === 49.99, "New product has correct price"); - -print("Bulk operation results - matched:", nMatched, "modified:", nModified, "inserted:", nInserted); - -// Test 9: Final Verification -print("\n--- Test 9: Final Data Verification ---"); - -var totalUsers = db.users.countDocuments(); -var totalProducts = db.products.countDocuments(); -var totalOrders = db.orders.countDocuments(); - -print("DEBUG: Final counts - Users:", totalUsers, "Products:", totalProducts, "Orders:", totalOrders); - -// Use dynamic validation based on actual counts (4 original + 1 upserted = 5) -var expectedUsers = 5; // 4 original + 1 upserted -var expectedProducts = 5; // 4 original + 1 bulk inserted -var expectedOrders = 3; // 3 original - -validate(totalUsers === expectedUsers, "Final user count is " + expectedUsers + " (4 original + 1 upserted)"); -validate(totalProducts === expectedProducts, "Final product count is " + expectedProducts + " (4 original + 1 bulk inserted)"); -validate(totalOrders === expectedOrders, "Final order count is " + expectedOrders); - -print("Final counts - Users:", totalUsers, "Products:", totalProducts, "Orders:", totalOrders); - -// Test data consistency -var allUsersHaveHobbies = db.users.find({ hobbies: { $exists: false } }).toArray(); -validate(allUsersHaveHobbies.length === 0, "All users have hobbies field"); - -var johnFinal = db.users.findOne({ name: "John Doe" }); -print("DEBUG: John final state:", JSON.stringify(johnFinal)); -validate(johnFinal !== null, "John Doe document exists at end"); -validate(johnFinal.age === 31, "John's age is still 31"); -validate(johnFinal.hobbies && johnFinal.hobbies.includes("reading"), "John still has reading hobby"); - -// Clean up test data -print("\n--- Cleanup ---"); -db.users.drop(); -db.products.drop(); -db.orders.drop(); - -// Verify cleanup -var remainingCollections = db.getCollectionNames(); -validate(!remainingCollections.includes("users"), "Users collection dropped"); -validate(!remainingCollections.includes("products"), "Products collection dropped"); -validate(!remainingCollections.includes("orders"), "Orders collection dropped"); - -print("\n=== All Tests Completed Successfully with Validation! ==="); diff --git a/operator/src/scripts/test-scripts/mongo-python-data-pusher.py b/operator/src/scripts/test-scripts/mongo-python-data-pusher.py deleted file mode 100644 index 35f53917..00000000 --- a/operator/src/scripts/test-scripts/mongo-python-data-pusher.py +++ /dev/null @@ -1,41 +0,0 @@ -from pymongo import MongoClient -from pprint import pprint -import ssl - -# Connection parameters -host = "127.0.0.1" # Use localhost for local testing or replace with the actual load balancer endpoint -port = 10260 -username = "default_user" -password = "Admin100" # Default is Admin100 -auth_db = "admin" # Default auth source unless otherwise needed - -# Connect with TLS and skip cert validation -client = MongoClient( - host, - port, - username=username, - password=password, - authSource=auth_db, - authMechanism="SCRAM-SHA-256", - tls=True, - tlsAllowInvalidCertificates=True -) - -# Use the database -club_db = client["soccer_league"] - -# Insert a soccer club document -insert_result = club_db.clubs.insert_one({ - "name": "Manchester United", - "country": "England", - "founded": 1878, - "stadium": "Old Trafford", - "league": "Premier League", - "titles": ["Premier League", "FA Cup", "Champions League"] -}) - -print(f"Inserted soccer club document ID: {insert_result.inserted_id}") - -# Find all soccer clubs -for doc in club_db.clubs.find(): - pprint(doc) diff --git a/operator/src/scripts/test-scripts/performance_test.js b/operator/src/scripts/test-scripts/performance_test.js deleted file mode 100644 index 94474493..00000000 --- a/operator/src/scripts/test-scripts/performance_test.js +++ /dev/null @@ -1,222 +0,0 @@ -// Performance Test Suite with Validation -print("=== Performance Test Suite with Validation ==="); - -// Validation helper function -function validate(condition, message) { - if (!condition) { - print("DEBUG: Performance validation failed for: " + message); - print("DEBUG: Condition was:", condition); - throw new Error("PERFORMANCE VALIDATION FAILED: " + message); - } - print("✓ " + message); -} - -// Helper function to handle Long objects returned by some MongoDB drivers -function getLongValue(val) { - if (typeof val === 'object' && val !== null && 'low' in val) { - return val.low; // Extract the actual number from Long object - } - return val; -} - -db = db.getSiblingDB('perftest'); - -// Large dataset insertion test -print("\n--- Large Dataset Insertion Test ---"); -var startTime = new Date(); -var docs = []; -for (let i = 0; i < 1000; i++) { - docs.push({ - id: i, - name: "User " + i, - email: "user" + i + "@example.com", - data: "This is sample data for user " + i, - timestamp: new Date(), - metadata: { - source: "performance_test", - batch: Math.floor(i / 100), - random: Math.random() - } - }); -} - -validate(docs.length === 1000, "Created exactly 1000 test documents"); - -var insertStart = new Date(); -var result = db.perfcollection.insertMany(docs); -var insertEnd = new Date(); - -// Debug the insert result -print("DEBUG: performance insertMany result:", JSON.stringify(result)); - -// Helper function to get insertedIds count (handles both array and object formats) -function getInsertedCount(result) { - if (result.insertedIds) { - if (Array.isArray(result.insertedIds)) { - return result.insertedIds.length; - } else if (typeof result.insertedIds === 'object') { - return Object.keys(result.insertedIds).length; - } - } - return 0; -} - -var insertTime = insertEnd - insertStart; -validate(result.acknowledged === true, "Insertion was acknowledged"); -validate(getInsertedCount(result) === 1000, "Inserted exactly 1000 documents"); -validate(insertTime < 10000, "Insertion completed within 10 seconds (took " + insertTime + "ms)"); - -print("Inserted", getInsertedCount(result), "documents in", insertTime, "ms"); - -// Query performance test -print("\n--- Query Performance Test ---"); - -var queryStart = new Date(); -var count = db.perfcollection.countDocuments(); -var queryEnd = new Date(); - -var countTime = queryEnd - queryStart; -validate(count === 1000, "Count query returned correct result: 1000"); -validate(countTime < 5000, "Count query completed within 5 seconds (took " + countTime + "ms)"); - -print("Count query took", countTime, "ms, result:", count); - -// Range query performance test -print("\n--- Range Query Performance Test ---"); - -var queryStart2 = new Date(); -var rangeResults = db.perfcollection.find({ id: { $gte: 500 } }).toArray(); -var queryEnd2 = new Date(); - -var rangeTime = queryEnd2 - queryStart2; -validate(rangeResults.length === 500, "Range query returned exactly 500 documents"); -validate(rangeTime < 5000, "Range query completed within 5 seconds (took " + rangeTime + "ms)"); - -// Validate range query results -var minId = Math.min(...rangeResults.map(r => r.id)); -var maxId = Math.max(...rangeResults.map(r => r.id)); -validate(minId === 500, "Minimum ID in range results is 500"); -validate(maxId === 999, "Maximum ID in range results is 999"); - -print("Range query found", rangeResults.length, "documents in", rangeTime, "ms"); - -// Aggregation performance -print("\n--- Aggregation Performance Test ---"); - -var aggStart = new Date(); -var aggResult = db.perfcollection.aggregate([ - { $match: { id: { $gte: 100 } } }, - { $group: { _id: "$metadata.batch", count: { $sum: 1 }, avgId: { $avg: "$id" } } }, - { $sort: { _id: 1 } } -]).toArray(); -var aggEnd = new Date(); - -var aggTime = aggEnd - aggStart; -validate(aggResult.length === 9, "Aggregation returned 9 batches (batches 1-9)"); // 100-999 = batches 1-9 -validate(aggTime < 5000, "Aggregation completed within 5 seconds (took " + aggTime + "ms)"); - -// Validate aggregation results -var totalDocs = aggResult.reduce((sum, batch) => sum + batch.count, 0); -validate(totalDocs === 900, "Aggregation processed exactly 900 documents (id >= 100)"); - -// Check specific batch -var batch5 = aggResult.find(r => r._id === 5); -validate(batch5 && batch5.count === 100, "Batch 5 has exactly 100 documents"); -validate(batch5 && Math.abs(batch5.avgId - 549.5) < 0.1, "Batch 5 average ID is correct (~549.5)"); - -print("Aggregation processed", aggResult.length, "groups in", aggTime, "ms"); - -// Test sorting performance -print("\n--- Sorting Performance Test ---"); - -var sortStart = new Date(); -var sortedResults = db.perfcollection.find({ id: { $lt: 100 } }).sort({ id: -1 }).toArray(); -var sortEnd = new Date(); - -var sortTime = sortEnd - sortStart; -validate(sortedResults.length === 100, "Sort query returned exactly 100 documents"); -validate(sortTime < 3000, "Sort query completed within 3 seconds (took " + sortTime + "ms)"); - -// Validate sorting -validate(sortedResults[0].id === 99, "First document has ID 99 (descending sort)"); -validate(sortedResults[99].id === 0, "Last document has ID 0 (descending sort)"); - -for (let i = 0; i < sortedResults.length - 1; i++) { - validate(sortedResults[i].id > sortedResults[i + 1].id, "Documents are sorted in descending order"); -} - -print("Sort query processed", sortedResults.length, "documents in", sortTime, "ms"); - -// Test update performance -print("\n--- Update Performance Test ---"); - -var updateStart = new Date(); -var updateResult = db.perfcollection.updateMany( - { "metadata.batch": { $in: [0, 1, 2] } }, - { $set: { updated: true, updateTime: new Date() } } -); -var updateEnd = new Date(); - -var updateTime = updateEnd - updateStart; - -var perfUpdateMatchedCount = getLongValue(updateResult.matchedCount); -var perfUpdateModifiedCount = getLongValue(updateResult.modifiedCount); - -validate(perfUpdateMatchedCount === 300, "Update matched exactly 300 documents (3 batches × 100)"); -validate(perfUpdateModifiedCount === 300, "Update modified exactly 300 documents"); -validate(updateTime < 3000, "Update completed within 3 seconds (took " + updateTime + "ms)"); - -// Verify updates -var updatedDocs = db.perfcollection.find({ updated: true }).toArray(); -validate(updatedDocs.length === 300, "Found exactly 300 updated documents"); -validate(updatedDocs.every(doc => doc.updateTime !== undefined), "All updated docs have updateTime"); - -print("Update modified", perfUpdateModifiedCount, "documents in", updateTime, "ms"); - -// Test delete performance -print("\n--- Delete Performance Test ---"); - -var deleteStart = new Date(); -var deleteResult = db.perfcollection.deleteMany({ id: { $gte: 950 } }); -var deleteEnd = new Date(); - -var deleteTime = deleteEnd - deleteStart; - -var perfDeletedCount = getLongValue(deleteResult.deletedCount); - -validate(perfDeletedCount === 50, "Deleted exactly 50 documents (IDs 950-999)"); -validate(deleteTime < 2000, "Delete completed within 2 seconds (took " + deleteTime + "ms)"); - -// Verify deletions -var remainingCount = db.perfcollection.countDocuments(); -validate(remainingCount === 950, "Exactly 950 documents remain after deletion"); - -var deletedDocs = db.perfcollection.find({ id: { $gte: 950 } }).toArray(); -validate(deletedDocs.length === 0, "No documents with ID >= 950 remain"); - -print("Delete removed", perfDeletedCount, "documents in", deleteTime, "ms"); - -// Overall performance summary -print("\n--- Performance Summary ---"); -var totalTime = new Date() - startTime; -validate(totalTime < 30000, "All performance tests completed within 30 seconds (took " + totalTime + "ms)"); - -print("Total performance test time:", totalTime, "ms"); -print("Insert rate:", Math.round(1000 / (insertTime / 1000)), "docs/sec"); -print("Query rate:", Math.round(1000 / (countTime / 1000)), "queries/sec"); -print("Update rate:", Math.round(300 / (updateTime / 1000)), "updates/sec"); -print("Delete rate:", Math.round(50 / (deleteTime / 1000)), "deletes/sec"); - -// Cleanup with validation -var dropStart = new Date(); -db.perfcollection.drop(); -var dropEnd = new Date(); - -var dropTime = dropEnd - dropStart; -validate(dropTime < 2000, "Collection drop completed within 2 seconds (took " + dropTime + "ms)"); - -// Verify cleanup -var collections = db.getCollectionNames(); -validate(!collections.includes("perfcollection"), "Performance collection was dropped"); - -print("\n=== Performance Tests Completed Successfully with Validation! ==="); diff --git a/operator/src/scripts/test-scripts/test-mongodb-connection.sh b/operator/src/scripts/test-scripts/test-mongodb-connection.sh deleted file mode 100644 index 47bf75b7..00000000 --- a/operator/src/scripts/test-scripts/test-mongodb-connection.sh +++ /dev/null @@ -1,435 +0,0 @@ -#!/bin/bash - -# MongoDB Connection Test Script -# Tests MongoDB connection using mongosh with comprehensive validation - -set -e - -# Default values -ARCHITECTURE="" -NAMESPACE="" -CLUSTER_NAME="" -POD_NAME="" -PORT="" -USERNAME="" -PASSWORD="" -TEST_TYPE="comprehensive" - -# Function to display usage -usage() { - echo "Usage: $0 [OPTIONS]" - echo "Options:" - echo " --architecture ARCH Target architecture for logging" - echo " --namespace NS Kubernetes namespace" - echo " --cluster-name NAME DocumentDB cluster name" - echo " --pod-name NAME Pod name (optional, defaults to CLUSTER_NAME-1)" - echo " --port PORT Port to forward and connect to" - echo " --username USER MongoDB username" - echo " --password PASS MongoDB password" - echo " --test-type TYPE Test type (basic, comprehensive)" - echo " --help Show this help" - exit 1 -} - -# Parse command line arguments -while [[ $# -gt 0 ]]; do - case $1 in - --architecture) - ARCHITECTURE="$2" - shift 2 - ;; - --namespace) - NAMESPACE="$2" - shift 2 - ;; - --cluster-name) - CLUSTER_NAME="$2" - shift 2 - ;; - --pod-name) - POD_NAME="$2" - shift 2 - ;; - --port) - PORT="$2" - shift 2 - ;; - --username) - USERNAME="$2" - shift 2 - ;; - --password) - PASSWORD="$2" - shift 2 - ;; - --test-type) - TEST_TYPE="$2" - shift 2 - ;; - --help) - usage - ;; - *) - echo "Unknown option: $1" - usage - ;; - esac -done - -# Validate required parameters -if [[ -z "$ARCHITECTURE" || -z "$NAMESPACE" || -z "$CLUSTER_NAME" || -z "$PORT" || -z "$USERNAME" || -z "$PASSWORD" ]]; then - echo "Error: Missing required parameters" - usage -fi - -# Set default pod name if not provided -if [[ -z "$POD_NAME" ]]; then - POD_NAME="${CLUSTER_NAME}-1" -fi - -echo "Testing connection with mongosh on $ARCHITECTURE architecture..." -echo "Using pod: $POD_NAME" -echo "Port: $PORT" -echo "Test type: $TEST_TYPE" - -# Function to setup port forwarding with retry logic -setup_port_forward() { - local max_attempts=3 - local attempt=1 - - while [ $attempt -le $max_attempts ]; do - echo "Port forward setup attempt $attempt/$max_attempts..." - - # Start port-forward in background - kubectl port-forward pod/$POD_NAME $PORT:$PORT -n $NAMESPACE > /tmp/mongosh_pf.log 2>&1 & - PF_PID=$! - echo $PF_PID > /tmp/mongosh_pf.pid - - # Wait for port-forward to establish - echo "Waiting for port-forward to establish..." - sleep 10 - - # Check if port-forward process is still running - if ! kill -0 $PF_PID 2>/dev/null; then - echo "❌ Port-forward process died (attempt $attempt)" - if [ -f /tmp/mongosh_pf.log ]; then - echo "Port-forward output:" - cat /tmp/mongosh_pf.log - fi - ((attempt++)) - sleep 5 - continue - fi - - # Test connection - echo "Testing port-forward connection..." - timeout 60 bash -c " - until nc -z 127.0.0.1 $PORT; do - echo 'Waiting for port-forward to be ready...' - sleep 2 - done - " && { - echo "✓ Port-forward established successfully" - return 0 - } - - echo "❌ Port-forward connection test failed (attempt $attempt)" - kill $PF_PID 2>/dev/null || true - ((attempt++)) - sleep 5 - done - - echo "❌ Failed to establish port-forward after $max_attempts attempts" - return 1 -} - -# Function to cleanup port forwarding -cleanup_port_forward() { - if [ -f /tmp/mongosh_pf.pid ]; then - PF_PID=$(cat /tmp/mongosh_pf.pid) - kill $PF_PID 2>/dev/null || true - rm -f /tmp/mongosh_pf.pid - fi - rm -f /tmp/mongosh_pf.log -} - -# Setup port forwarding -if ! setup_port_forward; then - echo "❌ Failed to setup port forwarding" - exit 1 -fi - -echo "Port-forward is ready, creating mongosh test script..." - -# Create comprehensive test script -cat > /tmp/test_mongosh.js << 'MONGOSH_SCRIPT' -// Comprehensive MongoDB Connection Test Script -print("=== Starting MongoDB Connection Test ==="); -print("Connected to DocumentDB!"); - -// Switch to test database -db = db.getSiblingDB('mongosh_test_db'); -print("Using database: mongosh_test_db"); - -// Test 1: Basic Connection and Database Operations -print("\n=== Test 1: Basic Connection and Database Operations ==="); - -// Drop collection if it exists (cleanup from previous runs) -db.test_collection.drop(); - -// Create collection and insert test data -print("Creating collection and inserting test data..."); -db.createCollection("test_collection"); - -var testData = [ - { name: "Alice", age: 30, department: "Engineering", salary: 75000 }, - { name: "Bob", age: 25, department: "Marketing", salary: 55000 }, - { name: "Charlie", age: 35, department: "Sales", salary: 65000 }, - { name: "Diana", age: 28, department: "Engineering", salary: 70000 }, - { name: "Eve", age: 32, department: "Marketing", salary: 60000 } -]; - -var insertResult = db.test_collection.insertMany(testData); -print("Inserted documents:", Object.keys(insertResult.insertedIds).length); - -// Validate insertion -var insertedCount = Object.keys(insertResult.insertedIds).length; -if (insertedCount !== 5) { - throw new Error("Expected 5 inserted documents, got " + insertedCount); -} -print("✓ Insertion validation passed"); - -// Test 2: Query Operations -print("\n=== Test 2: Query Operations ==="); - -// Count documents -var totalDocs = db.test_collection.countDocuments({}); -print("Total documents:", totalDocs); -if (totalDocs !== 5) { - throw new Error("Expected 5 total documents, found " + totalDocs); -} -print("✓ Document count validation passed"); - -// Query with filters -var engineers = db.test_collection.find({ department: "Engineering" }).toArray(); -print("Engineers found:", engineers.length); -if (engineers.length !== 2) { - throw new Error("Expected 2 engineers, found " + engineers.length); -} -print("✓ Department filter validation passed"); - -// Range query -var youngEmployees = db.test_collection.find({ age: { $lt: 30 } }).toArray(); -print("Employees under 30:", youngEmployees.length); -if (youngEmployees.length !== 2) { - throw new Error("Expected 2 employees under 30, found " + youngEmployees.length); -} -print("✓ Range query validation passed"); - -// Test 3: Aggregation Operations -print("\n=== Test 3: Aggregation Operations ==="); - -// Average age calculation -var avgAgeResult = db.test_collection.aggregate([ - { $group: { _id: null, avgAge: { $avg: "$age" }, count: { $sum: 1 } } } -]).toArray(); - -var avgAge = avgAgeResult[0].avgAge; -var expectedAvgAge = (30 + 25 + 35 + 28 + 32) / 5; // 30 -print("Average age:", avgAge, "Expected:", expectedAvgAge); - -if (Math.abs(avgAge - expectedAvgAge) > 0.01) { - throw new Error("Expected average age " + expectedAvgAge + ", got " + avgAge); -} -print("✓ Aggregation validation passed"); - -// Group by department -var deptStats = db.test_collection.aggregate([ - { $group: { - _id: "$department", - count: { $sum: 1 }, - avgSalary: { $avg: "$salary" }, - maxSalary: { $max: "$salary" } - }}, - { $sort: { _id: 1 } } -]).toArray(); - -print("Department statistics:", JSON.stringify(deptStats)); -if (deptStats.length !== 3) { - throw new Error("Expected 3 departments, found " + deptStats.length); -} -print("✓ Department grouping validation passed"); - -// Test 4: Update Operations -print("\n=== Test 4: Update Operations ==="); - -// Helper function to handle Long objects -function getLongValue(val) { - if (typeof val === 'object' && val !== null && 'low' in val) { - return val.low; // Extract the actual number from Long object - } - return val; -} - -// Update single document -var updateResult = db.test_collection.updateOne( - { name: "Alice" }, - { $set: { title: "Senior Engineer", lastModified: new Date() } } -); - -var modifiedCount = getLongValue(updateResult.modifiedCount); -var matchedCount = getLongValue(updateResult.matchedCount); - -print("Update result - Modified:", modifiedCount, "Matched:", matchedCount); -if (modifiedCount !== 1 || matchedCount !== 1) { - throw new Error("Expected 1 modified and 1 matched document, got modified=" + modifiedCount + ", matched=" + matchedCount); -} -print("✓ Single update validation passed"); - -// Verify update content -var aliceUpdated = db.test_collection.findOne({ name: "Alice" }); -if (!aliceUpdated.title || aliceUpdated.title !== "Senior Engineer") { - throw new Error("Alice title update validation failed: " + JSON.stringify(aliceUpdated)); -} -print("✓ Update content validation passed"); - -// Bulk update -var bulkUpdateResult = db.test_collection.updateMany( - { salary: { $lt: 60000 } }, - { $inc: { salary: 5000 }, $set: { salaryAdjusted: true } } -); - -var bulkModifiedCount = getLongValue(bulkUpdateResult.modifiedCount); -print("Bulk update result - Modified:", bulkModifiedCount); -if (bulkModifiedCount !== 1) { // Only Bob should match - throw new Error("Expected 1 document to be updated in bulk operation, got " + bulkModifiedCount); -} -print("✓ Bulk update validation passed"); - -// Test 5: Sorting and Limiting -print("\n=== Test 5: Sorting and Limiting Operations ==="); - -// Sort by age ascending -var sortedByAge = db.test_collection.find().sort({ age: 1 }).toArray(); -var ages = sortedByAge.map(doc => doc.age); -print("Ages in ascending order:", ages); - -// Verify sorting -for (var i = 1; i < ages.length; i++) { - if (ages[i] < ages[i-1]) { - throw new Error("Sorting validation failed: ages not in ascending order"); - } -} -print("✓ Sorting validation passed"); - -// Test limit and skip -var limitedResults = db.test_collection.find().sort({ age: 1 }).limit(2).toArray(); -if (limitedResults.length !== 2) { - throw new Error("Expected 2 documents with limit, got " + limitedResults.length); -} -print("✓ Limit operation validation passed"); - -var skippedResults = db.test_collection.find().sort({ age: 1 }).skip(2).limit(2).toArray(); -if (skippedResults.length !== 2) { - throw new Error("Expected 2 documents with skip+limit, got " + skippedResults.length); -} -print("✓ Skip operation validation passed"); - -// Test 7: Complex Aggregation Pipeline -print("\n=== Test 7: Complex Aggregation Pipeline ==="); - -var complexPipeline = [ - { $match: { age: { $gte: 25 } } }, - { $group: { - _id: "$department", - avgAge: { $avg: "$age" }, - totalSalary: { $sum: "$salary" }, - employees: { $push: "$name" } - }}, - { $project: { - department: "$_id", - avgAge: { $round: ["$avgAge", 1] }, - totalSalary: 1, - employeeCount: { $size: "$employees" }, - employees: 1 - }}, - { $sort: { totalSalary: -1 } } -]; - -var complexResult = db.test_collection.aggregate(complexPipeline).toArray(); -print("Complex aggregation result:", JSON.stringify(complexResult, null, 2)); - -if (complexResult.length === 0) { - throw new Error("Complex aggregation returned no results"); -} -print("✓ Complex aggregation validation passed"); - -// Test 8: Delete Operations -print("\n=== Test 8: Delete Operations ==="); - -// Insert a temporary document for deletion test -var tempInsert = db.test_collection.insertOne({ name: "Temp", age: 99, department: "Temp", temporary: true }); -print("Temporary document inserted:", tempInsert.insertedId); - -// Delete the temporary document -var deleteResult = db.test_collection.deleteOne({ temporary: true }); -print("Delete result - Deleted count:", deleteResult.deletedCount); - -if (deleteResult.deletedCount !== 1) { - throw new Error("Expected 1 document to be deleted, got " + deleteResult.deletedCount); -} -print("✓ Delete operation validation passed"); - -// Verify document was deleted -var tempDoc = db.test_collection.findOne({ temporary: true }); -if (tempDoc !== null) { - throw new Error("Temporary document was not properly deleted"); -} -print("✓ Delete verification passed"); - -// Final validation - ensure we still have our original data -var finalCount = db.test_collection.countDocuments({}); -if (finalCount !== 5) { - throw new Error("Expected 5 documents after cleanup, found " + finalCount); -} -print("✓ Final document count validation passed"); - -// Test Summary -print("\n=== Test Summary ==="); -print("✓ All mongosh tests completed successfully!"); -print("✓ Basic connection: PASSED"); -print("✓ Query operations: PASSED"); -print("✓ Aggregation operations: PASSED"); -print("✓ Update operations: PASSED"); -print("✓ Sorting and limiting: PASSED"); -print("✓ Complex aggregation: PASSED"); -print("✓ Delete operations: PASSED"); -print("✓ Data integrity: VERIFIED"); - -print("\nMongoDB connection test completed successfully!"); -MONGOSH_SCRIPT - -echo "Running mongosh validation tests..." - -# Run the comprehensive test script -if mongosh 127.0.0.1:$PORT \ - -u "$USERNAME" \ - -p "$PASSWORD" \ - --authenticationMechanism SCRAM-SHA-256 \ - --tls \ - --tlsAllowInvalidCertificates \ - --file /tmp/test_mongosh.js; then - echo "✓ Mongosh validation tests completed successfully on $ARCHITECTURE" -else - echo "❌ Mongosh validation tests failed on $ARCHITECTURE" - echo "=== Port-forward logs ===" - cat /tmp/mongosh_pf.log 2>/dev/null || echo "No port-forward logs available" - cleanup_port_forward - exit 1 -fi - -# Cleanup -cleanup_port_forward -rm -f /tmp/test_mongosh.js - -echo "✓ MongoDB connection test completed successfully on $ARCHITECTURE" diff --git a/operator/src/scripts/test-scripts/test-python-pymongo.sh b/operator/src/scripts/test-scripts/test-python-pymongo.sh deleted file mode 100755 index cb10d003..00000000 --- a/operator/src/scripts/test-scripts/test-python-pymongo.sh +++ /dev/null @@ -1,317 +0,0 @@ -#!/bin/bash - -# Python PyMongo Integration Test Script -# Tests MongoDB connection using PyMongo with comprehensive validation - -set -e - -# Default values -ARCHITECTURE="" -NAMESPACE="" -CLUSTER_NAME="" -POD_NAME="" -PORT="" -USERNAME="" -PASSWORD="" - -# Function to display usage -usage() { - echo "Usage: $0 [OPTIONS]" - echo "Options:" - echo " --architecture ARCH Target architecture for logging" - echo " --namespace NS Kubernetes namespace" - echo " --cluster-name NAME DocumentDB cluster name" - echo " --pod-name NAME Pod name (optional, defaults to CLUSTER_NAME-1)" - echo " --port PORT Port to forward and connect to" - echo " --username USER MongoDB username" - echo " --password PASS MongoDB password" - echo " --help Show this help" - exit 1 -} - -# Parse command line arguments -while [[ $# -gt 0 ]]; do - case $1 in - --architecture) - ARCHITECTURE="$2" - shift 2 - ;; - --namespace) - NAMESPACE="$2" - shift 2 - ;; - --cluster-name) - CLUSTER_NAME="$2" - shift 2 - ;; - --pod-name) - POD_NAME="$2" - shift 2 - ;; - --port) - PORT="$2" - shift 2 - ;; - --username) - USERNAME="$2" - shift 2 - ;; - --password) - PASSWORD="$2" - shift 2 - ;; - --help) - usage - ;; - *) - echo "Unknown option: $1" - usage - ;; - esac -done - -# Validate required parameters -if [[ -z "$ARCHITECTURE" || -z "$NAMESPACE" || -z "$CLUSTER_NAME" || -z "$PORT" || -z "$USERNAME" || -z "$PASSWORD" ]]; then - echo "Error: Missing required parameters" - usage -fi - -# Set default pod name if not provided -if [[ -z "$POD_NAME" ]]; then - POD_NAME="${CLUSTER_NAME}-1" -fi - -echo "Testing with Python PyMongo client on $ARCHITECTURE architecture..." -echo "Using pod: $POD_NAME" -echo "Port: $PORT" - -# Function to setup port forwarding with retry logic -setup_port_forward() { - local max_attempts=3 - local attempt=1 - - while [ $attempt -le $max_attempts ]; do - echo "Attempt $attempt: Setting up port forwarding to pod $POD_NAME in namespace $NAMESPACE..." - - # Start port forward in background - kubectl port-forward "pod/$POD_NAME" "$PORT:$PORT" -n "$NAMESPACE" & - PF_PID=$! - - # Give it some time to start - sleep 5 - - # Check if port forward is working by testing the connection - if timeout 30 bash -c "until nc -z 127.0.0.1 $PORT; do echo 'Waiting for port-forward...'; sleep 2; done"; then - echo "✓ Port forwarding established successfully on attempt $attempt" - return 0 - else - echo "❌ Port forwarding failed on attempt $attempt" - kill $PF_PID 2>/dev/null || true - sleep 2 - fi - - ((attempt++)) - done - - echo "❌ Failed to establish port forwarding after $max_attempts attempts" - return 1 -} - -# Function to cleanup port forwarding -cleanup_port_forward() { - if [[ -n "$PF_PID" ]]; then - echo "Cleaning up port forwarding (PID: $PF_PID)..." - kill $PF_PID 2>/dev/null || true - wait $PF_PID 2>/dev/null || true - PF_PID="" - fi -} - -# Set up cleanup trap -trap cleanup_port_forward EXIT - -# Install Python dependencies -echo "Installing Python dependencies..." -pip install pymongo - -# Setup port forwarding -if ! setup_port_forward; then - echo "Failed to setup port forwarding" - exit 1 -fi - -# Test connection and ensure port-forward is ready -echo "Verifying port-forward is ready..." -timeout 60 bash -c " -until nc -z 127.0.0.1 $PORT; do - echo 'Waiting for port-forward to be ready...' - sleep 2 -done -" - -echo "Port-forward is ready, running Python tests..." - -# Get the directory where this script is located -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Run the existing Python test script and validate it completes successfully -echo "Running existing Python test script on $ARCHITECTURE..." -echo "Using Python script: $SCRIPT_DIR/mongo-python-data-pusher.py" -if python3 "$SCRIPT_DIR/mongo-python-data-pusher.py"; then - echo "✓ Existing Python test script completed successfully on $ARCHITECTURE" -else - echo "❌ Existing Python test script failed on $ARCHITECTURE" - exit 1 -fi - -# Create and run comprehensive additional Python tests -cat > additional_test.py << EOF -from pymongo import MongoClient -import ssl -import sys - -def validate_test(condition, message): - if not condition: - print(f"❌ VALIDATION FAILED: {message}") - sys.exit(1) - print(f"✓ {message}") - -# Connection parameters -client = MongoClient( - "127.0.0.1", - $PORT, - username="$USERNAME", - password="$PASSWORD", - authSource="admin", - authMechanism="SCRAM-SHA-256", - tls=True, - tlsAllowInvalidCertificates=True -) - -# Test database operations -test_db = client["integration_test"] - -# Test collection operations -collection = test_db["test_collection"] - -# Clear any existing data -collection.drop() - -# Insert test data and validate -docs = [ - {"type": "integration_test", "value": i, "status": "active"} - for i in range(10) -] -result = collection.insert_many(docs) -print(f"Inserted {len(result.inserted_ids)} documents") - -# Validate insertion -validate_test(len(result.inserted_ids) == 10, "Inserted exactly 10 documents") -validate_test(all(isinstance(id, object) for id in result.inserted_ids), "All inserted IDs are valid ObjectIds") - -# Test queries and validate results -count = collection.count_documents({"status": "active"}) -print(f"Found {count} active documents") -validate_test(count == 10, "Found exactly 10 active documents") - -# Test specific value queries -value_5_docs = list(collection.find({"value": 5})) -validate_test(len(value_5_docs) == 1, "Found exactly 1 document with value 5") -validate_test(value_5_docs[0]["value"] == 5, "Document with value 5 has correct value") -validate_test(value_5_docs[0]["status"] == "active", "Document with value 5 has correct status") -validate_test(value_5_docs[0]["type"] == "integration_test", "Document with value 5 has correct type") - -# Test range queries -high_value_docs = list(collection.find({"value": {"\$gte": 7}})) -validate_test(len(high_value_docs) == 3, "Found exactly 3 documents with value >= 7") -expected_values = {7, 8, 9} -found_values = {doc["value"] for doc in high_value_docs} -validate_test(found_values == expected_values, f"High value documents have correct values: {found_values}") - -# Test aggregation and validate results -pipeline = [ - {"\$match": {"status": "active"}}, - {"\$group": {"_id": "\$status", "total": {"\$sum": "\$value"}, "count": {"\$sum": 1}}} -] -agg_result = list(collection.aggregate(pipeline)) -print(f"Aggregation result: {agg_result}") - -validate_test(len(agg_result) == 1, "Aggregation returned exactly 1 group") -validate_test(agg_result[0]["_id"] == "active", "Aggregation grouped by 'active' status") -expected_total = sum(range(10)) # 0+1+2+...+9 = 45 -validate_test(agg_result[0]["total"] == expected_total, f"Aggregation total is correct: {expected_total}") -validate_test(agg_result[0]["count"] == 10, "Aggregation count is correct: 10") - -# Test update operations -update_result = collection.update_many( - {"value": {"\$lt": 5}}, - {"\$set": {"status": "updated"}} -) -validate_test(update_result.modified_count == 5, f"Updated exactly 5 documents (got {update_result.modified_count})") - -# Validate update results -updated_docs = list(collection.find({"status": "updated"})) -validate_test(len(updated_docs) == 5, "Found exactly 5 updated documents") -updated_values = {doc["value"] for doc in updated_docs} -expected_updated_values = {0, 1, 2, 3, 4} -validate_test(updated_values == expected_updated_values, f"Updated documents have correct values: {updated_values}") - -# Test that non-updated documents are unchanged -active_docs = list(collection.find({"status": "active"})) -validate_test(len(active_docs) == 5, "Found exactly 5 still-active documents") -active_values = {doc["value"] for doc in active_docs} -expected_active_values = {5, 6, 7, 8, 9} -validate_test(active_values == expected_active_values, f"Active documents have correct values: {active_values}") - -# Test sorting -sorted_docs = list(collection.find().sort("value", -1)) # Descending order -validate_test(len(sorted_docs) == 10, "Sorted query returned all 10 documents") -sorted_values = [doc["value"] for doc in sorted_docs] -expected_sorted = list(range(9, -1, -1)) # [9, 8, 7, 6, 5, 4, 3, 2, 1, 0] -validate_test(sorted_values == expected_sorted, f"Documents sorted correctly: {sorted_values}") - -# Test complex aggregation with multiple stages -complex_pipeline = [ - {"\$match": {"value": {"\$gte": 3}}}, - {"\$group": {"_id": "\$status", "avg_value": {"\$avg": "\$value"}, "max_value": {"\$max": "\$value"}}}, - {"\$sort": {"_id": 1}} -] -complex_result = list(collection.aggregate(complex_pipeline)) -print(f"Complex aggregation result: {complex_result}") - -# Validate complex aggregation -validate_test(len(complex_result) == 2, "Complex aggregation returned 2 groups (active and updated)") - -# Find the results for each status -active_result = next((r for r in complex_result if r["_id"] == "active"), None) -updated_result = next((r for r in complex_result if r["_id"] == "updated"), None) - -validate_test(active_result is not None, "Found active group in complex aggregation") -validate_test(updated_result is not None, "Found updated group in complex aggregation") - -# For active status: values 5,6,7,8,9 -> avg = 7, max = 9 -validate_test(abs(active_result["avg_value"] - 7.0) < 0.001, f"Active group avg_value is correct: {active_result['avg_value']}") -validate_test(active_result["max_value"] == 9, f"Active group max_value is correct: {active_result['max_value']}") - -# For updated status: values 3,4 (only those >= 3) -> avg = 3.5, max = 4 -validate_test(abs(updated_result["avg_value"] - 3.5) < 0.001, f"Updated group avg_value is correct: {updated_result['avg_value']}") -validate_test(updated_result["max_value"] == 4, f"Updated group max_value is correct: {updated_result['max_value']}") - -print("All Python integration tests passed with validation!") -print(f"Test completed successfully on architecture: {sys.platform}") - -client.close() -EOF - -echo "Running Python validation tests on $ARCHITECTURE..." -if python3 additional_test.py; then - echo "✓ Python validation tests completed successfully on $ARCHITECTURE" -else - echo "❌ Python validation tests failed on $ARCHITECTURE" - exit 1 -fi - -# Cleanup temporary test file -rm -f additional_test.py - -echo "✅ All Python PyMongo tests completed successfully!" diff --git a/test/e2e/README.md b/test/e2e/README.md new file mode 100644 index 00000000..5c4365f6 --- /dev/null +++ b/test/e2e/README.md @@ -0,0 +1,262 @@ +# DocumentDB Operator E2E Test Suite + +## What this is + +A unified Go / Ginkgo v2 / Gomega end-to-end test suite that drives the +DocumentDB Kubernetes Operator against a real cluster. It replaces the four +legacy GitHub Actions workflows (`test-integration.yml`, `test-E2E.yml`, +`test-backup-and-restore.yml`, `test-upgrade-and-rollback.yml`) and their +bash / JavaScript (mongosh) / Python (pymongo) glue with a single Go module +at `test/e2e/`. Specs are organised by CRD operation (lifecycle, scale, data, +performance, backup, tls, feature gates, exposure, status, upgrade), reuse +CloudNative-PG's `tests/utils` packages as a library, and speak the Mongo +wire protocol via `go.mongodb.org/mongo-driver/v2`. Design rationale and +scope: [`docs/designs/e2e-test-suite.md`](../../docs/designs/e2e-test-suite.md). + +## Prereqs + +| Tool | Version | Notes | +|---|---|---| +| Go | 1.25.x (match `test/e2e/go.mod` — currently `go 1.25.8`) | Separate module from the operator | +| Docker | any recent | Required for kind | +| kind | any recent | Local Kubernetes | +| kubectl | matching cluster | | +| helm | 3.x | Operator install | +| `ginkgo` CLI | v2 | `go install github.com/onsi/ginkgo/v2/ginkgo@latest` | + +The suite itself installs no cluster components — it expects an already-running +cluster with the operator deployed. Backup specs additionally need the CSI +snapshot CRDs; TLS cert-manager specs need cert-manager. Both gate with a +runtime probe and `Skip()` rather than fail when the dependency is missing. + +## Quick start + +From the repository root: + +```bash +# 1. Build images + bring up a kind cluster + install the operator + CRDs. +# The script in scripts/development/deploy.sh drives `make deploy` and the +# same composite action (.github/actions/setup-test-environment) CI uses. +cd operator/src +DEPLOY=true DEPLOY_CLUSTER=true ./scripts/development/deploy.sh +cd - + +# 2. Run the smoke label against that cluster. +cd test/e2e +ginkgo -r --label-filter=smoke ./tests/... +``` + +Run a single area: + +```bash +ginkgo -r --label-filter=lifecycle ./tests/... +ginkgo -r --label-filter='data && level:low' ./tests/data +``` + +## Layout + +``` +test/e2e/ +├── go.mod, go.sum # separate module; pins CNPG test utils +├── suite.go # SetupSuite / TeardownSuite; env + run-id wiring +├── suite_test.go # SynchronizedBeforeSuite entry point +├── labels.go # Ginkgo label constants (area + cross-cutting) +├── levels.go # TEST_DEPTH → Level gate (CurrentLevel, SkipUnlessLevel) +├── runid.go # E2E_RUN_ID resolver (stable per-process id) +├── manifests/ +│ ├── base/ # documentdb.yaml.template — the base CR +│ ├── mixins/ # composable overlays (tls_*, exposure_*, storage_*, …) +│ └── backup/ # backup / scheduled_backup / recovery CR templates +├── pkg/e2eutils/ # helper packages imported by every area suite +└── tests/ # one Go package per functional area + ├── lifecycle/ scale/ data/ performance/ status/ + ├── backup/ tls/ feature_gates/ exposure/ upgrade/ +``` + +## Labels & depth + +Labels live in [`labels.go`](labels.go) and are attached either to the area +suite's top-level `Describe` (area labels) or to individual specs (cross-cutting +and capability labels). + +| Group | Labels | +|---|---| +| Area | `lifecycle`, `scale`, `data`, `performance`, `backup`, `recovery`, `tls`, `feature-gates`, `exposure`, `status`, `upgrade` | +| Cross-cutting | `smoke`, `basic`, `destructive`, `disruptive`, `slow` | +| Capability | `needs-cert-manager`, `needs-metallb`, `needs-csi-snapshots`, `needs-csi-resize` | +| Depth | `level:lowest`, `level:low`, `level:medium`, `level:high`, `level:highest` | + +**Depth gate.** `TEST_DEPTH` takes an integer 0–4 mapping to +`Highest` (0), `High`, `Medium`, `Low`, `Lowest` (4). Default is `Medium` (2) +— the authoritative gate is `e2e.SkipUnlessLevel(e2e.Medium)`, which reads +`TEST_DEPTH` at runtime and `Skip()`s when the configured depth is shallower. +The `level:*` labels are informational duplicates for Ginkgo's `--label-filter`. +(CNPG v1.28.1 does not currently export a `tests/utils/levels` package; +[`levels.go`](levels.go) is our local implementation and will be replaced +with a thin re-export if upstream adds one.) + +Examples: + +```bash +# Fast smoke — typically Highest depth +TEST_DEPTH=0 ginkgo -r --label-filter=smoke ./tests/... + +# Full backup area at default depth, skipping clusters without CSI snapshots +ginkgo -r --label-filter='backup && !needs-csi-snapshots' ./tests/backup + +# Nightly: everything +TEST_DEPTH=4 ginkgo -r --procs=4 ./tests/... + +# Upgrade suite (disruptive — runs serial, owns its own operator install) +E2E_UPGRADE=1 E2E_UPGRADE_PREVIOUS_CHART=… \ + ginkgo --procs=1 --label-filter=upgrade ./tests/upgrade +``` + +## Environment variables + +| Variable | Default | Purpose | +|---|---|---| +| `TEST_DEPTH` | `2` (Medium) | Depth gate; 0=Highest … 4=Lowest | +| `E2E_RUN_ID` | auto-generated | Stable id stamped onto shared fixtures + cluster-scoped objects. Set this in CI so parallel Ginkgo binaries share fixtures; leave **unset locally** — an auto-generated id is safer for ad-hoc runs | +| `E2E_ARTIFACTS_DIR` | `./_artifacts//proc-/` | Override the JUnit / log dump directory | +| `DOCUMENTDB_IMAGE` | chart default | Overrides the extension image used by fresh fixtures | +| `GATEWAY_IMAGE` | chart default | Overrides the gateway image used by fresh fixtures | +| `E2E_STORAGE_CLASS` | cluster default | StorageClass for fresh fixtures | +| `E2E_STORAGE_SIZE` | `1Gi` | PVC size for fresh fixtures | +| `GINKGO_PARALLEL_PROCESS` | set by Ginkgo | Consumed; do not set manually | +| `POSTGRES_IMG` | dummy stub | Set by `testenv` to satisfy CNPG's `TestingEnvironment`; do not override | + +**Upgrade area (gated behind `E2E_UPGRADE=1`):** + +| Variable | Purpose | +|---|---| +| `E2E_UPGRADE` | Must be `1` or every spec in `tests/upgrade/` Skips | +| `E2E_UPGRADE_PREVIOUS_CHART` | OCI or path ref for the "old" operator chart | +| `E2E_UPGRADE_PREVIOUS_VERSION` | Chart version string for the old chart | +| `E2E_UPGRADE_CURRENT_CHART` | Chart ref for the "new" (built-from-tree) chart | +| `E2E_UPGRADE_CURRENT_VERSION` | Optional — defaults to chart's own version | +| `E2E_UPGRADE_RELEASE` | Helm release name | +| `E2E_UPGRADE_OPERATOR_NS` | Operator namespace | +| `E2E_UPGRADE_OLD_DOCUMENTDB_IMAGE` | Extension image used before upgrade | +| `E2E_UPGRADE_NEW_DOCUMENTDB_IMAGE` | Extension image used after upgrade | + +> A note on `E2E_KEEP_CLUSTERS`: the design doc discusses a keep-clusters +> flag, but no such knob is honored by the current suite code. Skip-on-prereq +> is the intended mechanism; to inspect a cluster after a failing spec, pass +> `--fail-fast` and manually defer cluster teardown outside the suite. + +**Missing prereqs are `Skip()`, not `Fail()`.** Backup specs probe the +`VolumeSnapshot`/`VolumeSnapshotClass` CRDs at runtime (`Skip` when absent), +and `tls/tls_certmanager_test.go` probes the `cert-manager.io/v1` API group +the same way. The capability labels (`needs-csi-snapshots`, `needs-cert-manager`, +`needs-metallb`, `needs-csi-resize`) let you filter these out up front if +you already know your environment. + +## Adding a new test + +**Adding a spec to an existing area.** Create a new `*_test.go` in +`tests//`, import the area suite's label, attach the right depth +label, and use the suite's shared fixture rather than a fresh cluster when +the spec is read-only: + +```go +var _ = Describe("my new behavior", Label(e2e.DataLabel), e2e.MediumLevelLabel, func() { + It("does the thing", func(sctx SpecContext) { + e2e.SkipUnlessLevel(e2e.Medium) + // ... sharedROCluster is available via the area's BeforeAll + }) +}) +``` + +**Adding a new area package.** Create `tests//`, add +`_suite_test.go` that calls `e2e.SetupSuite` / `e2e.TeardownSuite`, +define an area label in `labels.go`, and attach it to the top-level +`Describe`. Mirror an existing area — `tests/status/` is the smallest +reference for read-only areas; `tests/lifecycle/` for mutating ones. + +**Adding a new manifest mixin.** Drop a `.yaml.template` under +`manifests/mixins/` and pass its stem via `CreateOptions.Mixins` to +`documentdb.Create`. Note the merge semantics: `RenderCR` produces a +multi-document YAML stream (one doc per template) and `Create` deep-merges +them into a single DocumentDB object before applying — maps merge recursively, +**scalars and slices in later mixins overwrite earlier values**. The public +`RenderCR` still returns the raw multi-doc bytes (useful for artifact dumps +or manual `kubectl apply`). + +**Adding a new assertion.** Put the reusable verb in +`pkg/e2eutils/assertions/assertions.go`. Assertions return `func() error` +so callers can wrap them in `Eventually(...).Should(Succeed())`. + +## Helper packages (`pkg/e2eutils/`) + +| Package | Role | +|---|---| +| `testenv/` | Wraps CNPG's `environment.TestingEnvironment` with dummy `POSTGRES_IMG`; registers our `api/preview` scheme on the typed `client.Client`. | +| `documentdb/` | DocumentDB CR verbs: `RenderCR` (base + mixin envsubst), `Create` (multi-doc merge), `PatchSpec`, `WaitHealthy`, `Delete`, `List`. | +| `mongo/` | `go.mongodb.org/mongo-driver/v2` client builder, seed/probe/count helpers; owns the 10 s post-port-forward ping retry budget (`connectRetryTimeout`). | +| `portforward/` | Thin wrapper over CNPG's `forwardconnection` for the DocumentDB gateway port. | +| `assertions/` | Composable Gomega verbs (`AssertDocumentDBReady`, `AssertInstanceCount`, `AssertPrimaryUnchanged`, `AssertPVCCount`, `AssertTLSSecretReady`, `AssertServiceType`, `AssertConnectionStringMatches`). | +| `timeouts/` | DocumentDB-specific overrides layered on top of CNPG's `timeouts` map (`DocumentDBReady`, `DocumentDBUpgrade`, `InstanceScale`, `PVCResize`). | +| `seed/` | Canonical datasets (`SmallDataset(10)`, `MediumDataset(1000)`, sort/agg fixtures) shared by data / performance / backup / upgrade specs. | +| `fixtures/` | Session-scoped shared clusters (`shared_ro.go`, `shared_scale.go`) and lazy MinIO (`minio.go`). Honors `E2E_RUN_ID`, `DOCUMENTDB_IMAGE`, `GATEWAY_IMAGE`, `E2E_STORAGE_CLASS`, `E2E_STORAGE_SIZE`. | +| `namespaces/` | Per-proc, run-id-scoped namespace naming (`e2e--`). | +| `operatorhealth/` | Operator-pod UID + restart-count gate; flips a package sentinel on churn so subsequent non-`disruptive`/`upgrade` specs skip. | +| `clusterprobe/` | Capability probes (CSI snapshot CRDs, cert-manager, StorageClass resize support) used by area `Skip*` helpers. | +| `backup/` | Helpers for asserting `Backup` / `ScheduledBackup` CR state, snapshot readiness, and MinIO object inspection. | +| `tlscerts/` | Self-signed + provided-mode certificate material builders used by `tests/tls/`. | +| `helmop/` | Helm install/upgrade/uninstall for the upgrade suite (multi-phase operator lifecycle). | + +## CI + +The suite is driven by [`.github/workflows/test-e2e.yml`](../../.github/workflows/test-e2e.yml) +(owned by the CI workflow migration; the file may not yet be present in +every working tree — it is added as part of the Phase 3 rollout). The +workflow fans out into nine label-grouped jobs: + +| Job | `--label-filter` | `--procs` | +|---|---|---| +| `smoke` | `smoke` | auto | +| `lifecycle` | `lifecycle` | auto | +| `scale` | `scale` | 2 | +| `data` | `data` | auto | +| `performance` | `performance` | 1 (dedicated runner) | +| `backup` | `backup` | 2 | +| `tls` | `tls` | auto | +| `feature` | `feature-gates \|\| exposure \|\| status` | auto | +| `upgrade` | `upgrade` | 1 | + +Each job runs `setup-test-environment` → `ginkgo -r --label-filter=… +--junit-report=junit.xml ./tests/...` → upload JUnit + logs. +`workflow_dispatch` exposes `label` and `depth` inputs for ad-hoc runs. + +## Troubleshooting + +- **Port-forward / Mongo connect fails with "connection refused."** The + post-port-forward retry budget is 10 s at 100 ms backoff + (`mongo/connect.go`: `connectRetryTimeout` / `connectRetryBackoff`). If + you consistently exceed it, the gateway pod is probably not Ready — check + the DocumentDB CR status and the gateway container logs. +- **Backup specs all Skip.** Your cluster lacks the CSI snapshot CRDs + (`VolumeSnapshotClass`, `VolumeSnapshot`) or the configured StorageClass + isn't backed by a snapshot-capable CSI driver. `scripts/test-scripts/deploy-csi-driver.sh` + under `operator/src/` installs a hostpath CSI driver suitable for kind. +- **TLS cert-manager spec Skips.** `cert-manager.io/v1` isn't served; install + cert-manager (the `setup-test-environment` composite does this for you). +- **"E2E_RUN_ID was not set" warning in CI logs.** The suite auto-generates + a run id, but cross-binary fixture sharing relies on every Ginkgo invocation + in a CI job seeing the same value. Export `E2E_RUN_ID="${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"` + (or similar) once at the top of the job. +- **Operator churn aborts subsequent specs.** `operatorhealth.Gate` snapshots + the operator pod's UID + restart count at suite start; any drift flips a + package sentinel and skips every subsequent non-`disruptive`/`upgrade` spec. + This is working as intended — investigate why the operator restarted. + +## CNPG dependency & pin policy + +The suite imports CloudNative-PG's `tests/utils/*` packages as a library +(Apache-2.0, compatible with our MIT). The version is pinned in +[`go.mod`](go.mod) — currently `github.com/cloudnative-pg/cloudnative-pg +v1.28.1`. `tests/utils/*` is exported (not `internal/`) but has no stability +contract; budget roughly half a day per CNPG version bump for compat fixes +in our wrappers (`testenv`, `operatorhealth`, `portforward`). Bumps should +be single-purpose PRs gated on the full suite. diff --git a/test/e2e/go.mod b/test/e2e/go.mod new file mode 100644 index 00000000..777fbf1d --- /dev/null +++ b/test/e2e/go.mod @@ -0,0 +1,110 @@ +module github.com/documentdb/documentdb-operator/test/e2e + +go 1.25.9 + +require ( + github.com/cloudnative-pg/cloudnative-pg v1.28.1 + github.com/documentdb/documentdb-operator v0.0.0-00010101000000-000000000000 + github.com/kubernetes-csi/external-snapshotter/client/v8 v8.4.0 + github.com/onsi/ginkgo/v2 v2.28.1 + github.com/onsi/gomega v1.39.1 + go.mongodb.org/mongo-driver/v2 v2.5.1 + k8s.io/api v0.35.0 + k8s.io/apimachinery v0.35.0 + k8s.io/client-go v0.35.0 + sigs.k8s.io/controller-runtime v0.22.4 + sigs.k8s.io/yaml v1.6.0 +) + +require ( + github.com/Masterminds/semver/v3 v3.4.0 // indirect + github.com/avast/retry-go/v5 v5.0.0 // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/cloudnative-pg/barman-cloud v0.4.1-0.20260108104508-ced266c145f5 // indirect + github.com/cloudnative-pg/cnpg-i v0.5.0 // indirect + github.com/cloudnative-pg/machinery v0.3.3 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/emicklei/go-restful/v3 v3.13.0 // indirect + github.com/evanphx/json-patch/v5 v5.9.11 // indirect + github.com/fsnotify/fsnotify v1.9.0 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/zapr v1.3.0 // indirect + github.com/go-openapi/jsonpointer v0.22.4 // indirect + github.com/go-openapi/jsonreference v0.21.4 // indirect + github.com/go-openapi/swag v0.25.4 // indirect + github.com/go-openapi/swag/cmdutils v0.25.4 // indirect + github.com/go-openapi/swag/conv v0.25.4 // indirect + github.com/go-openapi/swag/fileutils v0.25.4 // indirect + github.com/go-openapi/swag/jsonname v0.25.4 // indirect + github.com/go-openapi/swag/jsonutils v0.25.4 // indirect + github.com/go-openapi/swag/loading v0.25.4 // indirect + github.com/go-openapi/swag/mangling v0.25.4 // indirect + github.com/go-openapi/swag/netutils v0.25.4 // indirect + github.com/go-openapi/swag/stringutils v0.25.4 // indirect + github.com/go-openapi/swag/typeutils v0.25.4 // indirect + github.com/go-openapi/swag/yamlutils v0.25.4 // indirect + github.com/go-task/slim-sprig/v3 v3.0.0 // indirect + github.com/google/btree v1.1.3 // indirect + github.com/google/gnostic-models v0.7.1 // indirect + github.com/google/go-cmp v0.7.0 // indirect + github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83 // indirect + github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect + github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.3 // indirect + github.com/jackc/puddle/v2 v2.2.2 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 // indirect + github.com/klauspost/compress v1.18.0 // indirect + github.com/lib/pq v1.12.0 // indirect + github.com/moby/spdystream v0.5.1 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.87.1 // indirect + github.com/prometheus/client_golang v1.23.2 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.67.4 // indirect + github.com/prometheus/procfs v0.19.2 // indirect + github.com/robfig/cron v1.2.0 // indirect + github.com/spf13/pflag v1.0.10 // indirect + github.com/thoas/go-funk v0.9.3 // indirect + github.com/x448/float16 v0.8.4 // indirect + github.com/xdg-go/pbkdf2 v1.0.0 // indirect + github.com/xdg-go/scram v1.2.0 // indirect + github.com/xdg-go/stringprep v1.0.4 // indirect + github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect + go.uber.org/multierr v1.11.0 // indirect + go.uber.org/zap v1.27.1 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/crypto v0.47.0 // indirect + golang.org/x/mod v0.32.0 // indirect + golang.org/x/net v0.49.0 // indirect + golang.org/x/oauth2 v0.34.0 // indirect + golang.org/x/sync v0.19.0 // indirect + golang.org/x/sys v0.42.0 // indirect + golang.org/x/term v0.41.0 // indirect + golang.org/x/text v0.33.0 // indirect + golang.org/x/time v0.14.0 // indirect + golang.org/x/tools v0.41.0 // indirect + gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect + google.golang.org/grpc v1.79.3 // indirect + google.golang.org/protobuf v1.36.11 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + k8s.io/apiextensions-apiserver v0.35.0 // indirect + k8s.io/klog/v2 v2.130.1 // indirect + k8s.io/kube-openapi v0.0.0-20251125145642-4e65d59e963e // indirect + k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect + sigs.k8s.io/randfill v1.0.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 // indirect +) + +replace github.com/documentdb/documentdb-operator => ../../operator/src diff --git a/test/e2e/go.sum b/test/e2e/go.sum new file mode 100644 index 00000000..94c5041d --- /dev/null +++ b/test/e2e/go.sum @@ -0,0 +1,303 @@ +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= +github.com/avast/retry-go/v5 v5.0.0 h1:kf1Qc2UsTZ4qq8elDymqfbISvkyMuhgRxuJqX2NHP7k= +github.com/avast/retry-go/v5 v5.0.0/go.mod h1://d+usmKWio1agtZfS1H/ltTqwtIfBnRq9zEwjc3eH8= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cloudnative-pg/barman-cloud v0.4.1-0.20260108104508-ced266c145f5 h1:wPB7VTNgTv6t9sl4QYOBakmVTqHnOdKUht7Q3aL+uns= +github.com/cloudnative-pg/barman-cloud v0.4.1-0.20260108104508-ced266c145f5/go.mod h1:qD0NtJOllNQbRB0MaleuHsZjFYaXtXfdg0HbFTbuHn0= +github.com/cloudnative-pg/cloudnative-pg v1.28.1 h1:HdOUWgFhta558uHfXeO/199qCApxaj5yi05x6nWNmgs= +github.com/cloudnative-pg/cloudnative-pg v1.28.1/go.mod h1:yhRa4GqJAjNd0tT9AiRgk1KdqLhMjo/JmGGoASRl2CU= +github.com/cloudnative-pg/cnpg-i v0.5.0 h1:/TOzpNT6cwNgrpftTtrnLKdoHgMwd+88vZgXjlVgXeE= +github.com/cloudnative-pg/cnpg-i v0.5.0/go.mod h1:7Gh4+UzhBpGhr4DreB1GN9wGYfvxwXCXZUyVt3zE/3I= +github.com/cloudnative-pg/machinery v0.3.3 h1:CaqXqLTJH9RrVv3R/YU0NmFaI/F18HLg2JfH3mQLcDk= +github.com/cloudnative-pg/machinery v0.3.3/go.mod h1:RYAYlVKBF5pH4mg+Q8wHjNDyENV9ajbkG41zOEf8DEs= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= +github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch v5.6.0+incompatible h1:jBYDEEiFBPxA0v50tFdvOzQQTCvpL6mnFh5mB2/l16U= +github.com/evanphx/json-patch v5.6.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= +github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs= +github.com/gkampitakis/ciinfo v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= +github.com/gkampitakis/go-diff v1.3.2 h1:Qyn0J9XJSDTgnsgHRdz9Zp24RaJeKMUHg2+PDZZdC4M= +github.com/gkampitakis/go-diff v1.3.2/go.mod h1:LLgOrpqleQe26cte8s36HTWcTmMEur6OPYerdAAS9tk= +github.com/gkampitakis/go-snaps v0.5.15 h1:amyJrvM1D33cPHwVrjo9jQxX8g/7E2wYdZ+01KS3zGE= +github.com/gkampitakis/go-snaps v0.5.15/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= +github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= +github.com/go-openapi/jsonpointer v0.22.4 h1:dZtK82WlNpVLDW2jlA1YCiVJFVqkED1MegOUy9kR5T4= +github.com/go-openapi/jsonpointer v0.22.4/go.mod h1:elX9+UgznpFhgBuaMQ7iu4lvvX1nvNsesQ3oxmYTw80= +github.com/go-openapi/jsonreference v0.21.4 h1:24qaE2y9bx/q3uRK/qN+TDwbok1NhbSmGjjySRCHtC8= +github.com/go-openapi/jsonreference v0.21.4/go.mod h1:rIENPTjDbLpzQmQWCj5kKj3ZlmEh+EFVbz3RTUh30/4= +github.com/go-openapi/swag v0.25.4 h1:OyUPUFYDPDBMkqyxOTkqDYFnrhuhi9NR6QVUvIochMU= +github.com/go-openapi/swag v0.25.4/go.mod h1:zNfJ9WZABGHCFg2RnY0S4IOkAcVTzJ6z2Bi+Q4i6qFQ= +github.com/go-openapi/swag/cmdutils v0.25.4 h1:8rYhB5n6WawR192/BfUu2iVlxqVR9aRgGJP6WaBoW+4= +github.com/go-openapi/swag/cmdutils v0.25.4/go.mod h1:pdae/AFo6WxLl5L0rq87eRzVPm/XRHM3MoYgRMvG4A0= +github.com/go-openapi/swag/conv v0.25.4 h1:/Dd7p0LZXczgUcC/Ikm1+YqVzkEeCc9LnOWjfkpkfe4= +github.com/go-openapi/swag/conv v0.25.4/go.mod h1:3LXfie/lwoAv0NHoEuY1hjoFAYkvlqI/Bn5EQDD3PPU= +github.com/go-openapi/swag/fileutils v0.25.4 h1:2oI0XNW5y6UWZTC7vAxC8hmsK/tOkWXHJQH4lKjqw+Y= +github.com/go-openapi/swag/fileutils v0.25.4/go.mod h1:cdOT/PKbwcysVQ9Tpr0q20lQKH7MGhOEb6EwmHOirUk= +github.com/go-openapi/swag/jsonname v0.25.4 h1:bZH0+MsS03MbnwBXYhuTttMOqk+5KcQ9869Vye1bNHI= +github.com/go-openapi/swag/jsonname v0.25.4/go.mod h1:GPVEk9CWVhNvWhZgrnvRA6utbAltopbKwDu8mXNUMag= +github.com/go-openapi/swag/jsonutils v0.25.4 h1:VSchfbGhD4UTf4vCdR2F4TLBdLwHyUDTd1/q4i+jGZA= +github.com/go-openapi/swag/jsonutils v0.25.4/go.mod h1:7OYGXpvVFPn4PpaSdPHJBtF0iGnbEaTk8AvBkoWnaAY= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4 h1:IACsSvBhiNJwlDix7wq39SS2Fh7lUOCJRmx/4SN4sVo= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4/go.mod h1:Mt0Ost9l3cUzVv4OEZG+WSeoHwjWLnarzMePNDAOBiM= +github.com/go-openapi/swag/loading v0.25.4 h1:jN4MvLj0X6yhCDduRsxDDw1aHe+ZWoLjW+9ZQWIKn2s= +github.com/go-openapi/swag/loading v0.25.4/go.mod h1:rpUM1ZiyEP9+mNLIQUdMiD7dCETXvkkC30z53i+ftTE= +github.com/go-openapi/swag/mangling v0.25.4 h1:2b9kBJk9JvPgxr36V23FxJLdwBrpijI26Bx5JH4Hp48= +github.com/go-openapi/swag/mangling v0.25.4/go.mod h1:6dxwu6QyORHpIIApsdZgb6wBk/DPU15MdyYj/ikn0Hg= +github.com/go-openapi/swag/netutils v0.25.4 h1:Gqe6K71bGRb3ZQLusdI8p/y1KLgV4M/k+/HzVSqT8H0= +github.com/go-openapi/swag/netutils v0.25.4/go.mod h1:m2W8dtdaoX7oj9rEttLyTeEFFEBvnAx9qHd5nJEBzYg= +github.com/go-openapi/swag/stringutils v0.25.4 h1:O6dU1Rd8bej4HPA3/CLPciNBBDwZj9HiEpdVsb8B5A8= +github.com/go-openapi/swag/stringutils v0.25.4/go.mod h1:GTsRvhJW5xM5gkgiFe0fV3PUlFm0dr8vki6/VSRaZK0= +github.com/go-openapi/swag/typeutils v0.25.4 h1:1/fbZOUN472NTc39zpa+YGHn3jzHWhv42wAJSN91wRw= +github.com/go-openapi/swag/typeutils v0.25.4/go.mod h1:Ou7g//Wx8tTLS9vG0UmzfCsjZjKhpjxayRKTHXf2pTE= +github.com/go-openapi/swag/yamlutils v0.25.4 h1:6jdaeSItEUb7ioS9lFoCZ65Cne1/RZtPBZ9A56h92Sw= +github.com/go-openapi/swag/yamlutils v0.25.4/go.mod h1:MNzq1ulQu+yd8Kl7wPOut/YHAAU/H6hL91fF+E2RFwc= +github.com/go-openapi/testify/enable/yaml/v2 v2.0.2 h1:0+Y41Pz1NkbTHz8NngxTuAXxEodtNSI1WG1c/m5Akw4= +github.com/go-openapi/testify/enable/yaml/v2 v2.0.2/go.mod h1:kme83333GCtJQHXQ8UKX3IBZu6z8T5Dvy5+CW3NLUUg= +github.com/go-openapi/testify/v2 v2.0.2 h1:X999g3jeLcoY8qctY/c/Z8iBHTbwLz7R2WXd6Ub6wls= +github.com/go-openapi/testify/v2 v2.0.2/go.mod h1:HCPmvFFnheKK2BuwSA0TbbdxJ3I16pjwMkYkP4Ywn54= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= +github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= +github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= +github.com/google/gnostic-models v0.7.1 h1:SisTfuFKJSKM5CPZkffwi6coztzzeYUhc3v4yxLWH8c= +github.com/google/gnostic-models v0.7.1/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83 h1:z2ogiKUYzX5Is6zr/vP9vJGqPwcdqsWjOt+V8J7+bTc= +github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83/go.mod h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI= +github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4= +github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= +github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.3 h1:B+8ClL/kCQkRiU82d9xajRPKYMrB7E0MbtzWVi1K4ns= +github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.3/go.mod h1:NbCUVmiS4foBGBHOYlCT25+YmGpJ32dZPi75pGEUpj4= +github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo= +github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= +github.com/joshdk/go-junit v1.0.0 h1:S86cUKIdwBHWwA6xCmFlf3RTLfVXYQfvanM5Uh+K6GE= +github.com/joshdk/go-junit v1.0.0/go.mod h1:TiiV0PqkaNfFXjEiyjWM3XXrhVyCa1K4Zfga6W52ung= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs= +github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kubernetes-csi/external-snapshotter/client/v8 v8.4.0 h1:bMqrb3UHgHbP+PW9VwiejfDJU1R0PpXVZNMdeH8WYKI= +github.com/kubernetes-csi/external-snapshotter/client/v8 v8.4.0/go.mod h1:E3vdYxHj2C2q6qo8/Da4g7P+IcwqRZyy3gJBzYybV9Y= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/lib/pq v1.12.0 h1:mC1zeiNamwKBecjHarAr26c/+d8V5w/u4J0I/yASbJo= +github.com/lib/pq v1.12.0/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA= +github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= +github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= +github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3vE= +github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A= +github.com/moby/spdystream v0.5.1 h1:9sNYeYZUcci9R6/w7KDaFWEWeV4LStVG78Mpyq/Zm/Y= +github.com/moby/spdystream v0.5.1/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= +github.com/onsi/ginkgo/v2 v2.28.1 h1:S4hj+HbZp40fNKuLUQOYLDgZLwNUVn19N3Atb98NCyI= +github.com/onsi/ginkgo/v2 v2.28.1/go.mod h1:CLtbVInNckU3/+gC8LzkGUb9oF+e8W8TdUsxPwvdOgE= +github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28= +github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.87.1 h1:wyKanf+IFdbIqbDNYGt+f1dabLErLWtBaxd0KaAx4aM= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.87.1/go.mod h1:WHiLZmOWVop/MoYvRD58LfnPeyE+dcITby/jQjg83Hw= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.67.4 h1:yR3NqWO1/UyO1w2PhUvXlGQs/PtFmoveVO0KZ4+Lvsc= +github.com/prometheus/common v0.67.4/go.mod h1:gP0fq6YjjNCLssJCQp0yk4M8W6ikLURwkdd/YKtTbyI= +github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws= +github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw= +github.com/robfig/cron v1.2.0 h1:ZjScXvvxeQ63Dbyxy76Fj3AT3Ut0aKsyd2/tl3DTMuQ= +github.com/robfig/cron v1.2.0/go.mod h1:JGuDeoQd7Z6yL4zQhZ3OPEVHB7fL6Ka6skscFHfmt2k= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= +github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= +github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/thoas/go-funk v0.9.3 h1:7+nAEx3kn5ZJcnDm2Bh23N2yOtweO14bi//dvRtgLpw= +github.com/thoas/go-funk v0.9.3/go.mod h1:+IWnUfUmFO1+WVYQWQtIJHeRRdaIyyYglZN7xzUPe4Q= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= +github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= +github.com/xdg-go/scram v1.2.0 h1:bYKF2AEwG5rqd1BumT4gAnvwU/M9nBp2pTSxeZw7Wvs= +github.com/xdg-go/scram v1.2.0/go.mod h1:3dlrS0iBaWKYVt2ZfA4cj48umJZ+cAEbR6/SjLA88I8= +github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= +github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.mongodb.org/mongo-driver/v2 v2.5.1 h1:j2U/Qp+wvueSpqitLCSZPT/+ZpVc1xzuwdHWwl7d8ro= +go.mongodb.org/mongo-driver/v2 v2.5.1/go.mod h1:yOI9kBsufol30iFsl1slpdq1I0eHPzybRWdyYUs8K/0= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/otel v1.40.0 h1:oA5YeOcpRTXq6NN7frwmwFR0Cn3RhTVZvXsP4duvCms= +go.opentelemetry.io/otel v1.40.0/go.mod h1:IMb+uXZUKkMXdPddhwAHm6UfOwJyh4ct1ybIlV14J0g= +go.opentelemetry.io/otel/metric v1.40.0 h1:rcZe317KPftE2rstWIBitCdVp89A2HqjkxR3c11+p9g= +go.opentelemetry.io/otel/metric v1.40.0/go.mod h1:ib/crwQH7N3r5kfiBZQbwrTge743UDc7DTFVZrrXnqc= +go.opentelemetry.io/otel/sdk v1.40.0 h1:KHW/jUzgo6wsPh9At46+h4upjtccTmuZCFAc9OJ71f8= +go.opentelemetry.io/otel/sdk v1.40.0/go.mod h1:Ph7EFdYvxq72Y8Li9q8KebuYUr2KoeyHx0DRMKrYBUE= +go.opentelemetry.io/otel/sdk/metric v1.39.0 h1:cXMVVFVgsIf2YL6QkRF4Urbr/aMInf+2WKg+sEJTtB8= +go.opentelemetry.io/otel/sdk/metric v1.39.0/go.mod h1:xq9HEVH7qeX69/JnwEfp6fVq5wosJsY1mt4lLfYdVew= +go.opentelemetry.io/otel/trace v1.40.0 h1:WA4etStDttCSYuhwvEa8OP8I5EWu24lkOzp+ZYblVjw= +go.opentelemetry.io/otel/trace v1.40.0/go.mod h1:zeAhriXecNGP/s2SEG3+Y8X9ujcJOTqQ5RgdEJcawiA= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.27.1 h1:08RqriUEv8+ArZRYSTXy1LeBScaMpVSTBhCeaZYfMYc= +go.uber.org/zap v1.27.1/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8= +golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c= +golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= +golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= +golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= +golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= +golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= +golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU= +golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= +golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= +golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= +golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc= +golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0= +gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 h1:gRkg/vSppuSQoDjxyiGfN4Upv/h/DQmIR10ZU8dh4Ww= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= +google.golang.org/grpc v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE= +google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ= +google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= +google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/api v0.35.0 h1:iBAU5LTyBI9vw3L5glmat1njFK34srdLmktWwLTprlY= +k8s.io/api v0.35.0/go.mod h1:AQ0SNTzm4ZAczM03QH42c7l3bih1TbAXYo0DkF8ktnA= +k8s.io/apiextensions-apiserver v0.35.0 h1:3xHk2rTOdWXXJM+RDQZJvdx0yEOgC0FgQ1PlJatA5T4= +k8s.io/apiextensions-apiserver v0.35.0/go.mod h1:E1Ahk9SADaLQ4qtzYFkwUqusXTcaV2uw3l14aqpL2LU= +k8s.io/apimachinery v0.35.0 h1:Z2L3IHvPVv/MJ7xRxHEtk6GoJElaAqDCCU0S6ncYok8= +k8s.io/apimachinery v0.35.0/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= +k8s.io/client-go v0.35.0 h1:IAW0ifFbfQQwQmga0UdoH0yvdqrbwMdq9vIFEhRpxBE= +k8s.io/client-go v0.35.0/go.mod h1:q2E5AAyqcbeLGPdoRB+Nxe3KYTfPce1Dnu1myQdqz9o= +k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= +k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= +k8s.io/kube-openapi v0.0.0-20251125145642-4e65d59e963e h1:iW9ChlU0cU16w8MpVYjXk12dqQ4BPFBEgif+ap7/hqQ= +k8s.io/kube-openapi v0.0.0-20251125145642-4e65d59e963e/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= +k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2 h1:AZYQSJemyQB5eRxqcPky+/7EdBj0xi3g0ZcxxJ7vbWU= +k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk= +sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A= +sigs.k8s.io/controller-runtime v0.22.4/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= +sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 h1:2WOzJpHUBVrrkDjU4KBT8n5LDcj824eX0I5UKcgeRUs= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/test/e2e/labels.go b/test/e2e/labels.go new file mode 100644 index 00000000..8ac02cba --- /dev/null +++ b/test/e2e/labels.go @@ -0,0 +1,59 @@ +// Package e2e contains the DocumentDB Kubernetes Operator end-to-end test +// suite. See docs/designs/e2e-test-suite.md for the full design. +package e2e + +import "github.com/onsi/ginkgo/v2" + +// Ginkgo label constants used to select subsets of the DocumentDB E2E test +// suite at invocation time. Each area suite in tests// applies its +// matching area label to every spec it runs; cross-cutting labels +// (Smoke/Basic/Destructive/Disruptive/Slow and the NeedsXxx capability +// labels) are applied by individual specs. +// +// Keep these in sync with the design document. +const ( + // Area labels — one per test area (tests//). + LifecycleLabel = "lifecycle" + ScaleLabel = "scale" + DataLabel = "data" + PerformanceLabel = "performance" + BackupLabel = "backup" + RecoveryLabel = "recovery" + TLSLabel = "tls" + FeatureLabel = "feature-gates" + ExposureLabel = "exposure" + StatusLabel = "status" + UpgradeLabel = "upgrade" + + // Cross-cutting selectors. + SmokeLabel = "smoke" + BasicLabel = "basic" + DestructiveLabel = "destructive" + DisruptiveLabel = "disruptive" + SlowLabel = "slow" + + // Capability labels — environments that don't provide a prerequisite + // can filter these specs out. + NeedsCertManagerLabel = "needs-cert-manager" + NeedsMetalLBLabel = "needs-metallb" + NeedsCSISnapshotsLabel = "needs-csi-snapshots" + // NeedsCSIResizeLabel marks specs that require the cluster's + // StorageClass to support online PVC expansion (allowVolumeExpansion=true + // plus a resize-capable CSI driver). Environments that lack this + // capability should filter with `--label-filter='!needs-csi-resize'`. + NeedsCSIResizeLabel = "needs-csi-resize" +) + +// Level labels expose the depth tier of a spec to Ginkgo's label filter. +// Phase 2 specs should attach exactly one of these alongside the area +// label so invocations can select, e.g., all "level:low" specs with +// `--label-filter=level:low`. These labels are informational — the +// authoritative gate remains [SkipUnlessLevel], which reads TEST_DEPTH +// at runtime. +var ( + LowLevelLabel = ginkgo.Label("level:low") + MediumLevelLabel = ginkgo.Label("level:medium") + HighLevelLabel = ginkgo.Label("level:high") + HighestLevelLabel = ginkgo.Label("level:highest") + LowestLevelLabel = ginkgo.Label("level:lowest") +) diff --git a/test/e2e/levels.go b/test/e2e/levels.go new file mode 100644 index 00000000..6a1c4664 --- /dev/null +++ b/test/e2e/levels.go @@ -0,0 +1,109 @@ +package e2e + +import ( + "fmt" + "os" + "strconv" + + "github.com/onsi/ginkgo/v2" +) + +// Level represents a depth/intensity tier for a test. Specs can gate +// themselves on the currently configured level so that short CI runs +// execute only the most important specs while nightly/manual runs +// expand coverage. +// +// NOTE: CNPG does not currently expose a `tests/utils/levels` package +// in v1.28.1 (verified with `go doc`). If upstream adds one later, +// replace this file with a thin re-export. +type Level int + +const ( + // Highest runs only the most critical specs (fast smoke). + Highest Level = iota + // High adds the core area-suite coverage. + High + // Medium adds broader coverage for the area. This is the default + // per docs/designs/e2e-test-suite.md. + Medium + // Low adds long-running or edge-case scenarios. + Low + // Lowest runs everything, including slow/destructive corners. + Lowest +) + +// testDepthEnv is the environment variable consulted by CurrentLevel. +// Values are integers 0–4 mapping to Highest…Lowest. Invalid or unset +// values fall back to defaultLevel (Medium). +const testDepthEnv = "TEST_DEPTH" + +// defaultLevel is the depth applied when TEST_DEPTH is unset or +// invalid. Chosen to match the design document. +const defaultLevel = Medium + +// CurrentLevel reads TEST_DEPTH from the environment and returns the +// corresponding Level. Defaults to Medium when unset or invalid. +func CurrentLevel() Level { + raw, ok := os.LookupEnv(testDepthEnv) + if !ok { + return defaultLevel + } + v, err := strconv.Atoi(raw) + if err != nil { + return defaultLevel + } + switch Level(v) { + case Highest, High, Medium, Low, Lowest: + return Level(v) + default: + return defaultLevel + } +} + +// ShouldRun reports whether a spec declared at `required` should run +// given the currently configured level. A spec runs when the configured +// level is at least as deep as the spec's required level. +// +// Deprecated: Phase 2 specs should use [SkipUnlessLevel] instead — +// it is the single, uniform gate documented for area authors and it +// integrates with Ginkgo's reporting by invoking Skip rather than +// silently returning a bool. +func ShouldRun(required Level) bool { + return CurrentLevel() >= required +} + +// SkipUnlessLevel calls Ginkgo's Skip when the current depth level is +// shallower than min. Typical use from an `It`/`DescribeTable`: +// +// It("exercises the pool under sustained load", Label(e2e.SlowLabel), func() { +// e2e.SkipUnlessLevel(e2e.Low) +// ... +// }) +// +// SkipUnlessLevel is the only level-gating pattern Phase 2 test writers +// should use; prefer it over raw calls to [ShouldRun]. +func SkipUnlessLevel(min Level) { + if CurrentLevel() < min { + ginkgo.Skip(fmt.Sprintf("TEST_DEPTH=%d (%s) is shallower than required %s", + CurrentLevel(), levelName(CurrentLevel()), levelName(min))) + } +} + +// levelName returns a human-readable name for a Level for use in skip +// messages. +func levelName(l Level) string { + switch l { + case Highest: + return "Highest" + case High: + return "High" + case Medium: + return "Medium" + case Low: + return "Low" + case Lowest: + return "Lowest" + default: + return fmt.Sprintf("Level(%d)", int(l)) + } +} diff --git a/test/e2e/levels_test.go b/test/e2e/levels_test.go new file mode 100644 index 00000000..128fbe4f --- /dev/null +++ b/test/e2e/levels_test.go @@ -0,0 +1,88 @@ +package e2e + +import ( + "os" + "testing" +) + +func TestCurrentLevelDefault(t *testing.T) { + // t.Setenv with empty value still sets the variable; explicitly + // unset to exercise the "unset" branch. + orig, had := os.LookupEnv(testDepthEnv) + _ = os.Unsetenv(testDepthEnv) + t.Cleanup(func() { + if had { + _ = os.Setenv(testDepthEnv, orig) + } + }) + if got := CurrentLevel(); got != Medium { + t.Fatalf("default CurrentLevel = %v, want Medium", got) + } +} + +func TestCurrentLevelInvalidFallsBack(t *testing.T) { + t.Setenv(testDepthEnv, "not-an-int") + if got := CurrentLevel(); got != Medium { + t.Fatalf("invalid TEST_DEPTH CurrentLevel = %v, want Medium", got) + } + t.Setenv(testDepthEnv, "99") + if got := CurrentLevel(); got != Medium { + t.Fatalf("out-of-range TEST_DEPTH CurrentLevel = %v, want Medium", got) + } +} + +func TestCurrentLevelParses(t *testing.T) { + cases := []struct { + raw string + want Level + }{ + {"0", Highest}, + {"1", High}, + {"2", Medium}, + {"3", Low}, + {"4", Lowest}, + } + for _, c := range cases { + t.Setenv(testDepthEnv, c.raw) + if got := CurrentLevel(); got != c.want { + t.Errorf("CurrentLevel(%s) = %v, want %v", c.raw, got, c.want) + } + } +} + +func TestShouldRunRespectsOrdering(t *testing.T) { + t.Setenv(testDepthEnv, "2") // Medium + // Specs at Highest/High/Medium must run; Low/Lowest must not. + for _, required := range []Level{Highest, High, Medium} { + if !ShouldRun(required) { + t.Errorf("at Medium, ShouldRun(%v) = false; want true", required) + } + } + for _, required := range []Level{Low, Lowest} { + if ShouldRun(required) { + t.Errorf("at Medium, ShouldRun(%v) = true; want false", required) + } + } +} + +func TestLevelName(t *testing.T) { + for _, c := range []struct { + l Level + want string + }{ + {Highest, "Highest"}, + {High, "High"}, + {Medium, "Medium"}, + {Low, "Low"}, + {Lowest, "Lowest"}, + } { + if got := levelName(c.l); got != c.want { + t.Errorf("levelName(%v) = %q, want %q", c.l, got, c.want) + } + } + if got := levelName(Level(42)); got == "" { + t.Error("levelName for unknown should not be empty") + } +} + +// (helpers removed — tests use os.Setenv/Unsetenv directly.) diff --git a/test/e2e/manifests/base/.keep b/test/e2e/manifests/base/.keep new file mode 100644 index 00000000..e69de29b diff --git a/test/e2e/manifests/base/documentdb.yaml.template b/test/e2e/manifests/base/documentdb.yaml.template new file mode 100644 index 00000000..ed0bc512 --- /dev/null +++ b/test/e2e/manifests/base/documentdb.yaml.template @@ -0,0 +1,18 @@ +apiVersion: documentdb.io/preview +kind: DocumentDB +metadata: + name: ${NAME} + namespace: ${NAMESPACE} +spec: + nodeCount: 1 + instancesPerNode: ${INSTANCES} + documentDBImage: ${DOCUMENTDB_IMAGE} + gatewayImage: ${GATEWAY_IMAGE} + documentDbCredentialSecret: ${CREDENTIAL_SECRET} + resource: + storage: + pvcSize: ${STORAGE_SIZE} + storageClass: ${STORAGE_CLASS} + exposeViaService: + serviceType: ${EXPOSURE_TYPE} + logLevel: ${LOG_LEVEL} diff --git a/test/e2e/manifests/embed.go b/test/e2e/manifests/embed.go new file mode 100644 index 00000000..ce6cff1b --- /dev/null +++ b/test/e2e/manifests/embed.go @@ -0,0 +1,15 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package manifests embeds the shared DocumentDB CR templates used by +// the E2E suite. Exposing them as an embed.FS makes template rendering +// independent of the current working directory, so every per-area +// ginkgo binary can locate them without runtime.Caller tricks. +package manifests + +import "embed" + +// FS holds the base/, mixins/, and backup/ template trees. +// +//go:embed base/*.yaml.template mixins/*.yaml.template backup/*.yaml.template +var FS embed.FS diff --git a/test/e2e/manifests/mixins/.keep b/test/e2e/manifests/mixins/.keep new file mode 100644 index 00000000..e69de29b diff --git a/test/e2e/manifests/mixins/exposure_clusterip.yaml.template b/test/e2e/manifests/mixins/exposure_clusterip.yaml.template new file mode 100644 index 00000000..6b89c8d6 --- /dev/null +++ b/test/e2e/manifests/mixins/exposure_clusterip.yaml.template @@ -0,0 +1,8 @@ +apiVersion: documentdb.io/preview +kind: DocumentDB +metadata: + name: ${NAME} + namespace: ${NAMESPACE} +spec: + exposeViaService: + serviceType: ClusterIP diff --git a/test/e2e/manifests/mixins/exposure_loadbalancer.yaml.template b/test/e2e/manifests/mixins/exposure_loadbalancer.yaml.template new file mode 100644 index 00000000..d2eea518 --- /dev/null +++ b/test/e2e/manifests/mixins/exposure_loadbalancer.yaml.template @@ -0,0 +1,8 @@ +apiVersion: documentdb.io/preview +kind: DocumentDB +metadata: + name: ${NAME} + namespace: ${NAMESPACE} +spec: + exposeViaService: + serviceType: LoadBalancer diff --git a/test/e2e/manifests/mixins/reclaim_retain.yaml.template b/test/e2e/manifests/mixins/reclaim_retain.yaml.template new file mode 100644 index 00000000..d4542efe --- /dev/null +++ b/test/e2e/manifests/mixins/reclaim_retain.yaml.template @@ -0,0 +1,11 @@ +apiVersion: documentdb.io/preview +kind: DocumentDB +metadata: + name: ${NAME} + namespace: ${NAMESPACE} +spec: + resource: + storage: + pvcSize: ${STORAGE_SIZE} + storageClass: ${STORAGE_CLASS} + persistentVolumeReclaimPolicy: Retain diff --git a/test/e2e/manifests/mixins/storage_custom.yaml.template b/test/e2e/manifests/mixins/storage_custom.yaml.template new file mode 100644 index 00000000..70eee4c9 --- /dev/null +++ b/test/e2e/manifests/mixins/storage_custom.yaml.template @@ -0,0 +1,10 @@ +apiVersion: documentdb.io/preview +kind: DocumentDB +metadata: + name: ${NAME} + namespace: ${NAMESPACE} +spec: + resource: + storage: + pvcSize: ${STORAGE_SIZE} + storageClass: ${STORAGE_CLASS} diff --git a/test/e2e/manifests/mixins/tls_certmanager.yaml.template b/test/e2e/manifests/mixins/tls_certmanager.yaml.template new file mode 100644 index 00000000..ce9cb050 --- /dev/null +++ b/test/e2e/manifests/mixins/tls_certmanager.yaml.template @@ -0,0 +1,14 @@ +apiVersion: documentdb.io/preview +kind: DocumentDB +metadata: + name: ${NAME} + namespace: ${NAMESPACE} +spec: + tls: + gateway: + mode: CertManager + certManager: + issuerRef: + name: ${ISSUER_NAME} + kind: ${ISSUER_KIND} + group: cert-manager.io diff --git a/test/e2e/manifests/mixins/tls_disabled.yaml.template b/test/e2e/manifests/mixins/tls_disabled.yaml.template new file mode 100644 index 00000000..4697ae61 --- /dev/null +++ b/test/e2e/manifests/mixins/tls_disabled.yaml.template @@ -0,0 +1,9 @@ +apiVersion: documentdb.io/preview +kind: DocumentDB +metadata: + name: ${NAME} + namespace: ${NAMESPACE} +spec: + tls: + gateway: + mode: Disabled diff --git a/test/e2e/manifests/mixins/tls_provided.yaml.template b/test/e2e/manifests/mixins/tls_provided.yaml.template new file mode 100644 index 00000000..3b99c327 --- /dev/null +++ b/test/e2e/manifests/mixins/tls_provided.yaml.template @@ -0,0 +1,11 @@ +apiVersion: documentdb.io/preview +kind: DocumentDB +metadata: + name: ${NAME} + namespace: ${NAMESPACE} +spec: + tls: + gateway: + mode: Provided + provided: + secretName: ${TLS_SECRET_NAME} diff --git a/test/e2e/manifests/mixins/tls_selfsigned.yaml.template b/test/e2e/manifests/mixins/tls_selfsigned.yaml.template new file mode 100644 index 00000000..93a05b2b --- /dev/null +++ b/test/e2e/manifests/mixins/tls_selfsigned.yaml.template @@ -0,0 +1,9 @@ +apiVersion: documentdb.io/preview +kind: DocumentDB +metadata: + name: ${NAME} + namespace: ${NAMESPACE} +spec: + tls: + gateway: + mode: SelfSigned diff --git a/test/e2e/pkg/e2eutils/assertions/.keep b/test/e2e/pkg/e2eutils/assertions/.keep new file mode 100644 index 00000000..e69de29b diff --git a/test/e2e/pkg/e2eutils/assertions/assertions.go b/test/e2e/pkg/e2eutils/assertions/assertions.go new file mode 100644 index 00000000..5fd42f40 --- /dev/null +++ b/test/e2e/pkg/e2eutils/assertions/assertions.go @@ -0,0 +1,197 @@ +// Package assertions returns checker closures for use with Gomega's +// Eventually / Consistently. Each helper yields a `func() error` so it +// can be awaited with `Eventually(fn, timeout, poll).Should(Succeed())` +// without this package pulling in ginkgo or gomega itself. +package assertions + +import ( + "context" + "fmt" + "regexp" + + cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + cnpgclusterutils "github.com/cloudnative-pg/cloudnative-pg/tests/utils/clusterutils" + preview "github.com/documentdb/documentdb-operator/api/preview" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/labels" + "sigs.k8s.io/controller-runtime/pkg/client" + + documentdbutil "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" +) + +// runningStatus aliases the canonical ReadyStatus constant exported by +// the documentdb helper package so all sibling helpers share a single +// source of truth for the "DocumentDB is healthy" sentinel. +const runningStatus = documentdbutil.ReadyStatus + +// clusterNameFor returns the CNPG Cluster name that backs the given +// DocumentDB. For single-cluster (non-replicated) deployments this +// matches the DocumentDB name; replicated clusters use +// `-` but are out of scope here (see AssertPrimary* +// variants that accept an explicit cluster name). +func clusterNameFor(dd *preview.DocumentDB) string { + return dd.Name +} + +// getDocumentDB is a small helper shared by assertions that need to +// read a DocumentDB by key. +func getDocumentDB(ctx context.Context, c client.Client, key client.ObjectKey) (*preview.DocumentDB, error) { + dd := &preview.DocumentDB{} + if err := c.Get(ctx, key, dd); err != nil { + return nil, fmt.Errorf("get DocumentDB %s: %w", key, err) + } + return dd, nil +} + +// AssertDocumentDBReady returns a checker that succeeds when the +// DocumentDB identified by key reports Status.Status == runningStatus. +// Any other value (including "" for a freshly-created object) yields +// a non-nil error so Eventually will keep polling. +func AssertDocumentDBReady(ctx context.Context, c client.Client, key client.ObjectKey) func() error { + return func() error { + dd, err := getDocumentDB(ctx, c, key) + if err != nil { + return err + } + if dd.Status.Status != runningStatus { + return fmt.Errorf("DocumentDB %s status=%q, want %q", + key, dd.Status.Status, runningStatus) + } + return nil + } +} + +// AssertInstanceCount returns a checker that succeeds when the CNPG +// Cluster backing the DocumentDB reports Status.ReadyInstances == want. +// This is the canonical signal for "scale operation completed": the +// DocumentDB spec alone does not expose a live instance count. +func AssertInstanceCount(ctx context.Context, c client.Client, key client.ObjectKey, want int) func() error { + return func() error { + dd, err := getDocumentDB(ctx, c, key) + if err != nil { + return err + } + cluster := &cnpgv1.Cluster{} + ck := client.ObjectKey{Namespace: key.Namespace, Name: clusterNameFor(dd)} + if err := c.Get(ctx, ck, cluster); err != nil { + return fmt.Errorf("get CNPG Cluster %s: %w", ck, err) + } + if cluster.Status.ReadyInstances != want { + return fmt.Errorf("CNPG Cluster %s readyInstances=%d, want %d", + ck, cluster.Status.ReadyInstances, want) + } + return nil + } +} + +// AssertPrimaryUnchanged returns a checker that succeeds when the +// CNPG primary pod name still matches initialPrimary. It is intended +// for Consistently() checks during operations that must not trigger a +// failover (e.g. PVC resize). +func AssertPrimaryUnchanged(ctx context.Context, c client.Client, key client.ObjectKey, initialPrimary string) func() error { + return func() error { + dd, err := getDocumentDB(ctx, c, key) + if err != nil { + return err + } + pod, err := cnpgclusterutils.GetPrimary(ctx, c, key.Namespace, clusterNameFor(dd)) + if err != nil { + return fmt.Errorf("get primary for %s: %w", key, err) + } + if pod == nil || pod.Name == "" { + return fmt.Errorf("no primary pod found for %s", key) + } + if pod.Name != initialPrimary { + return fmt.Errorf("primary changed: want %s, got %s", initialPrimary, pod.Name) + } + return nil + } +} + +// AssertPVCCount returns a checker that succeeds when the count of +// PersistentVolumeClaims in ns matching labelSelector equals want. +// labelSelector follows the standard Kubernetes selector syntax and +// must parse cleanly or the checker returns an error on every call. +func AssertPVCCount(ctx context.Context, c client.Client, ns, labelSelector string, want int) func() error { + sel, selErr := labels.Parse(labelSelector) + return func() error { + if selErr != nil { + return fmt.Errorf("parse selector %q: %w", labelSelector, selErr) + } + pvcs := &corev1.PersistentVolumeClaimList{} + if err := c.List(ctx, pvcs, client.InNamespace(ns), client.MatchingLabelsSelector{Selector: sel}); err != nil { + return fmt.Errorf("list PVCs in %s: %w", ns, err) + } + if got := len(pvcs.Items); got != want { + return fmt.Errorf("PVC count in %s (%s): got %d, want %d", + ns, labelSelector, got, want) + } + return nil + } +} + +// AssertTLSSecretReady returns a checker that succeeds when the named +// secret exists in ns and contains non-empty tls.crt and tls.key +// entries (the canonical keys for a kubernetes.io/tls Secret). +func AssertTLSSecretReady(ctx context.Context, c client.Client, ns, secretName string) func() error { + return func() error { + s := &corev1.Secret{} + key := client.ObjectKey{Namespace: ns, Name: secretName} + if err := c.Get(ctx, key, s); err != nil { + if apierrors.IsNotFound(err) { + return fmt.Errorf("TLS secret %s not found", key) + } + return fmt.Errorf("get TLS secret %s: %w", key, err) + } + if len(s.Data[corev1.TLSCertKey]) == 0 { + return fmt.Errorf("TLS secret %s missing %s", key, corev1.TLSCertKey) + } + if len(s.Data[corev1.TLSPrivateKeyKey]) == 0 { + return fmt.Errorf("TLS secret %s missing %s", key, corev1.TLSPrivateKeyKey) + } + return nil + } +} + +// AssertServiceType returns a checker that succeeds when the named +// Service exists in ns and its spec.type equals want. +func AssertServiceType(ctx context.Context, c client.Client, ns, svcName string, want corev1.ServiceType) func() error { + return func() error { + svc := &corev1.Service{} + key := client.ObjectKey{Namespace: ns, Name: svcName} + if err := c.Get(ctx, key, svc); err != nil { + return fmt.Errorf("get Service %s: %w", key, err) + } + if svc.Spec.Type != want { + return fmt.Errorf("Service %s type=%s, want %s", key, svc.Spec.Type, want) + } + return nil + } +} + +// AssertConnectionStringMatches returns a checker that succeeds when +// the DocumentDB's Status.ConnectionString is non-empty and matches +// the supplied regular expression. Regex compilation errors surface on +// every invocation so bad test input fails fast in Eventually. +func AssertConnectionStringMatches(ctx context.Context, c client.Client, key client.ObjectKey, regex string) func() error { + re, reErr := regexp.Compile(regex) + return func() error { + if reErr != nil { + return fmt.Errorf("compile regex %q: %w", regex, reErr) + } + dd, err := getDocumentDB(ctx, c, key) + if err != nil { + return err + } + cs := dd.Status.ConnectionString + if cs == "" { + return fmt.Errorf("DocumentDB %s has empty connectionString", key) + } + if !re.MatchString(cs) { + return fmt.Errorf("DocumentDB %s connectionString %q does not match %q", + key, cs, regex) + } + return nil + } +} diff --git a/test/e2e/pkg/e2eutils/assertions/assertions_test.go b/test/e2e/pkg/e2eutils/assertions/assertions_test.go new file mode 100644 index 00000000..017bbd69 --- /dev/null +++ b/test/e2e/pkg/e2eutils/assertions/assertions_test.go @@ -0,0 +1,166 @@ +package assertions + +import ( + "context" + "strings" + "testing" + + cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + preview "github.com/documentdb/documentdb-operator/api/preview" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func newScheme(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + if err := corev1.AddToScheme(s); err != nil { + t.Fatalf("corev1.AddToScheme: %v", err) + } + if err := preview.AddToScheme(s); err != nil { + t.Fatalf("preview.AddToScheme: %v", err) + } + if err := cnpgv1.AddToScheme(s); err != nil { + t.Fatalf("cnpgv1.AddToScheme: %v", err) + } + return s +} + +func TestAssertDocumentDBReady(t *testing.T) { + t.Parallel() + s := newScheme(t) + dd := &preview.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{Name: "db1", Namespace: "ns"}, + Status: preview.DocumentDBStatus{Status: "Cluster in healthy state"}, + } + notReady := &preview.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{Name: "db2", Namespace: "ns"}, + Status: preview.DocumentDBStatus{Status: "Setting up primary"}, + } + c := fake.NewClientBuilder().WithScheme(s).WithObjects(dd, notReady).Build() + + if err := AssertDocumentDBReady(context.Background(), c, client.ObjectKey{Namespace: "ns", Name: "db1"})(); err != nil { + t.Fatalf("expected ready, got err=%v", err) + } + if err := AssertDocumentDBReady(context.Background(), c, client.ObjectKey{Namespace: "ns", Name: "db2"})(); err == nil { + t.Fatalf("expected not-ready error") + } + if err := AssertDocumentDBReady(context.Background(), c, client.ObjectKey{Namespace: "ns", Name: "missing"})(); err == nil { + t.Fatalf("expected error for missing object") + } +} + +func TestAssertInstanceCount(t *testing.T) { + t.Parallel() + s := newScheme(t) + dd := &preview.DocumentDB{ObjectMeta: metav1.ObjectMeta{Name: "db", Namespace: "ns"}} + cluster := &cnpgv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{Name: "db", Namespace: "ns"}, + Status: cnpgv1.ClusterStatus{ReadyInstances: 3}, + } + c := fake.NewClientBuilder().WithScheme(s).WithObjects(dd, cluster).Build() + key := client.ObjectKey{Namespace: "ns", Name: "db"} + + if err := AssertInstanceCount(context.Background(), c, key, 3)(); err != nil { + t.Fatalf("want ok, got %v", err) + } + if err := AssertInstanceCount(context.Background(), c, key, 2)(); err == nil { + t.Fatalf("want mismatch error") + } +} + +func TestAssertPVCCount(t *testing.T) { + t.Parallel() + s := newScheme(t) + pvcs := []client.Object{ + &corev1.PersistentVolumeClaim{ObjectMeta: metav1.ObjectMeta{ + Name: "p1", Namespace: "ns", Labels: map[string]string{"app": "dd"}}}, + &corev1.PersistentVolumeClaim{ObjectMeta: metav1.ObjectMeta{ + Name: "p2", Namespace: "ns", Labels: map[string]string{"app": "dd"}}}, + &corev1.PersistentVolumeClaim{ObjectMeta: metav1.ObjectMeta{ + Name: "p3", Namespace: "ns", Labels: map[string]string{"app": "other"}}}, + } + c := fake.NewClientBuilder().WithScheme(s).WithObjects(pvcs...).Build() + + if err := AssertPVCCount(context.Background(), c, "ns", "app=dd", 2)(); err != nil { + t.Fatalf("want ok, got %v", err) + } + if err := AssertPVCCount(context.Background(), c, "ns", "app=dd", 3)(); err == nil { + t.Fatalf("want mismatch error") + } + // Malformed selector surfaces on every call. + if err := AssertPVCCount(context.Background(), c, "ns", "!!bad!!", 0)(); err == nil { + t.Fatalf("want parse error") + } +} + +func TestAssertTLSSecretReady(t *testing.T) { + t.Parallel() + s := newScheme(t) + good := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "g", Namespace: "ns"}, + Type: corev1.SecretTypeTLS, + Data: map[string][]byte{corev1.TLSCertKey: []byte("c"), corev1.TLSPrivateKeyKey: []byte("k")}, + } + missingKey := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "b", Namespace: "ns"}, + Data: map[string][]byte{corev1.TLSCertKey: []byte("c")}, + } + c := fake.NewClientBuilder().WithScheme(s).WithObjects(good, missingKey).Build() + if err := AssertTLSSecretReady(context.Background(), c, "ns", "g")(); err != nil { + t.Fatalf("good: %v", err) + } + if err := AssertTLSSecretReady(context.Background(), c, "ns", "b")(); err == nil { + t.Fatalf("want error for missing key") + } + err := AssertTLSSecretReady(context.Background(), c, "ns", "none")() + if err == nil || !strings.Contains(err.Error(), "not found") { + t.Fatalf("want not-found error, got %v", err) + } +} + +func TestAssertServiceType(t *testing.T) { + t.Parallel() + s := newScheme(t) + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: "svc", Namespace: "ns"}, + Spec: corev1.ServiceSpec{Type: corev1.ServiceTypeLoadBalancer}, + } + c := fake.NewClientBuilder().WithScheme(s).WithObjects(svc).Build() + if err := AssertServiceType(context.Background(), c, "ns", "svc", corev1.ServiceTypeLoadBalancer)(); err != nil { + t.Fatalf("want ok, got %v", err) + } + if err := AssertServiceType(context.Background(), c, "ns", "svc", corev1.ServiceTypeClusterIP)(); err == nil { + t.Fatalf("want mismatch") + } +} + +func TestAssertConnectionStringMatches(t *testing.T) { + t.Parallel() + s := newScheme(t) + dd := &preview.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{Name: "db", Namespace: "ns"}, + Status: preview.DocumentDBStatus{ConnectionString: "mongodb://user:pw@svc:10260/?tls=true"}, + } + empty := &preview.DocumentDB{ObjectMeta: metav1.ObjectMeta{Name: "empty", Namespace: "ns"}} + c := fake.NewClientBuilder().WithScheme(s).WithObjects(dd, empty).Build() + k := client.ObjectKey{Namespace: "ns", Name: "db"} + + if err := AssertConnectionStringMatches(context.Background(), c, k, `^mongodb://.*tls=true`)(); err != nil { + t.Fatalf("want ok, got %v", err) + } + if err := AssertConnectionStringMatches(context.Background(), c, k, `tls=false`)(); err == nil { + t.Fatalf("want mismatch") + } + if err := AssertConnectionStringMatches(context.Background(), c, + client.ObjectKey{Namespace: "ns", Name: "empty"}, `.*`)(); err == nil { + t.Fatalf("want empty-string error") + } + // Bad regex must surface. + if err := AssertConnectionStringMatches(context.Background(), c, k, `[unclosed`)(); err == nil { + t.Fatalf("want regex compile error") + } +} diff --git a/test/e2e/pkg/e2eutils/clusterprobe/clusterprobe.go b/test/e2e/pkg/e2eutils/clusterprobe/clusterprobe.go new file mode 100644 index 00000000..67275007 --- /dev/null +++ b/test/e2e/pkg/e2eutils/clusterprobe/clusterprobe.go @@ -0,0 +1,173 @@ +// Package clusterprobe supplies runtime capability checks for the +// DocumentDB E2E suite. Ginkgo label selectors (e.g. +// `e2e.NeedsCSISnapshotsLabel`) only gate invocation: when a caller +// forgets `--label-filter='!needs-csi-snapshots'` on a cluster that +// lacks CSI snapshot support, the spec still runs and produces +// confusing failures deep inside the Backup/Restore path. +// +// The probes below give each affected spec a deterministic pre-flight +// check that it can invoke from `BeforeEach` and fall through to a +// clear `Skip(...)` message when the capability is missing. They are +// intentionally framework-agnostic (plain errors, no Ginkgo/Gomega +// imports) so unit tests can exercise them with a controller-runtime +// fake client. +package clusterprobe + +import ( + "context" + "errors" + "fmt" + + snapshotv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1" + storagev1 "k8s.io/api/storage/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// DefaultStorageClassAnnotation is the annotation Kubernetes uses to +// flag a StorageClass as the cluster default. Present with value "true" +// (or the legacy beta annotation) on at most one StorageClass per +// cluster. +const DefaultStorageClassAnnotation = "storageclass.kubernetes.io/is-default-class" + +// legacyDefaultStorageClassAnnotation is the pre-GA annotation still +// honoured by some distributions (e.g. older OpenShift releases). +const legacyDefaultStorageClassAnnotation = "storageclass.beta.kubernetes.io/is-default-class" + +// isMissingKindErr folds the two distinct "kind is not available" +// errors a controller-runtime client can return when the underlying +// CRD is absent: the apimachinery no-match error returned by a real +// cluster whose discovery lacks the type, and the runtime +// not-registered error returned by a fake client whose scheme omits +// it. Callers use it to decide "probe says missing" vs. "probe should +// propagate the error". +func isMissingKindErr(err error) bool { + if err == nil { + return false + } + if meta.IsNoMatchError(err) { + return true + } + if runtime.IsNotRegisteredError(err) { + return true + } + return false +} + +// HasVolumeSnapshotCRD returns true when the cluster exposes the +// snapshot.storage.k8s.io/v1 VolumeSnapshot kind (i.e. the external +// snapshotter CRD is installed and its types are reachable through +// the supplied client). Other errors — RBAC denials, transient +// API-server failures — are returned to the caller as-is; the probe +// does not swallow them. +func HasVolumeSnapshotCRD(ctx context.Context, c client.Client) (bool, error) { + if c == nil { + return false, errors.New("clusterprobe.HasVolumeSnapshotCRD: client must not be nil") + } + var list snapshotv1.VolumeSnapshotList + if err := c.List(ctx, &list); err != nil { + if isMissingKindErr(err) { + return false, nil + } + return false, fmt.Errorf("list VolumeSnapshots: %w", err) + } + return true, nil +} + +// HasUsableSnapshotClass returns true when at least one +// VolumeSnapshotClass exists on the cluster. Callers that already +// confirmed the CRD via [HasVolumeSnapshotCRD] may still see this +// probe report false on clusters where the CRD is installed but no +// class is provisioned — a common state on stock kind nodes without +// the csi-hostpath driver add-on. +func HasUsableSnapshotClass(ctx context.Context, c client.Client) (bool, error) { + if c == nil { + return false, errors.New("clusterprobe.HasUsableSnapshotClass: client must not be nil") + } + var list snapshotv1.VolumeSnapshotClassList + if err := c.List(ctx, &list); err != nil { + if isMissingKindErr(err) { + return false, nil + } + return false, fmt.Errorf("list VolumeSnapshotClasses: %w", err) + } + return len(list.Items) > 0, nil +} + +// StorageClassAllowsExpansion returns true when the named StorageClass +// exists and has `allowVolumeExpansion=true`. When name is empty the +// probe looks up the cluster's default StorageClass (annotation +// storageclass.kubernetes.io/is-default-class=true, or its legacy +// beta variant). A nil AllowVolumeExpansion pointer on an otherwise +// valid StorageClass is reported as false — that is the Kubernetes +// API default meaning "expansion not allowed". +// +// Returns (false, nil) if the StorageClass (named or default) is not +// found; the caller typically translates that into a Skip() message. +// Returns (false, err) for any other API error. +func StorageClassAllowsExpansion(ctx context.Context, c client.Client, name string) (bool, error) { + if c == nil { + return false, errors.New("clusterprobe.StorageClassAllowsExpansion: client must not be nil") + } + sc, err := resolveStorageClass(ctx, c, name) + if err != nil { + return false, err + } + if sc == nil { + return false, nil + } + if sc.AllowVolumeExpansion == nil { + return false, nil + } + return *sc.AllowVolumeExpansion, nil +} + +// resolveStorageClass returns the StorageClass named by name, or when +// name is empty the cluster default. A missing StorageClass returns +// (nil, nil) so the caller can report it as an absent capability. +func resolveStorageClass(ctx context.Context, c client.Client, name string) (*storagev1.StorageClass, error) { + if name != "" { + sc := &storagev1.StorageClass{} + err := c.Get(ctx, client.ObjectKey{Name: name}, sc) + if apierrors.IsNotFound(err) { + return nil, nil + } + if isMissingKindErr(err) { + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("get StorageClass %s: %w", name, err) + } + return sc, nil + } + var list storagev1.StorageClassList + if err := c.List(ctx, &list); err != nil { + if isMissingKindErr(err) { + return nil, nil + } + return nil, fmt.Errorf("list StorageClasses: %w", err) + } + for i := range list.Items { + sc := &list.Items[i] + if isDefaultStorageClass(sc) { + return sc, nil + } + } + return nil, nil +} + +// isDefaultStorageClass honours both the GA and legacy beta +// "is-default-class" annotations. +func isDefaultStorageClass(sc *storagev1.StorageClass) bool { + if sc == nil { + return false + } + for _, key := range []string{DefaultStorageClassAnnotation, legacyDefaultStorageClassAnnotation} { + if v, ok := sc.Annotations[key]; ok && v == "true" { + return true + } + } + return false +} diff --git a/test/e2e/pkg/e2eutils/clusterprobe/clusterprobe_test.go b/test/e2e/pkg/e2eutils/clusterprobe/clusterprobe_test.go new file mode 100644 index 00000000..24251f13 --- /dev/null +++ b/test/e2e/pkg/e2eutils/clusterprobe/clusterprobe_test.go @@ -0,0 +1,217 @@ +package clusterprobe + +import ( + "context" + "errors" + "testing" + + snapshotv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1" + storagev1 "k8s.io/api/storage/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func schemeWithSnapshots(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + if err := clientgoscheme.AddToScheme(s); err != nil { + t.Fatalf("add clientgo scheme: %v", err) + } + if err := snapshotv1.AddToScheme(s); err != nil { + t.Fatalf("add snapshotv1 scheme: %v", err) + } + return s +} + +func schemeWithoutSnapshots(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + if err := clientgoscheme.AddToScheme(s); err != nil { + t.Fatalf("add clientgo scheme: %v", err) + } + return s +} + +func TestHasVolumeSnapshotCRD(t *testing.T) { + t.Run("scheme lacks VolumeSnapshot returns false", func(t *testing.T) { + c := fake.NewClientBuilder().WithScheme(schemeWithoutSnapshots(t)).Build() + ok, err := HasVolumeSnapshotCRD(context.Background(), c) + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + if ok { + t.Fatalf("want false, got true") + } + }) + t.Run("scheme has VolumeSnapshot returns true", func(t *testing.T) { + c := fake.NewClientBuilder().WithScheme(schemeWithSnapshots(t)).Build() + ok, err := HasVolumeSnapshotCRD(context.Background(), c) + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + if !ok { + t.Fatalf("want true, got false") + } + }) + t.Run("nil client is an error", func(t *testing.T) { + _, err := HasVolumeSnapshotCRD(context.Background(), nil) + if err == nil { + t.Fatalf("want error, got nil") + } + }) +} + +func TestHasUsableSnapshotClass(t *testing.T) { + t.Run("CRD missing returns false", func(t *testing.T) { + c := fake.NewClientBuilder().WithScheme(schemeWithoutSnapshots(t)).Build() + ok, err := HasUsableSnapshotClass(context.Background(), c) + if err != nil || ok { + t.Fatalf("want (false, nil), got (%v, %v)", ok, err) + } + }) + t.Run("no classes returns false", func(t *testing.T) { + c := fake.NewClientBuilder().WithScheme(schemeWithSnapshots(t)).Build() + ok, err := HasUsableSnapshotClass(context.Background(), c) + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + if ok { + t.Fatalf("want false, got true") + } + }) + t.Run("at least one class returns true", func(t *testing.T) { + vsc := &snapshotv1.VolumeSnapshotClass{ + ObjectMeta: metav1.ObjectMeta{Name: "csi-hostpath-snapclass"}, + Driver: "hostpath.csi.k8s.io", + } + c := fake.NewClientBuilder(). + WithScheme(schemeWithSnapshots(t)). + WithObjects(vsc). + Build() + ok, err := HasUsableSnapshotClass(context.Background(), c) + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + if !ok { + t.Fatalf("want true, got false") + } + }) +} + +func boolPtr(b bool) *bool { return &b } + +func TestStorageClassAllowsExpansion(t *testing.T) { + mk := func(name string, allow *bool, annotations map[string]string) *storagev1.StorageClass { + return &storagev1.StorageClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Annotations: annotations, + }, + Provisioner: "kubernetes.io/host-path", + AllowVolumeExpansion: allow, + } + } + + t.Run("named class with expansion true", func(t *testing.T) { + c := fake.NewClientBuilder(). + WithScheme(schemeWithSnapshots(t)). + WithObjects(mk("csi-hostpath-sc", boolPtr(true), nil)). + Build() + ok, err := StorageClassAllowsExpansion(context.Background(), c, "csi-hostpath-sc") + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + if !ok { + t.Fatalf("want true, got false") + } + }) + t.Run("named class with expansion nil (default false)", func(t *testing.T) { + c := fake.NewClientBuilder(). + WithScheme(schemeWithSnapshots(t)). + WithObjects(mk("standard", nil, nil)). + Build() + ok, err := StorageClassAllowsExpansion(context.Background(), c, "standard") + if err != nil || ok { + t.Fatalf("want (false, nil), got (%v, %v)", ok, err) + } + }) + t.Run("named class with expansion false", func(t *testing.T) { + c := fake.NewClientBuilder(). + WithScheme(schemeWithSnapshots(t)). + WithObjects(mk("standard", boolPtr(false), nil)). + Build() + ok, _ := StorageClassAllowsExpansion(context.Background(), c, "standard") + if ok { + t.Fatalf("want false, got true") + } + }) + t.Run("missing named class returns false nil", func(t *testing.T) { + c := fake.NewClientBuilder().WithScheme(schemeWithSnapshots(t)).Build() + ok, err := StorageClassAllowsExpansion(context.Background(), c, "does-not-exist") + if err != nil || ok { + t.Fatalf("want (false, nil), got (%v, %v)", ok, err) + } + }) + t.Run("empty name resolves default via GA annotation", func(t *testing.T) { + c := fake.NewClientBuilder(). + WithScheme(schemeWithSnapshots(t)). + WithObjects( + mk("other", boolPtr(false), nil), + mk("standard", boolPtr(true), map[string]string{ + DefaultStorageClassAnnotation: "true", + }), + ). + Build() + ok, err := StorageClassAllowsExpansion(context.Background(), c, "") + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + if !ok { + t.Fatalf("want true for default class, got false") + } + }) + t.Run("empty name honours legacy beta default annotation", func(t *testing.T) { + c := fake.NewClientBuilder(). + WithScheme(schemeWithSnapshots(t)). + WithObjects( + mk("legacy", boolPtr(true), map[string]string{ + "storageclass.beta.kubernetes.io/is-default-class": "true", + }), + ). + Build() + ok, err := StorageClassAllowsExpansion(context.Background(), c, "") + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + if !ok { + t.Fatalf("want true for legacy default, got false") + } + }) + t.Run("empty name with no default class returns false", func(t *testing.T) { + c := fake.NewClientBuilder(). + WithScheme(schemeWithSnapshots(t)). + WithObjects(mk("other", boolPtr(true), nil)). + Build() + ok, err := StorageClassAllowsExpansion(context.Background(), c, "") + if err != nil || ok { + t.Fatalf("want (false, nil), got (%v, %v)", ok, err) + } + }) + t.Run("nil client is an error", func(t *testing.T) { + _, err := StorageClassAllowsExpansion(context.Background(), nil, "anything") + if err == nil { + t.Fatalf("want error, got nil") + } + }) +} + +func TestIsMissingKindErrSmoke(t *testing.T) { + if isMissingKindErr(nil) { + t.Fatalf("nil err should not be missing") + } + if isMissingKindErr(errors.New("boom")) { + t.Fatalf("arbitrary error should not be missing") + } +} diff --git a/test/e2e/pkg/e2eutils/documentdb/.keep b/test/e2e/pkg/e2eutils/documentdb/.keep new file mode 100644 index 00000000..e69de29b diff --git a/test/e2e/pkg/e2eutils/documentdb/documentdb.go b/test/e2e/pkg/e2eutils/documentdb/documentdb.go new file mode 100644 index 00000000..47689c0b --- /dev/null +++ b/test/e2e/pkg/e2eutils/documentdb/documentdb.go @@ -0,0 +1,444 @@ +// Package documentdb provides CRUD and lifecycle helpers for the +// DocumentDB preview CR used by the E2E suite. +// +// The package is deliberately framework-agnostic: it returns plain +// errors rather than calling into Ginkgo/Gomega so unit tests can +// exercise it with a fake client. Suite code wraps these in +// gomega.Eventually where appropriate. +// +// Manifest rendering +// +// Create/RenderCR compose a YAML document from a base template plus +// zero or more mixins, concatenated with "---\n", then run the result +// through CNPG's envsubst helper for ${VAR} substitution. +// +// By default, templates are read from an embedded filesystem +// (test/e2e/manifests via the manifests package) so rendering is +// independent of the current working directory. Callers may pass a +// manifestsRoot to read from disk instead — useful for tests that want +// to point at a fixture tree. +package documentdb + +import ( + "bufio" + "bytes" + "context" + "errors" + "fmt" + "io/fs" + "os" + "path/filepath" + "regexp" + "strings" + "time" + + "github.com/cloudnative-pg/cloudnative-pg/tests/utils/envsubst" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/yaml" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" + e2emanifests "github.com/documentdb/documentdb-operator/test/e2e/manifests" +) + +// ManifestsFS is the filesystem RenderCR reads templates from when the +// caller does not pass an explicit manifestsRoot. Defaults to the +// embedded test/e2e/manifests tree; tests may override it to point at +// a fixture fs.FS (e.g. fstest.MapFS or os.DirFS). +var ManifestsFS fs.FS = e2emanifests.FS + +// baseSubdir and mixinSubdir are layout conventions: /base/.yaml.template +// and /mixins/.yaml.template respectively. +const ( + baseSubdir = "base" + mixinSubdir = "mixins" + templateExt = ".yaml.template" + yamlSeparator = "---\n" + + // DefaultWaitPoll is the polling interval for WaitHealthy/Delete. + DefaultWaitPoll = 2 * time.Second + + // ReadyStatus is the DocumentDBStatus.Status value the operator + // surfaces once the underlying CNPG cluster is healthy. It mirrors + // the CNPG Cluster status verbatim (see + // operator/src/api/preview/documentdb_types.go). Exposed as an + // exported constant so sibling packages (assertions, fixtures) + // share a single source of truth. + ReadyStatus = "Cluster in healthy state" +) + +// CreateOptions drives Create. Base names the file in manifests/base/, +// Mixins names files under manifests/mixins/. Vars are substituted by +// CNPG's envsubst; NAME and NAMESPACE are added automatically if absent. +type CreateOptions struct { + Base string + Mixins []string + Vars map[string]string + ManifestsRoot string // empty = embedded ManifestsFS +} + +// Create renders the CR and applies it via c.Create. The returned object +// is the in-cluster state after Create succeeds. +// +// When opts.Mixins is non-empty, RenderCR produces a multi-document YAML +// that would silently drop all but the first document under a naive +// yaml.Unmarshal. Create therefore deep-merges the rendered documents +// (override semantics: later mixins win) into a single map before +// converting to the typed DocumentDB object. The public RenderCR API +// still returns the raw multi-doc bytes, which are useful for artifact +// dumps and manual kubectl apply. +func Create(ctx context.Context, c client.Client, ns, name string, opts CreateOptions) (*previewv1.DocumentDB, error) { + raw, err := RenderCR(opts.Base, name, ns, opts.Mixins, opts.Vars, opts.ManifestsRoot) + if err != nil { + return nil, err + } + obj, err := decodeMergedDocumentDB(raw) + if err != nil { + return nil, err + } + if obj.Namespace == "" { + obj.Namespace = ns + } + if obj.Name == "" { + obj.Name = name + } + if err := c.Create(ctx, obj); err != nil { + return nil, fmt.Errorf("creating DocumentDB %s/%s: %w", ns, name, err) + } + return obj, nil +} + +// decodeMergedDocumentDB parses a multi-document YAML byte stream (as +// produced by RenderCR) and returns a single DocumentDB object whose +// fields reflect a deep-merge of every document in stream order. +// Maps are merged recursively; scalars and slices in later documents +// overwrite earlier values — the contract every mixin under +// manifests/mixins/ is written against. +func decodeMergedDocumentDB(raw []byte) (*previewv1.DocumentDB, error) { + docs, err := splitYAMLDocuments(raw) + if err != nil { + return nil, err + } + if len(docs) == 0 { + return nil, errors.New("decodeMergedDocumentDB: no YAML documents rendered") + } + merged := map[string]interface{}{} + for i, doc := range docs { + if len(bytes.TrimSpace(doc)) == 0 { + continue + } + var m map[string]interface{} + if err := yaml.Unmarshal(doc, &m); err != nil { + return nil, fmt.Errorf("unmarshaling YAML document %d: %w", i, err) + } + if m == nil { + continue + } + deepMerge(merged, m) + } + buf, err := yaml.Marshal(merged) + if err != nil { + return nil, fmt.Errorf("re-marshaling merged DocumentDB YAML: %w", err) + } + obj := &previewv1.DocumentDB{} + if err := yaml.Unmarshal(buf, obj); err != nil { + return nil, fmt.Errorf("unmarshaling merged DocumentDB YAML: %w", err) + } + return obj, nil +} + +// splitYAMLDocuments splits a raw YAML byte stream on the "\n---\n" +// document separator. A leading "---\n" is tolerated. +func splitYAMLDocuments(raw []byte) ([][]byte, error) { + // Normalise CRLF so the separator match is portable. + normalized := bytes.ReplaceAll(raw, []byte("\r\n"), []byte("\n")) + // Trim a leading separator if present. + normalized = bytes.TrimPrefix(normalized, []byte("---\n")) + return bytes.Split(normalized, []byte("\n---\n")), nil +} + +// deepMerge recursively merges src into dst with override semantics: +// when both sides hold a map[string]interface{} the merge recurses; +// otherwise the src value replaces dst's value. Nil src values are +// skipped so a mixin cannot unintentionally null out a base field just +// because YAML decoded the key as an explicit null. +func deepMerge(dst, src map[string]interface{}) { + for k, sv := range src { + if sv == nil { + continue + } + dv, ok := dst[k] + if !ok { + dst[k] = sv + continue + } + dm, dIsMap := dv.(map[string]interface{}) + sm, sIsMap := sv.(map[string]interface{}) + if dIsMap && sIsMap { + deepMerge(dm, sm) + dst[k] = dm + continue + } + dst[k] = sv + } +} + +// RenderCR reads the base template and mixin templates and returns the +// concatenated, variable-substituted YAML. NAME and NAMESPACE are +// injected into vars if not already present. +// +// When manifestsRoot is empty, templates are read from the embedded +// ManifestsFS (the default test/e2e/manifests tree). When non-empty, +// it is interpreted as an on-disk directory path and read via +// os.DirFS — the legacy behaviour used by fixture-based tests. +func RenderCR(baseName, name, ns string, mixins []string, vars map[string]string, manifestsRoot string) ([]byte, error) { + if baseName == "" { + return nil, errors.New("RenderCR: baseName is required") + } + + var source fs.FS + if manifestsRoot == "" { + source = ManifestsFS + } else { + source = os.DirFS(manifestsRoot) + } + + merged := map[string]string{"NAME": name, "NAMESPACE": ns} + for k, v := range vars { + merged[k] = v + } + + var buf bytes.Buffer + basePath := filepath.ToSlash(filepath.Join(baseSubdir, baseName+templateExt)) + baseBytes, err := fs.ReadFile(source, basePath) + if err != nil { + return nil, fmt.Errorf("reading base template %s: %w", basePath, err) + } + buf.Write(baseBytes) + + for _, m := range mixins { + mixinPath := filepath.ToSlash(filepath.Join(mixinSubdir, m+templateExt)) + mb, err := fs.ReadFile(source, mixinPath) + if err != nil { + return nil, fmt.Errorf("reading mixin template %s: %w", mixinPath, err) + } + if !bytes.HasSuffix(buf.Bytes(), []byte("\n")) { + buf.WriteByte('\n') + } + buf.WriteString(yamlSeparator) + buf.Write(mb) + } + + rendered, err := envsubst.Envsubst(merged, dropEmptyVarLines(buf.Bytes(), merged)) + if err != nil { + return nil, fmt.Errorf("envsubst: %w", err) + } + return rendered, nil +} + +// DropEmptyVarLines removes template lines of the form `key: ${VAR}` +// when merged[VAR] is an empty string. CNPG's envsubst treats empty +// values as missing, so this lets callers opt fields out of the +// rendered YAML by leaving the corresponding variable unset. Operator +// defaults (documentDBImage, gatewayImage, ...) thus fall through to +// server-side defaults instead of being forced to a pinned value. +func DropEmptyVarLines(data []byte, merged map[string]string) []byte { + return dropEmptyVarLines(data, merged) +} + +// singleVarLineRe matches a line whose non-whitespace content is a +// single YAML scalar assignment to a single ${VAR} reference, e.g.: +// +// documentDBImage: ${DOCUMENTDB_IMAGE} +// +// Leading whitespace is preserved, the captured group is the bare +// variable name. Lines with additional text around the reference do +// not match — we only strip "orphan" scalar assignments. +var singleVarLineRe = regexp.MustCompile(`^\s*[A-Za-z0-9_.\-]+:\s*\$\{([A-Za-z_][A-Za-z0-9_]*)\}\s*$`) + +// dropEmptyVarLines removes template lines of the form +// `key: ${VAR}` when merged[VAR] is an empty string. CNPG's envsubst +// treats empty values as missing, so this lets callers opt fields out +// of the rendered CR by leaving the corresponding variable unset. +// Fields the operator defaults server-side (e.g. documentDBImage, +// gatewayImage) thus fall through to operator defaults. +func dropEmptyVarLines(data []byte, merged map[string]string) []byte { + if !bytes.Contains(data, []byte("${")) { + return data + } + var out bytes.Buffer + scanner := bufio.NewScanner(bytes.NewReader(data)) + scanner.Buffer(make([]byte, 64*1024), 1024*1024) + for scanner.Scan() { + line := scanner.Text() + if m := singleVarLineRe.FindStringSubmatch(line); m != nil { + if v, ok := merged[m[1]]; ok && v == "" { + continue + } + } + out.WriteString(line) + out.WriteByte('\n') + } + // Preserve the last newline behaviour of the original buffer: if + // the input didn't end in \n, trim the trailing one we added. + if !strings.HasSuffix(string(data), "\n") && out.Len() > 0 { + b := out.Bytes() + if b[len(b)-1] == '\n' { + out.Truncate(out.Len() - 1) + } + } + return out.Bytes() +} + +// PatchInstances fetches the DocumentDB named by (ns, name) and +// patches its Spec.InstancesPerNode to want. Returns an error if the +// CR cannot be fetched, the desired value is out of the supported +// range (1..3 per the CRD), or the patch fails. When the CR already +// has the desired value the call is a no-op and returns nil. +func PatchInstances(ctx context.Context, c client.Client, ns, name string, want int) error { + if c == nil { + return errors.New("PatchInstances: client must not be nil") + } + if want < 1 || want > 3 { + return fmt.Errorf("PatchInstances: want=%d out of supported range 1..3", want) + } + dd := &previewv1.DocumentDB{} + if err := c.Get(ctx, client.ObjectKey{Namespace: ns, Name: name}, dd); err != nil { + return fmt.Errorf("get DocumentDB %s/%s: %w", ns, name, err) + } + if dd.Spec.InstancesPerNode == want { + return nil + } + before := dd.DeepCopy() + dd.Spec.InstancesPerNode = want + if err := c.Patch(ctx, dd, client.MergeFrom(before)); err != nil { + return fmt.Errorf("patch DocumentDB %s/%s instances=%d: %w", ns, name, want, err) + } + return nil +} + +// PatchSpec applies a merge-from patch that mutates the provided +// DocumentDB's spec in place. mutate receives a pointer to the Spec and +// may set any fields; the diff against the pre-mutation object is sent +// to the API server. +func PatchSpec(ctx context.Context, c client.Client, dd *previewv1.DocumentDB, mutate func(*previewv1.DocumentDBSpec)) error { + if dd == nil || mutate == nil { + return errors.New("PatchSpec: dd and mutate must not be nil") + } + before := dd.DeepCopy() + mutate(&dd.Spec) + if err := c.Patch(ctx, dd, client.MergeFrom(before)); err != nil { + return fmt.Errorf("patching DocumentDB %s/%s: %w", dd.Namespace, dd.Name, err) + } + return nil +} + +// WaitHealthy polls until the DocumentDB named by key reports a healthy +// status or the timeout elapses. "Healthy" is defined as +// Status.Status == ReadyStatus (the CNPG cluster status propagated via +// DocumentDBStatus.Status) or the presence of a Ready=True condition on +// the object (future-proofing). +// +// The polling interval is DefaultWaitPoll; the function returns nil on +// first healthy observation or an error describing the last observed +// state on timeout. +func WaitHealthy(ctx context.Context, c client.Client, key client.ObjectKey, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + var last previewv1.DocumentDB + for { + if err := c.Get(ctx, key, &last); err == nil { + if isHealthy(&last) { + return nil + } + } else if !apierrors.IsNotFound(err) { + return fmt.Errorf("getting DocumentDB %s: %w", key, err) + } + if time.Now().After(deadline) { + return fmt.Errorf("timed out after %s waiting for DocumentDB %s to be healthy (last status=%q)", + timeout, key, last.Status.Status) + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(DefaultWaitPoll): + } + } +} + +// isHealthy implements the predicate documented on WaitHealthy. +func isHealthy(dd *previewv1.DocumentDB) bool { + if dd == nil { + return false + } + if dd.Status.Status == ReadyStatus { + return true + } + // Defensive: DocumentDBStatus today has no Conditions field, but if + // one is added later a Ready=True condition should also be honored. + // Reflectively check via annotations or leave to future extension. + return false +} + +// Delete issues a foreground delete on the given DocumentDB and polls +// until the object is gone or timeout elapses. +func Delete(ctx context.Context, c client.Client, dd *previewv1.DocumentDB, timeout time.Duration) error { + if dd == nil { + return errors.New("Delete: dd must not be nil") + } + if err := c.Delete(ctx, dd); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("deleting DocumentDB %s/%s: %w", dd.Namespace, dd.Name, err) + } + key := client.ObjectKeyFromObject(dd) + deadline := time.Now().Add(timeout) + for { + var got previewv1.DocumentDB + err := c.Get(ctx, key, &got) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return fmt.Errorf("polling deletion of %s: %w", key, err) + } + if time.Now().After(deadline) { + return fmt.Errorf("timed out after %s waiting for DocumentDB %s to be deleted", timeout, key) + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(DefaultWaitPoll): + } + } +} + +// List returns all DocumentDB objects in the given namespace. +func List(ctx context.Context, c client.Client, ns string) ([]previewv1.DocumentDB, error) { + var ddList previewv1.DocumentDBList + opts := []client.ListOption{} + if ns != "" { + opts = append(opts, client.InNamespace(ns)) + } + if err := c.List(ctx, &ddList, opts...); err != nil { + return nil, fmt.Errorf("listing DocumentDB in %q: %w", ns, err) + } + return ddList.Items, nil +} + +// Get fetches a DocumentDB by key. +func Get(ctx context.Context, c client.Client, key client.ObjectKey) (*previewv1.DocumentDB, error) { + var dd previewv1.DocumentDB + if err := c.Get(ctx, key, &dd); err != nil { + return nil, fmt.Errorf("getting DocumentDB %s: %w", key, err) + } + return &dd, nil +} + +// objectMetaFor is a small helper that constructs an ObjectMeta for +// ad-hoc DocumentDB creation in tests. Exposed because several helpers +// in later phases will build DocumentDB objects programmatically +// instead of rendering templates. +func objectMetaFor(ns, name string) metav1.ObjectMeta { + return metav1.ObjectMeta{Namespace: ns, Name: name} +} + +var _ = objectMetaFor // retained for Phase-2 programmatic builders diff --git a/test/e2e/pkg/e2eutils/documentdb/documentdb_test.go b/test/e2e/pkg/e2eutils/documentdb/documentdb_test.go new file mode 100644 index 00000000..fd35b173 --- /dev/null +++ b/test/e2e/pkg/e2eutils/documentdb/documentdb_test.go @@ -0,0 +1,353 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package documentdb + +import ( + "context" + "os" + "path/filepath" + goruntime "runtime" + "strings" + "testing" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" +) + +func newScheme(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + if err := previewv1.AddToScheme(s); err != nil { + t.Fatalf("AddToScheme: %v", err) + } + return s +} + +func TestRenderCRConcatenatesBaseAndMixins(t *testing.T) { + dir := t.TempDir() + mustWrite(t, filepath.Join(dir, baseSubdir, "ddb"+templateExt), + "apiVersion: documentdb.io/preview\nkind: DocumentDB\nmetadata:\n name: ${NAME}\n namespace: ${NAMESPACE}\n") + mustWrite(t, filepath.Join(dir, mixinSubdir, "tls"+templateExt), + "# tls mixin for ${NAME}\n") + + got, err := RenderCR("ddb", "my-dd", "ns1", []string{"tls"}, nil, dir) + if err != nil { + t.Fatalf("RenderCR: %v", err) + } + s := string(got) + if !strings.Contains(s, "name: my-dd") { + t.Errorf("expected NAME substitution; got:\n%s", s) + } + if !strings.Contains(s, "namespace: ns1") { + t.Errorf("expected NAMESPACE substitution; got:\n%s", s) + } + if !strings.Contains(s, "---\n") { + t.Errorf("expected YAML separator between base and mixin; got:\n%s", s) + } + if !strings.Contains(s, "tls mixin for my-dd") { + t.Errorf("expected mixin body; got:\n%s", s) + } +} + +func TestRenderCRMissingBaseReturnsError(t *testing.T) { + dir := t.TempDir() + _, err := RenderCR("nope", "n", "ns", nil, nil, dir) + if err == nil { + t.Fatal("expected error for missing base template") + } +} + +func TestRenderCRUserVarsOverrideNameAndNamespace(t *testing.T) { + dir := t.TempDir() + mustWrite(t, filepath.Join(dir, baseSubdir, "b"+templateExt), "x: ${NAME}-${EXTRA}\n") + got, err := RenderCR("b", "n", "ns", nil, map[string]string{"EXTRA": "z"}, dir) + if err != nil { + t.Fatalf("RenderCR: %v", err) + } + if !strings.Contains(string(got), "x: n-z") { + t.Errorf("expected substituted extra var; got: %s", got) + } +} + +func TestGetAndList(t *testing.T) { + s := newScheme(t) + objs := []client.Object{ + &previewv1.DocumentDB{ObjectMeta: metav1.ObjectMeta{Name: "a", Namespace: "ns1"}}, + &previewv1.DocumentDB{ObjectMeta: metav1.ObjectMeta{Name: "b", Namespace: "ns1"}}, + &previewv1.DocumentDB{ObjectMeta: metav1.ObjectMeta{Name: "c", Namespace: "ns2"}}, + } + c := fakeclient.NewClientBuilder().WithScheme(s).WithObjects(objs...).Build() + ctx := context.Background() + + got, err := Get(ctx, c, types.NamespacedName{Name: "a", Namespace: "ns1"}) + if err != nil { + t.Fatalf("Get: %v", err) + } + if got.Name != "a" { + t.Errorf("got name %q want a", got.Name) + } + + items, err := List(ctx, c, "ns1") + if err != nil { + t.Fatalf("List: %v", err) + } + if len(items) != 2 { + t.Errorf("got %d items want 2", len(items)) + } + + all, err := List(ctx, c, "") + if err != nil { + t.Fatalf("List all: %v", err) + } + if len(all) != 3 { + t.Errorf("got %d items want 3", len(all)) + } +} + +func TestPatchSpec(t *testing.T) { + s := newScheme(t) + dd := &previewv1.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{Name: "a", Namespace: "ns1"}, + Spec: previewv1.DocumentDBSpec{NodeCount: 1, InstancesPerNode: 1}, + } + c := fakeclient.NewClientBuilder().WithScheme(s).WithObjects(dd).Build() + ctx := context.Background() + + fresh, err := Get(ctx, c, client.ObjectKeyFromObject(dd)) + if err != nil { + t.Fatalf("Get: %v", err) + } + if err := PatchSpec(ctx, c, fresh, func(spec *previewv1.DocumentDBSpec) { + spec.LogLevel = "debug" + }); err != nil { + t.Fatalf("PatchSpec: %v", err) + } + after, err := Get(ctx, c, client.ObjectKeyFromObject(dd)) + if err != nil { + t.Fatalf("Get after: %v", err) + } + if after.Spec.LogLevel != "debug" { + t.Errorf("expected LogLevel=debug, got %q", after.Spec.LogLevel) + } +} + +func TestIsHealthyMatchesRunningStatus(t *testing.T) { + if isHealthy(nil) { + t.Error("nil should not be healthy") + } + if isHealthy(&previewv1.DocumentDB{}) { + t.Error("empty should not be healthy") + } + dd := &previewv1.DocumentDB{Status: previewv1.DocumentDBStatus{Status: ReadyStatus}} + if !isHealthy(dd) { + t.Errorf("%q should be healthy", ReadyStatus) + } + notReady := &previewv1.DocumentDB{Status: previewv1.DocumentDBStatus{Status: "Running"}} + if isHealthy(notReady) { + t.Error(`"Running" should not be healthy (ReadyStatus mismatch)`) + } +} + +func TestWaitHealthyTimeout(t *testing.T) { + s := newScheme(t) + dd := &previewv1.DocumentDB{ObjectMeta: metav1.ObjectMeta{Name: "a", Namespace: "ns1"}} + c := fakeclient.NewClientBuilder().WithScheme(s).WithObjects(dd).Build() + ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) + defer cancel() + err := WaitHealthy(ctx, c, client.ObjectKeyFromObject(dd), 200*time.Millisecond) + if err == nil { + t.Fatal("expected timeout error") + } +} + +func TestDeleteRemovesObject(t *testing.T) { + s := newScheme(t) + dd := &previewv1.DocumentDB{ObjectMeta: metav1.ObjectMeta{Name: "a", Namespace: "ns1"}} + c := fakeclient.NewClientBuilder().WithScheme(s).WithObjects(dd).Build() + ctx := context.Background() + if err := Delete(ctx, c, dd, 2*time.Second); err != nil { + t.Fatalf("Delete: %v", err) + } + if _, err := Get(ctx, c, client.ObjectKeyFromObject(dd)); err == nil { + t.Fatal("expected Get to fail after Delete") + } +} + +func TestPatchInstances_UpdatesSpec(t *testing.T) { + s := newScheme(t) + dd := &previewv1.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{Name: "dd", Namespace: "ns1"}, + Spec: previewv1.DocumentDBSpec{NodeCount: 1, InstancesPerNode: 2}, + } + c := fakeclient.NewClientBuilder().WithScheme(s).WithObjects(dd).Build() + ctx := context.Background() + + if err := PatchInstances(ctx, c, "ns1", "dd", 3); err != nil { + t.Fatalf("PatchInstances: %v", err) + } + got, err := Get(ctx, c, types.NamespacedName{Namespace: "ns1", Name: "dd"}) + if err != nil { + t.Fatalf("Get: %v", err) + } + if got.Spec.InstancesPerNode != 3 { + t.Fatalf("InstancesPerNode=%d, want 3", got.Spec.InstancesPerNode) + } +} + +func TestPatchInstances_NoopWhenEqual(t *testing.T) { + s := newScheme(t) + dd := &previewv1.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{Name: "dd", Namespace: "ns1", ResourceVersion: "7"}, + Spec: previewv1.DocumentDBSpec{NodeCount: 1, InstancesPerNode: 2}, + } + c := fakeclient.NewClientBuilder().WithScheme(s).WithObjects(dd).Build() + if err := PatchInstances(context.Background(), c, "ns1", "dd", 2); err != nil { + t.Fatalf("PatchInstances no-op: %v", err) + } +} + +func TestPatchInstances_RejectsOutOfRange(t *testing.T) { + s := newScheme(t) + c := fakeclient.NewClientBuilder().WithScheme(s).Build() + for _, n := range []int{0, 4, -1} { + if err := PatchInstances(context.Background(), c, "ns1", "dd", n); err == nil { + t.Errorf("PatchInstances(%d) expected error, got nil", n) + } + } +} + +func TestPatchInstances_NotFound(t *testing.T) { + s := newScheme(t) + c := fakeclient.NewClientBuilder().WithScheme(s).Build() + if err := PatchInstances(context.Background(), c, "ns1", "missing", 2); err == nil { + t.Fatal("expected error for missing DocumentDB") + } +} + +func mustWrite(t *testing.T, path, content string) { + t.Helper() + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + t.Fatalf("mkdir: %v", err) + } + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatalf("write: %v", err) + } +} + +// TestCreateAppliesTLSDisabledMixin uses the real base + tls_disabled +// mixin shipped under test/e2e/manifests/ to prove the multi-document +// merge in Create is no longer a silent drop: the mixin's +// Spec.TLS.Gateway.Mode must round-trip to the created object. +func TestCreateAppliesTLSDisabledMixin(t *testing.T) { + root := realManifestsRoot(t) + s := newScheme(t) + c := fakeclient.NewClientBuilder().WithScheme(s).Build() + + obj, err := Create(context.Background(), c, "ns1", "dd1", CreateOptions{ + Base: "documentdb", + Mixins: []string{"tls_disabled"}, + ManifestsRoot: root, + Vars: map[string]string{ + "INSTANCES": "1", + "STORAGE_SIZE": "1Gi", + "STORAGE_CLASS": "standard", + "DOCUMENTDB_IMAGE": "ghcr.io/example/ddb:test", + "GATEWAY_IMAGE": "ghcr.io/example/gw:test", + "CREDENTIAL_SECRET": "documentdb-credentials", + "EXPOSURE_TYPE": "ClusterIP", + "LOG_LEVEL": "info", + }, + }) + if err != nil { + t.Fatalf("Create: %v", err) + } + // Assert against the returned object and re-Get it from the fake + // client; both paths must reflect the merged mixin. + if obj.Spec.TLS == nil || obj.Spec.TLS.Gateway == nil { + t.Fatalf("returned object missing Spec.TLS.Gateway; got %+v", obj.Spec) + } + if obj.Spec.TLS.Gateway.Mode != "Disabled" { + t.Fatalf("returned Spec.TLS.Gateway.Mode=%q, want Disabled", obj.Spec.TLS.Gateway.Mode) + } + + got, err := Get(context.Background(), c, types.NamespacedName{Namespace: "ns1", Name: "dd1"}) + if err != nil { + t.Fatalf("Get back: %v", err) + } + if got.Spec.TLS == nil || got.Spec.TLS.Gateway == nil { + t.Fatalf("stored object missing Spec.TLS.Gateway; got %+v", got.Spec) + } + if got.Spec.TLS.Gateway.Mode != "Disabled" { + t.Fatalf("stored Spec.TLS.Gateway.Mode=%q, want Disabled", got.Spec.TLS.Gateway.Mode) + } + // Base fields must still be present after the merge. + if got.Spec.InstancesPerNode != 1 { + t.Errorf("Spec.InstancesPerNode=%d, want 1", got.Spec.InstancesPerNode) + } + if got.Spec.Resource.Storage.PvcSize != "1Gi" { + t.Errorf("Spec.Resource.Storage.PvcSize=%q, want 1Gi", got.Spec.Resource.Storage.PvcSize) + } +} + +// TestCreateAppliesReclaimRetainMixin exercises the same multi-doc +// merge path with a mixin that nests Spec.Resource.Storage — verifying +// the deep-merge preserves sibling keys (PvcSize, StorageClass) while +// adding PersistentVolumeReclaimPolicy from the mixin. +func TestCreateAppliesReclaimRetainMixin(t *testing.T) { + root := realManifestsRoot(t) + s := newScheme(t) + c := fakeclient.NewClientBuilder().WithScheme(s).Build() + + obj, err := Create(context.Background(), c, "ns1", "dd2", CreateOptions{ + Base: "documentdb", + Mixins: []string{"reclaim_retain"}, + ManifestsRoot: root, + Vars: map[string]string{ + "INSTANCES": "1", + "STORAGE_SIZE": "2Gi", + "STORAGE_CLASS": "standard", + "DOCUMENTDB_IMAGE": "ghcr.io/example/ddb:test", + "GATEWAY_IMAGE": "ghcr.io/example/gw:test", + "CREDENTIAL_SECRET": "documentdb-credentials", + "EXPOSURE_TYPE": "ClusterIP", + "LOG_LEVEL": "info", + }, + }) + if err != nil { + t.Fatalf("Create: %v", err) + } + if obj.Spec.Resource.Storage.PersistentVolumeReclaimPolicy != "Retain" { + t.Fatalf("Spec.Resource.Storage.PersistentVolumeReclaimPolicy=%q, want Retain", + obj.Spec.Resource.Storage.PersistentVolumeReclaimPolicy) + } + if obj.Spec.Resource.Storage.PvcSize != "2Gi" { + t.Errorf("Spec.Resource.Storage.PvcSize=%q, want 2Gi (base preserved after merge)", + obj.Spec.Resource.Storage.PvcSize) + } +} + +// realManifestsRoot returns the absolute path to test/e2e/manifests so +// the round-trip tests exercise the real templates rather than the +// synthetic fixtures that the RenderCR-only tests build with t.TempDir. +// Anchored off runtime.Caller so `go test` from any directory works. +func realManifestsRoot(t *testing.T) string { + t.Helper() + _, thisFile, _, ok := goruntime.Caller(0) + if !ok { + t.Fatal("runtime.Caller failed — cannot locate test/e2e/manifests") + } + // this file: test/e2e/pkg/e2eutils/documentdb/documentdb_test.go + // walk up to test/e2e, then into manifests. + root := filepath.Join(filepath.Dir(thisFile), "..", "..", "..", "manifests") + if _, err := os.Stat(filepath.Join(root, "base", "documentdb"+templateExt)); err != nil { + t.Fatalf("manifests root not found at %s: %v", root, err) + } + return root +} diff --git a/test/e2e/pkg/e2eutils/fixtures/.keep b/test/e2e/pkg/e2eutils/fixtures/.keep new file mode 100644 index 00000000..e69de29b diff --git a/test/e2e/pkg/e2eutils/fixtures/fixtures.go b/test/e2e/pkg/e2eutils/fixtures/fixtures.go new file mode 100644 index 00000000..a62ed5d1 --- /dev/null +++ b/test/e2e/pkg/e2eutils/fixtures/fixtures.go @@ -0,0 +1,478 @@ +// Package fixtures provides session-scoped test fixtures shared across +// DocumentDB e2e test areas. Two cluster fixtures are supported: +// +// - SharedRO: a 1-instance read-only DocumentDB reused by data/, +// performance/ and status/ specs. Specs isolate via per-spec Mongo +// database names (see DBNameFor). +// - SharedScale: a 2-instance mutable DocumentDB reused by scale/ +// specs. Callers must call ResetToTwoInstances in AfterEach. +// +// Both fixtures are created lazily via sync.Once guards and torn down +// explicitly from the area suite_test.go AfterSuite. +// +// Ownership labels (LabelRunID, LabelFixture, LabelArea) are stamped on +// every namespace and CR fixtures create so TeardownSharedRO / +// TeardownSharedScale can list-by-label instead of delete-by-name — +// that avoids cross-binary teardown collisions described in the Phase 1 +// rubber-duck review. +package fixtures + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "fmt" + "os" + "path/filepath" + "runtime" + "sync" + "time" + + "github.com/cloudnative-pg/cloudnative-pg/tests/utils/envsubst" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/yaml" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" + + documentdbutil "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" +) + +// Ownership label keys stamped on every fixture-created namespace and +// DocumentDB CR. Exported so parallel tooling (CI cleanup scripts, +// kubectl one-liners) can use the same selectors. +const ( + LabelRunID = "e2e.documentdb.io/run-id" + LabelFixture = "e2e.documentdb.io/fixture" + LabelArea = "e2e.documentdb.io/area" +) + +// Fixture kind label values. +const ( + FixtureSharedRO = "shared-ro" + FixtureSharedScale = "shared-scale" + // FixturePerSpec is stamped on namespaces and secrets created per + // individual spec (i.e. not shared across specs). Area-specific + // helpers_test.go files use this value via CreateLabeledNamespace. + FixturePerSpec = "per-spec" +) + +// DefaultCredentialPassword / DefaultCredentialUsername expose the seed +// credentials used by both shared and per-spec fixture secrets. Area +// helpers_test.go files import these instead of re-declaring string +// literals; that way a credential change ripples out in one edit. +const ( + DefaultCredentialPassword = defaultCredentialPassword + DefaultCredentialUsername = defaultCredentialUsername +) + +// procID returns the Ginkgo parallel process identifier as a string, +// falling back to "1" when unset. This lets per-process fixtures coexist +// safely in a single kind cluster during ginkgo -p runs. +func procID() string { + if v, ok := os.LookupEnv("GINKGO_PARALLEL_PROCESS"); ok && v != "" { + return v + } + return "1" +} + +// runIDMu guards runIDVal. fixtures cannot import the parent e2e +// package (it would create an import cycle); instead the root suite +// calls SetRunID once during SetupSuite. +var ( + runIDMu sync.RWMutex + runIDVal string +) + +// SetRunID records the suite-wide run identifier. Call exactly once +// from the root suite.go after resolving the identifier from the +// environment. Subsequent calls with the same value are no-ops; calls +// with a different non-empty value are ignored (first-writer-wins) to +// keep fixture naming stable if a worker races with the primary node. +func SetRunID(id string) { + if id == "" { + return + } + runIDMu.Lock() + defer runIDMu.Unlock() + if runIDVal == "" { + runIDVal = id + } +} + +// RunID returns the identifier previously recorded by SetRunID, or +// "unset" if SetRunID was never called. The fallback exists so unit +// tests that exercise fixture helpers directly still produce valid +// Kubernetes names; production code paths always call SetRunID first. +func RunID() string { + runIDMu.RLock() + defer runIDMu.RUnlock() + if runIDVal == "" { + return "unset" + } + return runIDVal +} + +// resetRunIDForTest clears the cached run id for unit tests. +func resetRunIDForTest() { + runIDMu.Lock() + defer runIDMu.Unlock() + runIDVal = "" +} + +// defaultCredentialSecretName is the credential secret created alongside +// every shared fixture cluster. Tests read these credentials through +// pkg/e2eutils/mongo helpers. +const defaultCredentialSecretName = "documentdb-credentials" + +// DefaultCredentialSecretName is the exported alias of the credential +// secret name created by the shared fixtures. Exported so cross-package +// helpers (e.g., pkg/e2eutils/mongo) can discover the secret without +// duplicating the string literal. +const DefaultCredentialSecretName = defaultCredentialSecretName + +// defaultCredentialUsername / defaultCredentialPassword are the seed +// credentials stamped into the per-fixture credential secret. +const ( + defaultCredentialUsername = "e2e_admin" + defaultCredentialPassword = "E2eAdmin100" //nolint:gosec // fixture-only +) + +// defaultDocumentDBImage / defaultGatewayImage are empty by default so +// the operator composes the cluster itself: CNPG pg18 base image + +// DocumentDB extension via the image-library mechanism + gateway as a +// separate sidecar image. Setting a single monolithic image here would +// make CNPG run the wrong container for postgres. CI pins real images +// via DOCUMENTDB_IMAGE / GATEWAY_IMAGE environment variables. +const ( + defaultDocumentDBImage = "" + defaultGatewayImage = "" +) + +// defaultStorageSize / defaultStorageClass are conservative defaults +// used by both shared fixtures. Override via E2E_STORAGE_SIZE / +// E2E_STORAGE_CLASS environment variables when targeting non-kind +// clusters. +const ( + defaultStorageSize = "1Gi" + defaultStorageClass = "standard" +) + +// defaultFixtureCreateTimeout / defaultFixtureDeleteTimeout / defaultPollInterval +// bound waits performed inside this package. They intentionally do not +// depend on the sibling timeouts package so that fixture setup is not +// delayed by a missing helper. +const ( + defaultFixtureCreateTimeout = 10 * time.Minute + defaultFixtureDeleteTimeout = 5 * time.Minute + defaultPollInterval = 5 * time.Second +) + +// manifestsDir returns the absolute path to the test/e2e/manifests +// directory regardless of the caller's working directory. It relies on +// runtime.Caller to anchor off this source file. +func manifestsDir() (string, error) { + _, thisFile, _, ok := runtime.Caller(0) + if !ok { + return "", fmt.Errorf("runtime.Caller failed while locating manifests") + } + // this file lives at test/e2e/pkg/e2eutils/fixtures/.go — walk up + // four dirs to reach test/e2e, then descend into manifests/. + return filepath.Join(filepath.Dir(thisFile), "..", "..", "..", "manifests"), nil +} + +// renderTemplate applies envsubst to the template at path relative to +// manifestsDir() and unmarshals the result into a DocumentDB CR. +func renderDocumentDB(relPath string, vars map[string]string) (*previewv1.DocumentDB, error) { + root, err := manifestsDir() + if err != nil { + return nil, err + } + data, err := os.ReadFile(filepath.Join(root, relPath)) + if err != nil { + return nil, fmt.Errorf("reading template %s: %w", relPath, err) + } + rendered, err := envsubst.Envsubst(vars, documentdbutil.DropEmptyVarLines(data, vars)) + if err != nil { + return nil, fmt.Errorf("envsubst on %s: %w", relPath, err) + } + out := &previewv1.DocumentDB{} + if err := yaml.Unmarshal(rendered, out); err != nil { + return nil, fmt.Errorf("unmarshal rendered %s: %w", relPath, err) + } + return out, nil +} + +// ownershipLabels returns the canonical ownership labels applied to +// every fixture-created object. area may be empty when the caller is a +// cross-area helper. +func ownershipLabels(fixture, area string) map[string]string { + l := map[string]string{ + LabelRunID: RunID(), + LabelFixture: fixture, + } + if area != "" { + l[LabelArea] = area + } + return l +} + +// ensureNamespace creates the namespace if it is missing and stamps the +// ownership labels onto it. If the namespace already exists its labels +// are validated: a mismatched LabelRunID returns a collision error. +func ensureNamespace(ctx context.Context, c client.Client, name, fixture string) error { + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: ownershipLabels(fixture, ""), + }, + } + err := c.Create(ctx, ns) + if err == nil { + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("create namespace %s: %w", name, err) + } + existing := &corev1.Namespace{} + if getErr := c.Get(ctx, types.NamespacedName{Name: name}, existing); getErr != nil { + return fmt.Errorf("get existing namespace %s: %w", name, getErr) + } + if got := existing.Labels[LabelRunID]; got != RunID() { + return fmt.Errorf("fixture collision: namespace %s exists with run-id=%q (current run-id=%q)", + name, got, RunID()) + } + if got := existing.Labels[LabelFixture]; got != "" && got != fixture { + return fmt.Errorf("fixture collision: namespace %s exists with fixture=%q (want %q)", + name, got, fixture) + } + return nil +} + +// ensureCredentialSecret creates the fixture credential secret if it is +// missing. The secret schema matches the DocumentDB operator's contract +// (keys "username" and "password"). +func ensureCredentialSecret(ctx context.Context, c client.Client, namespace, name, fixture string) error { + sec := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: ownershipLabels(fixture, ""), + }, + Type: corev1.SecretTypeOpaque, + StringData: map[string]string{ + "username": defaultCredentialUsername, + "password": defaultCredentialPassword, + }, + } + if err := c.Create(ctx, sec); err != nil && !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("create credential secret %s/%s: %w", namespace, name, err) + } + return nil +} + +// CreateLabeledNamespace creates a per-spec namespace stamped with the +// suite run-id, fixture=per-spec, and the caller-supplied area label. +// It is the exported entry point that area helpers_test.go files call +// in BeforeEach; the labels let CI cleanup scripts reap orphaned +// namespaces by selector even when a spec panics before AfterEach. +// +// Semantics on AlreadyExists mirror ensureNamespace: an existing +// namespace with the current run-id (or no run-id label) is adopted; a +// mismatched run-id is a collision and returns an error. +func CreateLabeledNamespace(ctx context.Context, c client.Client, name, area string) error { + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: ownershipLabels(FixturePerSpec, area), + }, + } + err := c.Create(ctx, ns) + if err == nil { + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("create namespace %s: %w", name, err) + } + existing := &corev1.Namespace{} + if getErr := c.Get(ctx, types.NamespacedName{Name: name}, existing); getErr != nil { + return fmt.Errorf("get existing namespace %s: %w", name, getErr) + } + if got := existing.Labels[LabelRunID]; got != "" && got != RunID() { + return fmt.Errorf("fixture collision: namespace %s exists with run-id=%q (current run-id=%q)", + name, got, RunID()) + } + return nil +} + +// CreateLabeledCredentialSecret creates the default DocumentDB +// credential secret (DefaultCredentialSecretName) in namespace with the +// same labels CreateLabeledNamespace stamps. Idempotent: an existing +// secret is treated as success regardless of label state, matching the +// contract of ensureCredentialSecret used by shared fixtures. +func CreateLabeledCredentialSecret(ctx context.Context, c client.Client, namespace string) error { + return ensureCredentialSecret(ctx, c, namespace, defaultCredentialSecretName, FixturePerSpec) +} + +// baseVars returns the envsubst variable map shared by both fixtures. +func baseVars(namespace, name, instances string) map[string]string { + documentdbImage := defaultDocumentDBImage + if v := os.Getenv("DOCUMENTDB_IMAGE"); v != "" { + documentdbImage = v + } + gatewayImage := defaultGatewayImage + if v := os.Getenv("GATEWAY_IMAGE"); v != "" { + gatewayImage = v + } + storageSize := defaultStorageSize + if v := os.Getenv("E2E_STORAGE_SIZE"); v != "" { + storageSize = v + } + storageClass := defaultStorageClass + if v := os.Getenv("E2E_STORAGE_CLASS"); v != "" { + storageClass = v + } + return map[string]string{ + "NAMESPACE": namespace, + "NAME": name, + "INSTANCES": instances, + "STORAGE_SIZE": storageSize, + "STORAGE_CLASS": storageClass, + "DOCUMENTDB_IMAGE": documentdbImage, + "GATEWAY_IMAGE": gatewayImage, + "CREDENTIAL_SECRET": defaultCredentialSecretName, + "EXPOSURE_TYPE": "ClusterIP", + "LOG_LEVEL": "info", + } +} + +// createDocumentDB creates the supplied CR if absent, stamping the +// ownership labels onto it. On AlreadyExists it validates the existing +// CR's run-id label matches the current RunID(); a mismatch returns an +// explicit collision error so the caller can abort rather than adopt a +// foreign fixture. +func createDocumentDB(ctx context.Context, c client.Client, dd *previewv1.DocumentDB, fixture string) error { + if dd.Labels == nil { + dd.Labels = map[string]string{} + } + for k, v := range ownershipLabels(fixture, "") { + if _, present := dd.Labels[k]; !present { + dd.Labels[k] = v + } + } + err := c.Create(ctx, dd) + if err == nil { + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("create documentdb %s/%s: %w", dd.Namespace, dd.Name, err) + } + existing := &previewv1.DocumentDB{} + key := types.NamespacedName{Namespace: dd.Namespace, Name: dd.Name} + if getErr := c.Get(ctx, key, existing); getErr != nil { + return fmt.Errorf("get existing documentdb %s: %w", key, getErr) + } + if got := existing.Labels[LabelRunID]; got != RunID() { + return fmt.Errorf("fixture collision: existing CR %s/%s belongs to run %q (current %q)", + dd.Namespace, dd.Name, got, RunID()) + } + if got := existing.Labels[LabelFixture]; got != "" && got != fixture { + return fmt.Errorf("fixture collision: existing CR %s/%s has fixture=%q (want %q)", + dd.Namespace, dd.Name, got, fixture) + } + return nil +} + +// waitDocumentDBHealthy polls the DocumentDB CR until its status +// reports the canonical healthy string used by the operator and CI. +func waitDocumentDBHealthy(ctx context.Context, c client.Client, namespace, name string, timeout time.Duration) error { + return wait.PollUntilContextTimeout(ctx, defaultPollInterval, timeout, true, func(ctx context.Context) (bool, error) { + dd := &previewv1.DocumentDB{} + if err := c.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, dd); err != nil { + if apierrors.IsNotFound(err) { + return false, nil + } + return false, err + } + return dd.Status.Status == documentdbutil.ReadyStatus, nil + }) +} + +// deleteDocumentDB deletes the DocumentDB CR and waits for it to be +// fully removed. +func deleteDocumentDB(ctx context.Context, c client.Client, namespace, name string, timeout time.Duration) error { + dd := &previewv1.DocumentDB{ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace}} + if err := c.Delete(ctx, dd); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("delete documentdb %s/%s: %w", namespace, name, err) + } + return wait.PollUntilContextTimeout(ctx, defaultPollInterval, timeout, true, func(ctx context.Context) (bool, error) { + err := c.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, &previewv1.DocumentDB{}) + if apierrors.IsNotFound(err) { + return true, nil + } + return false, err + }) +} + +// deleteNamespace deletes the namespace and waits for termination. Used +// from fixture teardown. +func deleteNamespace(ctx context.Context, c client.Client, name string, timeout time.Duration) error { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: name}} + if err := c.Delete(ctx, ns); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("delete namespace %s: %w", name, err) + } + return wait.PollUntilContextTimeout(ctx, defaultPollInterval, timeout, true, func(ctx context.Context) (bool, error) { + err := c.Get(ctx, types.NamespacedName{Name: name}, &corev1.Namespace{}) + if apierrors.IsNotFound(err) { + return true, nil + } + return false, err + }) +} + +// teardownFixtureByLabels performs a label-selector-driven teardown of +// all resources owned by the current RunID() for the given fixture. It +// first deletes any matching DocumentDB CRs (waiting for finalizers), +// then deletes matching namespaces. Callers must pass the same fixture +// constant they used when creating the resources. +func teardownFixtureByLabels(ctx context.Context, c client.Client, fixture string) error { + sel := client.MatchingLabels{ + LabelRunID: RunID(), + LabelFixture: fixture, + } + // Step 1: delete DocumentDB CRs cluster-wide. + dds := &previewv1.DocumentDBList{} + if err := c.List(ctx, dds, sel); err != nil { + return fmt.Errorf("list %s DocumentDB CRs: %w", fixture, err) + } + for i := range dds.Items { + dd := &dds.Items[i] + if err := deleteDocumentDB(ctx, c, dd.Namespace, dd.Name, defaultFixtureDeleteTimeout); err != nil { + return fmt.Errorf("delete %s DocumentDB %s/%s: %w", fixture, dd.Namespace, dd.Name, err) + } + } + // Step 2: delete namespaces. + nss := &corev1.NamespaceList{} + if err := c.List(ctx, nss, sel); err != nil { + return fmt.Errorf("list %s namespaces: %w", fixture, err) + } + for i := range nss.Items { + ns := &nss.Items[i] + if err := deleteNamespace(ctx, c, ns.Name, defaultFixtureDeleteTimeout); err != nil { + return fmt.Errorf("delete %s namespace %s: %w", fixture, ns.Name, err) + } + } + return nil +} + +// DBNameFor returns a deterministic Mongo database name derived from +// the supplied spec text (typically ginkgo's CurrentSpecReport().FullText()). +// The returned string matches "db_" and is safe for Mongo. +func DBNameFor(specText string) string { + sum := sha256.Sum256([]byte(specText)) + return "db_" + hex.EncodeToString(sum[:])[:12] +} diff --git a/test/e2e/pkg/e2eutils/fixtures/fixtures_test.go b/test/e2e/pkg/e2eutils/fixtures/fixtures_test.go new file mode 100644 index 00000000..e0d1f3b8 --- /dev/null +++ b/test/e2e/pkg/e2eutils/fixtures/fixtures_test.go @@ -0,0 +1,215 @@ +package fixtures + +import ( + "context" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" +) + +func TestDBNameForDeterministic(t *testing.T) { + a := DBNameFor("spec text one") + b := DBNameFor("spec text one") + if a != b { + t.Fatalf("DBNameFor not deterministic: %q != %q", a, b) + } + c := DBNameFor("spec text two") + if a == c { + t.Fatalf("DBNameFor collision for distinct inputs: %q", a) + } + if len(a) != len("db_")+12 { + t.Fatalf("DBNameFor returned unexpected length %q", a) + } + if a[:3] != "db_" { + t.Fatalf("DBNameFor prefix wrong: %q", a) + } +} + +func TestRenderBaseDocumentDB(t *testing.T) { + vars := baseVars("ns", "cluster", "1") + dd, err := renderDocumentDB("base/documentdb.yaml.template", vars) + if err != nil { + t.Fatalf("render failed: %v", err) + } + if dd.Namespace != "ns" || dd.Name != "cluster" { + t.Fatalf("unexpected name/namespace: %s/%s", dd.Namespace, dd.Name) + } + if dd.Spec.NodeCount != 1 { + t.Fatalf("expected nodeCount=1, got %d", dd.Spec.NodeCount) + } + if dd.Spec.InstancesPerNode != 1 { + t.Fatalf("expected instancesPerNode=1, got %d", dd.Spec.InstancesPerNode) + } + if dd.Spec.Resource.Storage.PvcSize == "" { + t.Fatalf("expected pvcSize to be set") + } + if dd.Spec.ExposeViaService.ServiceType != "ClusterIP" { + t.Fatalf("expected ClusterIP exposure, got %q", dd.Spec.ExposeViaService.ServiceType) + } + if _, ok := interface{}(dd).(*previewv1.DocumentDB); !ok { + t.Fatalf("render did not produce *DocumentDB") + } +} + +func TestRenderTLSMixins(t *testing.T) { + for _, tc := range []struct { + path string + wantMode string + }{ + {"mixins/tls_disabled.yaml.template", "Disabled"}, + {"mixins/tls_selfsigned.yaml.template", "SelfSigned"}, + } { + vars := map[string]string{"NAMESPACE": "ns", "NAME": "c"} + dd, err := renderDocumentDB(tc.path, vars) + if err != nil { + t.Fatalf("render %s: %v", tc.path, err) + } + if dd.Spec.TLS == nil || dd.Spec.TLS.Gateway == nil || dd.Spec.TLS.Gateway.Mode != tc.wantMode { + t.Fatalf("%s: expected mode %q, got %+v", tc.path, tc.wantMode, dd.Spec.TLS) + } + } +} + +// TODO(e2e/feature-gates): re-introduce a ChangeStreams mixin-render +// test once the suite ships with a change-stream-capable DocumentDB +// image. The feature is experimental and requires a custom image +// variant (the `-changestream` tag line) that is not part of the +// default e2e image set, so we removed the render+behaviour tests to +// keep the default pipeline green. The API symbol +// previewv1.FeatureGateChangeStreams and the operator's wal_level +// translation remain in place — this is purely about test coverage. + +// The following tests exercise the label-selector teardown contract and +// the AlreadyExists run-id mismatch error path. They use the +// controller-runtime fake client so they can run without a real +// Kubernetes API. + +func TestOwnershipLabels(t *testing.T) { +resetRunIDForTest() +SetRunID("abcd1234") +labels := ownershipLabels(FixtureSharedRO, "lifecycle") +if labels[LabelRunID] != "abcd1234" { +t.Fatalf("run-id label = %q", labels[LabelRunID]) +} +if labels[LabelFixture] != FixtureSharedRO { +t.Fatalf("fixture label = %q", labels[LabelFixture]) +} +if labels[LabelArea] != "lifecycle" { +t.Fatalf("area label = %q", labels[LabelArea]) +} +// Empty area must not be recorded at all. +if _, ok := ownershipLabels(FixtureSharedRO, "")[LabelArea]; ok { +t.Fatalf("area label present for empty area") +} +} + +func TestRunIDFirstWriterWins(t *testing.T) { +resetRunIDForTest() +SetRunID("first") +SetRunID("second") +if got := RunID(); got != "first" { +t.Fatalf("RunID after conflicting sets = %q, want %q", got, "first") +} +resetRunIDForTest() +if got := RunID(); got != "unset" { +t.Fatalf("reset RunID = %q, want \"unset\"", got) +} +} + +// newFakeClient builds a controller-runtime fake client registered for +// the core + preview schemes used by the fixtures helpers. +func newFakeClient(t *testing.T) *fakeclient.ClientBuilder { +t.Helper() +s := runtime.NewScheme() +if err := corev1.AddToScheme(s); err != nil { +t.Fatalf("corev1 AddToScheme: %v", err) +} +if err := previewv1.AddToScheme(s); err != nil { +t.Fatalf("previewv1 AddToScheme: %v", err) +} +return fakeclient.NewClientBuilder().WithScheme(s) +} + +func TestCreateLabeledNamespaceStampsLabels(t *testing.T) { +resetRunIDForTest() +SetRunID("r-create") +c := newFakeClient(t).Build() +if err := CreateLabeledNamespace(context.Background(), c, "ns-a", "lifecycle"); err != nil { +t.Fatalf("CreateLabeledNamespace: %v", err) +} +got := &corev1.Namespace{} +if err := c.Get(context.Background(), types.NamespacedName{Name: "ns-a"}, got); err != nil { +t.Fatalf("Get: %v", err) +} +if got.Labels[LabelRunID] != "r-create" || +got.Labels[LabelFixture] != FixturePerSpec || +got.Labels[LabelArea] != "lifecycle" { +t.Fatalf("unexpected labels: %v", got.Labels) +} +} + +func TestCreateLabeledNamespaceAdoptsMatchingRunID(t *testing.T) { +resetRunIDForTest() +SetRunID("r-adopt") +existing := &corev1.Namespace{ +ObjectMeta: metav1.ObjectMeta{ +Name: "ns-b", +Labels: map[string]string{LabelRunID: "r-adopt"}, +}, +} +c := newFakeClient(t).WithObjects(existing).Build() +if err := CreateLabeledNamespace(context.Background(), c, "ns-b", "lifecycle"); err != nil { +t.Fatalf("expected adoption on matching run-id, got: %v", err) +} +} + +func TestCreateLabeledNamespaceRejectsRunIDMismatch(t *testing.T) { +resetRunIDForTest() +SetRunID("r-current") +existing := &corev1.Namespace{ +ObjectMeta: metav1.ObjectMeta{ +Name: "ns-c", +Labels: map[string]string{LabelRunID: "r-stale"}, +}, +} +c := newFakeClient(t).WithObjects(existing).Build() +err := CreateLabeledNamespace(context.Background(), c, "ns-c", "lifecycle") +if err == nil { +t.Fatalf("expected collision error, got nil") +} +} + +func TestCreateLabeledCredentialSecret(t *testing.T) { +resetRunIDForTest() +SetRunID("r-sec") +c := newFakeClient(t).Build() +if err := CreateLabeledCredentialSecret(context.Background(), c, "ns-s"); err != nil { +t.Fatalf("CreateLabeledCredentialSecret: %v", err) +} +got := &corev1.Secret{} +if err := c.Get(context.Background(), types.NamespacedName{ +Namespace: "ns-s", Name: DefaultCredentialSecretName, +}, got); err != nil { +t.Fatalf("Get: %v", err) +} +if string(got.Data["username"]) != DefaultCredentialUsername { +// fake client promotes StringData to Data on read; both keys must match. +if got.StringData["username"] != DefaultCredentialUsername { +t.Fatalf("username mismatch: data=%q stringData=%q", +got.Data["username"], got.StringData["username"]) +} +} +if got.Labels[LabelRunID] != "r-sec" || got.Labels[LabelFixture] != FixturePerSpec { +t.Fatalf("unexpected labels: %v", got.Labels) +} +// Second call must not error even though the secret already exists. +if err := CreateLabeledCredentialSecret(context.Background(), c, "ns-s"); err != nil { +t.Fatalf("idempotent CreateLabeledCredentialSecret returned: %v", err) +} +} diff --git a/test/e2e/pkg/e2eutils/fixtures/shared_ro.go b/test/e2e/pkg/e2eutils/fixtures/shared_ro.go new file mode 100644 index 00000000..de9dd302 --- /dev/null +++ b/test/e2e/pkg/e2eutils/fixtures/shared_ro.go @@ -0,0 +1,100 @@ +package fixtures + +import ( + "context" + "fmt" + "sync" + + "sigs.k8s.io/controller-runtime/pkg/client" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" +) + +// SharedRONamespace returns the per-process namespace name used by the +// shared read-only fixture cluster. The name embeds the current RunID +// so concurrent runs (e.g., parallel CI jobs) cannot collide on the +// same namespace and stomp one another during teardown. +func SharedRONamespace() string { + return fmt.Sprintf("e2e-shared-ro-%s-%s", RunID(), procID()) +} + +// SharedROName is the DocumentDB CR name used by the shared read-only +// fixture cluster. +const SharedROName = "shared-ro" + +// SharedROHandle is a read-only proxy over the shared RO DocumentDB +// cluster. Callers must NOT mutate the underlying CR. The handle only +// exposes accessors; there are no Patch/Delete methods. +type SharedROHandle struct { + namespace string + name string +} + +// Namespace returns the namespace of the shared RO cluster. +func (h *SharedROHandle) Namespace() string { return h.namespace } + +// Name returns the name of the shared RO cluster. +func (h *SharedROHandle) Name() string { return h.name } + +// GetCR fetches a fresh copy of the underlying DocumentDB CR. The +// returned CR is a deep copy; mutating it has no effect on the live +// resource. Callers that try to Update/Patch the returned CR against +// the API server will succeed silently only if they re-use the real +// client — prefer to treat this as read-only. +func (h *SharedROHandle) GetCR(ctx context.Context, c client.Client) (*previewv1.DocumentDB, error) { + dd := &previewv1.DocumentDB{} + if err := c.Get(ctx, client.ObjectKey{Namespace: h.namespace, Name: h.name}, dd); err != nil { + return nil, fmt.Errorf("get shared-ro documentdb: %w", err) + } + return dd, nil +} + +var ( + sharedRO *SharedROHandle + sharedROOnce sync.Once + sharedROErr error +) + +// GetOrCreateSharedRO returns the session-scoped shared read-only +// DocumentDB fixture, creating it lazily on first call. Subsequent +// calls return the same handle. Errors are cached: a failed first +// attempt will not be retried within the same process. +func GetOrCreateSharedRO(ctx context.Context, c client.Client) (*SharedROHandle, error) { + sharedROOnce.Do(func() { + ns := SharedRONamespace() + if err := ensureNamespace(ctx, c, ns, FixtureSharedRO); err != nil { + sharedROErr = err + return + } + if err := ensureCredentialSecret(ctx, c, ns, defaultCredentialSecretName, FixtureSharedRO); err != nil { + sharedROErr = err + return + } + dd, err := renderDocumentDB("base/documentdb.yaml.template", baseVars(ns, SharedROName, "1")) + if err != nil { + sharedROErr = err + return + } + if err := createDocumentDB(ctx, c, dd, FixtureSharedRO); err != nil { + sharedROErr = err + return + } + if err := waitDocumentDBHealthy(ctx, c, ns, SharedROName, defaultFixtureCreateTimeout); err != nil { + sharedROErr = fmt.Errorf("waiting for shared-ro to become healthy: %w", err) + return + } + sharedRO = &SharedROHandle{namespace: ns, name: SharedROName} + }) + return sharedRO, sharedROErr +} + +// TeardownSharedRO deletes every resource stamped with +// (LabelRunID=RunID(), LabelFixture=FixtureSharedRO). This is +// label-selector-driven so a process that never called +// GetOrCreateSharedRO but observes leftover resources from a previous +// run can still clean up. Safe to call multiple times; callers should +// invoke it from SynchronizedAfterSuite. +func TeardownSharedRO(ctx context.Context, c client.Client) error { + sharedRO = nil + return teardownFixtureByLabels(ctx, c, FixtureSharedRO) +} diff --git a/test/e2e/pkg/e2eutils/fixtures/shared_scale.go b/test/e2e/pkg/e2eutils/fixtures/shared_scale.go new file mode 100644 index 00000000..fb69b4e8 --- /dev/null +++ b/test/e2e/pkg/e2eutils/fixtures/shared_scale.go @@ -0,0 +1,156 @@ +package fixtures + +import ( + "context" + "fmt" + "sync" + "time" + + cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" + "sigs.k8s.io/controller-runtime/pkg/client" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" +) + +// SharedScaleNamespace returns the per-process namespace name used by +// the shared scale fixture cluster. The name embeds the current RunID +// so concurrent runs cannot collide on the same namespace during +// teardown. +func SharedScaleNamespace() string { + return fmt.Sprintf("e2e-shared-scale-%s-%s", RunID(), procID()) +} + +// SharedScaleName is the DocumentDB CR name used by the shared scale +// fixture cluster. +const SharedScaleName = "shared-scale" + +// sharedScaleInstances is the baseline InstancesPerNode value the +// scale fixture is created with and reset to between specs. +const sharedScaleInstances = 2 + +// SharedScaleHandle is the mutable handle to the shared scale +// DocumentDB cluster used by tests/scale/. Unlike SharedROHandle it +// exposes full access to the underlying CR and provides ResetToTwoInstances +// to restore state between specs. +type SharedScaleHandle struct { + namespace string + name string +} + +// Namespace returns the namespace of the shared scale cluster. +func (h *SharedScaleHandle) Namespace() string { return h.namespace } + +// Name returns the name of the shared scale cluster. +func (h *SharedScaleHandle) Name() string { return h.name } + +// GetCR fetches the current state of the underlying DocumentDB CR. +func (h *SharedScaleHandle) GetCR(ctx context.Context, c client.Client) (*previewv1.DocumentDB, error) { + dd := &previewv1.DocumentDB{} + if err := c.Get(ctx, client.ObjectKey{Namespace: h.namespace, Name: h.name}, dd); err != nil { + return nil, fmt.Errorf("get shared-scale documentdb: %w", err) + } + return dd, nil +} + +// ResetToTwoInstances restores the shared scale cluster to +// instancesPerNode=sharedScaleInstances (the default 2) and waits for +// both the operator's DocumentDB status to report healthy and the +// underlying CNPG Cluster's readyInstances to equal 2. Call from an +// AfterEach to leave the fixture in a known state for the next spec. +// +// The CNPG convergence wait is essential: the DocumentDB CR status +// can flip to Ready before the PostgreSQL layer has re-added the +// second replica, which would cause the next spec's scale assertions +// to observe a transient single-instance cluster. +func (h *SharedScaleHandle) ResetToTwoInstances(ctx context.Context, c client.Client) error { + dd := &previewv1.DocumentDB{} + if err := c.Get(ctx, client.ObjectKey{Namespace: h.namespace, Name: h.name}, dd); err != nil { + return fmt.Errorf("get shared-scale for reset: %w", err) + } + if dd.Spec.InstancesPerNode != sharedScaleInstances { + patch := client.MergeFrom(dd.DeepCopy()) + dd.Spec.InstancesPerNode = sharedScaleInstances + if err := c.Patch(ctx, dd, patch); err != nil { + return fmt.Errorf("patch shared-scale back to %d instances: %w", sharedScaleInstances, err) + } + } + if err := waitDocumentDBHealthy(ctx, c, h.namespace, h.name, defaultFixtureCreateTimeout); err != nil { + return err + } + return waitCNPGReadyInstances(ctx, c, h.namespace, h.name, sharedScaleInstances, defaultFixtureCreateTimeout) +} + +// waitCNPGReadyInstances polls the CNPG Cluster associated with the +// DocumentDB named (ns, name) until its Status.ReadyInstances matches +// want. The CNPG Cluster is assumed to carry the same name as the +// DocumentDB CR (the non-replicated convention used across the +// operator). +func waitCNPGReadyInstances(ctx context.Context, c client.Client, namespace, name string, want int, timeout time.Duration) error { + return wait.PollUntilContextTimeout(ctx, defaultPollInterval, timeout, true, func(ctx context.Context) (bool, error) { + cl := &cnpgv1.Cluster{} + err := c.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, cl) + if err != nil { + if apierrors.IsNotFound(err) { + return false, nil + } + return false, fmt.Errorf("get CNPG cluster %s/%s: %w", namespace, name, err) + } + return cl.Status.ReadyInstances == want, nil + }) +} + +var ( + sharedScale *SharedScaleHandle + sharedScaleOnce sync.Once + sharedScaleErr error +) + +// GetOrCreateSharedScale returns the session-scoped shared scale +// DocumentDB fixture, creating it lazily on first call. Subsequent +// calls return the same handle. +func GetOrCreateSharedScale(ctx context.Context, c client.Client) (*SharedScaleHandle, error) { + sharedScaleOnce.Do(func() { + ns := SharedScaleNamespace() + if err := ensureNamespace(ctx, c, ns, FixtureSharedScale); err != nil { + sharedScaleErr = err + return + } + if err := ensureCredentialSecret(ctx, c, ns, defaultCredentialSecretName, FixtureSharedScale); err != nil { + sharedScaleErr = err + return + } + dd, err := renderDocumentDB( + "base/documentdb.yaml.template", + baseVars(ns, SharedScaleName, fmt.Sprintf("%d", sharedScaleInstances)), + ) + if err != nil { + sharedScaleErr = err + return + } + if err := createDocumentDB(ctx, c, dd, FixtureSharedScale); err != nil { + sharedScaleErr = err + return + } + if err := waitDocumentDBHealthy(ctx, c, ns, SharedScaleName, defaultFixtureCreateTimeout); err != nil { + sharedScaleErr = fmt.Errorf("waiting for shared-scale to become healthy: %w", err) + return + } + if err := waitCNPGReadyInstances(ctx, c, ns, SharedScaleName, sharedScaleInstances, defaultFixtureCreateTimeout); err != nil { + sharedScaleErr = fmt.Errorf("waiting for CNPG readyInstances=%d: %w", sharedScaleInstances, err) + return + } + sharedScale = &SharedScaleHandle{namespace: ns, name: SharedScaleName} + }) + return sharedScale, sharedScaleErr +} + +// TeardownSharedScale deletes every resource stamped with +// (LabelRunID=RunID(), LabelFixture=FixtureSharedScale). Safe to call +// multiple times; invoke from SynchronizedAfterSuite. +func TeardownSharedScale(ctx context.Context, c client.Client) error { + sharedScale = nil + return teardownFixtureByLabels(ctx, c, FixtureSharedScale) +} diff --git a/test/e2e/pkg/e2eutils/fixtures/teardown_test.go b/test/e2e/pkg/e2eutils/fixtures/teardown_test.go new file mode 100644 index 00000000..7c993f67 --- /dev/null +++ b/test/e2e/pkg/e2eutils/fixtures/teardown_test.go @@ -0,0 +1,177 @@ +package fixtures + +import ( + "context" + "strings" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" +) + +func newFakeScheme(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + if err := corev1.AddToScheme(s); err != nil { + t.Fatalf("corev1: %v", err) + } + if err := previewv1.AddToScheme(s); err != nil { + t.Fatalf("preview: %v", err) + } + return s +} + +// TestTeardownFixtureByLabels_SelectsOnlyMatchingRun creates two sets +// of fixture objects belonging to different run ids and asserts +// teardownFixtureByLabels only removes those tagged with the current +// run id. +func TestTeardownFixtureByLabels_SelectsOnlyMatchingRun(t *testing.T) { + resetRunIDForTest() + SetRunID("runA") + + mineLabels := map[string]string{ + LabelRunID: "runA", + LabelFixture: FixtureSharedRO, + } + theirsLabels := map[string]string{ + LabelRunID: "runB", + LabelFixture: FixtureSharedRO, + } + + mineNS := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "e2e-mine", Labels: mineLabels}} + theirsNS := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "e2e-theirs", Labels: theirsLabels}} + mineDD := &previewv1.DocumentDB{ObjectMeta: metav1.ObjectMeta{ + Name: "dd-mine", Namespace: "e2e-mine", Labels: mineLabels, + }} + theirsDD := &previewv1.DocumentDB{ObjectMeta: metav1.ObjectMeta{ + Name: "dd-theirs", Namespace: "e2e-theirs", Labels: theirsLabels, + }} + + c := fake.NewClientBuilder(). + WithScheme(newFakeScheme(t)). + WithObjects(mineNS, theirsNS, mineDD, theirsDD). + Build() + + ctx := context.Background() + if err := teardownFixtureByLabels(ctx, c, FixtureSharedRO); err != nil { + t.Fatalf("teardown: %v", err) + } + + // Mine should be gone. + if err := c.Get(ctx, types.NamespacedName{Name: "e2e-mine"}, &corev1.Namespace{}); err == nil { + t.Fatalf("expected mine namespace to be deleted") + } + if err := c.Get(ctx, types.NamespacedName{Namespace: "e2e-mine", Name: "dd-mine"}, &previewv1.DocumentDB{}); err == nil { + t.Fatalf("expected mine documentdb to be deleted") + } + + // Theirs must survive. + if err := c.Get(ctx, types.NamespacedName{Name: "e2e-theirs"}, &corev1.Namespace{}); err != nil { + t.Fatalf("theirs namespace should still exist: %v", err) + } + if err := c.Get(ctx, types.NamespacedName{Namespace: "e2e-theirs", Name: "dd-theirs"}, &previewv1.DocumentDB{}); err != nil { + t.Fatalf("theirs documentdb should still exist: %v", err) + } +} + +// TestCreateDocumentDB_RunIDMismatchIsExplicitError exercises the +// adoption-refusal path: when an existing CR has a different run-id +// label the helper must return a descriptive error instead of silently +// adopting a foreign fixture. +func TestCreateDocumentDB_RunIDMismatchIsExplicitError(t *testing.T) { + resetRunIDForTest() + SetRunID("newrun") + + existing := &previewv1.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{ + Name: "shared", + Namespace: "ns", + Labels: map[string]string{ + LabelRunID: "oldrun", + LabelFixture: FixtureSharedRO, + }, + }, + } + c := fake.NewClientBuilder(). + WithScheme(newFakeScheme(t)). + WithObjects(existing). + Build() + + attempt := &previewv1.DocumentDB{ObjectMeta: metav1.ObjectMeta{Name: "shared", Namespace: "ns"}} + err := createDocumentDB(context.Background(), c, attempt, FixtureSharedRO) + if err == nil { + t.Fatal("expected collision error, got nil") + } + if !strings.Contains(err.Error(), "fixture collision") { + t.Fatalf("expected 'fixture collision' error, got: %v", err) + } + if !strings.Contains(err.Error(), "oldrun") || !strings.Contains(err.Error(), "newrun") { + t.Fatalf("error should name both run ids: %v", err) + } +} + +// TestCreateDocumentDB_AdoptsMatchingRun ensures that an AlreadyExists +// result with a matching run-id label is treated as idempotent success +// (this is the lazy-fixture re-entry path). +func TestCreateDocumentDB_AdoptsMatchingRun(t *testing.T) { + resetRunIDForTest() + SetRunID("runX") + + existing := &previewv1.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{ + Name: "shared", + Namespace: "ns", + Labels: map[string]string{ + LabelRunID: "runX", + LabelFixture: FixtureSharedRO, + }, + }, + } + c := fake.NewClientBuilder(). + WithScheme(newFakeScheme(t)). + WithObjects(existing). + Build() + + attempt := &previewv1.DocumentDB{ObjectMeta: metav1.ObjectMeta{Name: "shared", Namespace: "ns"}} + if err := createDocumentDB(context.Background(), c, attempt, FixtureSharedRO); err != nil { + t.Fatalf("expected idempotent success, got %v", err) + } +} + +// TestEnsureNamespace_RunIDMismatchIsExplicitError mirrors the CR test +// for namespace-level collisions. +func TestEnsureNamespace_RunIDMismatchIsExplicitError(t *testing.T) { + resetRunIDForTest() + SetRunID("newrun") + + existing := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "ns", + Labels: map[string]string{ + LabelRunID: "oldrun", + LabelFixture: FixtureSharedRO, + }, + }, + } + c := fake.NewClientBuilder(). + WithScheme(newFakeScheme(t)). + WithObjects(existing). + Build() + + err := ensureNamespace(context.Background(), c, "ns", FixtureSharedRO) + if err == nil { + t.Fatal("expected collision error, got nil") + } + if !strings.Contains(err.Error(), "fixture collision") { + t.Fatalf("want fixture collision, got: %v", err) + } +} + +// Silence unused-import warnings if client is otherwise unused. +var _ client.Client = (client.Client)(nil) diff --git a/test/e2e/pkg/e2eutils/helmop/helmop.go b/test/e2e/pkg/e2eutils/helmop/helmop.go new file mode 100644 index 00000000..ebc7033e --- /dev/null +++ b/test/e2e/pkg/e2eutils/helmop/helmop.go @@ -0,0 +1,192 @@ +// Package helmop provides thin wrappers around the `helm` CLI for the +// DocumentDB E2E upgrade specs. The upgrade area owns its own operator +// install — it installs a previous-released chart, deploys a DocumentDB, +// then upgrades the chart to the PR's build — so these helpers are +// disruptive by design and must only be used from specs running with +// `ginkgo -procs=1`. +// +// The helpers shell out to the `helm` binary on PATH. Required CLI: +// `helm` v3.13+ (Helm 3 with `upgrade --install`, `--wait`, and +// `--version` behavior used here). No in-process Helm SDK dependency is +// pulled in so the test module footprint stays small. +// +// Typical flow from a spec: +// +// _ = helmop.Uninstall(ctx, "documentdb-operator", "documentdb-operator") +// Expect(helmop.Install(ctx, "documentdb-operator", "documentdb-operator", +// "documentdb/documentdb-operator", "0.1.2", nil)).To(Succeed()) +// Expect(helmop.WaitOperatorReady(ctx, env, "documentdb-operator", +// 2*time.Minute)).To(Succeed()) +// Expect(helmop.Upgrade(ctx, "documentdb-operator", "documentdb-operator", +// "/path/to/pr-chart", "", nil)).To(Succeed()) +package helmop + +import ( + "bytes" + "context" + "fmt" + "os/exec" + "sort" + "time" + + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/cloudnative-pg/cloudnative-pg/tests/utils/environment" + + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/operatorhealth" +) + +// DefaultTimeout bounds every helm invocation. Individual callers may +// pass a context with a tighter deadline. +const DefaultTimeout = 10 * time.Minute + +// run executes the helm CLI with the supplied args. stdout/stderr are +// merged so error messages from helm are surfaced verbatim. +func run(ctx context.Context, args ...string) error { + if _, err := exec.LookPath("helm"); err != nil { + return fmt.Errorf("helm CLI not found on PATH: %w", err) + } + cctx := ctx + if _, hasDeadline := ctx.Deadline(); !hasDeadline { + var cancel context.CancelFunc + cctx, cancel = context.WithTimeout(ctx, DefaultTimeout) + defer cancel() + } + cmd := exec.CommandContext(cctx, "helm", args...) + var out bytes.Buffer + cmd.Stdout = &out + cmd.Stderr = &out + if err := cmd.Run(); err != nil { + return fmt.Errorf("helm %v: %w\n---helm output---\n%s", args, err, out.String()) + } + return nil +} + +// setFlags renders a values map to deterministic `--set key=value` +// arguments. Sorted by key so command lines are reproducible in logs. +func setFlags(values map[string]string) []string { + if len(values) == 0 { + return nil + } + keys := make([]string, 0, len(values)) + for k := range values { + keys = append(keys, k) + } + sort.Strings(keys) + args := make([]string, 0, 2*len(keys)) + for _, k := range keys { + args = append(args, "--set", fmt.Sprintf("%s=%s", k, values[k])) + } + return args +} + +// Install installs the chart at the given release name / namespace. The +// namespace is created if absent. version may be empty to use the +// latest chart version reachable from the repo/URL. values is an +// optional `--set key=value` map. +func Install(ctx context.Context, releaseName, namespace, chart, version string, values map[string]string) error { + if releaseName == "" || namespace == "" || chart == "" { + return fmt.Errorf("helmop.Install: releaseName, namespace and chart are required") + } + args := []string{ + "install", releaseName, chart, + "--namespace", namespace, + "--create-namespace", + "--wait", + } + if version != "" { + args = append(args, "--version", version) + } + args = append(args, setFlags(values)...) + return run(ctx, args...) +} + +// Upgrade upgrades an existing release, or installs it if the release +// is absent (helm upgrade --install semantics). +func Upgrade(ctx context.Context, releaseName, namespace, chart, version string, values map[string]string) error { + if releaseName == "" || namespace == "" || chart == "" { + return fmt.Errorf("helmop.Upgrade: releaseName, namespace and chart are required") + } + args := []string{ + "upgrade", "--install", releaseName, chart, + "--namespace", namespace, + "--create-namespace", + "--wait", + } + if version != "" { + args = append(args, "--version", version) + } + args = append(args, setFlags(values)...) + return run(ctx, args...) +} + +// Uninstall removes a release. A missing release is not an error so +// callers can use Uninstall as an idempotent reset. +func Uninstall(ctx context.Context, releaseName, namespace string) error { + if releaseName == "" || namespace == "" { + return fmt.Errorf("helmop.Uninstall: releaseName and namespace are required") + } + err := run(ctx, "uninstall", releaseName, "--namespace", namespace, "--wait", "--ignore-not-found") + return err +} + +// WaitOperatorReady polls the operator namespace until a pod with the +// operator label is Ready or the timeout expires. It deliberately +// reuses operatorhealth's label selector so callers observe the same +// pod the churn gate watches. +func WaitOperatorReady(ctx context.Context, env *environment.TestingEnvironment, namespace string, timeout time.Duration) error { + if env == nil || env.Client == nil { + return fmt.Errorf("helmop.WaitOperatorReady: nil env/client") + } + if namespace == "" { + return fmt.Errorf("helmop.WaitOperatorReady: namespace required") + } + deadline := time.Now().Add(timeout) + const poll = 3 * time.Second + var lastReason string + for { + ready, reason, err := operatorReadyOnce(ctx, env.Client, namespace) + if err == nil && ready { + return nil + } + if err != nil { + lastReason = err.Error() + } else { + lastReason = reason + } + if time.Now().After(deadline) { + return fmt.Errorf("operator pod in %q not ready after %s: %s", namespace, timeout, lastReason) + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(poll): + } + } +} + +func operatorReadyOnce(ctx context.Context, c client.Client, namespace string) (bool, string, error) { + var pods corev1.PodList + if err := c.List(ctx, &pods, + client.InNamespace(namespace), + client.MatchingLabels{operatorhealth.PodLabelKey: operatorhealth.PodLabelValue}, + ); err != nil { + return false, "", fmt.Errorf("list operator pods: %w", err) + } + if len(pods.Items) == 0 { + return false, "no operator pods yet", nil + } + for i := range pods.Items { + p := &pods.Items[i] + if p.Status.Phase != corev1.PodRunning { + continue + } + for _, cond := range p.Status.Conditions { + if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue { + return true, "", nil + } + } + } + return false, fmt.Sprintf("%d operator pod(s) present but none Ready", len(pods.Items)), nil +} diff --git a/test/e2e/pkg/e2eutils/helmop/helmop_test.go b/test/e2e/pkg/e2eutils/helmop/helmop_test.go new file mode 100644 index 00000000..eb666e5f --- /dev/null +++ b/test/e2e/pkg/e2eutils/helmop/helmop_test.go @@ -0,0 +1,69 @@ +package helmop + +import ( + "context" + "testing" + "time" +) + +func TestSetFlagsDeterministic(t *testing.T) { + t.Parallel() + got := setFlags(map[string]string{"b": "2", "a": "1", "c": "3"}) + want := []string{"--set", "a=1", "--set", "b=2", "--set", "c=3"} + if len(got) != len(want) { + t.Fatalf("setFlags length = %d, want %d (%v)", len(got), len(want), got) + } + for i := range got { + if got[i] != want[i] { + t.Fatalf("setFlags[%d] = %q, want %q", i, got[i], want[i]) + } + } +} + +func TestSetFlagsEmpty(t *testing.T) { + t.Parallel() + if got := setFlags(nil); got != nil { + t.Fatalf("setFlags(nil) = %v, want nil", got) + } + if got := setFlags(map[string]string{}); got != nil { + t.Fatalf("setFlags(empty) = %v, want nil", got) + } +} + +func TestInstallRejectsEmptyArgs(t *testing.T) { + t.Parallel() + cases := []struct{ rel, ns, chart string }{ + {"", "ns", "chart"}, + {"rel", "", "chart"}, + {"rel", "ns", ""}, + } + for _, c := range cases { + if err := Install(context.Background(), c.rel, c.ns, c.chart, "", nil); err == nil { + t.Errorf("Install(%q,%q,%q) = nil, want error", c.rel, c.ns, c.chart) + } + } +} + +func TestUpgradeRejectsEmptyArgs(t *testing.T) { + t.Parallel() + if err := Upgrade(context.Background(), "", "ns", "chart", "", nil); err == nil { + t.Error("Upgrade with empty release = nil, want error") + } +} + +func TestUninstallRejectsEmptyArgs(t *testing.T) { + t.Parallel() + if err := Uninstall(context.Background(), "", "ns"); err == nil { + t.Error("Uninstall with empty release = nil, want error") + } + if err := Uninstall(context.Background(), "rel", ""); err == nil { + t.Error("Uninstall with empty namespace = nil, want error") + } +} + +func TestWaitOperatorReadyNilEnv(t *testing.T) { + t.Parallel() + if err := WaitOperatorReady(context.Background(), nil, "ns", time.Millisecond); err == nil { + t.Error("WaitOperatorReady(nil env) = nil, want error") + } +} diff --git a/test/e2e/pkg/e2eutils/mongo/.keep b/test/e2e/pkg/e2eutils/mongo/.keep new file mode 100644 index 00000000..e69de29b diff --git a/test/e2e/pkg/e2eutils/mongo/client.go b/test/e2e/pkg/e2eutils/mongo/client.go new file mode 100644 index 00000000..ce296ff4 --- /dev/null +++ b/test/e2e/pkg/e2eutils/mongo/client.go @@ -0,0 +1,221 @@ +// Package mongo provides thin helpers for the DocumentDB E2E suite to +// connect to a DocumentDB gateway endpoint using the official +// mongo-driver/v2 client. It is intentionally minimal: URI construction +// with proper credential escaping, connect/ping, seeding, counting, and +// database cleanup. +package mongo + +import ( + "context" + "crypto/tls" + "crypto/x509" + "errors" + "fmt" + "net/url" + "time" + + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" + "go.mongodb.org/mongo-driver/v2/mongo/options" +) + +// DefaultConnectTimeout is applied to mongo.Connect when the caller does +// not provide a deadline on the context. +const DefaultConnectTimeout = 10 * time.Second + +// ClientOptions describes the parameters required to reach a DocumentDB +// gateway. All fields are required except TLSInsecure (ignored when TLS +// is false) and AuthDB (defaults to "admin"). +type ClientOptions struct { + // Host is the DocumentDB gateway hostname or IP. + Host string + // Port is the DocumentDB gateway TCP port. + Port string + // User is the plain (un-escaped) username. + User string + // Password is the plain (un-escaped) password. + Password string + // TLS toggles transport TLS on the connection. + TLS bool + // TLSInsecure skips certificate verification when TLS is true. It is + // only appropriate for tests against self-signed certificates that + // are not trusted via RootCAs. Mutually exclusive in practice with + // RootCAs/CABundlePEM: if both are set, RootCAs wins and + // InsecureSkipVerify is not applied. + TLSInsecure bool + // RootCAs, when non-nil and TLS is true, is used as the trust store + // for server-certificate verification. Takes precedence over + // CABundlePEM if both are set. + RootCAs *x509.CertPool + // CABundlePEM, when non-empty and RootCAs is nil, is parsed into a + // one-off CertPool used as the trust store for server-certificate + // verification. Convenience for callers that already have the PEM + // bytes (e.g., from a kubernetes.io/tls Secret). + CABundlePEM []byte + // ServerName is the expected hostname presented by the server for + // SNI + hostname verification. Defaults to Host when empty. Set + // explicitly when connecting through a port-forward (where Host is + // 127.0.0.1 but the cert is issued for a Service DNS name). + ServerName string + // AuthDB is the authentication database (authSource). Defaults to + // "admin" when empty. + AuthDB string +} + +// BuildURI constructs the mongodb:// URI that NewClient would use. It is +// exported to make credential escaping, TLS flag, and authSource +// behaviour directly unit-testable without spinning up a server. +func BuildURI(opts ClientOptions) (string, error) { + if opts.Host == "" { + return "", errors.New("mongo: Host is required") + } + if opts.Port == "" { + return "", errors.New("mongo: Port is required") + } + if opts.User == "" { + return "", errors.New("mongo: User is required") + } + authDB := opts.AuthDB + if authDB == "" { + authDB = "admin" + } + u := url.QueryEscape(opts.User) + p := url.QueryEscape(opts.Password) + tlsFlag := "false" + if opts.TLS { + tlsFlag = "true" + } + // authSource is a URL query parameter; url.QueryEscape keeps it safe + // for names containing reserved characters. + return fmt.Sprintf( + "mongodb://%s:%s@%s:%s/?tls=%s&authSource=%s", + u, p, opts.Host, opts.Port, tlsFlag, url.QueryEscape(authDB), + ), nil +} + +// NewClient builds a connected *mongo.Client against the endpoint +// described by opts. The caller owns the returned client and is +// responsible for calling Disconnect. +// +// Connect time is bounded by DefaultConnectTimeout via the driver's +// SetConnectTimeout option. mongo-driver/v2 Connect is lazy, so +// callers who need a post-connect round-trip must call Ping (or +// pingWithRetry from connect.go) themselves. +func NewClient(_ context.Context, opts ClientOptions) (*mongo.Client, error) { + uri, err := BuildURI(opts) + if err != nil { + return nil, err + } + co := options.Client().ApplyURI(uri).SetConnectTimeout(DefaultConnectTimeout) + if opts.TLS { + tlsCfg, terr := buildTLSConfig(opts) + if terr != nil { + return nil, terr + } + if tlsCfg != nil { + co.SetTLSConfig(tlsCfg) + } + } + c, err := mongo.Connect(co) + if err != nil { + return nil, fmt.Errorf("mongo: connect: %w", err) + } + return c, nil +} + +// buildTLSConfig assembles a *tls.Config for the driver. Priority: +// +// 1. RootCAs, if non-nil — use as trust store. +// 2. CABundlePEM, if non-empty — parse into a fresh pool. +// 3. TLSInsecure — skip verification entirely. +// +// Returns (nil, nil) when TLS is on but none of the above are set; the +// driver then falls back to the system trust store (default behaviour). +// ServerName is propagated when set so callers can overcome SNI +// mismatch in port-forward scenarios. +func buildTLSConfig(opts ClientOptions) (*tls.Config, error) { + cfg := &tls.Config{MinVersion: tls.VersionTLS12} + if opts.ServerName != "" { + cfg.ServerName = opts.ServerName + } + switch { + case opts.RootCAs != nil: + cfg.RootCAs = opts.RootCAs + return cfg, nil + case len(opts.CABundlePEM) > 0: + pool := x509.NewCertPool() + if !pool.AppendCertsFromPEM(opts.CABundlePEM) { + return nil, errors.New("mongo: CABundlePEM contained no parseable certificates") + } + cfg.RootCAs = pool + return cfg, nil + case opts.TLSInsecure: + cfg.InsecureSkipVerify = true //nolint:gosec // tests only, self-signed gateway + return cfg, nil + } + // TLS on, no CA and not insecure: return a minimal config that + // still honours a user-supplied ServerName but otherwise defers to + // the driver/system trust store. + if cfg.ServerName != "" { + return cfg, nil + } + return nil, nil +} + +// Ping issues a server-selection + hello roundtrip, using the context +// for cancellation/deadline propagation. +func Ping(ctx context.Context, c *mongo.Client) error { + if c == nil { + return errors.New("mongo: nil client") + } + if err := c.Ping(ctx, nil); err != nil { + return fmt.Errorf("mongo: ping: %w", err) + } + return nil +} + +// Seed inserts docs into db.coll via InsertMany and returns the number +// of documents accepted by the server. +func Seed(ctx context.Context, c *mongo.Client, db, coll string, docs []bson.M) (int, error) { + if c == nil { + return 0, errors.New("mongo: nil client") + } + if len(docs) == 0 { + return 0, nil + } + anyDocs := make([]any, len(docs)) + for i := range docs { + anyDocs[i] = docs[i] + } + res, err := c.Database(db).Collection(coll).InsertMany(ctx, anyDocs) + if err != nil { + return 0, fmt.Errorf("mongo: seed %s.%s: %w", db, coll, err) + } + return len(res.InsertedIDs), nil +} + +// Count returns the number of documents in db.coll matching filter. +func Count(ctx context.Context, c *mongo.Client, db, coll string, filter bson.M) (int64, error) { + if c == nil { + return 0, errors.New("mongo: nil client") + } + if filter == nil { + filter = bson.M{} + } + n, err := c.Database(db).Collection(coll).CountDocuments(ctx, filter) + if err != nil { + return 0, fmt.Errorf("mongo: count %s.%s: %w", db, coll, err) + } + return n, nil +} + +// DropDatabase drops the named database. A nil client returns an error. +func DropDatabase(ctx context.Context, c *mongo.Client, db string) error { + if c == nil { + return errors.New("mongo: nil client") + } + if err := c.Database(db).Drop(ctx); err != nil { + return fmt.Errorf("mongo: drop %s: %w", db, err) + } + return nil +} diff --git a/test/e2e/pkg/e2eutils/mongo/client_test.go b/test/e2e/pkg/e2eutils/mongo/client_test.go new file mode 100644 index 00000000..1248e76a --- /dev/null +++ b/test/e2e/pkg/e2eutils/mongo/client_test.go @@ -0,0 +1,207 @@ +package mongo + +import ( + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "math/big" + "strings" + "testing" + "time" +) + +func TestBuildURI_Basic(t *testing.T) { + t.Parallel() + got, err := BuildURI(ClientOptions{ + Host: "gw.example", Port: "10260", User: "alice", Password: "secret", + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + want := "mongodb://alice:secret@gw.example:10260/?tls=false&authSource=admin" + if got != want { + t.Fatalf("uri mismatch:\n got=%s\nwant=%s", got, want) + } +} + +func TestBuildURI_EscapesCreds(t *testing.T) { + t.Parallel() + got, err := BuildURI(ClientOptions{ + Host: "h", Port: "1", User: "a@b", Password: "p@ss:w/rd?&", + }) + if err != nil { + t.Fatalf("err: %v", err) + } + // '@', ':', '/', '?', '&' must all be percent-encoded so the driver + // doesn't mis-parse the URI. + for _, bad := range []string{"a@b:", "@ss:", "w/rd?", "?&@"} { + if strings.Contains(got, bad) { + t.Fatalf("uri must escape %q; got %s", bad, got) + } + } + if !strings.Contains(got, "a%40b") { + t.Fatalf("expected user to contain 'a%%40b'; got %s", got) + } + if !strings.Contains(got, "p%40ss%3Aw%2Frd%3F%26") { + t.Fatalf("expected escaped password; got %s", got) + } +} + +func TestBuildURI_TLSFlag(t *testing.T) { + t.Parallel() + on, _ := BuildURI(ClientOptions{Host: "h", Port: "1", User: "u", Password: "p", TLS: true}) + if !strings.Contains(on, "tls=true") { + t.Fatalf("expected tls=true, got %s", on) + } + off, _ := BuildURI(ClientOptions{Host: "h", Port: "1", User: "u", Password: "p", TLS: false}) + if !strings.Contains(off, "tls=false") { + t.Fatalf("expected tls=false, got %s", off) + } +} + +func TestBuildURI_AuthDBOverride(t *testing.T) { + t.Parallel() + got, _ := BuildURI(ClientOptions{ + Host: "h", Port: "1", User: "u", Password: "p", AuthDB: "mydb", + }) + if !strings.Contains(got, "authSource=mydb") { + t.Fatalf("expected authSource=mydb; got %s", got) + } + def, _ := BuildURI(ClientOptions{Host: "h", Port: "1", User: "u", Password: "p"}) + if !strings.Contains(def, "authSource=admin") { + t.Fatalf("expected default authSource=admin; got %s", def) + } +} + +func TestBuildURI_MissingRequired(t *testing.T) { + t.Parallel() + cases := []ClientOptions{ + {Port: "1", User: "u"}, + {Host: "h", User: "u"}, + {Host: "h", Port: "1"}, + } + for i, c := range cases { + if _, err := BuildURI(c); err == nil { + t.Fatalf("case %d: expected error for incomplete opts %+v", i, c) + } + } +} + +// mintSelfSignedPEM returns a short-lived self-signed cert's PEM bytes. +// Used only to feed buildTLSConfig a PEM it can parse; we never need to +// serve TLS from it. +func mintSelfSignedPEM(t *testing.T) []byte { + t.Helper() + key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + t.Fatalf("generate key: %v", err) + } + tmpl := &x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{CommonName: "test"}, + NotBefore: time.Now().Add(-time.Minute), + NotAfter: time.Now().Add(time.Hour), + KeyUsage: x509.KeyUsageCertSign | x509.KeyUsageDigitalSignature, + IsCA: true, + BasicConstraintsValid: true, + } + der, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, &key.PublicKey, key) + if err != nil { + t.Fatalf("create cert: %v", err) + } + return pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der}) +} + +func TestBuildTLSConfig_RootCAsTakesPriority(t *testing.T) { + t.Parallel() + pool := x509.NewCertPool() + cfg, err := buildTLSConfig(ClientOptions{ + TLS: true, + RootCAs: pool, + CABundlePEM: []byte("ignored"), + TLSInsecure: true, + ServerName: "localhost", + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if cfg == nil { + t.Fatal("expected non-nil config") + } + if cfg.RootCAs != pool { + t.Fatal("RootCAs must be the supplied pool, not a parsed bundle") + } + if cfg.InsecureSkipVerify { + t.Fatal("InsecureSkipVerify must not be set when RootCAs is supplied") + } + if cfg.ServerName != "localhost" { + t.Fatalf("ServerName = %q, want localhost", cfg.ServerName) + } + if cfg.MinVersion != 0x0303 { // TLS 1.2 + t.Fatalf("MinVersion = %x, want TLS 1.2", cfg.MinVersion) + } +} + +func TestBuildTLSConfig_CABundlePEMParsed(t *testing.T) { + t.Parallel() + pemBytes := mintSelfSignedPEM(t) + cfg, err := buildTLSConfig(ClientOptions{ + TLS: true, + CABundlePEM: pemBytes, + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if cfg == nil || cfg.RootCAs == nil { + t.Fatal("expected RootCAs parsed from PEM") + } +} + +func TestBuildTLSConfig_CABundlePEMInvalid(t *testing.T) { + t.Parallel() + if _, err := buildTLSConfig(ClientOptions{ + TLS: true, + CABundlePEM: []byte("not a real pem"), + }); err == nil { + t.Fatal("expected error for unparseable CABundlePEM") + } +} + +func TestBuildTLSConfig_Insecure(t *testing.T) { + t.Parallel() + cfg, err := buildTLSConfig(ClientOptions{TLS: true, TLSInsecure: true}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if cfg == nil || !cfg.InsecureSkipVerify { + t.Fatal("expected InsecureSkipVerify=true") + } +} + +func TestBuildTLSConfig_NilWhenNoHintsAndNoServerName(t *testing.T) { + t.Parallel() + cfg, err := buildTLSConfig(ClientOptions{TLS: true}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if cfg != nil { + t.Fatalf("expected nil config when no CA/insecure/ServerName supplied, got %+v", cfg) + } +} + +func TestBuildTLSConfig_ServerNameOnlyReturnsConfig(t *testing.T) { + t.Parallel() + cfg, err := buildTLSConfig(ClientOptions{TLS: true, ServerName: "gw.example"}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if cfg == nil || cfg.ServerName != "gw.example" { + t.Fatalf("expected ServerName preserved, got %+v", cfg) + } + if cfg.RootCAs != nil || cfg.InsecureSkipVerify { + t.Fatal("ServerName-only config must not set RootCAs or InsecureSkipVerify") + } +} diff --git a/test/e2e/pkg/e2eutils/mongo/connect.go b/test/e2e/pkg/e2eutils/mongo/connect.go new file mode 100644 index 00000000..0c7a7189 --- /dev/null +++ b/test/e2e/pkg/e2eutils/mongo/connect.go @@ -0,0 +1,268 @@ +// Package mongo — connect.go provides a high-level helper that opens a +// port-forward to a DocumentDB gateway Service, reads credentials from +// the standard "documentdb-credentials" secret in the CR's namespace, +// and returns a connected mongo-driver client wrapped in a [Handle] +// that also owns the port-forward lifetime. +// +// This helper intentionally lives outside pkg/e2eutils/fixtures to +// avoid an import cycle: fixtures creates the CR + secret; mongo is +// the pure data-plane helper callers reach for in `It` blocks. +package mongo + +import ( + "context" + "crypto/x509" + "errors" + "fmt" + "net" + "strconv" + "time" + + "github.com/cloudnative-pg/cloudnative-pg/tests/utils/environment" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + driver "go.mongodb.org/mongo-driver/v2/mongo" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/portforward" +) + +// DefaultCredentialSecretName is the secret name the shared fixtures +// create to hold gateway credentials. Kept in sync with +// fixtures.DefaultCredentialSecretName; duplicated here to avoid a +// circular import. +const DefaultCredentialSecretName = "documentdb-credentials" + +// Handle owns a live mongo-driver client plus the port-forward that +// backs it. Callers must invoke Close when done; failing to do so +// leaks a local port-forward goroutine. +type Handle struct { + client *driver.Client + stop func() error +} + +// Client returns the underlying mongo-driver client. Prefer Database +// for per-spec isolation. +func (h *Handle) Client() *driver.Client { return h.client } + +// Database is a pass-through to the underlying driver client. +func (h *Handle) Database(name string) *driver.Database { + return h.client.Database(name) +} + +// Close disconnects the mongo client and tears down the port-forward. +// Safe to call on a nil handle. Returns the first non-nil error +// observed across (Disconnect, port-forward shutdown). +func (h *Handle) Close(ctx context.Context) error { + if h == nil { + return nil + } + var derr error + if h.client != nil { + derr = h.client.Disconnect(ctx) + } + var serr error + if h.stop != nil { + serr = h.stop() + } + if derr != nil { + return derr + } + return serr +} + +// connectRetryTimeout bounds the post-port-forward ping/retry loop +// because forwardconnection's goroutine takes a brief moment to bind +// the chosen local port. 10s @ 100ms backoff absorbs ~100 attempts, +// which is plenty for the typical <1s bind delay without stretching +// the happy path. +const ( + connectRetryTimeout = 10 * time.Second + connectRetryBackoff = 100 * time.Millisecond +) + +// ConnectOption customises NewFromDocumentDB. Options are composable +// and apply in the order supplied; later options overwrite earlier +// ones for the same field. +type ConnectOption func(*connectConfig) + +type connectConfig struct { + rootCAs *x509.CertPool + caBundlePEM []byte + serverName string + tlsInsecure bool +} + +// WithRootCAs pins the trust store used for server-certificate +// verification to the given pool. Prefer this over WithCABundlePEM +// when you already have a *x509.CertPool assembled. +func WithRootCAs(pool *x509.CertPool) ConnectOption { + return func(c *connectConfig) { c.rootCAs = pool; c.tlsInsecure = false } +} + +// WithCABundlePEM pins the trust store to a CA bundle parsed from PEM +// bytes. Convenient for callers reading ca.crt out of a Secret. +func WithCABundlePEM(pem []byte) ConnectOption { + return func(c *connectConfig) { c.caBundlePEM = pem; c.tlsInsecure = false } +} + +// WithServerName overrides the TLS SNI + hostname-verification target. +// Use when connecting through a port-forward where Host is 127.0.0.1 +// but the server certificate was issued for a Service DNS name. +func WithServerName(name string) ConnectOption { + return func(c *connectConfig) { c.serverName = name } +} + +// WithTLSInsecure turns off server-certificate verification. It is the +// default when no ConnectOption is supplied, preserving legacy +// behaviour; callers that want CA verification must pass WithRootCAs +// or WithCABundlePEM explicitly. +func WithTLSInsecure() ConnectOption { + return func(c *connectConfig) { + c.tlsInsecure = true + c.rootCAs = nil + c.caBundlePEM = nil + } +} + +// NewFromDocumentDB builds a connected Handle against the DocumentDB CR +// identified by (namespace, name). It: +// +// 1. Reads the CR and the "documentdb-credentials" secret from the +// same namespace. +// 2. Picks a free local TCP port. +// 3. Opens a port-forward to the gateway Service via the portforward +// helper (using OpenWithErr so teardown surfaces forwarder errors). +// 4. Connects the mongo-driver client with TLS; verification mode is +// controlled by opts (default: InsecureSkipVerify for backwards +// compatibility with the historical gateway self-signed cert). +// 5. Pings with retry until the port-forward is reachable or +// connectRetryTimeout elapses. +func NewFromDocumentDB( + ctx context.Context, + env *environment.TestingEnvironment, + namespace, name string, + opts ...ConnectOption, +) (*Handle, error) { + if env == nil || env.Client == nil { + return nil, errors.New("mongo: NewFromDocumentDB requires a non-nil TestingEnvironment") + } + + cfg := connectConfig{tlsInsecure: true} + for _, o := range opts { + o(&cfg) + } + + dd := &previewv1.DocumentDB{} + if err := env.Client.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, dd); err != nil { + return nil, fmt.Errorf("get DocumentDB %s/%s: %w", namespace, name, err) + } + + user, pass, err := readCredentialSecret(ctx, env, namespace) + if err != nil { + return nil, err + } + + lp, err := pickFreePort() + if err != nil { + return nil, fmt.Errorf("mongo: pick free port: %w", err) + } + + stop, err := portforward.OpenWithErr(ctx, env, dd, lp) + if err != nil { + return nil, fmt.Errorf("mongo: open port-forward: %w", err) + } + + c, err := NewClient(ctx, ClientOptions{ + Host: "127.0.0.1", + Port: strconv.Itoa(lp), + User: user, + Password: pass, + TLS: true, + TLSInsecure: cfg.tlsInsecure, + RootCAs: cfg.rootCAs, + CABundlePEM: cfg.caBundlePEM, + ServerName: cfg.serverName, + AuthDB: "admin", + }) + if err != nil { + _ = stop() + return nil, fmt.Errorf("mongo: connect: %w", err) + } + + // pingWithRetry owns the post-port-forward connection-refused + // window. No pre-ping sleep is needed: the retry loop at + // connectRetryBackoff cadence covers the forwarder bind delay. + if err := pingWithRetry(ctx, c, connectRetryTimeout); err != nil { + _ = c.Disconnect(ctx) + _ = stop() + return nil, fmt.Errorf("mongo: post-connect ping: %w", err) + } + + return &Handle{client: c, stop: stop}, nil +} + +// readCredentialSecret fetches username/password from the fixture +// credential secret. The secret is expected to have keys "username" +// and "password". +func readCredentialSecret( + ctx context.Context, + env *environment.TestingEnvironment, + namespace string, +) (string, string, error) { + sec := &corev1.Secret{} + err := env.Client.Get(ctx, types.NamespacedName{ + Namespace: namespace, Name: DefaultCredentialSecretName, + }, sec) + if err != nil { + return "", "", fmt.Errorf("get credential secret %s/%s: %w", + namespace, DefaultCredentialSecretName, err) + } + u := string(sec.Data["username"]) + p := string(sec.Data["password"]) + if u == "" || p == "" { + return "", "", fmt.Errorf("credential secret %s/%s missing username/password", + namespace, DefaultCredentialSecretName) + } + return u, p, nil +} + +// pickFreePort asks the kernel for an unused TCP port by binding ":0" +// and immediately closing the listener. There is a narrow race window +// between Close and the port-forward goroutine binding the same port; +// pingWithRetry absorbs that window without a fixed pre-ping sleep. +func pickFreePort() (int, error) { + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + return 0, err + } + defer func() { _ = l.Close() }() + return l.Addr().(*net.TCPAddr).Port, nil +} + +// pingWithRetry polls Ping until it succeeds or timeout elapses. The +// port-forward goroutine needs a moment to bind the local port, so the +// first few pings may fail with "connection refused". Short backoff +// (connectRetryBackoff) keeps the happy path fast while still covering +// slow CI nodes via the overall timeout budget. +func pingWithRetry(ctx context.Context, c *driver.Client, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + var last error + for { + pingCtx, cancel := context.WithTimeout(ctx, 3*time.Second) + err := c.Ping(pingCtx, nil) + cancel() + if err == nil { + return nil + } + last = err + if time.Now().After(deadline) { + return fmt.Errorf("ping did not succeed within %s: %w", timeout, last) + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(connectRetryBackoff): + } + } +} diff --git a/test/e2e/pkg/e2eutils/namespaces/namespaces.go b/test/e2e/pkg/e2eutils/namespaces/namespaces.go new file mode 100644 index 00000000..4ace28e7 --- /dev/null +++ b/test/e2e/pkg/e2eutils/namespaces/namespaces.go @@ -0,0 +1,127 @@ +// Package namespaces produces deterministic per-spec Kubernetes +// namespace names for DocumentDB e2e tests. The canonical entry point +// is [NamespaceForSpec], which a spec calls from inside a BeforeEach to +// obtain a name unique to the current spec, parallel process, and run. +// +// The returned names are DNS-1123-compliant (lowercase, ≤63 chars) and +// stable: calling NamespaceForSpec twice from within the same spec +// produces the same name, which is what retry / recovery logic needs. +package namespaces + +import ( + "crypto/sha256" + "encoding/hex" + "fmt" + "os" + "strings" + + "github.com/onsi/ginkgo/v2" +) + +// maxNameLen bounds the returned namespace name; Kubernetes rejects +// names longer than 63 characters for DNS-1123 labels. +const maxNameLen = 63 + +// runIDFunc is a package-level indirection so unit tests can inject a +// deterministic run id without plumbing the root e2e package (which +// would introduce an import cycle). +var runIDFunc = defaultRunID + +// SetRunIDFunc overrides the run-id accessor. The root suite wires it +// during SetupSuite so NamespaceForSpec returns names that match the +// fixtures/teardown label selectors. Tests call this to inject a +// deterministic id. +func SetRunIDFunc(f func() string) { + if f != nil { + runIDFunc = f + } +} + +func defaultRunID() string { + if v := os.Getenv("E2E_RUN_ID"); v != "" { + return sanitizeSegment(v) + } + return "unset" +} + +// NamespaceForSpec returns a deterministic namespace name for the +// currently-running Ginkgo spec. The name embeds the sanitized area +// label, the run id, the parallel process number, and an 8-character +// SHA-256 prefix derived from the spec's FullText. Collisions across +// specs are avoided by the hash; determinism within a spec is provided +// by the hash being a pure function of the FullText. +// +// If area is empty, "spec" is used. Callers should pass the area +// label constant (e.g., e2e.LifecycleLabel) to make failures easier to +// triage from kubectl output. +func NamespaceForSpec(area string) string { + return buildName(area, ginkgo.CurrentSpecReport().FullText(), procID()) +} + +// procID returns the ginkgo parallel process id, defaulting to "1" +// when unset. Duplicated here (instead of shared with fixtures) to +// avoid a dependency cycle with the fixtures package. +func procID() string { + if v := os.Getenv("GINKGO_PARALLEL_PROCESS"); v != "" { + return v + } + return "1" +} + +// buildName is the pure core of NamespaceForSpec, factored out to make +// it trivially unit-testable without a Ginkgo runtime. +func buildName(area, specText, proc string) string { + areaPart := sanitizeSegment(area) + if areaPart == "" { + areaPart = "spec" + } + sum := sha256.Sum256([]byte(specText)) + hash := hex.EncodeToString(sum[:])[:8] + runID := sanitizeSegment(runIDFunc()) + if runID == "" { + runID = "unset" + } + name := fmt.Sprintf("e2e-%s-%s-p%s-%s", areaPart, runID, proc, hash) + if len(name) <= maxNameLen { + return name + } + // Truncate areaPart first, then runID, preserving the trailing + // hash (which is what guarantees uniqueness). + suffix := fmt.Sprintf("-p%s-%s", proc, hash) + budget := maxNameLen - len("e2e-") - len(suffix) - 1 // -1 for the dash between area and runID + if budget < 2 { + // Degenerate input; fall back to hash-only. + return ("e2e-" + hash + suffix)[:maxNameLen] + } + areaBudget := budget / 2 + runBudget := budget - areaBudget + if len(areaPart) > areaBudget { + areaPart = areaPart[:areaBudget] + } + if len(runID) > runBudget { + runID = runID[:runBudget] + } + return fmt.Sprintf("e2e-%s-%s%s", strings.Trim(areaPart, "-"), strings.Trim(runID, "-"), suffix) +} + +// sanitizeSegment converts arbitrary input into DNS-1123-safe runs of +// [a-z0-9-], collapsing and trimming separators. +func sanitizeSegment(in string) string { + in = strings.ToLower(in) + var b strings.Builder + b.Grow(len(in)) + lastDash := false + for _, r := range in { + switch { + case r >= 'a' && r <= 'z', r >= '0' && r <= '9': + b.WriteRune(r) + lastDash = false + default: + if !lastDash { + b.WriteByte('-') + lastDash = true + } + } + } + return strings.Trim(b.String(), "-") +} diff --git a/test/e2e/pkg/e2eutils/namespaces/namespaces_test.go b/test/e2e/pkg/e2eutils/namespaces/namespaces_test.go new file mode 100644 index 00000000..1323a63f --- /dev/null +++ b/test/e2e/pkg/e2eutils/namespaces/namespaces_test.go @@ -0,0 +1,76 @@ +package namespaces + +import ( + "regexp" + "strings" + "testing" +) + +// dns1123Label matches the Kubernetes DNS-1123 label regex. +var dns1123Label = regexp.MustCompile(`^[a-z0-9]([-a-z0-9]*[a-z0-9])?$`) + +func TestBuildNameDeterministic(t *testing.T) { + SetRunIDFunc(func() string { return "run1" }) + a := buildName("lifecycle", "lifecycle creates a cluster", "1") + b := buildName("lifecycle", "lifecycle creates a cluster", "1") + if a != b { + t.Fatalf("non-deterministic: %q vs %q", a, b) + } + if !strings.HasPrefix(a, "e2e-lifecycle-run1-p1-") { + t.Fatalf("unexpected prefix: %q", a) + } +} + +func TestBuildNameUniquePerSpec(t *testing.T) { + SetRunIDFunc(func() string { return "run1" }) + a := buildName("scale", "scale up to 3", "1") + b := buildName("scale", "scale up to 4", "1") + if a == b { + t.Fatalf("distinct specs produced same name: %q", a) + } +} + +func TestBuildNameUniquePerProc(t *testing.T) { + SetRunIDFunc(func() string { return "run1" }) + a := buildName("data", "spec x", "1") + b := buildName("data", "spec x", "2") + if a == b { + t.Fatalf("distinct procs produced same name: %q", a) + } +} + +func TestBuildNameLengthAndDNS(t *testing.T) { + SetRunIDFunc(func() string { return strings.Repeat("x", 80) }) + longArea := strings.Repeat("area", 20) + name := buildName(longArea, "some-spec-text", "1") + if len(name) > maxNameLen { + t.Fatalf("name too long (%d): %q", len(name), name) + } + if !dns1123Label.MatchString(name) { + t.Fatalf("name not DNS-1123: %q", name) + } +} + +func TestBuildNameEmptyArea(t *testing.T) { + SetRunIDFunc(func() string { return "r" }) + name := buildName("", "spec", "1") + if !strings.HasPrefix(name, "e2e-spec-") { + t.Fatalf("empty area did not default to 'spec': %q", name) + } +} + +func TestSanitizeSegment(t *testing.T) { + cases := map[string]string{ + "Hello World": "hello-world", + "lifecycle": "lifecycle", + "a/b c": "a-b-c", + "---leading": "leading", + "": "", + "UPPER-123": "upper-123", + } + for in, want := range cases { + if got := sanitizeSegment(in); got != want { + t.Errorf("sanitizeSegment(%q) = %q, want %q", in, got, want) + } + } +} diff --git a/test/e2e/pkg/e2eutils/operatorhealth/.keep b/test/e2e/pkg/e2eutils/operatorhealth/.keep new file mode 100644 index 00000000..e69de29b diff --git a/test/e2e/pkg/e2eutils/operatorhealth/gate.go b/test/e2e/pkg/e2eutils/operatorhealth/gate.go new file mode 100644 index 00000000..0257f8ca --- /dev/null +++ b/test/e2e/pkg/e2eutils/operatorhealth/gate.go @@ -0,0 +1,213 @@ +// Package operatorhealth exposes a "churn gate" for the DocumentDB E2E +// suite: a lightweight equivalent of CNPG's tests/utils/operator +// PodRestarted / PodRenamed semantics, plus a sentinel that lets +// non-disruptive specs skip themselves after a prior spec has bounced +// the operator. +// +// Typical use from a suite-level BeforeEach/AfterEach: +// +// var gate *operatorhealth.Gate +// +// BeforeSuite(func() { +// var err error +// gate, err = operatorhealth.NewGate(ctx, env.Client, operatorhealth.DefaultNamespace) +// Expect(err).NotTo(HaveOccurred()) +// }) +// +// BeforeEach(operatorhealth.BeforeEachHook(gate)) +// AfterEach(operatorhealth.AfterEachHook(gate)) +// +// Disruptive specs that intentionally bounce the operator should mark +// the sentinel themselves via MarkChurned() so the AfterEach hook can +// keep its idempotent semantics. +package operatorhealth + +import ( + "context" + "errors" + "fmt" + "sync/atomic" + + . "github.com/onsi/ginkgo/v2" //nolint:revive // Ginkgo DSL is intentional. + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// DefaultNamespace is where the Helm chart installs the DocumentDB +// operator. +const DefaultNamespace = "documentdb-operator" + +// PodLabelSelector is the label the operator Deployment stamps on its +// Pod spec (verified from a live kind cluster: `app=documentdb-operator`). +// If the chart changes the selector, update this constant. +const ( + PodLabelKey = "app" + PodLabelValue = "documentdb-operator" +) + +// operatorChurned is a process-wide sentinel that records whether the +// operator pod has been observed to restart/rename. Once set, it stays +// set for the remainder of the process (the gate is advisory, not a +// correctness gate). +var operatorChurned atomic.Bool + +// Gate snapshots the identity and restart count of the operator pod so +// later Check calls can decide whether the operator churned underneath +// us. +type Gate struct { + c client.Client + ns string + initialUID types.UID + initialRestarts int32 + initialPodName string +} + +// NewGate discovers the current operator pod in ns and captures its +// identity. If no pod is found the caller can decide whether that's a +// fatal condition (typical for non-disruptive suites) or tolerable. +func NewGate(ctx context.Context, c client.Client, ns string) (*Gate, error) { + if c == nil { + return nil, errors.New("NewGate: client must not be nil") + } + if ns == "" { + ns = DefaultNamespace + } + pod, err := findOperatorPod(ctx, c, ns) + if err != nil { + return nil, err + } + return &Gate{ + c: c, + ns: ns, + initialUID: pod.UID, + initialPodName: pod.Name, + initialRestarts: totalRestarts(pod), + }, nil +} + +// Check re-reads the operator pod and reports whether it is still the +// same instance with the same restart count. A drift in UID, name, or +// restart count returns healthy=false with a short reason suitable for +// logging. +func (g *Gate) Check(ctx context.Context) (healthy bool, reason string, err error) { + if g == nil { + return false, "gate is nil", errors.New("Check: gate is nil") + } + pod, err := findOperatorPod(ctx, g.c, g.ns) + if err != nil { + return false, err.Error(), err + } + switch { + case pod.UID != g.initialUID: + return false, fmt.Sprintf("operator pod UID changed: %s -> %s", g.initialUID, pod.UID), nil + case pod.Name != g.initialPodName: + return false, fmt.Sprintf("operator pod renamed: %s -> %s", g.initialPodName, pod.Name), nil + case totalRestarts(pod) != g.initialRestarts: + return false, fmt.Sprintf("operator pod restart count changed: %d -> %d", + g.initialRestarts, totalRestarts(pod)), nil + } + return true, "", nil +} + +// Verify is a convenience wrapper over [Gate.Check] returning nil when +// the operator pod matches the snapshot captured by [NewGate] and an +// error (wrapping the observed reason) otherwise. It also flips the +// process-wide churn sentinel so subsequent calls to [SkipIfChurned] +// observe the drift. +// +// Typical use from an area's BeforeEach: +// +// BeforeEach(func() { Expect(gate.Verify(ctx)).To(Succeed()) }) +func (g *Gate) Verify(ctx context.Context) error { + if g == nil { + return errors.New("Verify: gate is nil") + } + healthy, reason, err := g.Check(ctx) + if err != nil { + MarkChurned() + return fmt.Errorf("operator health check failed: %w", err) + } + if !healthy { + MarkChurned() + return fmt.Errorf("operator churn detected: %s", reason) + } + return nil +} + +// MarkChurned sets the process-wide sentinel, causing SkipIfChurned to +// skip subsequent non-disruptive specs. Disruptive specs that know they +// bounced the operator should call this in their AfterEach. +func MarkChurned() { operatorChurned.Store(true) } + +// HasChurned reports the current sentinel state. +func HasChurned() bool { return operatorChurned.Load() } + +// SkipIfChurned calls Ginkgo's Skip if a prior spec (or an explicit +// MarkChurned call) has observed operator churn. Intended for use from +// BeforeEach of non-disruptive area suites. +func SkipIfChurned() { + if HasChurned() { + Skip("operator churned in a previous spec; skipping non-disruptive spec") + } +} + +// BeforeEachHook returns a Ginkgo BeforeEach body that calls +// SkipIfChurned. If gate is nil the hook still honors the sentinel so +// disruptive specs can flip it without a live Gate. +func BeforeEachHook(gate *Gate) func() { + _ = gate // reserved: future versions may refresh gate snapshot here + return func() { SkipIfChurned() } +} + +// AfterEachHook returns a Ginkgo AfterEach body that re-checks the +// operator pod and flips the sentinel if churn is detected. A nil gate +// disables the check. +func AfterEachHook(gate *Gate) func() { + return func() { + if gate == nil { + return + } + healthy, reason, err := gate.Check(context.Background()) + if err != nil || !healthy { + if reason == "" && err != nil { + reason = err.Error() + } + GinkgoWriter.Printf("operatorhealth: marking churned: %s\n", reason) + MarkChurned() + } + } +} + +// findOperatorPod looks up the first operator pod matching +// PodLabelKey=PodLabelValue in ns. Returns a NotFound error if none +// exist. +func findOperatorPod(ctx context.Context, c client.Client, ns string) (*corev1.Pod, error) { + var pods corev1.PodList + if err := c.List(ctx, &pods, + client.InNamespace(ns), + client.MatchingLabels{PodLabelKey: PodLabelValue}, + ); err != nil { + return nil, fmt.Errorf("listing operator pods in %q: %w", ns, err) + } + if len(pods.Items) == 0 { + return nil, apierrors.NewNotFound(corev1.Resource("pods"), + fmt.Sprintf("%s=%s in %s", PodLabelKey, PodLabelValue, ns)) + } + return &pods.Items[0], nil +} + +// totalRestarts sums RestartCount across all container statuses on pod. +// Matches CNPG's PodRestarted semantics. +func totalRestarts(pod *corev1.Pod) int32 { + if pod == nil { + return 0 + } + var total int32 + for _, cs := range pod.Status.ContainerStatuses { + total += cs.RestartCount + } + return total +} diff --git a/test/e2e/pkg/e2eutils/operatorhealth/gate_test.go b/test/e2e/pkg/e2eutils/operatorhealth/gate_test.go new file mode 100644 index 00000000..36321405 --- /dev/null +++ b/test/e2e/pkg/e2eutils/operatorhealth/gate_test.go @@ -0,0 +1,159 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package operatorhealth + +import ( + "context" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func newScheme(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + if err := clientgoscheme.AddToScheme(s); err != nil { + t.Fatalf("scheme: %v", err) + } + return s +} + +func newPod(uid, name string, restarts int32) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: DefaultNamespace, + Labels: map[string]string{PodLabelKey: PodLabelValue}, + UID: types.UID(uid), + }, + Status: corev1.PodStatus{ + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "manager", RestartCount: restarts}, + }, + }, + } +} + +func TestNewGateCapturesInitialState(t *testing.T) { + // Reset sentinel between tests. + operatorChurned.Store(false) + + s := newScheme(t) + pod := newPod("uid-1", "documentdb-operator-abc", 0) + c := fakeclient.NewClientBuilder().WithScheme(s).WithObjects(pod).Build() + + g, err := NewGate(context.Background(), c, DefaultNamespace) + if err != nil { + t.Fatalf("NewGate: %v", err) + } + if g.initialUID != "uid-1" || g.initialRestarts != 0 || g.initialPodName != pod.Name { + t.Errorf("unexpected captured state: %+v", g) + } +} + +func TestCheckHealthyWhenUnchanged(t *testing.T) { + operatorChurned.Store(false) + + s := newScheme(t) + pod := newPod("uid-1", "p1", 0) + c := fakeclient.NewClientBuilder().WithScheme(s).WithObjects(pod).Build() + + g, err := NewGate(context.Background(), c, DefaultNamespace) + if err != nil { + t.Fatal(err) + } + healthy, reason, err := g.Check(context.Background()) + if err != nil { + t.Fatalf("Check: %v", err) + } + if !healthy { + t.Errorf("expected healthy, got reason=%q", reason) + } +} + +func TestCheckDetectsRestart(t *testing.T) { + operatorChurned.Store(false) + + s := newScheme(t) + pod := newPod("uid-1", "p1", 0) + c := fakeclient.NewClientBuilder().WithScheme(s).WithObjects(pod).Build() + + g, err := NewGate(context.Background(), c, DefaultNamespace) + if err != nil { + t.Fatal(err) + } + // Bump restart count. + pod.Status.ContainerStatuses[0].RestartCount = 2 + if err := c.Status().Update(context.Background(), pod); err != nil { + t.Fatalf("update pod: %v", err) + } + + healthy, reason, err := g.Check(context.Background()) + if err != nil { + t.Fatalf("Check: %v", err) + } + if healthy { + t.Error("expected unhealthy after restart count bump") + } + if reason == "" { + t.Error("expected non-empty reason") + } +} + +func TestCheckDetectsPodReplacement(t *testing.T) { + operatorChurned.Store(false) + + s := newScheme(t) + pod := newPod("uid-1", "p1", 0) + c := fakeclient.NewClientBuilder().WithScheme(s).WithObjects(pod).Build() + + g, err := NewGate(context.Background(), c, DefaultNamespace) + if err != nil { + t.Fatal(err) + } + + // Replace the pod with a new UID/name. + if err := c.Delete(context.Background(), pod); err != nil { + t.Fatalf("delete pod: %v", err) + } + replacement := newPod("uid-2", "p2", 0) + if err := c.Create(context.Background(), replacement); err != nil { + t.Fatalf("create replacement: %v", err) + } + + healthy, _, err := g.Check(context.Background()) + if err != nil { + t.Fatalf("Check: %v", err) + } + if healthy { + t.Error("expected unhealthy after pod replacement") + } +} + +func TestSentinelMarkAndHas(t *testing.T) { + operatorChurned.Store(false) + if HasChurned() { + t.Fatal("expected sentinel clear") + } + MarkChurned() + if !HasChurned() { + t.Fatal("expected sentinel set") + } + // Reset for other tests. + operatorChurned.Store(false) +} + +func TestNewGateNoPods(t *testing.T) { + operatorChurned.Store(false) + s := newScheme(t) + c := fakeclient.NewClientBuilder().WithScheme(s).Build() + if _, err := NewGate(context.Background(), c, DefaultNamespace); err == nil { + t.Fatal("expected error when no pods match") + } +} diff --git a/test/e2e/pkg/e2eutils/portforward/.keep b/test/e2e/pkg/e2eutils/portforward/.keep new file mode 100644 index 00000000..e69de29b diff --git a/test/e2e/pkg/e2eutils/portforward/portforward.go b/test/e2e/pkg/e2eutils/portforward/portforward.go new file mode 100644 index 00000000..f0323dfa --- /dev/null +++ b/test/e2e/pkg/e2eutils/portforward/portforward.go @@ -0,0 +1,151 @@ +// Package portforward is a thin wrapper around CNPG's +// tests/utils/forwardconnection helper, specialised for the DocumentDB +// gateway service. +// +// The DocumentDB operator creates a Service named +// "documentdb-service-" in the same namespace as the CR, with +// a port named "gateway" targeting the gateway sidecar (default port +// 10260). This package opens a local port-forward to that service and +// returns a stop func the caller defers. +// +// Fallback note +// +// forwardconnection.NewDialerFromService is generic over service name +// and does NOT hardcode Postgres, despite the package's origin in the +// CNPG codebase. We therefore use the CNPG helper directly rather than +// reaching for client-go's portforward.PortForwarder. If a future CNPG +// release tightens the helper to Postgres-only semantics, this file is +// the single place to swap in a client-go implementation. +package portforward + +import ( + "context" + "errors" + "fmt" + "io" + + "github.com/cloudnative-pg/cloudnative-pg/tests/utils/environment" + "github.com/cloudnative-pg/cloudnative-pg/tests/utils/forwardconnection" + "github.com/onsi/ginkgo/v2" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" +) + +// GatewayPort is the default DocumentDB gateway TCP port inside the +// cluster. Mirrored from operator/src/internal/utils/constants.go so +// the E2E module does not depend on the operator's internal packages. +const GatewayPort = 10260 + +// ServiceNamePrefix mirrors DOCUMENTDB_SERVICE_PREFIX from the operator. +// The fully-qualified service name is ServiceNamePrefix + dd.Name, +// truncated to 63 characters to honour the Kubernetes DNS limit. +const ServiceNamePrefix = "documentdb-service-" + +// GatewayServiceName returns the Service name the operator creates for +// the given DocumentDB CR. +func GatewayServiceName(dd *previewv1.DocumentDB) string { + if dd == nil { + return "" + } + name := ServiceNamePrefix + dd.Name + if len(name) > 63 { + name = name[:63] + } + return name +} + +// OpenWithErr establishes a port-forward from localPort on the caller's +// host to the DocumentDB gateway service backing dd. It returns a stop +// func that halts the forward and returns the final error reported by +// the forwarder goroutine (nil on clean shutdown). Callers MUST invoke +// stop exactly once; double-invocation is safe but only the first call +// returns the real error. +// +// If localPort is 0, a free port is picked by the kernel. +// +// Prefer OpenWithErr over Open for new call sites: exposing the +// forwarder error lets specs surface gateway-level disconnects instead +// of silently dropping them. +func OpenWithErr( + ctx context.Context, + env *environment.TestingEnvironment, + dd *previewv1.DocumentDB, + localPort int, +) (stop func() error, err error) { + if env == nil { + return nil, fmt.Errorf("OpenWithErr: env must not be nil") + } + if dd == nil { + return nil, fmt.Errorf("OpenWithErr: dd must not be nil") + } + svcName := GatewayServiceName(dd) + if svcName == "" { + return nil, fmt.Errorf("OpenWithErr: could not derive gateway service name from %+v", dd) + } + + dialer, _, err := forwardconnection.NewDialerFromService( + ctx, + env.Interface, + env.RestClientConfig, + dd.Namespace, + svcName, + ) + if err != nil { + return nil, fmt.Errorf("building dialer for %s/%s: %w", dd.Namespace, svcName, err) + } + + portMaps := []string{fmt.Sprintf("%d:%d", localPort, GatewayPort)} + fc, err := forwardconnection.NewForwardConnection(dialer, portMaps, io.Discard, io.Discard) + if err != nil { + return nil, fmt.Errorf("creating forward connection: %w", err) + } + + fwdCtx, cancel := context.WithCancel(ctx) + errCh := make(chan error, 1) + go func() { errCh <- fc.StartAndWait(fwdCtx) }() + + var stopped bool + stop = func() error { + if stopped { + return nil + } + stopped = true + cancel() + // Drain the goroutine so callers see deterministic teardown. + // context.Canceled is the expected shutdown signal and is + // swallowed; everything else is surfaced. + e := <-errCh + if e != nil && !errors.Is(e, context.Canceled) { + return e + } + return nil + } + return stop, nil +} + +// Open is the backwards-compatible wrapper around OpenWithErr that +// returns a stop func() (no error). Any non-nil forwarder error +// observed at teardown is logged to GinkgoWriter so test failures are +// still traceable. +// +// New callers should prefer OpenWithErr; Open remains for pre-existing +// callers that cannot easily propagate the error (e.g., helpers that +// plug into DeferCleanup with a no-return func). +func Open( + ctx context.Context, + env *environment.TestingEnvironment, + dd *previewv1.DocumentDB, + localPort int, +) (stop func(), err error) { + stopE, err := OpenWithErr(ctx, env, dd, localPort) + if err != nil { + return nil, err + } + return func() { + if ferr := stopE(); ferr != nil { + fmt.Fprintf(ginkgo.GinkgoWriter, + "portforward: forwarder for %s/%s exited with error: %v\n", + dd.Namespace, GatewayServiceName(dd), ferr) + } + }, nil +} diff --git a/test/e2e/pkg/e2eutils/portforward/portforward_test.go b/test/e2e/pkg/e2eutils/portforward/portforward_test.go new file mode 100644 index 00000000..8eaa78e7 --- /dev/null +++ b/test/e2e/pkg/e2eutils/portforward/portforward_test.go @@ -0,0 +1,79 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package portforward + +import ( + "strings" + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" +) + +func TestGatewayServiceName(t *testing.T) { + cases := []struct { + name string + dd *previewv1.DocumentDB + want string + }{ + {"nil", nil, ""}, + { + "short", + &previewv1.DocumentDB{ObjectMeta: metav1.ObjectMeta{Name: "my-dd"}}, + "documentdb-service-my-dd", + }, + { + "truncated", + &previewv1.DocumentDB{ObjectMeta: metav1.ObjectMeta{Name: strings.Repeat("x", 80)}}, + // 19 (prefix) + 44 xs = 63 + "documentdb-service-" + strings.Repeat("x", 44), + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := GatewayServiceName(tc.dd) + if got != tc.want { + t.Errorf("got %q want %q", got, tc.want) + } + if len(got) > 63 { + t.Errorf("name exceeds 63 chars: %d", len(got)) + } + }) + } +} + +func TestGatewayPort(t *testing.T) { + if GatewayPort != 10260 { + t.Errorf("GatewayPort drift: got %d want 10260 (see operator/src/internal/utils/constants.go)", GatewayPort) + } +} + +func TestOpenWithErr_NilEnv(t *testing.T) { + t.Parallel() + stop, err := OpenWithErr(nil, nil, &previewv1.DocumentDB{ObjectMeta: metav1.ObjectMeta{Name: "x"}}, 0) //nolint:staticcheck // testing nil-ctx/env guard + if err == nil { + t.Fatal("expected error for nil env") + } + if stop != nil { + t.Fatal("expected nil stop when error is returned") + } + if !strings.Contains(err.Error(), "env") { + t.Fatalf("error should mention env: %v", err) + } +} + +func TestOpenWithErr_NilDD(t *testing.T) { + t.Parallel() + // Open is a wrapper around OpenWithErr; exercise the backward- + // compat shim's validation path in the same package without + // needing a real *environment.TestingEnvironment. + stop, err := Open(nil, nil, nil, 0) //nolint:staticcheck // testing nil-guard + if err == nil { + t.Fatal("expected error for nil env/dd") + } + if stop != nil { + t.Fatal("expected nil stop when error is returned") + } +} diff --git a/test/e2e/pkg/e2eutils/seed/.keep b/test/e2e/pkg/e2eutils/seed/.keep new file mode 100644 index 00000000..e69de29b diff --git a/test/e2e/pkg/e2eutils/seed/datasets.go b/test/e2e/pkg/e2eutils/seed/datasets.go new file mode 100644 index 00000000..b89f66b9 --- /dev/null +++ b/test/e2e/pkg/e2eutils/seed/datasets.go @@ -0,0 +1,126 @@ +// Package seed provides deterministic, canonical datasets used by the +// DocumentDB E2E suite. All generators are pure functions with no +// external dependencies; they return freshly-allocated slices of +// bson.M so callers may mutate them safely. +package seed + +import ( + "fmt" + "math/rand/v2" + + "go.mongodb.org/mongo-driver/v2/bson" +) + +// SmallDatasetSize is the number of documents returned by SmallDataset. +const SmallDatasetSize = 10 + +// MediumDatasetSize is the number of documents returned by MediumDataset. +const MediumDatasetSize = 1000 + +// SortDatasetSize is the number of documents returned by SortDataset. +const SortDatasetSize = 100 + +// AggDatasetSize is the number of documents returned by AggDataset. +const AggDatasetSize = 50 + +// AggDatasetGroups is the number of distinct category values emitted by +// AggDataset. Callers asserting group cardinality in aggregation tests +// should use this constant. +const AggDatasetGroups = 5 + +// SmallDataset returns exactly SmallDatasetSize documents with +// predictable identity and score fields, suitable for quick insert / +// count round-trips. Shape: {_id: N, name: "doc-N", score: N*10} for +// N in [1, SmallDatasetSize]. +func SmallDataset() []bson.M { + out := make([]bson.M, SmallDatasetSize) + for i := 0; i < SmallDatasetSize; i++ { + n := i + 1 + out[i] = bson.M{ + "_id": n, + "name": fmt.Sprintf("doc-%d", n), + "score": n * 10, + } + } + return out +} + +// MediumDataset returns MediumDatasetSize documents following the same +// shape as SmallDataset, used to validate bulk insert, count, and +// indexing behaviour under non-trivial sizes. +func MediumDataset() []bson.M { + out := make([]bson.M, MediumDatasetSize) + for i := 0; i < MediumDatasetSize; i++ { + n := i + 1 + out[i] = bson.M{ + "_id": n, + "name": fmt.Sprintf("doc-%d", n), + "score": n * 10, + } + } + return out +} + +// sortDatasetSeed is the deterministic seed used by SortDataset so that +// identical Go runs produce identical document order — this is what +// makes sort assertions in tests reproducible. +var sortDatasetSeed = [32]byte{ + 0xd0, 0xc8, 0xd8, 0x53, 's', 'o', 'r', 't', + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +} + +// SortDataset returns SortDatasetSize documents with varied string +// names and numeric scores suitable for validating $sort semantics. +// Output order is intentionally scrambled but deterministic — running +// the function repeatedly yields the same slice so assertions can +// compare to a known ordering. +func SortDataset() []bson.M { + r := rand.New(rand.NewChaCha8(sortDatasetSeed)) + // Generate names from a 12-letter alphabet so sort validations see + // meaningful string comparisons rather than trivial N-indexed names. + const alphabet = "abcdefghijkl" + indices := r.Perm(SortDatasetSize) + out := make([]bson.M, SortDatasetSize) + for i := 0; i < SortDatasetSize; i++ { + n := indices[i] + 1 + // Two-letter name derived from the permutation for variety. + b := []byte{alphabet[n%len(alphabet)], alphabet[(n*7)%len(alphabet)]} + out[i] = bson.M{ + "_id": n, + "name": string(b), + "score": (n * 37) % 1000, + } + } + return out +} + +// aggDatasetSeed is the deterministic seed used by AggDataset. +var aggDatasetSeed = [32]byte{ + 0xa6, 0x67, 0x67, 'a', 'g', 'g', 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +} + +// AggDataset returns AggDatasetSize documents spread across exactly +// AggDatasetGroups distinct `category` values. Every document has a +// unique numeric _id and a per-category `value` field suitable for +// $group / $sum aggregations. +func AggDataset() []bson.M { + r := rand.New(rand.NewChaCha8(aggDatasetSeed)) + categories := []string{"alpha", "beta", "gamma", "delta", "epsilon"} + if len(categories) != AggDatasetGroups { + // Compile-time invariant guarded by the const. + panic("seed: AggDatasetGroups mismatch") + } + out := make([]bson.M, AggDatasetSize) + // Round-robin to guarantee every category appears at least once, + // then perturb value fields with the deterministic RNG. + for i := 0; i < AggDatasetSize; i++ { + cat := categories[i%len(categories)] + out[i] = bson.M{ + "_id": i + 1, + "category": cat, + "value": r.IntN(1000), + } + } + return out +} diff --git a/test/e2e/pkg/e2eutils/seed/datasets_test.go b/test/e2e/pkg/e2eutils/seed/datasets_test.go new file mode 100644 index 00000000..5a9b34c7 --- /dev/null +++ b/test/e2e/pkg/e2eutils/seed/datasets_test.go @@ -0,0 +1,96 @@ +package seed + +import ( + "fmt" + "reflect" + "testing" +) + +func TestSmallDataset_Shape(t *testing.T) { + t.Parallel() + ds := SmallDataset() + if len(ds) != SmallDatasetSize { + t.Fatalf("want %d docs, got %d", SmallDatasetSize, len(ds)) + } + for i, d := range ds { + n := i + 1 + if d["_id"] != n { + t.Fatalf("doc %d: _id=%v, want %d", i, d["_id"], n) + } + if d["name"] != fmt.Sprintf("doc-%d", n) { + t.Fatalf("doc %d: name=%v", i, d["name"]) + } + if d["score"] != n*10 { + t.Fatalf("doc %d: score=%v, want %d", i, d["score"], n*10) + } + } +} + +func TestMediumDataset_Size(t *testing.T) { + t.Parallel() + ds := MediumDataset() + if len(ds) != MediumDatasetSize { + t.Fatalf("want %d docs, got %d", MediumDatasetSize, len(ds)) + } + // Spot-check first and last doc. + if ds[0]["_id"] != 1 { + t.Fatalf("first _id=%v", ds[0]["_id"]) + } + if ds[MediumDatasetSize-1]["_id"] != MediumDatasetSize { + t.Fatalf("last _id=%v", ds[MediumDatasetSize-1]["_id"]) + } +} + +func TestSortDataset_DeterministicOrder(t *testing.T) { + t.Parallel() + a := SortDataset() + b := SortDataset() + if len(a) != SortDatasetSize { + t.Fatalf("size=%d want %d", len(a), SortDatasetSize) + } + if !reflect.DeepEqual(a, b) { + t.Fatalf("SortDataset is not deterministic across calls") + } + // All _id values must be unique and in range [1, SortDatasetSize]. + seen := make(map[any]bool, SortDatasetSize) + for _, d := range a { + id := d["_id"] + if seen[id] { + t.Fatalf("duplicate _id=%v", id) + } + seen[id] = true + } +} + +func TestAggDataset_GroupCardinality(t *testing.T) { + t.Parallel() + ds := AggDataset() + if len(ds) != AggDatasetSize { + t.Fatalf("size=%d want %d", len(ds), AggDatasetSize) + } + cats := map[string]int{} + for _, d := range ds { + c, ok := d["category"].(string) + if !ok { + t.Fatalf("non-string category: %T", d["category"]) + } + cats[c]++ + } + if len(cats) != AggDatasetGroups { + t.Fatalf("want %d distinct categories, got %d (%v)", AggDatasetGroups, len(cats), cats) + } + // Every category should have at least one document (round-robin + // distribution guarantees this when size ≥ groups). + for c, n := range cats { + if n == 0 { + t.Fatalf("category %s empty", c) + } + } +} + +func TestAggDataset_Deterministic(t *testing.T) { + t.Parallel() + if !reflect.DeepEqual(AggDataset(), AggDataset()) { + t.Fatalf("AggDataset not deterministic") + } +} diff --git a/test/e2e/pkg/e2eutils/testenv/.keep b/test/e2e/pkg/e2eutils/testenv/.keep new file mode 100644 index 00000000..e69de29b diff --git a/test/e2e/pkg/e2eutils/testenv/env.go b/test/e2e/pkg/e2eutils/testenv/env.go new file mode 100644 index 00000000..fee8555d --- /dev/null +++ b/test/e2e/pkg/e2eutils/testenv/env.go @@ -0,0 +1,96 @@ +// Package testenv constructs a CloudNative-PG *environment.TestingEnvironment +// pre-configured for the DocumentDB E2E suite. +// +// Upstream CNPG's NewTestingEnvironment only registers the +// volumesnapshot and prometheus-operator scheme groups. DocumentDB specs +// additionally need: +// +// - CloudNative-PG's apiv1 (Cluster, Backup, ScheduledBackup, Pooler, …) +// - k8s.io client-go scheme (core/v1, apps/v1, …) +// - the DocumentDB operator preview API (documentdb.io/preview) +// +// NewDocumentDBTestingEnvironment registers those groups onto the shared +// scheme and rebuilds env.Client so it can Get/List/Watch DocumentDB CRs. +// +// Phase-0 note: CNPG's NewTestingEnvironment parses POSTGRES_IMG with +// Masterminds/semver. If the tag is not semver-parseable (e.g. "latest") +// it returns an error. We default POSTGRES_IMG=busybox:17.2 when the +// variable is unset so the suite can boot without CNPG postgres images. +package testenv + +import ( + "context" + "fmt" + "os" + + cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + "github.com/cloudnative-pg/cloudnative-pg/tests/utils/environment" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" +) + +// DefaultOperatorNamespace is the namespace the DocumentDB operator is +// deployed into by the Helm chart and by the suite's CI fixtures. +const DefaultOperatorNamespace = "documentdb-operator" + +// DefaultPostgresImage is the placeholder image used to satisfy CNPG's +// semver parsing when the caller does not care about the Postgres image +// (DocumentDB specs never launch raw CNPG clusters from this env). +const DefaultPostgresImage = "busybox:17.2" + +// postgresImgEnv is the environment variable consulted by the upstream +// CNPG testing environment constructor. +const postgresImgEnv = "POSTGRES_IMG" + +// NewDocumentDBTestingEnvironment returns a CNPG *TestingEnvironment with +// the CloudNative-PG apiv1, client-go and DocumentDB preview schemes +// registered and env.Client rebuilt against that scheme. The supplied +// context is stored on the returned environment for callers that need it. +func NewDocumentDBTestingEnvironment(ctx context.Context) (*environment.TestingEnvironment, error) { + if _, ok := os.LookupEnv(postgresImgEnv); !ok { + if err := os.Setenv(postgresImgEnv, DefaultPostgresImage); err != nil { + return nil, fmt.Errorf("setting %s: %w", postgresImgEnv, err) + } + } + + env, err := environment.NewTestingEnvironment() + if err != nil { + return nil, fmt.Errorf("creating CNPG testing environment: %w", err) + } + + utilruntime.Must(cnpgv1.AddToScheme(env.Scheme)) + utilruntime.Must(clientgoscheme.AddToScheme(env.Scheme)) + utilruntime.Must(previewv1.AddToScheme(env.Scheme)) + + c, err := client.New(env.RestClientConfig, client.Options{Scheme: env.Scheme}) + if err != nil { + return nil, fmt.Errorf("rebuilding controller-runtime client with DocumentDB scheme: %w", err) + } + env.Client = c + if ctx != nil { + env.Ctx = ctx + } + return env, nil +} + +// DefaultDocumentDBScheme returns a fresh scheme with the same group +// registrations applied by NewDocumentDBTestingEnvironment. It is useful +// for unit tests that construct a fake client without spinning up the +// full TestingEnvironment. +func DefaultDocumentDBScheme() (*runtime.Scheme, error) { + s := runtime.NewScheme() + if err := cnpgv1.AddToScheme(s); err != nil { + return nil, fmt.Errorf("adding cnpg apiv1 to scheme: %w", err) + } + if err := clientgoscheme.AddToScheme(s); err != nil { + return nil, fmt.Errorf("adding client-go scheme: %w", err) + } + if err := previewv1.AddToScheme(s); err != nil { + return nil, fmt.Errorf("adding documentdb preview scheme: %w", err) + } + return s, nil +} diff --git a/test/e2e/pkg/e2eutils/testenv/env_test.go b/test/e2e/pkg/e2eutils/testenv/env_test.go new file mode 100644 index 00000000..d3755df5 --- /dev/null +++ b/test/e2e/pkg/e2eutils/testenv/env_test.go @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package testenv + +import ( + "testing" + + cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + previewv1 "github.com/documentdb/documentdb-operator/api/preview" + corev1 "k8s.io/api/core/v1" +) + +func TestDefaultDocumentDBSchemeRegistersExpectedGroups(t *testing.T) { + s, err := DefaultDocumentDBScheme() + if err != nil { + t.Fatalf("DefaultDocumentDBScheme: %v", err) + } + + if !s.Recognizes(cnpgv1.SchemeGroupVersion.WithKind("Cluster")) { + t.Errorf("expected scheme to recognize cnpg apiv1 Cluster") + } + if !s.Recognizes(previewv1.GroupVersion.WithKind("DocumentDB")) { + t.Errorf("expected scheme to recognize DocumentDB preview group") + } + if !s.Recognizes(corev1.SchemeGroupVersion.WithKind("Pod")) { + t.Errorf("expected scheme to recognize core/v1 Pod") + } +} + +func TestDefaultConstants(t *testing.T) { + if DefaultOperatorNamespace == "" { + t.Fatal("DefaultOperatorNamespace must not be empty") + } + if DefaultPostgresImage == "" { + t.Fatal("DefaultPostgresImage must not be empty") + } +} diff --git a/test/e2e/pkg/e2eutils/timeouts/.keep b/test/e2e/pkg/e2eutils/timeouts/.keep new file mode 100644 index 00000000..e69de29b diff --git a/test/e2e/pkg/e2eutils/timeouts/timeouts.go b/test/e2e/pkg/e2eutils/timeouts/timeouts.go new file mode 100644 index 00000000..8a18b73c --- /dev/null +++ b/test/e2e/pkg/e2eutils/timeouts/timeouts.go @@ -0,0 +1,120 @@ +// Package timeouts centralises the Eventually/Consistently durations +// used by the DocumentDB E2E suite. Where an operation corresponds to a +// CNPG event already modelled by +// github.com/cloudnative-pg/cloudnative-pg/tests/utils/timeouts, this +// package reuses the CNPG value (converted to a time.Duration); for +// DocumentDB-specific operations it defines opinionated defaults. +package timeouts + +import ( + "time" + + cnpgtimeouts "github.com/cloudnative-pg/cloudnative-pg/tests/utils/timeouts" +) + +// Op is a DocumentDB-specific operation identifier. New callers should +// prefer the constants below over ad-hoc strings so that the helper can +// surface un-mapped operations via the UnknownOpFallback. +type Op string + +// DocumentDB-specific operations. When adding an entry here, also +// extend documentDBDefaults/cnpgAlias (and PollInterval if the new op +// needs a non-default poll cadence). +const ( + // DocumentDBReady waits for a fresh DocumentDB cluster to reach the + // running state after creation. + DocumentDBReady Op = "documentDBReady" + // DocumentDBUpgrade waits for an in-place image upgrade rollout. + DocumentDBUpgrade Op = "documentDBUpgrade" + // InstanceScale waits for a replica count change to converge. + InstanceScale Op = "instanceScale" + // PVCResize waits for a StorageConfiguration.PvcSize change to be + // applied across all PVCs. + PVCResize Op = "pvcResize" + // BackupComplete waits for a Backup CR to reach Completed. + BackupComplete Op = "backupComplete" + // RestoreComplete waits for a recovery bootstrap to complete. + RestoreComplete Op = "restoreComplete" + // MongoConnect bounds a single mongo client connect/ping attempt. + MongoConnect Op = "mongoConnect" + // ServiceReady waits for a LoadBalancer / ClusterIP to acquire an + // address and begin routing. + ServiceReady Op = "serviceReady" +) + +// UnknownOpFallback is returned by For when an Op is not in the +// DocumentDB map and has no corresponding CNPG mapping. +const UnknownOpFallback = 2 * time.Minute + +// documentDBDefaults captures the DocumentDB-specific defaults used by +// For. Keep this map in sync with the constants above. +var documentDBDefaults = map[Op]time.Duration{ + DocumentDBReady: 5 * time.Minute, + DocumentDBUpgrade: 10 * time.Minute, + InstanceScale: 5 * time.Minute, + PVCResize: 5 * time.Minute, + BackupComplete: 10 * time.Minute, + RestoreComplete: 15 * time.Minute, + MongoConnect: 30 * time.Second, + ServiceReady: 2 * time.Minute, +} + +// cnpgAlias maps selected DocumentDB ops to their CNPG counterparts. +// When the CNPG timeouts map (optionally overridden via the +// TEST_TIMEOUTS environment variable) contains the aliased event, its +// value — converted from seconds to time.Duration — wins over the +// DocumentDB default. This lets operators share a single tuning knob +// for cluster-readiness style waits. +var cnpgAlias = map[Op]cnpgtimeouts.Timeout{ + DocumentDBReady: cnpgtimeouts.ClusterIsReady, + InstanceScale: cnpgtimeouts.ClusterIsReady, + BackupComplete: cnpgtimeouts.BackupIsReady, +} + +// For returns the Eventually timeout for op. Lookup order: +// 1. CNPG alias (honours TEST_TIMEOUTS env var if set). +// 2. DocumentDB default. +// 3. UnknownOpFallback for unknown ops. +func For(op Op) time.Duration { + if alias, ok := cnpgAlias[op]; ok { + if m, err := cnpgtimeouts.Timeouts(); err == nil { + if s, ok := m[alias]; ok { + return time.Duration(s) * time.Second + } + } + } + if d, ok := documentDBDefaults[op]; ok { + return d + } + return UnknownOpFallback +} + +// PollInterval returns the Eventually poll interval for op. Fast ops +// use a short 2-second poll; slow, cluster-level operations use a +// 10-second poll to reduce API-server churn during long waits. +func PollInterval(op Op) time.Duration { + switch op { + case MongoConnect, ServiceReady: + return 2 * time.Second + case DocumentDBReady, DocumentDBUpgrade, InstanceScale, + PVCResize, BackupComplete, RestoreComplete: + return 10 * time.Second + default: + return 5 * time.Second + } +} + +// AllOps returns the set of DocumentDB operations known to this +// package, in insertion order. Useful for table tests. +func AllOps() []Op { + return []Op{ + DocumentDBReady, + DocumentDBUpgrade, + InstanceScale, + PVCResize, + BackupComplete, + RestoreComplete, + MongoConnect, + ServiceReady, + } +} diff --git a/test/e2e/pkg/e2eutils/timeouts/timeouts_test.go b/test/e2e/pkg/e2eutils/timeouts/timeouts_test.go new file mode 100644 index 00000000..e70d2c7f --- /dev/null +++ b/test/e2e/pkg/e2eutils/timeouts/timeouts_test.go @@ -0,0 +1,50 @@ +package timeouts + +import ( + "testing" + "time" +) + +func TestFor_CoversAllOps(t *testing.T) { + t.Parallel() + for _, op := range AllOps() { + d := For(op) + if d <= 0 { + t.Fatalf("For(%s) returned non-positive %s", op, d) + } + // Guard: every known Op must have an explicit entry in + // documentDBDefaults (even if its value coincidentally equals + // UnknownOpFallback) so adding a new Op forces a choice. + if _, ok := documentDBDefaults[op]; !ok { + t.Fatalf("Op %s missing from documentDBDefaults — add an explicit default", op) + } + } +} + +func TestFor_UnknownOpFallback(t *testing.T) { + t.Parallel() + got := For(Op("this-op-does-not-exist")) + if got != UnknownOpFallback { + t.Fatalf("unknown op: got %s, want %s", got, UnknownOpFallback) + } +} + +func TestFor_DocumentDBUpgrade_IsDocumentDBDefault(t *testing.T) { + t.Parallel() + // Not CNPG-aliased → must come straight from documentDBDefaults. + if got, want := For(DocumentDBUpgrade), 10*time.Minute; got != want { + t.Fatalf("DocumentDBUpgrade: got %s, want %s", got, want) + } +} + +func TestPollInterval_NonZero(t *testing.T) { + t.Parallel() + for _, op := range AllOps() { + if got := PollInterval(op); got <= 0 { + t.Fatalf("PollInterval(%s) non-positive: %s", op, got) + } + } + if got := PollInterval(Op("unknown")); got <= 0 { + t.Fatalf("PollInterval(unknown) non-positive: %s", got) + } +} diff --git a/test/e2e/pkg/e2eutils/tlscerts/tlscerts.go b/test/e2e/pkg/e2eutils/tlscerts/tlscerts.go new file mode 100644 index 00000000..9c5cefdf --- /dev/null +++ b/test/e2e/pkg/e2eutils/tlscerts/tlscerts.go @@ -0,0 +1,117 @@ +// Package tlscerts generates throwaway TLS material (CA + server +// certificate) suitable for DocumentDB E2E "Provided" mode tests. +// +// The generated artefacts are written into an in-memory struct whose +// PEM fields can be plugged directly into a Kubernetes +// kubernetes.io/tls Secret (tls.crt / tls.key) plus an optional +// ca.crt entry for clients that want to verify the chain. +// +// None of this material is secure: keys are 2048-bit RSA, validity +// windows are short, and no revocation story exists. It is only +// intended for tests. +package tlscerts + +import ( + "crypto/rand" + "crypto/rsa" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "fmt" + "math/big" + "net" + "time" +) + +// Bundle is the PEM-encoded material produced by Generate. The fields +// align with the canonical key names used by Kubernetes TLS secrets: +// tls.crt (ServerCertPEM), tls.key (ServerKeyPEM) and the optional +// ca.crt (CACertPEM). +type Bundle struct { + CACertPEM []byte + CAKeyPEM []byte + ServerCertPEM []byte + ServerKeyPEM []byte +} + +// GenerateOptions controls Generate. DNSNames and IPAddresses populate +// the server certificate's SANs; at least one entry is required so +// TLS clients performing hostname verification have something to +// match against. Validity defaults to 24 hours when zero. +type GenerateOptions struct { + // CommonName is the server certificate's CN. Defaults to + // "documentdb-e2e" when empty. + CommonName string + // DNSNames populates the SAN DNSNames field. + DNSNames []string + // IPAddresses populates the SAN IPAddresses field. + IPAddresses []net.IP + // Validity defaults to 24 hours when zero. + Validity time.Duration +} + +// Generate builds a self-signed CA and a server certificate signed by +// that CA. Both are returned as PEM-encoded bytes in Bundle. +func Generate(opts GenerateOptions) (*Bundle, error) { + if len(opts.DNSNames) == 0 && len(opts.IPAddresses) == 0 { + return nil, fmt.Errorf("tlscerts: at least one DNSName or IPAddress SAN is required") + } + validity := opts.Validity + if validity == 0 { + validity = 24 * time.Hour + } + cn := opts.CommonName + if cn == "" { + cn = "documentdb-e2e" + } + + caKey, err := rsa.GenerateKey(rand.Reader, 2048) + if err != nil { + return nil, fmt.Errorf("tlscerts: generate CA key: %w", err) + } + caTmpl := &x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{CommonName: "documentdb-e2e-ca"}, + NotBefore: time.Now().Add(-5 * time.Minute), + NotAfter: time.Now().Add(validity), + KeyUsage: x509.KeyUsageCertSign | x509.KeyUsageDigitalSignature, + BasicConstraintsValid: true, + IsCA: true, + } + caDER, err := x509.CreateCertificate(rand.Reader, caTmpl, caTmpl, &caKey.PublicKey, caKey) + if err != nil { + return nil, fmt.Errorf("tlscerts: sign CA: %w", err) + } + + srvKey, err := rsa.GenerateKey(rand.Reader, 2048) + if err != nil { + return nil, fmt.Errorf("tlscerts: generate server key: %w", err) + } + srvTmpl := &x509.Certificate{ + SerialNumber: big.NewInt(2), + Subject: pkix.Name{CommonName: cn}, + NotBefore: time.Now().Add(-5 * time.Minute), + NotAfter: time.Now().Add(validity), + KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth, x509.ExtKeyUsageClientAuth}, + DNSNames: append([]string(nil), opts.DNSNames...), + IPAddresses: append([]net.IP(nil), opts.IPAddresses...), + } + srvDER, err := x509.CreateCertificate(rand.Reader, srvTmpl, caTmpl, &srvKey.PublicKey, caKey) + if err != nil { + return nil, fmt.Errorf("tlscerts: sign server cert: %w", err) + } + + return &Bundle{ + CACertPEM: pemEncode("CERTIFICATE", caDER), + CAKeyPEM: pemEncode("RSA PRIVATE KEY", x509.MarshalPKCS1PrivateKey(caKey)), + ServerCertPEM: pemEncode("CERTIFICATE", srvDER), + ServerKeyPEM: pemEncode("RSA PRIVATE KEY", x509.MarshalPKCS1PrivateKey(srvKey)), + }, nil +} + +// pemEncode is a tiny wrapper so callers don't need to construct +// pem.Block literals at each call site. +func pemEncode(blockType string, der []byte) []byte { + return pem.EncodeToMemory(&pem.Block{Type: blockType, Bytes: der}) +} diff --git a/test/e2e/pkg/e2eutils/tlscerts/tlscerts_test.go b/test/e2e/pkg/e2eutils/tlscerts/tlscerts_test.go new file mode 100644 index 00000000..7e2c1eee --- /dev/null +++ b/test/e2e/pkg/e2eutils/tlscerts/tlscerts_test.go @@ -0,0 +1,97 @@ +package tlscerts + +import ( + "crypto/x509" + "encoding/pem" + "net" + "strings" + "testing" + "time" +) + +func TestGenerateRejectsEmptySANs(t *testing.T) { + if _, err := Generate(GenerateOptions{}); err == nil { + t.Fatalf("expected error for empty SANs") + } +} + +func TestGenerateProducesVerifiableChain(t *testing.T) { + b, err := Generate(GenerateOptions{ + CommonName: "gw.test", + DNSNames: []string{"gw.test", "localhost"}, + IPAddresses: []net.IP{net.ParseIP("127.0.0.1")}, + Validity: 1 * time.Hour, + }) + if err != nil { + t.Fatalf("Generate: %v", err) + } + for name, pemBytes := range map[string][]byte{ + "ca.crt": b.CACertPEM, + "ca.key": b.CAKeyPEM, + "tls.crt": b.ServerCertPEM, + "tls.key": b.ServerKeyPEM, + } { + if len(pemBytes) == 0 { + t.Fatalf("%s empty", name) + } + if blk, _ := pem.Decode(pemBytes); blk == nil { + t.Fatalf("%s not valid PEM", name) + } + } + + caBlock, _ := pem.Decode(b.CACertPEM) + if caBlock == nil { + t.Fatal("decode CA") + } + caCert, err := x509.ParseCertificate(caBlock.Bytes) + if err != nil { + t.Fatalf("parse CA: %v", err) + } + if !caCert.IsCA { + t.Fatal("CA.IsCA = false") + } + srvBlock, _ := pem.Decode(b.ServerCertPEM) + srvCert, err := x509.ParseCertificate(srvBlock.Bytes) + if err != nil { + t.Fatalf("parse server: %v", err) + } + pool := x509.NewCertPool() + pool.AddCert(caCert) + if _, err := srvCert.Verify(x509.VerifyOptions{ + Roots: pool, + DNSName: "gw.test", + CurrentTime: time.Now(), + }); err != nil { + t.Fatalf("verify: %v", err) + } + if !containsString(srvCert.DNSNames, "localhost") { + t.Fatalf("missing localhost SAN: %v", srvCert.DNSNames) + } +} + +func TestGenerateDefaultValidity(t *testing.T) { + b, err := Generate(GenerateOptions{DNSNames: []string{"x"}}) + if err != nil { + t.Fatal(err) + } + blk, _ := pem.Decode(b.ServerCertPEM) + cert, err := x509.ParseCertificate(blk.Bytes) + if err != nil { + t.Fatal(err) + } + if cert.NotAfter.Sub(cert.NotBefore) < time.Hour { + t.Fatalf("validity too short: %s", cert.NotAfter.Sub(cert.NotBefore)) + } + if !strings.EqualFold(cert.Subject.CommonName, "documentdb-e2e") { + t.Fatalf("unexpected CN: %s", cert.Subject.CommonName) + } +} + +func containsString(xs []string, want string) bool { + for _, x := range xs { + if x == want { + return true + } + } + return false +} diff --git a/test/e2e/runid.go b/test/e2e/runid.go new file mode 100644 index 00000000..90eb3f98 --- /dev/null +++ b/test/e2e/runid.go @@ -0,0 +1,69 @@ +package e2e + +import ( + "crypto/rand" + "encoding/hex" + "fmt" + "os" + "sync" + "time" +) + +// runIDEnv is the environment variable consulted to pin the run +// identifier. When set and non-empty, its value is used verbatim; when +// unset, a per-process id is generated from the current time and a +// small random suffix on first access. +const runIDEnv = "E2E_RUN_ID" + +var ( + runIDOnce sync.Once + runIDVal string +) + +// RunID returns the process-scoped run identifier used to namespace +// shared fixtures and to label every cluster-scoped object the e2e +// suite creates. Stable for the life of the process. +// +// The identifier is resolved in this order: +// +// 1. $E2E_RUN_ID when set and non-empty (useful for reusing / cleaning +// up fixtures across invocations); +// 2. otherwise a short, low-collision id derived from the current +// Unix nanosecond timestamp plus four random bytes. +// +// Multiple test binaries that run independently each get their own +// id, which is what the fixture teardown logic relies on to avoid +// deleting another binary's still-live resources. +func RunID() string { + runIDOnce.Do(func() { + if v := os.Getenv(runIDEnv); v != "" { + runIDVal = v + return + } + runIDVal = generateRunID() + }) + return runIDVal +} + +// generateRunID produces a short, lowercase alphanumeric identifier. +// Exposed for tests via the resetRunIDForTest helper below. +func generateRunID() string { + var b [4]byte + if _, err := rand.Read(b[:]); err != nil { + // crypto/rand should never fail in practice; fall back to a + // time-derived suffix so the suite keeps running. + return fmt.Sprintf("t%x", time.Now().UnixNano())[:10] + } + // UnixNano in base-36 keeps the prefix short while remaining + // monotonic; 8 hex chars of randomness reduce cross-process + // collision risk when two binaries start in the same nanosecond. + ts := time.Now().UnixNano() + return fmt.Sprintf("%x%s", ts&0xFFFFFFFF, hex.EncodeToString(b[:])) +} + +// resetRunIDForTest re-initialises the once-guard so tests can exercise +// the generation path deterministically. Not part of the public API. +func resetRunIDForTest() { + runIDOnce = sync.Once{} + runIDVal = "" +} diff --git a/test/e2e/runid_test.go b/test/e2e/runid_test.go new file mode 100644 index 00000000..ac07c239 --- /dev/null +++ b/test/e2e/runid_test.go @@ -0,0 +1,54 @@ +package e2e + +import ( + "os" + "regexp" + "testing" +) + +func TestRunIDFromEnv(t *testing.T) { + t.Setenv(runIDEnv, "pinned-run-42") + resetRunIDForTest() + t.Cleanup(resetRunIDForTest) + + if got := RunID(); got != "pinned-run-42" { + t.Fatalf("RunID with env override = %q, want %q", got, "pinned-run-42") + } + // Second call must return the same value (cached). + if got := RunID(); got != "pinned-run-42" { + t.Fatalf("second RunID = %q, want %q", got, "pinned-run-42") + } +} + +func TestRunIDGeneratedWhenEnvMissing(t *testing.T) { + _ = os.Unsetenv(runIDEnv) + resetRunIDForTest() + t.Cleanup(resetRunIDForTest) + + a := RunID() + if a == "" { + t.Fatal("generated RunID must not be empty") + } + // Stable across calls. + if b := RunID(); a != b { + t.Fatalf("RunID not stable: %q != %q", a, b) + } + // Short and lowercase hex/alnum. + if len(a) > 24 { + t.Fatalf("RunID unexpectedly long: %q", a) + } + if !regexp.MustCompile(`^[a-z0-9]+$`).MatchString(a) { + t.Fatalf("RunID not lowercase alnum: %q", a) + } +} + +func TestGenerateRunIDUnique(t *testing.T) { + seen := map[string]struct{}{} + for range 16 { + v := generateRunID() + if _, dup := seen[v]; dup { + t.Fatalf("generateRunID produced duplicate %q", v) + } + seen[v] = struct{}{} + } +} diff --git a/test/e2e/suite.go b/test/e2e/suite.go new file mode 100644 index 00000000..361df5e2 --- /dev/null +++ b/test/e2e/suite.go @@ -0,0 +1,212 @@ +package e2e + +import ( + "context" + "errors" + "fmt" + "os" + "path/filepath" + "sync" + "time" + + "github.com/cloudnative-pg/cloudnative-pg/tests/utils/environment" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/fixtures" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/namespaces" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/operatorhealth" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/testenv" +) + +// suiteEnv holds the process-wide CNPG TestingEnvironment used by every +// spec in the current test binary. It is populated by SetupSuite and +// cleared by TeardownSuite. Each Ginkgo test binary (root + per-area) +// gets its own copy; state is not shared across binaries. +var ( + suiteEnv *environment.TestingEnvironment + suiteEnvOnce sync.Once + suiteEnvErr error + + // suiteGate is the operator-pod churn sentinel captured at + // SetupSuite time. It is reused by [CheckOperatorUnchanged] + // from per-area BeforeEach hooks so a single operator restart + // during the run aborts every subsequent spec instead of + // producing confusing downstream failures. + suiteGate *operatorhealth.Gate +) + +// SuiteEnv returns the TestingEnvironment initialized by SetupSuite. +// Specs must invoke this only after SynchronizedBeforeSuite has run on +// the local node; a nil return means setup was skipped or failed. +func SuiteEnv() *environment.TestingEnvironment { return suiteEnv } + +// SetupSuite builds the shared TestingEnvironment (idempotent) and runs +// the operator-health gate, failing fast if the operator pod is not +// Ready within timeout. Intended to be called from +// SynchronizedBeforeSuite in the suite_test.go of every test binary. +func SetupSuite(ctx context.Context, operatorReadyTimeout time.Duration) error { + suiteEnvOnce.Do(func() { + // Propagate the resolved run id into every package that + // stamps it onto fixtures. Both fixtures and namespaces + // must see the same value before any namespace is + // derived so per-spec names collide deterministically + // across binaries when E2E_RUN_ID is exported. + fixtures.SetRunID(RunID()) + namespaces.SetRunIDFunc(RunID) + + env, err := testenv.NewDocumentDBTestingEnvironment(ctx) + if err != nil { + suiteEnvErr = fmt.Errorf("building TestingEnvironment: %w", err) + return + } + suiteEnv = env + if err := gateOperatorReady(ctx, env.Client, testenv.DefaultOperatorNamespace, operatorReadyTimeout); err != nil { + suiteEnvErr = fmt.Errorf("operator health gate: %w", err) + } + }) + return suiteEnvErr +} + +// TeardownSuite releases the shared fixtures created during the suite +// run. Safe to call even when SetupSuite failed or was never invoked. +// Errors from individual fixture teardowns are joined so the caller +// sees every problem rather than just the first. +func TeardownSuite(ctx context.Context) error { + if suiteEnv == nil || suiteEnv.Client == nil { + return nil + } + var errs []error + if err := fixtures.TeardownSharedRO(ctx, suiteEnv.Client); err != nil && !isNotFound(err) { + errs = append(errs, fmt.Errorf("teardown shared-ro: %w", err)) + } + if err := fixtures.TeardownSharedScale(ctx, suiteEnv.Client); err != nil && !isNotFound(err) { + errs = append(errs, fmt.Errorf("teardown shared-scale: %w", err)) + } + return errors.Join(errs...) +} + +// CheckOperatorUnchanged verifies that the operator pod captured at +// SetupSuite time is still running with the same UID and restart count. +// Returns nil when suiteGate has not been initialized yet (e.g., the +// caller is in the root binary before SynchronizedBeforeSuite), or when +// the operator pod matches the snapshot. Any drift returns a wrapped +// error and flips the package-level churn sentinel so subsequent +// SkipIfChurned calls observe it. +// +// Every per-area suite (except tests/upgrade/, where operator restarts +// are expected) should invoke this from a BeforeEach: +// +// var _ = BeforeEach(func() { +// Expect(e2e.CheckOperatorUnchanged()).To(Succeed()) +// }) +func CheckOperatorUnchanged() error { + if suiteGate == nil { + return nil + } + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + return suiteGate.Verify(ctx) +} + +// gateOperatorReady waits up to timeout for the DocumentDB operator pod +// to reach Ready=True and stores the captured [operatorhealth.Gate] in +// the package-level suiteGate so [CheckOperatorUnchanged] can reuse it. +func gateOperatorReady(ctx context.Context, c client.Client, ns string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + const poll = 2 * time.Second + var lastReason string + for { + pod, err := findOperatorPodForGate(ctx, c, ns) + switch { + case err == nil && podReady(pod): + g, gateErr := operatorhealth.NewGate(ctx, c, ns) + if gateErr != nil { + return fmt.Errorf("snapshot operator gate: %w", gateErr) + } + suiteGate = g + return nil + case err != nil: + lastReason = err.Error() + default: + lastReason = fmt.Sprintf("pod %s/%s not ready yet", ns, pod.Name) + } + if time.Now().After(deadline) { + return fmt.Errorf("operator pod in %q not ready after %s: %s", ns, timeout, lastReason) + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(poll): + } + } +} + +// findOperatorPodForGate locates the operator pod via the same label +// selector operatorhealth uses. Kept private to avoid cycling the +// internals of operatorhealth — if that package grows an exported +// finder, switch to it. +func findOperatorPodForGate(ctx context.Context, c client.Client, ns string) (*corev1.Pod, error) { + var pods corev1.PodList + if err := c.List(ctx, &pods, + client.InNamespace(ns), + client.MatchingLabels{operatorhealth.PodLabelKey: operatorhealth.PodLabelValue}, + ); err != nil { + return nil, fmt.Errorf("listing operator pods: %w", err) + } + if len(pods.Items) == 0 { + return nil, fmt.Errorf("no operator pods with %s=%s in %s", + operatorhealth.PodLabelKey, operatorhealth.PodLabelValue, ns) + } + return &pods.Items[0], nil +} + +func podReady(pod *corev1.Pod) bool { + if pod == nil || pod.Status.Phase != corev1.PodRunning { + return false + } + for _, cond := range pod.Status.Conditions { + if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue { + return true + } + } + return false +} + +// isNotFound detects "resource gone" errors returned by fixture +// teardown so the suite does not fail when fixtures were never created +// (e.g., a smoke-only run). +func isNotFound(err error) bool { + return err != nil && apierrors.IsNotFound(err) +} + +// ArtifactsDir returns the directory E2E artifacts (logs, junit reports) +// should be written to. The default layout isolates each ginkgo binary +// run and each parallel process: +// +// ./_artifacts//proc-/ +// +// The directory is created lazily on first call. Override the entire +// path via E2E_ARTIFACTS_DIR — the override is taken verbatim (no RunID +// or proc suffix is appended). +func ArtifactsDir() string { + if v := os.Getenv("E2E_ARTIFACTS_DIR"); v != "" { + _ = os.MkdirAll(v, 0o755) + return v + } + dir := filepath.Join(".", "_artifacts", RunID(), "proc-"+procIDString()) + _ = os.MkdirAll(dir, 0o755) + return dir +} + +// procIDString returns the Ginkgo parallel process id or "1" when +// unset. Kept separate from the fixtures procID helper to avoid a +// circular dependency and because callers in suite.go only need a +// string, not the int form. +func procIDString() string { + if v := os.Getenv("GINKGO_PARALLEL_PROCESS"); v != "" { + return v + } + return "1" +} diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go new file mode 100644 index 00000000..a1274238 --- /dev/null +++ b/test/e2e/suite_test.go @@ -0,0 +1,127 @@ +// suite_test.go is the Ginkgo root for the DocumentDB Kubernetes +// Operator E2E suite. It owns shared bootstrap: building the CNPG +// TestingEnvironment, running the operator-health gate, and tearing +// down session-scoped fixtures. Each per-area package under tests/ +// compiles to its own test binary and performs the same bootstrap via +// the exported SetupSuite / TeardownSuite helpers in suite.go. +// +// Cross-binary run-id contract: +// +// Per-spec fixtures (labeled namespaces, credential secrets) are +// stamped with e2e.RunID(), which falls back to a random value when +// E2E_RUN_ID is unset. Every Ginkgo test binary computes its own +// RunID at start-up, so running two binaries back-to-back without +// E2E_RUN_ID means they cannot adopt each other's fixtures — the +// second binary will reject the mismatched run-id label. To run +// multiple binaries in a single logical E2E run (CI matrix, manual +// bisection, etc.) export E2E_RUN_ID= for all of +// them. When the variable is empty, SynchronizedBeforeSuite logs a +// warning to GinkgoWriter so it surfaces in test output. +// +// Environment variables consulted by the suite: +// +// TEST_DEPTH // 0–4 — depth tier, see levels.go. Default: 1 (High). +// TEST_TIMEOUTS // optional timeout profile, consumed by pkg/e2eutils/timeouts. +// KUBECONFIG // standard; required to reach the test cluster. +// POSTGRES_IMG // placeholder for CNPG's semver parsing (default busybox:17.2). +// E2E_ARTIFACTS_DIR // override for artifact output (default ./_artifacts). +// E2E_RUN_ID // optional shared id for cross-binary fixture reuse. +// E2E_TAIL_LOGS // "1" enables the best-effort operator log tailer. +// +// Standard Ginkgo v2 flags (--ginkgo.label-filter, --ginkgo.focus, -p, +// etc.) are auto-registered. +package e2e + +import ( + "context" + "fmt" + "os" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// operatorReadyTimeout bounds how long SynchronizedBeforeSuite waits +// for the operator pod to report Ready=True before aborting the suite. +const operatorReadyTimeout = 2 * time.Minute + +// TestE2E is the Ginkgo root for this package. Per-area test binaries +// live under tests// and have their own TestX entry points. +func TestE2E(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "DocumentDB E2E Suite") +} + +var _ = SynchronizedBeforeSuite( + // Node 1 (primary process): build the environment, gate the + // operator, optionally start the log tailer, then publish an + // empty marker — each node rebuilds its own local env so there is + // nothing to serialize. + func(ctx SpecContext) []byte { + if os.Getenv("E2E_RUN_ID") == "" { + fmt.Fprintf(GinkgoWriter, + "e2e: WARNING — E2E_RUN_ID is unset; per-spec fixtures cannot be reused "+ + "across test binaries in this run. Export E2E_RUN_ID= "+ + "before invoking multiple ginkgo binaries to share labeled fixtures.\n") + } + if err := SetupSuite(ctx, operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("suite bootstrap failed on node 1: %v", err)) + } + fmt.Fprintf(GinkgoWriter, + "e2e: depth=%d (TEST_DEPTH=%q) artifacts=%s\n", + CurrentLevel(), os.Getenv("TEST_DEPTH"), ArtifactsDir()) + fmt.Fprintf(GinkgoWriter, + "e2e: active area labels = %v\n", allAreaLabels()) + if os.Getenv("E2E_TAIL_LOGS") == "1" { + startOperatorLogTailer(context.Background()) + } + return []byte{} + }, + // All nodes: build a local env so Ginkgo parallel processes each + // have their own *environment.TestingEnvironment to work with. + func(_ SpecContext, _ []byte) { + if err := SetupSuite(context.Background(), operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("suite bootstrap failed on worker node: %v", err)) + } + }, +) + +var _ = SynchronizedAfterSuite( + // All nodes: teardown shared fixtures. Errors are logged but not + // escalated — cleanup is best-effort. + func(ctx SpecContext) { + if err := TeardownSuite(ctx); err != nil { + fmt.Fprintf(GinkgoWriter, "e2e: teardown reported errors: %v\n", err) + } + }, + // Node 1: no-op. Nothing to aggregate. + func(_ SpecContext) {}, +) + +// allAreaLabels returns the static list of area labels declared in +// labels.go. Kept in sync manually; adding a new area should append +// here and in labels.go together. +func allAreaLabels() []string { + return []string{ + LifecycleLabel, ScaleLabel, DataLabel, PerformanceLabel, + BackupLabel, RecoveryLabel, TLSLabel, FeatureLabel, + ExposureLabel, StatusLabel, UpgradeLabel, + } +} + +// startOperatorLogTailer is currently a no-op. The earlier placeholder +// that wrote a stub operator.log into $ARTIFACTS has been removed so +// failure triage does not find an empty file and assume the tailer ran. +// When E2E_TAIL_LOGS=1 is set the suite logs a reminder that no log +// streaming is active yet. +// +// TODO(p2): replace with a proper client-go PodLogs stream that +// appends until the context is cancelled. See +// docs/designs/e2e-test-suite.md §"Diagnostics". +func startOperatorLogTailer(_ context.Context) { + fmt.Fprintf(GinkgoWriter, + "e2e: E2E_TAIL_LOGS=1 requested but the operator log tailer is not implemented yet; "+ + "no operator.log will be produced for this run.\n") +} diff --git a/test/e2e/tests/data/aggregation_test.go b/test/e2e/tests/data/aggregation_test.go new file mode 100644 index 00000000..58e149e6 --- /dev/null +++ b/test/e2e/tests/data/aggregation_test.go @@ -0,0 +1,110 @@ +package data + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" + + e2e "github.com/documentdb/documentdb-operator/test/e2e" + emongo "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/seed" +) + +var _ = Describe("DocumentDB data — aggregation", + Ordered, + Label(e2e.DataLabel), + e2e.MediumLevelLabel, + func() { + var ( + ctx context.Context + handle *emongo.Handle + dbName string + coll *mongo.Collection + ) + + BeforeAll(func() { + ctx = context.Background() + handle, dbName = connectSharedRO(ctx) + coll = handle.Database(dbName).Collection("agg") + docs := seed.AggDataset() + any := make([]any, len(docs)) + for i := range docs { + any[i] = docs[i] + } + _, err := coll.InsertMany(ctx, any) + Expect(err).NotTo(HaveOccurred()) + }) + AfterAll(func() { + if handle != nil { + _ = handle.Client().Database(dbName).Drop(ctx) + _ = handle.Close(ctx) + } + }) + + It("groups documents by category and counts per-group cardinality", func() { + pipe := mongo.Pipeline{ + {{Key: "$group", Value: bson.D{ + {Key: "_id", Value: "$category"}, + {Key: "count", Value: bson.D{{Key: "$sum", Value: 1}}}, + }}}, + } + cur, err := coll.Aggregate(ctx, pipe) + Expect(err).NotTo(HaveOccurred()) + defer cur.Close(ctx) + var results []bson.M + Expect(cur.All(ctx, &results)).To(Succeed()) + Expect(results).To(HaveLen(seed.AggDatasetGroups)) + var total int64 + for _, r := range results { + switch v := r["count"].(type) { + case int32: + total += int64(v) + case int64: + total += v + default: + Fail("unexpected count type") + } + } + Expect(total).To(Equal(int64(seed.AggDatasetSize))) + }) + + It("filters with $match before grouping", func() { + pipe := mongo.Pipeline{ + {{Key: "$match", Value: bson.D{{Key: "category", Value: "alpha"}}}}, + {{Key: "$group", Value: bson.D{ + {Key: "_id", Value: "$category"}, + {Key: "n", Value: bson.D{{Key: "$sum", Value: 1}}}, + }}}, + } + cur, err := coll.Aggregate(ctx, pipe) + Expect(err).NotTo(HaveOccurred()) + defer cur.Close(ctx) + var results []bson.M + Expect(cur.All(ctx, &results)).To(Succeed()) + Expect(results).To(HaveLen(1)) + Expect(results[0]["_id"]).To(Equal("alpha")) + }) + + It("projects selected fields with $project", func() { + pipe := mongo.Pipeline{ + {{Key: "$match", Value: bson.D{{Key: "_id", Value: 1}}}}, + {{Key: "$project", Value: bson.D{ + {Key: "_id", Value: 0}, + {Key: "category", Value: 1}, + }}}, + } + cur, err := coll.Aggregate(ctx, pipe) + Expect(err).NotTo(HaveOccurred()) + defer cur.Close(ctx) + var results []bson.M + Expect(cur.All(ctx, &results)).To(Succeed()) + Expect(results).To(HaveLen(1)) + // _id was explicitly excluded; only category remains. + Expect(results[0]).NotTo(HaveKey("_id")) + Expect(results[0]).To(HaveKey("category")) + }) + }, +) diff --git a/test/e2e/tests/data/crud_test.go b/test/e2e/tests/data/crud_test.go new file mode 100644 index 00000000..3d290fff --- /dev/null +++ b/test/e2e/tests/data/crud_test.go @@ -0,0 +1,85 @@ +package data + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "go.mongodb.org/mongo-driver/v2/bson" + + e2e "github.com/documentdb/documentdb-operator/test/e2e" + emongo "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/seed" +) + +var _ = Describe("DocumentDB data — CRUD", + Ordered, + Label(e2e.DataLabel, e2e.BasicLabel), + e2e.MediumLevelLabel, + func() { + var ( + ctx context.Context + handle *emongo.Handle + dbName string + ) + + BeforeAll(func() { + ctx = context.Background() + handle, dbName = connectSharedRO(ctx) + }) + AfterAll(func() { + if handle != nil { + _ = handle.Client().Database(dbName).Drop(ctx) + _ = handle.Close(ctx) + } + }) + + It("inserts a document and finds it", func() { + coll := handle.Database(dbName).Collection("crud_insert_find") + _, err := coll.InsertOne(ctx, bson.M{"_id": 1, "name": "alice", "score": 10}) + Expect(err).NotTo(HaveOccurred()) + var got bson.M + Expect(coll.FindOne(ctx, bson.M{"_id": 1}).Decode(&got)).To(Succeed()) + Expect(got["name"]).To(Equal("alice")) + }) + + It("bulk inserts the small dataset and counts documents", func() { + coll := handle.Database(dbName).Collection("crud_bulk") + docs := seed.SmallDataset() + any := make([]any, len(docs)) + for i := range docs { + any[i] = docs[i] + } + _, err := coll.InsertMany(ctx, any) + Expect(err).NotTo(HaveOccurred()) + n, err := coll.CountDocuments(ctx, bson.M{}) + Expect(err).NotTo(HaveOccurred()) + Expect(n).To(Equal(int64(seed.SmallDatasetSize))) + }) + + It("updates a document in place", func() { + coll := handle.Database(dbName).Collection("crud_update") + _, err := coll.InsertOne(ctx, bson.M{"_id": 1, "status": "new"}) + Expect(err).NotTo(HaveOccurred()) + res, err := coll.UpdateOne(ctx, bson.M{"_id": 1}, bson.M{"$set": bson.M{"status": "done"}}) + Expect(err).NotTo(HaveOccurred()) + Expect(res.ModifiedCount).To(Equal(int64(1))) + var got bson.M + Expect(coll.FindOne(ctx, bson.M{"_id": 1}).Decode(&got)).To(Succeed()) + Expect(got["status"]).To(Equal("done")) + }) + + It("deletes a document and observes the decrement", func() { + coll := handle.Database(dbName).Collection("crud_delete") + docs := []any{bson.M{"_id": 1}, bson.M{"_id": 2}, bson.M{"_id": 3}} + _, err := coll.InsertMany(ctx, docs) + Expect(err).NotTo(HaveOccurred()) + res, err := coll.DeleteOne(ctx, bson.M{"_id": 2}) + Expect(err).NotTo(HaveOccurred()) + Expect(res.DeletedCount).To(Equal(int64(1))) + n, err := coll.CountDocuments(ctx, bson.M{}) + Expect(err).NotTo(HaveOccurred()) + Expect(n).To(Equal(int64(2))) + }) + }, +) diff --git a/test/e2e/tests/data/data_suite_test.go b/test/e2e/tests/data/data_suite_test.go new file mode 100644 index 00000000..f83eec43 --- /dev/null +++ b/test/e2e/tests/data/data_suite_test.go @@ -0,0 +1,56 @@ +// Package data hosts the DocumentDB E2E data area. See +// docs/designs/e2e-test-suite.md for the spec catalog. This file is +// the Ginkgo root for the area binary and shares bootstrap with the +// other area binaries via the exported helpers in package e2e. +package data + +import ( + "context" + "fmt" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/e2e" +) + +const operatorReadyTimeout = 2 * time.Minute + +func TestData(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "DocumentDB E2E - Data", Label(e2e.DataLabel)) +} + +var _ = SynchronizedBeforeSuite( + func(ctx SpecContext) []byte { + if err := e2e.SetupSuite(ctx, operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("data bootstrap: %v", err)) + } + return []byte{} + }, + func(_ SpecContext, _ []byte) { + if err := e2e.SetupSuite(context.Background(), operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("data worker bootstrap: %v", err)) + } + }, +) + +var _ = SynchronizedAfterSuite( + func(ctx SpecContext) { + if err := e2e.TeardownSuite(ctx); err != nil { + fmt.Fprintf(GinkgoWriter, "data teardown: %v\n", err) + } + }, + func(_ SpecContext) {}, +) + +// BeforeEach in this area aborts the spec if the operator pod has +// drifted since SetupSuite (UID/name/restart-count change). Area +// tests/upgrade/ intentionally omits this hook because operator +// restarts are part of its scenario. +var _ = BeforeEach(func() { +Expect(e2e.CheckOperatorUnchanged()).To(Succeed(), +"operator health check failed — a previous spec or reconciler likely restarted the operator") +}) diff --git a/test/e2e/tests/data/delete_ops_test.go b/test/e2e/tests/data/delete_ops_test.go new file mode 100644 index 00000000..19eedf7a --- /dev/null +++ b/test/e2e/tests/data/delete_ops_test.go @@ -0,0 +1,79 @@ +package data + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" + + e2e "github.com/documentdb/documentdb-operator/test/e2e" + emongo "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/seed" +) + +// This spec writes to its per-spec Mongo database only — the shared +// read-only CR is not mutated, honoring fixture contracts. The "RO" in +// SharedRO means the Kubernetes custom resource is read-only; data-plane +// writes into isolated databases are permitted. +var _ = Describe("DocumentDB data — delete operators", + Ordered, + Label(e2e.DataLabel), + e2e.MediumLevelLabel, + func() { + var ( + ctx context.Context + handle *emongo.Handle + dbName string + coll *mongo.Collection + ) + + BeforeAll(func() { + ctx = context.Background() + handle, dbName = connectSharedRO(ctx) + coll = handle.Database(dbName).Collection("delete_ops") + }) + AfterAll(func() { + if handle != nil { + _ = handle.Client().Database(dbName).Drop(ctx) + _ = handle.Close(ctx) + } + }) + + BeforeEach(func() { + // Reset state between Its so counts are deterministic. + _, err := coll.DeleteMany(ctx, bson.M{}) + Expect(err).NotTo(HaveOccurred()) + seedSmall(ctx, coll) + }) + + It("deleteOne removes exactly one matching document", func() { + res, err := coll.DeleteOne(ctx, bson.M{"score": bson.M{"$gte": 30}}) + Expect(err).NotTo(HaveOccurred()) + Expect(res.DeletedCount).To(Equal(int64(1))) + n, err := coll.CountDocuments(ctx, bson.M{}) + Expect(err).NotTo(HaveOccurred()) + Expect(n).To(Equal(int64(seed.SmallDatasetSize - 1))) + }) + + It("deleteMany removes every matching document", func() { + // SmallDataset scores are 10..100. >= 50 → ids 5..10 → 6 docs. + res, err := coll.DeleteMany(ctx, bson.M{"score": bson.M{"$gte": 50}}) + Expect(err).NotTo(HaveOccurred()) + Expect(res.DeletedCount).To(Equal(int64(6))) + n, err := coll.CountDocuments(ctx, bson.M{}) + Expect(err).NotTo(HaveOccurred()) + Expect(n).To(Equal(int64(seed.SmallDatasetSize - 6))) + }) + + It("deleteMany with empty filter removes all documents", func() { + res, err := coll.DeleteMany(ctx, bson.M{}) + Expect(err).NotTo(HaveOccurred()) + Expect(res.DeletedCount).To(Equal(int64(seed.SmallDatasetSize))) + n, err := coll.CountDocuments(ctx, bson.M{}) + Expect(err).NotTo(HaveOccurred()) + Expect(n).To(Equal(int64(0))) + }) + }, +) diff --git a/test/e2e/tests/data/helpers_test.go b/test/e2e/tests/data/helpers_test.go new file mode 100644 index 00000000..5b38e2c8 --- /dev/null +++ b/test/e2e/tests/data/helpers_test.go @@ -0,0 +1,31 @@ +// Package data hosts DocumentDB E2E data-area specs. This file provides +// a small connectSharedRO helper shared across the spec files in this +// package so each spec does not repeat the fixture-get / +// port-forward / client-connect plumbing. It is a test-only helper +// (package data) and is not exported to other areas. +package data + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + e2e "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/fixtures" + emongo "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" +) + +// connectSharedRO returns a Handle against the session-wide SharedRO +// DocumentDB cluster and a DB name unique to the calling spec. The +// returned Handle MUST be closed by the caller (typically from +// AfterAll). dbName is derived from CurrentSpecReport().FullText() so +// Ginkgo parallel processes running the same file against the same +// cluster do not collide on collection state. +func connectSharedRO(ctx context.Context) (*emongo.Handle, string) { + roHandle, err := fixtures.GetOrCreateSharedRO(ctx, e2e.SuiteEnv().Client) + Expect(err).NotTo(HaveOccurred(), "get-or-create shared-ro fixture") + h, err := emongo.NewFromDocumentDB(ctx, e2e.SuiteEnv(), roHandle.Namespace(), roHandle.Name()) + Expect(err).NotTo(HaveOccurred(), "connect to shared-ro gateway") + return h, fixtures.DBNameFor(CurrentSpecReport().FullText()) +} diff --git a/test/e2e/tests/data/pipeline_test.go b/test/e2e/tests/data/pipeline_test.go new file mode 100644 index 00000000..faae56c2 --- /dev/null +++ b/test/e2e/tests/data/pipeline_test.go @@ -0,0 +1,120 @@ +package data + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" + + e2e "github.com/documentdb/documentdb-operator/test/e2e" + emongo "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" +) + +// pipeline_test.go exercises more complex aggregation pipelines: +// $lookup (joins), $unwind (array flattening), and $group. Data is +// seeded inline because seed.AggDataset does not model cross-collection +// relationships. +var _ = Describe("DocumentDB data — complex pipelines", + Ordered, + Label(e2e.DataLabel), + e2e.MediumLevelLabel, + func() { + var ( + ctx context.Context + handle *emongo.Handle + dbName string + orders *mongo.Collection + products *mongo.Collection + ) + + BeforeAll(func() { + ctx = context.Background() + handle, dbName = connectSharedRO(ctx) + orders = handle.Database(dbName).Collection("orders") + products = handle.Database(dbName).Collection("products") + + _, err := products.InsertMany(ctx, []any{ + bson.M{"_id": "p1", "name": "pen", "category": "office"}, + bson.M{"_id": "p2", "name": "book", "category": "office"}, + bson.M{"_id": "p3", "name": "lamp", "category": "home"}, + }) + Expect(err).NotTo(HaveOccurred()) + + _, err = orders.InsertMany(ctx, []any{ + bson.M{"_id": 1, "customer": "alice", "items": bson.A{"p1", "p2"}}, + bson.M{"_id": 2, "customer": "bob", "items": bson.A{"p2", "p3"}}, + bson.M{"_id": 3, "customer": "alice", "items": bson.A{"p3"}}, + }) + Expect(err).NotTo(HaveOccurred()) + }) + AfterAll(func() { + if handle != nil { + _ = handle.Client().Database(dbName).Drop(ctx) + _ = handle.Close(ctx) + } + }) + + It("performs $unwind on the items array", func() { + pipe := mongo.Pipeline{ + {{Key: "$unwind", Value: "$items"}}, + } + cur, err := orders.Aggregate(ctx, pipe) + Expect(err).NotTo(HaveOccurred()) + defer cur.Close(ctx) + var out []bson.M + Expect(cur.All(ctx, &out)).To(Succeed()) + // 2 + 2 + 1 = 5 unwound rows from 3 source orders. + Expect(out).To(HaveLen(5)) + }) + + It("joins orders with products via $lookup + $unwind", func() { + pipe := mongo.Pipeline{ + {{Key: "$unwind", Value: "$items"}}, + {{Key: "$lookup", Value: bson.D{ + {Key: "from", Value: "products"}, + {Key: "localField", Value: "items"}, + {Key: "foreignField", Value: "_id"}, + {Key: "as", Value: "product"}, + }}}, + {{Key: "$unwind", Value: "$product"}}, + {{Key: "$match", Value: bson.D{{Key: "customer", Value: "alice"}}}}, + } + cur, err := orders.Aggregate(ctx, pipe) + Expect(err).NotTo(HaveOccurred()) + defer cur.Close(ctx) + var out []bson.M + Expect(cur.All(ctx, &out)).To(Succeed()) + // alice has orders {1,3} with items {p1,p2,p3} → 3 rows. + Expect(out).To(HaveLen(3)) + for _, doc := range out { + Expect(doc["customer"]).To(Equal("alice")) + product, ok := doc["product"].(bson.M) + Expect(ok).To(BeTrue(), "product should be an embedded doc post-lookup") + Expect(product).To(HaveKey("name")) + } + }) + + It("aggregates per-customer item counts with $group", func() { + pipe := mongo.Pipeline{ + {{Key: "$unwind", Value: "$items"}}, + {{Key: "$group", Value: bson.D{ + {Key: "_id", Value: "$customer"}, + {Key: "n", Value: bson.D{{Key: "$sum", Value: 1}}}, + }}}, + } + cur, err := orders.Aggregate(ctx, pipe) + Expect(err).NotTo(HaveOccurred()) + defer cur.Close(ctx) + var out []bson.M + Expect(cur.All(ctx, &out)).To(Succeed()) + counts := map[string]int{} + for _, r := range out { + counts[r["_id"].(string)] = toInt(r["n"]) + } + Expect(counts).To(HaveKeyWithValue("alice", 3)) + Expect(counts).To(HaveKeyWithValue("bob", 2)) + }) + }, +) diff --git a/test/e2e/tests/data/query_test.go b/test/e2e/tests/data/query_test.go new file mode 100644 index 00000000..d3718270 --- /dev/null +++ b/test/e2e/tests/data/query_test.go @@ -0,0 +1,98 @@ +package data + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" + + e2e "github.com/documentdb/documentdb-operator/test/e2e" + emongo "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/seed" +) + +// seedSmall inserts seed.SmallDataset into coll and returns a convenience +// view over the dataset size. +func seedSmall(ctx context.Context, coll *mongo.Collection) int { + docs := seed.SmallDataset() + any := make([]any, len(docs)) + for i := range docs { + any[i] = docs[i] + } + _, err := coll.InsertMany(ctx, any) + Expect(err).NotTo(HaveOccurred()) + return len(docs) +} + +var _ = Describe("DocumentDB data — query filters", + Ordered, + Label(e2e.DataLabel), + e2e.MediumLevelLabel, + func() { + var ( + ctx context.Context + handle *emongo.Handle + dbName string + coll *mongo.Collection + ) + + BeforeAll(func() { + ctx = context.Background() + handle, dbName = connectSharedRO(ctx) + coll = handle.Database(dbName).Collection("query_filters") + seedSmall(ctx, coll) + }) + AfterAll(func() { + if handle != nil { + _ = handle.Client().Database(dbName).Drop(ctx) + _ = handle.Close(ctx) + } + }) + + It("filters with $eq", func() { + var got bson.M + Expect(coll.FindOne(ctx, bson.M{"score": bson.M{"$eq": 50}}).Decode(&got)).To(Succeed()) + Expect(got["_id"]).To(BeEquivalentTo(5)) + }) + + It("filters with $gt", func() { + n, err := coll.CountDocuments(ctx, bson.M{"score": bson.M{"$gt": 50}}) + Expect(err).NotTo(HaveOccurred()) + // SmallDataset scores are N*10 for N in [1..10] → strictly > 50 means 6..10. + Expect(n).To(Equal(int64(5))) + }) + + It("filters with $in", func() { + n, err := coll.CountDocuments(ctx, bson.M{"_id": bson.M{"$in": []int{1, 3, 5, 99}}}) + Expect(err).NotTo(HaveOccurred()) + Expect(n).To(Equal(int64(3))) + }) + + It("filters with $and", func() { + n, err := coll.CountDocuments(ctx, bson.M{"$and": []bson.M{ + {"score": bson.M{"$gte": 30}}, + {"score": bson.M{"$lte": 70}}, + }}) + Expect(err).NotTo(HaveOccurred()) + Expect(n).To(Equal(int64(5))) + }) + + It("filters with $or", func() { + n, err := coll.CountDocuments(ctx, bson.M{"$or": []bson.M{ + {"_id": 1}, + {"_id": 10}, + }}) + Expect(err).NotTo(HaveOccurred()) + Expect(n).To(Equal(int64(2))) + }) + + It("filters with $regex on name", func() { + // SmallDataset names are "doc-N" so all documents match "^doc-". + n, err := coll.CountDocuments(ctx, bson.M{"name": bson.M{"$regex": "^doc-"}}) + Expect(err).NotTo(HaveOccurred()) + Expect(n).To(Equal(int64(seed.SmallDatasetSize))) + }) + }, +) diff --git a/test/e2e/tests/data/sort_limit_skip_test.go b/test/e2e/tests/data/sort_limit_skip_test.go new file mode 100644 index 00000000..324382af --- /dev/null +++ b/test/e2e/tests/data/sort_limit_skip_test.go @@ -0,0 +1,113 @@ +package data + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" + "go.mongodb.org/mongo-driver/v2/mongo/options" + + e2e "github.com/documentdb/documentdb-operator/test/e2e" + emongo "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/seed" +) + +var _ = Describe("DocumentDB data — sort/limit/skip", + Ordered, + Label(e2e.DataLabel), + e2e.MediumLevelLabel, + func() { + var ( + ctx context.Context + handle *emongo.Handle + dbName string + coll *mongo.Collection + ) + + BeforeAll(func() { + ctx = context.Background() + handle, dbName = connectSharedRO(ctx) + coll = handle.Database(dbName).Collection("sort_cursor") + docs := seed.SortDataset() + docsAny := make([]any, len(docs)) + for i := range docs { + docsAny[i] = docs[i] + } + _, err := coll.InsertMany(ctx, docsAny) + Expect(err).NotTo(HaveOccurred()) + }) + AfterAll(func() { + if handle != nil { + _ = handle.Client().Database(dbName).Drop(ctx) + _ = handle.Close(ctx) + } + }) + + It("sorts ascending by _id", func() { + cur, err := coll.Find(ctx, bson.M{}, + options.Find().SetSort(bson.D{{Key: "_id", Value: 1}}).SetLimit(5)) + Expect(err).NotTo(HaveOccurred()) + defer cur.Close(ctx) + var results []bson.M + Expect(cur.All(ctx, &results)).To(Succeed()) + Expect(results).To(HaveLen(5)) + Expect(results[0]["_id"]).To(BeEquivalentTo(1)) + // Strictly ascending. + for i := 1; i < len(results); i++ { + prev := toInt(results[i-1]["_id"]) + cur := toInt(results[i]["_id"]) + Expect(cur).To(BeNumerically(">", prev)) + } + }) + + It("sorts descending by _id", func() { + cur, err := coll.Find(ctx, bson.M{}, + options.Find().SetSort(bson.D{{Key: "_id", Value: -1}}).SetLimit(3)) + Expect(err).NotTo(HaveOccurred()) + defer cur.Close(ctx) + var results []bson.M + Expect(cur.All(ctx, &results)).To(Succeed()) + Expect(results).To(HaveLen(3)) + Expect(toInt(results[0]["_id"])).To(Equal(seed.SortDatasetSize)) + }) + + It("limits and skips consistently", func() { + // Full page 1 (no skip) of 10 results sorted by _id asc. + page1, err := coll.Find(ctx, bson.M{}, + options.Find().SetSort(bson.D{{Key: "_id", Value: 1}}).SetLimit(10)) + Expect(err).NotTo(HaveOccurred()) + defer page1.Close(ctx) + var page1Docs []bson.M + Expect(page1.All(ctx, &page1Docs)).To(Succeed()) + Expect(page1Docs).To(HaveLen(10)) + + // Page 2 is Skip(5) → first doc of page2 equals 6th of page1. + page2, err := coll.Find(ctx, bson.M{}, + options.Find().SetSort(bson.D{{Key: "_id", Value: 1}}).SetSkip(5).SetLimit(5)) + Expect(err).NotTo(HaveOccurred()) + defer page2.Close(ctx) + var page2Docs []bson.M + Expect(page2.All(ctx, &page2Docs)).To(Succeed()) + Expect(page2Docs).To(HaveLen(5)) + Expect(page2Docs[0]["_id"]).To(Equal(page1Docs[5]["_id"])) + }) + }, +) + +// toInt coerces numeric BSON values (int32/int64/int) to int for test +// comparisons. Panics on unexpected types so failure is obvious. +func toInt(v any) int { + switch n := v.(type) { + case int32: + return int(n) + case int64: + return int(n) + case int: + return n + default: + Fail("unexpected numeric type in _id") + return 0 + } +} diff --git a/test/e2e/tests/data/update_ops_test.go b/test/e2e/tests/data/update_ops_test.go new file mode 100644 index 00000000..42943469 --- /dev/null +++ b/test/e2e/tests/data/update_ops_test.go @@ -0,0 +1,91 @@ +package data + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" + + e2e "github.com/documentdb/documentdb-operator/test/e2e" + emongo "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" +) + +var _ = Describe("DocumentDB data — update operators", + Ordered, + Label(e2e.DataLabel), + e2e.MediumLevelLabel, + func() { + var ( + ctx context.Context + handle *emongo.Handle + dbName string + coll *mongo.Collection + ) + + BeforeAll(func() { + ctx = context.Background() + handle, dbName = connectSharedRO(ctx) + coll = handle.Database(dbName).Collection("update_ops") + }) + AfterAll(func() { + if handle != nil { + _ = handle.Client().Database(dbName).Drop(ctx) + _ = handle.Close(ctx) + } + }) + + It("applies $set to add and mutate a field", func() { + id := "set-1" + _, err := coll.InsertOne(ctx, bson.M{"_id": id, "name": "alpha"}) + Expect(err).NotTo(HaveOccurred()) + _, err = coll.UpdateOne(ctx, bson.M{"_id": id}, + bson.M{"$set": bson.M{"name": "alpha-2", "enabled": true}}) + Expect(err).NotTo(HaveOccurred()) + var got bson.M + Expect(coll.FindOne(ctx, bson.M{"_id": id}).Decode(&got)).To(Succeed()) + Expect(got["name"]).To(Equal("alpha-2")) + Expect(got["enabled"]).To(BeTrue()) + }) + + It("applies $inc to a numeric field", func() { + id := "inc-1" + _, err := coll.InsertOne(ctx, bson.M{"_id": id, "count": 10}) + Expect(err).NotTo(HaveOccurred()) + _, err = coll.UpdateOne(ctx, bson.M{"_id": id}, + bson.M{"$inc": bson.M{"count": 5}}) + Expect(err).NotTo(HaveOccurred()) + var got bson.M + Expect(coll.FindOne(ctx, bson.M{"_id": id}).Decode(&got)).To(Succeed()) + Expect(toInt(got["count"])).To(Equal(15)) + }) + + It("applies $unset to remove a field", func() { + id := "unset-1" + _, err := coll.InsertOne(ctx, bson.M{"_id": id, "tmp": "x", "keep": "y"}) + Expect(err).NotTo(HaveOccurred()) + _, err = coll.UpdateOne(ctx, bson.M{"_id": id}, + bson.M{"$unset": bson.M{"tmp": ""}}) + Expect(err).NotTo(HaveOccurred()) + var got bson.M + Expect(coll.FindOne(ctx, bson.M{"_id": id}).Decode(&got)).To(Succeed()) + Expect(got).NotTo(HaveKey("tmp")) + Expect(got).To(HaveKey("keep")) + }) + + It("applies $push to append to an array", func() { + id := "push-1" + _, err := coll.InsertOne(ctx, bson.M{"_id": id, "tags": bson.A{"a"}}) + Expect(err).NotTo(HaveOccurred()) + _, err = coll.UpdateOne(ctx, bson.M{"_id": id}, + bson.M{"$push": bson.M{"tags": "b"}}) + Expect(err).NotTo(HaveOccurred()) + var got bson.M + Expect(coll.FindOne(ctx, bson.M{"_id": id}).Decode(&got)).To(Succeed()) + tags, ok := got["tags"].(bson.A) + Expect(ok).To(BeTrue(), "tags should decode as bson.A") + Expect(tags).To(ConsistOf("a", "b")) + }) + }, +) diff --git a/test/e2e/tests/exposure/clusterip_test.go b/test/e2e/tests/exposure/clusterip_test.go new file mode 100644 index 00000000..7d614a45 --- /dev/null +++ b/test/e2e/tests/exposure/clusterip_test.go @@ -0,0 +1,98 @@ +package exposure + +import ( + "context" + "fmt" + "net" + "strconv" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/assertions" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/portforward" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +// reserveFreePort opens and immediately closes a TCP listener on :0 so +// the kernel picks an unused ephemeral port. There is an inherent TOCTOU +// window between the close and the subsequent bind inside port-forward, +// but for a single-threaded ginkgo run it is adequate. +func reserveFreePort() (int, error) { + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + return 0, fmt.Errorf("reserve free port: %w", err) + } + port := l.Addr().(*net.TCPAddr).Port + _ = l.Close() + return port, nil +} + +// DocumentDB exposure — ClusterIP. +// +// Verifies: +// 1. spec.exposeViaService.serviceType=ClusterIP round-trips into the +// API server unchanged; +// 2. the gateway Service the operator creates is of type ClusterIP; +// 3. a cluster-internal connection (via port-forward) can ping the +// gateway — i.e. the Service is actually wired to Ready gateway pods. +var _ = Describe("DocumentDB exposure — ClusterIP", + Label(e2e.ExposureLabel), e2e.MediumLevelLabel, + func() { + BeforeEach(func() { e2e.SkipUnlessLevel(e2e.Medium) }) + + It("routes cluster-internal traffic to the gateway", func() { + env := e2e.SuiteEnv() + Expect(env).ToNot(BeNil()) + c := env.Client + + ctx, cancel := context.WithTimeout(context.Background(), 12*time.Minute) + DeferCleanup(cancel) + + dd, cleanup := setupFreshCluster(ctx, c, "expose-clusterip", + []string{"exposure_clusterip"}, nil) + DeferCleanup(cleanup) + + // 1. Spec round-trip. + Expect(dd.Spec.ExposeViaService.ServiceType).To(Equal("ClusterIP")) + + // 2. Service type is ClusterIP. + svcName := portforward.GatewayServiceName(dd) + Eventually(assertions.AssertServiceType(ctx, c, dd.Namespace, svcName, corev1.ServiceTypeClusterIP), + timeouts.For(timeouts.ServiceReady), timeouts.PollInterval(timeouts.ServiceReady)). + Should(Succeed()) + + // 3. Cluster-internal connection works. + localPort, err := reserveFreePort() + Expect(err).ToNot(HaveOccurred()) + stop, err := portforward.Open(ctx, env, dd, localPort) + Expect(err).ToNot(HaveOccurred(), "open port-forward") + DeferCleanup(stop) + + var pingErr error + Eventually(func() error { + pingCtx, pingCancel := context.WithTimeout(ctx, 10*time.Second) + defer pingCancel() + cli, err := mongo.NewClient(pingCtx, mongo.ClientOptions{ + Host: "127.0.0.1", + Port: strconv.Itoa(localPort), + User: credUser, + Password: credPassword, + TLS: false, + }) + if err != nil { + pingErr = err + return err + } + defer func() { _ = cli.Disconnect(context.Background()) }() + pingErr = mongo.Ping(pingCtx, cli) + return pingErr + }, timeouts.For(timeouts.MongoConnect), timeouts.PollInterval(timeouts.MongoConnect)). + Should(Succeed(), "mongo ping through ClusterIP port-forward: %v", pingErr) + }) + }) diff --git a/test/e2e/tests/exposure/exposure_suite_test.go b/test/e2e/tests/exposure/exposure_suite_test.go new file mode 100644 index 00000000..d8d8e45b --- /dev/null +++ b/test/e2e/tests/exposure/exposure_suite_test.go @@ -0,0 +1,56 @@ +// Package exposure hosts the DocumentDB E2E exposure area. See +// docs/designs/e2e-test-suite.md for the spec catalog. This file is +// the Ginkgo root for the area binary and shares bootstrap with the +// other area binaries via the exported helpers in package e2e. +package exposure + +import ( + "context" + "fmt" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/e2e" +) + +const operatorReadyTimeout = 2 * time.Minute + +func TestExposure(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "DocumentDB E2E - Exposure", Label(e2e.ExposureLabel)) +} + +var _ = SynchronizedBeforeSuite( + func(ctx SpecContext) []byte { + if err := e2e.SetupSuite(ctx, operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("exposure bootstrap: %v", err)) + } + return []byte{} + }, + func(_ SpecContext, _ []byte) { + if err := e2e.SetupSuite(context.Background(), operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("exposure worker bootstrap: %v", err)) + } + }, +) + +var _ = SynchronizedAfterSuite( + func(ctx SpecContext) { + if err := e2e.TeardownSuite(ctx); err != nil { + fmt.Fprintf(GinkgoWriter, "exposure teardown: %v\n", err) + } + }, + func(_ SpecContext) {}, +) + +// BeforeEach in this area aborts the spec if the operator pod has +// drifted since SetupSuite (UID/name/restart-count change). Area +// tests/upgrade/ intentionally omits this hook because operator +// restarts are part of its scenario. +var _ = BeforeEach(func() { +Expect(e2e.CheckOperatorUnchanged()).To(Succeed(), +"operator health check failed — a previous spec or reconciler likely restarted the operator") +}) diff --git a/test/e2e/tests/exposure/helpers_test.go b/test/e2e/tests/exposure/helpers_test.go new file mode 100644 index 00000000..66301f09 --- /dev/null +++ b/test/e2e/tests/exposure/helpers_test.go @@ -0,0 +1,115 @@ +package exposure + +import ( + "context" + "os" + "time" + + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" + "github.com/documentdb/documentdb-operator/test/e2e" + documentdbutil "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/fixtures" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/namespaces" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +// Credential constants now alias the fixtures exports so every area +// lands on the same values. credUser / credPassword are retained as +// package-level constants because clusterip_test.go (a spec under this +// pass's do-not-touch list) references them directly. +const ( + credSecretName = fixtures.DefaultCredentialSecretName + credUser = fixtures.DefaultCredentialUsername + credPassword = fixtures.DefaultCredentialPassword //nolint:gosec // fixture-only + + // DOCUMENTDB_IMAGE / GATEWAY_IMAGE default to empty strings so the + // operator selects the correct components itself: CNPG pg18 base + + // DocumentDB extension via image-library + gateway as a separate + // sidecar. A pinned env-var override is still honoured for CI. + defaultDocDBImage = "" + defaultGatewayImage = "" +) + +func baseVars(ns, name string) map[string]string { + docdbImg := defaultDocDBImage + if v := os.Getenv("DOCUMENTDB_IMAGE"); v != "" { + docdbImg = v + } + gwImg := defaultGatewayImage + if v := os.Getenv("GATEWAY_IMAGE"); v != "" { + gwImg = v + } + sSize := "1Gi" + if v := os.Getenv("E2E_STORAGE_SIZE"); v != "" { + sSize = v + } + sClass := "standard" + if v := os.Getenv("E2E_STORAGE_CLASS"); v != "" { + sClass = v + } + return map[string]string{ + "NAMESPACE": ns, + "NAME": name, + "INSTANCES": "1", + "STORAGE_SIZE": sSize, + "STORAGE_CLASS": sClass, + "DOCUMENTDB_IMAGE": docdbImg, + "GATEWAY_IMAGE": gwImg, + "CREDENTIAL_SECRET": credSecretName, + "EXPOSURE_TYPE": "ClusterIP", + "LOG_LEVEL": "info", + } +} + +// tests/exposure/ → ../../manifests +func manifestsRoot() string { return "../../manifests" } + +// setupFreshCluster is the exposure-area analogue of the feature_gates +// helper: namespace + secret + DocumentDB, waits for healthy. Returns +// the live CR plus a namespace-deleting cleanup. +func setupFreshCluster( + ctx context.Context, + c client.Client, + name string, + mixins []string, + extraVars map[string]string, +) (*previewv1.DocumentDB, func()) { + ns := namespaces.NamespaceForSpec(e2e.ExposureLabel) + Expect(fixtures.CreateLabeledNamespace(ctx, c, ns, e2e.ExposureLabel)).To(Succeed()) + Expect(fixtures.CreateLabeledCredentialSecret(ctx, c, ns)).To(Succeed()) + vars := baseVars(ns, name) + for k, v := range extraVars { + vars[k] = v + } + _, err := documentdbutil.Create(ctx, c, ns, name, documentdbutil.CreateOptions{ + Base: "documentdb", + Mixins: mixins, + Vars: vars, + ManifestsRoot: manifestsRoot(), + }) + Expect(err).ToNot(HaveOccurred(), "create DocumentDB") + + Eventually(func() error { + return documentdbutil.WaitHealthy(ctx, c, + types.NamespacedName{Namespace: ns, Name: name}, + timeouts.For(timeouts.DocumentDBReady)) + }, timeouts.For(timeouts.DocumentDBReady)+30*time.Second, 10*time.Second). + Should(Succeed(), "DocumentDB %s/%s did not become healthy", ns, name) + + live, err := documentdbutil.Get(ctx, c, client.ObjectKey{Namespace: ns, Name: name}) + Expect(err).ToNot(HaveOccurred()) + + cleanup := func() { + delCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + _ = c.Delete(delCtx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: ns}}) + } + return live, cleanup +} diff --git a/test/e2e/tests/exposure/loadbalancer_test.go b/test/e2e/tests/exposure/loadbalancer_test.go new file mode 100644 index 00000000..3008d41e --- /dev/null +++ b/test/e2e/tests/exposure/loadbalancer_test.go @@ -0,0 +1,140 @@ +package exposure + +import ( + "context" + "fmt" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + intstr "k8s.io/apimachinery/pkg/util/intstr" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/assertions" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/portforward" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +// hasLoadBalancerController probes the target cluster by creating a +// throwaway LoadBalancer Service and polling briefly for an external +// address. The probe uses a short timeout so environments without a +// working LB controller skip fast rather than failing the spec. The +// probe namespace is the default namespace; the Service is deleted +// before the function returns regardless of the outcome. +func hasLoadBalancerController(ctx context.Context, c client.Client, timeout time.Duration) (bool, error) { + probeName := fmt.Sprintf("e2e-lb-probe-%d", time.Now().UnixNano()) + probeNS := "default" + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: probeName, + Namespace: probeNS, + Labels: map[string]string{ + "e2e.documentdb.io/probe": "loadbalancer", + }, + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeLoadBalancer, + Selector: map[string]string{"app.kubernetes.io/name": "nonexistent-e2e-probe"}, + Ports: []corev1.ServicePort{{ + Name: "probe", + Port: 80, + TargetPort: intstr.FromInt(80), + Protocol: corev1.ProtocolTCP, + }}, + }, + } + if err := c.Create(ctx, svc); err != nil && !apierrors.IsAlreadyExists(err) { + return false, fmt.Errorf("create LB probe: %w", err) + } + defer func() { + delCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + _ = c.Delete(delCtx, &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: probeName, Namespace: probeNS}, + }) + }() + + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + got := &corev1.Service{} + if err := c.Get(ctx, client.ObjectKey{Namespace: probeNS, Name: probeName}, got); err == nil { + for _, ing := range got.Status.LoadBalancer.Ingress { + if ing.IP != "" || ing.Hostname != "" { + return true, nil + } + } + } + select { + case <-ctx.Done(): + return false, ctx.Err() + case <-time.After(2 * time.Second): + } + } + return false, nil +} + +// DocumentDB exposure — LoadBalancer. +// +// Requires a working LoadBalancer controller in the target cluster +// (kind + MetalLB, a cloud-provider LB, etc.). When no external address +// is assigned to a probe Service within ~30s, the spec skips rather than +// fails so unconfigured environments do not poison the run. +var _ = Describe("DocumentDB exposure — LoadBalancer", + Label(e2e.ExposureLabel, e2e.NeedsMetalLBLabel), e2e.MediumLevelLabel, + func() { + BeforeEach(func() { + e2e.SkipUnlessLevel(e2e.Medium) + env := e2e.SuiteEnv() + Expect(env).ToNot(BeNil()) + probeCtx, cancel := context.WithTimeout(context.Background(), 45*time.Second) + defer cancel() + ok, err := hasLoadBalancerController(probeCtx, env.Client, 30*time.Second) + Expect(err).ToNot(HaveOccurred()) + if !ok { + Skip("no LoadBalancer controller in cluster — probe service acquired no external address within 30s") + } + }) + + It("provisions a LoadBalancer Service with an external address", func() { + env := e2e.SuiteEnv() + c := env.Client + + ctx, cancel := context.WithTimeout(context.Background(), 12*time.Minute) + DeferCleanup(cancel) + + dd, cleanup := setupFreshCluster(ctx, c, "expose-lb", + []string{"exposure_loadbalancer"}, + map[string]string{"EXPOSURE_TYPE": "LoadBalancer"}, + ) + DeferCleanup(cleanup) + + // 1. Spec round-trip. + Expect(dd.Spec.ExposeViaService.ServiceType).To(Equal("LoadBalancer")) + + // 2. Service type is LoadBalancer. + svcName := portforward.GatewayServiceName(dd) + Eventually(assertions.AssertServiceType(ctx, c, dd.Namespace, svcName, corev1.ServiceTypeLoadBalancer), + timeouts.For(timeouts.ServiceReady), timeouts.PollInterval(timeouts.ServiceReady)). + Should(Succeed()) + + // 3. External address is eventually assigned. + Eventually(func() error { + svc := &corev1.Service{} + if err := c.Get(ctx, client.ObjectKey{Namespace: dd.Namespace, Name: svcName}, svc); err != nil { + return err + } + for _, ing := range svc.Status.LoadBalancer.Ingress { + if ing.IP != "" || ing.Hostname != "" { + return nil + } + } + return fmt.Errorf("Service %s/%s has no external address yet", dd.Namespace, svcName) + }, timeouts.For(timeouts.ServiceReady), timeouts.PollInterval(timeouts.ServiceReady)). + Should(Succeed()) + }) + }) diff --git a/test/e2e/tests/feature_gates/feature_gates_suite_test.go b/test/e2e/tests/feature_gates/feature_gates_suite_test.go new file mode 100644 index 00000000..eed835a6 --- /dev/null +++ b/test/e2e/tests/feature_gates/feature_gates_suite_test.go @@ -0,0 +1,83 @@ +// Package feature_gates hosts the DocumentDB E2E featuregates area. See +// docs/designs/e2e-test-suite.md for the spec catalog. This file is +// the Ginkgo root for the area binary and shares bootstrap with the +// other area binaries via the exported helpers in package e2e. +package feature_gates + +import ( + "context" + "fmt" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/e2e" +) + +const operatorReadyTimeout = 2 * time.Minute + +func TestFeatureGates(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "DocumentDB E2E - FeatureGates", Label(e2e.FeatureLabel)) +} + +var _ = SynchronizedBeforeSuite( + func(ctx SpecContext) []byte { + if err := e2e.SetupSuite(ctx, operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("featuregates bootstrap: %v", err)) + } + return []byte{} + }, + func(_ SpecContext, _ []byte) { + if err := e2e.SetupSuite(context.Background(), operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("featuregates worker bootstrap: %v", err)) + } + }, +) + +var _ = SynchronizedAfterSuite( + func(ctx SpecContext) { + if err := e2e.TeardownSuite(ctx); err != nil { + fmt.Fprintf(GinkgoWriter, "featuregates teardown: %v\n", err) + } + }, + func(_ SpecContext) {}, +) + +// BeforeEach in this area aborts the spec if the operator pod has +// drifted since SetupSuite (UID/name/restart-count change). Area +// tests/upgrade/ intentionally omits this hook because operator +// restarts are part of its scenario. +var _ = BeforeEach(func() { +Expect(e2e.CheckOperatorUnchanged()).To(Succeed(), +"operator health check failed — a previous spec or reconciler likely restarted the operator") +}) + +// TODO(e2e/feature-gates): add a ChangeStreams spec here once the +// suite standardises on a change-stream-capable DocumentDB image. +// +// Status: experimental feature. The operator already translates +// `spec.featureGates.ChangeStreams=true` into `wal_level=logical` on +// the underlying CNPG Cluster (see operator/src/internal/cnpg/ +// cnpg_cluster.go), but end-to-end validation of the Mongo-wire +// `watch()` call requires the `-changestream` DocumentDB image +// variant, which is not part of the default e2e image set. +// +// Previously this area carried a tests/feature_gates/changestreams_ +// test.go that asserted the wal_level translation via the CNPG spec. +// It was removed together with manifests/mixins/feature_changestreams. +// yaml.template and the fixtures_test render check so the default +// pipeline does not imply the feature is supported in the shipped +// image. +// +// When re-enabling: +// 1. Restore manifests/mixins/feature_changestreams.yaml.template +// (single key: spec.featureGates.ChangeStreams: true). +// 2. Gate the spec behind a `needs-changestream-image` capability +// label (mirrors `needs-cert-manager`) and a preflight check that +// skips when the current documentDBImage cannot handle it. +// 3. Layer a best-effort mongo-driver `Watch` smoke on top of the +// existing wal_level assertion so both the operator and extension +// contracts are covered. diff --git a/test/e2e/tests/feature_gates/helpers_test.go b/test/e2e/tests/feature_gates/helpers_test.go new file mode 100644 index 00000000..0e5995b8 --- /dev/null +++ b/test/e2e/tests/feature_gates/helpers_test.go @@ -0,0 +1,123 @@ +package feature_gates + +import ( + "context" + "os" + "time" + + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" + "github.com/documentdb/documentdb-operator/test/e2e" + documentdbutil "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/fixtures" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/namespaces" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +// Shared credential name for fresh per-spec clusters. Username and +// password values are now sourced from fixtures.DefaultCredentialUsername +// and fixtures.DefaultCredentialPassword so mongo helpers that already +// know those values work against both shared fixtures and per-spec CRs. +const credSecretName = fixtures.DefaultCredentialSecretName + +// defaultDocDBImage / defaultGatewayImage are empty by default so the +// operator picks the correct layered defaults (CNPG pg18 base + +// DocumentDB extension image + gateway sidecar). Env vars still +// override for CI pinning. +const ( + defaultDocDBImage = "" + defaultGatewayImage = "" +) + +// baseVars builds the envsubst map the base/documentdb.yaml.template +// expects. Callers override individual entries for per-spec tweaks. +func baseVars(ns, name string) map[string]string { + docdbImg := defaultDocDBImage + if v := os.Getenv("DOCUMENTDB_IMAGE"); v != "" { + docdbImg = v + } + gwImg := defaultGatewayImage + if v := os.Getenv("GATEWAY_IMAGE"); v != "" { + gwImg = v + } + sSize := "1Gi" + if v := os.Getenv("E2E_STORAGE_SIZE"); v != "" { + sSize = v + } + sClass := "standard" + if v := os.Getenv("E2E_STORAGE_CLASS"); v != "" { + sClass = v + } + return map[string]string{ + "NAMESPACE": ns, + "NAME": name, + "INSTANCES": "1", + "STORAGE_SIZE": sSize, + "STORAGE_CLASS": sClass, + "DOCUMENTDB_IMAGE": docdbImg, + "GATEWAY_IMAGE": gwImg, + "CREDENTIAL_SECRET": credSecretName, + "EXPOSURE_TYPE": "ClusterIP", + "LOG_LEVEL": "info", + } +} + +// manifestsRoot returns the absolute path to test/e2e/manifests so the +// per-spec clusters can read the mixin templates without depending on +// the caller's working directory. +func manifestsRoot() string { + // tests/feature_gates/ → ../../manifests + return "../../manifests" +} + +// setupFreshCluster creates a namespace, credential secret, and a +// DocumentDB CR composed of the base template plus mixins, then waits +// for it to become healthy. It returns the live CR plus a cleanup func +// that deletes the namespace. Namespace + secret creation delegate to +// the fixtures helpers so ownership labels match the rest of the suite. +func setupFreshCluster( + ctx context.Context, + c client.Client, + name string, + mixins []string, + extraVars map[string]string, +) (*previewv1.DocumentDB, func()) { + ns := namespaces.NamespaceForSpec(e2e.FeatureLabel) + Expect(fixtures.CreateLabeledNamespace(ctx, c, ns, e2e.FeatureLabel)).To(Succeed()) + Expect(fixtures.CreateLabeledCredentialSecret(ctx, c, ns)).To(Succeed()) + vars := baseVars(ns, name) + for k, v := range extraVars { + vars[k] = v + } + _, err := documentdbutil.Create(ctx, c, ns, name, documentdbutil.CreateOptions{ + Base: "documentdb", + Mixins: mixins, + Vars: vars, + ManifestsRoot: manifestsRoot(), + }) + Expect(err).ToNot(HaveOccurred(), "create DocumentDB") + + Eventually(func() error { + return documentdbutil.WaitHealthy(ctx, c, + types.NamespacedName{Namespace: ns, Name: name}, + timeouts.For(timeouts.DocumentDBReady)) + }, timeouts.For(timeouts.DocumentDBReady)+30*time.Second, 10*time.Second). + Should(Succeed(), "DocumentDB %s/%s did not become healthy", ns, name) + + // Re-fetch to return the populated object. + live, err := documentdbutil.Get(ctx, c, client.ObjectKey{Namespace: ns, Name: name}) + Expect(err).ToNot(HaveOccurred()) + + cleanup := func() { + delCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + _ = c.Delete(delCtx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: ns}}) + } + return live, cleanup +} \ No newline at end of file diff --git a/test/e2e/tests/lifecycle/delete_reclaim_test.go b/test/e2e/tests/lifecycle/delete_reclaim_test.go new file mode 100644 index 00000000..9a1c26fc --- /dev/null +++ b/test/e2e/tests/lifecycle/delete_reclaim_test.go @@ -0,0 +1,98 @@ +package lifecycle + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" //nolint:revive + . "github.com/onsi/gomega" //nolint:revive + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/assertions" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/namespaces" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +var _ = Describe("DocumentDB lifecycle — delete with Retain reclaim", + Label(e2e.LifecycleLabel, e2e.DestructiveLabel), e2e.MediumLevelLabel, + func() { + const name = "lifecycle-delete-retain" + var ( + ctx context.Context + ns string + c client.Client + ) + + BeforeEach(func() { + e2e.SkipUnlessLevel(e2e.Medium) + ctx = context.Background() + c = e2e.SuiteEnv().Client + ns = namespaces.NamespaceForSpec(e2e.LifecycleLabel) + createNamespace(ctx, c, ns) + createCredentialSecret(ctx, c, ns, "documentdb-credentials") + }) + + It("preserves the underlying PersistentVolume after the CR is deleted", func() { + vars := baseVars("1Gi") + dd, err := documentdb.Create(ctx, c, ns, name, documentdb.CreateOptions{ + Base: "documentdb", + Mixins: []string{"reclaim_retain"}, + Vars: vars, + }) + Expect(err).ToNot(HaveOccurred()) + + key := types.NamespacedName{Namespace: ns, Name: name} + Eventually(assertions.AssertDocumentDBReady(ctx, c, key), + timeouts.For(timeouts.DocumentDBReady), + timeouts.PollInterval(timeouts.DocumentDBReady), + ).Should(Succeed()) + + // Capture the PV names currently bound to this + // namespace's PVCs so we can verify they survive + // DocumentDB deletion. + var pvcs corev1.PersistentVolumeClaimList + Expect(c.List(ctx, &pvcs, client.InNamespace(ns))).To(Succeed()) + Expect(pvcs.Items).ToNot(BeEmpty(), "expected at least one PVC after Ready") + var pvNames []string + for i := range pvcs.Items { + if v := pvcs.Items[i].Spec.VolumeName; v != "" { + pvNames = append(pvNames, v) + } + } + Expect(pvNames).ToNot(BeEmpty(), "expected bound PVs; got only pending PVCs") + + // Delete the DocumentDB and wait for it to disappear. + Expect(documentdb.Delete(ctx, c, dd, 3*time.Minute)).To(Succeed()) + + // Retained PVs should remain in the API server; their + // phase transitions to Released (or stays Bound briefly) + // but the object itself must not be collected. + for _, pvName := range pvNames { + var pv corev1.PersistentVolume + Eventually(func() error { + return c.Get(ctx, types.NamespacedName{Name: pvName}, &pv) + }, 2*time.Minute, 5*time.Second).Should(Succeed(), + "PV %s should still exist under Retain policy", pvName) + Expect(pv.Spec.PersistentVolumeReclaimPolicy).To( + Equal(corev1.PersistentVolumeReclaimRetain), + "PV %s must have reclaimPolicy=Retain", pvName) + } + + // Manual cleanup: retained PVs will otherwise leak across + // test runs. Deleting them releases the underlying + // provisioner storage in kind's local-path driver. + DeferCleanup(func(ctx SpecContext) { + for _, pvName := range pvNames { + _ = c.Delete(ctx, &corev1.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{Name: pvName}, + }) + } + }) + }) + }) diff --git a/test/e2e/tests/lifecycle/deploy_test.go b/test/e2e/tests/lifecycle/deploy_test.go new file mode 100644 index 00000000..bb0df506 --- /dev/null +++ b/test/e2e/tests/lifecycle/deploy_test.go @@ -0,0 +1,95 @@ +package lifecycle + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" //nolint:revive + . "github.com/onsi/gomega" //nolint:revive + + cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/assertions" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" + mongohelper "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/namespaces" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +var _ = Describe("DocumentDB lifecycle — deploy", + Label(e2e.LifecycleLabel, e2e.BasicLabel, e2e.SmokeLabel), e2e.MediumLevelLabel, + func() { + const name = "lifecycle-deploy" + var ( + ctx context.Context + ns string + c client.Client + ) + + BeforeEach(func() { + e2e.SkipUnlessLevel(e2e.Medium) + ctx = context.Background() + c = e2e.SuiteEnv().Client + ns = namespaces.NamespaceForSpec(e2e.LifecycleLabel) + createNamespace(ctx, c, ns) + createCredentialSecret(ctx, c, ns, "documentdb-credentials") + }) + + It("brings a 1-instance cluster to Ready and wires owner refs on the backing CNPG Cluster", func() { + dd, err := documentdb.Create(ctx, c, ns, name, documentdb.CreateOptions{ + Base: "documentdb", + Vars: baseVars("1Gi"), + }) + Expect(err).ToNot(HaveOccurred()) + DeferCleanup(func(ctx SpecContext) { + _ = documentdb.Delete(ctx, c, dd, 3*time.Minute) + }) + + key := types.NamespacedName{Namespace: ns, Name: name} + Eventually(assertions.AssertDocumentDBReady(ctx, c, key), + timeouts.For(timeouts.DocumentDBReady), + timeouts.PollInterval(timeouts.DocumentDBReady), + ).Should(Succeed()) + + // CNPG Cluster backing this DocumentDB exists and has an + // owner reference back to the DocumentDB CR — mirrors + // what docs/designs/e2e-test-suite.md calls for. The + // Cluster name equals the DocumentDB name for single- + // cluster deployments (see assertions.clusterNameFor). + var cluster cnpgv1.Cluster + Eventually(func() error { + return c.Get(ctx, key, &cluster) + }, 2*time.Minute, 5*time.Second).Should(Succeed()) + + current := getDD(ctx, ns, name) + Expect(cluster.OwnerReferences).ToNot(BeEmpty(), + "CNPG Cluster should be owned by the DocumentDB CR") + var found bool + for _, o := range cluster.OwnerReferences { + if o.UID == current.UID && o.Kind == "DocumentDB" { + found = true + break + } + } + Expect(found).To(BeTrue(), + "expected owner reference with UID=%s on CNPG Cluster %s", current.UID, key) + + // Data-plane smoke: opening a mongo-driver connection + // against the freshly-deployed CR proves the gateway + // actually answers on the wire. Without this step, + // "Ready=true" alone can mask a broken gateway sidecar + // (e.g. wrong image, misconfigured credentials secret). + // NewFromDocumentDB pings internally before returning, + // so the explicit Ping below is belt-and-braces at the + // test boundary — keeping it here makes the failure + // narrative clear without readers chasing helper code. + h, err := mongohelper.NewFromDocumentDB(ctx, e2e.SuiteEnv(), ns, name) + Expect(err).ToNot(HaveOccurred(), "connect mongo to freshly-deployed DocumentDB") + DeferCleanup(func(ctx SpecContext) { _ = h.Close(ctx) }) + Expect(mongohelper.Ping(ctx, h.Client())).To(Succeed(), + "ping freshly-deployed DocumentDB gateway") + }) + }) diff --git a/test/e2e/tests/lifecycle/helpers_test.go b/test/e2e/tests/lifecycle/helpers_test.go new file mode 100644 index 00000000..d377c018 --- /dev/null +++ b/test/e2e/tests/lifecycle/helpers_test.go @@ -0,0 +1,109 @@ +package lifecycle + +import ( + "context" + "errors" + "os" + + . "github.com/onsi/ginkgo/v2" //nolint:revive + . "github.com/onsi/gomega" //nolint:revive + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/fixtures" +) + +var ( + // errPendingPVCs signals that no PVCs have been created yet. + errPendingPVCs = errors.New("waiting for PVCs to appear") + // errNotExpanded signals that at least one PVC has not reached + // the requested capacity yet. + errNotExpanded = errors.New("waiting for PVC expansion") +) + +// baseVars returns the envsubst variables used by the lifecycle base +// template. Image overrides honour the same E2E-wide env vars the +// shared fixtures do; tests that need to mutate specific fields +// override the returned map before calling Create. +func baseVars(size string) map[string]string { + // Leave DOCUMENTDB_IMAGE / GATEWAY_IMAGE empty by default so the + // operator picks its own defaults — the DocumentDB extension is + // mounted onto the CNPG pg18 base via the image-library mechanism + // and the gateway is a separate sidecar image. Setting a monolithic + // override here (e.g. documentdb-local:16) would point the CNPG + // cluster at a non-postgres image and break initdb. + ddImage := os.Getenv("DOCUMENTDB_IMAGE") + gwImage := os.Getenv("GATEWAY_IMAGE") + storageClass := "standard" + if v := os.Getenv("E2E_STORAGE_CLASS"); v != "" { + storageClass = v + } + if size == "" { + size = "1Gi" + } + return map[string]string{ + "INSTANCES": "1", + "STORAGE_SIZE": size, + "STORAGE_CLASS": storageClass, + "DOCUMENTDB_IMAGE": ddImage, + "GATEWAY_IMAGE": gwImage, + "CREDENTIAL_SECRET": fixtures.DefaultCredentialSecretName, + "EXPOSURE_TYPE": "ClusterIP", + "LOG_LEVEL": "info", + } +} + +// createNamespace creates ns (via fixtures.CreateLabeledNamespace so the +// ownership labels are stamped) and registers a DeferCleanup to remove +// it. The signature is preserved so update_storage_test.go — which is +// out of scope for this pass — continues to compile. +func createNamespace(ctx context.Context, c client.Client, ns string) { + if err := fixtures.CreateLabeledNamespace(ctx, c, ns, "lifecycle"); err != nil { + Fail("create namespace " + ns + ": " + err.Error()) + } + DeferCleanup(func(ctx SpecContext) { + _ = c.Delete(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: ns}}) + }) +} + +// createCredentialSecret seeds the default DocumentDB credential secret +// so the operator can finish the bootstrap bring-up. Name is accepted +// for signature compatibility with update_storage_test.go; when it +// matches DefaultCredentialSecretName the fixtures helper is used so +// ownership labels are stamped. +func createCredentialSecret(ctx context.Context, c client.Client, ns, name string) { + if name == fixtures.DefaultCredentialSecretName || name == "" { + if err := fixtures.CreateLabeledCredentialSecret(ctx, c, ns); err != nil { + Fail("create credential secret " + ns + "/" + name + ": " + err.Error()) + } + return + } + // Non-default secret name — fall back to an inline Create so callers + // can seed multiple named secrets in the same namespace. + sec := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns}, + Type: corev1.SecretTypeOpaque, + StringData: map[string]string{ + "username": fixtures.DefaultCredentialUsername, + "password": fixtures.DefaultCredentialPassword, + }, + } + if err := c.Create(ctx, sec); err != nil { + Fail("create credential secret " + ns + "/" + name + ": " + err.Error()) + } +} + +// getDD is a convenience shortcut around documentdb.Get used by specs +// that need to refetch the CR after a patch. +func getDD(ctx context.Context, ns, name string) *previewv1.DocumentDB { + c := e2e.SuiteEnv().Client + dd, err := documentdb.Get(ctx, c, types.NamespacedName{Namespace: ns, Name: name}) + Expect(err).ToNot(HaveOccurred()) + return dd +} diff --git a/test/e2e/tests/lifecycle/lifecycle_suite_test.go b/test/e2e/tests/lifecycle/lifecycle_suite_test.go new file mode 100644 index 00000000..613023a3 --- /dev/null +++ b/test/e2e/tests/lifecycle/lifecycle_suite_test.go @@ -0,0 +1,57 @@ +// Package lifecycle hosts the DocumentDB E2E lifecycle area. See +// docs/designs/e2e-test-suite.md for the spec catalog. This file is +// the Ginkgo root for the area binary and shares bootstrap with the +// other area binaries via the exported helpers in package e2e. +package lifecycle + +import ( + "context" + "fmt" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/e2e" +) + +const operatorReadyTimeout = 2 * time.Minute + +func TestLifecycle(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "DocumentDB E2E - Lifecycle", Label(e2e.LifecycleLabel)) +} + +var _ = SynchronizedBeforeSuite( + func(ctx SpecContext) []byte { + if err := e2e.SetupSuite(ctx, operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("lifecycle bootstrap: %v", err)) + } + return []byte{} + }, + func(_ SpecContext, _ []byte) { + if err := e2e.SetupSuite(context.Background(), operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("lifecycle worker bootstrap: %v", err)) + } + }, +) + +var _ = SynchronizedAfterSuite( + func(ctx SpecContext) { + if err := e2e.TeardownSuite(ctx); err != nil { + fmt.Fprintf(GinkgoWriter, "lifecycle teardown: %v\n", err) + } + }, + func(_ SpecContext) {}, +) + + +// BeforeEach in this area aborts the spec if the operator pod has +// drifted since SetupSuite (UID/name/restart-count change). Area +// tests/upgrade/ intentionally omits this hook because operator +// restarts are part of its scenario. +var _ = BeforeEach(func() { +Expect(e2e.CheckOperatorUnchanged()).To(Succeed(), +"operator health check failed — a previous spec or reconciler likely restarted the operator") +}) diff --git a/test/e2e/tests/lifecycle/update_image_test.go b/test/e2e/tests/lifecycle/update_image_test.go new file mode 100644 index 00000000..9a1f9fb6 --- /dev/null +++ b/test/e2e/tests/lifecycle/update_image_test.go @@ -0,0 +1,99 @@ +package lifecycle + +import ( + "context" + "os" + "time" + + . "github.com/onsi/ginkgo/v2" //nolint:revive + . "github.com/onsi/gomega" //nolint:revive + + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/assertions" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/namespaces" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +// The design doc calls the field `spec.documentDbVersion`; the CRD at +// operator/src/api/preview/documentdb_types.go names it DocumentDBVersion +// (JSON `documentDBVersion`) and also exposes DocumentDBImage / GatewayImage +// which take precedence when set. Because the base template provides +// DocumentDBImage (not Version), we exercise the rollout via the image +// field and assert against Status.DocumentDBImage — Phase 3 follow-up to +// parameterise this once the Version-only path is wired into manifests. +var _ = Describe("DocumentDB lifecycle — update documentDBImage", + Label(e2e.LifecycleLabel, e2e.DisruptiveLabel), e2e.MediumLevelLabel, + func() { + const name = "lifecycle-update-image" + var ( + ctx context.Context + ns string + c client.Client + ) + + BeforeEach(func() { + e2e.SkipUnlessLevel(e2e.Medium) + ctx = context.Background() + c = e2e.SuiteEnv().Client + ns = namespaces.NamespaceForSpec(e2e.LifecycleLabel) + createNamespace(ctx, c, ns) + createCredentialSecret(ctx, c, ns, "documentdb-credentials") + }) + + It("rolls out a new image tag and reflects it in Status.DocumentDBImage", func() { + vars := baseVars("1Gi") + startImage := vars["DOCUMENTDB_IMAGE"] + if startImage == "" { + Skip("DOCUMENTDB_IMAGE env var must be set for the image-update spec — " + + "it needs an explicit starting tag to roll off of. Set DOCUMENTDB_IMAGE " + + "and optionally E2E_DOCUMENTDB_IMAGE_NEXT to exercise this path.") + } + // The target image override must be an explicit + // different tag; without it the patch would be a no-op + // (same image as startImage) and the Eventually below + // would trivially pass, producing a false positive. + targetImage := os.Getenv("E2E_DOCUMENTDB_IMAGE_NEXT") + if targetImage == "" || targetImage == startImage { + Skip("E2E_DOCUMENTDB_IMAGE_NEXT must be set to a different image than " + + "DOCUMENTDB_IMAGE to exercise a real rollout — skipping to avoid a no-op.") + } + + dd, err := documentdb.Create(ctx, c, ns, name, documentdb.CreateOptions{ + Base: "documentdb", + Vars: vars, + }) + Expect(err).ToNot(HaveOccurred()) + DeferCleanup(func(ctx SpecContext) { + _ = documentdb.Delete(ctx, c, dd, 3*time.Minute) + }) + + key := types.NamespacedName{Namespace: ns, Name: name} + Eventually(assertions.AssertDocumentDBReady(ctx, c, key), + timeouts.For(timeouts.DocumentDBReady), + timeouts.PollInterval(timeouts.DocumentDBReady), + ).Should(Succeed()) + + // Refetch for a fresh resourceVersion before patching. + fresh := getDD(ctx, ns, name) + Expect(documentdb.PatchSpec(ctx, c, fresh, func(s *previewv1.DocumentDBSpec) { + s.DocumentDBImage = targetImage + })).To(Succeed()) + + Eventually(func() string { + current := getDD(ctx, ns, name) + return current.Status.DocumentDBImage + }, timeouts.For(timeouts.DocumentDBUpgrade), + timeouts.PollInterval(timeouts.DocumentDBUpgrade), + ).Should(Equal(targetImage)) + + Eventually(assertions.AssertDocumentDBReady(ctx, c, key), + timeouts.For(timeouts.DocumentDBUpgrade), + timeouts.PollInterval(timeouts.DocumentDBUpgrade), + ).Should(Succeed()) + }) + }) diff --git a/test/e2e/tests/lifecycle/update_loglevel_test.go b/test/e2e/tests/lifecycle/update_loglevel_test.go new file mode 100644 index 00000000..ddb2c2d3 --- /dev/null +++ b/test/e2e/tests/lifecycle/update_loglevel_test.go @@ -0,0 +1,79 @@ +package lifecycle + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" //nolint:revive + . "github.com/onsi/gomega" //nolint:revive + + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/assertions" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/namespaces" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +var _ = Describe("DocumentDB lifecycle — update logLevel", + Label(e2e.LifecycleLabel, e2e.BasicLabel), e2e.MediumLevelLabel, + func() { + const name = "lifecycle-update-loglevel" + var ( + ctx context.Context + ns string + c client.Client + ) + + BeforeEach(func() { + e2e.SkipUnlessLevel(e2e.Medium) + ctx = context.Background() + c = e2e.SuiteEnv().Client + ns = namespaces.NamespaceForSpec(e2e.LifecycleLabel) + createNamespace(ctx, c, ns) + createCredentialSecret(ctx, c, ns, "documentdb-credentials") + }) + + It("propagates a spec.logLevel patch to the live CR", func() { + vars := baseVars("1Gi") + vars["LOG_LEVEL"] = "info" + + dd, err := documentdb.Create(ctx, c, ns, name, documentdb.CreateOptions{ + Base: "documentdb", + Vars: vars, + }) + Expect(err).ToNot(HaveOccurred()) + DeferCleanup(func(ctx SpecContext) { + _ = documentdb.Delete(ctx, c, dd, 3*time.Minute) + }) + + key := types.NamespacedName{Namespace: ns, Name: name} + Eventually(assertions.AssertDocumentDBReady(ctx, c, key), + timeouts.For(timeouts.DocumentDBReady), + timeouts.PollInterval(timeouts.DocumentDBReady), + ).Should(Succeed()) + + // Patch spec.logLevel; field is exported verbatim as + // LogLevel in api/preview/documentdb_types.go. + fresh := getDD(ctx, ns, name) + Expect(fresh.Spec.LogLevel).To(Equal("info")) + Expect(documentdb.PatchSpec(ctx, c, fresh, func(s *previewv1.DocumentDBSpec) { + s.LogLevel = "debug" + })).To(Succeed()) + + Eventually(func() string { + current := getDD(ctx, ns, name) + return current.Spec.LogLevel + }, 1*time.Minute, 2*time.Second).Should(Equal("debug"), + "patched spec.logLevel should reach the API server") + + // Reconciliation should not disrupt readiness while the + // only change is a log-level string. + Consistently(assertions.AssertDocumentDBReady(ctx, c, key), + 30*time.Second, 5*time.Second, + ).Should(Succeed()) + }) + }) diff --git a/test/e2e/tests/lifecycle/update_storage_test.go b/test/e2e/tests/lifecycle/update_storage_test.go new file mode 100644 index 00000000..019acb16 --- /dev/null +++ b/test/e2e/tests/lifecycle/update_storage_test.go @@ -0,0 +1,108 @@ +package lifecycle + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" //nolint:revive + . "github.com/onsi/gomega" //nolint:revive + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/assertions" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/clusterprobe" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/namespaces" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +// The CRD nests storage as Spec.Resource.Storage.PvcSize (see +// operator/src/api/preview/documentdb_types.go). The design doc wording +// "spec.resource or spec.persistentVolumeClaim" is ambiguous — the real +// field is `spec.resource.storage.pvcSize`, patched below. +var _ = Describe("DocumentDB lifecycle — update storage.pvcSize", + Label(e2e.LifecycleLabel, e2e.DisruptiveLabel, e2e.NeedsCSIResizeLabel), + e2e.MediumLevelLabel, + func() { + const name = "lifecycle-update-storage" + var ( + ctx context.Context + ns string + c client.Client + ) + + BeforeEach(func() { + e2e.SkipUnlessLevel(e2e.Medium) + ctx = context.Background() + c = e2e.SuiteEnv().Client + // Runtime capability probe: PVC resize silently falls over + // on StorageClasses without AllowVolumeExpansion=true. The + // NeedsCSIResizeLabel only gates invocation; this probe + // gives a clear Skip when the backing class cannot expand. + scName := baseVars("1Gi")["STORAGE_CLASS"] + canExpand, err := clusterprobe.StorageClassAllowsExpansion(ctx, c, scName) + Expect(err).NotTo(HaveOccurred(), "probe StorageClass %q expansion", scName) + if !canExpand { + Skip("StorageClass " + scName + " does not allow volume expansion — skipping PVC resize spec") + } + ns = namespaces.NamespaceForSpec(e2e.LifecycleLabel) + createNamespace(ctx, c, ns) + createCredentialSecret(ctx, c, ns, "documentdb-credentials") + }) + + It("expands PVCs from 1Gi to 2Gi without rotating the primary", func() { + dd, err := documentdb.Create(ctx, c, ns, name, documentdb.CreateOptions{ + Base: "documentdb", + Vars: baseVars("1Gi"), + }) + Expect(err).ToNot(HaveOccurred()) + DeferCleanup(func(ctx SpecContext) { + _ = documentdb.Delete(ctx, c, dd, 3*time.Minute) + }) + + key := types.NamespacedName{Namespace: ns, Name: name} + Eventually(assertions.AssertDocumentDBReady(ctx, c, key), + timeouts.For(timeouts.DocumentDBReady), + timeouts.PollInterval(timeouts.DocumentDBReady), + ).Should(Succeed()) + + // Patch the storage size. + fresh := getDD(ctx, ns, name) + Expect(documentdb.PatchSpec(ctx, c, fresh, func(s *previewv1.DocumentDBSpec) { + s.Resource.Storage.PvcSize = "2Gi" + })).To(Succeed()) + + // PVC capacity should eventually be updated across all + // backing claims. List PVCs in the namespace; in kind + // with a single-instance cluster there is one data PVC. + want := resource.MustParse("2Gi") + Eventually(func() error { + var pvcs corev1.PersistentVolumeClaimList + if err := c.List(ctx, &pvcs, client.InNamespace(ns)); err != nil { + return err + } + if len(pvcs.Items) == 0 { + return errPendingPVCs + } + for i := range pvcs.Items { + got := pvcs.Items[i].Status.Capacity[corev1.ResourceStorage] + if got.Cmp(want) < 0 { + return errNotExpanded + } + } + return nil + }, timeouts.For(timeouts.PVCResize), + timeouts.PollInterval(timeouts.PVCResize), + ).Should(Succeed()) + + // Cluster still healthy after the resize. + Eventually(assertions.AssertDocumentDBReady(ctx, c, key), + 1*time.Minute, 5*time.Second, + ).Should(Succeed()) + }) + }) diff --git a/test/e2e/tests/performance/perf_aggregation_test.go b/test/e2e/tests/performance/perf_aggregation_test.go new file mode 100644 index 00000000..b723dd8c --- /dev/null +++ b/test/e2e/tests/performance/perf_aggregation_test.go @@ -0,0 +1,87 @@ +package performance + +import ( + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "go.mongodb.org/mongo-driver/v2/bson" + + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/seed" +) + +// Aggregation threshold rationale +// +// A $group pipeline over a few thousand small documents on an indexed +// field is dominated by gateway + wire overhead, not planner work. On +// kind-on-laptop the pipeline completes in ~1–3s; the 45s budget is a +// generous upper bound that will only fail on a hard regression (e.g., +// unexpected collection-scan fallback or planner bug). +var _ = Describe("DocumentDB performance — aggregation pipeline", + Label(e2e.PerformanceLabel, e2e.SlowLabel), e2e.HighLevelLabel, + Ordered, Serial, func() { + + const ( + copies = 40 // seed.AggDataset * copies = 2,000 docs + aggBudget = 45 * time.Second + batchWrite = 500 + ) + + BeforeEach(func() { e2e.SkipUnlessLevel(e2e.High) }) + + It("runs a $group aggregation within the smoke threshold", func(ctx SpecContext) { + conn := connectSharedRO(ctx) + DeferCleanup(conn.Stop) + + coll := conn.Client.Database(conn.DB).Collection("agg") + + // Replicate the canonical AggDataset so we stay within a + // deterministic shape while reaching non-trivial size. + base := seed.AggDataset() + buf := make([]any, 0, batchWrite) + id := 1 + flush := func() { + if len(buf) == 0 { + return + } + _, err := coll.InsertMany(ctx, buf) + Expect(err).NotTo(HaveOccurred(), "seed agg") + buf = buf[:0] + } + for c := 0; c < copies; c++ { + for _, d := range base { + cp := bson.M{} + for k, v := range d { + cp[k] = v + } + cp["_id"] = id + id++ + buf = append(buf, cp) + if len(buf) >= batchWrite { + flush() + } + } + } + flush() + + pipeline := []bson.M{ + {"$group": bson.M{"_id": "$category", "total": bson.M{"$sum": "$value"}, "n": bson.M{"$sum": 1}}}, + {"$sort": bson.M{"_id": 1}}, + } + + start := time.Now() + cur, err := coll.Aggregate(ctx, pipeline) + Expect(err).NotTo(HaveOccurred(), "Aggregate") + var out []bson.M + Expect(cur.All(ctx, &out)).To(Succeed()) + elapsed := time.Since(start) + logLatency("aggregate-group", elapsed) + + Expect(out).To(HaveLen(seed.AggDatasetGroups), + "each AggDataset category should appear once") + Expect(elapsed).To(BeNumerically("<", aggBudget), + "$group pipeline should complete within %s", aggBudget) + }) + }) diff --git a/test/e2e/tests/performance/perf_count_range_test.go b/test/e2e/tests/performance/perf_count_range_test.go new file mode 100644 index 00000000..1369f20b --- /dev/null +++ b/test/e2e/tests/performance/perf_count_range_test.go @@ -0,0 +1,66 @@ +package performance + +import ( + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" + "go.mongodb.org/mongo-driver/v2/mongo/options" + + "github.com/documentdb/documentdb-operator/test/e2e" +) + +// Count/range threshold rationale +// +// After seeding 5,000 documents and creating an index on `value`, a +// half-range query (value >= midpoint) should hit the index and return +// ~2,500 documents quickly — well under a second on a hot kind cluster. +// We allow 30s to absorb port-forward warmup + cold-cache index +// traversal on busy CI nodes. Any regression past 30s likely means the +// planner stopped using the index. +var _ = Describe("DocumentDB performance — count with range + index", + Label(e2e.PerformanceLabel, e2e.SlowLabel), e2e.HighLevelLabel, + Ordered, Serial, func() { + + const ( + docCount = 5_000 + countBudget = 30 * time.Second + ) + + BeforeEach(func() { e2e.SkipUnlessLevel(e2e.High) }) + + It("counts half the range using an index within the smoke threshold", func(ctx SpecContext) { + conn := connectSharedRO(ctx) + DeferCleanup(conn.Stop) + + coll := conn.Client.Database(conn.DB).Collection("range_count") + + docs := make([]any, docCount) + for i := 0; i < docCount; i++ { + docs[i] = bson.M{"_id": i + 1, "value": i + 1} + } + _, err := coll.InsertMany(ctx, docs) + Expect(err).NotTo(HaveOccurred(), "seed range_count") + + _, err = coll.Indexes().CreateOne(ctx, mongo.IndexModel{ + Keys: bson.D{{Key: "value", Value: 1}}, + Options: options.Index().SetName("idx_value"), + }) + Expect(err).NotTo(HaveOccurred(), "create value index") + + filter := bson.M{"value": bson.M{"$gte": docCount / 2}} + + start := time.Now() + n, err := coll.CountDocuments(ctx, filter) + elapsed := time.Since(start) + logLatency("count-range", elapsed) + + Expect(err).NotTo(HaveOccurred(), "CountDocuments range") + Expect(n).To(BeEquivalentTo(docCount/2 + 1)) + Expect(elapsed).To(BeNumerically("<", countBudget), + "indexed range count should complete within %s", countBudget) + }) + }) diff --git a/test/e2e/tests/performance/perf_delete_drop_test.go b/test/e2e/tests/performance/perf_delete_drop_test.go new file mode 100644 index 00000000..ab667bf5 --- /dev/null +++ b/test/e2e/tests/performance/perf_delete_drop_test.go @@ -0,0 +1,68 @@ +package performance + +import ( + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "go.mongodb.org/mongo-driver/v2/bson" + + "github.com/documentdb/documentdb-operator/test/e2e" +) + +// Delete+drop threshold rationale +// +// DeleteMany of half a collection followed by a Collection.Drop() is a +// cheap metadata-bounded path on the DocumentDB gateway. Kind-on-laptop +// baseline is ~1–3s for both combined; 60s is a generous 20x guard +// aimed at catching pathologic regressions such as tombstone fanout or +// table-rewrite fallback. +var _ = Describe("DocumentDB performance — bulk delete and drop", + Label(e2e.PerformanceLabel, e2e.SlowLabel), e2e.HighLevelLabel, + Ordered, Serial, func() { + + const ( + docCount = 5_000 + deleteBudget = 60 * time.Second + ) + + BeforeEach(func() { e2e.SkipUnlessLevel(e2e.High) }) + + It("bulk-deletes half the collection and drops it within the smoke threshold", func(ctx SpecContext) { + conn := connectSharedRO(ctx) + DeferCleanup(conn.Stop) + + coll := conn.Client.Database(conn.DB).Collection("delete_drop") + + docs := make([]any, docCount) + for i := 0; i < docCount; i++ { + // Even-ids are deletable, odd-ids are survivors. This + // exercises a real matching predicate rather than a + // fast-path {} delete. + docs[i] = bson.M{"_id": i + 1, "even": (i+1)%2 == 0} + } + _, err := coll.InsertMany(ctx, docs) + Expect(err).NotTo(HaveOccurred(), "seed delete_drop") + + start := time.Now() + delRes, err := coll.DeleteMany(ctx, bson.M{"even": true}) + Expect(err).NotTo(HaveOccurred(), "DeleteMany") + Expect(delRes.DeletedCount).To(BeEquivalentTo(docCount / 2)) + + // Drop the collection — the operation should complete + // quickly even on a large collection because it is a + // metadata-only truncate on the server. + Expect(coll.Drop(ctx)).To(Succeed(), "Drop collection") + elapsed := time.Since(start) + logLatency("delete-drop", elapsed) + + Expect(elapsed).To(BeNumerically("<", deleteBudget), + "delete + drop should complete within %s", deleteBudget) + + n, err := coll.CountDocuments(ctx, bson.M{}) + Expect(err).NotTo(HaveOccurred(), + "CountDocuments on a dropped collection should return 0, not error") + Expect(n).To(BeEquivalentTo(0)) + }) + }) diff --git a/test/e2e/tests/performance/perf_helpers_test.go b/test/e2e/tests/performance/perf_helpers_test.go new file mode 100644 index 00000000..34aca3b4 --- /dev/null +++ b/test/e2e/tests/performance/perf_helpers_test.go @@ -0,0 +1,68 @@ +package performance + +import ( + "context" + "fmt" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + driver "go.mongodb.org/mongo-driver/v2/mongo" + + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/fixtures" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" +) + +// perfConn bundles everything a perf spec needs to drive mongo traffic +// against the shared RO fixture: a connected client, an isolated DB +// name, and a cleanup hook that drops the DB and tears down the +// port-forward. +type perfConn struct { + Client *driver.Client + DB string + Stop func() +} + +// connectSharedRO provisions the SharedRO fixture (lazily on first +// call) and returns a connected mongo client scoped to a per-spec +// database name derived from CurrentSpecReport().FullText(). The +// returned Stop drops the spec's database and tears down the +// forward/client. +// +// The mechanics (port-forward, credential resolution, retry on +// forwarder bind) are delegated to mongo.NewFromDocumentDB so that all +// suites share a single connect path — we just wrap it to preserve the +// per-spec DB-drop cleanup contract the perf specs rely on. +func connectSharedRO(ctx context.Context) *perfConn { + GinkgoHelper() + env := e2e.SuiteEnv() + Expect(env).NotTo(BeNil(), "SuiteEnv must be initialized") + + handle, err := fixtures.GetOrCreateSharedRO(ctx, env.Client) + Expect(err).NotTo(HaveOccurred(), "provision SharedRO fixture") + Expect(handle).NotTo(BeNil()) + + h, err := mongo.NewFromDocumentDB(ctx, env, handle.Namespace(), handle.Name()) + Expect(err).NotTo(HaveOccurred(), "open mongo connection to SharedRO") + + db := fixtures.DBNameFor(CurrentSpecReport().FullText()) + c := h.Client() + + stop := func() { + dropCtx, dropCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer dropCancel() + _ = mongo.DropDatabase(dropCtx, c, db) + closeCtx, closeCancel := context.WithTimeout(context.Background(), 15*time.Second) + defer closeCancel() + _ = h.Close(closeCtx) + } + return &perfConn{Client: c, DB: db, Stop: stop} +} + +// logLatency is a small convenience so every spec reports its measured +// duration in a uniform format that CI log scrapers can grep. +func logLatency(op string, elapsed time.Duration) { + fmt.Fprintf(GinkgoWriter, "perf[%s]: %s\n", op, elapsed) +} diff --git a/test/e2e/tests/performance/perf_insert_test.go b/test/e2e/tests/performance/perf_insert_test.go new file mode 100644 index 00000000..496ef898 --- /dev/null +++ b/test/e2e/tests/performance/perf_insert_test.go @@ -0,0 +1,76 @@ +package performance + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "go.mongodb.org/mongo-driver/v2/bson" + + "github.com/documentdb/documentdb-operator/test/e2e" +) + +// Insert threshold rationale +// +// Seeding 10,000 small documents via InsertMany is the bulk-write +// tripwire. On a kind-on-laptop baseline the operation typically +// completes in 10–20s; CI nodes add variance. The 2-minute bound is a +// generous ~8x multiplier intended to catch catastrophic regressions +// (e.g., accidental per-document round-trips, gateway CPU starvation) +// rather than to grade performance. +var _ = Describe("DocumentDB performance — bulk insert", + Label(e2e.PerformanceLabel, e2e.SlowLabel), e2e.HighLevelLabel, + Ordered, Serial, func() { + + const ( + docCount = 10_000 + insertBudget = 2 * time.Minute + perInsertBatch = 1_000 + ) + + BeforeEach(func() { e2e.SkipUnlessLevel(e2e.High) }) + + It("bulk-inserts 10k documents within the smoke threshold", func(ctx SpecContext) { + conn := connectSharedRO(ctx) + DeferCleanup(conn.Stop) + + coll := conn.Client.Database(conn.DB).Collection("bulk_insert") + + // Build the payload outside the timed region so we measure + // server-side insert latency rather than Go allocations. + batches := make([][]any, 0, docCount/perInsertBatch) + for b := 0; b < docCount/perInsertBatch; b++ { + docs := make([]any, perInsertBatch) + base := b * perInsertBatch + for i := 0; i < perInsertBatch; i++ { + n := base + i + 1 + docs[i] = bson.M{ + "_id": n, + "kind": "perf", + "value": n, + } + } + batches = append(batches, docs) + } + + opCtx, cancel := context.WithTimeout(ctx, insertBudget) + defer cancel() + + start := time.Now() + for _, batch := range batches { + _, err := coll.InsertMany(opCtx, batch) + Expect(err).NotTo(HaveOccurred(), "InsertMany") + } + elapsed := time.Since(start) + logLatency("insert-10k", elapsed) + + Expect(elapsed).To(BeNumerically("<", insertBudget), + "bulk insert of %d docs should complete within %s", docCount, insertBudget) + + n, err := coll.CountDocuments(ctx, bson.M{}) + Expect(err).NotTo(HaveOccurred()) + Expect(n).To(BeEquivalentTo(docCount)) + }) + }) diff --git a/test/e2e/tests/performance/perf_sort_test.go b/test/e2e/tests/performance/perf_sort_test.go new file mode 100644 index 00000000..8d0655c6 --- /dev/null +++ b/test/e2e/tests/performance/perf_sort_test.go @@ -0,0 +1,86 @@ +package performance + +import ( + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" + "go.mongodb.org/mongo-driver/v2/mongo/options" + + "github.com/documentdb/documentdb-operator/test/e2e" +) + +// Sort threshold rationale +// +// With an ascending index on the sort field, a full collection sort +// over 5,000 docs is effectively a scan-of-index + cursor drain. On +// kind-on-laptop this completes in ~2s; the 60s cap absorbs cold index +// loads, port-forward warmup, and CI noise. A regression past the cap +// almost always means the sort fell back to in-memory post-processing. +var _ = Describe("DocumentDB performance — indexed sort", + Label(e2e.PerformanceLabel, e2e.SlowLabel), e2e.HighLevelLabel, + Ordered, Serial, func() { + + const ( + docCount = 5_000 + sortBudget = 60 * time.Second + ) + + BeforeEach(func() { e2e.SkipUnlessLevel(e2e.High) }) + + It("drains a sorted cursor using an index within the smoke threshold", func(ctx SpecContext) { + conn := connectSharedRO(ctx) + DeferCleanup(conn.Stop) + + coll := conn.Client.Database(conn.DB).Collection("sorted") + + // Seed in reverse order so a naive collection-scan sort + // would be slower than an index-assisted one — makes the + // index actually useful for the assertion. + docs := make([]any, docCount) + for i := 0; i < docCount; i++ { + docs[i] = bson.M{"_id": i + 1, "score": docCount - i} + } + _, err := coll.InsertMany(ctx, docs) + Expect(err).NotTo(HaveOccurred(), "seed sorted") + + _, err = coll.Indexes().CreateOne(ctx, mongo.IndexModel{ + Keys: bson.D{{Key: "score", Value: 1}}, + Options: options.Index().SetName("idx_score"), + }) + Expect(err).NotTo(HaveOccurred(), "create score index") + + findOpts := options.Find().SetSort(bson.D{{Key: "score", Value: 1}}) + + start := time.Now() + cur, err := coll.Find(ctx, bson.M{}, findOpts) + Expect(err).NotTo(HaveOccurred(), "Find with sort") + var last int32 + first := true + count := 0 + for cur.Next(ctx) { + var d struct { + Score int32 `bson:"score"` + } + Expect(cur.Decode(&d)).To(Succeed()) + if !first { + Expect(d.Score).To(BeNumerically(">=", last), + "sort output must be non-decreasing") + } + last = d.Score + first = false + count++ + } + Expect(cur.Err()).NotTo(HaveOccurred()) + Expect(cur.Close(ctx)).To(Succeed()) + elapsed := time.Since(start) + logLatency("sort-index", elapsed) + + Expect(count).To(Equal(docCount)) + Expect(elapsed).To(BeNumerically("<", sortBudget), + "indexed sort should complete within %s", sortBudget) + }) + }) diff --git a/test/e2e/tests/performance/perf_update_test.go b/test/e2e/tests/performance/perf_update_test.go new file mode 100644 index 00000000..03beecc2 --- /dev/null +++ b/test/e2e/tests/performance/perf_update_test.go @@ -0,0 +1,63 @@ +package performance + +import ( + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "go.mongodb.org/mongo-driver/v2/bson" + + "github.com/documentdb/documentdb-operator/test/e2e" +) + +// Update threshold rationale +// +// A single UpdateMany over 5,000 docs that sets a new field on every +// document is one round-trip per call, so the wall-clock cost is +// dominated by server-side write amplification + WAL. Kind-on-laptop +// baseline is ~2–5s; 90s is a generous tripwire that catches pathologic +// regressions (e.g., accidentally rewriting $set as per-doc upserts). +var _ = Describe("DocumentDB performance — bulk update", + Label(e2e.PerformanceLabel, e2e.SlowLabel), e2e.HighLevelLabel, + Ordered, Serial, func() { + + const ( + docCount = 5_000 + updateBudget = 90 * time.Second + ) + + BeforeEach(func() { e2e.SkipUnlessLevel(e2e.High) }) + + It("bulk-updates every document within the smoke threshold", func(ctx SpecContext) { + conn := connectSharedRO(ctx) + DeferCleanup(conn.Stop) + + coll := conn.Client.Database(conn.DB).Collection("bulk_update") + + docs := make([]any, docCount) + for i := 0; i < docCount; i++ { + docs[i] = bson.M{"_id": i + 1, "touched": false, "value": i} + } + _, err := coll.InsertMany(ctx, docs) + Expect(err).NotTo(HaveOccurred(), "seed bulk_update") + + start := time.Now() + res, err := coll.UpdateMany(ctx, + bson.M{"touched": false}, + bson.M{"$set": bson.M{"touched": true, "stamp": "perf"}}, + ) + elapsed := time.Since(start) + logLatency("update-5k", elapsed) + + Expect(err).NotTo(HaveOccurred(), "UpdateMany") + Expect(res.MatchedCount).To(BeEquivalentTo(docCount)) + Expect(res.ModifiedCount).To(BeEquivalentTo(docCount)) + Expect(elapsed).To(BeNumerically("<", updateBudget), + "bulk update should complete within %s", updateBudget) + + remaining, err := coll.CountDocuments(ctx, bson.M{"touched": false}) + Expect(err).NotTo(HaveOccurred()) + Expect(remaining).To(BeEquivalentTo(0)) + }) + }) diff --git a/test/e2e/tests/performance/performance_suite_test.go b/test/e2e/tests/performance/performance_suite_test.go new file mode 100644 index 00000000..c05318da --- /dev/null +++ b/test/e2e/tests/performance/performance_suite_test.go @@ -0,0 +1,56 @@ +// Package performance hosts the DocumentDB E2E performance area. See +// docs/designs/e2e-test-suite.md for the spec catalog. This file is +// the Ginkgo root for the area binary and shares bootstrap with the +// other area binaries via the exported helpers in package e2e. +package performance + +import ( + "context" + "fmt" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/e2e" +) + +const operatorReadyTimeout = 2 * time.Minute + +func TestPerformance(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "DocumentDB E2E - Performance", Label(e2e.PerformanceLabel)) +} + +var _ = SynchronizedBeforeSuite( + func(ctx SpecContext) []byte { + if err := e2e.SetupSuite(ctx, operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("performance bootstrap: %v", err)) + } + return []byte{} + }, + func(_ SpecContext, _ []byte) { + if err := e2e.SetupSuite(context.Background(), operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("performance worker bootstrap: %v", err)) + } + }, +) + +var _ = SynchronizedAfterSuite( + func(ctx SpecContext) { + if err := e2e.TeardownSuite(ctx); err != nil { + fmt.Fprintf(GinkgoWriter, "performance teardown: %v\n", err) + } + }, + func(_ SpecContext) {}, +) + +// BeforeEach in this area aborts the spec if the operator pod has +// drifted since SetupSuite (UID/name/restart-count change). Area +// tests/upgrade/ intentionally omits this hook because operator +// restarts are part of its scenario. +var _ = BeforeEach(func() { +Expect(e2e.CheckOperatorUnchanged()).To(Succeed(), +"operator health check failed — a previous spec or reconciler likely restarted the operator") +}) diff --git a/test/e2e/tests/scale/scale_down_test.go b/test/e2e/tests/scale/scale_down_test.go new file mode 100644 index 00000000..e2b0ffa4 --- /dev/null +++ b/test/e2e/tests/scale/scale_down_test.go @@ -0,0 +1,117 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package scale + +import ( + "context" + "fmt" + + cnpgv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1" + cnpgclusterutils "github.com/cloudnative-pg/cloudnative-pg/tests/utils/clusterutils" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/assertions" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/fixtures" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +var _ = Describe("DocumentDB scale — down", + Ordered, + Label(e2e.ScaleLabel, e2e.BasicLabel), + e2e.MediumLevelLabel, + func() { + var ( + handle *fixtures.SharedScaleHandle + c client.Client + ctx context.Context + key client.ObjectKey + ) + + BeforeAll(func() { + env := e2e.SuiteEnv() + Expect(env).NotTo(BeNil(), "SuiteEnv not initialized") + ctx = env.Ctx + if ctx == nil { + ctx = context.Background() + } + c = env.Client + + h, err := fixtures.GetOrCreateSharedScale(ctx, c) + Expect(err).NotTo(HaveOccurred(), "get-or-create shared-scale fixture") + handle = h + key = client.ObjectKey{Namespace: handle.Namespace(), Name: handle.Name()} + }) + + AfterEach(func() { + Expect(handle.ResetToTwoInstances(ctx, c)).To(Succeed(), + "reset shared-scale fixture to 2 instances") + }) + + It("scales 3 → 2 instances", func() { + e2e.SkipUnlessLevel(e2e.Medium) + + // Grow to 3 first so we can assert a genuine 3→2 scale-down. + Expect(documentdb.PatchInstances(ctx, c, key.Namespace, key.Name, 3)).To(Succeed()) + Eventually(assertions.AssertInstanceCount(ctx, c, key, 3), + timeouts.For(timeouts.InstanceScale), + timeouts.PollInterval(timeouts.InstanceScale)). + Should(Succeed(), "CNPG Cluster should converge to readyInstances=3 before scale-down") + Eventually(assertions.AssertDocumentDBReady(ctx, c, key), + timeouts.For(timeouts.InstanceScale), + timeouts.PollInterval(timeouts.InstanceScale)). + Should(Succeed()) + + Expect(documentdb.PatchInstances(ctx, c, key.Namespace, key.Name, 2)).To(Succeed()) + + Eventually(assertions.AssertInstanceCount(ctx, c, key, 2), + timeouts.For(timeouts.InstanceScale), + timeouts.PollInterval(timeouts.InstanceScale)). + Should(Succeed(), "CNPG Cluster should converge to readyInstances=2") + Eventually(assertions.AssertDocumentDBReady(ctx, c, key), + timeouts.For(timeouts.InstanceScale), + timeouts.PollInterval(timeouts.InstanceScale)). + Should(Succeed(), "DocumentDB should be Ready at 2 instances") + }) + + It("scales 2 → 1 instance and stays healthy after primary re-election", func() { + e2e.SkipUnlessLevel(e2e.Medium) + + primary, err := cnpgclusterutils.GetPrimary(ctx, c, key.Namespace, key.Name) + Expect(err).NotTo(HaveOccurred(), "fetch initial primary") + Expect(primary).NotTo(BeNil()) + GinkgoLogr.Info("initial primary before 2→1 scale-down", "pod", primary.Name) + + Expect(documentdb.PatchInstances(ctx, c, key.Namespace, key.Name, 1)).To(Succeed()) + + Eventually(assertions.AssertInstanceCount(ctx, c, key, 1), + timeouts.For(timeouts.InstanceScale), + timeouts.PollInterval(timeouts.InstanceScale)). + Should(Succeed(), "CNPG Cluster should converge to readyInstances=1") + + Eventually(assertions.AssertDocumentDBReady(ctx, c, key), + timeouts.For(timeouts.InstanceScale), + timeouts.PollInterval(timeouts.InstanceScale)). + Should(Succeed(), "DocumentDB should be Ready after scaling to 1 instance") + + // After scale-down, a primary must still exist — but its + // identity may legitimately have changed via re-election, + // so we do not assert pod-name equality here. + Eventually(func() error { + cl := &cnpgv1.Cluster{} + if err := c.Get(ctx, key, cl); err != nil { + return fmt.Errorf("get CNPG cluster: %w", err) + } + if cl.Status.CurrentPrimary == "" { + return fmt.Errorf("CNPG cluster %s has no currentPrimary", key) + } + return nil + }, timeouts.For(timeouts.InstanceScale), + timeouts.PollInterval(timeouts.InstanceScale)). + Should(Succeed(), "CNPG Cluster should report a currentPrimary after re-election") + }) + }) diff --git a/test/e2e/tests/scale/scale_suite_test.go b/test/e2e/tests/scale/scale_suite_test.go new file mode 100644 index 00000000..d396f698 --- /dev/null +++ b/test/e2e/tests/scale/scale_suite_test.go @@ -0,0 +1,56 @@ +// Package scale hosts the DocumentDB E2E scale area. See +// docs/designs/e2e-test-suite.md for the spec catalog. This file is +// the Ginkgo root for the area binary and shares bootstrap with the +// other area binaries via the exported helpers in package e2e. +package scale + +import ( + "context" + "fmt" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/e2e" +) + +const operatorReadyTimeout = 2 * time.Minute + +func TestScale(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "DocumentDB E2E - Scale", Label(e2e.ScaleLabel)) +} + +var _ = SynchronizedBeforeSuite( + func(ctx SpecContext) []byte { + if err := e2e.SetupSuite(ctx, operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("scale bootstrap: %v", err)) + } + return []byte{} + }, + func(_ SpecContext, _ []byte) { + if err := e2e.SetupSuite(context.Background(), operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("scale worker bootstrap: %v", err)) + } + }, +) + +var _ = SynchronizedAfterSuite( + func(ctx SpecContext) { + if err := e2e.TeardownSuite(ctx); err != nil { + fmt.Fprintf(GinkgoWriter, "scale teardown: %v\n", err) + } + }, + func(_ SpecContext) {}, +) + +// BeforeEach in this area aborts the spec if the operator pod has +// drifted since SetupSuite (UID/name/restart-count change). Area +// tests/upgrade/ intentionally omits this hook because operator +// restarts are part of its scenario. +var _ = BeforeEach(func() { +Expect(e2e.CheckOperatorUnchanged()).To(Succeed(), +"operator health check failed — a previous spec or reconciler likely restarted the operator") +}) diff --git a/test/e2e/tests/scale/scale_up_test.go b/test/e2e/tests/scale/scale_up_test.go new file mode 100644 index 00000000..3c311375 --- /dev/null +++ b/test/e2e/tests/scale/scale_up_test.go @@ -0,0 +1,109 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package scale + +import ( + "context" + + cnpgclusterutils "github.com/cloudnative-pg/cloudnative-pg/tests/utils/clusterutils" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/assertions" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/fixtures" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +var _ = Describe("DocumentDB scale — up", + Ordered, + Label(e2e.ScaleLabel, e2e.BasicLabel), + e2e.MediumLevelLabel, + func() { + var ( + handle *fixtures.SharedScaleHandle + c client.Client + ctx context.Context + key client.ObjectKey + ) + + BeforeAll(func() { + env := e2e.SuiteEnv() + Expect(env).NotTo(BeNil(), "SuiteEnv not initialized") + ctx = env.Ctx + if ctx == nil { + ctx = context.Background() + } + c = env.Client + + h, err := fixtures.GetOrCreateSharedScale(ctx, c) + Expect(err).NotTo(HaveOccurred(), "get-or-create shared-scale fixture") + handle = h + key = client.ObjectKey{Namespace: handle.Namespace(), Name: handle.Name()} + }) + + AfterEach(func() { + Expect(handle.ResetToTwoInstances(ctx, c)).To(Succeed(), + "reset shared-scale fixture to 2 instances") + }) + + It("scales 2 → 3 instances while keeping the primary pod stable", func() { + e2e.SkipUnlessLevel(e2e.Medium) + + primary, err := cnpgclusterutils.GetPrimary(ctx, c, key.Namespace, key.Name) + Expect(err).NotTo(HaveOccurred(), "fetch initial primary") + Expect(primary).NotTo(BeNil()) + initialPrimary := primary.Name + + Expect(documentdb.PatchInstances(ctx, c, key.Namespace, key.Name, 3)).To(Succeed()) + + Eventually(assertions.AssertInstanceCount(ctx, c, key, 3), + timeouts.For(timeouts.InstanceScale), + timeouts.PollInterval(timeouts.InstanceScale)). + Should(Succeed(), "CNPG Cluster should report readyInstances=3") + + Eventually(assertions.AssertDocumentDBReady(ctx, c, key), + timeouts.For(timeouts.InstanceScale), + timeouts.PollInterval(timeouts.InstanceScale)). + Should(Succeed(), "DocumentDB status should be Ready") + + Expect(assertions.AssertPrimaryUnchanged(ctx, c, key, initialPrimary)()). + To(Succeed(), "scaling up must not change the primary") + }) + + It("scales 1 → 2 instances after first scaling down to 1", func() { + e2e.SkipUnlessLevel(e2e.Medium) + + Expect(documentdb.PatchInstances(ctx, c, key.Namespace, key.Name, 1)).To(Succeed()) + Eventually(assertions.AssertInstanceCount(ctx, c, key, 1), + timeouts.For(timeouts.InstanceScale), + timeouts.PollInterval(timeouts.InstanceScale)). + Should(Succeed(), "CNPG Cluster should converge to readyInstances=1") + Eventually(assertions.AssertDocumentDBReady(ctx, c, key), + timeouts.For(timeouts.InstanceScale), + timeouts.PollInterval(timeouts.InstanceScale)). + Should(Succeed(), "DocumentDB should be Ready at 1 instance") + + primary, err := cnpgclusterutils.GetPrimary(ctx, c, key.Namespace, key.Name) + Expect(err).NotTo(HaveOccurred(), "fetch primary before scale-up") + Expect(primary).NotTo(BeNil()) + initialPrimary := primary.Name + + Expect(documentdb.PatchInstances(ctx, c, key.Namespace, key.Name, 2)).To(Succeed()) + + Eventually(assertions.AssertInstanceCount(ctx, c, key, 2), + timeouts.For(timeouts.InstanceScale), + timeouts.PollInterval(timeouts.InstanceScale)). + Should(Succeed(), "CNPG Cluster should converge to readyInstances=2") + Eventually(assertions.AssertDocumentDBReady(ctx, c, key), + timeouts.For(timeouts.InstanceScale), + timeouts.PollInterval(timeouts.InstanceScale)). + Should(Succeed(), "DocumentDB should be Ready at 2 instances") + + Expect(assertions.AssertPrimaryUnchanged(ctx, c, key, initialPrimary)()). + To(Succeed(), "scaling up 1→2 must not change the primary") + }) + }) diff --git a/test/e2e/tests/status/connection_string_test.go b/test/e2e/tests/status/connection_string_test.go new file mode 100644 index 00000000..749ac7f1 --- /dev/null +++ b/test/e2e/tests/status/connection_string_test.go @@ -0,0 +1,206 @@ +package status + +import ( + "context" + "fmt" + "net/url" + "regexp" + "strconv" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/assertions" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/fixtures" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/portforward" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +// DocumentDB status — ConnectionString. +// +// The operator publishes a `mongodb://` URI in status.connectionString +// once the gateway Service and credential secret are ready. This spec +// has three layers, ordered cheapest-first so the failure surface is +// well-separated: +// +// 1. Shape — the string matches `^mongodb://` and carries a non-empty +// host component. Catches "field unset" and "scheme drift". +// +// 2. Semantic — the string names the expected credential secret, +// targets the default gateway port, and carries every Mongo URI +// query param the Go/JS/Python driver needs (directConnection, +// authMechanism=SCRAM-SHA-256, tls, replicaSet, and +// tlsAllowInvalidCertificates correlated with status.TLS.Ready). +// Catches operator regressions that rewrite GenerateConnectionString +// in util.go. +// +// 3. Live — open a real port-forward to the gateway Service, read the +// credential secret, and Ping via mongo-driver/v2. Proves the +// (port + params) the operator published actually reach a working +// endpoint, independent of the string's literal host (which is the +// cluster-internal Service ClusterIP and so only dialable from +// outside the cluster via port-forward). +// +// Why we do not shell-eval the string +// +// status.connectionString contains `$(kubectl get secret …)` subshells +// in userinfo so that a human can paste it into a terminal and have +// credentials auto-resolve. Running `bash -c "echo "` in-test to +// exercise that roundtrip would require `kubectl` + a valid kubeconfig +// in the Ginkgo process, conflate shell and driver failure modes, and +// not work on runners without bash — we have none today, but locking +// ourselves to bash for a status assertion is a poor tradeoff. The +// string-level assertion on the secret-name reference (below) is the +// high-signal subset of that approach at a fraction of the cost. +// +// This spec runs against the session-scoped shared RO fixture so it +// adds negligible time to the suite. +var _ = Describe("DocumentDB status — connectionString", + Label(e2e.StatusLabel), e2e.MediumLevelLabel, + func() { + BeforeEach(func() { e2e.SkipUnlessLevel(e2e.Medium) }) + + It("publishes a valid, dialable mongodb:// URI", func() { + env := e2e.SuiteEnv() + Expect(env).ToNot(BeNil()) + c := env.Client + + ctx, cancel := context.WithTimeout(context.Background(), 8*time.Minute) + DeferCleanup(cancel) + + handle, err := fixtures.GetOrCreateSharedRO(ctx, c) + Expect(err).ToNot(HaveOccurred()) + + key := client.ObjectKey{Namespace: handle.Namespace(), Name: handle.Name()} + + // Layer 1: shape assertion via the shared helper, eventually-polled + // because the operator may publish the string a reconcile or two + // after the CR flips Ready. + By("asserting status.connectionString matches ^mongodb://") + Eventually( + assertions.AssertConnectionStringMatches(ctx, c, key, `^mongodb://`), + timeouts.For(timeouts.DocumentDBReady), + timeouts.PollInterval(timeouts.DocumentDBReady), + ).Should(Succeed()) + + dd, err := handle.GetCR(ctx, c) + Expect(err).ToNot(HaveOccurred()) + connStr := dd.Status.ConnectionString + Expect(connStr).ToNot(BeEmpty(), + "status.connectionString must be populated on a Ready DocumentDB") + + // Layer 2a: credential-secret reference. Catches operator typos + // and regressions that ignore spec.documentDbCredentialSecret. + // expectedSecret mirrors utils.GenerateConnectionString's + // fallback: spec override wins, else the default secret name. + expectedSecret := dd.Spec.DocumentDbCredentialSecret + if expectedSecret == "" { + expectedSecret = mongo.DefaultCredentialSecretName + } + By(fmt.Sprintf("asserting connection string references secret %q", expectedSecret)) + // The secret name appears twice inside `kubectl get secret -n ` + // subshells — one substring match is sufficient. + Expect(connStr).To(ContainSubstring("secret "+expectedSecret+" "), + "connection string must reference credential secret %q; got: %s", + expectedSecret, connStr) + + // Layer 2b: extract host:port and query params. We cannot use + // url.Parse on the full string because userinfo contains + // `$(kubectl ... | base64 -d)` which is not a valid URL + // userinfo. Strip userinfo with a regex that matches up to + // the LAST '@' before the first '/' — Mongo's default URI + // grammar guarantees userinfo does not contain '/'. + By("parsing host:port and query params from the published URI") + re := regexp.MustCompile(`^mongodb://.*@(?P[^/]+)/\?(?P.+)$`) + m := re.FindStringSubmatch(connStr) + Expect(m).ToNot(BeNil(), + "connection string must be of form mongodb://@/?; got: %s", + connStr) + hostport := m[1] + rawQuery := m[2] + + host, port, err := splitHostPort(hostport) + Expect(err).ToNot(HaveOccurred(), + "host:port segment must split cleanly; got %q", hostport) + Expect(host).ToNot(BeEmpty(), "host component must not be empty") + Expect(port).To(Equal(portforward.GatewayPort), + "connection string port must equal the default gateway port (%d); got %d", + portforward.GatewayPort, port) + + // Layer 2c: required query parameters. Each catches a distinct + // regression in GenerateConnectionString: missing + // directConnection breaks replica-set discovery through the + // gateway; missing authMechanism breaks SCRAM; missing tls or + // replicaSet breaks drivers that refuse to infer defaults. + By("asserting required Mongo URI query parameters are present") + qv, err := url.ParseQuery(rawQuery) + Expect(err).ToNot(HaveOccurred(), "query must parse: %q", rawQuery) + Expect(qv.Get("directConnection")).To(Equal("true"), + "connection string must set directConnection=true") + Expect(qv.Get("authMechanism")).To(Equal("SCRAM-SHA-256"), + "connection string must set authMechanism=SCRAM-SHA-256") + Expect(qv.Get("tls")).To(Equal("true"), + "connection string must set tls=true (gateway is TLS-only)") + Expect(qv.Get("replicaSet")).To(Equal("rs0"), + "connection string must set replicaSet=rs0") + + // Layer 2d: TLS trust flag correlates with status.TLS.Ready. + // GenerateConnectionString appends tlsAllowInvalidCertificates=true + // exactly when the CR is NOT in a "trust-ready" state + // (status.TLS nil or not Ready). Inverting this flag would + // either leak self-signed exposure into production or break + // connections to trusted CAs; both are silent footguns without + // this assertion. + trustReady := dd.Status.TLS != nil && dd.Status.TLS.Ready + if trustReady { + Expect(qv.Has("tlsAllowInvalidCertificates")).To(BeFalse(), + "with status.TLS.Ready=true the connection string must NOT set tlsAllowInvalidCertificates") + } else { + Expect(qv.Get("tlsAllowInvalidCertificates")).To(Equal("true"), + "with status.TLS.Ready=false the connection string must set tlsAllowInvalidCertificates=true") + } + + // Layer 3: live Ping through the same (port + params + secret) + // contract. NewFromDocumentDB opens a port-forward to the + // gateway Service, reads the credential secret, dials with + // TLS+InsecureSkipVerify (matching tlsAllowInvalidCertificates + // behaviour for this spec's shared self-signed fixture), and + // Pings. Any mismatch between the published string's port / + // secret-name and what actually serves traffic surfaces here + // as a connect or auth failure. + By("dialing the gateway via port-forward and running Ping") + dialCtx, dialCancel := context.WithTimeout(ctx, timeouts.For(timeouts.MongoConnect)) + DeferCleanup(dialCancel) + mh, err := mongo.NewFromDocumentDB(dialCtx, env, dd.Namespace, dd.Name, + mongo.WithTLSInsecure()) + Expect(err).ToNot(HaveOccurred(), + "must be able to dial + Ping using the contract described by status.connectionString") + DeferCleanup(func() { + closeCtx, closeCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer closeCancel() + _ = mh.Close(closeCtx) + }) + }) + }) + +// splitHostPort splits a "host:port" segment where the port is +// numeric. We avoid net.SplitHostPort only because the host in this +// spec is a ClusterIP and so unambiguously not an IPv6 literal — a +// focused parser makes the "port drift" failure message more direct. +func splitHostPort(hostport string) (host string, port int, err error) { + for i := len(hostport) - 1; i >= 0; i-- { + if hostport[i] == ':' { + p, perr := strconv.Atoi(hostport[i+1:]) + if perr != nil { + return "", 0, fmt.Errorf("port segment not numeric: %q", hostport[i+1:]) + } + return hostport[:i], p, nil + } + } + return "", 0, fmt.Errorf("host:port missing ':' separator: %q", hostport) +} diff --git a/test/e2e/tests/status/mount_options_test.go b/test/e2e/tests/status/mount_options_test.go new file mode 100644 index 00000000..0e8fef50 --- /dev/null +++ b/test/e2e/tests/status/mount_options_test.go @@ -0,0 +1,95 @@ +package status + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/fixtures" +) + +// DocumentDB mount options — CRD discrepancy note. +// +// The task brief asked the spec to inspect `status.mountOptions` but no +// such field exists on `DocumentDBStatus`. Mount configuration for a +// DocumentDB cluster lives on the backing CloudNative-PG Postgres pods +// — concretely, CNPG mounts the PGDATA volume at +// `/var/lib/postgresql/data` (see CNPG's pkg/specs/volumes.go). +// +// We therefore verify the observable contract by listing the pods CNPG +// owns (label `cnpg.io/cluster=`) and asserting that at least one +// container mounts `/var/lib/postgresql/data`. +const pgdataMountPath = "/var/lib/postgresql/data" + +var _ = Describe("DocumentDB mount options — PGDATA volume mount", + Label(e2e.StatusLabel), e2e.MediumLevelLabel, + func() { + BeforeEach(func() { e2e.SkipUnlessLevel(e2e.Medium) }) + + It("mounts the PGDATA volume at /var/lib/postgresql/data", func() { + env := e2e.SuiteEnv() + Expect(env).ToNot(BeNil()) + c := env.Client + + ctx, cancel := context.WithTimeout(context.Background(), 8*time.Minute) + DeferCleanup(cancel) + + handle, err := fixtures.GetOrCreateSharedRO(ctx, c) + Expect(err).ToNot(HaveOccurred()) + + Eventually(func() error { + pods := &corev1.PodList{} + if err := c.List(ctx, pods, + client.InNamespace(handle.Namespace()), + client.MatchingLabels{"cnpg.io/cluster": handle.Name()}, + ); err != nil { + return err + } + if len(pods.Items) == 0 { + return &noCNPGPodsErr{ + namespace: handle.Namespace(), name: handle.Name(), + } + } + for i := range pods.Items { + if hasPGDATAMount(&pods.Items[i]) { + return nil + } + } + return &noPGDATAMountErr{namespace: handle.Namespace(), name: handle.Name()} + }, 3*time.Minute, 5*time.Second).Should(Succeed()) + }) + }) + +func hasPGDATAMount(pod *corev1.Pod) bool { + for i := range pod.Spec.Containers { + for _, vm := range pod.Spec.Containers[i].VolumeMounts { + if vm.MountPath == pgdataMountPath { + return true + } + } + } + return false +} + +type noCNPGPodsErr struct { + namespace, name string +} + +func (e *noCNPGPodsErr) Error() string { + return "no CNPG pods labelled cnpg.io/cluster=" + e.name + " in " + e.namespace +} + +type noPGDATAMountErr struct { + namespace, name string +} + +func (e *noPGDATAMountErr) Error() string { + return "no CNPG pod in " + e.namespace + "/" + e.name + + " mounts " + pgdataMountPath +} diff --git a/test/e2e/tests/status/pv_name_test.go b/test/e2e/tests/status/pv_name_test.go new file mode 100644 index 00000000..7a7e75c4 --- /dev/null +++ b/test/e2e/tests/status/pv_name_test.go @@ -0,0 +1,86 @@ +package status + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/fixtures" +) + +// DocumentDB persistent volume — CRD discrepancy note. +// +// The task brief asked the spec to inspect `status.persistentVolumeClaim` +// but the `DocumentDBStatus` type in operator/src/api/preview/documentdb_types.go +// does not expose such a field. The authoritative ownership of a +// DocumentDB's data volumes sits on the backing CloudNative-PG Cluster, +// which labels each PVC with `cnpg.io/cluster=`. +// +// We therefore verify the operator's observable contract by listing +// PersistentVolumeClaims in the DocumentDB's namespace filtered by that +// CNPG label and asserting: +// - at least one PVC exists (one per Postgres instance); +// - every returned PVC has reached phase Bound. +// +// If `status.persistentVolumeClaim` is added to the CRD in the future, +// this spec should grow an additional assertion that correlates the +// status field with the live PVC list. +var _ = Describe("DocumentDB persistent volume — CNPG PVC discovery", + Label(e2e.StatusLabel), e2e.MediumLevelLabel, + func() { + BeforeEach(func() { e2e.SkipUnlessLevel(e2e.Medium) }) + + It("provisions Bound PVCs labelled with cnpg.io/cluster", func() { + env := e2e.SuiteEnv() + Expect(env).ToNot(BeNil()) + c := env.Client + + ctx, cancel := context.WithTimeout(context.Background(), 8*time.Minute) + DeferCleanup(cancel) + + handle, err := fixtures.GetOrCreateSharedRO(ctx, c) + Expect(err).ToNot(HaveOccurred()) + + Eventually(func() error { + pvcList := &corev1.PersistentVolumeClaimList{} + if err := c.List(ctx, pvcList, + client.InNamespace(handle.Namespace()), + client.MatchingLabels{"cnpg.io/cluster": handle.Name()}, + ); err != nil { + return err + } + if len(pvcList.Items) == 0 { + return &noPVCErr{namespace: handle.Namespace(), name: handle.Name()} + } + for i := range pvcList.Items { + p := &pvcList.Items[i] + if p.Status.Phase != corev1.ClaimBound { + return &pvcNotBoundErr{name: p.Name, phase: string(p.Status.Phase)} + } + } + return nil + }, 3*time.Minute, 5*time.Second).Should(Succeed()) + }) + }) + +type noPVCErr struct { + namespace, name string +} + +func (e *noPVCErr) Error() string { + return "no PVCs labelled cnpg.io/cluster=" + e.name + " in " + e.namespace +} + +type pvcNotBoundErr struct { + name, phase string +} + +func (e *pvcNotBoundErr) Error() string { + return "PVC " + e.name + " is not Bound (phase=" + e.phase + ")" +} diff --git a/test/e2e/tests/status/status_suite_test.go b/test/e2e/tests/status/status_suite_test.go new file mode 100644 index 00000000..ea3cfa7f --- /dev/null +++ b/test/e2e/tests/status/status_suite_test.go @@ -0,0 +1,56 @@ +// Package status hosts the DocumentDB E2E status area. See +// docs/designs/e2e-test-suite.md for the spec catalog. This file is +// the Ginkgo root for the area binary and shares bootstrap with the +// other area binaries via the exported helpers in package e2e. +package status + +import ( + "context" + "fmt" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/e2e" +) + +const operatorReadyTimeout = 2 * time.Minute + +func TestStatus(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "DocumentDB E2E - Status", Label(e2e.StatusLabel)) +} + +var _ = SynchronizedBeforeSuite( + func(ctx SpecContext) []byte { + if err := e2e.SetupSuite(ctx, operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("status bootstrap: %v", err)) + } + return []byte{} + }, + func(_ SpecContext, _ []byte) { + if err := e2e.SetupSuite(context.Background(), operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("status worker bootstrap: %v", err)) + } + }, +) + +var _ = SynchronizedAfterSuite( + func(ctx SpecContext) { + if err := e2e.TeardownSuite(ctx); err != nil { + fmt.Fprintf(GinkgoWriter, "status teardown: %v\n", err) + } + }, + func(_ SpecContext) {}, +) + +// BeforeEach in this area aborts the spec if the operator pod has +// drifted since SetupSuite (UID/name/restart-count change). Area +// tests/upgrade/ intentionally omits this hook because operator +// restarts are part of its scenario. +var _ = BeforeEach(func() { +Expect(e2e.CheckOperatorUnchanged()).To(Succeed(), +"operator health check failed — a previous spec or reconciler likely restarted the operator") +}) diff --git a/test/e2e/tests/tls/helpers_test.go b/test/e2e/tests/tls/helpers_test.go new file mode 100644 index 00000000..bb947234 --- /dev/null +++ b/test/e2e/tests/tls/helpers_test.go @@ -0,0 +1,174 @@ +package tls + +import ( + "context" + "fmt" + "net" + "os" + "path/filepath" + "runtime" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" + + "github.com/documentdb/documentdb-operator/test/e2e" + ddbutil "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/fixtures" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/namespaces" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/portforward" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +// Shared per-spec setup for the TLS area. Each TLS spec uses the same +// base DocumentDB template plus a single mixin describing the TLS +// mode under test. +// +// tlsCredentialSecret is intentionally distinct from +// fixtures.DefaultCredentialSecretName so specs can exercise a custom +// secret name on the CR spec path; the credentials themselves reuse +// fixtures.DefaultCredentialUsername / DefaultCredentialPassword so a +// future rotation stays a one-file edit. +const ( + tlsCredentialSecret = "tls-e2e-credentials" + tlsCredentialUser = fixtures.DefaultCredentialUsername + tlsCredentialPassword = fixtures.DefaultCredentialPassword //nolint:gosec // fixture-only + tlsDocumentDBName = "tls-e2e" + tlsDefaultStorageSize = "1Gi" + tlsDefaultStorageCls = "standard" + tlsDefaultDDBImage = "" + tlsDefaultGatewayImage = "" +) + +// clusterSetup holds the artefacts returned by provisionCluster. +type clusterSetup struct { + NamespaceName string + DD *previewv1.DocumentDB +} + +// provisionCluster builds a TLS-configured DocumentDB from the base +// template + supplied mixin, waits for it to become healthy, and +// registers DeferCleanup hooks to tear it down. extraVars are merged +// on top of the baseline variable map so specs can inject +// mode-specific values (e.g., TLS_SECRET_NAME for Provided mode). +func provisionCluster( + ctx context.Context, + c client.Client, + area, mixin string, + extraVars map[string]string, +) *clusterSetup { + GinkgoHelper() + + nsName := namespaces.NamespaceForSpec(area) + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: nsName}} + Expect(createIdempotent(ctx, c, ns)).To(Succeed(), "create namespace %s", nsName) + + sec := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: tlsCredentialSecret, Namespace: nsName}, + Type: corev1.SecretTypeOpaque, + StringData: map[string]string{ + "username": fixtures.DefaultCredentialUsername, + "password": fixtures.DefaultCredentialPassword, + }, + } + Expect(createIdempotent(ctx, c, sec)).To(Succeed(), "create credential secret") + + vars := map[string]string{ + "STORAGE_SIZE": envDefault("E2E_STORAGE_SIZE", tlsDefaultStorageSize), + "STORAGE_CLASS": envDefault("E2E_STORAGE_CLASS", tlsDefaultStorageCls), + "DOCUMENTDB_IMAGE": envDefault("DOCUMENTDB_IMAGE", tlsDefaultDDBImage), + "GATEWAY_IMAGE": envDefault("GATEWAY_IMAGE", tlsDefaultGatewayImage), + "CREDENTIAL_SECRET": tlsCredentialSecret, + "INSTANCES": "1", + "EXPOSURE_TYPE": "ClusterIP", + "LOG_LEVEL": "info", + } + for k, v := range extraVars { + vars[k] = v + } + + dd, err := ddbutil.Create(ctx, c, nsName, tlsDocumentDBName, ddbutil.CreateOptions{ + Base: "documentdb", + Mixins: []string{mixin}, + Vars: vars, + ManifestsRoot: manifestsRoot(), + }) + Expect(err).NotTo(HaveOccurred(), "render/create documentdb with mixin %q", mixin) + + DeferCleanup(func(ctx SpecContext) { + // Best-effort namespace deletion — this also garbage-collects + // the DocumentDB CR and any child objects via ownerRefs. + _ = c.Delete(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: nsName}}) + }) + + key := types.NamespacedName{Namespace: nsName, Name: tlsDocumentDBName} + Expect(ddbutil.WaitHealthy(ctx, c, key, timeouts.For(timeouts.DocumentDBReady))). + To(Succeed(), "documentdb did not become healthy within %s", timeouts.For(timeouts.DocumentDBReady)) + + return &clusterSetup{NamespaceName: nsName, DD: dd} +} + +// openGatewayForward opens a port-forward to the gateway Service of +// dd on a locally-reserved port and returns (host, port, stop). The +// caller defers stop; the host is always "127.0.0.1". +func openGatewayForward(ctx context.Context, dd *previewv1.DocumentDB) (string, string, func()) { + GinkgoHelper() + port := pickFreeLocalPort() + stop, err := portforward.Open(ctx, e2e.SuiteEnv(), dd, port) + Expect(err).NotTo(HaveOccurred(), "open port-forward to gateway service") + // Give the forwarder a beat to bind the local listener before + // the first connect attempt on slow CI nodes. + time.Sleep(250 * time.Millisecond) + return "127.0.0.1", fmt.Sprintf("%d", port), stop +} + +// pickFreeLocalPort binds :0 to discover an unused TCP port, closes +// the listener, and returns the port. A narrow race exists between +// close and the forwarder's bind; it matches how controller-runtime +// envtest picks its local API server port and is benign on CI hosts +// without adversarial workloads. +func pickFreeLocalPort() int { + GinkgoHelper() + l, err := net.Listen("tcp", "127.0.0.1:0") + Expect(err).NotTo(HaveOccurred(), "reserve a free local TCP port") + addr := l.Addr().(*net.TCPAddr).Port + _ = l.Close() + return addr +} + +// createIdempotent wraps c.Create so tests that re-enter on retry +// don't trip over AlreadyExists. +func createIdempotent(ctx context.Context, c client.Client, obj client.Object) error { + if err := c.Create(ctx, obj); err != nil && !apierrors.IsAlreadyExists(err) { + return err + } + return nil +} + +// envDefault returns os.Getenv(k) when set, otherwise def. +func envDefault(k, def string) string { + if v := os.Getenv(k); v != "" { + return v + } + return def +} + +// manifestsRoot returns the absolute path of the shared manifests +// directory. Uses runtime.Caller so go test invocations from any +// working directory still find the templates. +func manifestsRoot() string { + _, thisFile, _, ok := runtime.Caller(0) + if !ok { + return filepath.Join(".", "..", "..", "manifests") + } + // test/e2e/tests/tls/ -> test/e2e/manifests + return filepath.Join(filepath.Dir(thisFile), "..", "..", "manifests") +} diff --git a/test/e2e/tests/tls/tls_certmanager_test.go b/test/e2e/tests/tls/tls_certmanager_test.go new file mode 100644 index 00000000..6106316a --- /dev/null +++ b/test/e2e/tests/tls/tls_certmanager_test.go @@ -0,0 +1,251 @@ +package tls + +import ( + "bytes" + "context" + "crypto/x509" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + + "github.com/documentdb/documentdb-operator/test/e2e" + ddbutil "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" + mongohelper "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/namespaces" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +// CertManager mode delegates certificate issuance to cert-manager via +// an IssuerRef on the DocumentDB CR. This spec creates a minimal +// self-signed Issuer in the test namespace, points the CR at it, and +// verifies the gateway serves a TLS connection that validates against +// the CA material cert-manager stored in the issued Secret. This +// matters because InsecureSkipVerify would mask missing CA wiring; the +// real invariant the operator promises is "the secret named in +// status.tls.secretName contains a chain that the gateway serves". +// +// The spec is skipped automatically when cert-manager is not installed +// on the target cluster, detected by the absence of the Issuer CRD. +var _ = Describe("DocumentDB TLS — cert-manager", + Label(e2e.TLSLabel, e2e.NeedsCertManagerLabel), e2e.MediumLevelLabel, + func() { + BeforeEach(func() { e2e.SkipUnlessLevel(e2e.Medium) }) + + It("provisions certificates through a cert-manager Issuer", func(sctx SpecContext) { + ctx, cancel := context.WithTimeout(sctx, 10*time.Minute) + defer cancel() + + env := e2e.SuiteEnv() + Expect(env).NotTo(BeNil(), "suite env not initialised") + + skipIfCertManagerMissing(ctx) + + // Pre-create the namespace and a self-signed Issuer in it + // so the gateway reconcile can resolve the IssuerRef on + // its first pass. provisionCluster treats the namespace + // as idempotent and reuses it. + nsName := namespaces.NamespaceForSpec(e2e.TLSLabel) + Expect(createIdempotent(ctx, env.Client, + &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: nsName}})). + To(Succeed(), "create namespace %s", nsName) + + issuerName := "tls-e2e-selfsigned" + issuer := &unstructured.Unstructured{} + issuer.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "cert-manager.io", Version: "v1", Kind: "Issuer", + }) + issuer.SetName(issuerName) + issuer.SetNamespace(nsName) + // spec.selfSigned is an empty object per cert-manager schema. + Expect(unstructured.SetNestedMap(issuer.Object, map[string]any{}, + "spec", "selfSigned")).To(Succeed(), "set spec.selfSigned") + Expect(createIdempotent(ctx, env.Client, issuer)). + To(Succeed(), "create selfSigned Issuer") + + cluster := provisionCluster(ctx, env.Client, e2e.TLSLabel, + "tls_certmanager", map[string]string{ + "ISSUER_NAME": issuerName, + "ISSUER_KIND": "Issuer", + }) + Expect(cluster.NamespaceName).To(Equal(nsName), + "provisionCluster must reuse the pre-created namespace") + + key := types.NamespacedName{Namespace: cluster.NamespaceName, Name: cluster.DD.Name} + var tlsSecretName string + Eventually(func(g Gomega) bool { + dd, err := ddbutil.Get(ctx, env.Client, key) + g.Expect(err).NotTo(HaveOccurred()) + if dd.Status.TLS == nil { + return false + } + tlsSecretName = dd.Status.TLS.SecretName + return dd.Status.TLS.Ready + }, timeouts.For(timeouts.DocumentDBReady), timeouts.PollInterval(timeouts.DocumentDBReady)). + Should(BeTrue(), "status.tls.ready did not flip true with cert-manager issuer") + Expect(tlsSecretName).NotTo(BeEmpty(), + "status.tls.secretName must be populated once ready") + + // Read the cert-manager-issued secret and extract the CA + // (ca.crt for self-signed issuer; fall back to tls.crt + // when the issuer didn't populate ca.crt because the + // self-signed issuer doubles as its own CA). + caPEM := readCAFromSecret(ctx, cluster.NamespaceName, tlsSecretName) + + host, port, stop := openGatewayForward(ctx, cluster.DD) + defer stop() + + connectCtx, cancelConnect := context.WithTimeout(ctx, timeouts.For(timeouts.MongoConnect)) + defer cancelConnect() + + pool := x509.NewCertPool() + Expect(pool.AppendCertsFromPEM(caPEM)). + To(BeTrue(), "parse CA PEM from cert-manager secret") + + // The gateway certificate is issued for the Service DNS + // name; override SNI to match one of its SANs so + // hostname-verification through the 127.0.0.1 forward + // succeeds. Keep the primary Service FQDN matching + // mixins/tls_certmanager issue. + sni := "documentdb-service-" + tlsDocumentDBName + "." + cluster.NamespaceName + ".svc" + + client, err := mongohelper.NewClient(connectCtx, mongohelper.ClientOptions{ + Host: host, + Port: port, + User: tlsCredentialUser, + Password: tlsCredentialPassword, + TLS: true, + RootCAs: pool, + ServerName: sni, + }) + Expect(err).NotTo(HaveOccurred(), "TLS connect via cert-manager issuer") + defer func() { _ = client.Disconnect(ctx) }() + + Eventually(func() error { + return mongohelper.Ping(connectCtx, client) + }, timeouts.For(timeouts.MongoConnect), timeouts.PollInterval(timeouts.MongoConnect)). + Should(Succeed(), "ping via cert-manager-issued cert should succeed with CA verification") + + // --- Renewal check --- + // Force cert-manager to re-issue the Certificate by + // deleting the generated Secret; cert-manager recreates + // it with a fresh tls.crt. With the self-signed Issuer + // used here, a new leaf + new CA are produced on every + // issuance, so the old CA pool will NOT validate the + // new leaf — proving the gateway actually picked up the + // reissued material. If the gateway pinned the initial + // cert in memory, this ping would fail with a bad + // certificate error. + By("forcing cert-manager to reissue the gateway Secret") + origSec := &corev1.Secret{} + Expect(env.Client.Get(ctx, types.NamespacedName{ + Namespace: cluster.NamespaceName, Name: tlsSecretName, + }, origSec)).To(Succeed(), "read TLS secret before deletion") + origCrt := bytes.Clone(origSec.Data[corev1.TLSCertKey]) + Expect(env.Client.Delete(ctx, origSec)).To(Succeed(), + "delete TLS secret to trigger cert-manager renewal") + + // Wait for cert-manager to recreate the secret with a + // different tls.crt. Using the DocumentDBReady budget + // here because cert-manager reissue latency is dominated + // by issuer controller scheduling, not mongo connect. + By("waiting for cert-manager to reissue the TLS Secret with a new tls.crt") + Eventually(func(g Gomega) { + sec := &corev1.Secret{} + g.Expect(env.Client.Get(ctx, types.NamespacedName{ + Namespace: cluster.NamespaceName, Name: tlsSecretName, + }, sec)).To(Succeed()) + g.Expect(sec.Data[corev1.TLSCertKey]).NotTo(BeEmpty(), + "reissued secret must carry tls.crt") + g.Expect(sec.Data[corev1.TLSCertKey]).NotTo(Equal(origCrt), + "tls.crt must differ after reissue") + }, timeouts.For(timeouts.DocumentDBReady), timeouts.PollInterval(timeouts.DocumentDBReady)). + Should(Succeed(), "cert-manager did not reissue TLS secret") + + // Reconnect with the NEW CA and ping; Eventually gives + // the gateway a window to notice the remounted cert. + // Each Eventually attempt gets its own bounded context so + // the per-attempt budget does not collapse across retries + // — otherwise the first iteration's NewClient could burn + // the whole MongoConnect window, leaving no time for the + // gateway to actually pick up the reissued material. + By("reconnecting via the renewed CA and pinging through the gateway") + newCA := readCAFromSecret(ctx, cluster.NamespaceName, tlsSecretName) + newPool := x509.NewCertPool() + Expect(newPool.AppendCertsFromPEM(newCA)). + To(BeTrue(), "parse renewed CA PEM") + + Eventually(func(g Gomega) { + attemptCtx, cancelAttempt := context.WithTimeout(ctx, timeouts.For(timeouts.MongoConnect)) + defer cancelAttempt() + client2, err := mongohelper.NewClient(attemptCtx, mongohelper.ClientOptions{ + Host: host, + Port: port, + User: tlsCredentialUser, + Password: tlsCredentialPassword, + TLS: true, + RootCAs: newPool, + ServerName: sni, + }) + g.Expect(err).NotTo(HaveOccurred(), "reconnect with renewed CA") + defer func() { _ = client2.Disconnect(attemptCtx) }() + g.Expect(mongohelper.Ping(attemptCtx, client2)).To(Succeed(), + "ping via renewed cert should succeed") + }, timeouts.For(timeouts.DocumentDBReady), timeouts.PollInterval(timeouts.MongoConnect)). + Should(Succeed(), "gateway did not start serving the renewed cert (or reconnect kept failing)") + }) + }, +) + +// readCAFromSecret fetches the issued TLS secret and returns the CA +// bundle bytes. Cert-manager's self-signed Issuer populates ca.crt; +// some issuer types leave it empty and rely on tls.crt being a +// self-contained self-signed leaf, so we fall back to tls.crt when +// ca.crt is missing or empty. +func readCAFromSecret(ctx context.Context, ns, name string) []byte { + GinkgoHelper() + env := e2e.SuiteEnv() + sec := &corev1.Secret{} + Expect(env.Client.Get(ctx, types.NamespacedName{Namespace: ns, Name: name}, sec)). + To(Succeed(), "get issued TLS secret %s/%s", ns, name) + if ca := sec.Data[corev1.ServiceAccountRootCAKey]; len(ca) > 0 { + return ca + } + if crt := sec.Data[corev1.TLSCertKey]; len(crt) > 0 { + return crt + } + Fail("issued TLS secret " + ns + "/" + name + " contains neither ca.crt nor tls.crt") + return nil +} + +// skipIfCertManagerMissing probes for the cert-manager Issuer CRD via +// a no-op List on the v1 kind and calls Skip when the resource is not +// registered. Using a discovery-driven List avoids pulling in the +// apiextensions client for a single check. +func skipIfCertManagerMissing(ctx context.Context) { + GinkgoHelper() + env := e2e.SuiteEnv() + list := &unstructured.UnstructuredList{} + list.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "cert-manager.io", Version: "v1", Kind: "IssuerList", + }) + err := env.Client.List(ctx, list) + if err == nil { + return + } + // apimeta.IsNoMatchError matches the REST-mapper error when the + // CRD is not registered; NotFound covers servers that return 404 + // on the discovery round-trip. + if apimeta.IsNoMatchError(err) || apierrors.IsNotFound(err) { + Skip("cert-manager is not installed on the target cluster") + } + Expect(err).NotTo(HaveOccurred(), "unexpected error probing for cert-manager") +} diff --git a/test/e2e/tests/tls/tls_disabled_test.go b/test/e2e/tests/tls/tls_disabled_test.go new file mode 100644 index 00000000..5b5a370b --- /dev/null +++ b/test/e2e/tests/tls/tls_disabled_test.go @@ -0,0 +1,57 @@ +package tls + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/e2e" + mongohelper "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +// TLS-disabled mode corresponds to spec.tls.gateway.mode=Disabled. +// The gateway still listens but accepts plain-text mongo wire +// protocol. This spec verifies the happy-path: a freshly-created +// DocumentDB with TLS disabled accepts an unencrypted connection +// from the mongo driver. +var _ = Describe("DocumentDB TLS — disabled", + Label(e2e.TLSLabel), e2e.MediumLevelLabel, + func() { + BeforeEach(func() { e2e.SkipUnlessLevel(e2e.Medium) }) + + It("accepts plaintext mongo connections", func(sctx SpecContext) { + ctx, cancel := context.WithTimeout(sctx, 10*time.Minute) + defer cancel() + + env := e2e.SuiteEnv() + Expect(env).NotTo(BeNil(), "suite env not initialised") + + cluster := provisionCluster(ctx, env.Client, e2e.TLSLabel, + "tls_disabled", nil) + + host, port, stop := openGatewayForward(ctx, cluster.DD) + defer stop() + + connectCtx, cancelConnect := context.WithTimeout(ctx, timeouts.For(timeouts.MongoConnect)) + defer cancelConnect() + + client, err := mongohelper.NewClient(connectCtx, mongohelper.ClientOptions{ + Host: host, + Port: port, + User: tlsCredentialUser, + Password: tlsCredentialPassword, + TLS: false, + }) + Expect(err).NotTo(HaveOccurred(), "connect to gateway without TLS") + defer func() { _ = client.Disconnect(ctx) }() + + Eventually(func() error { + return mongohelper.Ping(connectCtx, client) + }, timeouts.For(timeouts.MongoConnect), timeouts.PollInterval(timeouts.MongoConnect)). + Should(Succeed(), "plaintext ping should succeed when TLS is disabled") + }) + }, +) diff --git a/test/e2e/tests/tls/tls_provided_test.go b/test/e2e/tests/tls/tls_provided_test.go new file mode 100644 index 00000000..b1958dca --- /dev/null +++ b/test/e2e/tests/tls/tls_provided_test.go @@ -0,0 +1,113 @@ +package tls + +import ( + "context" + "crypto/x509" + "net" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/documentdb/documentdb-operator/test/e2e" + mongohelper "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/namespaces" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/tlscerts" +) + +// Provided mode points the gateway at a user-supplied kubernetes.io/tls +// Secret that contains the full certificate chain and private key. +// This spec mints a throwaway CA + server cert with the tlscerts +// helper, materialises it as a Secret with the canonical data keys +// (tls.crt, tls.key, ca.crt), wires the DocumentDB CR at it, and +// verifies a TLS mongo connection succeeds while validating the server +// certificate against the locally generated CA. +// +// Because the client connects through a port-forward (SNI = 127.0.0.1), +// we explicitly override ServerName to "localhost" — one of the SANs +// baked into the issued server cert — so hostname verification passes. +// The invariants covered: (a) operator accepts the Provided Secret +// reference, (b) the gateway serves exactly the cert we handed it, and +// (c) the cert's chain validates against the CA bytes we planted. +var _ = Describe("DocumentDB TLS — provided", + Label(e2e.TLSLabel), e2e.MediumLevelLabel, + func() { + BeforeEach(func() { e2e.SkipUnlessLevel(e2e.Medium) }) + + It("uses a user-provided TLS secret", func(sctx SpecContext) { + ctx, cancel := context.WithTimeout(sctx, 10*time.Minute) + defer cancel() + + env := e2e.SuiteEnv() + Expect(env).NotTo(BeNil(), "suite env not initialised") + + nsName := namespaces.NamespaceForSpec(e2e.TLSLabel) + Expect(createIdempotent(ctx, env.Client, + &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: nsName}})). + To(Succeed(), "create namespace %s", nsName) + + bundle, err := tlscerts.Generate(tlscerts.GenerateOptions{ + CommonName: "documentdb-e2e", + DNSNames: []string{ + "localhost", + "documentdb-service-" + tlsDocumentDBName, + "documentdb-service-" + tlsDocumentDBName + "." + nsName + ".svc", + "documentdb-service-" + tlsDocumentDBName + "." + nsName + ".svc.cluster.local", + }, + IPAddresses: []net.IP{net.ParseIP("127.0.0.1")}, + Validity: 1 * time.Hour, + }) + Expect(err).NotTo(HaveOccurred(), "generate TLS bundle") + + secretName := "tls-e2e-provided-cert" + tlsSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: secretName, Namespace: nsName}, + Type: corev1.SecretTypeTLS, + Data: map[string][]byte{ + corev1.TLSCertKey: bundle.ServerCertPEM, + corev1.TLSPrivateKeyKey: bundle.ServerKeyPEM, + corev1.ServiceAccountRootCAKey: bundle.CACertPEM, // "ca.crt" + }, + } + Expect(createIdempotent(ctx, env.Client, tlsSecret)). + To(Succeed(), "create provided TLS secret") + + cluster := provisionCluster(ctx, env.Client, e2e.TLSLabel, + "tls_provided", map[string]string{ + "TLS_SECRET_NAME": secretName, + }) + Expect(cluster.NamespaceName).To(Equal(nsName)) + + host, port, stop := openGatewayForward(ctx, cluster.DD) + defer stop() + + connectCtx, cancelConnect := context.WithTimeout(ctx, timeouts.For(timeouts.MongoConnect)) + defer cancelConnect() + + pool := x509.NewCertPool() + Expect(pool.AppendCertsFromPEM(bundle.CACertPEM)). + To(BeTrue(), "parse self-minted CA PEM") + + client, err := mongohelper.NewClient(connectCtx, mongohelper.ClientOptions{ + Host: host, + Port: port, + User: tlsCredentialUser, + Password: tlsCredentialPassword, + TLS: true, + RootCAs: pool, + ServerName: "localhost", // matches a SAN in the issued server cert + }) + Expect(err).NotTo(HaveOccurred(), "TLS connect with provided cert") + defer func() { _ = client.Disconnect(ctx) }() + + Eventually(func() error { + return mongohelper.Ping(connectCtx, client) + }, timeouts.For(timeouts.MongoConnect), timeouts.PollInterval(timeouts.MongoConnect)). + Should(Succeed(), "ping via provided cert should succeed under CA verification") + }) + }, +) diff --git a/test/e2e/tests/tls/tls_selfsigned_test.go b/test/e2e/tests/tls/tls_selfsigned_test.go new file mode 100644 index 00000000..638e6870 --- /dev/null +++ b/test/e2e/tests/tls/tls_selfsigned_test.go @@ -0,0 +1,89 @@ +package tls + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "k8s.io/apimachinery/pkg/types" + + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/assertions" + ddbutil "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" + mongohelper "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +// Self-signed mode corresponds to spec.tls.gateway.mode=SelfSigned. +// The operator mints an in-cluster CA and server certificate and +// projects them into a per-DocumentDB Secret. Clients outside the +// cluster can't practically obtain that CA, so the spec connects +// with InsecureSkipVerify=true — the goal here is to prove that +// enabling TLS doesn't break the happy path, not to validate the +// chain. +var _ = Describe("DocumentDB TLS — self-signed", + Label(e2e.TLSLabel), e2e.MediumLevelLabel, + func() { + BeforeEach(func() { e2e.SkipUnlessLevel(e2e.Medium) }) + + It("deploys with self-signed certs and accepts TLS connections", func(sctx SpecContext) { + ctx, cancel := context.WithTimeout(sctx, 10*time.Minute) + defer cancel() + + env := e2e.SuiteEnv() + Expect(env).NotTo(BeNil(), "suite env not initialised") + + cluster := provisionCluster(ctx, env.Client, e2e.TLSLabel, + "tls_selfsigned", nil) + + // Wait for the operator-published TLS status to name a + // secret and advertise Ready. The secret name is chosen + // by the operator; we don't assert a specific value — we + // only fetch whatever the status reports. + key := types.NamespacedName{Namespace: cluster.NamespaceName, Name: cluster.DD.Name} + Eventually(func(g Gomega) string { + dd, err := ddbutil.Get(ctx, env.Client, key) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(dd.Status.TLS).NotTo(BeNil(), "status.tls not populated yet") + g.Expect(dd.Status.TLS.Ready).To(BeTrue(), "status.tls.ready false") + return dd.Status.TLS.SecretName + }, timeouts.For(timeouts.DocumentDBReady), timeouts.PollInterval(timeouts.DocumentDBReady)). + ShouldNot(BeEmpty(), "operator did not publish TLS secret name") + + // Assert the projected secret looks like a TLS secret. + Eventually(func() error { + dd, err := ddbutil.Get(ctx, env.Client, key) + if err != nil { + return err + } + return assertions.AssertTLSSecretReady(ctx, env.Client, + cluster.NamespaceName, dd.Status.TLS.SecretName)() + }, timeouts.For(timeouts.DocumentDBReady), timeouts.PollInterval(timeouts.DocumentDBReady)). + Should(Succeed()) + + host, port, stop := openGatewayForward(ctx, cluster.DD) + defer stop() + + connectCtx, cancelConnect := context.WithTimeout(ctx, timeouts.For(timeouts.MongoConnect)) + defer cancelConnect() + + client, err := mongohelper.NewClient(connectCtx, mongohelper.ClientOptions{ + Host: host, + Port: port, + User: tlsCredentialUser, + Password: tlsCredentialPassword, + TLS: true, + TLSInsecure: true, + }) + Expect(err).NotTo(HaveOccurred(), "connect with insecure TLS") + defer func() { _ = client.Disconnect(ctx) }() + + Eventually(func() error { + return mongohelper.Ping(connectCtx, client) + }, timeouts.For(timeouts.MongoConnect), timeouts.PollInterval(timeouts.MongoConnect)). + Should(Succeed(), "TLS ping with insecure verify should succeed") + }) + }, +) diff --git a/test/e2e/tests/tls/tls_suite_test.go b/test/e2e/tests/tls/tls_suite_test.go new file mode 100644 index 00000000..d05e13aa --- /dev/null +++ b/test/e2e/tests/tls/tls_suite_test.go @@ -0,0 +1,56 @@ +// Package tls hosts the DocumentDB E2E tls area. See +// docs/designs/e2e-test-suite.md for the spec catalog. This file is +// the Ginkgo root for the area binary and shares bootstrap with the +// other area binaries via the exported helpers in package e2e. +package tls + +import ( + "context" + "fmt" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/e2e" +) + +const operatorReadyTimeout = 2 * time.Minute + +func TestTLS(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "DocumentDB E2E - TLS", Label(e2e.TLSLabel)) +} + +var _ = SynchronizedBeforeSuite( + func(ctx SpecContext) []byte { + if err := e2e.SetupSuite(ctx, operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("tls bootstrap: %v", err)) + } + return []byte{} + }, + func(_ SpecContext, _ []byte) { + if err := e2e.SetupSuite(context.Background(), operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("tls worker bootstrap: %v", err)) + } + }, +) + +var _ = SynchronizedAfterSuite( + func(ctx SpecContext) { + if err := e2e.TeardownSuite(ctx); err != nil { + fmt.Fprintf(GinkgoWriter, "tls teardown: %v\n", err) + } + }, + func(_ SpecContext) {}, +) + +// BeforeEach in this area aborts the spec if the operator pod has +// drifted since SetupSuite (UID/name/restart-count change). Area +// tests/upgrade/ intentionally omits this hook because operator +// restarts are part of its scenario. +var _ = BeforeEach(func() { +Expect(e2e.CheckOperatorUnchanged()).To(Succeed(), +"operator health check failed — a previous spec or reconciler likely restarted the operator") +}) diff --git a/test/e2e/tests/upgrade/helpers_test.go b/test/e2e/tests/upgrade/helpers_test.go new file mode 100644 index 00000000..1cda61ca --- /dev/null +++ b/test/e2e/tests/upgrade/helpers_test.go @@ -0,0 +1,163 @@ +package upgrade + +import ( + "context" + "os" + "os/exec" + "path/filepath" + "runtime" + "time" + + . "github.com/onsi/ginkgo/v2" //nolint:revive + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// Environment variables that gate and parameterize the upgrade area. +const ( + envEnable = "E2E_UPGRADE" + envPreviousChart = "E2E_UPGRADE_PREVIOUS_CHART" + envPreviousVersion = "E2E_UPGRADE_PREVIOUS_VERSION" + envCurrentChart = "E2E_UPGRADE_CURRENT_CHART" + envCurrentVersion = "E2E_UPGRADE_CURRENT_VERSION" + envReleaseName = "E2E_UPGRADE_RELEASE" + envOperatorNamespace = "E2E_UPGRADE_OPERATOR_NS" + + envOldDocumentDBImage = "E2E_UPGRADE_OLD_DOCUMENTDB_IMAGE" + envNewDocumentDBImage = "E2E_UPGRADE_NEW_DOCUMENTDB_IMAGE" + + // Optional gateway image overrides for the image-upgrade spec. + // When unset the spec patches only spec.documentDBImage and leaves + // spec.gatewayImage as-is (operator uses its default gateway). The + // gateway image has an independent release cadence from the + // extension image; setting these to the same value as the + // documentdb env vars is INCORRECT under the layered-image + // architecture (CNPG pg18 + extension image-library + gateway + // sidecar). + envOldGatewayImage = "E2E_UPGRADE_OLD_GATEWAY_IMAGE" + envNewGatewayImage = "E2E_UPGRADE_NEW_GATEWAY_IMAGE" +) + +// Defaults applied when the env vars above are not set. The chart +// references intentionally fail-closed — specs skip themselves instead +// of installing a hard-coded "latest" chart from the internet. +const ( + defaultReleaseName = "documentdb-operator" + defaultOperatorNamespace = "documentdb-operator" + + controlPlaneUpgradeTimeout = 15 * time.Minute + imageRolloutTimeout = 15 * time.Minute +) + +// skipUnlessUpgradeEnabled skips the current spec when the upgrade +// area is not explicitly enabled. Called from BeforeEach in every +// spec below so Ginkgo reports a clear "skipped" message. +func skipUnlessUpgradeEnabled() { + if os.Getenv(envEnable) != "1" { + Skip("upgrade specs require E2E_UPGRADE=1") + } + if _, err := exec.LookPath("helm"); err != nil { + Skip("upgrade specs require the `helm` CLI on PATH: " + err.Error()) + } +} + +// requireEnv returns the value of name, or Skip()s the spec when the +// variable is unset. Used for chart path / image references that must +// be supplied by the CI job — specs fail-closed rather than guess. +func requireEnv(name, reason string) string { + v := os.Getenv(name) + if v == "" { + Skip("upgrade spec skipped: " + name + " is required (" + reason + ")") + } + return v +} + +// envOr returns the value of name, or fallback when unset. +func envOr(name, fallback string) string { + if v := os.Getenv(name); v != "" { + return v + } + return fallback +} + +// credentialSecretName is the default secret populated by createCredentialSecret +// and consumed by mongo.NewFromDocumentDB / the DocumentDB CR. +const credentialSecretName = "documentdb-credentials" + +// baseVars returns the envsubst variable map for the base DocumentDB +// template. It mirrors the backup-area helper so upgrade specs share +// the same manifests/base/documentdb.yaml.template layout. The +// DOCUMENTDB_IMAGE / GATEWAY_IMAGE fields default to empty (operator +// picks layered defaults), and can be overridden via env vars — +// image-upgrade specs further override them per-call via extraVars. +func baseVars(name, ns, size string) map[string]string { + // Empty defaults → operator composes CNPG pg18 + extension + gateway. + // Do NOT fall back GATEWAY_IMAGE to DOCUMENTDB_IMAGE: the gateway is + // an independent sidecar image, not a monolithic build. + ddImage := os.Getenv("DOCUMENTDB_IMAGE") + gwImage := os.Getenv("GATEWAY_IMAGE") + sc := "standard" + if v := os.Getenv("E2E_STORAGE_CLASS"); v != "" { + sc = v + } + if size == "" { + size = "2Gi" + } + return map[string]string{ + "NAME": name, + "NAMESPACE": ns, + "INSTANCES": "1", + "STORAGE_SIZE": size, + "STORAGE_CLASS": sc, + "DOCUMENTDB_IMAGE": ddImage, + "GATEWAY_IMAGE": gwImage, + "CREDENTIAL_SECRET": credentialSecretName, + "EXPOSURE_TYPE": "ClusterIP", + "LOG_LEVEL": "info", + } +} + +// manifestsRoot returns the absolute path to test/e2e/manifests, used +// as ManifestsRoot for documentdb.Create so rendering is robust to +// the current working directory. +func manifestsRoot() string { + _, thisFile, _, ok := runtime.Caller(0) + if !ok { + Fail("runtime.Caller failed — cannot locate test/e2e/manifests") + } + // this file: test/e2e/tests/upgrade/helpers_test.go + // manifests: test/e2e/manifests/ + return filepath.Join(filepath.Dir(thisFile), "..", "..", "manifests") +} + +// createNamespace creates ns (if missing) and registers DeferCleanup +// to delete it at spec teardown. +func createNamespace(ctx context.Context, c client.Client, ns string) { + obj := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: ns}} + err := c.Create(ctx, obj) + if err != nil && !apierrors.IsAlreadyExists(err) { + Fail("create namespace " + ns + ": " + err.Error()) + } + DeferCleanup(func(ctx SpecContext) { + _ = c.Delete(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: ns}}) + }) +} + +// createCredentialSecret seeds the DocumentDB credential secret in ns. +func createCredentialSecret(ctx context.Context, c client.Client, ns string) { + sec := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: credentialSecretName, Namespace: ns}, + Type: corev1.SecretTypeOpaque, + StringData: map[string]string{ + "username": "e2e_admin", + "password": "E2eAdmin100", + }, + } + err := c.Create(ctx, sec) + if err != nil && !apierrors.IsAlreadyExists(err) { + Fail("create credential secret " + ns + "/" + credentialSecretName + ": " + err.Error()) + } +} diff --git a/test/e2e/tests/upgrade/rollback_test.go b/test/e2e/tests/upgrade/rollback_test.go new file mode 100644 index 00000000..a1bac61b --- /dev/null +++ b/test/e2e/tests/upgrade/rollback_test.go @@ -0,0 +1,42 @@ +package upgrade + +import ( + . "github.com/onsi/ginkgo/v2" + + "github.com/documentdb/documentdb-operator/test/e2e" +) + +// DocumentDB upgrade — rollback: skeleton for the operator-rollback +// scenario. The upgrade flow is one-directional today — there is no +// formally supported `helm rollback` story for the DocumentDB operator +// or its CRDs (CRD removal/downgrade is the hard part). The spec +// below is Pending and always skipped with a clear reason so the +// area's intent is documented but the test does not flap against an +// unimplemented feature. +// +// When rollback support lands: +// 1. Drop the Skip() below. +// 2. Replace the placeholders with: install current, seed, helm +// rollback to previous, verify CR still reads/writes. +// 3. Confirm the previous chart's CRD schema is backward-compatible +// with the data written by the current operator, or document the +// rollback boundary. +var _ = Describe("DocumentDB upgrade — rollback", + Label(e2e.UpgradeLabel, e2e.DisruptiveLabel, e2e.SlowLabel), + e2e.HighLevelLabel, + Serial, Ordered, Pending, func() { + BeforeEach(func() { + // Defense in depth: even if Pending is removed by mistake, + // keep the spec dormant until rollback is supported. + Skip("rollback support pending") + }) + + It("rolls the operator back to the previous chart without losing data", func() { + // Placeholder intent: + // 1. Install current PR chart. + // 2. Create DocumentDB + seed data. + // 3. `helm rollback` to previously-released chart version. + // 4. Assert operator becomes Ready on the old version. + // 5. Assert DocumentDB CR is still accepted and data is intact. + }) + }) diff --git a/test/e2e/tests/upgrade/upgrade_control_plane_test.go b/test/e2e/tests/upgrade/upgrade_control_plane_test.go new file mode 100644 index 00000000..0b61e151 --- /dev/null +++ b/test/e2e/tests/upgrade/upgrade_control_plane_test.go @@ -0,0 +1,137 @@ +package upgrade + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "go.mongodb.org/mongo-driver/v2/bson" + "k8s.io/apimachinery/pkg/types" + + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/assertions" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/helmop" + e2emongo "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/namespaces" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/seed" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +// DocumentDB upgrade — control plane: uninstalls the operator, installs +// a previously-released chart, deploys a DocumentDB, seeds data, then +// upgrades the chart to the PR's built chart and verifies the operator +// is healthy and the seeded data survived the bounce. +// +// Residual risk: the "previous-released chart" is NOT pinned in code. +// It must be supplied by the caller via E2E_UPGRADE_PREVIOUS_CHART and +// E2E_UPGRADE_PREVIOUS_VERSION (e.g. the chart published on +// GitHub Releases). Hard-coding "latest" here would break every time a +// new release is cut, so the spec fail-closed skips when unset. +var _ = Describe("DocumentDB upgrade — control plane", + Label(e2e.UpgradeLabel, e2e.DisruptiveLabel, e2e.SlowLabel), + e2e.HighLevelLabel, + Serial, Ordered, func() { + const ( + ddName = "upgrade-cp" + dbName = "upgrade_cp" + collName = "seed" + ) + var ( + releaseName string + operatorNs string + previousChart string + previousVer string + currentChart string + currentVer string + operatorCtx context.Context + operatorCancel context.CancelFunc + ) + + BeforeAll(func() { + skipUnlessUpgradeEnabled() + releaseName = envOr(envReleaseName, defaultReleaseName) + operatorNs = envOr(envOperatorNamespace, defaultOperatorNamespace) + previousChart = requireEnv(envPreviousChart, + "chart ref to the previous released operator chart (e.g. documentdb/documentdb-operator or a local tgz)") + previousVer = requireEnv(envPreviousVersion, + "semver of the previous released chart; see GitHub Releases or the published Helm index") + currentChart = requireEnv(envCurrentChart, + "chart ref to the PR's built chart (path to the unpacked chart dir or packaged tgz)") + currentVer = envOr(envCurrentVersion, "") + }) + + BeforeEach(func() { + e2e.SkipUnlessLevel(e2e.High) + operatorCtx, operatorCancel = context.WithTimeout(context.Background(), controlPlaneUpgradeTimeout) + DeferCleanup(func() { operatorCancel() }) + }) + + It("upgrades operator from previous released chart to current and retains data", func() { + env := e2e.SuiteEnv() + Expect(env).NotTo(BeNil(), "SuiteEnv must be initialized by SetupSuite") + c := env.Client + + By("uninstalling any pre-existing operator release (idempotent)") + Expect(helmop.Uninstall(operatorCtx, releaseName, operatorNs)).To(Succeed()) + + By("installing the previous released operator chart") + Expect(helmop.Install(operatorCtx, releaseName, operatorNs, previousChart, previousVer, nil)). + To(Succeed(), "install previous chart %s@%s", previousChart, previousVer) + Expect(helmop.WaitOperatorReady(operatorCtx, env, operatorNs, 3*time.Minute)).To(Succeed()) + + By("creating a DocumentDB on the previous operator") + ns := namespaces.NamespaceForSpec(e2e.UpgradeLabel) + createNamespace(operatorCtx, c, ns) + createCredentialSecret(operatorCtx, c, ns) + + dd, err := documentdb.Create(operatorCtx, c, ns, ddName, documentdb.CreateOptions{ + Base: "documentdb", + Vars: baseVars(ddName, ns, "2Gi"), + ManifestsRoot: manifestsRoot(), + }) + Expect(err).NotTo(HaveOccurred(), "create DocumentDB %s/%s", ns, ddName) + DeferCleanup(func(ctx SpecContext) { + _ = documentdb.Delete(ctx, c, dd, 3*time.Minute) + }) + + key := types.NamespacedName{Namespace: ns, Name: ddName} + Eventually(assertions.AssertDocumentDBReady(operatorCtx, c, key), + timeouts.For(timeouts.DocumentDBReady), + timeouts.PollInterval(timeouts.DocumentDBReady), + ).Should(Succeed(), "DocumentDB did not reach Ready under previous operator") + + By("seeding data on the previous operator") + docs := seed.SmallDataset() + handle, err := e2emongo.NewFromDocumentDB(operatorCtx, env, ns, ddName) + Expect(err).NotTo(HaveOccurred(), "connect to DocumentDB gateway") + inserted, err := e2emongo.Seed(operatorCtx, handle.Client(), dbName, collName, docs) + Expect(err).NotTo(HaveOccurred(), "seed %s.%s", dbName, collName) + Expect(inserted).To(Equal(seed.SmallDatasetSize)) + // Explicit close before the helm upgrade: the port-forward + // goroutine must not outlive the operator bounce. + Expect(handle.Close(operatorCtx)).To(Succeed()) + + By("upgrading the chart to the PR's built version") + Expect(helmop.Upgrade(operatorCtx, releaseName, operatorNs, currentChart, currentVer, nil)). + To(Succeed(), "upgrade to current chart %s@%s", currentChart, currentVer) + Expect(helmop.WaitOperatorReady(operatorCtx, env, operatorNs, 5*time.Minute)).To(Succeed()) + + By("verifying the DocumentDB CR is still reconciled by the new operator") + Eventually(assertions.AssertDocumentDBReady(operatorCtx, c, key), + timeouts.For(timeouts.DocumentDBUpgrade), + timeouts.PollInterval(timeouts.DocumentDBUpgrade), + ).Should(Succeed(), "DocumentDB did not reach Ready after operator upgrade") + + By("verifying seeded data survived the operator bounce") + handle2, err := e2emongo.NewFromDocumentDB(operatorCtx, env, ns, ddName) + Expect(err).NotTo(HaveOccurred(), "reconnect to DocumentDB gateway") + DeferCleanup(func(ctx SpecContext) { _ = handle2.Close(ctx) }) + n, err := e2emongo.Count(operatorCtx, handle2.Client(), dbName, collName, bson.M{}) + Expect(err).NotTo(HaveOccurred(), "count %s.%s", dbName, collName) + Expect(n).To(Equal(int64(seed.SmallDatasetSize)), + "seeded document count changed across operator upgrade") + }) + }) diff --git a/test/e2e/tests/upgrade/upgrade_images_test.go b/test/e2e/tests/upgrade/upgrade_images_test.go new file mode 100644 index 00000000..c71a58b8 --- /dev/null +++ b/test/e2e/tests/upgrade/upgrade_images_test.go @@ -0,0 +1,214 @@ +package upgrade + +import ( + "context" + "fmt" + "os" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "go.mongodb.org/mongo-driver/v2/bson" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" + "github.com/documentdb/documentdb-operator/test/e2e" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/assertions" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/documentdb" + e2emongo "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/mongo" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/namespaces" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/seed" + "github.com/documentdb/documentdb-operator/test/e2e/pkg/e2eutils/timeouts" +) + +// DocumentDB upgrade — images: with the operator already running at +// the current version, patches the DocumentDB spec.documentDBImage +// (and spec.gatewayImage) from an old image tag to a new one and +// verifies the rollout completes + the seeded dataset is retained. +// Unlike upgrade_control_plane_test.go this does not touch the Helm +// release; it only exercises the CR-driven data-plane image upgrade +// path. +// +// Residual risk: the spec needs two image references (old/new). They +// come from E2E_UPGRADE_OLD_DOCUMENTDB_IMAGE / +// E2E_UPGRADE_NEW_DOCUMENTDB_IMAGE — there is no pinned default +// because the set of valid old→new pairs depends on the release being +// validated. +var _ = Describe("DocumentDB upgrade — images", + Label(e2e.UpgradeLabel, e2e.DisruptiveLabel, e2e.SlowLabel), + e2e.HighLevelLabel, + Serial, Ordered, func() { + const ( + ddName = "upgrade-img" + dbName = "upgrade_img" + collName = "seed" + ) + var ( + oldImage string + newImage string + oldGwImage string + newGwImage string + ctx context.Context + cancel context.CancelFunc + ) + + BeforeAll(func() { + skipUnlessUpgradeEnabled() + oldImage = requireEnv(envOldDocumentDBImage, + "DocumentDB image tag to start from (e.g. ghcr.io/microsoft/documentdb/documentdb:0.108.0)") + newImage = requireEnv(envNewDocumentDBImage, + "DocumentDB image tag to upgrade to (must be different from the old tag)") + if oldImage == newImage { + Skip("E2E_UPGRADE_OLD_DOCUMENTDB_IMAGE and E2E_UPGRADE_NEW_DOCUMENTDB_IMAGE are identical; nothing to upgrade") + } + // The gateway is an independent sidecar image; specs may + // exercise a gateway upgrade alongside the extension + // upgrade, or leave the gateway untouched. Both env vars + // must either be set together or both left empty. + oldGwImage = os.Getenv(envOldGatewayImage) + newGwImage = os.Getenv(envNewGatewayImage) + if (oldGwImage == "") != (newGwImage == "") { + Fail(fmt.Sprintf("%s and %s must be set together (or both unset)", + envOldGatewayImage, envNewGatewayImage)) + } + if oldGwImage != "" && oldGwImage == newGwImage { + Skip(envOldGatewayImage + " and " + envNewGatewayImage + " are identical; nothing to upgrade") + } + }) + + BeforeEach(func() { + e2e.SkipUnlessLevel(e2e.High) + ctx, cancel = context.WithTimeout(context.Background(), imageRolloutTimeout) + DeferCleanup(func() { cancel() }) + }) + + It("rolls DocumentDB pods to a new image and retains data", func() { + env := e2e.SuiteEnv() + Expect(env).NotTo(BeNil(), "SuiteEnv must be initialized by SetupSuite") + Expect(ctx).NotTo(BeNil(), "BeforeEach must have populated the spec context") + c := env.Client + + By("creating a DocumentDB pinned to the old image") + ns := namespaces.NamespaceForSpec(e2e.UpgradeLabel) + createNamespace(ctx, c, ns) + createCredentialSecret(ctx, c, ns) + + vars := baseVars(ddName, ns, "2Gi") + vars["DOCUMENTDB_IMAGE"] = oldImage + if oldGwImage != "" { + vars["GATEWAY_IMAGE"] = oldGwImage + } + + dd, err := documentdb.Create(ctx, c, ns, ddName, documentdb.CreateOptions{ + Base: "documentdb", + Vars: vars, + ManifestsRoot: manifestsRoot(), + }) + Expect(err).NotTo(HaveOccurred(), "create DocumentDB %s/%s", ns, ddName) + DeferCleanup(func(ctx SpecContext) { + _ = documentdb.Delete(ctx, c, dd, 3*time.Minute) + }) + + key := types.NamespacedName{Namespace: ns, Name: ddName} + Eventually(assertions.AssertDocumentDBReady(ctx, c, key), + timeouts.For(timeouts.DocumentDBReady), + timeouts.PollInterval(timeouts.DocumentDBReady), + ).Should(Succeed(), "DocumentDB did not reach Ready on oldImage=%s", oldImage) + + By("seeding data on the old image") + docs := seed.SmallDataset() + handle, err := e2emongo.NewFromDocumentDB(ctx, env, ns, ddName) + Expect(err).NotTo(HaveOccurred(), "connect to DocumentDB gateway on oldImage") + inserted, err := e2emongo.Seed(ctx, handle.Client(), dbName, collName, docs) + Expect(err).NotTo(HaveOccurred(), "seed %s.%s", dbName, collName) + Expect(inserted).To(Equal(seed.SmallDatasetSize)) + Expect(handle.Close(ctx)).To(Succeed()) + + By("patching spec.documentDBImage (and optionally gatewayImage) to the new image") + fresh, err := documentdb.Get(ctx, c, key) + Expect(err).NotTo(HaveOccurred(), "re-fetch DocumentDB before patch") + Expect(documentdb.PatchSpec(ctx, c, fresh, func(s *previewv1.DocumentDBSpec) { + s.DocumentDBImage = newImage + if newGwImage != "" { + s.GatewayImage = newGwImage + } + })).To(Succeed(), "patch DocumentDB image from %s to %s", oldImage, newImage) + + By("waiting for the CNPG-backed rollout to settle on the new image") + // Poll the CR's backing pods until every container image + // matches newImage. A transient all-pods-gone window is + // acceptable during rollout, so we require at least one + // pod AND zero pods still on oldImage. + Eventually(func() error { + return allPodsOnImage(ctx, c, ns, ddName, newImage) + }, timeouts.For(timeouts.DocumentDBUpgrade), + timeouts.PollInterval(timeouts.DocumentDBUpgrade), + ).Should(Succeed(), "pods did not roll to %s", newImage) + + Eventually(assertions.AssertDocumentDBReady(ctx, c, key), + timeouts.For(timeouts.DocumentDBUpgrade), + timeouts.PollInterval(timeouts.DocumentDBUpgrade), + ).Should(Succeed(), "DocumentDB did not reach Ready on newImage=%s", newImage) + + By("verifying data seeded before the upgrade is still reachable") + handle2, err := e2emongo.NewFromDocumentDB(ctx, env, ns, ddName) + Expect(err).NotTo(HaveOccurred(), "reconnect to DocumentDB gateway on newImage") + DeferCleanup(func(ctx SpecContext) { _ = handle2.Close(ctx) }) + n, err := e2emongo.Count(ctx, handle2.Client(), dbName, collName, bson.M{}) + Expect(err).NotTo(HaveOccurred(), "count %s.%s on newImage", dbName, collName) + Expect(n).To(Equal(int64(seed.SmallDatasetSize)), + "seeded document count changed across image upgrade") + }) + }) + +// allPodsOnImage returns nil when there is at least one Pod owned by +// the CNPG Cluster backing ddName and every container in every such +// Pod reports an image equal to want. The helper intentionally errs +// on the side of "not yet done" — missing pods, empty status, or any +// mismatch returns a non-nil error so Eventually keeps polling. +func allPodsOnImage(ctx context.Context, c client.Client, ns, ddName, want string) error { + var pods corev1.PodList + sel := labels.SelectorFromSet(labels.Set{"cnpg.io/cluster": ddName}) + if err := c.List(ctx, &pods, client.InNamespace(ns), client.MatchingLabelsSelector{Selector: sel}); err != nil { + return fmt.Errorf("list pods: %w", err) + } + if len(pods.Items) == 0 { + return fmt.Errorf("no pods yet for cluster %s/%s", ns, ddName) + } + for i := range pods.Items { + p := &pods.Items[i] + if len(p.Status.ContainerStatuses) == 0 { + return fmt.Errorf("pod %s has no container statuses yet", p.Name) + } + for j := range p.Status.ContainerStatuses { + got := p.Status.ContainerStatuses[j].Image + // Container image strings can be reported by the kubelet in + // resolved form (digest appended). Accept any image whose + // reported tag contains the requested ref; this matches the + // upgrade-verification semantics used in other areas. + if got != want && !containsImageRef(got, want) { + return fmt.Errorf("pod %s container %s image=%q, want %q", + p.Name, p.Status.ContainerStatuses[j].Name, got, want) + } + } + } + return nil +} + +// containsImageRef returns true when got references want either +// verbatim or as the repository:tag prefix of a digest-resolved form +// (e.g. "repo:tag@sha256:..."). Keeps the image-rollout assertion +// resilient to kubelets that report resolved digests. +func containsImageRef(got, want string) bool { + if got == want { + return true + } + if len(got) < len(want) { + return false + } + return got[:len(want)] == want && (len(got) == len(want) || got[len(want)] == '@') +} diff --git a/test/e2e/tests/upgrade/upgrade_suite_test.go b/test/e2e/tests/upgrade/upgrade_suite_test.go new file mode 100644 index 00000000..0fa2f150 --- /dev/null +++ b/test/e2e/tests/upgrade/upgrade_suite_test.go @@ -0,0 +1,58 @@ +// Package upgrade hosts the DocumentDB E2E upgrade area. See +// docs/designs/e2e-test-suite.md for the spec catalog. This file is +// the Ginkgo root for the area binary and shares bootstrap with the +// other area binaries via the exported helpers in package e2e. +// +// This area is DISRUPTIVE — its specs install/upgrade the operator +// itself. They are gated behind the E2E_UPGRADE=1 environment variable +// to prevent accidental local runs. They require the `helm` v3 CLI on +// PATH and must run with `ginkgo -procs=1` because they mutate the +// cluster-wide operator Deployment. +// +// Unlike every other area, tests/upgrade/ does NOT install the +// [e2e.CheckOperatorUnchanged] BeforeEach hook — operator restarts are +// part of the scenario here, not a failure mode. This exemption is +// acknowledged in pkg e2e's suite.go header comment. +package upgrade + +import ( + "context" + "fmt" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/e2e" +) + +const operatorReadyTimeout = 2 * time.Minute + +func TestUpgrade(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "DocumentDB E2E - Upgrade", Label(e2e.UpgradeLabel)) +} + +var _ = SynchronizedBeforeSuite( + func(ctx SpecContext) []byte { + if err := e2e.SetupSuite(ctx, operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("upgrade bootstrap: %v", err)) + } + return []byte{} + }, + func(_ SpecContext, _ []byte) { + if err := e2e.SetupSuite(context.Background(), operatorReadyTimeout); err != nil { + Fail(fmt.Sprintf("upgrade worker bootstrap: %v", err)) + } + }, +) + +var _ = SynchronizedAfterSuite( + func(ctx SpecContext) { + if err := e2e.TeardownSuite(ctx); err != nil { + fmt.Fprintf(GinkgoWriter, "upgrade teardown: %v\n", err) + } + }, + func(_ SpecContext) {}, +)