diff --git a/.agents/skills/sok-build/SKILL.md b/.agents/skills/sok-build/SKILL.md new file mode 100644 index 000000000..3d5c5eb61 --- /dev/null +++ b/.agents/skills/sok-build/SKILL.md @@ -0,0 +1,30 @@ +--- +name: sok-build +description: Build Splunk Operator artifacts with the repository-standard workflow. Use for local compile checks, CRD generation, and image build prep. +--- + +# SOK Build + +## Overview +Run deterministic build steps for Splunk Operator using repo-native Make targets. + +## Preconditions +- Run `sok-prerequisites` first. +- Any API or marker changes should be committed or staged before generation checks. + +## Workflow +1. If API types changed, run generation first: `make generate manifests`. +2. Run formatting and static checks: `make fmt vet`. +3. Build operator binary: `make build`. +4. If image validation is needed, build image with explicit tag: `make docker-build IMG=<registry>/<image>:<tag>`. +5. For CRD-sensitive changes, run `scripts/verify_crd.sh`. + +## Pass / Fail Criteria +- Pass: build commands exit 0 and generated artifacts are in sync. +- Fail: any build/generation command exits non-zero. + +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-build/agents/openai.yaml b/.agents/skills/sok-build/agents/openai.yaml new file mode 100644 index 000000000..056faa724 --- /dev/null +++ b/.agents/skills/sok-build/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK Build" + short_description: "Build operator artifacts" diff --git a/.agents/skills/sok-ci-fixer/SKILL.md b/.agents/skills/sok-ci-fixer/SKILL.md new file mode 100644 index 000000000..97b7b21b6 --- /dev/null +++ b/.agents/skills/sok-ci-fixer/SKILL.md @@ -0,0 +1,52 @@ +--- +name: sok-ci-fixer +description: Analyze CI failures, map them to local repro commands, and propose fixes. Use when CI logs or pipeline failures need a local reproduction and patch. 
+--- + +# SOK CI Fixer + +## Overview +Turn CI failures into a local repro and a minimal fix with a validation plan. + +## Preconditions +- CI failure context is available (failed job URL or logs). +- Local repo state matches the branch under investigation. + +## Scope +Allowed paths: +- `scripts/**` +- `test/**` +- `kuttl/**` +- `config/**` +- `api/**` +- `internal/**` +- `pkg/**` +- `Makefile`, `go.mod`, `go.sum` + +Forbidden paths: +- `vendor/**` +- `bin/**` +- `.git/**` + +If changes are needed outside the allowed paths, stop and propose a follow-up plan. + +## Workflow +1. Summarize the CI failure (job name, step, error). +2. Map to a local command (prefer `scripts/dev/pr_check.sh` or `./scripts/verify_repo.sh`). +3. Reproduce locally if possible and capture the failing output. +4. Implement the smallest safe fix. +5. Re-run the local repro command. + +## Commands +- PR gate: `scripts/dev/pr_check.sh` +- Repo verify: `./scripts/verify_repo.sh` + +## Pass / Fail Criteria +- Pass: failure is reproduced (or clearly explained), a minimal fix is validated, and regression risk is called out. +- Fail: reproduction/fix is incomplete or validation evidence is missing. + +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-ci-fixer/agents/openai.yaml b/.agents/skills/sok-ci-fixer/agents/openai.yaml new file mode 100644 index 000000000..2c12f9b2d --- /dev/null +++ b/.agents/skills/sok-ci-fixer/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK CI Fixer" + short_description: "Triage and fix CI failures" diff --git a/.agents/skills/sok-commit-pr/SKILL.md b/.agents/skills/sok-commit-pr/SKILL.md new file mode 100644 index 000000000..c30f30df7 --- /dev/null +++ b/.agents/skills/sok-commit-pr/SKILL.md @@ -0,0 +1,32 @@ +--- +name: sok-commit-pr +description: Commit changes and create/update GitHub draft PRs for Splunk Operator. Use after tests pass and the change summary is ready. 
+--- + +# SOK Commit PR + +## Overview +Apply a consistent commit and draft PR workflow aligned with splcore commit/MR discipline, adapted for GitHub. + +## Preconditions +- Local branch is based on the intended target branch. +- Required local checks are complete (at minimum `scripts/dev/pr_check.sh`). +- Git identity is configured correctly for the branch owner. + +## Workflow +1. Verify branch and working tree: `git status`, `git branch --show-current`. +2. Stage only in-scope files (`git add -p` preferred). +3. Commit with ticket-first subject and concise body. +4. Push branch: `git push -u origin <branch>`. +5. Create or update draft PR with `gh pr create --draft` (or `gh pr edit` if PR exists). +6. Ensure PR description includes: summary, tests, risks, rollback notes. + +## Pass / Fail Criteria +- Pass: commit exists on remote branch and draft PR is created/updated with required sections. +- Fail: push or PR creation/edit fails, or required PR metadata is missing. + +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-commit-pr/agents/openai.yaml b/.agents/skills/sok-commit-pr/agents/openai.yaml new file mode 100644 index 000000000..c4b3a1640 --- /dev/null +++ b/.agents/skills/sok-commit-pr/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK Commit PR" + short_description: "Commit and manage draft PR" diff --git a/.agents/skills/sok-doc-updater/SKILL.md b/.agents/skills/sok-doc-updater/SKILL.md new file mode 100644 index 000000000..e9870fe3a --- /dev/null +++ b/.agents/skills/sok-doc-updater/SKILL.md @@ -0,0 +1,47 @@ +--- +name: sok-doc-updater +description: Update Splunk Operator docs and examples for a change. Use when a change requires docs, CR examples, or user-facing guidance updates. +--- + +# SOK Doc Updater + +## Overview +Keep docs, examples, and user-facing guidance in sync with code changes. + +## Preconditions +- Product behavior change and impacted docs are known. 
+- If code changes are still pending, defer this skill until code scope is stable. + +## Scope +Allowed paths: +- `docs/**` +- `README.md` +- `config/samples/**` +- `helm-chart/**` + +Forbidden paths: +- `api/**` +- `internal/**` +- `pkg/**` +- `test/**` +- `kuttl/**` +- `bundle/**` +- `vendor/**` + +If product code changes are required, stop and hand off to the appropriate skill. + +## Workflow +1. Identify the user-facing change and affected docs. +2. Update spec fields, examples, and any compatibility notes. +3. Verify examples are consistent with current CRD schema. +4. Provide a short summary and a test/validation note if applicable. + +## Pass / Fail Criteria +- Pass: docs/examples match current behavior and schema, with follow-ups called out. +- Fail: docs are incomplete, inconsistent, or missing validation notes. + +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-doc-updater/agents/openai.yaml b/.agents/skills/sok-doc-updater/agents/openai.yaml new file mode 100644 index 000000000..f0c094142 --- /dev/null +++ b/.agents/skills/sok-doc-updater/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK Doc Updater" + short_description: "Update docs and examples" diff --git a/.agents/skills/sok-feature-scaffold/SKILL.md b/.agents/skills/sok-feature-scaffold/SKILL.md new file mode 100644 index 000000000..adaf8e341 --- /dev/null +++ b/.agents/skills/sok-feature-scaffold/SKILL.md @@ -0,0 +1,114 @@ +--- +name: sok-feature-scaffold +description: Add or change Splunk Operator behavior by introducing a new field in a CRD spec/status, wiring it into reconciliation, and updating tests and docs. Use for changes to Standalone, IndexerCluster, SearchHeadCluster, ClusterManager, LicenseManager, MonitoringConsole, or shared CRD config. Do not use for pure refactors, dependency bumps, or formatting-only changes. 
+--- + +# SOK Feature Scaffold + +## Overview +Implement CRD-driven features end-to-end with code, tests, and docs in this repository. + +## Preconditions +- Feature scope is agreed (target CRD kind, field shape, expected behavior). +- A spec/issue reference exists for non-trivial changes. +- Operator SDK/controller-gen workflow is available locally via Make targets. + +## Scope +Allowed paths: +- `api/**` +- `internal/controller/**` +- `pkg/**` +- `config/**` +- `docs/**` +- `test/**` +- `kuttl/**` +- `bundle/**` +- `helm-chart/**` +- `scripts/**` +- `Makefile`, `README.md`, `PROJECT`, `go.mod`, `go.sum` + +Forbidden paths: +- `vendor/**` +- `bin/**` +- `.git/**` + +If changes are needed outside the allowed paths, stop and propose a follow-up plan. + +## Workflow +1. Print the files you plan to change and the test commands you will run before editing. +2. Identify the target CRD kind and API version. Confirm the mapping in `PROJECT` and `docs/agent/CRD_MAP.md`. +3. Locate the API types under `api/v*/` and update the spec/status struct. +4. Add JSON tags, `omitempty` rules, and kubebuilder markers consistent with adjacent fields. +5. Update any defaulting or validation logic that applies to the new field. +6. Regenerate CRD/RBAC artifacts via the operator-sdk workflow. +7. Wire the field into reconciliation with idempotent logic. +8. Update or add unit tests, and add an integration test stub when relevant. +9. Update docs and examples to expose the new field. +10. Produce a PR-ready summary with tests and risks. + +## Implementation Details + +### 1) Find the right API types +- `PROJECT` is the source of truth for kind and version mapping. +- Use `docs/agent/CRD_MAP.md` for fast navigation to types, controllers, and enterprise logic. +- Prefer the latest stable API version (typically `api/v4`). +- Legacy kinds still use `api/v3` (legacy cluster manager and legacy license manager). +- Use `rg "type .*Spec" api -g "*_types.go"` to locate the spec struct. 
+- If the field is shared across CRDs, check `api/v4/common_types.go` (and `api/v3/common_types.go` for legacy kinds). + +### 2) Schema, CRD, and RBAC generation (operator-sdk workflow) +- Use `operator-sdk create api` when introducing a new API or controller (scaffolding). +- Add the field with a clear JSON name and `omitempty` as appropriate. +- Follow nearby kubebuilder markers for validation, defaults, and list/map behavior. +- Regenerate code and manifests with the repo targets (operator-sdk scaffolding uses controller-gen under the hood). +`make generate` for deepcopy code. +`make manifests` for CRDs and RBAC. +Run `make bundle` to refresh `bundle/manifests/*` and `helm-chart/splunk-operator/crds` when bundle or Helm CRDs are tracked. +- For verification, use `./scripts/verify_crd.sh` and optionally `./scripts/verify_bundle.sh` or `make verify VERIFY_BUNDLE=1`. +- If you add new RBAC needs, update kubebuilder RBAC markers in the controller and re-run `make manifests` to refresh `config/rbac/role.yaml`. + +### 3) Reconcile wiring +- Locate the controller in `internal/controller` and shared logic in `pkg/splunk/enterprise` or `pkg/splunk/common`. +- Read the new field from the spec and apply it in a single, idempotent reconciliation path. +- Update status only when the desired state is reached and avoid hot-looping. + +### 4) Tests +- Add or update unit tests near the logic you touched (often under `internal/controller` or `pkg/splunk/*`). +- If the behavior is user-visible or multi-resource, add a minimal integration test stub in `test/` or `kuttl/` to document coverage intent. +- Prefer helper scripts when available: `scripts/dev/unit.sh`, `scripts/dev/lint.sh`, `scripts/dev/pr_check.sh`. + +### 5) Docs +- Update `docs/CustomResources.md` for spec fields. +- Update any feature-specific doc under `docs/` and add an example manifest if needed. 
+ +## Pass / Fail Criteria +- Pass: schema/manifests are in sync, reconcile logic is idempotent, tests/docs are updated. +- Fail: generation mismatch, missing test coverage, or behavior not wired end-to-end. + +## Assets +- Use `assets/pr-template.md` for the PR summary format. +- Use `assets/crd-change-checklist.md` as a guardrail for CRD edits. + +## Key Paths +- API types: `api/v*/` (e.g. `api/v4/*_types.go`) +- Controllers: `internal/controller/` +- Shared logic: `pkg/splunk/enterprise`, `pkg/splunk/common` +- CRDs: `config/crd/bases/` +- RBAC output: `config/rbac/role.yaml` +- Bundles: `bundle/manifests/`, `helm-chart/splunk-operator/crds` +- Docs: `docs/CustomResources.md`, `docs/Examples.md` + +## Repo Map (Common Cases) +- Standalone: `api/v4/standalone_types.go`, `internal/controller/standalone_controller.go`, `pkg/splunk/enterprise/standalone.go` +- IndexerCluster: `api/v4/indexercluster_types.go`, `internal/controller/indexercluster_controller.go`, `pkg/splunk/enterprise/indexercluster.go` +- SearchHeadCluster: `api/v4/searchheadcluster_types.go`, `internal/controller/searchheadcluster_controller.go`, `pkg/splunk/enterprise/searchheadcluster.go` +- ClusterManager: `api/v4/clustermanager_types.go`, `internal/controller/clustermanager_controller.go`, `pkg/splunk/enterprise/clustermanager.go` +- LicenseManager: `api/v4/licensemanager_types.go`, `internal/controller/licensemanager_controller.go`, `pkg/splunk/enterprise/licensemanager.go` +- MonitoringConsole: `api/v4/monitoringconsole_types.go`, `internal/controller/monitoringconsole_controller.go`, `pkg/splunk/enterprise/monitoringconsole.go` +- Legacy v3 control-plane types/controllers: search under `api/v3/`, `internal/controller/`, and `pkg/splunk/enterprise/` + +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-feature-scaffold/agents/openai.yaml b/.agents/skills/sok-feature-scaffold/agents/openai.yaml new file mode 100644 index 
000000000..90c462de1 --- /dev/null +++ b/.agents/skills/sok-feature-scaffold/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK Feature Scaffold" + short_description: "Scaffold CRD-driven feature changes" diff --git a/.agents/skills/sok-feature-scaffold/assets/crd-change-checklist.md b/.agents/skills/sok-feature-scaffold/assets/crd-change-checklist.md new file mode 100644 index 000000000..26ba35ab8 --- /dev/null +++ b/.agents/skills/sok-feature-scaffold/assets/crd-change-checklist.md @@ -0,0 +1,11 @@ +## CRD Change Checklist +- [ ] Update the correct `api/v*/` spec or status struct +- [ ] Add JSON tags and `omitempty` consistently +- [ ] Add kubebuilder markers for validation or defaults +- [ ] Run `make manifests` +- [ ] Verify `config/rbac/role.yaml` if RBAC markers changed +- [ ] If bundle/Helm CRDs are tracked, run `make bundle` +- [ ] Run `make verify` (optionally `VERIFY_BUNDLE=1`) to confirm outputs +- [ ] Verify generated YAML under `config/crd/bases/` and `bundle/manifests/` +- [ ] Update docs under `docs/` +- [ ] Add or update tests diff --git a/.agents/skills/sok-feature-scaffold/assets/pr-template.md b/.agents/skills/sok-feature-scaffold/assets/pr-template.md new file mode 100644 index 000000000..aa2665483 --- /dev/null +++ b/.agents/skills/sok-feature-scaffold/assets/pr-template.md @@ -0,0 +1,8 @@ +## Summary +- + +## Tests +- + +## Risks / Follow-ups +- diff --git a/.agents/skills/sok-issue-triage/SKILL.md b/.agents/skills/sok-issue-triage/SKILL.md new file mode 100644 index 000000000..56ec37b6c --- /dev/null +++ b/.agents/skills/sok-issue-triage/SKILL.md @@ -0,0 +1,82 @@ +--- +name: sok-issue-triage +description: Turn a Splunk Operator issue report into scope, impacted components, proposed change list, test plan, and risks. Use when asked to triage a GitHub issue, bug report, or feature request into a PR plan. +--- + +# SOK Issue Triage + +## Overview +Convert issue context into a PR-ready plan with scope, changes, tests, and risks. 
+ +## Preconditions +- Issue statement, expected behavior, and current behavior are available. +- Any missing context is listed as open questions before planning. + +## Scope +Allowed paths: +- `.agents/**` +- `docs/**` +- `templates/**` + +Forbidden paths: +- `api/**` +- `internal/**` +- `pkg/**` +- `test/**` +- `kuttl/**` +- `config/**` +- `bundle/**` +- `helm-chart/**` +- `vendor/**` + +This skill should not change product code. If code changes are required, stop and hand off to the appropriate skill. + +## Workflow +1. Extract the problem statement and expected behavior. +2. Identify impacted CRDs, controllers, and packages. +3. Determine the minimal scope for a safe fix. +4. Propose the change list in implementation order. +5. Define a concrete test plan. +6. Call out risks, migrations, and backward compatibility. + +## Details + +### 1) Parse the issue +- Capture user intent, repro steps, and current behavior. +- Identify the CR kind and any referenced fields. + +### 2) Map to code +- Find the spec/status types under `api/v*/`. +- Locate controllers in `internal/controller`. +- Identify shared helpers in `pkg/splunk/enterprise` or `pkg/splunk/common`. +- Use `PROJECT` to confirm CRD kind and version mapping. +- Use `docs/agent/CRD_MAP.md` for a fast file map. +- Use `docs/agent/RECONCILE_FLOW.md` for flow and phase context. + +### 3) Build a PR plan +- List files or directories to touch. +- Keep the change list ordered: schema, reconcile logic, tests, docs. + +### 4) Test plan +- Prefer `make test` for unit coverage. +- Propose an integration test or minimal stub when behavior is user-visible. + +## Pass / Fail Criteria +- Pass: plan includes explicit scope, impacted files, test commands, risks, and open questions. +- Fail: plan is ambiguous, missing verification, or lacks risk coverage. 
+ +## Key Paths +- API types: `api/v*/` +- Controllers: `internal/controller/` +- Shared logic: `pkg/splunk/enterprise`, `pkg/splunk/common` +- Docs: `docs/` +- Project mapping: `PROJECT` +- Agent docs: `docs/agent/CRD_MAP.md`, `docs/agent/RECONCILE_FLOW.md` + +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary + +Use `assets/issue-triage-template.md` for the final structure and include open questions if any context is missing. diff --git a/.agents/skills/sok-issue-triage/agents/openai.yaml b/.agents/skills/sok-issue-triage/agents/openai.yaml new file mode 100644 index 000000000..c79c3f353 --- /dev/null +++ b/.agents/skills/sok-issue-triage/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK Issue Triage" + short_description: "Turn issues into implementation plans" diff --git a/.agents/skills/sok-issue-triage/assets/issue-triage-template.md b/.agents/skills/sok-issue-triage/assets/issue-triage-template.md new file mode 100644 index 000000000..cb462508c --- /dev/null +++ b/.agents/skills/sok-issue-triage/assets/issue-triage-template.md @@ -0,0 +1,17 @@ +## Scope +- + +## Impacted Components +- + +## Proposed Changes +- + +## Test Plan +- + +## Risks / Compatibility +- + +## Open Questions +- diff --git a/.agents/skills/sok-new-crd-controller/SKILL.md b/.agents/skills/sok-new-crd-controller/SKILL.md new file mode 100644 index 000000000..ca88bc7d7 --- /dev/null +++ b/.agents/skills/sok-new-crd-controller/SKILL.md @@ -0,0 +1,59 @@ +--- +name: sok-new-crd-controller +description: Create a new CRD and controller skeleton (operator-sdk), wire RBAC, add sample YAML, and add tests/docs. Use when introducing a brand-new custom resource to Splunk Operator. +--- + +# SOK New CRD + Controller + +## Overview +Scaffold and wire a new CRD + controller end-to-end, with RBAC, samples, tests, and docs. + +## Preconditions +- New CRD scope, API group/version/kind, and ownership are agreed. +- `operator-sdk` scaffolding flow is available. 
+- A spec/issue reference exists for non-trivial API additions. + +## Scope +Allowed paths: +- `api/**` +- `internal/controller/**` +- `cmd/**` +- `config/**` +- `docs/**` +- `test/**` +- `kuttl/**` +- `bundle/**` +- `helm-chart/**` +- `scripts/**` +- `PROJECT`, `Makefile`, `go.mod`, `go.sum` + +Forbidden paths: +- `vendor/**` +- `bin/**` +- `.git/**` + +If changes are needed outside the allowed paths, stop and propose a follow-up plan. + +## Workflow +1. Print the files you plan to change and test commands you will run. +2. Run `operator-sdk create api` (if a brand-new API) and confirm entries in `PROJECT`. +3. Implement the spec/status types and kubebuilder markers in `api/v*/`. +4. Wire the controller under `internal/controller/` and register in `cmd/main.go`. +5. Update RBAC markers and regenerate manifests: `make generate manifests` or `./scripts/verify_crd.sh`. +6. Add a sample CR under `config/samples/`. +7. Add unit tests (and an integration stub if user-visible behavior). +8. Update docs and examples. + +## Notes +- Prefer scripts when available: `scripts/dev/unit.sh`, `scripts/dev/lint.sh`, `scripts/dev/pr_check.sh`. +- Ensure CRD output is updated in `config/crd/bases/` and, if tracked, `bundle/` and `helm-chart/`. + +## Pass / Fail Criteria +- Pass: CRD/controller scaffolding compiles, generated artifacts are in sync, and tests/docs are present. +- Fail: registration/generation is incomplete, or validation artifacts are missing. 
+ +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-new-crd-controller/agents/openai.yaml b/.agents/skills/sok-new-crd-controller/agents/openai.yaml new file mode 100644 index 000000000..4291290cc --- /dev/null +++ b/.agents/skills/sok-new-crd-controller/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK New CRD Controller" + short_description: "Scaffold a new CRD + controller" diff --git a/.agents/skills/sok-pr-crafter/SKILL.md b/.agents/skills/sok-pr-crafter/SKILL.md new file mode 100644 index 000000000..6a7d013a1 --- /dev/null +++ b/.agents/skills/sok-pr-crafter/SKILL.md @@ -0,0 +1,48 @@ +--- +name: sok-pr-crafter +description: Generate clean PR descriptions, checklists, and risk notes. Use after code changes are complete and tests are run (or explicitly skipped). +--- + +# SOK PR Crafter + +## Overview +Create a PR-ready summary and checklist from the current diff and test results. + +## Preconditions +- Code changes are complete for this pass. +- Test commands were run (or explicitly skipped with reason). + +## Scope +Allowed paths: +- `templates/**` +- `docs/**` +- `.agents/**` + +Forbidden paths: +- `api/**` +- `internal/**` +- `pkg/**` +- `test/**` +- `kuttl/**` +- `config/**` +- `bundle/**` +- `helm-chart/**` +- `vendor/**` + +This skill should not change product code. If changes are required, stop and hand off. + +## Workflow +1. Summarize the change set and key behavior changes. +2. List tests run (or explicitly not run). +3. Call out risks, rollbacks, and compatibility notes. +4. Format output using `templates/pull_request.md`. + +## Pass / Fail Criteria +- Pass: PR content accurately reflects diff/tests/risks and matches template expectations. +- Fail: summary is inconsistent with code or validation evidence is missing. 
+ +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-pr-crafter/agents/openai.yaml b/.agents/skills/sok-pr-crafter/agents/openai.yaml new file mode 100644 index 000000000..5d82c9823 --- /dev/null +++ b/.agents/skills/sok-pr-crafter/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK PR Crafter" + short_description: "Generate PR descriptions" diff --git a/.agents/skills/sok-prerequisites/SKILL.md b/.agents/skills/sok-prerequisites/SKILL.md new file mode 100644 index 000000000..141af3a47 --- /dev/null +++ b/.agents/skills/sok-prerequisites/SKILL.md @@ -0,0 +1,30 @@ +--- +name: sok-prerequisites +description: Prepare local prerequisites for Splunk Operator development. Use before build/test/CI-fix tasks that require toolchain, cluster tooling, and repo scripts. +--- + +# SOK Prerequisites + +## Overview +Validate the local environment and repo prerequisites before running build or test workflows. + +## Preconditions +- Run from the repository root or any subdirectory inside the repository. + +## Workflow +1. Confirm repository root and current branch: `git rev-parse --show-toplevel`, `git branch --show-current`. +2. Check required tools: `go`, `make`, `docker`, `kubectl`, `python3`. +3. Check optional-but-common tools: `kind`, `skaffold`, `operator-sdk`, `ginkgo`, `gh`. +4. Print versions for tools that are present. +5. Verify baseline scripts are executable: `scripts/dev/pr_check.sh`, `scripts/dev/unit.sh`, `scripts/verify_repo.sh`. +6. If prerequisites are missing, stop and provide exact install/fix actions. + +## Pass / Fail Criteria +- Pass: required tools are available and baseline scripts are executable. +- Fail: one or more required tools are missing or scripts are not runnable. 
+ +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-prerequisites/agents/openai.yaml b/.agents/skills/sok-prerequisites/agents/openai.yaml new file mode 100644 index 000000000..e75b585c0 --- /dev/null +++ b/.agents/skills/sok-prerequisites/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK Prerequisites" + short_description: "Validate local prerequisites" diff --git a/.agents/skills/sok-reconcile-debugger/SKILL.md b/.agents/skills/sok-reconcile-debugger/SKILL.md new file mode 100644 index 000000000..3288efc34 --- /dev/null +++ b/.agents/skills/sok-reconcile-debugger/SKILL.md @@ -0,0 +1,97 @@ +--- +name: sok-reconcile-debugger +description: Debug reconcile loops, stuck status phases, or app framework pipeline behavior in the Splunk Operator. Use when a CR is not progressing, status is stuck, or reconciliation repeatedly requeues and needs root-cause analysis and a fix plan. +--- + +# SOK Reconcile Debugger + +## Overview +Diagnose reconciliation failures and produce a clear root cause, fix, and regression test plan. + +## Preconditions +- A failing CR instance, logs, or reproducible symptom is available. +- Namespace/kind/name and operator version are known. + +## Scope +Allowed paths: +- `api/**` +- `internal/controller/**` +- `pkg/**` +- `config/**` +- `docs/**` +- `test/**` +- `kuttl/**` +- `scripts/**` + +Forbidden paths: +- `vendor/**` +- `bin/**` +- `.git/**` + +If changes are needed outside the allowed paths, stop and propose a follow-up plan. + +## Workflow +1. Gather context. +2. Reproduce with a minimal manifest. +3. Trace the reconcile path and status gating logic. +4. Add targeted debug logs, then remove them before final output. +5. Identify the root cause and propose the smallest safe fix. +6. Add or propose a regression test. +7. Produce a concise incident summary. + +## Details + +### 1) Gather context +- Capture CR kind, namespace, spec snippet, and operator version. 
+- Collect `kubectl describe` output and recent operator logs. +- Note whether the issue is a hot loop, terminal error, or stalled status phase. + +### Quick triage commands +`kubectl get <kind> <name> -n <namespace> -o yaml` +`kubectl describe <kind> <name> -n <namespace>` +`kubectl get events -n <namespace> --sort-by=.lastTimestamp` +`kubectl logs -n splunk-operator deploy/splunk-operator-controller-manager -c manager --since=30m` +Use `./scripts/debug_reconcile.sh <kind> <name> <namespace>` to capture these into a single output folder. + +### 2) Reproduce +- Start from an example in `docs/Examples.md` or `test/example/` and reduce it to the minimal spec that reproduces the bug. +- Prefer a local kind cluster for fast iteration when feasible. + +### 3) Trace reconciliation +- Find the controller in `internal/controller` for the affected kind. +- Follow the reconcile flow in shared logic under `pkg/splunk/enterprise` and `pkg/splunk/common`. +- Identify status fields and conditions that gate progression. +- Check paused annotations (see `api/v4/*_types.go` or `api/v3/*_types.go`). +- Check `Phase` constants in `api/v4/common_types.go` for expected state transitions. +- Review predicates in `internal/controller/common/predicate.go` for reconcile triggers. +- Use `docs/agent/RECONCILE_FLOW.md` and `docs/agent/OPERATIONS.md` for guidance. + +### 4) Add targeted logs +- Add temporary logs at the gate that stops progression and at any error return. +- Use consistent keys to make log filtering easy. +- Remove debug logs before final output unless explicitly requested to keep them. + +### 5) Root cause and fix +- State the precise condition that prevents progression. +- Propose the smallest fix that restores the expected state transition. +- Verify idempotency and avoid new reconcile loops. + +### 6) Regression test +- Add or outline a unit test near the affected logic. +- If the bug is integration-only, add a minimal test stub under `test/` or `kuttl/`. +- Prefer helper scripts when available: `scripts/dev/unit.sh`, `scripts/dev/pr_check.sh`. + +## Pass / Fail Criteria +- Pass: root cause is reproducible, fix is minimal/idempotent, and regression coverage is defined. +- Fail: issue cannot be traced to a concrete gate or fix/validation is incomplete. + +## Key Paths +- Controllers: `internal/controller/` +- Shared logic: `pkg/splunk/enterprise`, `pkg/splunk/common` +- Examples: `docs/Examples.md`, `test/example/` + +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-reconcile-debugger/agents/openai.yaml b/.agents/skills/sok-reconcile-debugger/agents/openai.yaml new file mode 100644 index 000000000..0ccf8f9e8 --- /dev/null +++ b/.agents/skills/sok-reconcile-debugger/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK Reconcile Debugger" + short_description: "Debug reconcile loops and stuck phases" diff --git a/.agents/skills/sok-release-checklist/SKILL.md b/.agents/skills/sok-release-checklist/SKILL.md new file mode 100644 index 000000000..d9810cb00 --- /dev/null +++ b/.agents/skills/sok-release-checklist/SKILL.md @@ -0,0 +1,62 @@ +--- +name: sok-release-checklist +description: Prepare or verify a Splunk Operator release checklist, including compatibility, manifests, bundles, images, docs, and upgrade notes. Use when asked about release readiness, compatibility matrices, or release process steps. +--- + +# SOK Release Checklist + +## Overview +Produce a release readiness checklist tailored to this repo's build, bundle, and documentation flow. + +## Preconditions +- Target release version and branch are identified. +- Planned compatibility targets (Splunk/Kubernetes/platform) are available. + +## Workflow +1. Gather release context (version, target Splunk Enterprise versions, Kubernetes support). +2. Verify CRD and bundle artifacts. +3. Verify docs and upgrade guidance. +4. Verify image tags and helm chart outputs. +5. Summarize risks and required follow-ups. 
+ +## Details + +### 1) Compatibility and support +- Review `docs/README.md` for compatibility notes and pointers to release notes. +- Review `docs/SplunkOperatorUpgrade.md` for upgrade constraints and breaking changes. +- Review `docs/ChangeLog.md` for release changes. +- If a public release is being prepared, confirm compatibility in the GitHub release notes. + +### 2) Manifests and bundle +- For CRD changes, ensure `make manifests` and `make bundle` are run. +- Confirm generated CRDs in `config/crd/bases/` and `bundle/manifests/`. +- Confirm helm chart CRDs updated in `helm-chart/splunk-operator/crds`. +- Confirm CSV and manifest bases in `bundle/manifests/splunk-operator.clusterserviceversion.yaml` and `config/manifests/`. +- Use `make verify` or `./scripts/verify_crd.sh` and `./scripts/verify_bundle.sh` to confirm outputs are in sync. +- Use `docs/agent/RELEASE_FLOW.md` for the canonical release flow. + +### 3) Images and tags +- Confirm operator image tag and any distroless tag if used. +- Verify `bundle.Dockerfile` or `Dockerfile` changes if applicable. + +### 4) Docs and examples +- Update install or upgrade docs if defaults or requirements changed. +- Update examples when spec fields or defaults changed. + +## Output Contract +- Use `assets/release-checklist.md` for the final checklist. +- Call out any missing inputs needed to finish the checklist. + +## Pass / Fail Criteria +- Pass: checklist covers compatibility, generated artifacts, images, docs, and open risks. +- Fail: required release inputs are missing or validation steps are incomplete. 
+ +## Key Paths +- Release docs: `docs/README.md`, `docs/ChangeLog.md`, `docs/SplunkOperatorUpgrade.md`, `docs/Install.md` +- CRDs: `config/crd/bases/`, `bundle/manifests/`, `helm-chart/splunk-operator/crds` +- Manifests: `config/manifests/` +- CSV: `bundle/manifests/splunk-operator.clusterserviceversion.yaml` +- Build: `Makefile`, `Dockerfile`, `bundle.Dockerfile` +- Project mapping: `PROJECT` +- Agent docs: `docs/agent/TEST_MATRIX.md` +- Release flow: `docs/agent/RELEASE_FLOW.md` diff --git a/.agents/skills/sok-release-checklist/agents/openai.yaml b/.agents/skills/sok-release-checklist/agents/openai.yaml new file mode 100644 index 000000000..cbc4ed3e8 --- /dev/null +++ b/.agents/skills/sok-release-checklist/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK Release Checklist" + short_description: "Verify release readiness checklist" diff --git a/.agents/skills/sok-test-author/SKILL.md b/.agents/skills/sok-test-author/SKILL.md new file mode 100644 index 000000000..ec52f6226 --- /dev/null +++ b/.agents/skills/sok-test-author/SKILL.md @@ -0,0 +1,53 @@ +--- +name: sok-test-author +description: Write or update Splunk Operator tests (unit/envtest/integration/KUTTL). Use when a change needs new tests or test adjustments. +--- + +# SOK Test Author + +## Overview +Create or update tests for operator behavior, using existing testenv helpers and patterns. + +## Preconditions +- Expected behavior is defined (input spec + expected status/resources). +- Test target type is known (unit/envtest, Ginkgo integration, or KUTTL). + +## Scope +Allowed paths: +- `test/**` +- `kuttl/**` +- `scripts/**` +- `docs/**` +- `config/samples/**` + +Forbidden paths: +- `api/**` +- `internal/**` +- `pkg/**` +- `bundle/**` +- `helm-chart/**` +- `vendor/**` + +If product code changes are required, stop and hand off to the appropriate skill. + +## Workflow +1. Determine test type: unit/envtest, integration (Ginkgo), or KUTTL. +2. 
Locate existing patterns in `docs/agent/TESTCASE_PATTERNS.md` and `test/`. +3. Scaffold tests using `scripts/generate_testcase.py` if helpful. +4. Implement assertions using `test/testenv` helpers. +5. Run tests (or specify exact commands). + +## Commands +- Unit/envtest: `scripts/dev/unit.sh` +- Lint/format: `scripts/dev/lint.sh` +- KUTTL scaffolds: `python3 scripts/generate_testcase.py --spec docs/agent/TESTCASE_SPEC.yaml` + +## Pass / Fail Criteria +- Pass: tests compile/run (or are explicitly marked as scaffold) and assertions map to expected behavior. +- Fail: tests do not compile, are missing core assertions, or cannot be executed/reasoned about. + +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-test-author/agents/openai.yaml b/.agents/skills/sok-test-author/agents/openai.yaml new file mode 100644 index 000000000..7068e3691 --- /dev/null +++ b/.agents/skills/sok-test-author/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK Test Author" + short_description: "Write or update tests" diff --git a/.agents/skills/sok-test-harness/SKILL.md b/.agents/skills/sok-test-harness/SKILL.md new file mode 100644 index 000000000..272191dc1 --- /dev/null +++ b/.agents/skills/sok-test-harness/SKILL.md @@ -0,0 +1,69 @@ +--- +name: sok-test-harness +description: Run or troubleshoot Splunk Operator tests, including unit tests, integration tests, and local kind-based workflows. Use when asked to run tests, set up a kind cluster, or produce a test failure triage summary. +--- + +# SOK Test Harness + +## Overview +Run the repo's standard unit and integration tests, and summarize failures consistently. 
+ +## Quick Start +- Unit tests: `scripts/dev/unit.sh` +- Lint/format checks: `scripts/dev/lint.sh` +- Envtest assets: `scripts/dev/envtest.sh` +- Kind smoke: `scripts/dev/kind_smoke.sh` +- Repo verification: `scripts/dev/pr_check.sh` (or `scripts/verify_repo.sh --all`) +- Generate new tests: `python3 scripts/generate_testcase.py --spec docs/agent/TESTCASE_SPEC.yaml` + +## Workflow +1. Confirm prerequisites. +2. Run unit tests or kind integration tests. +3. If tests fail, summarize the failure and propose next steps. +4. When CRDs or bundles changed, run `make verify` to confirm generated outputs. + +## Preconditions +- Go toolchain and `ginkgo` +- Docker and `kubectl` +- `kind` installed for local integration tests + +## Unit Tests +- Default command: `make test` +- Prefer `scripts/dev/unit.sh` to run with the repo defaults. + +## Integration Tests (Kind) +- Default commands: `make cluster-up`, `make int-test`, `make cluster-down` +- Prefer `scripts/dev/kind_smoke.sh` for a quick local sanity run. +- Skill-local scripts (`scripts/run_kind_e2e.sh`, `scripts/push_kind_operator_image.sh`) remain available for deeper e2e flows. + +## Common Environment Variables +These are defined in `test/env.sh` and can be overridden in your shell. +- `SPLUNK_OPERATOR_IMAGE` default `splunk/splunk-operator:latest` +- `SPLUNK_ENTERPRISE_IMAGE` default `splunk/splunk:latest` +- `CLUSTER_PROVIDER` default `kind` for local runs +- `PRIVATE_REGISTRY` default `localhost:5000` when using kind +- `TEST_REGEX` or `TEST_FOCUS` to filter tests +- `SKIP_REGEX` to skip tests +- `CLUSTER_WIDE` to run cluster-wide operator install + +## Failure Triage Output +- Provide the failing test names or package paths. +- Include the first error and any repeated error pattern. +- Suggest the most likely code area to inspect. + +## Pass / Fail Criteria +- Pass: requested test commands complete with actionable output and no unresolved errors. 
+- Fail: commands fail without a reproducible triage summary or required follow-up. + +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary + +## Key Paths +- Test harness: `test/README.md` +- Integration scripts: `test/run-tests.sh`, `test/deploy-cluster.sh`, `test/deploy-kind-cluster.sh` +- Unit test target: `Makefile` (`make test`) +- Environment defaults: `test/env.sh` +- Harness docs: `docs/agent/TEST_MATRIX.md` diff --git a/.agents/skills/sok-test-harness/agents/openai.yaml b/.agents/skills/sok-test-harness/agents/openai.yaml new file mode 100644 index 000000000..0647dec1a --- /dev/null +++ b/.agents/skills/sok-test-harness/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK Test Harness" + short_description: "Run and triage test workflows" diff --git a/.agents/skills/sok-test-harness/scripts/push_kind_operator_image.sh b/.agents/skills/sok-test-harness/scripts/push_kind_operator_image.sh new file mode 100755 index 000000000..d745fd87f --- /dev/null +++ b/.agents/skills/sok-test-harness/scripts/push_kind_operator_image.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + repo_root="$(cd "${script_dir}/../../../.." && pwd)" +fi + +cd "${repo_root}" + +SPLUNK_OPERATOR_IMAGE="${SPLUNK_OPERATOR_IMAGE:-splunk/splunk-operator:latest}" +PRIVATE_REGISTRY="${PRIVATE_REGISTRY:-localhost:5000}" + +first_segment="${SPLUNK_OPERATOR_IMAGE%%/*}" +if [[ "${first_segment}" == *.* || "${first_segment}" == *:* ]]; then + echo "SPLUNK_OPERATOR_IMAGE looks like it already includes a registry: ${first_segment}" + echo "Set SPLUNK_OPERATOR_IMAGE to a repo/name:tag without a registry prefix." 
+ echo "Example: SPLUNK_OPERATOR_IMAGE=splunk/splunk-operator:latest" + exit 1 +fi + +if [[ "${PRIVATE_REGISTRY}" == localhost:* || "${PRIVATE_REGISTRY}" == 127.0.0.1:* ]]; then + if ! docker ps --format '{{.Names}}' | grep -q '^kind-registry$'; then + echo "Local kind registry container is not running. Run: make cluster-up" + exit 1 + fi +else + if [[ "${FORCE_PUSH:-}" != "1" ]]; then + echo "Refusing to push to non-local registry '${PRIVATE_REGISTRY}'." + echo "Set FORCE_PUSH=1 to override." + exit 1 + fi +fi + +target_image="${PRIVATE_REGISTRY}/${SPLUNK_OPERATOR_IMAGE}" + +if ! docker image inspect "${SPLUNK_OPERATOR_IMAGE}" >/dev/null 2>&1; then + echo "Local image ${SPLUNK_OPERATOR_IMAGE} not found, pulling..." + docker pull "${SPLUNK_OPERATOR_IMAGE}" +fi + +docker tag "${SPLUNK_OPERATOR_IMAGE}" "${target_image}" + +echo "Pushing ${target_image}" +docker push "${target_image}" + +echo "Pushed operator image: ${target_image}" diff --git a/.agents/skills/sok-test-harness/scripts/run_kind_e2e.sh b/.agents/skills/sok-test-harness/scripts/run_kind_e2e.sh new file mode 100755 index 000000000..43deaa06e --- /dev/null +++ b/.agents/skills/sok-test-harness/scripts/run_kind_e2e.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + echo "Usage: $(basename "$0") [--keep]" + echo " --keep Keep the kind cluster running after tests" + echo " --push-operator-image Push operator image to local kind registry before tests" +} + +keep_cluster=false +push_operator_image=false +while [[ $# -gt 0 ]]; do + case "$1" in + --keep) + keep_cluster=true + shift + ;; + --push-operator-image) + push_operator_image=true + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" + usage + exit 1 + ;; + esac +done + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + repo_root="$(cd "${script_dir}/../../../.." 
&& pwd)" +fi + +cd "${repo_root}" + +export CLUSTER_PROVIDER="${CLUSTER_PROVIDER:-kind}" +export TEST_CLUSTER_PLATFORM="${TEST_CLUSTER_PLATFORM:-kind}" + +cleanup() { + if [[ "${keep_cluster}" == "true" ]]; then + echo "Keeping kind cluster running (requested)." + return 0 + fi + echo "Tearing down kind cluster: make cluster-down" + make cluster-down +} + +trap cleanup EXIT + +echo "Bringing up kind cluster: make cluster-up" +make cluster-up + +if [[ "${push_operator_image}" == "true" ]]; then + echo "Pushing operator image to local kind registry" + "${script_dir}/push_kind_operator_image.sh" +fi + +echo "Running integration tests: make int-test" +make int-test diff --git a/.agents/skills/sok-test-harness/scripts/run_unit.sh b/.agents/skills/sok-test-harness/scripts/run_unit.sh new file mode 100755 index 000000000..55f36258d --- /dev/null +++ b/.agents/skills/sok-test-harness/scripts/run_unit.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + repo_root="$(cd "${script_dir}/../../../.." && pwd)" +fi + +cd "${repo_root}" + +echo "Running unit tests: make test" +make test diff --git a/.agents/skills/sok-test/SKILL.md b/.agents/skills/sok-test/SKILL.md new file mode 100644 index 000000000..306c92b47 --- /dev/null +++ b/.agents/skills/sok-test/SKILL.md @@ -0,0 +1,29 @@ +--- +name: sok-test +description: Run Splunk Operator tests using standard local harness commands. Use for unit, fast PR gates, and smoke/integration test triage. +--- + +# SOK Test + +## Overview +Execute consistent local test workflows so behavior matches CI expectations. + +## Preconditions +- Run `sok-prerequisites` first. +- For code changes, run `sok-build` before broader test runs. + +## Workflow +1. Run unit tests: `scripts/dev/unit.sh`. +2. 
Run fast policy + repo gate: `PR_CHECK_FLAGS=--fast scripts/dev/pr_check.sh`. +3. For kind smoke validation, run: `scripts/dev/kind_smoke.sh`. +4. For failing cases, capture first failure and likely owner path. + +## Pass / Fail Criteria +- Pass: requested test commands complete successfully. +- Fail: one or more commands fail, with the first actionable failure identified. + +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-test/agents/openai.yaml b/.agents/skills/sok-test/agents/openai.yaml new file mode 100644 index 000000000..1995c2346 --- /dev/null +++ b/.agents/skills/sok-test/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK Test" + short_description: "Run standard test workflows" diff --git a/.agents/skills/sok-testcase-builder/SKILL.md b/.agents/skills/sok-testcase-builder/SKILL.md new file mode 100644 index 000000000..3d10d561d --- /dev/null +++ b/.agents/skills/sok-testcase-builder/SKILL.md @@ -0,0 +1,67 @@ +--- +name: sok-testcase-builder +description: Create new Splunk Operator integration (Ginkgo) or KUTTL tests from a CR spec and expected results. Use when a developer asks to add a new test case that validates CR status phase Ready and required resources. +--- + +# SOK Testcase Builder + +## Overview +Generate scaffolds for new integration or KUTTL tests based on a CR spec and expected results. + +## Preconditions +- Input spec includes target CR kind, expected phase/resource outcomes, and test type. +- Destination test suite location is chosen. + +## Workflow +1. Determine test type: integration (Ginkgo) or KUTTL. +2. Identify the SVA architecture (S1, C3, M4, M1), features (smartstore, appframework), and any SVA validations. +3. Collect CR manifest path(s) and expected results. +4. Create or update a testcase spec file from `docs/agent/TESTCASE_SPEC.yaml`. +5. Run the generator script to scaffold the test. +6. 
Fill in TODOs (spec struct, resource checks, extra asserts).
+7. Run the appropriate test command.
+
+## Test Types
+
+### KUTTL
+- Inputs: CR manifest, expected phase, and resource assertions.
+- Output: `kuttl/tests/<crd>/<test-name>/` with deploy and assert steps.
+- Recommended when validating CRD behavior with simple YAML assertions.
+  - Supports optional operator upgrade steps using the `upgrade` spec block.
+
+### Integration (Ginkgo)
+- Inputs: CR spec and expected behaviors.
+- Output: `test/<suite>/<test-name>_test.go` with a suite file if missing.
+- Recommended for multi-step flows or API-based verification.
+  - Use `docs/agent/TESTCASE_PATTERNS.md` to map SVA patterns to helpers.
+  - For C3 SVA, set `validations.sva: C3` to include Monitoring Console + License Manager readiness checks.
+
+## Generator Script
+Use `scripts/generate_testcase.py` with a spec file:
+
+`python3 scripts/generate_testcase.py --spec docs/agent/TESTCASE_SPEC.yaml`
+
+Options:
+- `--force` overwrite existing files
+- `--dry-run` print actions without writing
+Note: YAML specs require `pyyaml` (`python3 -m pip install pyyaml`).
+
+## Expected Results
+- Always validate `status.phase` is `Ready` (or the specified phase).
+- Add asserts for key resources (StatefulSet, Service, Secret, ConfigMap) as needed.
+
+## Output Contract
+- List created/edited files
+- Provide the test command to run
+- Call out any TODOs left in the scaffold
+
+## Pass / Fail Criteria
+- Pass: scaffold files are created deterministically from spec and include required assertions/TODO markers.
+- Fail: scaffold generation is incomplete or expected validation blocks are missing. 
+ +## Key References +- Spec template: `docs/agent/TESTCASE_SPEC.yaml` +- Patterns: `docs/agent/TESTCASE_PATTERNS.md` +- Test matrix: `docs/agent/TEST_MATRIX.md` +- CRD map: `docs/agent/CRD_MAP.md` +- Test helpers: `test/testenv/verificationutils.go` diff --git a/.agents/skills/sok-testcase-builder/agents/openai.yaml b/.agents/skills/sok-testcase-builder/agents/openai.yaml new file mode 100644 index 000000000..09ee91623 --- /dev/null +++ b/.agents/skills/sok-testcase-builder/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK Testcase Builder" + short_description: "Generate integration/KUTTL testcases" diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 000000000..1d943ebbb --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,35 @@ +FROM mcr.microsoft.com/devcontainers/base:bookworm + +ARG GO_VERSION=1.25.7 +ARG KUBECTL_VERSION=v1.29.1 +ARG OPERATOR_SDK_VERSION=v1.42.0 +ARG SKAFFOLD_VERSION=v2.16.1 +ARG KIND_VERSION=v0.29.0 + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + git \ + jq \ + make \ + python3 \ + python3-pip \ + shellcheck \ + unzip && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -fsSL "https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz" | tar -C /usr/local -xz +ENV PATH="/usr/local/go/bin:${PATH}" + +RUN curl -fsSLo /usr/local/bin/kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl" && \ + chmod +x /usr/local/bin/kubectl + +RUN curl -fsSLo /usr/local/bin/operator-sdk "https://github.com/operator-framework/operator-sdk/releases/download/${OPERATOR_SDK_VERSION}/operator-sdk_linux_amd64" && \ + chmod +x /usr/local/bin/operator-sdk + +RUN curl -fsSLo /usr/local/bin/skaffold "https://storage.googleapis.com/skaffold/releases/${SKAFFOLD_VERSION}/skaffold-linux-amd64" && \ + chmod +x /usr/local/bin/skaffold + +RUN curl -fsSLo /usr/local/bin/kind "https://kind.sigs.k8s.io/dl/${KIND_VERSION}/kind-linux-amd64" && \ + chmod 
+x /usr/local/bin/kind diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 000000000..cba689b4e --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,38 @@ +{ + "name": "splunk-operator-dev", + "build": { + "dockerfile": "Dockerfile", + "context": "..", + "args": { + "GO_VERSION": "1.25.7", + "KUBECTL_VERSION": "v1.29.1", + "OPERATOR_SDK_VERSION": "v1.42.0", + "SKAFFOLD_VERSION": "v2.16.1", + "KIND_VERSION": "v0.29.0" + } + }, + "features": { + "ghcr.io/devcontainers/features/docker-outside-of-docker:1": {}, + "ghcr.io/devcontainers/features/github-cli:1": {} + }, + "containerEnv": { + "CLUSTER_PROVIDER": "kind", + "TEST_CLUSTER_PLATFORM": "kind" + }, + "postCreateCommand": "bash .devcontainer/post-create.sh", + "remoteUser": "vscode", + "customizations": { + "vscode": { + "extensions": [ + "golang.Go", + "ms-kubernetes-tools.vscode-kubernetes-tools", + "redhat.vscode-yaml", + "github.vscode-github-actions" + ], + "settings": { + "go.useLanguageServer": true, + "terminal.integrated.defaultProfile.linux": "bash" + } + } + } +} diff --git a/.devcontainer/post-create.sh b/.devcontainer/post-create.sh new file mode 100755 index 000000000..731d9e4bd --- /dev/null +++ b/.devcontainer/post-create.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +set -euo pipefail + +echo "Bootstrapping Splunk Operator devcontainer..." +go version +kubectl version --client --output=yaml >/dev/null +skaffold version >/dev/null +operator-sdk version >/dev/null +kind version >/dev/null + +echo "Installing repository-local tools..." +make kustomize controller-gen envtest >/dev/null + +echo "Devcontainer bootstrap complete." 
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 65d2b7b12..3c410faaf 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -6,10 +6,44 @@
 _What does this PR have in it?_
 
 _Highlight the updates in specific files_
 
+### Governing Spec
+
+_Link the KEP in `docs/specs/` and include status (`Draft`, `In Review`, `Approved`, `Implemented`, `Superseded`)._
+
+### Spec Kit Plan
+
+_Link the Spec Kit folder in `speckit/specs/<change-id>-<slug>/` used for planning._
+
+### Harness Manifest
+
+_Link the machine-readable manifest in `harness/manifests/` used for this PR._
+_Include risk tier and delivery mode from the manifest._
+
+### Risk Label
+
+_Set exactly one PR label matching manifest tier: `risk:low`, `risk:medium`, or `risk:high`._
+
 ### Testing and Verification
 
 _How did you test these changes? What automated tests are added?_
 
+Suggested local gates:
+- `scripts/dev/spec_check.sh`
+- `scripts/dev/harness_manifest_check.sh`
+- `scripts/dev/doc_first_check.sh`
+- `scripts/dev/commit_discipline_check.sh`
+- `scripts/dev/appframework_parity_check.sh`
+- `scripts/dev/keps_check.sh`
+- `scripts/dev/harness_engineering_parity_check.sh`
+- `scripts/dev/constitution_runtime_policy_check.sh`
+- `scripts/dev/risk_policy_check.sh`
+- `scripts/dev/risk_label_check.sh --labels risk:<tier>`
+- `scripts/dev/harness_eval.sh --suite docs/agent/evals/policy-regression.yaml`
+- `scripts/dev/harness_run.sh --fast`
+- `scripts/dev/pr_check.sh`
+- `scripts/dev/autonomy_scorecard.sh --base-ref <base-ref>`
+- `scripts/dev/unit.sh`
+
 ### Related Issues
 
 _Jira tickets, GitHub issues, Support tickets..._
diff --git a/.github/workflows/autonomy-scorecard.yml b/.github/workflows/autonomy-scorecard.yml
new file mode 100644
index 000000000..2934a1728
--- /dev/null
+++ b/.github/workflows/autonomy-scorecard.yml
@@ -0,0 +1,44 @@
+name: Autonomy Scorecard
+
+on:
+  pull_request:
+    branches:
+      - main
+      - develop
+  workflow_dispatch:
+
+jobs:
+  autonomy-scorecard:
runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Generate scorecard + run: | + set -euo pipefail + mkdir -p artifacts + args=() + if [[ -n "${{ github.base_ref }}" ]]; then + args+=(--base-ref "${{ github.base_ref }}") + fi + + scripts/dev/autonomy_scorecard.sh \ + "${args[@]}" \ + --suite docs/agent/evals/policy-regression.yaml \ + --output artifacts/autonomy-scorecard.json \ + --markdown artifacts/autonomy-scorecard.md + + - name: Publish summary + run: | + cat artifacts/autonomy-scorecard.md >> "$GITHUB_STEP_SUMMARY" + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: autonomy-scorecard + path: artifacts/autonomy-scorecard.* diff --git a/.github/workflows/merge-queue-check.yml b/.github/workflows/merge-queue-check.yml new file mode 100644 index 000000000..c9b2cf345 --- /dev/null +++ b/.github/workflows/merge-queue-check.yml @@ -0,0 +1,36 @@ +name: Merge Queue Check + +on: + merge_group: + branches: + - main + - develop + +jobs: + merge-queue-pr-check: + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Run merge queue harness check + run: | + set -euo pipefail + BASE_REF="${{ github.event.merge_group.base_ref }}" + if [[ -z "${BASE_REF}" ]]; then + BASE_REF=develop + fi + + HARNESS_BASE_REF="${BASE_REF}" \ + SKIP_RISK_LABEL_CHECK=1 \ + PR_CHECK_FLAGS=--fast \ + scripts/dev/pr_check.sh diff --git a/.github/workflows/pr-check.yml b/.github/workflows/pr-check.yml new file mode 100644 index 000000000..44eae63a2 --- /dev/null +++ b/.github/workflows/pr-check.yml @@ -0,0 +1,40 @@ +name: PR Check + +on: + pull_request: + types: + - opened + - reopened + - synchronize + - labeled + - unlabeled + branches: + - main + - develop + +jobs: + pr-check: + runs-on: 
ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Run PR check (fast) + env: + PR_CHECK_FLAGS: --fast + SPEC_CHECK_BASE_REF: ${{ github.base_ref }} + HARNESS_BASE_REF: ${{ github.base_ref }} + HARNESS_EVAL_SUITE: docs/agent/evals/policy-regression.yaml + RISK_LABELS: ${{ join(github.event.pull_request.labels.*.name, ',') }} + ENFORCE_RISK_LABEL_CHECK: "1" + run: | + scripts/dev/pr_check.sh diff --git a/.github/workflows/skaffold-smoke.yml b/.github/workflows/skaffold-smoke.yml new file mode 100644 index 000000000..2e2e0cf97 --- /dev/null +++ b/.github/workflows/skaffold-smoke.yml @@ -0,0 +1,67 @@ +name: Skaffold Smoke + +on: + pull_request: + branches: + - main + - develop + paths: + - skaffold.yaml + - config/skaffold/** + - scripts/dev/skaffold_*.sh + - .devcontainer/** + - Dockerfile + - config/default/** + - config/manager/** + workflow_dispatch: + +jobs: + skaffold-smoke: + runs-on: ubuntu-latest + timeout-minutes: 30 + permissions: + contents: read + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install kubectl + uses: azure/setup-kubectl@v4 + with: + version: v1.29.1 + + - name: Install skaffold + run: | + curl -fsSLo skaffold https://storage.googleapis.com/skaffold/releases/v2.16.1/skaffold-linux-amd64 + sudo install skaffold /usr/local/bin/skaffold + skaffold version + + - name: Install kind + run: | + curl -fsSLo kind https://kind.sigs.k8s.io/dl/v0.29.0/kind-linux-amd64 + sudo install kind /usr/local/bin/kind + kind version + + - name: Create kind cluster + run: | + kind create cluster --name skaffold-smoke + kubectl cluster-info + + - name: Skaffold smoke deploy + env: + SKAFFOLD_PROFILE: ci-smoke + SKAFFOLD_CLEANUP: "1" + OPERATOR_NAMESPACE: splunk-operator + run: scripts/dev/skaffold_ci_smoke.sh + + - name: Dump diagnostics on failure + if: failure() + run: | + 
kubectl get pods -A || true + kubectl -n splunk-operator get deploy,pods,events || true + kubectl -n splunk-operator logs deploy/splunk-operator-controller-manager -c manager --tail=200 || true + + - name: Delete kind cluster + if: always() + run: kind delete cluster --name skaffold-smoke diff --git a/.gitignore b/.gitignore index 4846768ad..9a8a2cacd 100644 --- a/.gitignore +++ b/.gitignore @@ -98,5 +98,8 @@ bin/ bundle_*/ test/secret/*.log kubeconfig -.devcontainer/devcontainer.json -kuttl-artifacts/* \ No newline at end of file +kuttl-artifacts/* +.harness/runs/* +!.harness/runs/.gitkeep +.harness/scorecards/* +!.harness/scorecards/.gitkeep diff --git a/.harness/runs/.gitkeep b/.harness/runs/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.harness/scorecards/.gitkeep b/.harness/scorecards/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/AGENTS.md b/AGENTS.md index e2ba5781f..ca5f5439b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -10,6 +10,36 @@ The Splunk Operator is a Kubernetes operator that manages Splunk Enterprise depl - **Test Framework**: Ginkgo/Gomega - **CRD API Versions**: v1, v1alpha2, v1alpha3, v1beta1, v2, v3, v4 +## Spec-First Agent Workflow + +For non-trivial changes, agents must follow this order: +1. Bootstrap planning with Spec Kit: + - `scripts/dev/speckit_bridge.sh bootstrap --change-id --title ""` +2. Drive the generated KEP in `docs/specs/` through review until status is `Approved`. +3. Set execution policy in `harness/manifests/`: + - scope (`allowed_paths`, `forbidden_paths`) + - governance (`risk_tier`, `human_approvals_required`, `merge_queue_required`) +4. Implement code changes scoped to the manifest policy. +5. 
Validate with harness commands: + - `scripts/dev/spec_check.sh` + - `scripts/dev/harness_manifest_check.sh` + - `scripts/dev/doc_first_check.sh` + - `scripts/dev/commit_discipline_check.sh` + - `scripts/dev/appframework_parity_check.sh` + - `scripts/dev/keps_check.sh` + - `scripts/dev/harness_engineering_parity_check.sh` + - `scripts/dev/constitution_runtime_policy_check.sh` + - `scripts/dev/risk_policy_check.sh` + - `scripts/dev/risk_label_check.sh --labels risk:<tier>` + - `scripts/dev/harness_eval.sh --suite docs/agent/evals/policy-regression.yaml` + - `scripts/dev/harness_run.sh --fast` + - `scripts/dev/autonomy_scorecard.sh` + - `scripts/dev/pr_check.sh` +6. Update spec status and graduation criteria with implementation progress. + +If non-trivial code changes exist without a valid harness manifest linked to an +approved spec, `harness_manifest_check.sh` should fail. + ## Repository Structure ``` @@ -79,8 +109,18 @@ make deploy IMG=<your-image> NAMESPACE=<namespace> ENVIRONMENT=<env> # Undeploy operator from cluster make undeploy + +# Run skaffold inner loop on kind +make skaffold-dev + +# Run one-shot skaffold smoke deploy and cleanup +make skaffold-smoke ``` +### Devcontainer + +Use `.devcontainer/` to get a consistent local/agent toolchain (Go, kubectl, operator-sdk, kind, skaffold). 
+ ### Documentation Commands ```bash diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 000000000..08b0dcdf3 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,13 @@ +* @splunk/splunk-operator-for-kubernetes + +/api/ @splunk/splunk-operator-for-kubernetes +/internal/controller/ @splunk/splunk-operator-for-kubernetes +/pkg/splunk/enterprise/ @splunk/splunk-operator-for-kubernetes +/test/ @splunk/splunk-operator-for-kubernetes +/kuttl/ @splunk/splunk-operator-for-kubernetes +/docs/ @splunk/splunk-operator-for-kubernetes +/docs/specs/ @splunk/splunk-operator-for-kubernetes +/docs/agent/ @splunk/splunk-operator-for-kubernetes +/harness/ @splunk/splunk-operator-for-kubernetes +/speckit/ @splunk/splunk-operator-for-kubernetes +/scripts/ @splunk/splunk-operator-for-kubernetes diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..1022e8c79 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,66 @@ +# Code of Conduct + +This project follows the [Contributor Covenant](https://www.contributor-covenant.org/version/2/1/code_of_conduct/). + +## Our Pledge +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity and +orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards +Examples of behavior that contributes to a positive environment for our +community include: + +- Demonstrating empathy and kindness toward other people +- Being respectful of differing opinions, viewpoints, and experiences +- Giving and gracefully accepting constructive feedback +- Accepting responsibility and apologizing to those affected by our mistakes +- Focusing on what is best for the community + +Examples of unacceptable behavior include: + +- The use of sexualized language or imagery, and sexual attention or advances +- Trolling, insulting or derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others’ private information, such as a physical or email address, + without their explicit permission +- Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. + +## Enforcement +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the project team at `opensource@splunk.com`. All complaints will be +reviewed and investigated promptly and fairly. 
+ +## Enforcement Guidelines +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +1. **Correction** — A private, written warning and guidance. +2. **Warning** — A formal warning with consequences for continued behavior. +3. **Temporary Ban** — A temporary ban from the community. +4. **Permanent Ban** — A permanent ban from the community. + +## Attribution +This Code of Conduct is adapted from the Contributor Covenant, version 2.1. diff --git a/GOVERNANCE.md b/GOVERNANCE.md new file mode 100644 index 000000000..fb5155502 --- /dev/null +++ b/GOVERNANCE.md @@ -0,0 +1,79 @@ +# Governance + +## Purpose +This document defines project decision-making, ownership, and the required +KEP-first + harness workflow for Splunk Operator development. + +## Roles +- **Maintainers**: final technical decision makers, release owners, and policy + stewards. +- **Contributors**: anyone proposing or implementing changes. + +Maintainers are listed in `MAINTAINERS.md`. + +## Decision Model +- Default model: maintainers seek technical consensus in issue/spec/PR review. +- If consensus is not reached, maintainers decide based on correctness, + security, operability, and compatibility. + +## KEP-First Policy +Non-trivial changes must start with a KEP-lite document under `docs/specs/`. +Spec Kit planning artifacts may be created under `speckit/specs/`, but KEP is +the governance source of truth. + +Non-trivial includes: +- API/CRD or webhook behavior changes +- reconciliation/state-machine logic changes +- harness/test architecture changes +- release/upgrade behavior changes + +The governing KEP is part of the codebase and must evolve with +implementation. + +## KEP Lifecycle +Valid status values: +- `Draft` +- `In Review` +- `Approved` +- `Implemented` +- `Superseded` + +Required sections are defined in `docs/specs/README.md` and +`docs/specs/SPEC_TEMPLATE.md`. 
+ +## Harness Policy +Validation is harness-driven, not ad hoc: +- `scripts/dev/spec_check.sh` enforces KEP structure and lifecycle validity. +- `scripts/dev/harness_manifest_check.sh` enforces machine-readable KEP linkage + and scope policy. +- `scripts/dev/risk_policy_check.sh` enforces risk-tier review/merge policy. +- `scripts/dev/risk_label_check.sh` enforces PR label alignment with risk tier. +- `scripts/dev/harness_eval.sh` enforces replayable governance regression checks. +- `scripts/dev/harness_run.sh` generates auditable run artifacts. +- `scripts/dev/autonomy_scorecard.sh` generates autonomy metrics for each diff. +- `scripts/dev/pr_check.sh` runs repository verification gates. +- CI `PR Check` runs these checks on pull requests. +- CI merge queue check runs equivalent fast gates for queued merges. + +Any emergency bypass must be explicit and documented in the PR with rationale +and rollback. + +## Pull Request Requirements +Each non-trivial PR must include: +- governing KEP path under `docs/specs/` +- optional Spec Kit path under `speckit/specs/` if used +- harness manifest path under `harness/manifests/` +- risk tier and delivery mode from manifest +- harness results +- risk and rollback notes + +PR templates and CODEOWNERS enforce review structure. + +## Entropy Management +To keep the repo maintainable: +- remove stale specs by marking them `Superseded` +- keep harness scripts deterministic and lightweight +- update docs/process whenever policy changes + +## Code of Conduct +All participants must follow `CODE_OF_CONDUCT.md`. diff --git a/MAINTAINERS.md b/MAINTAINERS.md new file mode 100644 index 000000000..b3968df15 --- /dev/null +++ b/MAINTAINERS.md @@ -0,0 +1,24 @@ +# Maintainers + +Maintainers are responsible for project direction, release decisions, and final +review of changes. 
+ +Current maintainers: +- Arjun Kondur +- Gabriel Mendoza +- Gaurav Gupta +- Igor Grzankowski +- Jakub Buczak +- Katarzyna Kozioł +- Minjie Qiu +- Patryk Wasielewski +- Qing Wang +- Raizel Lieberman +- Richard Wang +- Sirish Mohan +- Subba Gontla +- Szymon Buczak +- Vivek Reddy +- Yuhan Yang + +If you need help, tag a maintainer in a GitHub issue or PR. diff --git a/Makefile b/Makefile index 170ba70a6..e464c852a 100644 --- a/Makefile +++ b/Makefile @@ -58,6 +58,7 @@ BUNDLE_IMG ?= ${IMAGE_TAG_BASE}-bundle:v${VERSION} # Image URL to use all building/pushing image targets IMG ?= controller:latest +SKAFFOLD_PROFILE ?= dev-kind # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. # Automatically derive the version from go.mod ENVTEST_VERSION := $(shell go list -m -f "{{ .Version }}" sigs.k8s.io/controller-runtime | awk -F'[v.]' '{printf "release-%d.%d", $$2, $$3}') @@ -140,6 +141,57 @@ vet: setup/ginkgo ## Run go vet against code. test: manifests generate fmt vet setup-envtest ## Run tests. 
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use ${ENVTEST_K8S_VERSION} --bin-dir $(LOCALBIN) -p path)" ginkgo --junit-report=unit_test.xml --output-dir=`pwd` -vv --trace --keep-going --timeout=3h --cover --covermode=count --coverprofile=coverage.out ./pkg/splunk/common ./pkg/splunk/enterprise ./pkg/splunk/client ./pkg/splunk/util ./internal/controller ./pkg/splunk/splkcontroller +.PHONY: verify verify-crd verify-bundle +verify: verify-crd ## Verify generated artifacts (set VERIFY_BUNDLE=1 to include bundle) + @if [ "$(VERIFY_BUNDLE)" = "1" ]; then \ + $(MAKE) verify-bundle; \ + else \ + echo "Skipping bundle verify (set VERIFY_BUNDLE=1 to enable)"; \ + fi + +.PHONY: verify-repo +verify-repo: ## Run repository verification script (see scripts/verify_repo.sh) + @./scripts/verify_repo.sh + +.PHONY: doc-first-check commit-discipline-check appframework-parity-check keps-check harness-parity-check constitution-runtime-check start-change +doc-first-check: ## Enforce doc-first governance for changed implementation paths + @./scripts/dev/doc_first_check.sh + +commit-discipline-check: ## Enforce incremental commit discipline on current branch + @./scripts/dev/commit_discipline_check.sh + +appframework-parity-check: ## Enforce AppFramework parity evidence for gated paths + @./scripts/dev/appframework_parity_check.sh + +keps-check: ## Validate impacted components map to referenced approved IDs + @./scripts/dev/keps_check.sh + +harness-parity-check: ## Validate harness-engineering parity matrix structure and evidence + @./scripts/dev/harness_engineering_parity_check.sh + +constitution-runtime-check: ## Validate constitution and runtime issue governance policy + @./scripts/dev/constitution_runtime_policy_check.sh + +start-change: ## Create docs/changes/<date>-<topic>.md from template (use TOPIC=...) 
+ @if [ -z "$(TOPIC)" ]; then \ + echo "Usage: make start-change TOPIC=<short-topic>"; \ + exit 1; \ + fi + @./scripts/dev/start_change.sh "$(TOPIC)" + +.PHONY: skaffold-dev skaffold-smoke +skaffold-dev: ## Run skaffold inner loop (default profile: dev-kind) + @SKAFFOLD_PROFILE=$(SKAFFOLD_PROFILE) ./scripts/dev/skaffold_dev.sh + +skaffold-smoke: ## Run skaffold smoke deploy + rollout check and cleanup + @SKAFFOLD_PROFILE=ci-smoke SKAFFOLD_CLEANUP=1 ./scripts/dev/skaffold_ci_smoke.sh + +verify-crd: ## Regenerate and verify CRD/RBAC outputs + @./scripts/verify_crd.sh + +verify-bundle: ## Regenerate and verify bundle/helm outputs + @./scripts/verify_bundle.sh + ##@ Documentation diff --git a/README.md b/README.md new file mode 100644 index 000000000..d492d7e91 --- /dev/null +++ b/README.md @@ -0,0 +1,174 @@ +# Splunk Operator for Kubernetes + +[![License](https://img.shields.io/:license-apache-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.html) +[![PkgGoDev](https://pkg.go.dev/badge/github.com/splunk/splunk-operator)](https://pkg.go.dev/github.com/splunk/splunk-operator) +[![Go Report Card](https://goreportcard.com/badge/github.com/splunk/splunk-operator)](https://goreportcard.com/report/github.com/splunk/splunk-operator) +[![Coverage Status](https://coveralls.io/repos/github/splunk/splunk-operator/badge.svg?branch=master)](https://coveralls.io/github/splunk/splunk-operator?branch=master) +[![FOSSA Status](https://app.fossa.com/api/projects/git%2Bgithub.com%2Fsplunk%2Fsplunk-operator.svg?type=shield)](https://app.fossa.com/projects/git%2Bgithub.com%2Fsplunk%2Fsplunk-operator?ref=badge_shield) +[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/splunk/splunk-operator) + +The Splunk Operator for Kubernetes (SOK) makes it easy for Splunk +Administrators to deploy and operate Enterprise deployments in a Kubernetes +infrastructure. 
Packaged as a container, it uses the +[operator pattern](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/) +to manage Splunk-specific [custom resources](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/), +following best practices to manage all the underlying Kubernetes objects for you. + +This repository is used to build the Splunk +[Operator](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/) +for Kubernetes (SOK). If you are just looking for documentation on how to +deploy and use the latest release, please see the +[Getting Started Documentation](docs/README.md). + +## Splunk General Terms Acceptance + +Starting with operator version 3.0.0, which includes support for Splunk Enterprise version 10.x, an additional Docker-Splunk specific parameter is required to start containers. **This is a breaking change, and user action is required.** + +Starting in 10.x image versions of Splunk Enterprise, license acceptance requires an additional `SPLUNK_GENERAL_TERMS=--accept-sgt-current-at-splunk-com` argument. This indicates that users have read and accepted the current/latest version of the Splunk General Terms, available at https://www.splunk.com/en_us/legal/splunk-general-terms.html as may be updated from time to time. Unless you have jointly executed with Splunk a negotiated version of these General Terms that explicitly supersedes this agreement, by accessing or using Splunk software, you are agreeing to the Splunk General Terms posted at the time of your access and use and acknowledging its applicability to the Splunk software. Please read and make sure you agree to the Splunk General Terms before you access or use this software. Only after doing so should you include the `--accept-sgt-current-at-splunk-com` flag to indicate your acceptance of the current/latest Splunk General Terms and launch this software. All examples below have been updated with this change. 
+
+If you use the below examples and the `--accept-sgt-current-at-splunk-com` flag, you are indicating that you have read and accepted the current/latest version of the Splunk General Terms, as may be updated from time to time, and acknowledging its applicability to this software - as noted above.
+
+By default, the SPLUNK_GENERAL_TERMS environment variable will be set to an empty string. You must either manually update it to have the required additional value `--accept-sgt-current-at-splunk-com` in the splunk-operator-controller-manager deployment, or you can pass the `SPLUNK_GENERAL_TERMS` parameter with the required additional value to the `make deploy` command.
+
+```
+make deploy IMG=docker.io/splunk/splunk-operator:<tag name> WATCH_NAMESPACE="namespace1" RELATED_IMAGE_SPLUNK_ENTERPRISE="splunk/splunk:edge" SPLUNK_GENERAL_TERMS="--accept-sgt-current-at-splunk-com"
+```
+
+For more information about this change, see the [Splunk General Terms Migration Documentation](docs/SplunkGeneralTermsMigration.md).
+
+## Prerequisites
+
+You must have [Docker Engine](https://docs.docker.com/install/) installed to
+build the Splunk Operator.
+
+This project uses [Go modules](https://blog.golang.org/using-go-modules),
+and requires [golang](https://golang.org/doc/install) 1.23.0 or later.
+You must `export GO111MODULE=on` if cloning these repositories into your
+`$GOPATH` (not recommended).
+
+The [Kubernetes Operator SDK](https://github.com/operator-framework/operator-sdk)
+must also be installed to build this project.
+
+```
+git clone -b v1.31.0 https://github.com/operator-framework/operator-sdk
+cd operator-sdk
+make tidy
+make install
+```
+
+You may need to add `$GOPATH/bin` to your path to run the `operator-sdk`
+command line tool:
+
+```
+export PATH=${PATH}:${GOPATH}/bin
+```
+
+It is also recommended that you install the following golang tools,
+which are used by various `make` targets:
+
+```shell
+go install golang.org/x/lint/golint@latest
+go install golang.org/x/tools/cmd/cover@latest
+go install github.com/mattn/goveralls@latest
+go install github.com/mikefarah/yq/v3@latest
+go install github.com/go-delve/delve/cmd/dlv@latest
+```
+
+## Cloning this repository
+
+```shell
+git clone git@github.com:splunk/splunk-operator.git
+cd splunk-operator
+```
+
+## Repository overview
+
+This repository consists of the following code used to build the splunk-operator binary:
+
+* `main.go`: Provides the main() function, where everything begins
+* `apis/`: Source code for the operator's custom resource definition types
+* `controllers/`: Used to register controllers that watch for changes to custom resources
+* `pkg/splunk/enterprise/`: Source code for controllers that manage Splunk Enterprise resources
+* `pkg/splunk/controller/`: Common code shared across Splunk controllers
+* `pkg/splunk/common/`: Common code used by most other splunk packages
+* `pkg/splunk/client/`: Simple client for Splunk Enterprise REST API
+* `pkg/splunk/test/`: Common code used by other packages for unit testing
+
+`main()` uses `controllers` to register all the `enterprise` controllers
+that manage custom resources by watching for Kubernetes events.
+The `enterprise` controllers are implemented using common code provided
+by the `controllers` package. The `enterprise` controllers also use the REST API client
+provided in the `pkg/splunk/client` package. The types provided by `apis/` and
+common code in the `pkg/splunk/common/` package are used universally. 
Note that the
+source code for `main()` is generated from a template provided by the Operator SDK.
+
+In addition to the source code, this repository includes:
+
+* `tools`: Build scripts, templates, etc. used to build the container image
+* `config`: Kubernetes YAML templates used to install the Splunk Operator
+* `docs`: Getting Started Guide and other documentation in Markdown format
+* `test`: Integration test framework built using Ginkgo. See [docs](test/README.md) for more info.
+
+## Building the operator
+
+You can build the operator by just running `make`.
+
+Other make targets include (more info below):
+
+* `make all`: builds `manager` executable
+* `make test`: Runs unit tests with Coveralls code coverage output to coverage.out
+* `make scorecard`: Runs operator-sdk scorecard tests using OLM installation bundle
+* `make generate`: runs operator-generate k8s, crds and csv commands, updating installation YAML files and OLM bundle
+* `make docker-build`: generates `splunk-operator` container image example `make docker-build IMG=docker.io/splunk/splunk-operator:<tag name>`
+* `make docker-buildx`: generates `splunk-operator` container image for multiple platforms, example `make docker-buildx IMG=docker.io/splunk/splunk-operator:<tag name>`
+* `make docker-push`: push docker image to given repository example `make docker-push IMG=docker.io/splunk/splunk-operator:<tag name>`
+* `make clean`: removes the binary build output and `splunk-operator` container image example `make clean IMG=docker.io/splunk/splunk-operator:<tag name>`
+* `make run`: runs the Splunk Operator locally, monitoring the Kubernetes cluster configured in your current `kubectl` context
+* `make fmt`: runs `go fmt` on all `*.go` source files in this project
+* `make bundle-build`: generates `splunk-operator-bundle` bundle container image for OLM example `make bundle-build IMAGE_TAG_BASE=docker.io/splunk/splunk-operator VERSION=<tag name> IMG=docker.io/splunk/splunk-operator:<tag name>`
+
+* `make bundle-push`: push OLM bundle docker image to given repository example `make bundle-push IMAGE_TAG_BASE=docker.io/splunk/splunk-operator VERSION=<tag name> IMG=docker.io/splunk/splunk-operator:<tag name>`
+* `make catalog-build`: generates `splunk-operator-catalog` catalog container image example `make catalog-build IMAGE_TAG_BASE=docker.io/splunk/splunk-operator VERSION=<tag name> IMG=docker.io/splunk/splunk-operator:<tag name>`
+* `make catalog-push`: push catalog docker image to given repository example `make catalog-push IMAGE_TAG_BASE=docker.io/splunk/splunk-operator VERSION=<tag name> IMG=docker.io/splunk/splunk-operator:<tag name>`
+
+## Agent Harness
+
+For agent-assisted development, see `docs/agent/README.md`.
+Useful scripts include `scripts/verify_crd.sh`, `scripts/verify_bundle.sh`, `scripts/verify_repo.sh`, `scripts/debug_reconcile.sh`, and `scripts/generate_testcase.py`.
+For Skaffold workflows, use `make skaffold-dev` (inner loop) and `make skaffold-smoke` (one-shot smoke deploy).
+For a reproducible local toolchain, use the devcontainer in `.devcontainer/`.
+
+## Deploying the Splunk Operator
+The `make deploy` command will deploy all the necessary resources to run the Splunk Operator, such as RBAC policies, services, configmaps, and the deployment. The operator will be installed in the `splunk-operator` namespace. If the `splunk-operator` namespace does not exist, it will create the namespace. By default `make deploy` will install the operator clusterwide, and the operator will watch all namespaces for any splunk enterprise custom resources.
+ +```shell +make deploy IMG=docker.io/splunk/splunk-operator:<tag name> +``` + +If you want operator for specific namespace then you must pass `WATCH_NAMESPACE` parameter to `make deploy` command + +``` +make deploy IMG=docker.io/splunk/splunk-operator:<tag name> WATCH_NAMESPACE="namespace1" +``` + +If you want operator to use specific version of splunk instance, then you must pass `RELATED_IMAGE_SPLUNK_ENTERPRISE` parameter to `make deploy` command + +``` +make deploy IMG=docker.io/splunk/splunk-operator:<tag name> WATCH_NAMESPACE="namespace1" RELATED_IMAGE_SPLUNK_ENTERPRISE="splunk/splunk:edge" +``` + +Use this to run the operator as a local foreground process on your machine: + +```shell +make run +``` + +This will use your current Kubernetes context from `~/.kube/config` to manage +resources in your current namespace. + +Please see the [Getting Started Documentation](docs/README.md) for more +information, including instructions on how to install the operator in your +cluster. + + +## License +[![FOSSA Status](https://app.fossa.com/api/projects/git%2Bgithub.com%2Fsplunk%2Fsplunk-operator.svg?type=large)](https://app.fossa.com/projects/git%2Bgithub.com%2Fsplunk%2Fsplunk-operator?ref=badge_large) diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 000000000..27b84e37c --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,14 @@ +# Security Policy + +## Reporting a Vulnerability +If you believe you have found a security vulnerability, please report it +privately. Do **not** open a public GitHub issue. + +Contact: `security@splunk.com` + +We will acknowledge your report, assess the impact, and provide a timeline for +remediation as soon as possible. + +## Supported Versions +Security fixes are applied to currently supported releases. See `docs/ChangeLog.md` +for version history. 
diff --git a/SUPPORT.md b/SUPPORT.md new file mode 100644 index 000000000..bffbdfc5e --- /dev/null +++ b/SUPPORT.md @@ -0,0 +1,11 @@ +# Support + +## Community Support +- GitHub Issues: https://github.com/splunk/splunk-operator/issues + +## Commercial Support +If you are a Splunk customer and need formal support, contact Splunk Support +through your normal support channels. + +## Documentation +Project docs live under `docs/` in this repository. diff --git a/api/AGENTS.md b/api/AGENTS.md new file mode 100644 index 000000000..d1ee53850 --- /dev/null +++ b/api/AGENTS.md @@ -0,0 +1,25 @@ +# api/ — CRD Types and Schemas + +## What Lives Here +- CRD Go types (spec/status structs) +- Kubebuilder markers (`+kubebuilder:validation`, `+kubebuilder:default`, etc.) +- JSON tags and OpenAPI schema generation + +## Invariants +- JSON tags must match field names and `omitempty` rules. +- Optional fields should be pointers or `omitempty` where appropriate. +- Status fields must be write-only from controllers. + +## Common Pitfalls +- Forgetting to run generation after type changes. +- Mismatched JSON tag or missing `omitempty`. +- Breaking backward compatibility by removing/renaming fields. 
+ +## Commands +- Regenerate CRDs: `./scripts/verify_crd.sh` +- Full repo verify: `make verify-repo` + +## Notes +If you update types here, you likely need changes in: +- `internal/controller/` for reconciliation +- `docs/` for user-facing updates diff --git a/config/skaffold/kustomization.yaml b/config/skaffold/kustomization.yaml new file mode 100644 index 000000000..a5fc210d9 --- /dev/null +++ b/config/skaffold/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../default + +patches: +- path: manager_patch.yaml + target: + kind: Deployment + name: controller-manager diff --git a/config/skaffold/manager_patch.yaml b/config/skaffold/manager_patch.yaml new file mode 100644 index 000000000..e4d320643 --- /dev/null +++ b/config/skaffold/manager_patch.yaml @@ -0,0 +1,17 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: controller-manager +spec: + template: + spec: + containers: + - name: manager + imagePullPolicy: IfNotPresent + env: + - name: WATCH_NAMESPACE + value: "" + - name: RELATED_IMAGE_SPLUNK_ENTERPRISE + value: splunk/splunk:10.0.0 + - name: SPLUNK_GENERAL_TERMS + value: --accept-sgt-current-at-splunk-com diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 7e3c7531f..42d445a81 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -17,6 +17,7 @@ This document is the single source of truth on contributing towards this codebas - [Contribution Workflow](#contribution-workflow) - [Bug reports and feature requests](#bug-reports-and-feature-requests) - [Fixing issues](#fixing-issues) + - [Spec-First Workflow](#spec-first-workflow) - [Pull requests](#pull-requests) - [Maintainer Workflow for External Contributions](#maintainer-workflow-for-external-contributions) - [Code Review](#code-review) @@ -33,7 +34,7 @@ We only accept pull requests submitted from: * Individuals who have signed the [Splunk Contributor License 
Agreement](https://www.splunk.com/en_us/form/contributions.html) #### Code of Conduct -All contributors are expected to read our [Code of Conduct](contributing/code-of-conduct.md) and observe it in all interactions involving this project. +All contributors are expected to read our [Code of Conduct](../CODE_OF_CONDUCT.md) and observe it in all interactions involving this project. ## Contribution Workflow Help is always welcome! For example, documentation can always use improvement. There's always code that can be clarified, functionality that can be extended, and tests to be added to guarantee behavior. If you see something you think should be fixed, don't be afraid to own it. @@ -53,6 +54,27 @@ We'd also like to hear your feature suggestions. Feel free to submit them as iss #### Fixing issues Look through our [issue tracker](https://github.com/splunk/splunk-operator/issues) to find problems to fix! Feel free to comment and tag corresponding stakeholders or full-time maintainers of this project with any questions or concerns. +#### Spec-First Workflow +For non-trivial changes, start with Spec Kit + KEP-lite before implementation: +1. Bootstrap planning, KEP, and manifest: + ```bash + $ scripts/dev/speckit_bridge.sh bootstrap --change-id CSPL-XXXX --title "your title" + ``` +2. Refine `speckit/specs/<id>-<slug>/spec.md`, `plan.md`, and `tasks.md`. +3. Fill KEP sections and request review. +4. Move KEP status to `Approved` before implementation. +5. Use `risk_tier` and scope in `harness/manifests/*.yaml` as execution policy. +6. Keep KEP in sync and move status to `Implemented` when merged. +7. Add one PR label that matches manifest risk tier (`risk:low|risk:medium|risk:high`). 
+ +The PR harness enforces this with: +```bash +$ scripts/dev/spec_check.sh +$ scripts/dev/harness_manifest_check.sh +$ scripts/dev/risk_policy_check.sh +$ scripts/dev/risk_label_check.sh --labels risk:<tier> +``` + #### Pull requests A pull request informs the project's core developers about the changes you want to review and merge. Once you submit a pull request, it enters a stage of code review where you and others can discuss its potential modifications and add more commits later on. @@ -70,9 +92,21 @@ To make a pull request against this project: # Create your feature/bugfix branch $ git checkout -b your-branch-name develop ``` -1. Run tests to verify your environment. +1. Create or update the governing spec for non-trivial changes. + ```bash + $ scripts/dev/speckit_bridge.sh bootstrap --change-id CSPL-XXXX --title "your title" + ``` +1. Run harness and tests to verify your changes. ``` $ cd splunk-operator + $ scripts/dev/spec_check.sh + $ scripts/dev/harness_manifest_check.sh + $ scripts/dev/risk_policy_check.sh + $ scripts/dev/risk_label_check.sh --labels risk:<tier> + $ scripts/dev/harness_eval.sh --suite docs/agent/evals/policy-regression.yaml + $ scripts/dev/harness_run.sh --fast + $ scripts/dev/pr_check.sh + $ scripts/dev/autonomy_scorecard.sh --base-ref develop $ make test ``` 1. Push your changes once your tests have passed. @@ -148,7 +182,7 @@ A PR is easy to review if you: * Write good commit messages, concise and descriptive. * Break large changes into a logical series of smaller patches. Patches individually make easily understandable changes, and in aggregate, solve a broader issue. -Reviewers are highly encouraged to revisit the [Code of Conduct](contributing/code-of-conduct.md) and must go above and beyond to promote a collaborative, respectful community. +Reviewers are highly encouraged to revisit the [Code of Conduct](../CODE_OF_CONDUCT.md) and must go above and beyond to promote a collaborative, respectful community. 
When reviewing PRs from others, [The Gentle Art of Patch Review](http://sage.thesharps.us/2014/09/01/the-gentle-art-of-patch-review/) suggests an iterative series of focuses, designed to lead new contributors to positive collaboration without inundating them initially with nuances: * Is the idea behind the contribution sound? @@ -167,6 +201,36 @@ Testing is the responsibility of all contributors. To run Unit Tests in Splunk O $ make test ``` +For agent-assisted or standardized local workflows, prefer the scripts under `scripts/dev/`: +- `scripts/dev/speckit_bridge.sh` +- `scripts/dev/spec_check.sh` +- `scripts/dev/harness_manifest_check.sh` +- `scripts/dev/doc_first_check.sh` +- `scripts/dev/commit_discipline_check.sh` +- `scripts/dev/appframework_parity_check.sh` +- `scripts/dev/risk_policy_check.sh` +- `scripts/dev/risk_label_check.sh` +- `scripts/dev/harness_eval.sh` +- `scripts/dev/harness_run.sh` +- `scripts/dev/autonomy_scorecard.sh` +- `scripts/dev/start_change.sh` +- `scripts/dev/skill_lint.sh` +- `scripts/dev/script_sanity_check.sh` +- `scripts/dev/unit.sh` +- `scripts/dev/lint.sh` +- `scripts/dev/skaffold_dev.sh` +- `scripts/dev/skaffold_ci_smoke.sh` +- `scripts/dev/pr_check.sh` + +For a consistent local environment across contributors and agents, open the repo in the devcontainer at `.devcontainer/`. + +#### Agentic Development Workflow +This repo includes skills and harness scripts to make common workflows repeatable. +Start with: +- `AGENTS.md` (root) for repo map and conventions +- `.agents/skills/*/SKILL.md` for skill workflows +- `docs/agent/README.md` for agent-focused guidance + #### Documentation We can always use improvements to our documentation! Anyone can contribute to these docs, whether you identify as a developer, an end user, or someone who just can’t stand seeing typos. What exactly is needed? 
@@ -197,11 +261,7 @@ If you need help, tag one of the active maintainers of this project in a post or () Gaurav Gupta () Subba Gontla () Arjun Kondur -() Kriti Ashok -() Param Dhanoya -() Victor Ebken -() Ajeet Kumar -() Jeff Rybczynski -() Patrick Ogdin + + ``` diff --git a/docs/agent/APPFRAMEWORK_PARITY.md b/docs/agent/APPFRAMEWORK_PARITY.md new file mode 100644 index 000000000..4cd1d8ac8 --- /dev/null +++ b/docs/agent/APPFRAMEWORK_PARITY.md @@ -0,0 +1,23 @@ +# App Framework Parity Governance + +This gate ensures appframework-related implementation changes include parity evidence. + +## Trigger Paths +- `pkg/splunk/enterprise/afwscheduler.go` +- `pkg/splunk/enterprise/afwscheduler_test.go` +- `test/testenv/appframework_utils.go` +- `test/appframework_*/**` +- `config/samples/*appframework*` +- `docs/AppFramework.md` + +## Required Evidence +When trigger paths change in a PR: +- Update at least one parity doc in the same diff: + - `docs/AppFramework.md` or + - `docs/agent/APPFRAMEWORK_PARITY.md` +- Include appframework test updates in the same diff. +- Run and report: + - `scripts/dev/appframework_parity_check.sh` + +## Gate +- `scripts/dev/appframework_parity_check.sh` diff --git a/docs/agent/BRANCH_POLICY.md b/docs/agent/BRANCH_POLICY.md new file mode 100644 index 000000000..508487eae --- /dev/null +++ b/docs/agent/BRANCH_POLICY.md @@ -0,0 +1,55 @@ +# Branch and Merge Queue Policy + +This document defines repository settings needed to enforce harness governance. + +## Protected Branches +Apply to `develop` and `main`: +- Require pull request before merge. +- Require approvals (`2` recommended baseline). +- Require review from Code Owners. +- Dismiss stale approvals on new commits. +- Require conversation resolution. +- Require linear history (recommended). 
+ +## Required Status Checks +Require these checks before merge: +- `pr-check` +- `check-formating` +- `unit-tests` +- `Analyze (go)` +- `Analyze (python)` +- `ContributorLicenseAgreement` +- `CodeOfConduct` +- `Semgrep Scanner` +- `FOSSA-scanner` + +## Required Labels +For non-trivial PRs with manifest changes, include exactly one risk label: +- `risk:low` +- `risk:medium` +- `risk:high` + +`scripts/dev/risk_label_check.sh` validates that PR label matches manifest +`risk_tier`. + +## Merge Queue +Enable merge queue on protected branches and require: +- `.github/workflows/merge-queue-check.yml` job `merge-queue-pr-check` +- Up-to-date branch before merging + +## Risk-Tier Rules +Risk tier is declared in harness manifest and validated by +`scripts/dev/risk_policy_check.sh`. + +- `low`: + - `human_approvals_required >= 0` + - `auto_merge_allowed` may be `true` +- `medium`: + - `human_approvals_required >= 1` + - `merge_queue_required: true` + - `auto_merge_allowed: false` +- `high`: + - `human_approvals_required >= 2` + - `merge_queue_required: true` + - `auto_merge_allowed: false` + - command list includes deeper validation (unit/integration) diff --git a/docs/agent/COMMIT_DISCIPLINE.md b/docs/agent/COMMIT_DISCIPLINE.md new file mode 100644 index 000000000..31cbac116 --- /dev/null +++ b/docs/agent/COMMIT_DISCIPLINE.md @@ -0,0 +1,15 @@ +# Commit Discipline + +For implementation-heavy changes, prefer multiple reviewable commits instead of one monolithic commit. + +## Gate +- `scripts/dev/commit_discipline_check.sh` + +## Rules +- Minimum `2` non-merge commits for implementation-gated paths. +- Single-commit exceptions require `[single-commit-ok]` in commit message with justification. +- Oversized commits fail when changed file count exceeds configured threshold. + +## Why +- Improves code review quality. +- Helps isolate regressions and revert safely. 
diff --git a/docs/agent/CRD_MAP.md b/docs/agent/CRD_MAP.md new file mode 100644 index 000000000..8b8f5d0d9 --- /dev/null +++ b/docs/agent/CRD_MAP.md @@ -0,0 +1,25 @@ +# CRD Map + +`PROJECT` is the source of truth for kind and API version mapping. +This map is optimized for fast navigation in the codebase. + +## Primary Controller Map + +| Kind | API Version | Type File | Controller | Enterprise Logic | +| --- | --- | --- | --- | --- | +| Standalone | v4 | `api/v4/standalone_types.go` | `internal/controller/standalone_controller.go` | `pkg/splunk/enterprise/standalone.go` | +| IndexerCluster | v4 | `api/v4/indexercluster_types.go` | `internal/controller/indexercluster_controller.go` | `pkg/splunk/enterprise/indexercluster.go` | +| SearchHeadCluster | v4 | `api/v4/searchheadcluster_types.go` | `internal/controller/searchheadcluster_controller.go` | `pkg/splunk/enterprise/searchheadcluster.go` | +| ClusterManager | v4 | `api/v4/clustermanager_types.go` | `internal/controller/clustermanager_controller.go` | `pkg/splunk/enterprise/clustermanager.go` | +| LicenseManager | v4 | `api/v4/licensemanager_types.go` | `internal/controller/licensemanager_controller.go` | `pkg/splunk/enterprise/licensemanager.go` | +| MonitoringConsole | v4 | `api/v4/monitoringconsole_types.go` | `internal/controller/monitoringconsole_controller.go` | `pkg/splunk/enterprise/monitoringconsole.go` | +| ClusterMaster (legacy) | v3 | `api/v3/clustermaster_types.go` | `internal/controller/clustermaster_controller.go` | `pkg/splunk/enterprise/clustermaster.go` | +| LicenseMaster (legacy) | v3 | `api/v3/licensemaster_types.go` | `internal/controller/licensemaster_controller.go` | `pkg/splunk/enterprise/licensemaster.go` | + +## Shared Types +- Common spec fields and phases live in `api/v4/common_types.go`. +- Legacy common types live in `api/v3/common_types.go`. 
+ +## Docs Pointers +- Spec field reference: `docs/CustomResources.md` +- Example manifests: `docs/Examples.md` diff --git a/docs/agent/DOC_FIRST_WORKFLOW.md b/docs/agent/DOC_FIRST_WORKFLOW.md new file mode 100644 index 000000000..133d58649 --- /dev/null +++ b/docs/agent/DOC_FIRST_WORKFLOW.md @@ -0,0 +1,22 @@ +# Doc-First Workflow + +For implementation-gated changes, this repository requires a change-intent file under `docs/changes/`. + +## Workflow +1. Create a change-intent file: + - `scripts/dev/start_change.sh "<topic>"` +2. Fill all required sections. +3. Implement code/tests. +4. Run governance gates and include evidence in PR. + +## Gate +- `scripts/dev/doc_first_check.sh` + +## Required Sections +- `## Intent` +- `## Scope` +- `## Constitution Impact` +- `## Harness Coverage Plan` +- `## Test Plan` +- `## Runtime Issue Tracker Review` +- `## Implementation Log` diff --git a/docs/agent/HARNESS_CONTRACT.md b/docs/agent/HARNESS_CONTRACT.md new file mode 100644 index 000000000..dc0113536 --- /dev/null +++ b/docs/agent/HARNESS_CONTRACT.md @@ -0,0 +1,63 @@ +# Harness Contract + +This document defines the minimum harness contract for agent-driven changes. 
+ +## Inputs +- Governing KEP in `docs/specs/` +- Harness manifest in `harness/manifests/` +- Code change set +- Target branch and CI context + +## Required Local Gates +- `scripts/dev/spec_check.sh` +- `scripts/dev/harness_manifest_check.sh` +- `scripts/dev/doc_first_check.sh` +- `scripts/dev/commit_discipline_check.sh` +- `scripts/dev/appframework_parity_check.sh` +- `scripts/dev/keps_check.sh` +- `scripts/dev/harness_engineering_parity_check.sh` +- `scripts/dev/constitution_runtime_policy_check.sh` +- `scripts/dev/risk_policy_check.sh` +- `scripts/dev/risk_label_check.sh --labels risk:<tier>` (optional local, required in PR CI) +- `scripts/dev/harness_eval.sh --suite docs/agent/evals/policy-regression.yaml` +- `scripts/dev/skill_lint.sh` +- `scripts/dev/script_sanity_check.sh` +- `scripts/dev/pr_check.sh` + +## Required CI Gate +- `.github/workflows/pr-check.yml` job `pr-check` +- `.github/workflows/merge-queue-check.yml` job `merge-queue-pr-check` +- `.github/workflows/autonomy-scorecard.yml` job `autonomy-scorecard` (reporting) + +## Output Contract +Every implementation PR should report: +- governing KEP path and status +- harness manifest path +- risk tier and delivery mode +- changed files summary +- commands run +- results and known risks + +## Scope Contract +- Non-trivial implementation PRs must include a changed manifest under + `harness/manifests/`. +- The manifest must define `allowed_paths`, `forbidden_paths`, and + `required_commands`. +- Changed files must satisfy the manifest scope policy. + +## Runtime Audit Contract +- Harness runs should emit artifacts under `.harness/runs/<timestamp>-<sha>/`: + - step logs + - `trace.tsv` + - `summary.txt` + +## Failure Policy +- If `spec_check.sh` fails, the change is not merge-ready. +- If `harness_manifest_check.sh` fails, the change is not merge-ready. +- If `keps_check.sh` fails, map impacted components to referenced approved IDs. 
+- If `harness_engineering_parity_check.sh` fails, update parity matrix status/evidence. +- If `constitution_runtime_policy_check.sh` fails, update constitution/runtime policy evidence. +- If `risk_policy_check.sh` fails, the change is not merge-ready. +- If `risk_label_check.sh` fails in CI, the PR risk label must be corrected. +- If `harness_eval.sh` fails, governance regressions must be fixed first. +- If harness checks fail, fix the implementation or update the spec and tests. diff --git a/docs/agent/HARNESS_MANIFEST.md b/docs/agent/HARNESS_MANIFEST.md new file mode 100644 index 000000000..6e0601e7b --- /dev/null +++ b/docs/agent/HARNESS_MANIFEST.md @@ -0,0 +1,72 @@ +# Harness Manifest Format + +`harness/manifests/*.yaml` defines machine-readable governance for one +implementation change. + +## Required Fields + +```yaml +version: v1 +change_id: CSPL-0000 +title: Short title +spec_file: docs/specs/CSPL-0000-example.md +owner: "@splunk/splunk-operator-for-kubernetes" +delivery_mode: agent +risk_tier: medium +human_approvals_required: 1 +auto_merge_allowed: false +merge_queue_required: true +evaluation_suite: docs/agent/evals/policy-regression.yaml +allowed_paths: + - api/** + - internal/** +forbidden_paths: + - vendor/** +required_commands: + - scripts/dev/spec_check.sh + - scripts/dev/doc_first_check.sh + - scripts/dev/commit_discipline_check.sh + - scripts/dev/appframework_parity_check.sh + - scripts/dev/keps_check.sh + - scripts/dev/harness_engineering_parity_check.sh + - scripts/dev/constitution_runtime_policy_check.sh + - scripts/dev/risk_policy_check.sh + - scripts/dev/risk_label_check.sh --labels risk:<tier> + - scripts/dev/skill_lint.sh + - scripts/dev/script_sanity_check.sh + - scripts/dev/pr_check.sh --fast +``` + +## Field Notes +- `version`: currently `v1`. +- `change_id`: Jira issue or GitHub issue identifier. +- `spec_file`: path to governing KEP in `docs/specs/`. +- `owner`: owning team or maintainer group. 
+- `delivery_mode`: one of `agent`, `hybrid`, `human`. +- `risk_tier`: one of `low`, `medium`, `high`. +- `human_approvals_required`: minimum human approvals expected by policy. +- `auto_merge_allowed`: indicates whether branch policy may auto-merge. +- `merge_queue_required`: indicates whether merge queue is required. +- `evaluation_suite`: replayable eval corpus path. +- `allowed_paths`: glob patterns that changed files are allowed to touch. +- `forbidden_paths`: glob patterns that changed files must not touch. +- `required_commands`: minimum commands expected for this change. + +## Validation Rules +`scripts/dev/harness_manifest_check.sh` enforces: +- non-trivial code changes require a changed manifest +- referenced `spec_file` exists and has status `Approved` or `Implemented` +- risk and delivery fields exist +- `allowed_paths`, `forbidden_paths`, and `required_commands` exist +- `required_commands` include `scripts/dev/spec_check.sh`, `scripts/dev/risk_policy_check.sh`, and `scripts/dev/pr_check.sh` +- `required_commands` should include `scripts/dev/doc_first_check.sh`, `scripts/dev/commit_discipline_check.sh`, `scripts/dev/appframework_parity_check.sh`, `scripts/dev/keps_check.sh`, `scripts/dev/harness_engineering_parity_check.sh`, and `scripts/dev/constitution_runtime_policy_check.sh` for governance parity +- changed files satisfy manifest scope policy + +`scripts/dev/risk_policy_check.sh` enforces: +- risk-tier value constraints +- minimum approval and merge-queue policy by tier +- deeper required command expectations for medium/high risk changes + +`scripts/dev/risk_label_check.sh` enforces: +- exactly one PR risk label (`risk:low|risk:medium|risk:high`) +- label tier must match manifest `risk_tier` diff --git a/docs/agent/OPERATIONS.md b/docs/agent/OPERATIONS.md new file mode 100644 index 000000000..bafe85f82 --- /dev/null +++ b/docs/agent/OPERATIONS.md @@ -0,0 +1,24 @@ +# Operations and Debug + +## Quick Capture Commands +`kubectl get <kind> <name> -n 
<ns> -o yaml` +`kubectl describe <kind> <name> -n <ns>` +`kubectl get events -n <ns> --sort-by=.lastTimestamp` +`kubectl logs -n splunk-operator deploy/splunk-operator-controller-manager -c manager --since=30m` + +## Skaffold Workflows +`scripts/dev/skaffold_dev.sh` +`scripts/dev/skaffold_ci_smoke.sh` +`skaffold run -p ci-smoke` + +## Operator Metrics and pprof +The operator registers pprof handlers on the metrics server when `--pprof=true` (default). +Default metrics bind address is `:8080` with `--metrics-secure=false`. + +Example local access: +`kubectl -n splunk-operator port-forward deploy/splunk-operator-controller-manager 8080:8080` +Then open `/debug/pprof` on `http://127.0.0.1:8080`. + +## Where Debug Endpoints Are Wired +- Registration: `internal/controller/debug/register.go` +- Flags and setup: `cmd/main.go` diff --git a/docs/agent/README.md b/docs/agent/README.md new file mode 100644 index 000000000..881d13ab9 --- /dev/null +++ b/docs/agent/README.md @@ -0,0 +1,40 @@ +# Agent Harness Docs + +These documents are the system of record for agent-assisted development in this repo. +They are short, concrete, and intended to be read by Codex skills and humans. + +KEP-first governance lives in `docs/specs/`. For non-trivial implementation +changes, agents must link an approved KEP via `harness/manifests/*.yaml` and +pass harness checks. 
+ +## Index +- `../specs/` contains KEP-lite design docs and lifecycle state +- `../changes/` contains doc-first change-intent records for implementation work +- `CRD_MAP.md` maps kinds to API versions, types, controllers, and enterprise logic files +- `RECONCILE_FLOW.md` outlines the reconciliation flow, gates, and status phases +- `TEST_MATRIX.md` lists unit and integration test paths and environment variables +- `TESTCASE_SPEC.yaml` is a template for generating new integration/KUTTL tests +- `TESTCASE_PATTERNS.md` maps SVA patterns and features to test helpers +- `OPERATIONS.md` provides debug commands, log access, and pprof access notes +- `../../skaffold.yaml` defines shared local/CI deployment workflows +- `../../.devcontainer/` defines the reproducible local/agent development environment +- `RELEASE_FLOW.md` provides a concise release checklist and artifact map +- `HARNESS_CONTRACT.md` defines required harness gates and output contract +- `HARNESS_MANIFEST.md` defines the machine-readable harness manifest format +- `DOC_FIRST_WORKFLOW.md` defines required change-intent sections and flow +- `COMMIT_DISCIPLINE.md` defines commit slicing rules for implementation changes +- `APPFRAMEWORK_PARITY.md` defines appframework parity evidence rules +- `../specs/COMPONENT_KEP_INDEX.md` maps components to approved KEP/change IDs +- `../engineering/HARNESS_ENGINEERING_PARITY.md` tracks harness-engineering parity status and evidence +- `../engineering/CONSTITUTION.md` defines always-on engineering rules +- `../testing/RUNTIME_ISSUE_TRACKER.md` tracks runtime issue lifecycle with evidence +- `SKILL_ALIGNMENT.md` defines the shared skill contract and splcore-to-SOK mapping +- `SPECKIT_KEP_BRIDGE.md` explains Spec Kit to KEP to harness-manifest mapping +- `BRANCH_POLICY.md` defines branch protection and merge-queue requirements +- `evals/policy-regression.yaml` is the replayable governance regression corpus +- `.github/workflows/autonomy-scorecard.yml` publishes autonomy score 
reports +- `scripts/dev/autonomy_scorecard.sh` generates score metrics for current diff +- `scripts/dev/risk_label_check.sh` validates PR risk label versus manifest tier +- `scripts/dev/keps_check.sh` validates component-to-KEP mapping in changed scope +- `scripts/dev/harness_engineering_parity_check.sh` validates parity matrix status/evidence quality +- `scripts/dev/constitution_runtime_policy_check.sh` enforces constitution + runtime issue governance diff --git a/docs/agent/RECONCILE_FLOW.md b/docs/agent/RECONCILE_FLOW.md new file mode 100644 index 000000000..5899c3bd3 --- /dev/null +++ b/docs/agent/RECONCILE_FLOW.md @@ -0,0 +1,27 @@ +# Reconcile Flow + +This document describes the typical reconcile flow and the most common gates. + +## Control Flow +1. Controller `Reconcile` fetches the CR instance. +2. Paused annotation check may short-circuit and requeue. +3. Apply function in `pkg/splunk/enterprise/*` builds desired state. +4. Status is updated and the controller returns a requeue or completion. + +## Common Gates +- Paused annotation in `api/v4/*_types.go` or `api/v3/*_types.go`. +- Phase gating constants in `api/v4/common_types.go`. +- Predicate filtering in `internal/controller/common/predicate.go`. + +## Phases +`Pending`, `Ready`, `Updating`, `ScalingUp`, `ScalingDown`, `Terminating`, `Error`. + +## Where To Look First +- Controller entry: `internal/controller/<kind>_controller.go`. +- Apply logic: `pkg/splunk/enterprise/<kind>.go`. +- Shared helpers: `pkg/splunk/enterprise/util.go` and `pkg/splunk/common/*`. + +## Debug Checklist +- Confirm CR spec and status (`kubectl get -o yaml`). +- Check events (`kubectl get events -n <ns> --sort-by=.lastTimestamp`). +- Inspect operator logs for the CR name. 
diff --git a/docs/agent/RELEASE_FLOW.md b/docs/agent/RELEASE_FLOW.md new file mode 100644 index 000000000..c7c5db39a --- /dev/null +++ b/docs/agent/RELEASE_FLOW.md @@ -0,0 +1,34 @@ +# Release Flow + +This is a concise, repo-specific release flow intended for humans and Codex skills. + +## Inputs +- Release version (update `VERSION` in `Makefile` or set `VERSION=<x.y.z>` in the environment). +- Target Splunk Enterprise compatibility (update docs and release notes accordingly). + +## Core Steps +1. Update `docs/ChangeLog.md` and any release notes. +2. Update compatibility notes in `docs/README.md` and `docs/SplunkOperatorUpgrade.md` as needed. +3. If CRDs changed, run `make generate` and `make manifests` (or `./scripts/verify_crd.sh`). +4. If bundle/CSV outputs are needed, run `make bundle` (or `./scripts/verify_bundle.sh`). +5. Run `make verify VERIFY_BUNDLE=1` to ensure generated outputs are consistent. +6. Run unit tests (`make test`) and any required integration tests. +7. Build/push images and bundle artifacts as required by release packaging. 
+ +## Artifacts to Inspect +- CRDs: `config/crd/bases/` +- RBAC: `config/rbac/role.yaml` +- Bundle/CSV: `bundle/manifests/`, `bundle/manifests/splunk-operator.clusterserviceversion.yaml` +- Helm CRDs: `helm-chart/splunk-operator/crds` + +## Common Commands +- `make verify VERIFY_BUNDLE=1` +- `make bundle` +- `make bundle-build` and `make bundle-push` +- `make catalog-build` and `make catalog-push` + +## Docs to Update +- `docs/ChangeLog.md` +- `docs/README.md` +- `docs/SplunkOperatorUpgrade.md` +- `docs/Install.md` (if install defaults or requirements changed) diff --git a/docs/agent/SKILL_ALIGNMENT.md b/docs/agent/SKILL_ALIGNMENT.md new file mode 100644 index 000000000..e4a2e68ff --- /dev/null +++ b/docs/agent/SKILL_ALIGNMENT.md @@ -0,0 +1,54 @@ +# Skill Alignment (splcore <-> SOK) + +This document standardizes how Codex skills are written in this repository so +contributors can move between `~/main` (splcore) and `splunk-operator` with the +same execution model. + +## Shared Skill Contract + +Every `SKILL.md` under `.agents/skills/` must include: +- YAML frontmatter with `name` and `description` +- `## Overview` +- `## Preconditions` +- `## Workflow` +- `## Pass / Fail Criteria` +- `## Output Contract` + +This mirrors the practical structure used in splcore skills: +- predictable start (`Preconditions`) +- deterministic execution (`Workflow`) +- explicit completion signal (`Pass / Fail Criteria`) +- consistent reporting (`Output Contract`) + +The contract is enforced by `scripts/dev/skill_lint.sh` and included in +`scripts/dev/pr_check.sh`. + +## Naming Alignment + +SOK keeps `sok-*` prefixes to avoid collisions with globally installed skills, +but follows splcore conventions for base developer workflows. 
+ +| splcore pattern | SOK equivalent | Purpose | +| --- | --- | --- | +| `splcore-prerequisites` | `sok-prerequisites` | Validate local tooling and repo prerequisites | +| `splcore-build` | `sok-build` | Build operator artifacts/images | +| `splcore-test` | `sok-test` | Run standard local test flows | +| `splcore-commit-mr` | `sok-commit-pr` | Commit/push and create draft PR workflow | + +Domain-specific SOK skills remain separate (for example: +`sok-feature-scaffold`, `sok-reconcile-debugger`, `sok-testcase-builder`). + +## Output Contract (Standard) + +All skills should end with the same reporting block: +- Changed files +- Commands run +- Results +- PR-ready summary + +## Review Policy + +When adding or updating skills: +- keep the skill body concise and task-focused +- move deep reference material to `references/` or docs under `docs/agent/` +- prefer deterministic scripts under `scripts/dev/` for repeated commands diff --git a/docs/agent/SPECKIT_KEP_BRIDGE.md b/docs/agent/SPECKIT_KEP_BRIDGE.md new file mode 100644 index 000000000..f70067995 --- /dev/null +++ b/docs/agent/SPECKIT_KEP_BRIDGE.md @@ -0,0 +1,48 @@ +# Spec Kit to KEP Bridge + +This bridge keeps planning (`speckit/`), design governance (`docs/specs/`), +and execution policy (`harness/manifests/`) synchronized. + +## Bootstrap +Run: + +```bash +scripts/dev/speckit_bridge.sh bootstrap \ + --change-id CSPL-5000 \ + --title "Indexer rollout validation improvements" \ + --risk-tier medium \ + --delivery-mode agent +``` + +Generated artifacts: +- `speckit/specs/CSPL-5000-.../spec.md` +- `speckit/specs/CSPL-5000-.../plan.md` +- `speckit/specs/CSPL-5000-.../tasks.md` +- `docs/specs/CSPL-5000-....md` +- `harness/manifests/CSPL-5000-....yaml` + +## Required Progression +1. Refine Spec Kit docs and KEP draft. +2. Review and move KEP to `Status: Approved`. +3. Execute implementation scoped by manifest `allowed_paths`. +4. 
Run harness checks: + - `scripts/dev/spec_check.sh` + - `scripts/dev/harness_manifest_check.sh` + - `scripts/dev/doc_first_check.sh` + - `scripts/dev/commit_discipline_check.sh` + - `scripts/dev/appframework_parity_check.sh` + - `scripts/dev/keps_check.sh` + - `scripts/dev/harness_engineering_parity_check.sh` + - `scripts/dev/constitution_runtime_policy_check.sh` + - `scripts/dev/risk_policy_check.sh` + - `scripts/dev/risk_label_check.sh --labels risk:<tier>` + - `scripts/dev/harness_eval.sh --suite docs/agent/evals/policy-regression.yaml` + - `scripts/dev/harness_run.sh --fast` + - `scripts/dev/pr_check.sh --fast` + +## Output Contract +Each implementation PR should include: +- Spec Kit path +- KEP path and status +- Manifest path and risk tier +- Harness command results diff --git a/docs/agent/TESTCASE_PATTERNS.md b/docs/agent/TESTCASE_PATTERNS.md new file mode 100644 index 000000000..e2df1acf7 --- /dev/null +++ b/docs/agent/TESTCASE_PATTERNS.md @@ -0,0 +1,53 @@ +# Testcase Patterns + +This doc maps common Splunk Validated Architectures (SVA) and features to test helpers. + +## Integration Helpers (test/testenv/deployment.go) + +S1 (Standalone) +- `DeployStandalone` (basic) +- `DeployStandaloneWithGivenSmartStoreSpec` (smartstore) + +C3 (Single-site cluster, SHC optional) +- `DeploySingleSiteCluster` (basic) +- `DeploySingleSiteClusterWithGivenAppFrameworkSpec` (app framework) +- SmartStore for C3 requires manual flow: use `DeployClusterManagerWithSmartStoreIndexes` plus `DeployIndexerCluster` and optional SHC. 
+ +M4 (Multisite with SHC) +- `DeployMultisiteClusterWithSearchHead` (basic) +- `DeployMultisiteClusterWithSearchHeadAndAppFramework` (app framework) +- `DeployMultisiteClusterWithSearchHeadAndIndexes` (smartstore) + +M1 (Multisite, no SHC) +- `DeployMultisiteCluster` (basic) +- For app framework, use `DeployMultisiteClusterWithSearchHeadAndAppFramework` with `shc=false` +- SmartStore without SHC requires manual flow or a custom helper + +## Readiness Helpers (test/testenv/verificationutils.go) +- `StandaloneReady` +- `ClusterManagerReady` or `LegacyClusterManagerReady` +- `SearchHeadClusterReady` +- `SingleSiteIndexersReady` or `IndexersReady` +- `IndexerClusterMultisiteStatus` +- `VerifyRFSFMet` +- `LicenseManagerReady` or `LegacyLicenseManagerReady` +- `VerifyMonitoringConsoleReady` + +## SVA Validation (Integration) +Use `validations` in `docs/agent/TESTCASE_SPEC.yaml` to auto-add readiness checks. +- For C3 SVA: `validations.sva: C3` (adds Monitoring Console + License Manager checks unless disabled) +- Ensure a license file/configmap is configured when enabling License Manager checks + +## App Framework Helpers +- `GenerateAppFrameworkSpec` (test/testenv/appframework_utils.go) +- `DeploySingleSiteClusterWithGivenAppFrameworkSpec` +- `DeployMultisiteClusterWithSearchHeadAndAppFramework` + +## SmartStore Helpers +- `DeployStandaloneWithGivenSmartStoreSpec` +- `DeployClusterManagerWithSmartStoreIndexes` +- `DeployMultisiteClusterWithSearchHeadAndIndexes` + +## Operator Upgrade (KUTTL) +- Use `upgrade` in `docs/agent/TESTCASE_SPEC.yaml` to generate helm install/upgrade steps. +- Example suite: `kuttl/tests/upgrade/c3-with-operator` (checks operator deployment and image). 
diff --git a/docs/agent/TESTCASE_SPEC.yaml b/docs/agent/TESTCASE_SPEC.yaml new file mode 100644 index 000000000..43ad7ea9f --- /dev/null +++ b/docs/agent/TESTCASE_SPEC.yaml @@ -0,0 +1,89 @@ +# Testcase specification template +# Save a copy and pass it to scripts/generate_testcase.py --spec <path> + +# Test type: kuttl or integration +# - kuttl: generates kuttl/tests/<suite>/<name>/... with deploy/assert steps +# - integration: generates test/<suite>/<name>_test.go scaffold + +type: kuttl +suite: smoke +name: c3-basic-ready + +architecture: + # Supported: S1, C3, M4, M1 (used by integration scaffolds) + name: C3 + indexerReplicas: 3 + siteCount: 1 + shc: true + # Use legacy v3 control-plane readiness helpers + useLegacyClusterManager: false + +features: + smartstore: false + appframework: false + +# Optional: extra readiness validations for integration tests +# For C3 SVA, set `sva: C3` to auto-enable Monitoring Console + License Manager checks +# (unless you explicitly set them to false). +validations: + sva: "" # e.g., C3 + monitoringConsole: false + monitoringConsoleName: "" # optional override (defaults to deployment name) + licenseManager: false # requires a license file/configmap to be configured + +# Optional: operator upgrade flow (KUTTL only) +# When enabled, generator will create: +# 00-install.yaml (helm install) +# 01-assert-operator-ready.yaml +# 02-upgrade.yaml (helm upgrade) +# 03-assert-operator-image.yaml +# 04-assert.yaml (CR readiness/resource asserts) +upgrade: + enabled: false + method: helm + helmRelease: splunk-test + helmChartPathEnv: HELM_REPO_PATH + namespaceEnv: NAMESPACE + valuesFile: "" # optional values file to copy into test dir + operatorImageEnv: KUTTL_SPLUNK_OPERATOR_IMAGE + operatorImageNewEnv: KUTTL_SPLUNK_OPERATOR_NEW_IMAGE + enterpriseImageEnv: KUTTL_SPLUNK_ENTERPRISE_IMAGE + enterpriseImageNewEnv: KUTTL_SPLUNK_ENTERPRISE_NEW_IMAGE + extraHelmArgs: [] + +# For a single-CR test, use `cr`. 
For multi-CR (C3/M4), use `crs` in deploy order. +# cr: +# path: ./path/to/your-cr.yaml +# apiVersion: enterprise.splunk.com/v4 +# kind: Standalone +# name: example-standalone + +crs: + - path: ./path/to/clustermanager.yaml + apiVersion: enterprise.splunk.com/v4 + kind: ClusterManager + name: test + - path: ./path/to/indexercluster.yaml + apiVersion: enterprise.splunk.com/v4 + kind: IndexerCluster + name: test + - path: ./path/to/searchheadcluster.yaml + apiVersion: enterprise.splunk.com/v4 + kind: SearchHeadCluster + name: test + +expected: + phase: Ready + # Optional: path to extra KUTTL asserts (resources, fields) appended after phase checks + assert_path: "" + +# Optional: additional resources for KUTTL assertion generation (if assert_path is empty) +resources: + - apiVersion: apps/v1 + kind: StatefulSet + name: splunk-test-cluster-manager + status: + replicas: 1 + - apiVersion: v1 + kind: Secret + name: splunk-test-cluster-manager-secret-v1 diff --git a/docs/agent/TEST_MATRIX.md b/docs/agent/TEST_MATRIX.md new file mode 100644 index 000000000..61335dc86 --- /dev/null +++ b/docs/agent/TEST_MATRIX.md @@ -0,0 +1,33 @@ +# Test Matrix + +## Unit Tests +- Command: `make test` +- Scope: `./pkg/splunk/common`, `./pkg/splunk/enterprise`, `./internal/controller`, and related packages. + +## Integration Tests (Kind) +- Create cluster: `make cluster-up` +- Run tests: `make int-test` +- Teardown: `make cluster-down` + +## KUTTL Tests +- Suite config: `kuttl/kuttl-test.yaml` +- Tests live under: `kuttl/tests/` +- Example run (if kuttl is installed): `kubectl kuttl test --config kuttl/kuttl-test.yaml` +- Upgrade suite: `kubectl kuttl test --config kuttl/kuttl-test-helm-upgrade.yaml` + +## Patterns +- Use `docs/agent/TESTCASE_PATTERNS.md` for SVA helper mapping. 
+ +## Environment Variables (from `test/env.sh`) +- `SPLUNK_OPERATOR_IMAGE` default `splunk/splunk-operator:latest` +- `SPLUNK_ENTERPRISE_IMAGE` default `splunk/splunk:latest` +- `CLUSTER_PROVIDER` default `kind` +- `PRIVATE_REGISTRY` default `localhost:5000` for kind +- `TEST_REGEX` or `TEST_FOCUS` to filter tests +- `SKIP_REGEX` to skip tests +- `CLUSTER_WIDE` to install operator cluster-wide +- `DEPLOYMENT_TYPE` set to `manifest` or `helm` + +## Targeted Test Runs +- Run a single suite: `cd test/<suite> && ginkgo -v -progress ...` +- Default focus is `smoke` when `TEST_REGEX` is not set. diff --git a/docs/agent/evals/policy-regression.yaml b/docs/agent/evals/policy-regression.yaml new file mode 100644 index 000000000..ab15683b9 --- /dev/null +++ b/docs/agent/evals/policy-regression.yaml @@ -0,0 +1,114 @@ +version: v1 +name: harness-policy-regression +cases: + - id: kep_template_summary + file: docs/specs/SPEC_TEMPLATE.md + pattern: "## Summary" + - id: kep_template_graduation + file: docs/specs/SPEC_TEMPLATE.md + pattern: "## Graduation Criteria" + - id: spec_readme_kep_policy + file: docs/specs/README.md + pattern: "# KEP-Lite Workflow" + - id: pr_check_manifest_gate + file: scripts/dev/pr_check.sh + pattern: "harness_manifest_check.sh" + - id: pr_check_doc_first_gate + file: scripts/dev/pr_check.sh + pattern: "doc_first_check.sh" + - id: pr_check_commit_discipline_gate + file: scripts/dev/pr_check.sh + pattern: "commit_discipline_check.sh" + - id: pr_check_appframework_parity_gate + file: scripts/dev/pr_check.sh + pattern: "appframework_parity_check.sh" + - id: pr_check_keps_gate + file: scripts/dev/pr_check.sh + pattern: "keps_check.sh" + - id: pr_check_harness_parity_gate + file: scripts/dev/pr_check.sh + pattern: "harness_engineering_parity_check.sh" + - id: pr_check_constitution_runtime_gate + file: scripts/dev/pr_check.sh + pattern: "constitution_runtime_policy_check.sh" + - id: pr_check_eval_gate + file: scripts/dev/pr_check.sh + pattern: 
"harness_eval.sh" + - id: pr_check_skill_lint_gate + file: scripts/dev/pr_check.sh + pattern: "skill_lint.sh" + - id: pr_check_script_sanity_gate + file: scripts/dev/pr_check.sh + pattern: "script_sanity_check.sh" + - id: pr_check_risk_gate + file: scripts/dev/pr_check.sh + pattern: "risk_policy_check.sh" + - id: pr_check_risk_label_gate + file: scripts/dev/pr_check.sh + pattern: "risk_label_check.sh" + - id: speckit_bridge_script + file: scripts/dev/speckit_bridge.sh + pattern: "bootstrap --change-id" + - id: risk_policy_script + file: scripts/dev/risk_policy_check.sh + pattern: "risk_tier" + - id: risk_label_script + file: scripts/dev/risk_label_check.sh + pattern: "risk:" + - id: scorecard_script + file: scripts/dev/autonomy_scorecard.sh + pattern: "autonomy_score" + - id: keps_check_script + file: scripts/dev/keps_check.sh + pattern: "COMPONENT_KEP_INDEX.md" + - id: harness_parity_script + file: scripts/dev/harness_engineering_parity_check.sh + pattern: "HARNESS_ENGINEERING_PARITY.md" + - id: constitution_runtime_script + file: scripts/dev/constitution_runtime_policy_check.sh + pattern: "RUNTIME_ISSUE_TRACKER.md" + - id: scope_check_base_ref + file: scripts/dev/scope_check.sh + pattern: "--base-ref" + - id: codeowners_specs + file: CODEOWNERS + pattern: "/docs/specs/" + - id: codeowners_harness + file: CODEOWNERS + pattern: "/harness/" + - id: manifest_doc + file: docs/agent/HARNESS_MANIFEST.md + pattern: "## Required Fields" + - id: merge_queue_workflow + file: .github/workflows/merge-queue-check.yml + pattern: "merge_group" + - id: scorecard_workflow + file: .github/workflows/autonomy-scorecard.yml + pattern: "Autonomy Scorecard" + - id: pr_check_label_event + file: .github/workflows/pr-check.yml + pattern: "labeled" + - id: speckit_bridge_doc + file: docs/agent/SPECKIT_KEP_BRIDGE.md + pattern: "scripts/dev/speckit_bridge.sh bootstrap" + - id: branch_policy_doc + file: docs/agent/BRANCH_POLICY.md + pattern: "## Merge Queue" + - id: doc_first_template_sections 
+ file: docs/changes/TEMPLATE.md + pattern: "## Harness Coverage Plan" + - id: appframework_parity_doc + file: docs/agent/APPFRAMEWORK_PARITY.md + pattern: "## Required Evidence" + - id: component_kep_index_doc + file: docs/specs/COMPONENT_KEP_INDEX.md + pattern: "# Component to KEP Index" + - id: harness_engineering_parity_doc + file: docs/engineering/HARNESS_ENGINEERING_PARITY.md + pattern: "# Harness Engineering Parity" + - id: engineering_constitution_doc + file: docs/engineering/CONSTITUTION.md + pattern: "# Engineering Constitution" + - id: runtime_issue_tracker_doc + file: docs/testing/RUNTIME_ISSUE_TRACKER.md + pattern: "# Runtime Issue Tracker" diff --git a/docs/changes/2026-03-03-doc-first-commit-discipline-appframework-parity.md b/docs/changes/2026-03-03-doc-first-commit-discipline-appframework-parity.md new file mode 100644 index 000000000..24bc86990 --- /dev/null +++ b/docs/changes/2026-03-03-doc-first-commit-discipline-appframework-parity.md @@ -0,0 +1,44 @@ +# Change Intent: Doc-First, Commit Discipline, and AppFramework Parity Gates + +## Intent +- Add three governance gates to make agent-driven development more reviewable and deterministic: + - doc-first change intent enforcement + - incremental commit discipline enforcement + - appframework parity evidence enforcement + +## Scope +- In scope: + - new scripts under `scripts/dev/` for the three governance gates + - `pr_check.sh` integration + - harness governance docs/evals/manifests updates + - PR templates and make targets updates +- Out of scope: + - changing operator runtime behavior + - changing existing feature logic or CRD semantics + +## Constitution Impact +- Risk tier expectation: `medium` (governance/pipeline behavior update). +- This strengthens governance and review evidence requirements for non-trivial changes. 
+ +## Harness Coverage Plan +- `scripts/dev/spec_check.sh --base-ref develop` +- `scripts/dev/harness_manifest_check.sh --base-ref develop` +- `scripts/dev/doc_first_check.sh --base-ref develop` +- `scripts/dev/commit_discipline_check.sh --base-ref develop` +- `scripts/dev/appframework_parity_check.sh --base-ref develop` +- `scripts/dev/harness_eval.sh --suite docs/agent/evals/policy-regression.yaml` +- `scripts/dev/pr_check.sh --fast` + +## Test Plan +- Shell syntax checks for new scripts via `scripts/dev/script_sanity_check.sh`. +- Full governance fast-pass via `PR_CHECK_FLAGS=--fast scripts/dev/pr_check.sh`. +- Validate policy regression suite includes new gate references. + +## Runtime Issue Tracker Review +- Added governance bootstrap entries in `docs/testing/RUNTIME_ISSUE_TRACKER.md` to track rollout risks. + +## Implementation Log +- 2026-03-03: Added new governance check scripts and integrated them into `pr_check.sh`. +- 2026-03-03: Added `docs/changes/` doc-first workflow artifacts and a helper creator script. +- 2026-03-03: Updated harness docs, eval suite, and manifest required commands. +- 2026-03-04: Hardened EKS smoke bootstrap scripts with OIDC quota preflight, EBS CSI readiness waits, and improved operator rollout diagnostics. diff --git a/docs/changes/README.md b/docs/changes/README.md new file mode 100644 index 000000000..2b9d8959a --- /dev/null +++ b/docs/changes/README.md @@ -0,0 +1,16 @@ +# Change Intent Docs + +`docs/changes/` captures doc-first intent for implementation changes. + +Use `scripts/dev/start_change.sh "<topic>"` to create a new file. 
+ +## Required Sections + +Each change-intent file must contain: +- `## Intent` +- `## Scope` +- `## Constitution Impact` +- `## Harness Coverage Plan` +- `## Test Plan` +- `## Runtime Issue Tracker Review` +- `## Implementation Log` diff --git a/docs/changes/TEMPLATE.md b/docs/changes/TEMPLATE.md new file mode 100644 index 000000000..a9c733f49 --- /dev/null +++ b/docs/changes/TEMPLATE.md @@ -0,0 +1,30 @@ +# Change Intent: <short-title> + +## Intent +- What problem this change solves. + +## Scope +- In-scope paths and behavior. +- Out-of-scope items. + +## Constitution Impact +- Risk tier expectation (`low|medium|high`). +- Any policy/contract updates needed. + +## Harness Coverage Plan +- `scripts/dev/spec_check.sh` +- `scripts/dev/harness_manifest_check.sh` +- `scripts/dev/doc_first_check.sh` +- `scripts/dev/commit_discipline_check.sh` +- `scripts/dev/appframework_parity_check.sh` +- `scripts/dev/pr_check.sh --fast` + +## Test Plan +- Unit/integration/e2e commands that validate behavior. + +## Runtime Issue Tracker Review +- `docs/testing/RUNTIME_ISSUE_TRACKER.md` updated, or +- Explicit note that no new runtime issues were observed. + +## Implementation Log +- Timestamped notes for key implementation decisions. diff --git a/docs/engineering/CONSTITUTION.md b/docs/engineering/CONSTITUTION.md new file mode 100644 index 000000000..f24ae3ead --- /dev/null +++ b/docs/engineering/CONSTITUTION.md @@ -0,0 +1,46 @@ +# Engineering Constitution + +This is the always-on engineering contract for Splunk Operator for Kubernetes (SOK). + +If a change conflicts with this constitution, either: +1. Update this document with a technical justification, or +2. Redesign the change to comply. + +## Product Goal + +Deliver a Kubernetes operator that keeps Splunk deployments safe, predictable, +and auditable across install, upgrade, and steady-state operations. + +## Security Baselines +- Default to least privilege in RBAC and pod/container security context. 
+- Never log secrets, credentials, session tokens, or private keys. +- Validate and constrain user-provided config before using it in reconcile actions. +- Prefer immutable image references for test and production deployments. + +## Reconcile and State Management +- Reconcile paths must be idempotent. +- Status transitions must be explicit and explain waiting/failure conditions. +- Transient dependency failures should use retry/backoff, not terminal phase flips. +- Controller changes must include regression tests for phase/condition behavior. + +## CRD and Compatibility Rules +- CRD/API changes require an approved KEP in `docs/specs/`. +- CRD schema and generated manifests must be regenerated with project tooling. +- Backward compatibility expectations and upgrade behavior must be documented. + +## Test and Harness Requirements +- Non-trivial changes require harness manifest + policy checks. +- Required baseline checks are executed through `scripts/dev/pr_check.sh`. +- Integration and KUTTL coverage must be updated when runtime behavior changes. +- App framework changes must satisfy parity gate requirements. + +## Documentation Requirements +- Implementation-heavy changes must start with `docs/changes/*.md`. +- User-facing behavior changes require docs updates under `docs/`. +- Governance/process changes require updates under `docs/agent/` or `docs/engineering/`. + +## Runtime Issue Governance +- Newly observed runtime/test issues must be recorded in + `docs/testing/RUNTIME_ISSUE_TRACKER.md` before marking work complete. +- Issue status lifecycle is mandatory: `Open` -> `In Progress` -> `Mitigated` -> `Closed`. +- Closing an issue must include evidence (PR, commit, or validated test run). 
diff --git a/docs/engineering/HARNESS_ENGINEERING_PARITY.md b/docs/engineering/HARNESS_ENGINEERING_PARITY.md new file mode 100644 index 000000000..e8ca8625d --- /dev/null +++ b/docs/engineering/HARNESS_ENGINEERING_PARITY.md @@ -0,0 +1,33 @@ +# Harness Engineering Parity + +Source reference: https://openai.com/index/harness-engineering/ + +## Product Profile +- Product: Splunk Operator for Kubernetes (SOK) +- Development mode: KEP-first + harness-governed execution +- Primary objective: keep agent-generated changes auditable, scoped, and test-backed + +## Backend Parity Matrix +| ID | Capability | SOK Implementation | Status | Evidence | +|---|---|---|---|---| +| H01 | Spec-first workflow | Non-trivial changes require KEP in `docs/specs/` | Implemented | `docs/specs/README.md`, `scripts/dev/spec_check.sh` | +| H02 | Machine-readable execution policy | Harness manifests define scope/risk/commands | Implemented | `harness/manifests/*.yaml`, `scripts/dev/harness_manifest_check.sh` | +| H03 | Risk-tier governance | Risk tier enforces approvals/merge queue/command depth | Implemented | `scripts/dev/risk_policy_check.sh`, `docs/agent/BRANCH_POLICY.md` | +| H04 | PR label and policy consistency | PR labels must match manifest risk tier | Implemented | `scripts/dev/risk_label_check.sh`, `.github/workflows/pr-check.yml` | +| H05 | Doc-first implementation intent | Code-heavy changes require `docs/changes/*.md` with mandatory sections | Implemented | `scripts/dev/doc_first_check.sh`, `docs/changes/TEMPLATE.md` | +| H06 | Reviewable commit slicing | Implementation-heavy diffs require incremental commits or explicit override | Implemented | `scripts/dev/commit_discipline_check.sh`, `docs/agent/COMMIT_DISCIPLINE.md` | +| H07 | App framework parity guard | App framework code changes require parity docs + tests | Implemented | `scripts/dev/appframework_parity_check.sh`, `docs/agent/APPFRAMEWORK_PARITY.md` | +| H08 | KEP/component coverage validation | Impacted components must map 
to referenced approved IDs | Implemented | `scripts/dev/keps_check.sh`, `docs/specs/COMPONENT_KEP_INDEX.md` | +| H09 | Constitution and runtime issue governance | Always-on engineering rules + runtime issue lifecycle tracking | Implemented | `docs/engineering/CONSTITUTION.md`, `docs/testing/RUNTIME_ISSUE_TRACKER.md`, `scripts/dev/constitution_runtime_policy_check.sh` | +| H10 | Replayable policy regression checks | Governance contract validated via file-pattern eval suite | Implemented | `docs/agent/evals/policy-regression.yaml`, `scripts/dev/harness_eval.sh` | +| H11 | Harness run audit artifacts | Harness run emits trace/log/summary artifacts under `.harness/runs/` | Implemented | `scripts/dev/harness_run.sh`, `.harness/runs/` | +| H12 | Human-in-the-loop UI workflows | GitHub review UI for approvals/labels/merge queue remains required | N/A (No UI) | `docs/agent/BRANCH_POLICY.md`, `.github/workflows/merge-queue-check.yml` | + +## No-UI Equivalents (Required) +SOK is backend-focused. UI-only controls from general harness guidance are represented as +policy + CI checks. Rows marked `N/A (No UI)` must still point to auditable evidence. + +## Maintenance Rules +- Update this matrix whenever governance capability status changes. +- Keep every row status in: `Implemented`, `In Progress`, `Planned`, `N/A (No UI)`. +- Keep evidence column non-empty and repository-local where possible. diff --git a/docs/specs/COMPONENT_KEP_INDEX.md b/docs/specs/COMPONENT_KEP_INDEX.md new file mode 100644 index 000000000..90209313f --- /dev/null +++ b/docs/specs/COMPONENT_KEP_INDEX.md @@ -0,0 +1,30 @@ +# Component to KEP Index + +This index maps Splunk Operator components to approved design IDs +(`CSPL-####` or `GH-####`). + +## How To Update +- Add your change ID to every component section impacted by your diff. +- Keep IDs sorted newest-first within each section. +- `scripts/dev/keps_check.sh` validates this mapping for implementation-gated changes. 
+ +## api-crd +- CSPL-0000 (placeholder: replace with approved API/CRD KEP IDs) + +## controller-reconcile +- CSPL-0000 (placeholder: replace with approved reconcile KEP IDs) + +## enterprise-runtime +- CSPL-0000 (placeholder: replace with approved runtime KEP IDs) + +## manifests-release +- CSPL-0000 (placeholder: replace with approved manifest/release KEP IDs) + +## test-harness +- CSPL-0000 (placeholder: replace with approved test/harness KEP IDs) + +## governance-harness +- CSPL-4577 (KEP-first harness governance hardening) + +## app-framework +- CSPL-0000 (placeholder: replace with approved app-framework KEP IDs) diff --git a/docs/specs/CSPL-4577-agentic-harness-governance.md b/docs/specs/CSPL-4577-agentic-harness-governance.md new file mode 100644 index 000000000..15de5cf6a --- /dev/null +++ b/docs/specs/CSPL-4577-agentic-harness-governance.md @@ -0,0 +1,133 @@ +# CSPL-4577 Agentic Harness Governance + +- ID: CSPL-4577 +- Status: Implemented +- Owners: Splunk Operator maintainers +- Reviewers: Splunk Operator maintainers +- Created: 2026-02-27 +- Last Updated: 2026-03-03 (governance hardening pass: KEP mapping, parity matrix, constitution/runtime policy) +- Related Links: Jira CSPL-4577, PR #1738 + +## Summary +Adopt a production-grade, KEP-driven harness workflow so non-trivial Splunk +Operator changes are governed by reviewed design docs, machine-readable +implementation manifests, deterministic checks, and auditable run artifacts. + +## Motivation +The repository had useful agent skills and helper scripts, but governance and +execution contracts were still partly implicit. That allowed implementation to +move ahead without a strongly enforced linkage between approved design intent, +scoped edits, and deterministic harness validation. + +## Goals +- Use KEP-lite docs as the design system of record for non-trivial changes. +- Require machine-readable harness manifests for non-trivial implementation PRs. 
+- Enforce policy, scope, and quality gates in local and CI workflows. +- Keep validation replayable and auditable for maintainers and contributors. + +## Non-Goals +- Replace all existing CI workflows with a new framework. +- Introduce external policy engines or paid governance tooling. +- Require KEP files for typo-only or formatting-only edits. + +## Proposal +1. Convert `docs/specs/` to a KEP-lite format aligned with Kubernetes-style + design sections. +2. Enforce KEP structure and lifecycle checks via `scripts/dev/spec_check.sh`. +3. Add `scripts/dev/harness_manifest_check.sh` to validate machine-readable + implementation manifests and spec linkage. +4. Add replayable policy evaluations via `scripts/dev/harness_eval.sh` and an + evaluation corpus in `docs/agent/evals/`. +5. Add `scripts/dev/harness_run.sh` to produce run artifacts under + `.harness/runs/` with per-step logs and summaries. +6. Add `scripts/dev/speckit_bridge.sh` so Spec Kit outputs can bootstrap KEP + and manifest artifacts in a deterministic way. +7. Add `scripts/dev/risk_policy_check.sh` for risk-tiered governance + (approvals, merge queue, and required validation depth). +8. Add `scripts/dev/risk_label_check.sh` so PR labels must match manifest + `risk_tier`. +9. Add `scripts/dev/autonomy_scorecard.sh` and a CI workflow to publish + autonomy metrics for each PR. +10. Add merge queue CI workflow and branch policy documentation. +11. Wire all checks into `scripts/dev/pr_check.sh` and CI `pr-check`. +12. Add component-to-KEP mapping and parity matrix validation, plus constitution/runtime issue policy checks. + +## API/CRD Impact +- No CRD schema changes. +- Process and governance changes only. + +## Reconcile/State Impact +- No controller reconcile behavior changes. +- No status phase or condition semantics changed. + +## Test Plan +- Unit: not applicable for this governance-only change. +- Integration (Ginkgo): not applicable. +- KUTTL: not applicable. 
+- Governance/harness: + - `scripts/dev/spec_check.sh` + - `scripts/dev/harness_manifest_check.sh` + - `scripts/dev/doc_first_check.sh` + - `scripts/dev/commit_discipline_check.sh` + - `scripts/dev/appframework_parity_check.sh` + - `scripts/dev/keps_check.sh` + - `scripts/dev/harness_engineering_parity_check.sh` + - `scripts/dev/constitution_runtime_policy_check.sh` + - `scripts/dev/risk_policy_check.sh` + - `scripts/dev/risk_label_check.sh --labels risk:medium` + - `scripts/dev/harness_eval.sh --suite docs/agent/evals/policy-regression.yaml` + - `scripts/dev/harness_run.sh --skip-pr-check` + - `scripts/dev/autonomy_scorecard.sh --base-ref develop` + - `scripts/dev/pr_check.sh` + +## Harness Validation +- `scripts/dev/spec_check.sh` +- `scripts/dev/harness_manifest_check.sh` +- `scripts/dev/doc_first_check.sh` +- `scripts/dev/commit_discipline_check.sh` +- `scripts/dev/appframework_parity_check.sh` +- `scripts/dev/keps_check.sh` +- `scripts/dev/harness_engineering_parity_check.sh` +- `scripts/dev/constitution_runtime_policy_check.sh` +- `scripts/dev/risk_policy_check.sh` +- `scripts/dev/risk_label_check.sh` +- `scripts/dev/harness_eval.sh --suite docs/agent/evals/policy-regression.yaml` +- `scripts/dev/harness_run.sh` +- `scripts/dev/autonomy_scorecard.sh` +- `scripts/dev/pr_check.sh` +- CI workflow `PR Check` (`.github/workflows/pr-check.yml`) +- CI workflow `Merge Queue Check` (`.github/workflows/merge-queue-check.yml`) +- CI workflow `Autonomy Scorecard` (`.github/workflows/autonomy-scorecard.yml`) + +## Risks +- Risk: Additional workflow overhead for contributors. + - Mitigation: templates, scripts, and clear docs keep overhead predictable. +- Risk: false positives from path/scope checks. + - Mitigation: explicit allowed/forbidden patterns in manifests. +- Risk: process drift. + - Mitigation: replayable eval suite and required PR gates. + +## Rollout and Rollback +Rollout: +1. Merge KEP-lite docs, scripts, and CI wiring. +2. 
Require harness manifests for non-trivial implementation PRs. +3. Enable branch protection and merge queue with required status checks. +4. Monitor failures and tighten policies incrementally. + +Rollback: +1. Set `SKIP_MANIFEST_CHECK=1` and/or `SKIP_HARNESS_EVAL=1` in emergency CI. +2. Revert harness script integration in `scripts/dev/pr_check.sh` if needed. +3. Retain KEP docs for auditability. + +## Graduation Criteria +- [x] KEP-lite template and lifecycle documented in-repo. +- [x] Harness manifest and scope checks enforced for non-trivial changes. +- [x] Replayable harness policy evaluation suite added. +- [x] Harness run artifacts are generated with logs and summaries. +- [x] CI `pr-check` includes governance + harness gates. +- [x] Risk-tier policy checks are enforced for changed manifests. +- [x] Spec Kit bridge is available for planning-to-implementation handoff. +- [x] Autonomy scorecard is published for each PR. +- [x] Component-to-KEP mapping validation is enforced. +- [x] Harness engineering parity matrix validation is enforced. +- [x] Constitution and runtime issue governance validation is enforced. diff --git a/docs/specs/README.md b/docs/specs/README.md new file mode 100644 index 000000000..68660717c --- /dev/null +++ b/docs/specs/README.md @@ -0,0 +1,70 @@ +# KEP-Lite Workflow + +`docs/specs/` is the system of record for KEP-lite documents that govern +non-trivial behavior changes in Splunk Operator. + +## Why KEP-lite +We use a Kubernetes KEP-inspired format so design intent, rollout safety, and +validation are reviewable before implementation. + +Spec Kit planning docs can live under `speckit/specs/`, but the KEP in +`docs/specs/` is the governance source of truth used by harness checks. 
+ +## When a KEP Is Required +A KEP is required for non-trivial changes, including: +- CRD/API changes +- reconciliation/state-machine changes +- integration/harness workflow changes +- release/compatibility behavior changes + +KEP is usually not required for: +- typo fixes +- comment-only edits +- formatting-only edits +- dependency patch bumps with no behavior change + +## Lifecycle +Use one status value in each KEP: +- `Draft` +- `In Review` +- `Approved` +- `Implemented` +- `Superseded` + +## Naming +Use a stable file name: +- `CSPL-<ticket>-<short-kebab-title>.md` +- `GH-<issue>-<short-kebab-title>.md` + +## Component Mapping +- `COMPONENT_KEP_INDEX.md` maps implementation components to approved change IDs. +- Keep component mappings current as specs are approved and implemented. +- `scripts/dev/keps_check.sh` enforces this mapping for implementation-gated changes. + +## Review and Merge Policy +- Non-trivial code PRs must include a machine-readable harness manifest in + `harness/manifests/`. +- Each harness manifest must reference a KEP file in this directory with status + `Approved` or `Implemented`. +- Each harness manifest should declare `risk_tier` and review/merge policy. +- Risk-tier policy is validated by `scripts/dev/risk_policy_check.sh`. +- PR risk label must match manifest `risk_tier` (`risk:low|risk:medium|risk:high`). +- Keep KEP status and acceptance/graduation criteria current as work lands. + +## Required KEP Sections +Each KEP must include: +- `Status:` +- `## Summary` +- `## Motivation` +- `## Goals` +- `## Non-Goals` +- `## Proposal` +- `## API/CRD Impact` +- `## Reconcile/State Impact` +- `## Test Plan` +- `## Harness Validation` +- `## Risks` +- `## Rollout and Rollback` +- `## Graduation Criteria` + +Use [SPEC_TEMPLATE.md](SPEC_TEMPLATE.md) as the baseline. 
diff --git a/docs/specs/SPEC_TEMPLATE.md b/docs/specs/SPEC_TEMPLATE.md new file mode 100644 index 000000000..e9655e8ae --- /dev/null +++ b/docs/specs/SPEC_TEMPLATE.md @@ -0,0 +1,72 @@ +# <Short Title> + +- ID: <CSPL-XXXX or GH-issue> +- Status: Draft +- Owners: <name(s)> +- Reviewers: <name(s)> +- Created: YYYY-MM-DD +- Last Updated: YYYY-MM-DD +- Related Links: <issue/PR/docs links> + +## Summary +One paragraph describing what is changing and why. + +## Motivation +Describe user/operator pain and current limitations. + +## Goals +- Goal 1 +- Goal 2 + +## Non-Goals +- Out of scope 1 +- Out of scope 2 + +## Proposal +Describe the technical design and decision rationale. + +## API/CRD Impact +- Affected CRDs/kinds: +- Schema/marker/defaulting changes: +- Compatibility notes: + +## Reconcile/State Impact +- Reconciler flows touched: +- Status phase/conditions behavior: +- Idempotency and requeue behavior: + +## Test Plan +- Unit: +- Integration (Ginkgo): +- KUTTL: +- Upgrade/regression: + +## Harness Validation +List objective checks that must pass: +- `scripts/dev/spec_check.sh` +- `scripts/dev/harness_manifest_check.sh` +- `scripts/dev/doc_first_check.sh` +- `scripts/dev/commit_discipline_check.sh` +- `scripts/dev/appframework_parity_check.sh` +- `scripts/dev/keps_check.sh` +- `scripts/dev/harness_engineering_parity_check.sh` +- `scripts/dev/constitution_runtime_policy_check.sh` +- `scripts/dev/risk_policy_check.sh` +- `scripts/dev/risk_label_check.sh --labels risk:<tier>` +- `scripts/dev/harness_eval.sh --suite docs/agent/evals/policy-regression.yaml` +- `scripts/dev/harness_run.sh --fast` +- `scripts/dev/pr_check.sh` +- `scripts/dev/autonomy_scorecard.sh --base-ref <target-branch>` +- Additional checks specific to this spec + +## Risks +List behavioral, operational, and compatibility risks plus mitigations. + +## Rollout and Rollback +Define rollout order, observability, and rollback steps. 
+ +## Graduation Criteria +- [ ] Design reviewed and status moved to `Approved` +- [ ] Implementation merged and status moved to `Implemented` +- [ ] Required harness checks pass in CI +- [ ] Docs and examples updated (if applicable) diff --git a/docs/testing/RUNTIME_ISSUE_TRACKER.md b/docs/testing/RUNTIME_ISSUE_TRACKER.md new file mode 100644 index 000000000..e41333c42 --- /dev/null +++ b/docs/testing/RUNTIME_ISSUE_TRACKER.md @@ -0,0 +1,25 @@ +# Runtime Issue Tracker + +Last updated: `2026-03-04` (UTC) + +This file tracks runtime and integration issues discovered during operator +validation (unit/integration/KUTTL/kind/cluster runs). + +## How To Use +- Add one row per new issue. +- Keep statuses in lifecycle order: `Open` -> `In Progress` -> `Mitigated` -> `Closed`. +- Keep evidence concrete: timestamp, namespace/resource, and log/event signature. +- Do not delete historical rows; close them with validation evidence. + +## Operator Runtime Issues + +| ID | Status | Issue | Evidence | Impact | Next Action | +|---|---|---|---|---|---| +| `OP-4577-001` | Open | Governance hardening rollout pending validation on mixed CI providers. | 2026-03-03 initial capture during CSPL-4577 rollout. | Temporary uncertainty across GitHub and downstream pipelines. | Track failures to closure while enabling new governance gates incrementally. | +| `OP-4577-002` | In Progress | EKS smoke setup can fail when IAM OIDC provider quota is exhausted, causing downstream EBS/PVC/operator readiness failures. | 2026-03-04 GitHub run `22648768822`: `LimitExceeded: Cannot exceed quota for OpenIdConnectProvidersPerAccount: 100`, then `FailedScheduling ... VolumeBinding: context deadline exceeded` and operator readiness timeout. | Long-running smoke jobs fail late with infra symptoms unrelated to feature behavior. | Add OIDC quota preflight and fail-fast checks in EKS bootstrap; wait for EBS CSI readiness before running tests. 
| + +## Test Harness Issues + +| ID | Status | Issue | Evidence | Impact | Next Action | +|---|---|---|---|---|---| +| `TH-4577-001` | Open | Baseline runtime issue tracking introduced; historical issue migration pending. | 2026-03-03 governance bootstrap. | Existing open issues may not yet be represented uniformly. | Backfill high-priority open issues from active test runs. | diff --git a/harness/README.md b/harness/README.md new file mode 100644 index 000000000..05b34b3b1 --- /dev/null +++ b/harness/README.md @@ -0,0 +1,9 @@ +# Harness Artifacts + +This directory contains machine-readable manifests used by governance checks. + +- `manifests/`: per-change harness manifests consumed by + `scripts/dev/harness_manifest_check.sh`. + +Runtime artifacts are written to `.harness/runs/` and are intentionally ignored +in git. diff --git a/harness/manifests/CSPL-4577-governance-hardening.yaml b/harness/manifests/CSPL-4577-governance-hardening.yaml new file mode 100644 index 000000000..88e34934f --- /dev/null +++ b/harness/manifests/CSPL-4577-governance-hardening.yaml @@ -0,0 +1,58 @@ +version: v1 +change_id: CSPL-4577 +title: KEP-first harness governance hardening +spec_file: docs/specs/CSPL-4577-agentic-harness-governance.md +owner: "@splunk/splunk-operator-for-kubernetes" +delivery_mode: agent +risk_tier: medium +human_approvals_required: 1 +auto_merge_allowed: false +merge_queue_required: true +evaluation_suite: docs/agent/evals/policy-regression.yaml +allowed_paths: + - .devcontainer/** + - .agents/** + - .github/** + - .harness/** + - AGENTS.md + - CODEOWNERS + - CODE_OF_CONDUCT.md + - GOVERNANCE.md + - MAINTAINERS.md + - Makefile + - README.md + - SECURITY.md + - SUPPORT.md + - api/** + - docs/** + - harness/** + - internal/** + - pkg/** + - scripts/** + - skaffold.yaml + - config/skaffold/** + - speckit/** + - test/** + - templates/** + - scripts/dev/** + - .gitignore +forbidden_paths: + - vendor/** + - bin/** + - .git/** +required_commands: + - 
scripts/dev/spec_check.sh + - scripts/dev/harness_manifest_check.sh + - scripts/dev/doc_first_check.sh + - scripts/dev/commit_discipline_check.sh + - scripts/dev/appframework_parity_check.sh + - scripts/dev/keps_check.sh + - scripts/dev/harness_engineering_parity_check.sh + - scripts/dev/constitution_runtime_policy_check.sh + - scripts/dev/risk_policy_check.sh + - scripts/dev/risk_label_check.sh --labels risk:medium + - scripts/dev/harness_eval.sh --suite docs/agent/evals/policy-regression.yaml + - scripts/dev/skill_lint.sh + - scripts/dev/script_sanity_check.sh + - scripts/dev/harness_run.sh --fast + - scripts/dev/pr_check.sh --fast diff --git a/harness/manifests/EXAMPLE.yaml b/harness/manifests/EXAMPLE.yaml new file mode 100644 index 000000000..d5c371e31 --- /dev/null +++ b/harness/manifests/EXAMPLE.yaml @@ -0,0 +1,41 @@ +version: v1 +change_id: CSPL-0000 +title: Example change title +spec_file: docs/specs/CSPL-0000-example.md +owner: "@splunk/splunk-operator-for-kubernetes" +delivery_mode: agent +risk_tier: medium +human_approvals_required: 1 +auto_merge_allowed: false +merge_queue_required: true +evaluation_suite: docs/agent/evals/policy-regression.yaml +allowed_paths: + - api/** + - internal/** + - pkg/** + - test/** + - docs/** + - harness/manifests/** + - scripts/dev/** + - speckit/** + - .harness/** +forbidden_paths: + - vendor/** + - bin/** + - .git/** +required_commands: + - scripts/dev/spec_check.sh + - scripts/dev/harness_manifest_check.sh + - scripts/dev/doc_first_check.sh + - scripts/dev/commit_discipline_check.sh + - scripts/dev/appframework_parity_check.sh + - scripts/dev/keps_check.sh + - scripts/dev/harness_engineering_parity_check.sh + - scripts/dev/constitution_runtime_policy_check.sh + - scripts/dev/risk_policy_check.sh + - scripts/dev/risk_label_check.sh --labels risk:<tier> + - scripts/dev/harness_eval.sh --suite docs/agent/evals/policy-regression.yaml + - scripts/dev/skill_lint.sh + - scripts/dev/script_sanity_check.sh + - 
scripts/dev/harness_run.sh --fast + - scripts/dev/pr_check.sh --fast diff --git a/internal/controller/AGENTS.md b/internal/controller/AGENTS.md new file mode 100644 index 000000000..ef0db51d2 --- /dev/null +++ b/internal/controller/AGENTS.md @@ -0,0 +1,23 @@ +# internal/controller/ — Reconcilers + +## What Lives Here +- Controller setup and reconciliation logic for CRs +- Watches, predicates, and event handling + +## Invariants +- Reconcile must be idempotent and safe to retry. +- Status updates must reflect actual observed state. +- Avoid tight loops; use requeues sparingly and intentionally. + +## Common Pitfalls +- Updating status without checking resource version or observed generation. +- Creating resources without proper ownership or labels. +- Missing RBAC updates when new resources are added. + +## Commands +- Unit tests (envtest + ginkgo): `make test` +- Repo verify: `make verify-repo` + +## Notes +Controller behavior is tightly coupled to `pkg/splunk/enterprise/` helpers. +When updating reconciliation, update or add tests in `test/`. diff --git a/pkg/splunk/enterprise/AGENTS.md b/pkg/splunk/enterprise/AGENTS.md new file mode 100644 index 000000000..36f5ae562 --- /dev/null +++ b/pkg/splunk/enterprise/AGENTS.md @@ -0,0 +1,24 @@ +# pkg/splunk/enterprise/ — Core Operator Logic + +## What Lives Here +- Enterprise CR orchestration and state transitions +- App framework workflows +- Stateful resource creation helpers + +## Invariants +- State transitions must be monotonic and recoverable. +- Helpers should be idempotent and tolerate partial resources. +- Respect spec defaults and validate inputs before use. + +## Common Pitfalls +- Assuming resources exist without checking. +- Updating status too early (before resources are ready). +- Cross-CR dependencies without clear ordering. 
+ +## Commands +- Unit tests: `make test` +- Repo verify: `make verify-repo` + +## Notes +When touching app framework paths, add/adjust tests in `test/` and +consider any KUTTL coverage under `kuttl/`. diff --git a/scripts/debug_reconcile.sh b/scripts/debug_reconcile.sh new file mode 100755 index 000000000..849eaf72e --- /dev/null +++ b/scripts/debug_reconcile.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + echo "Usage: $(basename "$0") <kind> <name> <namespace>" >&2 + echo "Environment overrides:" >&2 + echo " OPERATOR_NAMESPACE (default: splunk-operator)" >&2 + echo " LOG_SINCE (default: 30m)" >&2 + echo " OUTPUT_DIR (default: ./.agent-output/reconcile-<kind>-<name>-<timestamp>)" >&2 +} + +if [[ $# -ne 3 ]]; then + usage + exit 1 +fi + +kind="$1" +name="$2" +namespace="$3" + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" + +if ! 
command -v kubectl >/dev/null 2>&1; then + echo "kubectl is required for debug_reconcile.sh" >&2 + exit 1 +fi + +operator_namespace="${OPERATOR_NAMESPACE:-splunk-operator}" +log_since="${LOG_SINCE:-30m}" + +if [[ -n "${OUTPUT_DIR:-}" ]]; then + out_dir="${OUTPUT_DIR}" +else + ts="$(date +%Y%m%d-%H%M%S)" + out_dir="${repo_root}/.agent-output/reconcile-${kind}-${name}-${ts}" +fi + +mkdir -p "${out_dir}" + +printf "Collecting reconcile debug data into %s\n" "${out_dir}" + +kubectl get "${kind}" "${name}" -n "${namespace}" -o yaml > "${out_dir}/cr.yaml" +kubectl describe "${kind}" "${name}" -n "${namespace}" > "${out_dir}/cr.describe.txt" + +kubectl get events -n "${namespace}" --sort-by=.lastTimestamp > "${out_dir}/events.txt" + +kubectl get pods -n "${namespace}" -o wide > "${out_dir}/pods.txt" +kubectl get svc -n "${namespace}" -o wide > "${out_dir}/services.txt" + +kubectl logs -n "${operator_namespace}" deploy/splunk-operator-controller-manager -c manager --since="${log_since}" > "${out_dir}/operator.logs.txt" + +printf "Done.\n" diff --git a/scripts/dev/appframework_parity_check.sh b/scripts/dev/appframework_parity_check.sh new file mode 100755 index 000000000..499af594d --- /dev/null +++ b/scripts/dev/appframework_parity_check.sh @@ -0,0 +1,194 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/dev/appframework_parity_check.sh [--base-ref <branch>] [--help] + +Enforces AppFramework parity governance when AppFramework code/API paths change. +- Validates required governance artifacts and review-template references. +- If appframework-gated files changed, requires parity doc updates and test updates. + +Options: + --base-ref <branch> Compare HEAD against origin/<branch> (CI mode). + -h, --help Show this help. 
+USAGE +} + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi +cd "${repo_root}" + +base_ref="" +while [[ $# -gt 0 ]]; do + case "$1" in + --base-ref) + if [[ $# -lt 2 ]]; then + echo "--base-ref requires a value." >&2 + exit 1 + fi + base_ref="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +collect_changed_files_local() { + local files=() + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git diff --name-only --diff-filter=ACMRT) + + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git diff --name-only --cached --diff-filter=ACMRT) + + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git ls-files --others --exclude-standard) + + printf '%s\n' "${files[@]}" +} + +collect_changed_files_ci() { + local ref="$1" + local remote_ref="refs/remotes/origin/${ref}" + + if ! git show-ref --verify --quiet "${remote_ref}"; then + git fetch --no-tags --depth=200 origin "${ref}:${remote_ref}" >/dev/null 2>&1 || true + fi + + if ! git show-ref --verify --quiet "${remote_ref}"; then + echo "Unable to resolve origin/${ref} for appframework parity check." >&2 + exit 1 + fi + + local merge_base + merge_base="$(git merge-base HEAD "${remote_ref}" || true)" + if [[ -z "${merge_base}" ]]; then + echo "Unable to compute merge-base against origin/${ref}." >&2 + exit 1 + fi + + { + git diff --name-only --diff-filter=ACMRT "${merge_base}...HEAD" + git diff --name-only --diff-filter=ACMRT + git diff --name-only --cached --diff-filter=ACMRT + git ls-files --others --exclude-standard + } | awk 'NF' | sort -u +} + +need_file() { + local file="$1" + if [[ ! 
-f "${file}" ]]; then + echo "appframework_parity_check: FAIL: missing required artifact ${file}" >&2 + exit 1 + fi +} + +need_pattern() { + local file="$1" + local pattern="$2" + local label="$3" + if command -v rg >/dev/null 2>&1; then + if ! rg -q "${pattern}" "${file}"; then + echo "appframework_parity_check: FAIL: ${label} missing in ${file}" >&2 + exit 1 + fi + else + if ! grep -Eq "${pattern}" "${file}"; then + echo "appframework_parity_check: FAIL: ${label} missing in ${file}" >&2 + exit 1 + fi + fi +} + +need_file "docs/AppFramework.md" +need_file "docs/agent/APPFRAMEWORK_PARITY.md" +need_file ".github/pull_request_template.md" +need_file "templates/pull_request.md" + +need_pattern "docs/agent/APPFRAMEWORK_PARITY.md" '^# App Framework Parity Governance' "title" +need_pattern "docs/agent/APPFRAMEWORK_PARITY.md" '^## Trigger Paths' "trigger paths section" +need_pattern "docs/agent/APPFRAMEWORK_PARITY.md" '^## Required Evidence' "required evidence section" +need_pattern "docs/agent/APPFRAMEWORK_PARITY.md" 'scripts/dev/appframework_parity_check.sh' "gate command reference" +need_pattern ".github/pull_request_template.md" 'scripts/dev/appframework_parity_check.sh' "PR template gate reference" +need_pattern "templates/pull_request.md" 'scripts/dev/appframework_parity_check.sh' "local PR template gate reference" + +declare -A seen=() +changed_files=() + +if [[ -n "${base_ref}" ]]; then + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_ci "${base_ref}") +else + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_local) +fi + +if [[ ${#changed_files[@]} -eq 0 ]]; then + echo "appframework_parity_check: no changed files detected." 
+ exit 0 +fi + +needs_parity=0 +for file in "${changed_files[@]}"; do + case "${file}" in + docs/AppFramework.md|pkg/splunk/enterprise/afwscheduler.go|pkg/splunk/enterprise/afwscheduler_test.go|test/testenv/appframework_utils.go|test/appframework_*|config/samples/*appframework*) + needs_parity=1 + ;; + esac +done + +if [[ "${needs_parity}" -eq 0 ]]; then + echo "appframework_parity_check: passed (no appframework-gated paths changed)." + exit 0 +fi + +if ! printf '%s\n' "${changed_files[@]}" | grep -Eq '^(docs/AppFramework\.md|docs/agent/APPFRAMEWORK_PARITY\.md)$'; then + cat >&2 <<'ERROR' +appframework_parity_check: FAIL: appframework-gated changes detected without parity-doc update. + +Required update in this diff: +- docs/AppFramework.md or +- docs/agent/APPFRAMEWORK_PARITY.md +ERROR + exit 1 +fi + +if ! printf '%s\n' "${changed_files[@]}" | grep -Eq '^(test/appframework_.*|test/testenv/appframework_utils\.go|pkg/splunk/enterprise/afwscheduler_test\.go)$'; then + cat >&2 <<'ERROR' +appframework_parity_check: FAIL: appframework-gated changes detected without appframework test updates. + +Add at least one relevant test update: +- test/appframework_*/... +- test/testenv/appframework_utils.go +- pkg/splunk/enterprise/afwscheduler_test.go +ERROR + exit 1 +fi + +echo "appframework_parity_check: passed." diff --git a/scripts/dev/autonomy_scorecard.sh b/scripts/dev/autonomy_scorecard.sh new file mode 100755 index 000000000..135d08da2 --- /dev/null +++ b/scripts/dev/autonomy_scorecard.sh @@ -0,0 +1,309 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/dev/autonomy_scorecard.sh [--base-ref <branch>] [--suite <path>] [--output <json>] [--markdown <md>] [--help] + +Generates an autonomy scorecard for the current diff. 
+USAGE
+}
+
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)"
+if [[ -z "${repo_root}" ]]; then
+  echo "Unable to locate repo root. Run from inside the git repo." >&2
+  exit 1
+fi
+cd "${repo_root}"
+
+base_ref=""
+suite="docs/agent/evals/policy-regression.yaml"
+json_out=""
+md_out=""
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --base-ref)
+      base_ref="$2"
+      shift 2
+      ;;
+    --suite)
+      suite="$2"
+      shift 2
+      ;;
+    --output)
+      json_out="$2"
+      shift 2
+      ;;
+    --markdown)
+      md_out="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+collect_changed_files_local() {
+  local files=()
+  while IFS= read -r line; do
+    [[ -n "${line}" ]] && files+=("${line}")
+  done < <(git diff --name-only --diff-filter=ACMRT)
+
+  while IFS= read -r line; do
+    [[ -n "${line}" ]] && files+=("${line}")
+  done < <(git diff --name-only --cached --diff-filter=ACMRT)
+
+  while IFS= read -r line; do
+    [[ -n "${line}" ]] && files+=("${line}")
+  done < <(git ls-files --others --exclude-standard)
+
+  printf '%s\n' ${files[@]+"${files[@]}"}
+}
+
+collect_changed_files_ci() {
+  local ref="$1"
+  local remote_ref="refs/remotes/origin/${ref}"
+
+  if ! git show-ref --verify --quiet "${remote_ref}"; then
+    git fetch --no-tags --depth=200 origin "${ref}:${remote_ref}" >/dev/null 2>&1 || true
+  fi
+
+  if ! git show-ref --verify --quiet "${remote_ref}"; then
+    echo "Unable to resolve origin/${ref} for autonomy scorecard." >&2
+    exit 1
+  fi
+
+  local merge_base
+  merge_base="$(git merge-base HEAD "${remote_ref}" || true)"
+  if [[ -z "${merge_base}" ]]; then
+    echo "Unable to compute merge-base against origin/${ref}."
>&2 + exit 1 + fi + + git diff --name-only --diff-filter=ACMRT "${merge_base}...HEAD" +} + +extract_scalar() { + local key="$1" + local file="$2" + local line + line="$(grep -E "^${key}:[[:space:]]*" "${file}" | head -n1 || true)" + line="${line#*:}" + line="${line# }" + line="${line% }" + line="${line#\"}" + line="${line%\"}" + line="${line#\'}" + line="${line%\'}" + printf '%s\n' "${line}" +} + +is_non_trivial_path() { + case "$1" in + api/*|cmd/*|config/*|internal/*|kuttl/*|pkg/*|scripts/*|test/*|Makefile|go.mod|go.sum|PROJECT) + return 0 + ;; + *) + return 1 + ;; + esac +} + +declare -A seen=() +changed_files=() + +if [[ -n "${base_ref}" ]]; then + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_ci "${base_ref}") +else + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_local) +fi + +total_changed=${#changed_files[@]} +non_trivial_count=0 +manifest_files=() +for file in "${changed_files[@]}"; do + if is_non_trivial_path "${file}"; then + non_trivial_count=$((non_trivial_count + 1)) + fi + case "${file}" in + harness/manifests/*.yaml|harness/manifests/*.yml) + manifest_files+=("${file}") + ;; + esac +done + +manifest_count=${#manifest_files[@]} +agent_mode_count=0 +hybrid_mode_count=0 +human_mode_count=0 +risk_low_count=0 +risk_medium_count=0 +risk_high_count=0 + +for manifest in "${manifest_files[@]}"; do + [[ -f "${manifest}" ]] || continue + mode="$(extract_scalar "delivery_mode" "${manifest}")" + risk="$(extract_scalar "risk_tier" "${manifest}")" + + case "${mode}" in + agent) agent_mode_count=$((agent_mode_count + 1)) ;; + hybrid) hybrid_mode_count=$((hybrid_mode_count + 1)) ;; + human) human_mode_count=$((human_mode_count + 1)) ;; + esac + + case "${risk}" in + low) risk_low_count=$((risk_low_count + 1)) ;; + medium) 
risk_medium_count=$((risk_medium_count + 1)) ;; + high) risk_high_count=$((risk_high_count + 1)) ;; + esac +done + +range="" +if [[ -n "${base_ref}" ]]; then + remote_ref="refs/remotes/origin/${base_ref}" + if git show-ref --verify --quiet "${remote_ref}"; then + merge_base="$(git merge-base HEAD "${remote_ref}" || true)" + if [[ -n "${merge_base}" ]]; then + range="${merge_base}...HEAD" + fi + fi +fi + +if [[ -z "${range}" ]]; then + if git rev-parse --verify HEAD~1 >/dev/null 2>&1; then + range="HEAD~1...HEAD" + else + range="HEAD" + fi +fi + +total_commits="$(git rev-list --count "${range}" 2>/dev/null || echo 0)" +agent_commits="$(git log --format='%an <%ae>' "${range}" 2>/dev/null | grep -Eic '(codex|openai|bot|agent)' || true)" +if [[ -z "${agent_commits}" ]]; then + agent_commits=0 +fi + +agent_commit_ratio=0 +if [[ "${total_commits}" -gt 0 ]]; then + agent_commit_ratio=$((agent_commits * 100 / total_commits)) +fi + +spec_rc=0 +manifest_rc=0 +risk_rc=0 +eval_rc=0 + +spec_args=() +if [[ -n "${base_ref}" ]]; then + spec_args+=(--base-ref "${base_ref}") +fi + +set +e +scripts/dev/spec_check.sh "${spec_args[@]}" >/tmp/scorecard-spec.log 2>&1 +spec_rc=$? +scripts/dev/harness_manifest_check.sh "${spec_args[@]}" >/tmp/scorecard-manifest.log 2>&1 +manifest_rc=$? +scripts/dev/risk_policy_check.sh "${spec_args[@]}" >/tmp/scorecard-risk.log 2>&1 +risk_rc=$? +scripts/dev/harness_eval.sh --suite "${suite}" >/tmp/scorecard-eval.log 2>&1 +eval_rc=$? 
+set -e + +score=0 +[[ ${spec_rc} -eq 0 ]] && score=$((score + 25)) +[[ ${manifest_rc} -eq 0 ]] && score=$((score + 25)) +[[ ${risk_rc} -eq 0 ]] && score=$((score + 20)) +[[ ${eval_rc} -eq 0 ]] && score=$((score + 20)) +if [[ ${agent_commit_ratio} -ge 50 || ${agent_mode_count} -gt 0 ]]; then + score=$((score + 10)) +fi + +status="needs-work" +if [[ ${score} -ge 90 ]]; then + status="excellent" +elif [[ ${score} -ge 75 ]]; then + status="good" +elif [[ ${score} -ge 60 ]]; then + status="fair" +fi + +json_payload="{\n" +json_payload+=" \"generated_at\": \"$(date -u +"%Y-%m-%dT%H:%M:%SZ")\",\n" +json_payload+=" \"branch\": \"$(git branch --show-current)\",\n" +json_payload+=" \"head_sha\": \"$(git rev-parse HEAD)\",\n" +json_payload+=" \"base_ref\": \"${base_ref}\",\n" +json_payload+=" \"changed_files\": ${total_changed},\n" +json_payload+=" \"non_trivial_files\": ${non_trivial_count},\n" +json_payload+=" \"manifests_changed\": ${manifest_count},\n" +json_payload+=" \"risk_tiers\": {\"low\": ${risk_low_count}, \"medium\": ${risk_medium_count}, \"high\": ${risk_high_count}},\n" +json_payload+=" \"delivery_modes\": {\"agent\": ${agent_mode_count}, \"hybrid\": ${hybrid_mode_count}, \"human\": ${human_mode_count}},\n" +json_payload+=" \"commit_metrics\": {\"total\": ${total_commits}, \"agent_like\": ${agent_commits}, \"agent_like_ratio\": ${agent_commit_ratio}},\n" +json_payload+=" \"gate_results\": {\"spec_check\": ${spec_rc}, \"manifest_check\": ${manifest_rc}, \"risk_policy_check\": ${risk_rc}, \"harness_eval\": ${eval_rc}},\n" +json_payload+=" \"autonomy_score\": ${score},\n" +json_payload+=" \"status\": \"${status}\"\n" +json_payload+="}" + +markdown_payload="## Autonomy Scorecard\n" +markdown_payload+="- Status: **${status}**\n" +markdown_payload+="- Score: **${score}/100**\n" +markdown_payload+="- Branch: $(git branch --show-current)\n" +markdown_payload+="- Head SHA: $(git rev-parse --short HEAD)\n" +if [[ -n "${base_ref}" ]]; then + markdown_payload+="- Base ref: 
${base_ref}\n" +fi +markdown_payload+="\n### Diff Metrics\n" +markdown_payload+="- Changed files: ${total_changed}\n" +markdown_payload+="- Non-trivial changed files: ${non_trivial_count}\n" +markdown_payload+="- Manifests changed: ${manifest_count}\n" +markdown_payload+="\n### Delivery Modes\n" +markdown_payload+="- Agent: ${agent_mode_count}\n" +markdown_payload+="- Hybrid: ${hybrid_mode_count}\n" +markdown_payload+="- Human: ${human_mode_count}\n" +markdown_payload+="\n### Risk Tiers\n" +markdown_payload+="- Low: ${risk_low_count}\n" +markdown_payload+="- Medium: ${risk_medium_count}\n" +markdown_payload+="- High: ${risk_high_count}\n" +markdown_payload+="\n### Commit Signals\n" +markdown_payload+="- Total commits in range: ${total_commits}\n" +markdown_payload+="- Agent-like commit authors: ${agent_commits}\n" +markdown_payload+="- Agent-like commit ratio: ${agent_commit_ratio}%\n" +markdown_payload+="\n### Gate Results\n" +markdown_payload+="- spec_check: ${spec_rc}\n" +markdown_payload+="- harness_manifest_check: ${manifest_rc}\n" +markdown_payload+="- risk_policy_check: ${risk_rc}\n" +markdown_payload+="- harness_eval: ${eval_rc}\n" + +if [[ -n "${json_out}" ]]; then + mkdir -p "$(dirname "${json_out}")" + printf '%b\n' "${json_payload}" > "${json_out}" +else + printf '%b\n' "${json_payload}" +fi + +if [[ -n "${md_out}" ]]; then + mkdir -p "$(dirname "${md_out}")" + printf '%b\n' "${markdown_payload}" > "${md_out}" +else + printf '%b\n' "${markdown_payload}" +fi diff --git a/scripts/dev/commit_discipline_check.sh b/scripts/dev/commit_discipline_check.sh new file mode 100755 index 000000000..e681b9a6d --- /dev/null +++ b/scripts/dev/commit_discipline_check.sh @@ -0,0 +1,154 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/dev/commit_discipline_check.sh [--base-ref <branch>] [--help] + +Enforces incremental commit discipline for implementation-gated paths. 
+- Requires at least 2 non-merge commits for implementation-heavy branches. +- Allows a single commit only with explicit override token: [single-commit-ok]. +- Fails oversized commits by changed file count threshold. + +Options: + --base-ref <branch> Compare commit history against origin/<branch> (CI mode). + -h, --help Show this help. +USAGE +} + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi +cd "${repo_root}" + +base_input="${COMMIT_BASE_REF:-${KEP_BASE_REF:-origin/develop}}" +while [[ $# -gt 0 ]]; do + case "$1" in + --base-ref) + if [[ $# -lt 2 ]]; then + echo "--base-ref requires a value." >&2 + exit 1 + fi + base_input="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +resolve_base_ref() { + local candidate="$1" + + if git rev-parse --verify "${candidate}" >/dev/null 2>&1; then + printf '%s\n' "${candidate}" + return 0 + fi + + local normalized="${candidate#origin/}" + local remote_ref="refs/remotes/origin/${normalized}" + if ! git show-ref --verify --quiet "${remote_ref}"; then + git fetch --no-tags --depth=200 origin "${normalized}:${remote_ref}" >/dev/null 2>&1 || true + fi + if git show-ref --verify --quiet "${remote_ref}"; then + printf '%s\n' "${remote_ref}" + return 0 + fi + + local origin_head_ref + origin_head_ref="$(git symbolic-ref -q refs/remotes/origin/HEAD || true)" + if [[ -n "${origin_head_ref}" ]] && git show-ref --verify --quiet "${origin_head_ref}"; then + printf '%s\n' "${origin_head_ref}" + return 0 + fi + + local fallback + for fallback in refs/remotes/origin/develop refs/remotes/origin/main; do + if git show-ref --verify --quiet "${fallback}"; then + printf '%s\n' "${fallback}" + return 0 + fi + done + + return 1 +} + +if ! 
base_ref="$(resolve_base_ref "${base_input}")"; then
+  echo "commit_discipline_check: unable to resolve base ref (set --base-ref or COMMIT_BASE_REF)."
+  exit 0
+fi
+
+if ! merge_base="$(git merge-base HEAD "${base_ref}")"; then
+  merge_base=""
+fi
+if [[ -z "${merge_base}" ]]; then
+  echo "commit_discipline_check: unable to compute merge-base against ${base_ref}." >&2
+  exit 1
+fi
+
+range_files="$(git diff --name-only "${merge_base}...HEAD" --)"
+if [[ -z "${range_files}" ]]; then
+  echo "commit_discipline_check: no changes detected in ${merge_base}...HEAD"
+  exit 0
+fi
+
+requires_incremental=0
+while IFS= read -r file; do
+  [[ -n "${file}" ]] || continue
+  case "${file}" in
+    api/*|cmd/*|config/*|internal/*|kuttl/*|pkg/*|scripts/*|test/*|Makefile|go.mod|go.sum|PROJECT|skaffold.yaml|.github/workflows/*)
+      requires_incremental=1
+      ;;
+  esac
+done < <(printf '%s\n' "${range_files}")
+
+if [[ "${requires_incremental}" -eq 0 ]]; then
+  echo "commit_discipline_check: no commit-discipline-gated paths changed."
+  exit 0
+fi
+
+commits="$(git rev-list --reverse "${base_ref}"..HEAD)"
+if [[ -z "${commits}" ]]; then
+  echo "commit_discipline_check: no commits in range."
+  exit 0
+fi
+
+non_merge_count=0
+max_files_per_commit="${COMMIT_MAX_FILES_PER_COMMIT:-200}"
+for sha in ${commits}; do
+  subject="$(git log -1 --pretty=%s "${sha}")"
+  if [[ "${subject}" =~ ^Merge[[:space:]] ]]; then
+    continue
+  fi
+
+  non_merge_count=$((non_merge_count + 1))
+  files_changed="$(git show --name-only --pretty=format: "${sha}" | awk 'NF' | wc -l | tr -d '[:space:]')"
+  if [[ "${files_changed}" -gt "${max_files_per_commit}" ]]; then
+    echo "commit_discipline_check: FAIL: ${sha} touches ${files_changed} files (limit ${max_files_per_commit})." >&2
+    exit 1
+  fi
+done
+
+if [[ "${non_merge_count}" -lt 2 ]]; then
+  latest_msg="$(git log -1 --pretty=%B HEAD)"
+  if !
printf '%s\n' "${latest_msg}" | grep -q '\[single-commit-ok\]'; then + cat >&2 <<'ERROR' +commit_discipline_check: FAIL: implementation-gated changes must be split into at least 2 non-merge commits. + +If a single commit is required, add [single-commit-ok] in the latest commit message with justification. +ERROR + exit 1 + fi +fi + +echo "commit_discipline_check: passed." diff --git a/scripts/dev/constitution_runtime_policy_check.sh b/scripts/dev/constitution_runtime_policy_check.sh new file mode 100755 index 000000000..e06005aeb --- /dev/null +++ b/scripts/dev/constitution_runtime_policy_check.sh @@ -0,0 +1,235 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/dev/constitution_runtime_policy_check.sh [--base-ref <branch>] [--help] + +Validates always-on engineering constitution and runtime issue tracker policy. +- Checks required constitution/runtime policy docs and mandatory sections. +- Validates runtime issue tracker status values. +- For implementation-gated changes, requires either runtime tracker update or an + explicit review note in docs/changes/*.md. + +Options: + --base-ref <branch> Compare HEAD against origin/<branch> (CI mode). + -h, --help Show this help. +USAGE +} + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi +cd "${repo_root}" + +base_ref="" +while [[ $# -gt 0 ]]; do + case "$1" in + --base-ref) + if [[ $# -lt 2 ]]; then + echo "--base-ref requires a value." 
>&2 + exit 1 + fi + base_ref="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +collect_changed_files_local() { + local files=() + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git diff --name-only --diff-filter=ACMRT) + + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git diff --name-only --cached --diff-filter=ACMRT) + + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git ls-files --others --exclude-standard) + + printf '%s\n' "${files[@]}" +} + +collect_changed_files_ci() { + local ref="$1" + local remote_ref="refs/remotes/origin/${ref}" + + if ! git show-ref --verify --quiet "${remote_ref}"; then + git fetch --no-tags --depth=200 origin "${ref}:${remote_ref}" >/dev/null 2>&1 || true + fi + + if ! git show-ref --verify --quiet "${remote_ref}"; then + echo "Unable to resolve origin/${ref} for constitution/runtime policy check." >&2 + exit 1 + fi + + local merge_base + merge_base="$(git merge-base HEAD "${remote_ref}" || true)" + if [[ -z "${merge_base}" ]]; then + echo "Unable to compute merge-base against origin/${ref}." >&2 + exit 1 + fi + + { + git diff --name-only --diff-filter=ACMRT "${merge_base}...HEAD" + git diff --name-only --diff-filter=ACMRT + git diff --name-only --cached --diff-filter=ACMRT + git ls-files --others --exclude-standard + } | awk 'NF' | sort -u +} + +has_pattern() { + local pattern="$1" + local file="$2" + if command -v rg >/dev/null 2>&1; then + rg -q "${pattern}" "${file}" + else + grep -Eq "${pattern}" "${file}" + fi +} + +constitution_doc="docs/engineering/CONSTITUTION.md" +runtime_doc="docs/testing/RUNTIME_ISSUE_TRACKER.md" + +for f in "${constitution_doc}" "${runtime_doc}"; do + if [[ ! 
-f "${f}" ]]; then + echo "constitution_runtime_policy_check: FAIL: missing required file ${f}" >&2 + exit 1 + fi +done + +required_constitution_sections=( + '^# Engineering Constitution' + '^## Product Goal' + '^## Security Baselines' + '^## Reconcile and State Management' + '^## CRD and Compatibility Rules' + '^## Test and Harness Requirements' + '^## Runtime Issue Governance' +) +for section in "${required_constitution_sections[@]}"; do + if ! has_pattern "${section}" "${constitution_doc}"; then + echo "constitution_runtime_policy_check: FAIL: missing section ${section} in ${constitution_doc}" >&2 + exit 1 + fi +done + +required_runtime_sections=( + '^# Runtime Issue Tracker' + '^## How To Use' + 'Open` -> `In Progress` -> `Mitigated` -> `Closed' +) +for section in "${required_runtime_sections[@]}"; do + if ! has_pattern "${section}" "${runtime_doc}"; then + echo "constitution_runtime_policy_check: FAIL: missing section/policy '${section}' in ${runtime_doc}" >&2 + exit 1 + fi +done + +# Validate issue status values in markdown tables. 
+status_errors=0 +while IFS= read -r line; do + [[ -n "${line}" ]] || continue + [[ "${line}" =~ ^\| ]] || continue + [[ "${line}" =~ ^\|[[:space:]]*ID[[:space:]]*\| ]] && continue + [[ "${line}" =~ ^\|[-[:space:]]+\| ]] && continue + + status="$(printf '%s\n' "${line}" | awk -F'|' '{s=$3; gsub(/^[ \t`]+|[ \t`]+$/, "", s); print s}')" + if [[ -z "${status}" ]]; then + continue + fi + + case "${status}" in + Open|In\ Progress|Mitigated|Closed) + ;; + *) + echo "constitution_runtime_policy_check: FAIL: invalid runtime issue status '${status}' in line:" >&2 + echo "${line}" >&2 + status_errors=1 + ;; + esac +done < "${runtime_doc}" + +if [[ "${status_errors}" -ne 0 ]]; then + exit 1 +fi + +declare -A seen=() +changed_files=() +if [[ -n "${base_ref}" ]]; then + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_ci "${base_ref}") +else + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_local) +fi + +if [[ ${#changed_files[@]} -eq 0 ]]; then + echo "constitution_runtime_policy_check: passed." 
+ exit 0 +fi + +requires_runtime_review=false +runtime_tracker_touched=false +review_noted=false + +for file in "${changed_files[@]}"; do + case "${file}" in + api/*|cmd/*|config/*|internal/*|kuttl/*|pkg/*|scripts/*|test/*|Makefile|go.mod|go.sum|PROJECT|skaffold.yaml|.github/workflows/*) + requires_runtime_review=true + ;; + esac + + if [[ "${file}" == "docs/testing/RUNTIME_ISSUE_TRACKER.md" ]]; then + runtime_tracker_touched=true + fi + + case "${file}" in + docs/changes/*.md) + base_name="$(basename "${file}")" + if [[ "${base_name}" != "README.md" && "${base_name}" != "TEMPLATE.md" && -f "${file}" ]]; then + if has_pattern '^## Runtime Issue Tracker Review' "${file}"; then + review_noted=true + fi + fi + ;; + esac +done + +if [[ "${requires_runtime_review}" == "true" && "${runtime_tracker_touched}" != "true" && "${review_noted}" != "true" ]]; then + cat >&2 <<'ERROR' +constitution_runtime_policy_check: FAIL: implementation-gated changes require runtime issue governance evidence. + +Provide one of: +- update docs/testing/RUNTIME_ISSUE_TRACKER.md, or +- add `## Runtime Issue Tracker Review` section in docs/changes/<date>-<topic>.md. +ERROR + exit 1 +fi + +echo "constitution_runtime_policy_check: passed." diff --git a/scripts/dev/doc_first_check.sh b/scripts/dev/doc_first_check.sh new file mode 100755 index 000000000..257fcc943 --- /dev/null +++ b/scripts/dev/doc_first_check.sh @@ -0,0 +1,232 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/dev/doc_first_check.sh [--base-ref <branch>] [--help] + +Enforces doc-first governance for implementation-gated changes. +- If code/manifests/tests/scripts changed, require a change-intent doc under docs/changes/. +- Validate required sections in changed docs/changes/*.md files. +- Require at least one harness/test artifact in the overall change set. + +Options: + --base-ref <branch> Compare HEAD against origin/<branch> (CI mode). + -h, --help Show this help. 
+USAGE +} + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi +cd "${repo_root}" + +base_ref="" +while [[ $# -gt 0 ]]; do + case "$1" in + --base-ref) + if [[ $# -lt 2 ]]; then + echo "--base-ref requires a value." >&2 + exit 1 + fi + base_ref="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +collect_changed_files_local() { + local files=() + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git diff --name-only --diff-filter=ACMRT) + + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git diff --name-only --cached --diff-filter=ACMRT) + + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git ls-files --others --exclude-standard) + + printf '%s\n' "${files[@]}" +} + +collect_changed_files_ci() { + local ref="$1" + local remote_ref="refs/remotes/origin/${ref}" + + if ! git show-ref --verify --quiet "${remote_ref}"; then + git fetch --no-tags --depth=200 origin "${ref}:${remote_ref}" >/dev/null 2>&1 || true + fi + + if ! git show-ref --verify --quiet "${remote_ref}"; then + echo "Unable to resolve origin/${ref} for doc-first check." >&2 + exit 1 + fi + + local merge_base + merge_base="$(git merge-base HEAD "${remote_ref}" || true)" + if [[ -z "${merge_base}" ]]; then + echo "Unable to compute merge-base against origin/${ref}." 
>&2 + exit 1 + fi + + { + git diff --name-only --diff-filter=ACMRT "${merge_base}...HEAD" + git diff --name-only --diff-filter=ACMRT + git diff --name-only --cached --diff-filter=ACMRT + git ls-files --others --exclude-standard + } | awk 'NF' | sort -u +} + +has_rg=false +if command -v rg >/dev/null 2>&1; then + has_rg=true +fi + +has_section() { + local section="$1" + local file="$2" + if [[ "${has_rg}" == "true" ]]; then + rg -Fq "${section}" "${file}" + else + grep -Fq "${section}" "${file}" + fi +} + +is_doc_first_gated_path() { + case "$1" in + api/*|cmd/*|config/*|internal/*|kuttl/*|pkg/*|scripts/*|test/*|Makefile|go.mod|go.sum|PROJECT|skaffold.yaml|.github/workflows/*) + return 0 + ;; + *) + return 1 + ;; + esac +} + +declare -A seen=() +changed_files=() + +if [[ -n "${base_ref}" ]]; then + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_ci "${base_ref}") +else + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_local) +fi + +if [[ ${#changed_files[@]} -eq 0 ]]; then + echo "doc_first_check: no changed files detected." + exit 0 +fi + +requires_doc_first=false +for file in "${changed_files[@]}"; do + if is_doc_first_gated_path "${file}"; then + requires_doc_first=true + break + fi +done + +if [[ "${requires_doc_first}" != "true" ]]; then + echo "doc_first_check: no doc-first-gated paths changed." + exit 0 +fi + +change_docs=() +for file in "${changed_files[@]}"; do + case "${file}" in + docs/changes/*.md) + base_name="$(basename "${file}")" + if [[ "${base_name}" != "README.md" && "${base_name}" != "TEMPLATE.md" ]]; then + change_docs+=("${file}") + fi + ;; + esac +done + +if [[ ${#change_docs[@]} -eq 0 ]]; then + cat >&2 <<'ERROR' +doc_first_check: FAIL: doc-first-gated paths changed without a change-intent document. 
+ +Add: docs/changes/<YYYY-MM-DD>-<topic>.md +Helper: scripts/dev/start_change.sh "<topic>" +ERROR + exit 1 +fi + +required_sections=( + "## Intent" + "## Scope" + "## Constitution Impact" + "## Harness Coverage Plan" + "## Test Plan" + "## Runtime Issue Tracker Review" + "## Implementation Log" +) + +errors=() +for doc in "${change_docs[@]}"; do + if [[ ! -f "${doc}" ]]; then + errors+=("${doc}: file missing") + continue + fi + + for section in "${required_sections[@]}"; do + if ! has_section "${section}" "${doc}"; then + errors+=("${doc}: missing section '${section}'") + fi + done + + if [[ "${has_rg}" == "true" ]]; then + if ! rg -q 'scripts/dev/|make |go test|ginkgo|kubectl kuttl|test/' "${doc}"; then + errors+=("${doc}: Harness Coverage Plan/Test Plan appears empty (add command evidence)") + fi + else + if ! grep -Eq 'scripts/dev/|make |go test|ginkgo|kubectl kuttl|test/' "${doc}"; then + errors+=("${doc}: Harness Coverage Plan/Test Plan appears empty (add command evidence)") + fi + fi +done + +if [[ "${has_rg}" == "true" ]]; then + if ! printf '%s\n' "${changed_files[@]}" | rg -q '(^scripts/dev/.*(check|eval|run).*\.sh$|_test\.go$|^test/|^kuttl/|^harness/)'; then + errors+=("overall diff: missing harness/test evidence file (scripts/dev/*check*.sh, *_test.go, test/, kuttl/, or harness/)") + fi +else + if ! printf '%s\n' "${changed_files[@]}" | grep -Eq '(^scripts/dev/.*(check|eval|run).*\.sh$|_test\.go$|^test/|^kuttl/|^harness/)'; then + errors+=("overall diff: missing harness/test evidence file (scripts/dev/*check*.sh, *_test.go, test/, kuttl/, or harness/)") + fi +fi + +if [[ ${#errors[@]} -gt 0 ]]; then + printf 'doc_first_check failed:\n' >&2 + printf ' - %s\n' "${errors[@]}" >&2 + exit 1 +fi + +echo "doc_first_check: passed." 
diff --git a/scripts/dev/envtest.sh b/scripts/dev/envtest.sh new file mode 100755 index 000000000..922d3ed9a --- /dev/null +++ b/scripts/dev/envtest.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" +echo "Ensuring envtest assets are available" +make envtest + +if [[ "${RUN_TESTS:-}" == "1" ]]; then + echo "Running unit/envtest suite (make test)" + make test +fi diff --git a/scripts/dev/harness_engineering_parity_check.sh b/scripts/dev/harness_engineering_parity_check.sh new file mode 100755 index 000000000..bc08a8518 --- /dev/null +++ b/scripts/dev/harness_engineering_parity_check.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/dev/harness_engineering_parity_check.sh [--help] + +Validates docs/engineering/HARNESS_ENGINEERING_PARITY.md structure and matrix quality. +USAGE +} + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi +cd "${repo_root}" + +doc="docs/engineering/HARNESS_ENGINEERING_PARITY.md" +if [[ ! -f "${doc}" ]]; then + echo "harness_engineering_parity_check: FAIL: missing ${doc}" >&2 + exit 1 +fi + +must_contain() { + local pattern="$1" + local label="$2" + if command -v rg >/dev/null 2>&1; then + if ! rg -q "${pattern}" "${doc}"; then + echo "harness_engineering_parity_check: FAIL: missing ${label} in ${doc}" >&2 + exit 1 + fi + else + if ! 
grep -Eq "${pattern}" "${doc}"; then + echo "harness_engineering_parity_check: FAIL: missing ${label} in ${doc}" >&2 + exit 1 + fi + fi +} + +must_contain '^# Harness Engineering Parity' "document title" +must_contain '^## Product Profile' "product profile section" +must_contain '^## Backend Parity Matrix' "backend parity matrix section" +must_contain '^## No-UI Equivalents \(Required\)' "no-UI equivalents section" +must_contain 'N/A \(No UI\)' "no-UI status entry" +must_contain 'https://openai.com/index/harness-engineering/' "source link" + +rows="$(awk '/^\| H[0-9]+ /{print}' "${doc}")" +row_count="$(printf '%s\n' "${rows}" | awk 'NF{c++} END{print c+0}')" +if [[ "${row_count}" -lt 10 ]]; then + echo "harness_engineering_parity_check: FAIL: expected at least 10 parity rows, found ${row_count}" >&2 + exit 1 +fi + +bad=0 +while IFS= read -r row; do + [[ -n "${row}" ]] || continue + status="$(printf '%s\n' "${row}" | awk -F'|' '{s=$5; gsub(/^[ \t]+|[ \t]+$/, "", s); print s}')" + evidence="$(printf '%s\n' "${row}" | awk -F'|' '{e=$6; gsub(/^[ \t]+|[ \t]+$/, "", e); print e}')" + + case "${status}" in + "Implemented"|"In Progress"|"Planned"|"N/A (No UI)") + ;; + *) + echo "harness_engineering_parity_check: FAIL: invalid status '${status}' in row: ${row}" >&2 + bad=1 + ;; + esac + + if [[ -z "${evidence}" ]]; then + echo "harness_engineering_parity_check: FAIL: missing evidence column in row: ${row}" >&2 + bad=1 + fi +done < <(printf '%s\n' "${rows}") + +if [[ "${bad}" -ne 0 ]]; then + exit 1 +fi + +echo "harness_engineering_parity_check: passed." diff --git a/scripts/dev/harness_eval.sh b/scripts/dev/harness_eval.sh new file mode 100755 index 000000000..71f7a3292 --- /dev/null +++ b/scripts/dev/harness_eval.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/dev/harness_eval.sh [--suite <path>] [--help] + +Runs replayable governance regression checks from a suite YAML. 
+Default suite: docs/agent/evals/policy-regression.yaml +USAGE +} + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi +cd "${repo_root}" + +suite="docs/agent/evals/policy-regression.yaml" +while [[ $# -gt 0 ]]; do + case "$1" in + --suite) + if [[ $# -lt 2 ]]; then + echo "--suite requires a value." >&2 + exit 1 + fi + suite="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +if [[ ! -f "${suite}" ]]; then + echo "harness_eval: suite file not found: ${suite}" >&2 + exit 1 +fi + +has_rg=false +if command -v rg >/dev/null 2>&1; then + has_rg=true +fi + +contains_pattern() { + local file="$1" + local pattern="$2" + if [[ "${has_rg}" == "true" ]]; then + rg -Fq -- "${pattern}" "${file}" + else + grep -Fq -- "${pattern}" "${file}" + fi +} + +parse_cases() { + local suite_file="$1" + awk ' + function trim(s) { + gsub(/^[[:space:]]+|[[:space:]]+$/, "", s) + gsub(/^["\047]|["\047]$/, "", s) + return s + } + function flush_case() { + if (id != "" && file != "" && pattern != "") { + print id "|" file "|" pattern + } + id = "" + file = "" + pattern = "" + } + /^[[:space:]]*-[[:space:]]id:[[:space:]]*/ { + flush_case() + line = $0 + sub(/^[[:space:]]*-[[:space:]]id:[[:space:]]*/, "", line) + id = trim(line) + next + } + /^[[:space:]]*file:[[:space:]]*/ { + line = $0 + sub(/^[[:space:]]*file:[[:space:]]*/, "", line) + file = trim(line) + next + } + /^[[:space:]]*pattern:[[:space:]]*/ { + line = $0 + sub(/^[[:space:]]*pattern:[[:space:]]*/, "", line) + pattern = trim(line) + next + } + END { + flush_case() + } + ' "${suite_file}" +} + +mapfile -t cases < <(parse_cases "${suite}") +if [[ ${#cases[@]} -eq 0 ]]; then + echo "harness_eval: no cases found in ${suite}" >&2 + exit 1 +fi + +total=0 
+passed=0 +failed=0 + +for entry in "${cases[@]}"; do + total=$((total + 1)) + IFS='|' read -r case_id case_file case_pattern <<<"${entry}" + + if [[ ! -f "${case_file}" ]]; then + echo "[FAIL] ${case_id}: missing file ${case_file}" + failed=$((failed + 1)) + continue + fi + + if contains_pattern "${case_file}" "${case_pattern}"; then + echo "[PASS] ${case_id}" + passed=$((passed + 1)) + else + echo "[FAIL] ${case_id}: pattern not found in ${case_file}: ${case_pattern}" + failed=$((failed + 1)) + fi +done + +score=0 +if [[ ${total} -gt 0 ]]; then + score=$((passed * 100 / total)) +fi + +echo "harness_eval: total=${total} passed=${passed} failed=${failed} score=${score}%" + +if [[ ${failed} -gt 0 ]]; then + exit 1 +fi diff --git a/scripts/dev/harness_manifest_check.sh b/scripts/dev/harness_manifest_check.sh new file mode 100755 index 000000000..bf568daa4 --- /dev/null +++ b/scripts/dev/harness_manifest_check.sh @@ -0,0 +1,345 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/dev/harness_manifest_check.sh [--base-ref <branch>] [--help] + +Checks harness manifest policy for changed files. +- Non-trivial changes require a changed manifest under harness/manifests/. +- Each changed manifest must reference an Approved/Implemented KEP. +- Scope policy from manifest must match changed files. + +Options: + --base-ref <branch> Compare HEAD against origin/<branch> (CI mode). + -h, --help Show this help. +USAGE +} + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi +cd "${repo_root}" + +base_ref="" +while [[ $# -gt 0 ]]; do + case "$1" in + --base-ref) + if [[ $# -lt 2 ]]; then + echo "--base-ref requires a value." 
>&2 + exit 1 + fi + base_ref="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +collect_changed_files_local() { + local files=() + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git diff --name-only --diff-filter=ACMRT) + + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git diff --name-only --cached --diff-filter=ACMRT) + + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git ls-files --others --exclude-standard) + + printf '%s\n' "${files[@]}" +} + +collect_changed_files_ci() { + local ref="$1" + local remote_ref="refs/remotes/origin/${ref}" + + if ! git show-ref --verify --quiet "${remote_ref}"; then + git fetch --no-tags --depth=200 origin "${ref}:${remote_ref}" >/dev/null 2>&1 || true + fi + + if ! git show-ref --verify --quiet "${remote_ref}"; then + echo "Unable to resolve origin/${ref} for harness manifest check." >&2 + exit 1 + fi + + local merge_base + merge_base="$(git merge-base HEAD "${remote_ref}" || true)" + if [[ -z "${merge_base}" ]]; then + echo "Unable to compute merge-base against origin/${ref}." 
>&2 + exit 1 + fi + + git diff --name-only --diff-filter=ACMRT "${merge_base}...HEAD" +} + +extract_scalar() { + local key="$1" + local file="$2" + local line + line="$(grep -E "^${key}:[[:space:]]*" "${file}" | head -n1 || true)" + line="${line#*:}" + line="${line# }" + line="${line% }" + line="${line#\"}" + line="${line%\"}" + line="${line#\'}" + line="${line%\'}" + printf '%s\n' "${line}" +} + +extract_list() { + local key="$1" + local file="$2" + awk -v key="${key}" ' + BEGIN {in_section=0} + $0 ~ "^" key ":[[:space:]]*$" {in_section=1; next} + in_section == 1 && $0 ~ /^[^[:space:]]/ {in_section=0} + in_section == 1 && $0 ~ /^[[:space:]]*-[[:space:]]*/ { + line=$0 + sub(/^[[:space:]]*-[[:space:]]*/, "", line) + gsub(/^["\047]|["\047]$/, "", line) + print line + } + ' "${file}" +} + +has_value() { + [[ -n "$1" ]] +} + +is_non_trivial_path() { + case "$1" in + api/*|cmd/*|config/*|internal/*|kuttl/*|pkg/*|scripts/*|test/*|Makefile|go.mod|go.sum|PROJECT) + return 0 + ;; + *) + return 1 + ;; + esac +} + +has_rg=false +if command -v rg >/dev/null 2>&1; then + has_rg=true +fi + +match_status_approved() { + local file="$1" + local pattern='^(- )?Status:[[:space:]]*(Approved|Implemented)[[:space:]]*$' + if [[ "${has_rg}" == "true" ]]; then + rg -q "${pattern}" "${file}" + else + grep -Eq "${pattern}" "${file}" + fi +} + +declare -A seen=() +changed_files=() + +if [[ -n "${base_ref}" ]]; then + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_ci "${base_ref}") +else + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_local) +fi + +if [[ ${#changed_files[@]} -eq 0 ]]; then + echo "harness_manifest_check: no changed files detected." 
+ exit 0 +fi + +manifest_files=() +non_trivial=false +for file in "${changed_files[@]}"; do + if is_non_trivial_path "${file}"; then + non_trivial=true + fi + + case "${file}" in + harness/manifests/*.yaml|harness/manifests/*.yml) + manifest_files+=("${file}") + ;; + esac +done + +if [[ "${non_trivial}" != "true" ]]; then + echo "harness_manifest_check: no non-trivial code paths changed." + exit 0 +fi + +if [[ ${#manifest_files[@]} -eq 0 ]]; then + echo "harness_manifest_check: non-trivial changes detected but no manifest changed." >&2 + echo "Add harness/manifests/<ticket>-<topic>.yaml and link an approved KEP." >&2 + exit 1 +fi + +errors=() +allowed_union=() +forbidden_union=() + +for manifest in "${manifest_files[@]}"; do + if [[ ! -f "${manifest}" ]]; then + errors+=("${manifest}: file missing") + continue + fi + + is_example_manifest=false + manifest_name="$(basename "${manifest}")" + if [[ "${manifest_name}" == "EXAMPLE.yaml" || "${manifest_name}" == "EXAMPLE.yml" ]]; then + is_example_manifest=true + fi + + version="$(extract_scalar "version" "${manifest}")" + change_id="$(extract_scalar "change_id" "${manifest}")" + title="$(extract_scalar "title" "${manifest}")" + spec_file="$(extract_scalar "spec_file" "${manifest}")" + owner="$(extract_scalar "owner" "${manifest}")" + delivery_mode="$(extract_scalar "delivery_mode" "${manifest}")" + risk_tier="$(extract_scalar "risk_tier" "${manifest}")" + human_approvals_required="$(extract_scalar "human_approvals_required" "${manifest}")" + auto_merge_allowed="$(extract_scalar "auto_merge_allowed" "${manifest}")" + merge_queue_required="$(extract_scalar "merge_queue_required" "${manifest}")" + evaluation_suite="$(extract_scalar "evaluation_suite" "${manifest}")" + + if ! has_value "${version}"; then + errors+=("${manifest}: missing required key 'version'") + fi + if ! has_value "${change_id}"; then + errors+=("${manifest}: missing required key 'change_id'") + fi + if ! 
has_value "${title}"; then + errors+=("${manifest}: missing required key 'title'") + fi + if ! has_value "${spec_file}"; then + errors+=("${manifest}: missing required key 'spec_file'") + fi + if ! has_value "${owner}"; then + errors+=("${manifest}: missing required key 'owner'") + fi + if ! has_value "${delivery_mode}"; then + errors+=("${manifest}: missing required key 'delivery_mode'") + fi + if ! has_value "${risk_tier}"; then + errors+=("${manifest}: missing required key 'risk_tier'") + fi + if ! has_value "${human_approvals_required}"; then + errors+=("${manifest}: missing required key 'human_approvals_required'") + fi + if ! has_value "${auto_merge_allowed}"; then + errors+=("${manifest}: missing required key 'auto_merge_allowed'") + fi + if ! has_value "${merge_queue_required}"; then + errors+=("${manifest}: missing required key 'merge_queue_required'") + fi + if ! has_value "${evaluation_suite}"; then + errors+=("${manifest}: missing required key 'evaluation_suite'") + fi + + if [[ -n "${spec_file}" && "${is_example_manifest}" != "true" ]]; then + if [[ ! -f "${spec_file}" ]]; then + errors+=("${manifest}: referenced spec file not found: ${spec_file}") + elif ! match_status_approved "${spec_file}"; then + errors+=("${manifest}: referenced spec must be Status Approved or Implemented: ${spec_file}") + fi + fi + + if [[ -n "${evaluation_suite}" && ! -f "${evaluation_suite}" ]]; then + errors+=("${manifest}: evaluation_suite file not found: ${evaluation_suite}") + fi + + mapfile -t allowed_paths < <(extract_list "allowed_paths" "${manifest}") + mapfile -t forbidden_paths < <(extract_list "forbidden_paths" "${manifest}") + mapfile -t required_commands < <(extract_list "required_commands" "${manifest}") + + if ! 
grep -Eq '^required_commands:[[:space:]]*$' "${manifest}"; then + errors+=("${manifest}: missing required key 'required_commands'") + fi + if [[ ${#required_commands[@]} -eq 0 ]]; then + errors+=("${manifest}: required_commands must include at least spec_check and pr_check") + fi + + has_spec_cmd=false + has_pr_cmd=false + has_risk_cmd=false + for cmd in "${required_commands[@]}"; do + [[ "${cmd}" == scripts/dev/spec_check.sh* ]] && has_spec_cmd=true + [[ "${cmd}" == scripts/dev/pr_check.sh* ]] && has_pr_cmd=true + [[ "${cmd}" == scripts/dev/risk_policy_check.sh* ]] && has_risk_cmd=true + done + if [[ "${has_spec_cmd}" != "true" ]]; then + errors+=("${manifest}: required_commands missing scripts/dev/spec_check.sh") + fi + if [[ "${has_pr_cmd}" != "true" ]]; then + errors+=("${manifest}: required_commands missing scripts/dev/pr_check.sh") + fi + if [[ "${has_risk_cmd}" != "true" ]]; then + errors+=("${manifest}: required_commands missing scripts/dev/risk_policy_check.sh") + fi + + if grep -Eq '^allowed_paths:[[:space:]]*$' "${manifest}"; then + for item in "${allowed_paths[@]}"; do + allowed_union+=("${item}") + done + else + errors+=("${manifest}: missing required key 'allowed_paths'") + fi + + if grep -Eq '^forbidden_paths:[[:space:]]*$' "${manifest}"; then + for item in "${forbidden_paths[@]}"; do + forbidden_union+=("${item}") + done + else + errors+=("${manifest}: missing required key 'forbidden_paths'") + fi + +done + +if [[ ${#errors[@]} -gt 0 ]]; then + printf 'harness_manifest_check failed:\n' >&2 + printf ' - %s\n' "${errors[@]}" >&2 + exit 1 +fi + +if [[ ${#allowed_union[@]} -eq 0 ]]; then + echo "harness_manifest_check failed: manifest allowed_paths resolved to empty set." 
>&2 + exit 1 +fi + +allowed_csv="$(IFS=,; echo "${allowed_union[*]}")" +forbidden_csv="$(IFS=,; echo "${forbidden_union[*]}")" + +scope_args=() +if [[ -n "${base_ref}" ]]; then + scope_args+=(--base-ref "${base_ref}") +fi + +ALLOWED_PATHS="${allowed_csv}" FORBIDDEN_PATHS="${forbidden_csv}" \ + scripts/dev/scope_check.sh "${scope_args[@]}" + +echo "harness_manifest_check: passed." diff --git a/scripts/dev/harness_run.sh b/scripts/dev/harness_run.sh new file mode 100755 index 000000000..ade7683f0 --- /dev/null +++ b/scripts/dev/harness_run.sh @@ -0,0 +1,152 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/dev/harness_run.sh [--base-ref <branch>] [--suite <path>] [--fast] [--skip-pr-check] [--help] + +Runs harness gates and stores auditable artifacts in .harness/runs/<timestamp>-<sha>/. +USAGE +} + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi +cd "${repo_root}" + +base_ref="" +suite="docs/agent/evals/policy-regression.yaml" +run_fast=false +skip_pr_check=false + +while [[ $# -gt 0 ]]; do + case "$1" in + --base-ref) + if [[ $# -lt 2 ]]; then + echo "--base-ref requires a value." >&2 + exit 1 + fi + base_ref="$2" + shift 2 + ;; + --suite) + if [[ $# -lt 2 ]]; then + echo "--suite requires a value." 
>&2 + exit 1 + fi + suite="$2" + shift 2 + ;; + --fast) + run_fast=true + shift + ;; + --skip-pr-check) + skip_pr_check=true + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +short_sha="$(git rev-parse --short HEAD)" +timestamp="$(date -u +"%Y%m%dT%H%M%SZ")" +run_dir=".harness/runs/${timestamp}-${short_sha}" +mkdir -p "${run_dir}" + +trace_file="${run_dir}/trace.tsv" +summary_file="${run_dir}/summary.txt" + +echo -e "step\tstatus\texit_code\tstarted_at\tended_at\tcommand\tlog_file" > "${trace_file}" + +failures=0 +run_step() { + local step="$1" + shift + local log_file="${run_dir}/${step}.log" + local started_at ended_at rc status + + started_at="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" + set +e + "$@" >"${log_file}" 2>&1 + rc=$? + set -e + ended_at="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" + + if [[ ${rc} -eq 0 ]]; then + status="pass" + else + status="fail" + failures=$((failures + 1)) + fi + + echo -e "${step}\t${status}\t${rc}\t${started_at}\t${ended_at}\t$*\t${log_file}" >> "${trace_file}" + echo "[${status^^}] ${step} (log: ${log_file})" +} + +base_args=() +if [[ -n "${base_ref}" ]]; then + base_args+=(--base-ref "${base_ref}") +fi + +run_step spec_check scripts/dev/spec_check.sh "${base_args[@]}" +run_step manifest_check scripts/dev/harness_manifest_check.sh "${base_args[@]}" +run_step doc_first_check scripts/dev/doc_first_check.sh "${base_args[@]}" +run_step commit_discipline_check scripts/dev/commit_discipline_check.sh "${base_args[@]}" +run_step appframework_parity_check scripts/dev/appframework_parity_check.sh "${base_args[@]}" +run_step keps_check scripts/dev/keps_check.sh "${base_args[@]}" +run_step harness_engineering_parity_check scripts/dev/harness_engineering_parity_check.sh +run_step constitution_runtime_policy_check scripts/dev/constitution_runtime_policy_check.sh "${base_args[@]}" +run_step risk_policy_check scripts/dev/risk_policy_check.sh "${base_args[@]}" +run_step harness_eval 
scripts/dev/harness_eval.sh --suite "${suite}" + +if [[ "${skip_pr_check}" != "true" ]]; then + if [[ "${run_fast}" == "true" ]]; then + run_step pr_check bash -lc "SKIP_SPEC_CHECK=1 SKIP_MANIFEST_CHECK=1 SKIP_DOC_FIRST_CHECK=1 SKIP_COMMIT_DISCIPLINE_CHECK=1 SKIP_APPFRAMEWORK_PARITY_CHECK=1 SKIP_KEPS_CHECK=1 SKIP_HARNESS_PARITY_CHECK=1 SKIP_CONSTITUTION_RUNTIME_CHECK=1 SKIP_RISK_POLICY_CHECK=1 SKIP_RISK_LABEL_CHECK=1 SKIP_HARNESS_EVAL=1 PR_CHECK_FLAGS=--fast scripts/dev/pr_check.sh" + else + run_step pr_check bash -lc "SKIP_SPEC_CHECK=1 SKIP_MANIFEST_CHECK=1 SKIP_DOC_FIRST_CHECK=1 SKIP_COMMIT_DISCIPLINE_CHECK=1 SKIP_APPFRAMEWORK_PARITY_CHECK=1 SKIP_KEPS_CHECK=1 SKIP_HARNESS_PARITY_CHECK=1 SKIP_CONSTITUTION_RUNTIME_CHECK=1 SKIP_RISK_POLICY_CHECK=1 SKIP_RISK_LABEL_CHECK=1 SKIP_HARNESS_EVAL=1 scripts/dev/pr_check.sh" + fi +fi + +{ + echo "Harness Run Summary" + echo "run_dir: ${run_dir}" + echo "git_sha: $(git rev-parse HEAD)" + echo "branch: $(git branch --show-current)" + echo "base_ref: ${base_ref:-<none>}" + echo "suite: ${suite}" + echo "failed_steps: ${failures}" + echo "" + echo "Changed files:" + if [[ -n "${base_ref}" ]]; then + remote_ref="refs/remotes/origin/${base_ref}" + merge_base="$(git merge-base HEAD "${remote_ref}" || true)" + if [[ -n "${merge_base}" ]]; then + git diff --name-only --diff-filter=ACMRT "${merge_base}...HEAD" + else + echo "<unable to compute merge-base against origin/${base_ref}>" + fi + else + git diff --name-only --diff-filter=ACMRT + git diff --name-only --cached --diff-filter=ACMRT + git ls-files --others --exclude-standard + fi +} > "${summary_file}" + +if [[ ${failures} -gt 0 ]]; then + echo "harness_run: completed with failures (see ${summary_file})." >&2 + exit 1 +fi + +echo "harness_run: passed (artifacts: ${run_dir})." 
diff --git a/scripts/dev/keps_check.sh b/scripts/dev/keps_check.sh new file mode 100755 index 000000000..8a3c10b94 --- /dev/null +++ b/scripts/dev/keps_check.sh @@ -0,0 +1,292 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/dev/keps_check.sh [--base-ref <branch>] [--help] + +Validates component-to-KEP mapping for implementation-gated changes. +- Detects impacted components from changed paths. +- Collects referenced change IDs (CSPL-#### / GH-####) from specs, manifests, + change docs, and commit messages. +- Ensures every impacted component maps to at least one referenced ID in + docs/specs/COMPONENT_KEP_INDEX.md. + +Options: + --base-ref <branch> Compare HEAD against origin/<branch> (CI mode). + -h, --help Show this help. +USAGE +} + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi +cd "${repo_root}" + +base_ref="" +while [[ $# -gt 0 ]]; do + case "$1" in + --base-ref) + if [[ $# -lt 2 ]]; then + echo "--base-ref requires a value." >&2 + exit 1 + fi + base_ref="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +collect_changed_files_local() { + local files=() + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git diff --name-only --diff-filter=ACMRT) + + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git diff --name-only --cached --diff-filter=ACMRT) + + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git ls-files --others --exclude-standard) + + printf '%s\n' "${files[@]}" +} + +collect_changed_files_ci() { + local ref="$1" + local remote_ref="refs/remotes/origin/${ref}" + + if ! 
git show-ref --verify --quiet "${remote_ref}"; then + git fetch --no-tags --depth=200 origin "${ref}:${remote_ref}" >/dev/null 2>&1 || true + fi + + if ! git show-ref --verify --quiet "${remote_ref}"; then + echo "Unable to resolve origin/${ref} for keps check." >&2 + exit 1 + fi + + local merge_base + merge_base="$(git merge-base HEAD "${remote_ref}" || true)" + if [[ -z "${merge_base}" ]]; then + echo "Unable to compute merge-base against origin/${ref}." >&2 + exit 1 + fi + + { + git diff --name-only --diff-filter=ACMRT "${merge_base}...HEAD" + git diff --name-only --diff-filter=ACMRT + git diff --name-only --cached --diff-filter=ACMRT + git ls-files --others --exclude-standard + } | awk 'NF' | sort -u +} + +extract_ids_from_text() { + awk ' + { + while (match($0, /(CSPL-[0-9]+|GH-[0-9]+)/)) { + print substr($0, RSTART, RLENGTH) + $0 = substr($0, RSTART + RLENGTH) + } + } + ' +} + +component_has_section() { + local component="$1" + if command -v rg >/dev/null 2>&1; then + rg -q "^## ${component}$" "${index_file}" + else + grep -Eq "^## ${component}$" "${index_file}" + fi +} + +component_has_id() { + local component="$1" + local id="$2" + awk -v section="## ${component}" ' + $0 == section {in_section=1; next} + in_section && /^## / {exit} + in_section {print} + ' "${index_file}" | grep -Fq "${id}" +} + +declare -A seen=() +changed_files=() +if [[ -n "${base_ref}" ]]; then + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_ci "${base_ref}") +else + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_local) +fi + +if [[ ${#changed_files[@]} -eq 0 ]]; then + echo "keps_check: no changed files detected." 
+ exit 0 +fi + +declare -A impacted_components=() +mark_component() { + impacted_components["$1"]=1 +} + +for file in "${changed_files[@]}"; do + case "${file}" in + api/v*/*) + mark_component "api-crd" + ;; + internal/controller/*.go|internal/controller/*/*.go) + mark_component "controller-reconcile" + ;; + pkg/splunk/*.go|pkg/splunk/*/*.go|pkg/splunk/*/*/*.go) + mark_component "enterprise-runtime" + ;; + config/crd/*|config/manager/*|config/default/*|config/samples/*|bundle/*|helm-chart/*) + mark_component "manifests-release" + ;; + test/*|kuttl/*) + mark_component "test-harness" + ;; + scripts/*|harness/*|docs/agent/*|docs/specs/*|docs/changes/*|.github/workflows/*|.agents/skills/*|Makefile|skaffold.yaml|AGENTS.md) + mark_component "governance-harness" + ;; + docs/AppFramework.md|pkg/splunk/enterprise/afwscheduler.go|pkg/splunk/enterprise/afwscheduler_test.go|test/testenv/appframework_utils.go|test/appframework_*/*) + mark_component "app-framework" + ;; + esac +done + +if [[ ${#impacted_components[@]} -eq 0 ]]; then + echo "keps_check: passed (no KEP-mapped components impacted)." + exit 0 +fi + +index_file="docs/specs/COMPONENT_KEP_INDEX.md" +if [[ ! 
-f "${index_file}" ]]; then + echo "keps_check: FAIL: missing ${index_file}" >&2 + exit 1 +fi + +declare -A referenced_ids=() +add_id() { + local id="${1:-}" + [[ -n "${id}" ]] || return 0 + referenced_ids["${id}"]=1 +} + +for file in "${changed_files[@]}"; do + case "${file}" in + docs/specs/*.md) + base_name="$(basename "${file}")" + if [[ "${base_name}" != "README.md" && "${base_name}" != "SPEC_TEMPLATE.md" && "${base_name}" != "COMPONENT_KEP_INDEX.md" ]]; then + while IFS= read -r id; do + add_id "${id}" + done < <(printf '%s\n' "${file}" | extract_ids_from_text | sort -u) + + if [[ -f "${file}" ]]; then + while IFS= read -r id; do + add_id "${id}" + done < <(extract_ids_from_text < "${file}" | sort -u) + fi + fi + ;; + docs/changes/*.md|harness/manifests/*.yaml|harness/manifests/*.yml) + while IFS= read -r id; do + add_id "${id}" + done < <(printf '%s\n' "${file}" | extract_ids_from_text | sort -u) + + if [[ -f "${file}" ]]; then + while IFS= read -r id; do + add_id "${id}" + done < <(extract_ids_from_text < "${file}" | sort -u) + + if [[ "${file}" == harness/manifests/*.y*ml ]]; then + manifest_change_id="$(grep -E '^change_id:' "${file}" | head -n1 | cut -d':' -f2- | xargs || true)" + add_id "${manifest_change_id}" + fi + fi + ;; + esac +done + +if [[ -n "${base_ref}" ]]; then + remote_ref="refs/remotes/origin/${base_ref}" + if ! 
git show-ref --verify --quiet "${remote_ref}"; then + git fetch --no-tags --depth=200 origin "${base_ref}:${remote_ref}" >/dev/null 2>&1 || true + fi + if git show-ref --verify --quiet "${remote_ref}"; then + while IFS= read -r id; do + add_id "${id}" + done < <(git log --pretty=%B "${remote_ref}"..HEAD 2>/dev/null | extract_ids_from_text | sort -u || true) + fi +else + while IFS= read -r id; do + add_id "${id}" + done < <(git log -n 50 --pretty=%B 2>/dev/null | extract_ids_from_text | sort -u || true) +fi + +if [[ ${#referenced_ids[@]} -eq 0 ]]; then + cat >&2 <<'ERROR' +keps_check: FAIL: impacted components detected without referenced change IDs. + +Include at least one ID (CSPL-#### or GH-####) in: +- docs/specs/<id>-*.md, or +- docs/changes/*.md, or +- harness/manifests/*.yaml (change_id), or +- commit message. +ERROR + exit 1 +fi + +missing=0 +for component in "${!impacted_components[@]}"; do + if ! component_has_section "${component}"; then + echo "keps_check: FAIL: missing component section in ${index_file}: ${component}" >&2 + missing=1 + continue + fi + + matched=0 + for id in "${!referenced_ids[@]}"; do + if component_has_id "${component}" "${id}"; then + matched=1 + break + fi + done + + if [[ "${matched}" -eq 0 ]]; then + echo "keps_check: FAIL: component '${component}' has no referenced ID mapped in ${index_file}" >&2 + echo "keps_check: referenced IDs: $(printf '%s ' "${!referenced_ids[@]}")" >&2 + missing=1 + fi +done + +if [[ "${missing}" -ne 0 ]]; then + exit 1 +fi + +echo "keps_check: passed." diff --git a/scripts/dev/kind_smoke.sh b/scripts/dev/kind_smoke.sh new file mode 100755 index 000000000..62ef94d2e --- /dev/null +++ b/scripts/dev/kind_smoke.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. 
Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" + +if ! command -v kubectl >/dev/null 2>&1; then + echo "kubectl not found in PATH." >&2 + exit 1 +fi +if ! command -v kind >/dev/null 2>&1; then + echo "kind not found in PATH." >&2 + exit 1 +fi + +export CLUSTER_PROVIDER=kind +export TEST_CLUSTER_PLATFORM=kind +: "${TEST_CLUSTER_NAME:=sok-kind}" +: "${CLUSTER_WORKERS:=3}" + +echo "Bringing up kind cluster" +make cluster-up + +: "${NAMESPACE:=splunk-operator}" +: "${WATCH_NAMESPACE:=${NAMESPACE}}" +: "${ENVIRONMENT:=default}" +: "${SPLUNK_GENERAL_TERMS:=--accept-sgt-current-at-splunk-com}" +: "${SPLUNK_ENTERPRISE_IMAGE:=splunk/splunk:latest}" +: "${IMG:=splunk/splunk-operator:latest}" + +export NAMESPACE WATCH_NAMESPACE ENVIRONMENT SPLUNK_GENERAL_TERMS SPLUNK_ENTERPRISE_IMAGE IMG + +kubectl create namespace "${NAMESPACE}" >/dev/null 2>&1 || true + +echo "Deploying operator" +make deploy + +echo "Waiting for operator deployment to be ready" +kubectl -n "${NAMESPACE}" rollout status deploy/splunk-operator-controller-manager --timeout="${OPERATOR_READY_TIMEOUT:-5m}" + +if [[ "${APPLY_SAMPLE:-}" == "1" ]]; then + echo "Applying sample CR (best effort)" + kubectl -n "${NAMESPACE}" apply -f config/samples/enterprise_v4_standalone.yaml +fi + +echo "Kind smoke complete" diff --git a/scripts/dev/lint.sh b/scripts/dev/lint.sh new file mode 100755 index 000000000..c9124fe6a --- /dev/null +++ b/scripts/dev/lint.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." 
>&2 + exit 1 +fi + +cd "${repo_root}" +echo "Running format + vet" +make fmt vet + +if [[ "${RUN_STATICCHECK:-}" == "1" ]]; then + echo "Running staticcheck" + make scheck +fi + +if [[ "${RUN_BIAS_LINT:-}" == "1" ]]; then + echo "Running bias language linter" + make lang +fi diff --git a/scripts/dev/pr_check.sh b/scripts/dev/pr_check.sh new file mode 100755 index 000000000..12bfccb59 --- /dev/null +++ b/scripts/dev/pr_check.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" + +spec_args=() +base_ref="${SPEC_CHECK_BASE_REF:-${HARNESS_BASE_REF:-}}" +if [[ -n "${base_ref}" ]]; then + spec_args+=(--base-ref "${base_ref}") +fi + +if [[ "${SKIP_SPEC_CHECK:-0}" != "1" ]]; then + echo "Running spec check: ./scripts/dev/spec_check.sh ${spec_args[*]}" + ./scripts/dev/spec_check.sh "${spec_args[@]}" +fi + +if [[ "${SKIP_MANIFEST_CHECK:-0}" != "1" ]]; then + echo "Running harness manifest check: ./scripts/dev/harness_manifest_check.sh ${spec_args[*]}" + ./scripts/dev/harness_manifest_check.sh "${spec_args[@]}" +fi + +if [[ "${SKIP_DOC_FIRST_CHECK:-0}" != "1" ]]; then + echo "Running doc-first check: ./scripts/dev/doc_first_check.sh ${spec_args[*]}" + ./scripts/dev/doc_first_check.sh "${spec_args[@]}" +fi + +if [[ "${SKIP_COMMIT_DISCIPLINE_CHECK:-0}" != "1" ]]; then + echo "Running commit discipline check: ./scripts/dev/commit_discipline_check.sh ${spec_args[*]}" + ./scripts/dev/commit_discipline_check.sh "${spec_args[@]}" +fi + +if [[ "${SKIP_APPFRAMEWORK_PARITY_CHECK:-0}" != "1" ]]; then + echo "Running appframework parity check: ./scripts/dev/appframework_parity_check.sh ${spec_args[*]}" + ./scripts/dev/appframework_parity_check.sh "${spec_args[@]}" +fi + +if [[ 
"${SKIP_KEPS_CHECK:-0}" != "1" ]]; then + echo "Running KEP/component mapping check: ./scripts/dev/keps_check.sh ${spec_args[*]}" + ./scripts/dev/keps_check.sh "${spec_args[@]}" +fi + +if [[ "${SKIP_HARNESS_PARITY_CHECK:-0}" != "1" ]]; then + echo "Running harness engineering parity check: ./scripts/dev/harness_engineering_parity_check.sh" + ./scripts/dev/harness_engineering_parity_check.sh +fi + +if [[ "${SKIP_CONSTITUTION_RUNTIME_CHECK:-0}" != "1" ]]; then + echo "Running constitution/runtime policy check: ./scripts/dev/constitution_runtime_policy_check.sh ${spec_args[*]}" + ./scripts/dev/constitution_runtime_policy_check.sh "${spec_args[@]}" +fi + +if [[ "${SKIP_RISK_POLICY_CHECK:-0}" != "1" ]]; then + echo "Running risk policy check: ./scripts/dev/risk_policy_check.sh ${spec_args[*]}" + ./scripts/dev/risk_policy_check.sh "${spec_args[@]}" +fi + +if [[ "${SKIP_RISK_LABEL_CHECK:-0}" != "1" ]]; then + label_args=() + if [[ -n "${RISK_LABELS:-}" ]]; then + label_args+=(--labels "${RISK_LABELS}") + fi + echo "Running risk label check: ./scripts/dev/risk_label_check.sh ${spec_args[*]} ${label_args[*]}" + ./scripts/dev/risk_label_check.sh "${spec_args[@]}" "${label_args[@]}" +fi + +if [[ "${SKIP_HARNESS_EVAL:-0}" != "1" ]]; then + suite="${HARNESS_EVAL_SUITE:-docs/agent/evals/policy-regression.yaml}" + echo "Running harness eval: ./scripts/dev/harness_eval.sh --suite ${suite}" + ./scripts/dev/harness_eval.sh --suite "${suite}" +fi + +if [[ "${SKIP_SKILL_LINT:-0}" != "1" ]]; then + echo "Running skill contract lint: ./scripts/dev/skill_lint.sh" + ./scripts/dev/skill_lint.sh +fi + +if [[ "${SKIP_SCRIPT_SANITY:-0}" != "1" ]]; then + echo "Running script sanity checks: ./scripts/dev/script_sanity_check.sh" + ./scripts/dev/script_sanity_check.sh +fi + +args=() +if [[ "${RUN_ALL:-}" == "1" ]]; then + args+=(--all) +else + if [[ "${RUN_BUNDLE:-}" == "1" ]]; then + args+=(--bundle) + fi + if [[ "${RUN_TESTS:-}" == "1" ]]; then + args+=(--tests) + fi +fi + +if [[ -n 
"${PR_CHECK_FLAGS:-}" ]]; then
+  # shellcheck disable=SC2206
+  extra_flags=(${PR_CHECK_FLAGS})
+  args+=("${extra_flags[@]}")
+fi
+
+echo "Running repo PR checks: ./scripts/verify_repo.sh ${args[*]}"
+./scripts/verify_repo.sh "${args[@]}"
diff --git a/scripts/dev/risk_label_check.sh b/scripts/dev/risk_label_check.sh
new file mode 100755
index 000000000..db8995c95
--- /dev/null
+++ b/scripts/dev/risk_label_check.sh
@@ -0,0 +1,227 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat <<'USAGE'
+Usage: scripts/dev/risk_label_check.sh [--base-ref <branch>] [--labels <csv>] [--help]
+
+Checks that PR risk label matches manifest risk_tier for changed manifests.
+
+Rules:
+- Reads changed manifests under harness/manifests/ (excluding EXAMPLE.yaml).
+- Requires exactly one risk label of form risk:low|risk:medium|risk:high.
+- Label tier must equal manifest risk_tier.
+
+Options:
+  --base-ref <branch>  Compare HEAD against origin/<branch> (CI mode).
+  --labels <csv>       Comma-separated labels. Fallback: RISK_LABELS env var.
+  -h, --help           Show this help.
+
+Env:
+  RISK_LABELS               Comma-separated labels.
+  ENFORCE_RISK_LABEL_CHECK  Set to 1 to fail when labels are unavailable.
+USAGE
+}
+
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)"
+if [[ -z "${repo_root}" ]]; then
+  echo "Unable to locate repo root. Run from inside the git repo." >&2
+  exit 1
+fi
+cd "${repo_root}"
+
+base_ref=""
+labels_raw="${RISK_LABELS:-}"
+enforce="${ENFORCE_RISK_LABEL_CHECK:-0}"
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --base-ref)
+      if [[ $# -lt 2 ]]; then
+        echo "--base-ref requires a value." >&2
+        exit 1
+      fi
+      base_ref="$2"
+      shift 2
+      ;;
+    --labels)
+      if [[ $# -lt 2 ]]; then
+        echo "--labels requires a value." >&2
+        exit 1
+      fi
+      labels_raw="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+collect_changed_files_local() {
+  local files=()
+  while IFS= read -r line; do
+    [[ -n "${line}" ]] && files+=("${line}")
+  done < <(git diff --name-only --diff-filter=ACMRT)
+
+  while IFS= read -r line; do
+    [[ -n "${line}" ]] && files+=("${line}")
+  done < <(git diff --name-only --cached --diff-filter=ACMRT)
+
+  while IFS= read -r line; do
+    [[ -n "${line}" ]] && files+=("${line}")
+  done < <(git ls-files --others --exclude-standard)
+
+  printf '%s\n' "${files[@]}"
+}
+
+collect_changed_files_ci() {
+  local ref="$1"
+  local remote_ref="refs/remotes/origin/${ref}"
+
+  if ! git show-ref --verify --quiet "${remote_ref}"; then
+    git fetch --no-tags --depth=200 origin "${ref}:${remote_ref}" >/dev/null 2>&1 || true
+  fi
+
+  if ! git show-ref --verify --quiet "${remote_ref}"; then
+    echo "Unable to resolve origin/${ref} for risk label check." >&2
+    exit 1
+  fi
+
+  local merge_base
+  merge_base="$(git merge-base HEAD "${remote_ref}" || true)"
+  if [[ -z "${merge_base}" ]]; then
+    echo "Unable to compute merge-base against origin/${ref}." >&2
+    exit 1
+  fi
+
+  git diff --name-only --diff-filter=ACMRT "${merge_base}...HEAD"
+}
+
+extract_scalar() {
+  local key="$1"
+  local file="$2"
+  local line
+  line="$(grep -E "^${key}:[[:space:]]*" "${file}" | head -n1 || true)"
+  line="${line#*:}"
+  line="${line# }"
+  line="${line% }"
+  line="${line#\"}"
+  line="${line%\"}"
+  line="${line#\'}"
+  line="${line%\'}"
+  printf '%s\n' "${line}"
+}
+
+declare -A seen=()
+changed_files=()
+if [[ -n "${base_ref}" ]]; then
+  while IFS= read -r f; do
+    [[ -n "${f}" ]] || continue
+    if [[ -z "${seen[$f]+x}" ]]; then
+      seen["$f"]=1
+      changed_files+=("$f")
+    fi
+  done < <(collect_changed_files_ci "${base_ref}")
+else
+  while IFS= read -r f; do
+    [[ -n "${f}" ]] || continue
+    if [[ -z "${seen[$f]+x}" ]]; then
+      seen["$f"]=1
+      changed_files+=("$f")
+    fi
+  done < <(collect_changed_files_local)
+fi
+
+if [[ ${#changed_files[@]} -eq 0 ]]; then
+  echo "risk_label_check: no changed files detected."
+  exit 0
+fi
+
+manifest_files=()
+for file in "${changed_files[@]}"; do
+  case "${file}" in
+    harness/manifests/*.yaml|harness/manifests/*.yml)
+      manifest_name="$(basename "${file}")"
+      if [[ "${manifest_name}" != "EXAMPLE.yaml" && "${manifest_name}" != "EXAMPLE.yml" ]]; then
+        manifest_files+=("${file}")
+      fi
+      ;;
+  esac
+done
+
+if [[ ${#manifest_files[@]} -eq 0 ]]; then
+  echo "risk_label_check: no changed non-example manifests detected."
+  exit 0
+fi
+
+manifest_tier=""
+for manifest in "${manifest_files[@]}"; do
+  if [[ ! -f "${manifest}" ]]; then
+    echo "risk_label_check: missing manifest file: ${manifest}" >&2
+    exit 1
+  fi
+
+  tier="$(extract_scalar "risk_tier" "${manifest}")"
+  case "${tier}" in
+    low|medium|high)
+      ;;
+    *)
+      echo "risk_label_check: invalid or missing risk_tier in ${manifest}" >&2
+      exit 1
+      ;;
+  esac
+
+  if [[ -z "${manifest_tier}" ]]; then
+    manifest_tier="${tier}"
+  elif [[ "${manifest_tier}" != "${tier}" ]]; then
+    echo "risk_label_check: manifests in this change have mixed risk tiers (${manifest_tier} vs ${tier})." >&2
+    exit 1
+  fi
+done
+
+if [[ -z "${labels_raw}" ]]; then
+  if [[ "${enforce}" == "1" ]]; then
+    echo "risk_label_check: labels unavailable; set RISK_LABELS or provide --labels." >&2
+    exit 1
+  fi
+  echo "risk_label_check: labels unavailable in local mode; skipping strict check."
+  exit 0
+fi
+
+labels_normalized="${labels_raw//,/ }"
+risk_labels=()
+for label in ${labels_normalized}; do
+  label="${label# }"
+  label="${label% }"
+  case "${label}" in
+    risk:low|risk:medium|risk:high)
+      risk_labels+=("${label}")
+      ;;
+  esac
+done
+
+if [[ ${#risk_labels[@]} -eq 0 ]]; then
+  echo "risk_label_check: missing required risk label. Expected one of risk:low|risk:medium|risk:high" >&2
+  exit 1
+fi
+
+if [[ ${#risk_labels[@]} -gt 1 ]]; then
+  echo "risk_label_check: multiple risk labels found (${risk_labels[*]}). Keep exactly one." >&2
+  exit 1
+fi
+
+label_tier="${risk_labels[0]#risk:}"
+if [[ "${label_tier}" != "${manifest_tier}" ]]; then
+  echo "risk_label_check: label risk:${label_tier} does not match manifest risk_tier ${manifest_tier}." >&2
+  exit 1
+fi
+
+echo "risk_label_check: passed (risk:${label_tier})."
diff --git a/scripts/dev/risk_policy_check.sh b/scripts/dev/risk_policy_check.sh new file mode 100755 index 000000000..bc886c09b --- /dev/null +++ b/scripts/dev/risk_policy_check.sh @@ -0,0 +1,292 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/dev/risk_policy_check.sh [--base-ref <branch>] [--help] + +Checks risk-tier governance on changed harness manifests. +- Validates required risk keys and allowed values. +- Enforces minimum approvals/merge queue policy by risk tier. +- Enforces command-depth requirements for medium/high risk changes. + +Options: + --base-ref <branch> Compare HEAD against origin/<branch> (CI mode). + -h, --help Show this help. +USAGE +} + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi +cd "${repo_root}" + +base_ref="" +while [[ $# -gt 0 ]]; do + case "$1" in + --base-ref) + if [[ $# -lt 2 ]]; then + echo "--base-ref requires a value." >&2 + exit 1 + fi + base_ref="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +collect_changed_files_local() { + local files=() + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git diff --name-only --diff-filter=ACMRT) + + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git diff --name-only --cached --diff-filter=ACMRT) + + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git ls-files --others --exclude-standard) + + printf '%s\n' "${files[@]}" +} + +collect_changed_files_ci() { + local ref="$1" + local remote_ref="refs/remotes/origin/${ref}" + + if ! 
git show-ref --verify --quiet "${remote_ref}"; then + git fetch --no-tags --depth=200 origin "${ref}:${remote_ref}" >/dev/null 2>&1 || true + fi + + if ! git show-ref --verify --quiet "${remote_ref}"; then + echo "Unable to resolve origin/${ref} for risk policy check." >&2 + exit 1 + fi + + local merge_base + merge_base="$(git merge-base HEAD "${remote_ref}" || true)" + if [[ -z "${merge_base}" ]]; then + echo "Unable to compute merge-base against origin/${ref}." >&2 + exit 1 + fi + + git diff --name-only --diff-filter=ACMRT "${merge_base}...HEAD" +} + +extract_scalar() { + local key="$1" + local file="$2" + local line + line="$(grep -E "^${key}:[[:space:]]*" "${file}" | head -n1 || true)" + line="${line#*:}" + line="${line# }" + line="${line% }" + line="${line#\"}" + line="${line%\"}" + line="${line#\'}" + line="${line%\'}" + printf '%s\n' "${line}" +} + +extract_list() { + local key="$1" + local file="$2" + awk -v key="${key}" ' + BEGIN {in_section=0} + $0 ~ "^" key ":[[:space:]]*$" {in_section=1; next} + in_section == 1 && $0 ~ /^[^[:space:]]/ {in_section=0} + in_section == 1 && $0 ~ /^[[:space:]]*-[[:space:]]*/ { + line=$0 + sub(/^[[:space:]]*-[[:space:]]*/, "", line) + gsub(/^["\047]|["\047]$/, "", line) + print line + } + ' "${file}" +} + +declare -A seen=() +changed_files=() +if [[ -n "${base_ref}" ]]; then + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_ci "${base_ref}") +else + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_local) +fi + +if [[ ${#changed_files[@]} -eq 0 ]]; then + echo "risk_policy_check: no changed files detected." 
+ exit 0 +fi + +manifest_files=() +for file in "${changed_files[@]}"; do + case "${file}" in + harness/manifests/*.yaml|harness/manifests/*.yml) + manifest_files+=("${file}") + ;; + esac +done + +if [[ ${#manifest_files[@]} -eq 0 ]]; then + echo "risk_policy_check: no changed manifest files detected." + exit 0 +fi + +errors=() +for manifest in "${manifest_files[@]}"; do + if [[ ! -f "${manifest}" ]]; then + errors+=("${manifest}: file missing") + continue + fi + + manifest_name="$(basename "${manifest}")" + if [[ "${manifest_name}" == "EXAMPLE.yaml" || "${manifest_name}" == "EXAMPLE.yml" ]]; then + continue + fi + + risk_tier="$(extract_scalar "risk_tier" "${manifest}")" + delivery_mode="$(extract_scalar "delivery_mode" "${manifest}")" + approvals="$(extract_scalar "human_approvals_required" "${manifest}")" + auto_merge="$(extract_scalar "auto_merge_allowed" "${manifest}")" + merge_queue="$(extract_scalar "merge_queue_required" "${manifest}")" + + if [[ -z "${risk_tier}" ]]; then + errors+=("${manifest}: missing required key 'risk_tier'") + continue + fi + if [[ -z "${delivery_mode}" ]]; then + errors+=("${manifest}: missing required key 'delivery_mode'") + fi + if [[ -z "${approvals}" ]]; then + errors+=("${manifest}: missing required key 'human_approvals_required'") + fi + if [[ -z "${auto_merge}" ]]; then + errors+=("${manifest}: missing required key 'auto_merge_allowed'") + fi + if [[ -z "${merge_queue}" ]]; then + errors+=("${manifest}: missing required key 'merge_queue_required'") + fi + + case "${risk_tier}" in + low|medium|high) + ;; + *) + errors+=("${manifest}: invalid risk_tier '${risk_tier}' (use low|medium|high)") + ;; + esac + + case "${delivery_mode}" in + agent|hybrid|human) + ;; + *) + errors+=("${manifest}: invalid delivery_mode '${delivery_mode}' (use agent|hybrid|human)") + ;; + esac + + if [[ ! 
"${approvals}" =~ ^[0-9]+$ ]]; then + errors+=("${manifest}: human_approvals_required must be an integer") + continue + fi + + if [[ "${auto_merge}" != "true" && "${auto_merge}" != "false" ]]; then + errors+=("${manifest}: auto_merge_allowed must be true|false") + fi + + if [[ "${merge_queue}" != "true" && "${merge_queue}" != "false" ]]; then + errors+=("${manifest}: merge_queue_required must be true|false") + fi + + min_approvals=0 + case "${risk_tier}" in + low) + min_approvals=0 + ;; + medium) + min_approvals=1 + ;; + high) + min_approvals=2 + ;; + esac + + if (( approvals < min_approvals )); then + errors+=("${manifest}: approvals ${approvals} below minimum ${min_approvals} for ${risk_tier} risk") + fi + + if [[ "${risk_tier}" == "medium" || "${risk_tier}" == "high" ]]; then + if [[ "${merge_queue}" != "true" ]]; then + errors+=("${manifest}: merge_queue_required must be true for ${risk_tier} risk") + fi + if [[ "${auto_merge}" != "false" ]]; then + errors+=("${manifest}: auto_merge_allowed must be false for ${risk_tier} risk") + fi + fi + + mapfile -t required_commands < <(extract_list "required_commands" "${manifest}") + has_harness_run=false + has_unit=false + has_keps=false + has_harness_parity=false + has_constitution_runtime=false + for cmd in "${required_commands[@]}"; do + [[ "${cmd}" == scripts/dev/harness_run.sh* ]] && has_harness_run=true + [[ "${cmd}" == scripts/dev/unit.sh* || "${cmd}" == make\ test* ]] && has_unit=true + [[ "${cmd}" == scripts/dev/keps_check.sh* ]] && has_keps=true + [[ "${cmd}" == scripts/dev/harness_engineering_parity_check.sh* ]] && has_harness_parity=true + [[ "${cmd}" == scripts/dev/constitution_runtime_policy_check.sh* ]] && has_constitution_runtime=true + done + + if [[ "${risk_tier}" == "medium" || "${risk_tier}" == "high" ]]; then + if [[ "${has_harness_run}" != "true" ]]; then + errors+=("${manifest}: required_commands must include scripts/dev/harness_run.sh for ${risk_tier} risk") + fi + if [[ "${has_keps}" != "true" 
]]; then + errors+=("${manifest}: required_commands must include scripts/dev/keps_check.sh for ${risk_tier} risk") + fi + if [[ "${has_harness_parity}" != "true" ]]; then + errors+=("${manifest}: required_commands must include scripts/dev/harness_engineering_parity_check.sh for ${risk_tier} risk") + fi + if [[ "${has_constitution_runtime}" != "true" ]]; then + errors+=("${manifest}: required_commands must include scripts/dev/constitution_runtime_policy_check.sh for ${risk_tier} risk") + fi + fi + + if [[ "${risk_tier}" == "high" ]]; then + if [[ "${has_unit}" != "true" ]]; then + errors+=("${manifest}: required_commands must include scripts/dev/unit.sh or make test for high risk") + fi + fi +done + +if [[ ${#errors[@]} -gt 0 ]]; then + printf 'risk_policy_check failed:\n' >&2 + printf ' - %s\n' "${errors[@]}" >&2 + exit 1 +fi + +echo "risk_policy_check: passed." diff --git a/scripts/dev/scope_check.sh b/scripts/dev/scope_check.sh new file mode 100755 index 000000000..0f9df0e48 --- /dev/null +++ b/scripts/dev/scope_check.sh @@ -0,0 +1,211 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/dev/scope_check.sh [--base-ref <branch>] [--allowed <patterns>] [--forbidden <patterns>] [--exclude <patterns>] [--help] + +Checks changed files against allowed/forbidden glob patterns. +Patterns accept comma or whitespace separators (for example: "api/**,internal/**"). + +Options: + --base-ref <branch> Compare HEAD against origin/<branch> (CI mode). + --allowed <patterns> Allowed path globs. Falls back to ALLOWED_PATHS env var. + --forbidden <patterns> Forbidden path globs. Falls back to FORBIDDEN_PATHS env var. + --exclude <patterns> Excluded path globs. Falls back to EXCLUDE_PATHS env var. + -h, --help Show this help. +USAGE +} + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. 
Run from inside the git repo." >&2 + exit 1 +fi +cd "${repo_root}" + +base_ref="" +allowed_raw="${ALLOWED_PATHS:-}" +forbidden_raw="${FORBIDDEN_PATHS:-}" +exclude_raw="${EXCLUDE_PATHS:-}" + +while [[ $# -gt 0 ]]; do + case "$1" in + --base-ref) + if [[ $# -lt 2 ]]; then + echo "--base-ref requires a value." >&2 + exit 1 + fi + base_ref="$2" + shift 2 + ;; + --allowed) + if [[ $# -lt 2 ]]; then + echo "--allowed requires a value." >&2 + exit 1 + fi + allowed_raw="$2" + shift 2 + ;; + --forbidden) + if [[ $# -lt 2 ]]; then + echo "--forbidden requires a value." >&2 + exit 1 + fi + forbidden_raw="$2" + shift 2 + ;; + --exclude) + if [[ $# -lt 2 ]]; then + echo "--exclude requires a value." >&2 + exit 1 + fi + exclude_raw="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +collect_changed_files_local() { + local files=() + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git diff --name-only --diff-filter=ACMRT) + + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git diff --name-only --cached --diff-filter=ACMRT) + + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git ls-files --others --exclude-standard) + + printf '%s\n' "${files[@]}" +} + +collect_changed_files_ci() { + local ref="$1" + local remote_ref="refs/remotes/origin/${ref}" + + if ! git show-ref --verify --quiet "${remote_ref}"; then + git fetch --no-tags --depth=200 origin "${ref}:${remote_ref}" >/dev/null 2>&1 || true + fi + + if ! git show-ref --verify --quiet "${remote_ref}"; then + echo "Unable to resolve origin/${ref} for scope check." >&2 + exit 1 + fi + + local merge_base + merge_base="$(git merge-base HEAD "${remote_ref}" || true)" + if [[ -z "${merge_base}" ]]; then + echo "Unable to compute merge-base against origin/${ref}." 
>&2 + exit 1 + fi + + git diff --name-only --diff-filter=ACMRT "${merge_base}...HEAD" +} + +split_patterns() { + local raw="$1" + raw="${raw//,/ }" + local pat + set -f + for pat in ${raw}; do + [[ -n "${pat}" ]] && printf '%s\n' "${pat}" + done + set +f +} + +matches_any_pattern() { + local file="$1" + shift + local pat + for pat in "$@"; do + [[ -z "${pat}" ]] && continue + if [[ "${file}" == ${pat} ]]; then + return 0 + fi + done + return 1 +} + +declare -A seen=() +changed_files=() + +if [[ -n "${base_ref}" ]]; then + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_ci "${base_ref}") +else + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_local) +fi + +if [[ ${#changed_files[@]} -eq 0 ]]; then + echo "scope_check: no changed files detected." + exit 0 +fi + +allowed_patterns=() +forbidden_patterns=() +exclude_patterns=() + +if [[ -n "${allowed_raw}" ]]; then + while IFS= read -r pat; do + [[ -n "${pat}" ]] && allowed_patterns+=("${pat}") + done < <(split_patterns "${allowed_raw}") +fi +if [[ -n "${forbidden_raw}" ]]; then + while IFS= read -r pat; do + [[ -n "${pat}" ]] && forbidden_patterns+=("${pat}") + done < <(split_patterns "${forbidden_raw}") +fi +if [[ -n "${exclude_raw}" ]]; then + while IFS= read -r pat; do + [[ -n "${pat}" ]] && exclude_patterns+=("${pat}") + done < <(split_patterns "${exclude_raw}") +fi + +errors=() +for file in "${changed_files[@]}"; do + if [[ ${#exclude_patterns[@]} -gt 0 ]] && matches_any_pattern "${file}" "${exclude_patterns[@]}"; then + continue + fi + + if [[ ${#allowed_patterns[@]} -gt 0 ]] && ! 
matches_any_pattern "${file}" "${allowed_patterns[@]}"; then
+    errors+=("File not in allowed scope: ${file}")
+    continue
+  fi
+
+  if [[ ${#forbidden_patterns[@]} -gt 0 ]] && matches_any_pattern "${file}" "${forbidden_patterns[@]}"; then
+    errors+=("File in forbidden scope: ${file}")
+  fi
+done
+
+if [[ ${#errors[@]} -gt 0 ]]; then
+  printf 'scope_check failed:\n' >&2
+  printf ' - %s\n' "${errors[@]}" >&2
+  exit 1
+fi
+
+echo "scope_check: passed."
diff --git a/scripts/dev/script_sanity_check.sh b/scripts/dev/script_sanity_check.sh
new file mode 100755
index 000000000..c31ffaf32
--- /dev/null
+++ b/scripts/dev/script_sanity_check.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)"
+if [[ -z "${repo_root}" ]]; then
+  echo "Unable to locate repo root. Run from inside the git repo." >&2
+  exit 1
+fi
+
+cd "${repo_root}"
+
+echo "Checking shell script syntax"
+while IFS= read -r file; do
+  bash -n "${file}"
+done < <(find scripts .agents/skills -type f -name '*.sh' | sort)
+
+if command -v python3 >/dev/null 2>&1; then
+  echo "Checking Python script syntax"
+  python3 - <<'PY'
+from pathlib import Path
+
+path = Path("scripts/generate_testcase.py")
+compile(path.read_text(encoding="utf-8"), str(path), "exec")
+PY
+
+  echo "Running testcase generator dry-run contract check"
+  PYTHONDONTWRITEBYTECODE=1 python3 scripts/generate_testcase.py --spec docs/agent/TESTCASE_SPEC.yaml --dry-run >/dev/null
+fi
+
+echo "script_sanity_check: passed."
diff --git a/scripts/dev/skaffold_ci_smoke.sh b/scripts/dev/skaffold_ci_smoke.sh new file mode 100755 index 000000000..e849b23b2 --- /dev/null +++ b/scripts/dev/skaffold_ci_smoke.sh @@ -0,0 +1,96 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/dev/skaffold_ci_smoke.sh [--profile <name>] [--namespace <ns>] [--timeout <duration>] [--no-cleanup] [--help] + +Builds and deploys the operator using skaffold, then validates controller rollout. +Defaults: + profile=ci-smoke + namespace=splunk-operator + timeout=300s +USAGE +} + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" + +if ! command -v skaffold >/dev/null 2>&1; then + echo "skaffold is required. Install it from https://skaffold.dev/docs/install/" >&2 + exit 1 +fi + +if ! command -v kubectl >/dev/null 2>&1; then + echo "kubectl is required to validate smoke deployment." >&2 + exit 1 +fi + +profile="${SKAFFOLD_PROFILE:-ci-smoke}" +namespace="${OPERATOR_NAMESPACE:-splunk-operator}" +timeout="${SKAFFOLD_ROLLOUT_TIMEOUT:-300s}" +cleanup="${SKAFFOLD_CLEANUP:-1}" + +while [[ $# -gt 0 ]]; do + case "$1" in + --profile) + if [[ $# -lt 2 ]]; then + echo "--profile requires a value." >&2 + exit 1 + fi + profile="$2" + shift 2 + ;; + --namespace) + if [[ $# -lt 2 ]]; then + echo "--namespace requires a value." >&2 + exit 1 + fi + namespace="$2" + shift 2 + ;; + --timeout) + if [[ $# -lt 2 ]]; then + echo "--timeout requires a value." 
>&2 + exit 1 + fi + timeout="$2" + shift 2 + ;; + --no-cleanup) + cleanup="0" + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +cleanup_fn() { + if [[ "${cleanup}" == "1" ]]; then + skaffold delete -p "${profile}" || true + fi +} +trap cleanup_fn EXIT + +echo "Running skaffold smoke deploy (profile=${profile})" +skaffold run -p "${profile}" --status-check=true + +echo "Validating operator rollout in namespace ${namespace}" +kubectl -n "${namespace}" rollout status deployment/splunk-operator-controller-manager --timeout="${timeout}" +kubectl -n "${namespace}" get pods -l control-plane=controller-manager + +echo "Skaffold smoke deployment succeeded." diff --git a/scripts/dev/skaffold_dev.sh b/scripts/dev/skaffold_dev.sh new file mode 100755 index 000000000..abde146b6 --- /dev/null +++ b/scripts/dev/skaffold_dev.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/dev/skaffold_dev.sh [--profile <name>] [--help] [-- <extra skaffold args>] + +Runs skaffold dev for Splunk Operator. +Default profile: dev-kind +USAGE +} + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" + +if ! command -v skaffold >/dev/null 2>&1; then + echo "skaffold is required. Install it from https://skaffold.dev/docs/install/" >&2 + exit 1 +fi + +profile="${SKAFFOLD_PROFILE:-dev-kind}" +extra_args=() +while [[ $# -gt 0 ]]; do + case "$1" in + --profile) + if [[ $# -lt 2 ]]; then + echo "--profile requires a value." 
>&2 + exit 1 + fi + profile="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + --) + shift + extra_args+=("$@") + break + ;; + *) + extra_args+=("$1") + shift + ;; + esac +done + +echo "Running skaffold dev (profile=${profile})" +skaffold dev -p "${profile}" "${extra_args[@]}" diff --git a/scripts/dev/skill_lint.sh b/scripts/dev/skill_lint.sh new file mode 100755 index 000000000..62ff90384 --- /dev/null +++ b/scripts/dev/skill_lint.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" + +python3 - <<'PY' +from pathlib import Path +import re +import sys + +required_sections = [ + "## Overview", + "## Preconditions", + "## Workflow", + "## Pass / Fail Criteria", + "## Output Contract", +] + +skills = sorted(Path(".agents/skills").glob("*/SKILL.md")) +if not skills: + print("skill_lint: no skills found under .agents/skills") + sys.exit(1) + +failed = False +for skill in skills: + text = skill.read_text(encoding="utf-8") + local_fail = False + + frontmatter = re.match(r"^---\n(.*?)\n---\n", text, re.DOTALL) + if not frontmatter: + print(f"[FAIL] {skill}: missing YAML frontmatter") + failed = True + continue + + fm = frontmatter.group(1) + if not re.search(r"(?m)^name:\s*\S+", fm): + print(f"[FAIL] {skill}: frontmatter missing 'name'") + local_fail = True + if not re.search(r"(?m)^description:\s*.+", fm): + print(f"[FAIL] {skill}: frontmatter missing 'description'") + local_fail = True + + body = text[frontmatter.end() :] + for section in required_sections: + if section not in body: + print(f"[FAIL] {skill}: missing section '{section}'") + local_fail = True + + if local_fail: + failed = True + else: + print(f"[PASS] {skill}") + +if failed: + print("skill_lint: FAILED") + 
sys.exit(1) + +print(f"skill_lint: passed ({len(skills)} skills)") +PY diff --git a/scripts/dev/spec_check.sh b/scripts/dev/spec_check.sh new file mode 100755 index 000000000..55641ed03 --- /dev/null +++ b/scripts/dev/spec_check.sh @@ -0,0 +1,197 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/dev/spec_check.sh [--base-ref <branch>] [--help] + +Checks KEP-lite quality for changed files. +- Validates changed docs/specs/*.md files for required status and sections. +- Non-trivial code changes are linked to KEPs via harness manifests (checked by + scripts/dev/harness_manifest_check.sh). + +Options: + --base-ref <branch> Compare HEAD against origin/<branch> (CI mode). + -h, --help Show this help. +USAGE +} + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi +cd "${repo_root}" + +base_ref="" +while [[ $# -gt 0 ]]; do + case "$1" in + --base-ref) + if [[ $# -lt 2 ]]; then + echo "--base-ref requires a value." >&2 + exit 1 + fi + base_ref="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +collect_changed_files_local() { + local files=() + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git diff --name-only --diff-filter=ACMRT) + + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git diff --name-only --cached --diff-filter=ACMRT) + + while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") + done < <(git ls-files --others --exclude-standard) + + printf '%s\n' "${files[@]}" +} + +collect_changed_files_ci() { + local ref="$1" + local remote_ref="refs/remotes/origin/${ref}" + + if ! 
git show-ref --verify --quiet "${remote_ref}"; then + git fetch --no-tags --depth=200 origin "${ref}:${remote_ref}" >/dev/null 2>&1 || true + fi + + if ! git show-ref --verify --quiet "${remote_ref}"; then + echo "Unable to resolve origin/${ref} for spec check." >&2 + exit 1 + fi + + local merge_base + merge_base="$(git merge-base HEAD "${remote_ref}" || true)" + if [[ -z "${merge_base}" ]]; then + echo "Unable to compute merge-base against origin/${ref}." >&2 + exit 1 + fi + + git diff --name-only --diff-filter=ACMRT "${merge_base}...HEAD" +} + +has_rg=false +if command -v rg >/dev/null 2>&1; then + has_rg=true +fi + +has_section() { + local section="$1" + local file="$2" + if [[ "${has_rg}" == "true" ]]; then + rg -Fq "${section}" "${file}" + else + grep -Fq "${section}" "${file}" + fi +} + +match_status() { + local file="$1" + local pattern='^(- )?Status:[[:space:]]*(Draft|In Review|Approved|Implemented|Superseded)[[:space:]]*$' + if [[ "${has_rg}" == "true" ]]; then + rg -q "${pattern}" "${file}" + else + grep -Eq "${pattern}" "${file}" + fi +} + +declare -A seen=() +changed_files=() + +if [[ -n "${base_ref}" ]]; then + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_ci "${base_ref}") +else + while IFS= read -r f; do + [[ -n "${f}" ]] || continue + if [[ -z "${seen[$f]+x}" ]]; then + seen["$f"]=1 + changed_files+=("$f") + fi + done < <(collect_changed_files_local) +fi + +if [[ ${#changed_files[@]} -eq 0 ]]; then + echo "spec_check: no changed files detected." 
+ exit 0 +fi + +spec_files=() +for file in "${changed_files[@]}"; do + case "${file}" in + docs/specs/*.md) + base_name="$(basename "${file}")" + if [[ "${base_name}" != "README.md" && "${base_name}" != "SPEC_TEMPLATE.md" && "${base_name}" != "COMPONENT_KEP_INDEX.md" ]]; then + spec_files+=("${file}") + fi + ;; + esac +done + +if [[ ${#spec_files[@]} -eq 0 ]]; then + echo "spec_check: no changed KEP files detected." + exit 0 +fi + +required_sections=( + "## Summary" + "## Motivation" + "## Goals" + "## Non-Goals" + "## Proposal" + "## API/CRD Impact" + "## Reconcile/State Impact" + "## Test Plan" + "## Harness Validation" + "## Risks" + "## Rollout and Rollback" + "## Graduation Criteria" +) + +errors=() +for spec in "${spec_files[@]}"; do + if [[ ! -f "${spec}" ]]; then + errors+=("${spec}: file missing") + continue + fi + + if ! match_status "${spec}"; then + errors+=("${spec}: missing or invalid Status field") + fi + + for section in "${required_sections[@]}"; do + if ! has_section "${section}" "${spec}"; then + errors+=("${spec}: missing section '${section}'") + fi + done +done + +if [[ ${#errors[@]} -gt 0 ]]; then + printf 'spec_check failed:\n' >&2 + printf ' - %s\n' "${errors[@]}" >&2 + exit 1 +fi + +echo "spec_check: passed." diff --git a/scripts/dev/speckit_bridge.sh b/scripts/dev/speckit_bridge.sh new file mode 100755 index 000000000..7ea97e9aa --- /dev/null +++ b/scripts/dev/speckit_bridge.sh @@ -0,0 +1,360 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: + scripts/dev/speckit_bridge.sh bootstrap --change-id <ID> --title <title> [options] + +Bootstraps a Spec Kit workspace, KEP-lite spec, and harness manifest. + +Options: + --change-id <ID> Required. Jira/GitHub ID (for example CSPL-5000). + --title <title> Required. Human-readable change title. + --slug <slug> Optional. Kebab-case slug override. + --owner <owner> Optional. Default: @splunk/splunk-operator-for-kubernetes + --risk-tier <tier> Optional. 
low|medium|high (default: medium) + --delivery-mode <mode> Optional. agent|hybrid|human (default: agent) + -h, --help Show this help. +USAGE +} + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi +cd "${repo_root}" + +slugify() { + local value="$1" + value="$(printf '%s' "${value}" | tr '[:upper:]' '[:lower:]')" + value="$(printf '%s' "${value}" | sed -E 's/[^a-z0-9]+/-/g; s/^-+//; s/-+$//; s/-+/-/g')" + printf '%s\n' "${value}" +} + +set_risk_defaults() { + local tier="$1" + case "${tier}" in + low) + risk_approvals=0 + risk_auto_merge=true + risk_merge_queue=false + ;; + medium) + risk_approvals=1 + risk_auto_merge=false + risk_merge_queue=true + ;; + high) + risk_approvals=2 + risk_auto_merge=false + risk_merge_queue=true + ;; + *) + echo "Invalid --risk-tier: ${tier}. Use low|medium|high." 
>&2 + exit 1 + ;; + esac +} + +write_file_if_absent() { + local file="$1" + shift + if [[ -f "${file}" ]]; then + echo "exists: ${file}" + return + fi + mkdir -p "$(dirname "${file}")" + cat > "${file}" + echo "created: ${file}" +} + +cmd="${1:-}" +if [[ -z "${cmd}" ]]; then + usage + exit 1 +fi +shift || true + +case "${cmd}" in + bootstrap) + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown command: ${cmd}" >&2 + usage + exit 1 + ;; +esac + +change_id="" +title="" +slug="" +owner="@splunk/splunk-operator-for-kubernetes" +risk_tier="medium" +delivery_mode="agent" + +while [[ $# -gt 0 ]]; do + case "$1" in + --change-id) + change_id="$2" + shift 2 + ;; + --title) + title="$2" + shift 2 + ;; + --slug) + slug="$2" + shift 2 + ;; + --owner) + owner="$2" + shift 2 + ;; + --risk-tier) + risk_tier="$2" + shift 2 + ;; + --delivery-mode) + delivery_mode="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +if [[ -z "${change_id}" || -z "${title}" ]]; then + echo "--change-id and --title are required." >&2 + usage + exit 1 +fi + +case "${delivery_mode}" in + agent|hybrid|human) + ;; + *) + echo "Invalid --delivery-mode: ${delivery_mode}. Use agent|hybrid|human." >&2 + exit 1 + ;; +esac + +if [[ -z "${slug}" ]]; then + slug="$(slugify "${title}")" +fi +if [[ -z "${slug}" ]]; then + echo "Unable to derive slug from title. Pass --slug explicitly." >&2 + exit 1 +fi + +set_risk_defaults "${risk_tier}" + +today="$(date -u +"%Y-%m-%d")" +base_name="${change_id}-${slug}" +speckit_dir="speckit/specs/${base_name}" +spec_file="docs/specs/${base_name}.md" +manifest_file="harness/manifests/${base_name}.yaml" + +write_file_if_absent "${speckit_dir}/spec.md" <<EOF_SPEC +# ${title} + +- Change ID: ${change_id} +- Status: Draft +- Owner: ${owner} +- Created: ${today} + +## Problem +Describe the concrete user/operator problem. 
+ +## Success Criteria +- Criterion 1 +- Criterion 2 + +## Constraints +- Constraint 1 +- Constraint 2 + +## Links +- KEP: ${spec_file} +- Harness manifest: ${manifest_file} +EOF_SPEC + +write_file_if_absent "${speckit_dir}/plan.md" <<EOF_PLAN +# Implementation Plan: ${title} + +## Milestones +1. KEP review and approval +2. Implementation with harness checks +3. Validation and PR merge + +## Scope +- In scope: +- Out of scope: + +## Risks +- Risk: +- Mitigation: +EOF_PLAN + +write_file_if_absent "${speckit_dir}/tasks.md" <<EOF_TASKS +# Task List: ${title} + +- [ ] Draft and review KEP +- [ ] Set risk tier and scope in manifest +- [ ] Implement code changes +- [ ] Add/update tests +- [ ] Run harness gates +- [ ] Prepare PR summary and rollout notes +EOF_TASKS + +write_file_if_absent "${speckit_dir}/bridge.yaml" <<EOF_BRIDGE +version: v1 +change_id: ${change_id} +title: ${title} +speckit_dir: ${speckit_dir} +kepspec: ${spec_file} +manifest: ${manifest_file} +risk_tier: ${risk_tier} +delivery_mode: ${delivery_mode} +owner: ${owner} +EOF_BRIDGE + +write_file_if_absent "${spec_file}" <<EOF_KEP +# ${title} + +- ID: ${change_id} +- Status: Draft +- Owners: ${owner} +- Reviewers: ${owner} +- Created: ${today} +- Last Updated: ${today} +- Related Links: ${speckit_dir}/spec.md, ${speckit_dir}/plan.md, ${speckit_dir}/tasks.md + +## Summary +One paragraph describing what is changing and why. + +## Motivation +Describe user/operator pain and current limitations. + +## Goals +- Goal 1 +- Goal 2 + +## Non-Goals +- Out of scope 1 +- Out of scope 2 + +## Proposal +Describe the technical design and decision rationale. 
+ +## API/CRD Impact +- Affected CRDs/kinds: +- Schema/marker/defaulting changes: +- Compatibility notes: + +## Reconcile/State Impact +- Reconciler flows touched: +- Status phase/conditions behavior: +- Idempotency and requeue behavior: + +## Test Plan +- Unit: +- Integration (Ginkgo): +- KUTTL: +- Upgrade/regression: + +## Harness Validation +- scripts/dev/spec_check.sh +- scripts/dev/harness_manifest_check.sh +- scripts/dev/doc_first_check.sh +- scripts/dev/commit_discipline_check.sh +- scripts/dev/appframework_parity_check.sh +- scripts/dev/keps_check.sh +- scripts/dev/harness_engineering_parity_check.sh +- scripts/dev/constitution_runtime_policy_check.sh +- scripts/dev/risk_policy_check.sh +- scripts/dev/harness_eval.sh --suite docs/agent/evals/policy-regression.yaml +- scripts/dev/harness_run.sh --fast +- scripts/dev/pr_check.sh --fast + +## Risks +List behavioral, operational, and compatibility risks plus mitigations. + +## Rollout and Rollback +Define rollout order, observability, and rollback steps. 
+ +## Graduation Criteria +- [ ] Design reviewed and status moved to Approved +- [ ] Implementation merged and status moved to Implemented +- [ ] Required harness checks pass in CI +- [ ] Docs and examples updated (if applicable) +EOF_KEP + +write_file_if_absent "${manifest_file}" <<EOF_MANIFEST +version: v1 +change_id: ${change_id} +title: ${title} +spec_file: ${spec_file} +owner: ${owner} +delivery_mode: ${delivery_mode} +risk_tier: ${risk_tier} +human_approvals_required: ${risk_approvals} +auto_merge_allowed: ${risk_auto_merge} +merge_queue_required: ${risk_merge_queue} +evaluation_suite: docs/agent/evals/policy-regression.yaml +allowed_paths: + - api/** + - cmd/** + - config/** + - docs/** + - harness/** + - internal/** + - kuttl/** + - pkg/** + - scripts/** + - test/** + - templates/** +forbidden_paths: + - vendor/** + - bin/** + - .git/** +required_commands: + - scripts/dev/spec_check.sh + - scripts/dev/harness_manifest_check.sh + - scripts/dev/doc_first_check.sh + - scripts/dev/commit_discipline_check.sh + - scripts/dev/appframework_parity_check.sh + - scripts/dev/keps_check.sh + - scripts/dev/harness_engineering_parity_check.sh + - scripts/dev/constitution_runtime_policy_check.sh + - scripts/dev/risk_policy_check.sh + - scripts/dev/harness_eval.sh --suite docs/agent/evals/policy-regression.yaml + - scripts/dev/harness_run.sh --fast + - scripts/dev/pr_check.sh --fast +EOF_MANIFEST + +cat <<EOF_OUT + +Bridge bootstrap complete. +- Spec Kit: ${speckit_dir}/ +- KEP: ${spec_file} +- Manifest: ${manifest_file} + +Next: +1. Fill ${speckit_dir}/spec.md and ${speckit_dir}/plan.md. +2. Drive ${spec_file} to Status: Approved. +3. Implement code and run scripts/dev/harness_run.sh --fast. 
+EOF_OUT diff --git a/scripts/dev/start_change.sh b/scripts/dev/start_change.sh new file mode 100755 index 000000000..41ca93c0c --- /dev/null +++ b/scripts/dev/start_change.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: scripts/dev/start_change.sh <topic> + +Creates docs/changes/<YYYY-MM-DD>-<topic>.md from docs/changes/TEMPLATE.md. +USAGE +} + +if [[ $# -ne 1 ]]; then + usage >&2 + exit 1 +fi + +topic="$1" +slug="$(printf '%s' "${topic}" | tr '[:upper:]' '[:lower:]' | sed -E 's/[^a-z0-9]+/-/g; s/^-+|-+$//g')" +if [[ -z "${slug}" ]]; then + echo "Unable to derive topic slug from input: ${topic}" >&2 + exit 1 +fi + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi +cd "${repo_root}" + +mkdir -p docs/changes + +date_prefix="$(date +%F)" +out_file="docs/changes/${date_prefix}-${slug}.md" +template="docs/changes/TEMPLATE.md" + +if [[ ! -f "${template}" ]]; then + echo "Missing template: ${template}" >&2 + exit 1 +fi + +if [[ -e "${out_file}" ]]; then + echo "File already exists: ${out_file}" >&2 + exit 1 +fi + +cp "${template}" "${out_file}" +{ + echo + echo "<!-- topic: ${topic} -->" +} >> "${out_file}" + +echo "Created ${out_file}" diff --git a/scripts/dev/unit.sh b/scripts/dev/unit.sh new file mode 100755 index 000000000..a40b14f80 --- /dev/null +++ b/scripts/dev/unit.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." 
>&2 + exit 1 +fi + +cd "${repo_root}" +echo "Running unit/envtest suite (make test)" +make test diff --git a/scripts/generate_testcase.py b/scripts/generate_testcase.py new file mode 100755 index 000000000..1297bf500 --- /dev/null +++ b/scripts/generate_testcase.py @@ -0,0 +1,661 @@ +#!/usr/bin/env python3 +import argparse +import json +import os +import re +import shutil +import sys +from datetime import datetime +from pathlib import Path + + +def load_spec(path: Path): + suffix = path.suffix.lower() + if suffix == ".json": + with path.open("r", encoding="utf-8") as f: + return json.load(f) + if suffix in (".yaml", ".yml"): + try: + import yaml # type: ignore + except Exception: + print("[ERROR] PyYAML is required to read YAML specs.") + print("Install with: python3 -m pip install pyyaml") + sys.exit(2) + with path.open("r", encoding="utf-8") as f: + return yaml.safe_load(f) + print(f"[ERROR] Unsupported spec extension: {suffix}") + print("Use .yaml, .yml, or .json") + sys.exit(2) + + +def slugify(value: str) -> str: + value = value.strip().lower() + value = re.sub(r"[^a-z0-9]+", "-", value) + value = re.sub(r"-+", "-", value).strip("-") + return value or "test" + + +def ensure_dir(path: Path): + path.mkdir(parents=True, exist_ok=True) + + +def read_text(path: Path) -> str: + return path.read_text(encoding="utf-8") + + +def write_text(path: Path, content: str, force: bool): + if path.exists() and not force: + print(f"[ERROR] Refusing to overwrite existing file: {path}") + print("Use --force to overwrite") + sys.exit(1) + path.write_text(content, encoding="utf-8") + + +def indent_block(text: str, spaces: int) -> str: + prefix = " " * spaces + return "\n".join(prefix + line if line.strip() else "" for line in text.splitlines()) + + +def go_bool(value: bool) -> str: + return "true" if value else "false" + + +def kuttl_assert_for_resource(res: dict) -> str: + api_version = res.get("apiVersion", "") + kind = res.get("kind", "") + name = res.get("name", "") + status = 
res.get("status") + lines = ["---", f"apiVersion: {api_version}", f"kind: {kind}", "metadata:", f" name: {name}"] + if isinstance(status, dict) and status: + lines.append("status:") + for key, value in status.items(): + if isinstance(value, str): + lines.append(f" {key}: {value}") + else: + lines.append(f" {key}: {value}") + return "\n".join(lines) + "\n" + + +def generate_kuttl(spec: dict, repo_root: Path, force: bool, dry_run: bool): + suite = spec["suite"] + name = spec["name"] + crs = spec.get("crs") + if crs is None: + cr = spec.get("cr") + if cr is None: + crs = [] + else: + crs = [cr] + if not isinstance(crs, list): + print("[ERROR] crs must be a list") + sys.exit(1) + upgrade = spec.get("upgrade", {}) if isinstance(spec.get("upgrade", {}), dict) else {} + upgrade_enabled = bool(upgrade.get("enabled", False)) + if not crs and not upgrade_enabled: + print("[ERROR] kuttl spec requires cr or crs unless upgrade.enabled=true") + sys.exit(1) + expected = spec.get("expected", {}) + resources = spec.get("resources", []) + phase = expected.get("phase", "Ready") + phases = expected.get("phases", {}) + assert_path = expected.get("assert_path", "") + + test_dir = repo_root / "kuttl" / "tests" / suite / name + assert_name = "00-assert.yaml" if not upgrade_enabled else "04-assert.yaml" + assert_target = test_dir / assert_name + + if dry_run: + print(f"[DRY-RUN] Create {test_dir}") + if upgrade_enabled: + print(f"[DRY-RUN] Write {test_dir / '00-install.yaml'}") + print(f"[DRY-RUN] Write {test_dir / '01-assert-operator-ready.yaml'}") + print(f"[DRY-RUN] Write {test_dir / '02-upgrade.yaml'}") + print(f"[DRY-RUN] Write {test_dir / '03-assert-operator-image.yaml'}") + for index, cr in enumerate(crs): + kind = cr.get("kind", "") + deploy_index = index if not upgrade_enabled else index + 4 + deploy_name = f"{deploy_index:02d}-deploy-{slugify(kind)}.yaml" + cr_path = Path(cr.get("path", "")).expanduser() + if not cr_path.is_absolute(): + cr_path = (repo_root / cr_path).resolve() 
+ print(f"[DRY-RUN] Copy {cr_path} -> {test_dir / deploy_name}") + print(f"[DRY-RUN] Write {assert_target}") + return + + ensure_dir(test_dir) + + if upgrade_enabled: + method = str(upgrade.get("method", "helm")).lower() + if method != "helm": + print("[ERROR] upgrade.method only supports 'helm' for now") + sys.exit(1) + + helm_release = str(upgrade.get("helmRelease", "splunk-test")) + helm_repo_env = str(upgrade.get("helmChartPathEnv", "HELM_REPO_PATH")) + namespace_env = str(upgrade.get("namespaceEnv", "NAMESPACE")) + values_file = str(upgrade.get("valuesFile", "")).strip() + operator_image_env = str(upgrade.get("operatorImageEnv", "KUTTL_SPLUNK_OPERATOR_IMAGE")) + operator_image_new_env = str(upgrade.get("operatorImageNewEnv", "KUTTL_SPLUNK_OPERATOR_NEW_IMAGE")) + enterprise_image_env = str(upgrade.get("enterpriseImageEnv", "KUTTL_SPLUNK_ENTERPRISE_IMAGE")) + enterprise_image_new_env = str(upgrade.get("enterpriseImageNewEnv", "KUTTL_SPLUNK_ENTERPRISE_NEW_IMAGE")) + extra_args = upgrade.get("extraHelmArgs", []) + if not isinstance(extra_args, list): + extra_args = [] + + values_arg = "" + if values_file: + values_path = Path(values_file).expanduser() + if not values_path.is_absolute(): + values_path = (repo_root / values_path).resolve() + if not values_path.exists(): + print(f"[ERROR] valuesFile not found: {values_path}") + sys.exit(1) + values_target = test_dir / values_path.name + if not values_target.exists() or force: + shutil.copyfile(values_path, values_target) + values_arg = f"-f {values_target.name}" + + extra = " ".join(extra_args) + install_cmd = ( + f"helm install {helm_release} " + f"${{{helm_repo_env}}}/splunk-enterprise {values_arg} " + f"--set splunk-operator.splunkOperator.image.repository=${{{operator_image_env}}} " + f"--set splunk-operator.image.repository=${{{enterprise_image_env}}} " + f"--namespace ${{{namespace_env}}} " + f"--set splunk-operator.splunkOperator.splunkGeneralTerms=\\\"--accept-sgt-current-at-splunk-com\\\" " + f"{extra}" + 
).strip() + + upgrade_cmd = ( + f"helm upgrade {helm_release} " + f"${{{helm_repo_env}}}/splunk-enterprise --reuse-values {values_arg} " + f"--set splunk-operator.splunkOperator.image.repository=${{{operator_image_new_env}}} " + f"--set splunk-operator.image.repository=${{{enterprise_image_new_env}}} " + f"--namespace ${{{namespace_env}}} " + f"{extra}" + ).strip() + + install_step = f"""--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - command: {install_cmd} + namespaced: true +""" + ready_assert = """--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: splunk-operator-controller-manager +status: + readyReplicas: 1 + availableReplicas: 1 +""" + upgrade_step = f"""--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - command: {upgrade_cmd} + namespaced: true +""" + image_check_cmd = ( + f"kubectl -n ${{{namespace_env}}} get deploy splunk-operator-controller-manager " + f"-o jsonpath='{{{{.spec.template.spec.containers[?(@.name==\\\"manager\\\")].image}}}}' " + f"| grep -q \"${{{operator_image_new_env}}}\"" + ) + image_assert_step = f"""--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - command: {image_check_cmd} + namespaced: true +""" + + write_text(test_dir / "00-install.yaml", install_step, force) + write_text(test_dir / "01-assert-operator-ready.yaml", ready_assert, force) + write_text(test_dir / "02-upgrade.yaml", upgrade_step, force) + write_text(test_dir / "03-assert-operator-image.yaml", image_assert_step, force) + for index, cr in enumerate(crs): + api_version = cr.get("apiVersion", "") + kind = cr.get("kind", "") + cr_name = cr.get("name", "") + cr_path = Path(cr.get("path", "")).expanduser() + if not api_version or not kind or not cr_name: + print("[ERROR] crs entries must include apiVersion, kind, and name") + sys.exit(1) + if not cr_path.is_absolute(): + cr_path = (repo_root / cr_path).resolve() + if not cr_path.exists(): + print(f"[ERROR] CR manifest not found: {cr_path}") + sys.exit(1) + 
deploy_index = index if not upgrade_enabled else index + 4 + deploy_name = f"{deploy_index:02d}-deploy-{slugify(kind)}.yaml" + deploy_target = test_dir / deploy_name + if deploy_target.exists() and not force: + print(f"[ERROR] Deploy file exists: {deploy_target}") + sys.exit(1) + shutil.copyfile(cr_path, deploy_target) + + # Build assert content + content = [] + for cr in crs: + api_version = cr.get("apiVersion", "") + kind = cr.get("kind", "") + cr_name = cr.get("name", "") + if not api_version or not kind or not cr_name: + print("[ERROR] crs entries must include apiVersion, kind, and name") + sys.exit(1) + phase_for_cr = phases.get(cr_name, phase) if isinstance(phases, dict) else phase + content.append("---") + content.append(f"apiVersion: {api_version}") + content.append(f"kind: {kind}") + content.append("metadata:") + content.append(f" name: {cr_name}") + content.append("status:") + content.append(f" phase: {phase_for_cr}") + content.append("") + + if assert_path: + assert_file = Path(assert_path).expanduser() + if not assert_file.is_absolute(): + assert_file = (repo_root / assert_file).resolve() + if not assert_file.exists(): + print(f"[ERROR] assert_path not found: {assert_file}") + sys.exit(1) + content.append(read_text(assert_file).rstrip()) + content.append("") + elif resources: + for res in resources: + content.append(kuttl_assert_for_resource(res).rstrip()) + elif crs: + content.append("# TODO: add resource assertions (StatefulSet, Service, Secret, etc.)") + + if content: + write_text(assert_target, "\n".join(content).rstrip() + "\n", force) + + readme_path = test_dir / "readme.txt" + if not readme_path.exists(): + readme_path.write_text(f"KUTTL test: {name}\n", encoding="utf-8") + + print(f"[OK] Created KUTTL test: {test_dir}") + + +def suite_template(suite: str) -> str: + suite_name = slugify(suite) + return f"""// Copyright (c) 2018-2022 Splunk Inc. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the \"License\"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//\thttp://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an \"AS IS\" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package {suite_name} + +import ( + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/splunk/splunk-operator/test/testenv" +) + +const ( + PollInterval = 5 * time.Second + ConsistentPollInterval = 200 * time.Millisecond + ConsistentDuration = 2000 * time.Millisecond +) + +var ( + testenvInstance *testenv.TestEnv + testSuiteName = "{suite_name}-" + testenv.RandomDNSName(3) +) + +func TestBasic(t *testing.T) {{ + RegisterFailHandler(Fail) + RunSpecs(t, "Running "+testSuiteName) +}} + +var _ = BeforeSuite(func() {{ + var err error + testenvInstance, err = testenv.NewDefaultTestEnv(testSuiteName) + Expect(err).ToNot(HaveOccurred()) +}}) + +var _ = AfterSuite(func() {{ + if testenvInstance != nil {{ + Expect(testenvInstance.Teardown()).ToNot(HaveOccurred()) + }} +}}) +""" + + +def integration_flow(spec: dict): + arch = spec.get("architecture", {}) if isinstance(spec.get("architecture", {}), dict) else {} + arch_name = str(arch.get("name", "")).upper() + indexer_replicas = int(arch.get("indexerReplicas", 3) or 3) + site_count = int(arch.get("siteCount", 3) or 3) + shc = bool(arch.get("shc", True)) + use_legacy_cluster_manager = bool(arch.get("useLegacyClusterManager", False)) + features = spec.get("features", {}) if isinstance(spec.get("features", {}), dict) else {} + use_smartstore = bool(features.get("smartstore", False)) + use_appframework = 
bool(features.get("appframework", False)) + + deploy_lines = [] + ready_lines = [] + notes = [] + + upgrade = spec.get("upgrade", {}) if isinstance(spec.get("upgrade", {}), dict) else {} + upgrade_enabled = bool(upgrade.get("enabled", False)) + operator_image_env = str(upgrade.get("operatorImageNewEnv", "UPGRADE_OPERATOR_IMAGE")) + enterprise_image_env = str(upgrade.get("enterpriseImageNewEnv", "UPGRADE_SPLUNK_IMAGE")) + upgrade_lines = [] + + validations = spec.get("validations", {}) if isinstance(spec.get("validations", {}), dict) else {} + sva = str(validations.get("sva", "")).strip().upper() + has_mc_flag = isinstance(validations, dict) and "monitoringConsole" in validations + has_lm_flag = isinstance(validations, dict) and "licenseManager" in validations + mc_enabled = bool(validations.get("monitoringConsole", False)) + lm_enabled = bool(validations.get("licenseManager", False)) + mc_name_override = str(validations.get("monitoringConsoleName", "")).strip() + if sva in ("C3", "SINGLE-SITE", "SINGLE_SITE", "SVA") and arch_name in ("C3", "SINGLE-SITE", "SINGLE_SITE"): + if not has_mc_flag: + mc_enabled = True + if not has_lm_flag: + lm_enabled = True + + go_shc = go_bool(shc) + + if arch_name in ("", "S1", "STANDALONE"): + if use_smartstore: + deploy_lines.append('Skip("TODO: implement smartstore standalone using DeployStandaloneWithGivenSmartStoreSpec")') + elif use_appframework: + deploy_lines.append('Skip("TODO: implement app framework standalone (no helper) using custom spec")') + else: + deploy_lines.append('instance, err := deployment.DeployStandalone(ctx, deployment.GetName(), "", "")') + deploy_lines.append('Expect(err).To(Succeed(), "Unable to deploy standalone instance")') + ready_lines.append("testenv.StandaloneReady(ctx, deployment, deployment.GetName(), instance, testcaseEnvInst)") + elif arch_name in ("C3", "SINGLE-SITE", "SINGLE_SITE"): + if use_appframework: + deploy_lines.append('Skip("TODO: implement app framework using 
DeploySingleSiteClusterWithGivenAppFrameworkSpec")') + elif use_smartstore: + deploy_lines.append('Skip("TODO: implement smartstore using DeployClusterManagerWithSmartStoreIndexes + DeployIndexerCluster")') + else: + if mc_enabled: + mc_ref_expr = "deployment.GetName()" + if mc_name_override: + mc_ref_expr = json.dumps(mc_name_override) + deploy_lines.append(f"mcRef := {mc_ref_expr}") + deploy_lines.append('lmRef := ""') + if lm_enabled: + deploy_lines.append("lmRef = deployment.GetName()") + mc_ref_arg = "mcRef" if mc_enabled else '""' + deploy_lines.append( + f"err := deployment.DeploySingleSiteCluster(ctx, deployment.GetName(), {indexer_replicas}, {go_shc}, {mc_ref_arg})" + ) + deploy_lines.append('Expect(err).To(Succeed(), "Unable to deploy single-site cluster")') + ready_lines.append( + "testenv.LegacyClusterManagerReady(ctx, deployment, testcaseEnvInst)" + if use_legacy_cluster_manager + else "testenv.ClusterManagerReady(ctx, deployment, testcaseEnvInst)" + ) + if shc: + ready_lines.append("testenv.SearchHeadClusterReady(ctx, deployment, testcaseEnvInst)") + ready_lines.append("testenv.SingleSiteIndexersReady(ctx, deployment, testcaseEnvInst)") + ready_lines.append("testenv.VerifyRFSFMet(ctx, deployment, testcaseEnvInst)") + if lm_enabled: + ready_lines.append( + "testenv.LegacyLicenseManagerReady(ctx, deployment, testcaseEnvInst)" + if use_legacy_cluster_manager + else "testenv.LicenseManagerReady(ctx, deployment, testcaseEnvInst)" + ) + notes.append("License Manager readiness requires a license file/configmap to be configured for the test env.") + if mc_enabled: + ready_lines.append('mc, err := deployment.DeployMonitoringConsole(ctx, mcRef, lmRef)') + ready_lines.append('Expect(err).To(Succeed(), "Unable to deploy Monitoring Console")') + ready_lines.append("testenv.VerifyMonitoringConsoleReady(ctx, deployment, mcRef, mc, testcaseEnvInst)") + elif arch_name in ("M4", "MULTISITE", "MULTI-SITE", "M4-SHC", "MULTISITE-SHC"): + if use_appframework: + 
deploy_lines.append('Skip("TODO: implement app framework using DeployMultisiteClusterWithSearchHeadAndAppFramework")') + elif use_smartstore: + deploy_lines.append('Skip("TODO: implement smartstore using DeployMultisiteClusterWithSearchHeadAndIndexes")') + else: + deploy_lines.append( + f'err := deployment.DeployMultisiteClusterWithSearchHead(ctx, deployment.GetName(), {indexer_replicas}, {site_count}, "")' + ) + deploy_lines.append('Expect(err).To(Succeed(), "Unable to deploy multisite cluster with SHC")') + ready_lines.append( + "testenv.LegacyClusterManagerReady(ctx, deployment, testcaseEnvInst)" + if use_legacy_cluster_manager + else "testenv.ClusterManagerReady(ctx, deployment, testcaseEnvInst)" + ) + ready_lines.append(f"testenv.IndexersReady(ctx, deployment, testcaseEnvInst, {site_count})") + ready_lines.append(f"testenv.IndexerClusterMultisiteStatus(ctx, deployment, testcaseEnvInst, {site_count})") + ready_lines.append("testenv.SearchHeadClusterReady(ctx, deployment, testcaseEnvInst)") + ready_lines.append("testenv.VerifyRFSFMet(ctx, deployment, testcaseEnvInst)") + elif arch_name in ("M1", "MULTISITE-NOSHC", "MULTISITE_NO_SHC", "MULTISITE-NO-SHC"): + if use_appframework: + deploy_lines.append('Skip("TODO: implement app framework using DeployMultisiteClusterWithSearchHeadAndAppFramework (shc=false)")') + elif use_smartstore: + deploy_lines.append('Skip("TODO: implement smartstore multisite without SHC (no helper) - use custom flow")') + else: + deploy_lines.append( + f'err := deployment.DeployMultisiteCluster(ctx, deployment.GetName(), {indexer_replicas}, {site_count}, "")' + ) + deploy_lines.append('Expect(err).To(Succeed(), "Unable to deploy multisite cluster")') + ready_lines.append( + "testenv.LegacyClusterManagerReady(ctx, deployment, testcaseEnvInst)" + if use_legacy_cluster_manager + else "testenv.ClusterManagerReady(ctx, deployment, testcaseEnvInst)" + ) + ready_lines.append(f"testenv.IndexersReady(ctx, deployment, testcaseEnvInst, {site_count})") + 
ready_lines.append(f"testenv.IndexerClusterMultisiteStatus(ctx, deployment, testcaseEnvInst, {site_count})") + ready_lines.append("testenv.VerifyRFSFMet(ctx, deployment, testcaseEnvInst)") + else: + deploy_lines.append('Skip("TODO: unsupported architecture in generator. Update spec or add helper mapping.")') + notes.append("Use docs/agent/TESTCASE_PATTERNS.md to pick the right helper.") + + if deploy_lines and deploy_lines[0].startswith("Skip("): + ready_lines = ["// TODO: add readiness checks once deployment is implemented"] + + if upgrade_enabled: + upgrade_lines = [ + f'operatorImage := os.Getenv("{operator_image_env}")', + 'Expect(operatorImage).ToNot(BeEmpty())', + 'err = testcaseEnvInst.UpdateOperatorImage(operatorImage)', + 'Expect(err).To(Succeed(), "Unable to update operator image")', + 'testenv.VerifyOperatorImage(ctx, testcaseEnvInst, operatorImage)', + '', + f'splunkImage := os.Getenv("{enterprise_image_env}")', + 'Expect(splunkImage).ToNot(BeEmpty())', + '// TODO: update CR spec image to splunkImage and wait for reconciliation', + '// TODO: verify splunk pod images updated', + 'testenv.VerifySplunkPodImagesContain(testcaseEnvInst.GetName(), splunkImage)', + ] + + return deploy_lines, ready_lines, upgrade_lines, notes, upgrade_enabled + + +def integration_template(spec: dict) -> str: + suite = slugify(spec["suite"]) + name = spec["name"] + arch = spec.get("architecture", {}) if isinstance(spec.get("architecture", {}), dict) else {} + arch_name = str(arch.get("name", "")).upper() or "Custom" + cr = spec.get("cr", {}) if isinstance(spec.get("cr", {}), dict) else {} + kind = cr.get("kind", "") or arch_name + cr_path = cr.get("path", "") + + deploy_lines, ready_lines, upgrade_lines, notes, upgrade_enabled = integration_flow(spec) + deploy_snippet = indent_block("\n".join(deploy_lines), 12) + ready_snippet = indent_block("\n".join(ready_lines), 12) + upgrade_snippet = "" + post_upgrade_ready = "" + if upgrade_lines: + upgrade_snippet = 
indent_block("\n".join(upgrade_lines), 12) + post_upgrade_ready = ready_snippet + notes_snippet = "" + if notes: + notes_snippet = indent_block("\n".join([f"// NOTE: {n}" for n in notes]), 12) + extra_imports = "" + if upgrade_enabled: + extra_imports = " \"os\"\\n" + + return f"""// Copyright (c) 2018-2022 Splunk Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the \"License\"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//\thttp://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an \"AS IS\" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package {suite} + +import ( + "context" + "fmt" +{extra_imports} + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "github.com/onsi/ginkgo/v2/types" + + "github.com/splunk/splunk-operator/test/testenv" +) + +var _ = Describe("{kind} integration test", func() {{ + var testcaseEnvInst *testenv.TestCaseEnv + var deployment *testenv.Deployment + ctx := context.TODO() + + BeforeEach(func() {{ + var err error + name := fmt.Sprintf("%s-%s", testenvInstance.GetName(), testenv.RandomDNSName(3)) + testcaseEnvInst, err = testenv.NewDefaultTestCaseEnv(testenvInstance.GetKubeClient(), name) + Expect(err).To(Succeed(), "Unable to create testcaseenv") + deployment, err = testcaseEnvInst.NewDeployment(testenv.RandomDNSName(3)) + Expect(err).To(Succeed(), "Unable to create deployment") + }}) + + AfterEach(func() {{ + if types.SpecState(CurrentSpecReport().State) == types.SpecStateFailed {{ + testcaseEnvInst.SkipTeardown = true + }} + if deployment != nil {{ + deployment.Teardown() + }} + if testcaseEnvInst != nil {{ + Expect(testcaseEnvInst.Teardown()).ToNot(HaveOccurred()) + }} + }}) + + Context("{name}", func() {{ + It("integration, {kind}: {name}", func() {{ + // Architecture: {arch_name} + // Spec path: {cr_path} +{notes_snippet} +{deploy_snippet} + + {ready_snippet} + +{upgrade_snippet} + +{post_upgrade_ready} + + // TODO: verify resources (pods, services, statefulsets, secrets) + }}) + }}) +}}) +""" + + +def generate_integration(spec: dict, repo_root: Path, force: bool, dry_run: bool): + suite = slugify(spec["suite"]) + name = slugify(spec["name"]).replace("-", "_") + + suite_dir = repo_root / "test" / suite + test_file = suite_dir / f"{name}_test.go" + suite_file = suite_dir / f"{suite}_suite_test.go" + + if dry_run: + print(f"[DRY-RUN] Create {suite_dir}") + print(f"[DRY-RUN] Write {test_file}") + if not suite_file.exists(): + print(f"[DRY-RUN] Write {suite_file}") + return + + ensure_dir(suite_dir) + + if not suite_file.exists(): + write_text(suite_file, suite_template(suite), force=False) + print(f"[OK] Created suite file: {suite_file}") + + 
write_text(test_file, integration_template(spec), force) + print(f"[OK] Created integration test: {test_file}") + + +def main(): + parser = argparse.ArgumentParser(description="Generate KUTTL or integration test scaffolds from a spec file.") + parser.add_argument("--spec", required=True, help="Path to testcase spec (.yaml/.yml/.json)") + parser.add_argument("--force", action="store_true", help="Overwrite existing files") + parser.add_argument("--dry-run", action="store_true", help="Print actions without writing files") + args = parser.parse_args() + + spec_path = Path(args.spec).expanduser().resolve() + if not spec_path.exists(): + print(f"[ERROR] Spec file not found: {spec_path}") + sys.exit(1) + + spec = load_spec(spec_path) + if not isinstance(spec, dict): + print("[ERROR] Spec must be a dictionary") + sys.exit(1) + + required = ["type", "suite", "name"] + for key in required: + if key not in spec: + print(f"[ERROR] Missing required field: {key}") + sys.exit(1) + + cr = spec.get("cr") + crs = spec.get("crs") + if cr is None and crs is None: + print("[ERROR] spec must include cr or crs") + sys.exit(1) + if cr is not None: + if not isinstance(cr, dict): + print("[ERROR] cr must be an object") + sys.exit(1) + if "path" not in cr: + print("[ERROR] cr.path is required") + sys.exit(1) + if crs is not None and not isinstance(crs, list): + print("[ERROR] crs must be a list") + sys.exit(1) + + repo_root = Path(__file__).resolve().parent.parent + test_type = str(spec["type"]).lower() + + if test_type == "kuttl": + generate_kuttl(spec, repo_root, args.force, args.dry_run) + elif test_type in ("integration", "ginkgo"): + generate_integration(spec, repo_root, args.force, args.dry_run) + else: + print(f"[ERROR] Unknown test type: {test_type}") + print("Use 'kuttl' or 'integration'") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/verify_bundle.sh b/scripts/verify_bundle.sh new file mode 100755 index 000000000..ea70f29d2 --- /dev/null +++ 
b/scripts/verify_bundle.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" + +if ! command -v git >/dev/null 2>&1; then + echo "git is required for verify_bundle.sh" >&2 + exit 1 +fi + +if ! command -v operator-sdk >/dev/null 2>&1; then + echo "operator-sdk is required for bundle generation." >&2 + echo "Install it via: make setup/devsetup" >&2 + exit 1 +fi + +printf "Running bundle generation...\n" +make bundle + +changed=$(git diff --name-only -- bundle/manifests helm-chart/splunk-operator/crds) +if [[ -n "${changed}" ]]; then + echo "Bundle or Helm CRD outputs changed after regeneration:" >&2 + echo "${changed}" >&2 + echo "Please commit regenerated files." >&2 + exit 1 +fi + +echo "Bundle outputs are up to date." diff --git a/scripts/verify_crd.sh b/scripts/verify_crd.sh new file mode 100755 index 000000000..b0cfcefe4 --- /dev/null +++ b/scripts/verify_crd.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" + +if ! 
command -v git >/dev/null 2>&1; then + echo "git is required for verify_crd.sh" >&2 + exit 1 +fi + +expected_controller_gen_version="$( + awk -F'?=' '/^CONTROLLER_TOOLS_VERSION[[:space:]]*\\?=/{gsub(/^[[:space:]]+|[[:space:]]+$/, "", $2); print $2; exit}' Makefile +)" +if [[ -n "${expected_controller_gen_version}" ]]; then + current_controller_gen_version="" + if [[ -x "${repo_root}/bin/controller-gen" ]]; then + current_controller_gen_version="$("${repo_root}/bin/controller-gen" --version 2>/dev/null || true)" + fi + if [[ "${current_controller_gen_version}" != *"${expected_controller_gen_version}"* ]]; then + echo "Installing controller-gen ${expected_controller_gen_version} for deterministic CRD generation..." + GOBIN="${repo_root}/bin" go install "sigs.k8s.io/controller-tools/cmd/controller-gen@${expected_controller_gen_version}" + fi +fi + +printf "Running CRD/RBAC generation...\n" +make generate +make manifests + +changed=$(git diff --name-only -- config/crd/bases config/rbac/role.yaml) +if [[ -n "${changed}" ]]; then + echo "CRD/RBAC outputs changed after regeneration:" >&2 + echo "${changed}" >&2 + echo "Please commit regenerated files." >&2 + exit 1 +fi + +echo "CRD/RBAC outputs are up to date." diff --git a/scripts/verify_repo.sh b/scripts/verify_repo.sh new file mode 100755 index 000000000..10ede0013 --- /dev/null +++ b/scripts/verify_repo.sh @@ -0,0 +1,121 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: verify_repo.sh [options] + +Options: + --bundle Verify bundle/helm outputs (runs scripts/verify_bundle.sh) + --tests Run unit tests (make test) + --fmt Check gofmt formatting (no changes) + --vet Run go vet ./... + --no-fmt Skip gofmt check + --no-vet Skip go vet + --fast Only verify CRD/RBAC outputs (skip fmt/vet/tests/bundle) + --all Run bundle, tests, fmt, and vet + -h, --help Show this help + +Default behavior runs CRD/RBAC verification plus fmt and vet. 
+USAGE +} + +bundle=false +tests=false +fmt=true +vet=true + +while [[ $# -gt 0 ]]; do + case "$1" in + --bundle) + bundle=true + shift + ;; + --tests) + tests=true + shift + ;; + --fmt) + fmt=true + shift + ;; + --vet) + vet=true + shift + ;; + --no-fmt) + fmt=false + shift + ;; + --no-vet) + vet=false + shift + ;; + --fast) + bundle=false + tests=false + fmt=false + vet=false + shift + ;; + --all) + bundle=true + tests=true + fmt=true + vet=true + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" + +printf "Running CRD/RBAC verification...\n" +./scripts/verify_crd.sh + +if [[ "${bundle}" == "true" ]]; then + printf "Running bundle verification...\n" + ./scripts/verify_bundle.sh +fi + +if [[ "${fmt}" == "true" ]]; then + printf "Checking gofmt formatting...\n" + if ! command -v gofmt >/dev/null 2>&1; then + echo "gofmt not found in PATH." >&2 + exit 1 + fi + unformatted=$(git ls-files -- '*.go' | xargs -r gofmt -l) + if [[ -n "${unformatted}" ]]; then + echo "gofmt needed for the following files:" >&2 + echo "${unformatted}" >&2 + exit 1 + fi +fi + +if [[ "${vet}" == "true" ]]; then + printf "Running go vet...\n" + go vet ./... +fi + +if [[ "${tests}" == "true" ]]; then + printf "Running unit tests...\n" + make test +fi + +echo "verify_repo.sh completed successfully." 
diff --git a/skaffold.yaml b/skaffold.yaml new file mode 100644 index 000000000..e38b704a3 --- /dev/null +++ b/skaffold.yaml @@ -0,0 +1,39 @@ +apiVersion: skaffold/v4beta11 +kind: Config +metadata: + name: splunk-operator + +build: + local: + push: false + useBuildkit: true + artifacts: + - image: docker.io/splunk/splunk-operator + docker: + dockerfile: Dockerfile + +manifests: + kustomize: + paths: + - config/skaffold + +deploy: + kubectl: + flags: + apply: + - --server-side + - --force-conflicts + +profiles: +- name: dev-kind + activation: + - kubeContext: kind-.* + portForward: + - resourceType: deployment + resourceName: splunk-operator-controller-manager + namespace: splunk-operator + port: 8081 + +- name: ci-smoke + activation: + - env: SKAFFOLD_CI_SMOKE=true diff --git a/speckit/README.md b/speckit/README.md new file mode 100644 index 000000000..516a1af09 --- /dev/null +++ b/speckit/README.md @@ -0,0 +1,18 @@ +# Spec Kit Workspace + +This directory stores Spec Kit artifacts that precede implementation. + +## Workflow +1. Bootstrap artifacts: + - `scripts/dev/speckit_bridge.sh bootstrap --change-id CSPL-XXXX --title "Your change"` +2. Fill and review Spec Kit docs: + - `speckit/specs/<change-id>-<slug>/spec.md` + - `speckit/specs/<change-id>-<slug>/plan.md` + - `speckit/specs/<change-id>-<slug>/tasks.md` +3. Drive generated KEP in `docs/specs/` to `Status: Approved`. +4. Implement through harness manifest policy and PR checks. + +## Notes +- Spec Kit artifacts are planning inputs. +- KEP in `docs/specs/` is the governance system of record. +- Harness manifest in `harness/manifests/` is the machine-readable execution contract. 
diff --git a/templates/pull_request.md b/templates/pull_request.md new file mode 100644 index 000000000..012bbe287 --- /dev/null +++ b/templates/pull_request.md @@ -0,0 +1,40 @@ +## Summary +- + +## Governing Spec +- Spec file: +- Spec status (`Draft`/`In Review`/`Approved`/`Implemented`/`Superseded`): +- Spec Kit path (`speckit/specs/<id>-<slug>/`): +- Harness manifest (`harness/manifests/*.yaml`): +- Risk tier (`low`/`medium`/`high`): +- Delivery mode (`agent`/`hybrid`/`human`): +- PR risk label (`risk:low`/`risk:medium`/`risk:high`): + +## Changes +- + +## Tests +- [ ] `scripts/dev/spec_check.sh` +- [ ] `scripts/dev/harness_manifest_check.sh` +- [ ] `scripts/dev/doc_first_check.sh` +- [ ] `scripts/dev/commit_discipline_check.sh` +- [ ] `scripts/dev/appframework_parity_check.sh` +- [ ] `scripts/dev/keps_check.sh` +- [ ] `scripts/dev/harness_engineering_parity_check.sh` +- [ ] `scripts/dev/constitution_runtime_policy_check.sh` +- [ ] `scripts/dev/risk_policy_check.sh` +- [ ] `scripts/dev/risk_label_check.sh --labels risk:<tier>` +- [ ] `scripts/dev/harness_eval.sh --suite docs/agent/evals/policy-regression.yaml` +- [ ] `scripts/dev/harness_run.sh --fast` +- [ ] `scripts/dev/pr_check.sh` +- [ ] `scripts/dev/autonomy_scorecard.sh --base-ref <target-branch>` +- [ ] `scripts/dev/unit.sh` +- [ ] Other: + +## Risks / Rollback +- + +## Checklist +- [ ] CRD/RBAC artifacts updated (if applicable) +- [ ] Docs/examples updated (if user-facing change) +- [ ] Backward compatibility considered diff --git a/test/AGENTS.md b/test/AGENTS.md new file mode 100644 index 000000000..aad55538b --- /dev/null +++ b/test/AGENTS.md @@ -0,0 +1,23 @@ +# test/ — Integration Tests (Ginkgo) + +## What Lives Here +- Ginkgo integration suites (`test/<suite>/...`) +- `test/testenv` helpers for deployment and verification + +## Invariants +- Tests should clean up resources (unless DEBUG_RUN / failure). +- Use `testenv` helpers for common readiness checks. 
+- Keep suites scoped and name tests with searchable tags. + +## Common Pitfalls +- Skipping teardown without `DEBUG_RUN`. +- Mixing cluster-wide and namespace-scoped assumptions. +- Assuming license manager exists without a license file/configmap. + +## Commands +- Run integration tests: `make int-test` +- Run unit/envtest suite: `make test` +- Generate scaffolds: `python3 scripts/generate_testcase.py --spec docs/agent/TESTCASE_SPEC.yaml` + +## Notes +KUTTL tests live under `kuttl/` and are not executed by `make test`. diff --git a/test/deploy-eks-cluster.sh b/test/deploy-eks-cluster.sh index cf30ba81e..784a14dd2 100755 --- a/test/deploy-eks-cluster.sh +++ b/test/deploy-eks-cluster.sh @@ -20,6 +20,88 @@ if [[ -z "${EKS_CLUSTER_K8_VERSION}" ]]; then export EKS_CLUSTER_K8_VERSION="1.26" fi +AWS_REGION="${AWS_REGION:-us-west-2}" +OIDC_PROVIDER_SOFT_LIMIT="${OIDC_PROVIDER_SOFT_LIMIT:-95}" +OIDC_PROVIDER_HARD_LIMIT="${OIDC_PROVIDER_HARD_LIMIT:-100}" + +function ensureOIDCQuotaAvailable() { + local oidc_count + oidc_count=$(aws iam list-open-id-connect-providers --query 'length(OpenIDConnectProviderList)' --output text 2>/dev/null || true) + + if ! [[ "${oidc_count}" =~ ^[0-9]+$ ]]; then + echo "Warning: unable to determine current OIDC provider count; continuing." + return 0 + fi + + echo "Current IAM OIDC providers in account: ${oidc_count}" + if [ "${oidc_count}" -ge "${OIDC_PROVIDER_HARD_LIMIT}" ]; then + echo "ERROR: OIDC provider quota reached (${oidc_count}/${OIDC_PROVIDER_HARD_LIMIT})." + echo "Please clean up stale IAM OIDC providers before running smoke tests." + echo "Hint: aws iam list-open-id-connect-providers" + return 1 + fi + + if [ "${oidc_count}" -ge "${OIDC_PROVIDER_SOFT_LIMIT}" ]; then + echo "Warning: OIDC provider usage is high (${oidc_count}/${OIDC_PROVIDER_HARD_LIMIT}); tests may fail during IRSA setup." 
+ fi +} + +function waitForEBSCSIReady() { + local timeout_seconds="${1:-600}" + local waited=0 + + echo "Waiting for EBS CSI controller deployment to become available..." + while ! kubectl get deployment ebs-csi-controller -n kube-system >/dev/null 2>&1; do + if [ "${waited}" -ge "${timeout_seconds}" ]; then + echo "ERROR: timed out waiting for deployment/ebs-csi-controller to appear in kube-system." + return 1 + fi + sleep 10 + waited=$((waited + 10)) + done + + kubectl rollout status deployment/ebs-csi-controller -n kube-system --timeout="${timeout_seconds}s" + if [ $? -ne 0 ]; then + echo "ERROR: deployment/ebs-csi-controller did not become ready." + kubectl describe deployment ebs-csi-controller -n kube-system || true + kubectl get pods -n kube-system -l app.kubernetes.io/name=aws-ebs-csi-driver || true + return 1 + fi + + if kubectl get daemonset ebs-csi-node -n kube-system >/dev/null 2>&1; then + kubectl rollout status daemonset/ebs-csi-node -n kube-system --timeout="${timeout_seconds}s" + if [ $? -ne 0 ]; then + echo "ERROR: daemonset/ebs-csi-node did not become ready." + kubectl describe daemonset ebs-csi-node -n kube-system || true + return 1 + fi + fi + + return 0 +} + +function setDefaultStorageClass() { + local sc_name="" + if kubectl get storageclass gp2 >/dev/null 2>&1; then + sc_name="gp2" + elif kubectl get storageclass gp3 >/dev/null 2>&1; then + sc_name="gp3" + fi + + if [ -z "${sc_name}" ]; then + echo "ERROR: neither gp2 nor gp3 storageclass exists in the cluster." + kubectl get storageclass || true + return 1 + fi + + kubectl patch storageclass "${sc_name}" -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' + if [ $? -ne 0 ]; then + echo "ERROR: unable to patch storageclass ${sc_name} as default." + return 1 + fi + echo "Patched storageclass ${sc_name} as default." 
+} + function deleteCluster() { echo "Cleanup role, security-group, open-id ${TEST_CLUSTER_NAME}" account_id=$(aws sts get-caller-identity --query "Account" --output text) @@ -89,18 +171,36 @@ function createCluster() { found=$(eksctl get cluster --name "${TEST_CLUSTER_NAME}" -v 0) if [ -z "${found}" ]; then + ensureOIDCQuotaAvailable || return 1 + eksctl create cluster --name=${TEST_CLUSTER_NAME} --nodes=${CLUSTER_WORKERS} --vpc-public-subnets=${EKS_VPC_PUBLIC_SUBNET_STRING} --vpc-private-subnets=${EKS_VPC_PRIVATE_SUBNET_STRING} --instance-types=${EKS_INSTANCE_TYPE} --version=${EKS_CLUSTER_K8_VERSION} if [ $? -ne 0 ]; then echo "Unable to create cluster - ${TEST_CLUSTER_NAME}" return 1 fi - eksctl utils associate-iam-oidc-provider --cluster=${TEST_CLUSTER_NAME} --approve + + oidc_output=$(eksctl utils associate-iam-oidc-provider --cluster=${TEST_CLUSTER_NAME} --approve 2>&1) + if [ $? -ne 0 ]; then + echo "${oidc_output}" + echo "ERROR: unable to associate IAM OIDC provider for cluster ${TEST_CLUSTER_NAME}." + if echo "${oidc_output}" | grep -qi "LimitExceeded"; then + echo "OIDC quota is exhausted; clean up stale IAM OIDC providers and retry." + fi + return 1 + fi + oidc_id=$(aws eks describe-cluster --name ${TEST_CLUSTER_NAME} --query "cluster.identity.oidc.issuer" --output text | cut -d '/' -f 5) + if [ -z "${oidc_id}" ]; then + echo "ERROR: unable to resolve OIDC ID for cluster ${TEST_CLUSTER_NAME}." 
+ return 1 + fi account_id=$(aws sts get-caller-identity --query "Account" --output text) - oidc_provider=$(aws eks describe-cluster --name ${TEST_CLUSTER_NAME} --region "us-west-2" --query "cluster.identity.oidc.issuer" --output text | sed -e "s/^https:\/\///") + oidc_provider=$(aws eks describe-cluster --name ${TEST_CLUSTER_NAME} --region "${AWS_REGION}" --query "cluster.identity.oidc.issuer" --output text | sed -e "s/^https:\/\///") namespace=kube-system service_account=ebs-csi-controller-sa - kubectl create serviceaccount ${service_account} --namespace ${namespace} + + kubectl get serviceaccount "${service_account}" --namespace "${namespace}" >/dev/null 2>&1 || kubectl create serviceaccount "${service_account}" --namespace "${namespace}" + echo "{ \"Version\": \"2012-10-17\", \"Statement\": [ @@ -119,22 +219,48 @@ function createCluster() { } ] }" >aws-ebs-csi-driver-trust-policy.json + rolename=$(echo ${TEST_CLUSTER_NAME} | awk -F- '{print "EBS_" $(NF-1) "_" $(NF)}') - aws iam create-role --role-name ${rolename} --assume-role-policy-document file://aws-ebs-csi-driver-trust-policy.json --description "irsa role for ${TEST_CLUSTER_NAME}" - aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy --role-name ${rolename} - kubectl annotate serviceaccount -n ${namespace} ${service_account} eks.amazonaws.com/role-arn=arn:aws:iam::${account_id}:role/${rolename} - eksctl create addon --name aws-ebs-csi-driver --cluster ${TEST_CLUSTER_NAME} --service-account-role-arn arn:aws:iam::${account_id}:role/${rolename} --force + + aws iam get-role --role-name "${rolename}" >/dev/null 2>&1 || \ + aws iam create-role --role-name "${rolename}" --assume-role-policy-document file://aws-ebs-csi-driver-trust-policy.json --description "irsa role for ${TEST_CLUSTER_NAME}" + if [ $? -ne 0 ]; then + echo "ERROR: unable to create/get IAM role ${rolename}." 
+ return 1 + fi + + aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy --role-name "${rolename}" + if [ $? -ne 0 ]; then + echo "ERROR: unable to attach EBS CSI policy to role ${rolename}." + return 1 + fi + + kubectl annotate serviceaccount -n "${namespace}" "${service_account}" eks.amazonaws.com/role-arn=arn:aws:iam::${account_id}:role/${rolename} --overwrite + if [ $? -ne 0 ]; then + echo "ERROR: unable to annotate serviceaccount ${namespace}/${service_account} with IRSA role." + return 1 + fi + + eksctl create addon --name aws-ebs-csi-driver --cluster "${TEST_CLUSTER_NAME}" --service-account-role-arn arn:aws:iam::${account_id}:role/${rolename} --force + if [ $? -ne 0 ]; then + echo "ERROR: unable to create/update aws-ebs-csi-driver addon." + return 1 + fi + + waitForEBSCSIReady 900 || return 1 + eksctl utils update-cluster-logging --cluster ${TEST_CLUSTER_NAME} - # CSPL-2887 - Patch the default storage class to gp2 - kubectl patch storageclass gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' + setDefaultStorageClass || return 1 else echo "Retrieving kubeconfig for ${TEST_CLUSTER_NAME}" # Cluster exists but kubeconfig may not eksctl utils write-kubeconfig --cluster=${TEST_CLUSTER_NAME} + waitForEBSCSIReady 900 || return 1 + setDefaultStorageClass || return 1 fi echo "Logging in to ECR" - rc=$(aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin "${ECR_REPOSITORY}"/splunk/splunk-operator) + rc=$(aws ecr get-login-password --region "${AWS_REGION}" | docker login --username AWS --password-stdin "${ECR_REPOSITORY}"/splunk/splunk-operator) if [ "$rc" != "Login Succeeded" ]; then echo "Unable to login to ECR - $rc" return 1 @@ -145,4 +271,4 @@ function createCluster() { # Output echo "EKS cluster nodes:" eksctl get cluster --name=${TEST_CLUSTER_NAME} -} \ No newline at end of file +} diff --git a/test/deploy-operator.sh 
b/test/deploy-operator.sh index c14447fe7..43274579c 100644 --- a/test/deploy-operator.sh +++ b/test/deploy-operator.sh @@ -55,16 +55,22 @@ if [ "${CLUSTER_WIDE}" == "true" ]; then # sleep before checking for deployment, in slow clusters deployment call may not even started # in those cases, kubectl will fail with error: no matching resources found sleep 2 - kubectl wait --for=condition=ready pod -l control-plane=controller-manager --timeout=600s -n splunk-operator + kubectl rollout status deployment/splunk-operator-controller-manager -n splunk-operator --timeout=600s + if [ $? -ne 0 ]; then + echo "rollout status for operator deployment timed out; falling back to pod readiness diagnostics..." + kubectl wait --for=condition=ready pod -l control-plane=controller-manager --timeout=120s -n splunk-operator + fi if [ $? -ne 0 ]; then echo "kubectl get pods -n kube-system ---" kubectl get pods -n kube-system - echo "kubectl get deployement ebs-csi-controller -n kube-system ---" - kubectl get deployement ebs-csi-controller -n kube-system + echo "kubectl get deployment ebs-csi-controller -n kube-system ---" + kubectl get deployment ebs-csi-controller -n kube-system echo "kubectl describe pvc -n splunk-operator ---" kubectl describe pvc -n splunk-operator echo "kubectl describe pv ---" kubectl describe pv + echo "kubectl get events -n splunk-operator --sort-by=.lastTimestamp ---" + kubectl get events -n splunk-operator --sort-by=.lastTimestamp || true echo "kubectl describe pod -n splunk-operator ---" kubectl describe pod -n splunk-operator echo "Operator installation not ready..." diff --git a/test/testenv/remote_index_utils.go b/test/testenv/remote_index_utils.go index f696a4a17..cd8273b8d 100644 --- a/test/testenv/remote_index_utils.go +++ b/test/testenv/remote_index_utils.go @@ -86,11 +86,26 @@ func RollHotToWarm(ctx context.Context, deployment *Deployment, podName string, return true } +const ( + // Queue credential volumes are used for secret lookup. 
The remaining fields are + // populated to satisfy CRD schema validation for VolumeSpec. + queueVolumeDefaultEndpoint = "https://s3.us-west-2.amazonaws.com" + queueVolumeDefaultPath = "index-ingest-separation-test-bucket/queue-creds" + queueVolumeDefaultProvider = "aws" + queueVolumeDefaultType = "s3" + queueVolumeDefaultRegion = "us-west-2" +) + // GenerateQueueVolumeSpec return VolumeSpec struct with given values func GenerateQueueVolumeSpec(name, secretRef string) enterpriseApi.VolumeSpec { return enterpriseApi.VolumeSpec{ Name: name, + Endpoint: queueVolumeDefaultEndpoint, + Path: queueVolumeDefaultPath, SecretRef: secretRef, + Provider: queueVolumeDefaultProvider, + Type: queueVolumeDefaultType, + Region: queueVolumeDefaultRegion, } } diff --git a/test/testenv/remote_index_utils_test.go b/test/testenv/remote_index_utils_test.go new file mode 100644 index 000000000..8c45539c4 --- /dev/null +++ b/test/testenv/remote_index_utils_test.go @@ -0,0 +1,29 @@ +package testenv + +import "testing" + +func TestGenerateQueueVolumeSpecHasRequiredFields(t *testing.T) { + vol := GenerateQueueVolumeSpec("queue-secret-ref-volume", "queue-secret") + + if vol.Name != "queue-secret-ref-volume" { + t.Fatalf("unexpected volume name: %s", vol.Name) + } + if vol.SecretRef != "queue-secret" { + t.Fatalf("unexpected secretRef: %s", vol.SecretRef) + } + if vol.Endpoint == "" { + t.Fatal("endpoint must not be empty") + } + if vol.Path == "" { + t.Fatal("path must not be empty") + } + if vol.Provider == "" { + t.Fatal("provider must not be empty") + } + if vol.Type == "" { + t.Fatal("storage type must not be empty") + } + if vol.Region == "" { + t.Fatal("region must not be empty for aws provider") + } +} diff --git a/test/testenv/testcaseenv.go b/test/testenv/testcaseenv.go index cb3c8a107..9c8fbafa9 100644 --- a/test/testenv/testcaseenv.go +++ b/test/testenv/testcaseenv.go @@ -118,6 +118,69 @@ func (testenv *TestCaseEnv) GetSplunkImage() string { return testenv.splunkImage } +// 
GetOperatorName returns operator deployment name for this test env +func (testenv *TestCaseEnv) GetOperatorName() string { + return testenv.operatorName +} + +// UpdateOperatorImage updates the operator deployment image and waits for rollout +func (testenv *TestCaseEnv) UpdateOperatorImage(image string) error { + operatorNamespace := testenv.namespace + if testenv.clusterWideOperator == "true" { + operatorNamespace = "splunk-operator" + } + namespacedName := client.ObjectKey{Name: testenv.operatorName, Namespace: operatorNamespace} + operator := &appsv1.Deployment{} + err := testenv.GetKubeClient().Get(context.TODO(), namespacedName, operator) + if err != nil { + testenv.Log.Error(err, "Unable to get operator", "operator name", testenv.operatorName, "namespace", operatorNamespace) + return err + } + + if len(operator.Spec.Template.Spec.Containers) == 0 { + err = fmt.Errorf("operator deployment %s/%s has no containers to update", operatorNamespace, testenv.operatorName) + testenv.Log.Error(err, "Unable to update operator image") + return err + } + + containerIndex := -1 + for i, container := range operator.Spec.Template.Spec.Containers { + if container.Name == "manager" { + containerIndex = i + break + } + } + if containerIndex == -1 { + err = fmt.Errorf("manager container not found in operator deployment %s/%s", operatorNamespace, testenv.operatorName) + testenv.Log.Error(err, "Unable to update operator image") + return err + } + operator.Spec.Template.Spec.Containers[containerIndex].Image = image + + err = testenv.GetKubeClient().Update(context.TODO(), operator) + if err != nil { + testenv.Log.Error(err, "Unable to update operator image", "operator name", testenv.operatorName, "namespace", operatorNamespace) + return err + } + + operatorInstallTimeout := 5 * time.Minute + return wait.PollImmediate(PollInterval, operatorInstallTimeout, func() (bool, error) { + deployment := &appsv1.Deployment{} + err := testenv.GetKubeClient().Get(context.TODO(), namespacedName, 
deployment) + if err != nil { + testenv.Log.Error(err, "operator not found waiting", "operator name", testenv.operatorName, "namespace", operatorNamespace) + return false, nil + } + if deployment.Status.UpdatedReplicas < deployment.Status.Replicas { + return false, nil + } + if deployment.Status.ReadyReplicas < deployment.Status.Replicas { + return false, nil + } + return true, nil + }) +} + // IsOperatorInstalledClusterWide returns if operator is installed clusterwide func (testenv *TestCaseEnv) IsOperatorInstalledClusterWide() string { return testenv.clusterWideOperator diff --git a/test/testenv/verificationutils.go b/test/testenv/verificationutils.go index cb611254d..0f3a719d9 100644 --- a/test/testenv/verificationutils.go +++ b/test/testenv/verificationutils.go @@ -20,23 +20,38 @@ import ( "context" "encoding/json" "fmt" - "math/rand" "os/exec" - "sigs.k8s.io/controller-runtime/pkg/client" "strings" "time" gomega "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" enterpriseApiV3 "github.com/splunk/splunk-operator/api/v3" enterpriseApi "github.com/splunk/splunk-operator/api/v4" splcommon "github.com/splunk/splunk-operator/pkg/splunk/common" + splenterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise" + crclient "sigs.k8s.io/controller-runtime/pkg/client" logf "sigs.k8s.io/controller-runtime/pkg/log" ) var StabilizationDuration = time.Second * 20 +const ( + telemetryConfigMapPrefix = "splunk-operator-" + telemetryLabelKey = "name" + telemetryLabelValue = "splunk-operator" + telemetryStatusKey = "status" +) + +type telemetryStatus struct { + LastTransmission string `json:"lastTransmission,omitempty"` + Test string `json:"test,omitempty"` + SokVersion string `json:"sokVersion,omitempty"` +} + // PodDetailsStruct captures output of kubectl get pods podname -o json type PodDetailsStruct struct { Spec struct { @@ -58,6 +73,7 @@ type PodDetailsStruct struct { 
Status struct { ContainerStatuses []struct { + Name string `json:"name"` ContainerID string `json:"containerID"` Image string `json:"image"` ImageID string `json:"imageID"` @@ -187,31 +203,26 @@ func SingleSiteIndexersReady(ctx context.Context, deployment *Deployment, testen }, ConsistentDuration, ConsistentPollInterval).Should(gomega.Equal(enterpriseApi.PhaseReady)) } -// IngestorsReady verify ingestors go to ready state +// IngestorReady verify ingestor cluster is in Ready status and does not flip-flop func IngestorReady(ctx context.Context, deployment *Deployment, testenvInstance *TestCaseEnv) { - ingest := &enterpriseApi.IngestorCluster{} + ingestor := &enterpriseApi.IngestorCluster{} instanceName := fmt.Sprintf("%s-ingest", deployment.GetName()) - gomega.Eventually(func() enterpriseApi.Phase { - err := deployment.GetInstance(ctx, instanceName, ingest) + err := deployment.GetInstance(ctx, instanceName, ingestor) if err != nil { return enterpriseApi.PhaseError } - - testenvInstance.Log.Info("Waiting for ingestor instance's phase to be ready", "instance", instanceName, "phase", ingest.Status.Phase) + testenvInstance.Log.Info("Waiting for ingestor cluster phase to be ready", "instance", instanceName, "Phase", ingestor.Status.Phase) DumpGetPods(testenvInstance.GetName()) - - return ingest.Status.Phase + return ingestor.Status.Phase }, deployment.GetTimeout(), PollInterval).Should(gomega.Equal(enterpriseApi.PhaseReady)) // In a steady state, we should stay in Ready and not flip-flop around gomega.Consistently(func() enterpriseApi.Phase { - _ = deployment.GetInstance(ctx, instanceName, ingest) - - testenvInstance.Log.Info("Check for Consistency ingestor instance's phase to be ready", "instance", instanceName, "phase", ingest.Status.Phase) - DumpGetSplunkVersion(ctx, testenvInstance.GetName(), deployment, "-ingest-") - - return ingest.Status.Phase + _ = deployment.GetInstance(ctx, instanceName, ingestor) + testenvInstance.Log.Info("Check for Consistency ingestor 
cluster phase to be ready", "instance", instanceName, "Phase", ingestor.Status.Phase) + DumpGetSplunkVersion(ctx, testenvInstance.GetName(), deployment, "-ingestor-") + return ingestor.Status.Phase }, ConsistentDuration, ConsistentPollInterval).Should(gomega.Equal(enterpriseApi.PhaseReady)) } @@ -241,6 +252,11 @@ func ClusterManagerReady(ctx context.Context, deployment *Deployment, testenvIns }, ConsistentDuration, ConsistentPollInterval).Should(gomega.Equal(enterpriseApi.PhaseReady)) } +// LegacyClusterManagerReady wraps the legacy v3 control-plane readiness check +func LegacyClusterManagerReady(ctx context.Context, deployment *Deployment, testenvInstance *TestCaseEnv) { + ClusterMasterReady(ctx, deployment, testenvInstance) +} + // ClusterMasterReady verify Cluster Master Instance is in ready status func ClusterMasterReady(ctx context.Context, deployment *Deployment, testenvInstance *TestCaseEnv) { // Ensure that the cluster-master goes to Ready phase @@ -340,6 +356,109 @@ func VerifyRFSFMet(ctx context.Context, deployment *Deployment, testenvInstance }, deployment.GetTimeout(), PollInterval).Should(gomega.Equal(true)) } +func getTelemetryConfigMap(ctx context.Context, deployment *Deployment) (*corev1.ConfigMap, string, error) { + operatorNamespace := deployment.testenv.GetName() + if deployment.testenv.IsOperatorInstalledClusterWide() == "true" { + operatorNamespace = "splunk-operator" + } + cmName := splenterprise.GetTelemetryConfigMapName(telemetryConfigMapPrefix) + cm := &corev1.ConfigMap{} + err := deployment.testenv.GetKubeClient().Get(ctx, crclient.ObjectKey{Name: cmName, Namespace: operatorNamespace}, cm) + return cm, operatorNamespace, err +} + +func parseTelemetryStatus(cm *corev1.ConfigMap) (telemetryStatus, error) { + if cm == nil || cm.Data == nil { + return telemetryStatus{}, fmt.Errorf("telemetry configmap is empty") + } + raw, ok := cm.Data[telemetryStatusKey] + if !ok || strings.TrimSpace(raw) == "" { + return telemetryStatus{}, 
fmt.Errorf("telemetry status not found") + } + var status telemetryStatus + if err := json.Unmarshal([]byte(raw), &status); err != nil { + return telemetryStatus{}, err + } + return status, nil +} + +// GetTelemetryLastSubmissionTime returns last telemetry transmission time (UTC) or zero time if unavailable +func GetTelemetryLastSubmissionTime(ctx context.Context, deployment *Deployment) time.Time { + cm, _, err := getTelemetryConfigMap(ctx, deployment) + if err != nil { + deployment.testenv.Log.Info("Unable to get telemetry configmap", "error", err) + return time.Time{} + } + status, err := parseTelemetryStatus(cm) + if err != nil { + deployment.testenv.Log.Info("Unable to parse telemetry status", "error", err) + return time.Time{} + } + if status.LastTransmission == "" { + return time.Time{} + } + ts, err := time.Parse(time.RFC3339, status.LastTransmission) + if err != nil { + deployment.testenv.Log.Info("Unable to parse telemetry timestamp", "value", status.LastTransmission, "error", err) + return time.Time{} + } + return ts +} + +// TriggerTelemetrySubmission updates telemetry configmap to request a new submission +func TriggerTelemetrySubmission(ctx context.Context, deployment *Deployment) { + cmName := splenterprise.GetTelemetryConfigMapName(telemetryConfigMapPrefix) + cm, operatorNamespace, err := getTelemetryConfigMap(ctx, deployment) + create := false + if err != nil { + if k8serrors.IsNotFound(err) { + create = true + cm = &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: cmName, + Namespace: operatorNamespace, + }, + } + } else { + gomega.Expect(err).ToNot(gomega.HaveOccurred(), "Unable to get telemetry configmap") + } + } + + status, _ := parseTelemetryStatus(cm) + status.Test = "true" + status.LastTransmission = "" + statusBytes, err := json.MarshalIndent(status, "", " ") + gomega.Expect(err).ToNot(gomega.HaveOccurred(), "Unable to marshal telemetry status") + + if cm.Data == nil { + cm.Data = map[string]string{} + } + 
cm.Data[telemetryStatusKey] = string(statusBytes) + + if cm.Labels == nil { + cm.Labels = map[string]string{} + } + cm.Labels[telemetryLabelKey] = telemetryLabelValue + + if create { + err = deployment.testenv.GetKubeClient().Create(ctx, cm) + } else { + err = deployment.testenv.GetKubeClient().Update(ctx, cm) + } + gomega.Expect(err).ToNot(gomega.HaveOccurred(), "Unable to update telemetry configmap") +} + +// VerifyTelemetry ensures telemetry lastTransmission is updated after triggering submission +func VerifyTelemetry(ctx context.Context, deployment *Deployment, prev time.Time) { + gomega.Eventually(func() bool { + current := GetTelemetryLastSubmissionTime(ctx, deployment) + if prev.IsZero() { + return !current.IsZero() + } + return current.After(prev) + }, deployment.GetTimeout(), PollInterval).Should(gomega.Equal(true)) +} + // VerifyNoDisconnectedSHPresentOnCM is present on cluster manager func VerifyNoDisconnectedSHPresentOnCM(ctx context.Context, deployment *Deployment, testenvInstance *TestCaseEnv) { gomega.Consistently(func() bool { @@ -382,6 +501,11 @@ func LicenseManagerReady(ctx context.Context, deployment *Deployment, testenvIns }, ConsistentDuration, ConsistentPollInterval).Should(gomega.Equal(enterpriseApi.PhaseReady)) } +// LegacyLicenseManagerReady wraps the legacy v3 license control-plane readiness check +func LegacyLicenseManagerReady(ctx context.Context, deployment *Deployment, testenvInstance *TestCaseEnv) { + LicenseMasterReady(ctx, deployment, testenvInstance) +} + // LicenseMasterReady verify LM is in ready status and does not flip flop func LicenseMasterReady(ctx context.Context, deployment *Deployment, testenvInstance *TestCaseEnv) { LicenseMaster := &enterpriseApiV3.LicenseMaster{} @@ -1027,11 +1151,7 @@ func VerifyAppListPhase(ctx context.Context, deployment *Deployment, testenvInst appDeploymentInfo, err := GetAppDeploymentInfo(ctx, deployment, testenvInstance, name, crKind, appSourceName, appName) if err != nil { 
testenvInstance.Log.Error(err, "Failed to get app deployment info") - return phase // Continue polling - } - if appDeploymentInfo.AppName == "" { - testenvInstance.Log.Info(fmt.Sprintf("App deployment info not found yet for app %s (CR %s/%s, AppSource %s), continuing to poll", appName, crKind, name, appSourceName)) - return phase // Continue polling + return phase } testenvInstance.Log.Info(fmt.Sprintf("App State found for CR %s NAME %s APP NAME %s Expected Phase should not be %s", crKind, name, appName, phase), "Actual Phase", appDeploymentInfo.PhaseInfo.Phase, "App State", appDeploymentInfo) return appDeploymentInfo.PhaseInfo.Phase @@ -1044,11 +1164,7 @@ func VerifyAppListPhase(ctx context.Context, deployment *Deployment, testenvInst appDeploymentInfo, err := GetAppDeploymentInfo(ctx, deployment, testenvInstance, name, crKind, appSourceName, appName) if err != nil { testenvInstance.Log.Error(err, "Failed to get app deployment info") - return enterpriseApi.PhaseDownload // Continue polling - } - if appDeploymentInfo.AppName == "" { - testenvInstance.Log.Info(fmt.Sprintf("App deployment info not found yet for app %s (CR %s/%s, AppSource %s), continuing to poll", appName, crKind, name, appSourceName)) - return enterpriseApi.PhaseDownload // Continue polling + return enterpriseApi.PhaseDownload } testenvInstance.Log.Info(fmt.Sprintf("App State found for CR %s NAME %s APP NAME %s Expected Phase %s", crKind, name, appName, phase), "Actual Phase", appDeploymentInfo.PhaseInfo.Phase, "App Phase Status", appDeploymentInfo.PhaseInfo.Status, "App State", appDeploymentInfo) if appDeploymentInfo.PhaseInfo.Status != enterpriseApi.AppPkgInstallComplete { @@ -1252,83 +1368,102 @@ func VerifyFilesInDirectoryOnPod(ctx context.Context, deployment *Deployment, te } } -func GetTelemetryLastSubmissionTime(ctx context.Context, deployment *Deployment) string { - const ( - configMapName = "splunk-operator-manager-telemetry" - statusKey = "status" - ) - type telemetryStatus struct { - 
LastTransmission string `json:"lastTransmission"` - } - - cm := &corev1.ConfigMap{} - err := deployment.testenv.GetKubeClient().Get(ctx, client.ObjectKey{Name: configMapName, Namespace: "splunk-operator"}, cm) - if err != nil { - logf.Log.Error(err, "GetTelemetryLastSubmissionTime: failed to retrieve configmap") - return "" - } - - statusVal, ok := cm.Data[statusKey] - if !ok || statusVal == "" { - logf.Log.Info("GetTelemetryLastSubmissionTime: failed to retrieve status") - return "" +// VerifyOperatorImage verifies the operator pod is running the expected image +func VerifyOperatorImage(ctx context.Context, testenvInstance *TestCaseEnv, expectedImage string) { + _ = ctx // reserved for future use + var ns string + if testenvInstance.clusterWideOperator != "true" { + ns = testenvInstance.GetName() + } else { + ns = "splunk-operator" } - logf.Log.Info("GetTelemetryLastSubmissionTime: retrieved status", "status", statusVal) + timeout := time.Duration(SpecifiedTestTimeout) * time.Second + gomega.Eventually(func() bool { + operatorPod := GetOperatorPodName(testenvInstance) + if operatorPod == "" { + logf.Log.Info("Operator pod not found yet", "namespace", ns) + return false + } + return podImageContains(ns, operatorPod, expectedImage) + }, timeout, PollInterval).Should(gomega.Equal(true)) +} - var status telemetryStatus - if err := json.Unmarshal([]byte(statusVal), &status); err != nil { - logf.Log.Error(err, "GetTelemetryLastSubmissionTime: failed to unmarshal status", "statusVal", statusVal) - return "" - } - return status.LastTransmission +// VerifyPodImageContains verifies the pod is running a container image that contains expectedImage +func VerifyPodImageContains(ns string, podName string, expectedImage string) { + timeout := time.Duration(SpecifiedTestTimeout) * time.Second + gomega.Eventually(func() bool { + return podImageContains(ns, podName, expectedImage) + }, timeout, PollInterval).Should(gomega.Equal(true)) } -// VerifyTelemetry checks that the telemetry 
ConfigMap has a non-empty lastTransmission field in its status key. -func VerifyTelemetry(ctx context.Context, deployment *Deployment, prevVal string) { - logf.Log.Info("VerifyTelemetry: start") +// VerifySplunkPodImagesContain verifies all Splunk pods (excluding operator) are running expected image +func VerifySplunkPodImagesContain(ns string, expectedImage string) { + timeout := time.Duration(SpecifiedTestTimeout) * time.Second gomega.Eventually(func() bool { - currentVal := GetTelemetryLastSubmissionTime(ctx, deployment) - if currentVal != "" && currentVal != prevVal { - logf.Log.Info("VerifyTelemetry: success", "previous", prevVal, "current", currentVal) - return true + pods := DumpGetPods(ns) + checked := 0 + for _, pod := range pods { + if !isSplunkWorkloadPod(pod) { + continue + } + checked++ + if !podImageContains(ns, pod, expectedImage) { + return false + } } - return false - }, deployment.GetTimeout(), PollInterval).Should(gomega.Equal(true)) + if checked == 0 { + logf.Log.Info("No Splunk pods found yet", "namespace", ns) + return false + } + return true + }, timeout, PollInterval).Should(gomega.Equal(true)) } -// TriggerTelemetrySubmission updates or adds the 'test_submission' key in the telemetry ConfigMap with a JSON value containing a random number. 
-func TriggerTelemetrySubmission(ctx context.Context, deployment *Deployment) { - const ( - configMapName = "splunk-operator-manager-telemetry" - testKey = "test_submission" - ) - - // Generate a random number - rand.Seed(time.Now().UnixNano()) - randomNumber := rand.Intn(1000) - - // Create the JSON value - jsonValue, err := json.Marshal(map[string]int{"value": randomNumber}) - if err != nil { - logf.Log.Error(err, "Failed to marshal JSON value") - return +// podImageContains checks if any container image or imageID on the pod contains expectedImage +func podImageContains(ns string, podName string, expectedImage string) bool { + if podName == "" { + logf.Log.Info("Pod name is empty; cannot verify image", "namespace", ns) + return false } - - // Update the ConfigMap - cm := &corev1.ConfigMap{} - err = deployment.testenv.GetKubeClient().Get(ctx, client.ObjectKey{Name: configMapName, Namespace: "splunk-operator"}, cm) + output, err := exec.Command("kubectl", "get", "pods", "-n", ns, podName, "-o", "json").Output() if err != nil { - logf.Log.Error(err, "Failed to get ConfigMap") - return + cmd := fmt.Sprintf("kubectl get pods -n %s %s -o json", ns, podName) + logf.Log.Error(err, "Failed to execute command", "command", cmd) + return false } - - // Update the test_submission key - cm.Data[testKey] = string(jsonValue) - err = deployment.testenv.GetKubeClient().Update(ctx, cm) + restResponse := PodDetailsStruct{} + err = json.Unmarshal([]byte(output), &restResponse) if err != nil { - logf.Log.Error(err, "Failed to update ConfigMap") - return + logf.Log.Error(err, "Failed to parse pod JSON") + return false + } + found := false + images := []string{} + for _, status := range restResponse.Status.ContainerStatuses { + if status.Image != "" { + images = append(images, status.Image) + } + if status.ImageID != "" { + images = append(images, status.ImageID) + } + if strings.Contains(status.Image, expectedImage) || strings.Contains(status.ImageID, expectedImage) { + found = true + 
} } + logf.Log.Info("Pod image check", "pod", podName, "expected", expectedImage, "found", found, "images", images) + return found +} - logf.Log.Info("Successfully updated telemetry ConfigMap", "key", testKey, "value", jsonValue) +func isOperatorPod(podName string) bool { + return strings.HasPrefix(podName, "splunk-op") || strings.HasPrefix(podName, "splunk-operator") +} + +func isSplunkWorkloadPod(podName string) bool { + if podName == "" { + return false + } + if isOperatorPod(podName) { + return false + } + return strings.HasPrefix(podName, "splunk-") }