Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions docs/loop.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,26 @@ After each agent action, CodeDecay re-runs deterministic analysis and configured

## Safety Rules

`codedecay loop` only reports `merge-safe` when all of these are true:
`codedecay loop` never prints an unqualified "safe" verdict. Clean outcomes are always qualified by evidence depth.

The loop can only report a `merge-safe-*` verdict when all of these are true:

- final risk is at or below the configured safe threshold, `low` by default
- weak-test findings are zero
- security score is at or below the configured threshold, `0` by default
- no high-severity findings remain in deterministic analysis
- configured checks exist and pass

If no checks are configured, the best possible terminal status is `unverified`, not `merge-safe`.
If no checks are configured, the best possible terminal status is `unverified`, not a `merge-safe-*` verdict.

`merge-safe-verified` means configured checks passed, deterministic security matchers were clean, Semgrep was enabled and clean, and coverage/mutation evidence was available if configured.

`merge-safe-shallow` means the gates passed, but one or more deeper evidence streams were missing. Treat it as a heuristically clean result, not as deep verification. Run `codedecay doctor` to configure OSS adapters such as Semgrep, coverage, and StrykerJS.

Terminal statuses:

- `merge-safe`: deterministic risk is low enough, weak tests are gone, and configured checks passed
- `merge-safe-verified`: configured and enabled checks found nothing at the selected thresholds, including available security/coverage/mutation depth
- `merge-safe-shallow`: risk, weak-test, security-score, and configured-check gates passed, but depth evidence such as Semgrep, coverage, or mutation testing is missing
- `unverified`: risk and weak-test evidence are clean, but no configured checks proved the result
- `plan-only`: no agent command was configured
- `stuck`: the agent made no progress for two rounds
Expand All @@ -54,6 +63,7 @@ codedecay loop \
--cwd ../my-repo \
--agent-cmd "codex exec --apply" \
--max-rounds 3 \
--max-security-score 0 \
--format markdown
```

Expand Down
53 changes: 53 additions & 0 deletions packages/cli/src/benchmark/corpus.ts
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ export function createDefaultBenchmarkCorpus(): BenchmarkCorpus {
setup: createOneHopSqlInjectionRepo,
expectedRuleIds: ["security-sql-injection"]
},
{
id: "indirect-dynamic-sqli",
kind: "positive",
setup: createIndirectDynamicSqlInjectionRepo,
expectedRuleIds: ["security-sql-injection"]
},
{
id: "plain-exported-destructive-missing-auth",
kind: "positive",
Expand Down Expand Up @@ -115,6 +121,12 @@ export function createDefaultBenchmarkCorpus(): BenchmarkCorpus {
kind: "decoy",
setup: createGuardedDestructiveAuthDecoyRepo,
expectedRuleIds: []
},
{
id: "dynamic-sql-local-decoy",
kind: "decoy",
setup: createDynamicSqlLocalDecoyRepo,
expectedRuleIds: []
}
],
cleanup: cleanupBenchmarkCorpus
Expand Down Expand Up @@ -329,6 +341,26 @@ export function createOneHopSqlInjectionRepo(): string {
return repo;
}

/**
 * Benchmark fixture for the "indirect-dynamic-sqli" corpus entry.
 *
 * Seeds a repo with a harmless `buildSearchQuery`, then overwrites the same
 * file with a version that interpolates `status` into a SQL template literal,
 * stores it in a local variable, and returns that variable.
 *
 * @returns The value produced by `createRepo` (presumably the repo path — confirm against `createRepo`).
 */
export function createIndirectDynamicSqlInjectionRepo(): string {
  const targetFile = "src/reports/search.ts";

  // The rewritten source: the query is built via string interpolation and
  // returned through an intermediate variable rather than directly.
  const rewrittenSource = [
    "export function buildSearchQuery(status) {",
    " const sql = `select * from invoices where status = '${status}'`;",
    " return sql;",
    "}",
    ""
  ].join("\n");

  const repoDir = createRepo({
    [targetFile]: "export function buildSearchQuery(status) { return 'select 1'; }\n"
  });

  writeFile(repoDir, targetFile, rewrittenSource);

  return repoDir;
}

export function createPlainExportedDestructiveMissingAuthRepo(): string {
const repo = createRepo({
"src/services/users.ts": "export function listUsers() { return []; }\n"
Expand Down Expand Up @@ -485,6 +517,27 @@ export function createGuardedDestructiveAuthDecoyRepo(): string {
return repo;
}

/**
 * Benchmark fixture for the "dynamic-sql-local-decoy" corpus entry.
 *
 * Seeds a repo with a trivial `buildStaticReportQuery`, then overwrites the
 * same file with a version that concatenates SQL with a locally defined
 * string constant (no function parameter is involved).
 *
 * @returns The value produced by `createRepo` (presumably the repo path — confirm against `createRepo`).
 */
export function createDynamicSqlLocalDecoyRepo(): string {
  const targetFile = "src/reports/static.ts";

  // The concatenated value is a hard-coded local, not caller-supplied input.
  const rewrittenSource = [
    "export function buildStaticReportQuery() {",
    " const status = 'paid';",
    " const sql = 'select * from invoices where status = ' + status;",
    " return sql;",
    "}",
    ""
  ].join("\n");

  const repoDir = createRepo({
    [targetFile]: "export function buildStaticReportQuery() { return 'select 1'; }\n"
  });

  writeFile(repoDir, targetFile, rewrittenSource);

  return repoDir;
}

function createBranchingFunction(name: string, ifCount: number): string {
return [
`export function ${name}(input) {`,
Expand Down
191 changes: 188 additions & 3 deletions packages/cli/src/commands/loop.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,31 @@ import { resolve } from "node:path";
import { createAgentTaskBundle, renderAgentTaskBundle } from "@submuxhq/codedecay-agent";
import { loadCodeDecayConfig, type CodeDecayConfig, type LoadedCodeDecayConfig } from "@submuxhq/codedecay-config";
import { getGitChangedFiles } from "@submuxhq/codedecay-git";
import { renderLoopReport, runCodeDecayLoop, type LoopCheckSnapshot, type LoopReport } from "@submuxhq/codedecay-harness";
import {
renderLoopReport,
runCodeDecayLoop,
type Evidence,
type LoopCheckSnapshot,
type LoopCheckStatus,
type LoopCoverageSnapshot,
type LoopMutationSnapshot,
type LoopReport,
type LoopSecurityToolSnapshot
} from "@submuxhq/codedecay-harness";
import type { ConfiguredToolAdapterKind } from "@submuxhq/codedecay-tool-adapters";
import type { RedteamReport } from "@submuxhq/codedecay-redteam";
import { CliExit } from "../errors";
import { parseLoopArgs } from "../parsers/args";
import type { AgentOptions, AnalyzeOptions, CliAnalysisContext, CliCommandContext, CliRuntime, RedteamOptions } from "../types";
import type {
AgentOptions,
AnalyzeOptions,
CliAnalysisContext,
CliCommandContext,
CliRuntime,
ExecutionReport,
ExecutionToolAdapterResult,
RedteamOptions
} from "../types";
import { createExecutionReport } from "./execute/report";
import type { RunExecuteCommandDependencies } from "./execute/types";
import { createRedteamReportForCli, type RedteamReportDependencies } from "./redteam-report";
Expand Down Expand Up @@ -41,6 +61,7 @@ export async function runLoopCommand(
maxRounds: options.maxRounds,
agentCommand: options.agentCommand,
safeRiskLevel: options.safeRiskLevel,
securityScoreThreshold: options.securityScoreThreshold,
agentTimeoutMs: loadedConfig.config.safety.commandTimeoutMs,
commandSafety: {
allowCommands: loadedConfig.config.safety.allowCommands
Expand Down Expand Up @@ -85,11 +106,15 @@ async function createLoopCheckSnapshot(
timedOut: 0,
errors: 0,
durationMs: 0,
semgrep: emptySecurityToolSnapshot(),
coverage: emptyCoverageSnapshot(),
mutation: emptyMutationSnapshot(),
note: "No configured commands, probes, or tool adapters were found."
};
}

const report = await createExecutionReport(rootDir, loadedConfig, dependencies);
const adapterEvidence = createAdapterEvidenceSnapshot(report);
return {
configured: true,
status: report.summary.status,
Expand All @@ -99,7 +124,10 @@ async function createLoopCheckSnapshot(
skipped: report.summary.skipped,
timedOut: report.summary.timedOut,
errors: report.summary.errors,
durationMs: report.summary.durationMs
durationMs: report.summary.durationMs,
semgrep: adapterEvidence.semgrep,
coverage: adapterEvidence.coverage,
mutation: adapterEvidence.mutation
};
}

Expand All @@ -119,3 +147,160 @@ function shouldFail(report: LoopReport): boolean {
report.status === "needs-human" ||
report.status === "agent-error";
}

/**
 * Collects the Semgrep, coverage, and mutation snapshots from an execution report.
 *
 * Each snapshot is derived from the first tool adapter of the matching kind
 * ("semgrep", "coverage", "stryker"); a missing adapter is passed on as
 * `undefined` to the corresponding summarizer.
 */
function createAdapterEvidenceSnapshot(report: ExecutionReport): {
  semgrep: LoopSecurityToolSnapshot;
  coverage: LoopCoverageSnapshot;
  mutation: LoopMutationSnapshot;
} {
  // First adapter whose kind matches, or undefined when none is present.
  const adapterOfKind = (kind: ExecutionToolAdapterResult["kind"]): ExecutionToolAdapterResult | undefined =>
    report.toolAdapters.find((candidate) => candidate.kind === kind);

  const semgrep = summarizeSemgrepAdapter(adapterOfKind("semgrep"));
  const coverage = summarizeCoverageAdapter(adapterOfKind("coverage"));
  const mutation = summarizeMutationAdapter(adapterOfKind("stryker"));

  return { semgrep, coverage, mutation };
}

/**
 * Builds the Semgrep snapshot for the loop from a tool adapter result.
 *
 * @param adapter The Semgrep adapter result, or `undefined` when none was found.
 * @returns The not-configured snapshot when `adapter` is missing; otherwise a
 *   configured snapshot with finding counts and the highest non-info severity.
 */
function summarizeSemgrepAdapter(adapter: ExecutionToolAdapterResult | undefined): LoopSecurityToolSnapshot {
  if (adapter === undefined) {
    return emptySecurityToolSnapshot();
  }

  const semgrepEvidence = evidenceForAdapter(adapter, "semgrep");
  // Only evidence tied to a file counts as a finding.
  const fileFindings = semgrepEvidence.filter((item) => item.file);
  // Prefer the count reported in metadata; fall back to counting file-backed evidence.
  const reportedCount = firstFiniteMetadataNumber(semgrepEvidence, "findingCount");

  return {
    configured: true,
    ran: adapterRan(adapter.status),
    status: adapterStatusToLoopStatus(adapter.status),
    findingCount: reportedCount ?? fileFindings.length,
    highFindingCount: fileFindings.filter((item) => item.severity === "high").length,
    maxSeverity: maxRiskSeverity(semgrepEvidence)
  };
}

/**
 * Builds the coverage snapshot for the loop from a tool adapter result.
 *
 * @param adapter The coverage adapter result, or `undefined` when none was found.
 * @returns The not-configured snapshot when `adapter` is missing; otherwise a
 *   configured snapshot. `present` is true only when a `measuredLines` metric
 *   exists, and `percent` is set only when `measuredLines` is positive and
 *   `coveredLines` is known.
 */
function summarizeCoverageAdapter(adapter: ExecutionToolAdapterResult | undefined): LoopCoverageSnapshot {
  if (adapter === undefined) {
    return emptyCoverageSnapshot();
  }

  const coverageEvidence = evidenceForAdapter(adapter, "coverage");
  const readMetric = (key: string): number | undefined => firstFiniteMetadataNumber(coverageEvidence, key);

  const measuredLines = readMetric("measuredLines");
  const coveredLines = readMetric("coveredLines");
  const uncoveredLines = readMetric("uncoveredLines");

  // Skip the ratio when nothing was measured, which also avoids dividing by zero.
  let percent: number | undefined;
  if (measuredLines !== undefined && measuredLines > 0 && coveredLines !== undefined) {
    percent = roundPercent((coveredLines / measuredLines) * 100);
  }

  return {
    configured: true,
    present: measuredLines !== undefined,
    status: adapterStatusToLoopStatus(adapter.status),
    percent,
    measuredLines,
    coveredLines,
    uncoveredLines
  };
}

/**
 * Builds the mutation-testing snapshot for the loop from a tool adapter result.
 *
 * @param adapter The Stryker adapter result, or `undefined` when none was found.
 * @returns The not-configured snapshot when `adapter` is missing; otherwise a
 *   configured snapshot. `present` is true when either `totalMutants` or
 *   `mutationScore` is available. `weakMutants` is survived plus no-coverage
 *   mutants (each defaulting to 0) and is only set when `totalMutants` is known.
 */
function summarizeMutationAdapter(adapter: ExecutionToolAdapterResult | undefined): LoopMutationSnapshot {
  if (adapter === undefined) {
    return emptyMutationSnapshot();
  }

  const mutationEvidence = evidenceForAdapter(adapter, "stryker");
  const readMetric = (key: string): number | undefined => firstFiniteMetadataNumber(mutationEvidence, key);

  const totalMutants = readMetric("totalMutants");
  const mutationScore = readMetric("mutationScore");

  // Without a total there is no basis for reporting a weak-mutant count.
  let weakMutants: number | undefined;
  if (totalMutants !== undefined) {
    const survived = readMetric("survivedMutants") ?? 0;
    const uncovered = readMetric("noCoverageMutants") ?? 0;
    weakMutants = survived + uncovered;
  }

  return {
    configured: true,
    present: totalMutants !== undefined || mutationScore !== undefined,
    status: adapterStatusToLoopStatus(adapter.status),
    mutationScore,
    totalMutants,
    weakMutants
  };
}

/**
 * Snapshot used when no security tool adapter is configured: not configured,
 * not run, and zero findings.
 */
function emptySecurityToolSnapshot(): LoopSecurityToolSnapshot {
  const notConfigured: LoopSecurityToolSnapshot = {
    configured: false,
    ran: false,
    status: "not-configured",
    findingCount: 0,
    highFindingCount: 0
  };

  return notConfigured;
}

/**
 * Snapshot used when no coverage adapter is configured: not configured and
 * no coverage data present.
 */
function emptyCoverageSnapshot(): LoopCoverageSnapshot {
  const notConfigured: LoopCoverageSnapshot = {
    configured: false,
    present: false,
    status: "not-configured"
  };

  return notConfigured;
}

/**
 * Snapshot used when no mutation-testing adapter is configured: not
 * configured and no mutation data present.
 */
function emptyMutationSnapshot(): LoopMutationSnapshot {
  const notConfigured: LoopMutationSnapshot = {
    configured: false,
    present: false,
    status: "not-configured"
  };

  return notConfigured;
}

/**
 * Selects the adapter's evidence items that belong to the given adapter id.
 *
 * An item is kept when its source id equals `id` or when its evidence kind
 * is the kind associated with that adapter id.
 */
function evidenceForAdapter(adapter: ExecutionToolAdapterResult, id: ConfiguredToolAdapterKind | "semgrep" | "coverage" | "stryker"): Evidence[] {
  // The expected kind depends only on `id`, so resolve it once up front.
  const expectedKind = evidenceKindForAdapter(id);

  return adapter.evidence.filter((entry) => {
    if (entry.source.id === id) {
      return true;
    }

    return entry.kind === expectedKind;
  });
}

/**
 * Maps an adapter id to the evidence kind it produces: "coverage" for the
 * coverage adapter, "mutation" for Stryker, and "static-analysis" for
 * everything else.
 */
function evidenceKindForAdapter(id: ConfiguredToolAdapterKind | "semgrep" | "coverage" | "stryker"): Evidence["kind"] {
  switch (id) {
    case "coverage":
      return "coverage";
    case "stryker":
      return "mutation";
    default:
      return "static-analysis";
  }
}

/**
 * Returns the first finite numeric value stored under `key` in the metadata
 * of the given evidence items, scanning in array order.
 *
 * Items without metadata, and values that are not numbers or are NaN or
 * infinite, are skipped.
 *
 * @returns The first matching number, or `undefined` when none exists.
 */
function firstFiniteMetadataNumber(evidence: Evidence[], key: string): number | undefined {
  return evidence
    .map((entry) => entry.metadata?.[key])
    .find((candidate): candidate is number => typeof candidate === "number" && Number.isFinite(candidate));
}

/**
 * Returns the highest severity among the evidence items, considering only
 * "high", "medium", and "low" (in that order of precedence).
 *
 * @returns The highest such severity, or `undefined` when the evidence
 *   contains none of them (for example only "info" items or no items).
 */
function maxRiskSeverity(evidence: Evidence[]): LoopSecurityToolSnapshot["maxSeverity"] {
  // Ordered from most to least severe, so the first hit is the maximum.
  const rankedLevels = ["high", "medium", "low"] as const;

  for (const level of rankedLevels) {
    if (evidence.some((entry) => entry.severity === level)) {
      return level;
    }
  }

  return undefined;
}

/**
 * Reports whether an adapter ran: every status except "skipped" counts as having run.
 */
function adapterRan(status: ExecutionToolAdapterResult["status"]): boolean {
  const wasSkipped = status === "skipped";
  return !wasSkipped;
}

/**
 * Converts an adapter status to a loop check status. The value is passed
 * through unchanged; only the static type differs.
 */
function adapterStatusToLoopStatus(status: ExecutionToolAdapterResult["status"]): LoopCheckStatus {
  const loopStatus: LoopCheckStatus = status;
  return loopStatus;
}

/**
 * Rounds a number to two decimal places using `Math.round` on the value
 * scaled by 100.
 */
function roundPercent(value: number): number {
  const hundredths = Math.round(value * 100);
  return hundredths / 100;
}
4 changes: 3 additions & 1 deletion packages/cli/src/docs/command-docs/orchestration.ts
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ export const ORCHESTRATION_COMMAND_DOCS: Record<string, CommandDoc> = {
{ flag: "--max-rounds <n>", description: "Maximum fix/recheck rounds (default: 4)" },
{ flag: "--agent-cmd <command>", description: "Explicit user-owned agent command that reads the task bundle on stdin and may edit the working tree" },
{ flag: "--safe-risk <level>", description: "Maximum acceptable risk level: low, medium, or high (default: low)" },
{ flag: "--max-security-score <score>", description: "Maximum acceptable security score from deterministic analysis, 0-100 (default: 0)" },
{ flag: "--format <format>", description: "json or markdown (default: markdown)" },
{ flag: "--output <path>", description: "Write loop report to a file instead of stdout" }
],
Expand All @@ -139,7 +140,8 @@ export const ORCHESTRATION_COMMAND_DOCS: Record<string, CommandDoc> = {
"CodeDecay does not embed a model. The agent command is user-owned and explicit.",
"The loop never auto-commits or auto-pushes. It leaves edits in the working tree for human review.",
"Agent output is untrusted. CodeDecay re-runs deterministic analysis and configured checks after each agent action.",
"Exit codes: 0 for merge-safe or plan-only report generation, 1 for unverified, needs-human, stuck, or agent-error, and 2 for CLI/internal errors."
"Terminal clean verdicts are always qualified: merge-safe-verified has configured checks plus security/coverage/mutation depth, while merge-safe-shallow passed gates but is missing deeper evidence.",
"Exit codes: 0 for merge-safe-verified, merge-safe-shallow, or plan-only report generation; 1 for unverified, needs-human, stuck, or agent-error; and 2 for CLI/internal errors."
]
},
doctor: {
Expand Down
Loading