Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ generic_automation_mcp/
│ ├── loader.py # Path-based recipe metadata utilities
│ ├── _api.py # Orchestration API
│ ├── diagrams.py # Flow diagram generation + staleness detection
│ ├── experiment_type_registry.py # ExperimentTypeSpec, load_all_experiment_types
│ ├── registry.py # RuleFinding, RuleSpec, semantic_rule decorator
│ ├── repository.py
│ ├── _analysis.py # Step graph building + dataflow analysis
Expand Down
6 changes: 6 additions & 0 deletions src/autoskillit/recipe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@
diagram_stale_to_suggestions,
load_recipe_diagram,
)
from autoskillit.recipe.experiment_type_registry import ( # noqa: E402
ExperimentTypeSpec,
load_all_experiment_types,
)
from autoskillit.recipe.io import ( # noqa: E402
builtin_sub_recipes_dir,
find_recipe_by_name,
Expand Down Expand Up @@ -130,4 +134,6 @@
"diagram_stale_to_suggestions",
"builtin_sub_recipes_dir",
"find_sub_recipe_by_name",
"ExperimentTypeSpec",
"load_all_experiment_types",
]
79 changes: 79 additions & 0 deletions src/autoskillit/recipe/experiment_type_registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""Experiment type registry — load bundled and user-defined experiment type specs."""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any

from autoskillit.core import get_logger, load_yaml, pkg_root

_log = get_logger(__name__)

BUNDLED_EXPERIMENT_TYPES_DIR: Path = pkg_root() / "recipes" / "experiment-types"


@dataclass
class ExperimentTypeSpec:
"""Specification for a single experiment type."""

name: str
classification_triggers: list[str]
dimension_weights: dict[str, str]
applicable_lenses: dict[str, str | None]
red_team_focus: dict[str, str]
l1_severity: dict[str, str]


def _parse_experiment_type(data: dict, source_path: Path) -> ExperimentTypeSpec:
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[warning] defense: data parameter is typed as bare dict instead of dict[str, object]. Widen the type annotation so mypy can verify the signature correctly.

if "name" not in data:
raise ValueError(f"Experiment type YAML missing 'name' field: {source_path}")
return ExperimentTypeSpec(
name=data["name"],
classification_triggers=list(data.get("classification_triggers", [])),
dimension_weights=dict(data.get("dimension_weights", {})),
applicable_lenses=dict(data.get("applicable_lenses", {})),
red_team_focus=dict(data.get("red_team_focus", {})),
l1_severity=dict(data.get("l1_severity", {})),
)


def _load_types_from_dir(directory: Path) -> dict[str, ExperimentTypeSpec]:
Comment thread
Trecek marked this conversation as resolved.
if not directory.exists():
return {}
result: dict[str, ExperimentTypeSpec] = {}
for path in sorted(directory.glob("*.yaml")):
try:
data = load_yaml(path)
if isinstance(data, dict):
spec = _parse_experiment_type(data, path)
result[spec.name] = spec
except Exception:
_log.warning("Skipping malformed experiment type file: %s", path, exc_info=True)
return result


def load_all_experiment_types(
project_dir: Path | None = None,
) -> dict[str, ExperimentTypeSpec]:
"""Load experiment types: bundled types merged with user-defined overrides.

User-defined types with the same name as a bundled type replace the bundled
type entirely — no field merging. User-defined types with a new name are added
alongside bundled types.

Args:
project_dir: Project root containing optional user-defined overrides at
``.autoskillit/experiment-types/``. When ``None``, only bundled types
are returned.

Returns:
Mapping of experiment type name to ``ExperimentTypeSpec``.
"""
types = _load_types_from_dir(BUNDLED_EXPERIMENT_TYPES_DIR)

if project_dir is not None:
user_dir = Path(project_dir) / ".autoskillit" / "experiment-types"
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[info] defense: Redundant Path(project_dir) cast — project_dir is already typed Path | None. Use it directly.

user_types = _load_types_from_dir(user_dir)
types.update(user_types)

return types
21 changes: 21 additions & 0 deletions src/autoskillit/recipes/experiment-types/benchmark.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
name: benchmark
classification_triggers:
- "IVs are system/method names, DVs are performance metrics, multiple comparators"
dimension_weights:
causal_structure: S
variance_protocol: H
statistical_corrections: M
ecological_validity: M
measurement_alignment: M
resource_proportionality: L
data_acquisition: M
agent_implementability: H
applicable_lenses:
primary: exp-lens-estimand-clarity
secondary: null
red_team_focus:
specific: asymmetric effort
severity_cap: warning
l1_severity:
estimand_clarity: warning
hypothesis_falsifiability: warning
21 changes: 21 additions & 0 deletions src/autoskillit/recipes/experiment-types/causal_inference.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
name: causal_inference
classification_triggers:
- "Causal language (\"causes\", \"effect of\"), confounders in threats"
dimension_weights:
causal_structure: H
variance_protocol: L
statistical_corrections: H
ecological_validity: L
measurement_alignment: M
resource_proportionality: L
data_acquisition: M
agent_implementability: M
applicable_lenses:
primary: exp-lens-estimand-clarity
secondary: null
red_team_focus:
specific: unblocked backdoor path
severity_cap: critical
l1_severity:
estimand_clarity: critical
hypothesis_falsifiability: critical
21 changes: 21 additions & 0 deletions src/autoskillit/recipes/experiment-types/configuration_study.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
name: configuration_study
classification_triggers:
- "IVs are numeric parameters of one system, grid/sweep structure"
dimension_weights:
causal_structure: S
variance_protocol: H
statistical_corrections: H
ecological_validity: L
measurement_alignment: M
resource_proportionality: L
data_acquisition: M
agent_implementability: H
applicable_lenses:
primary: exp-lens-estimand-clarity
secondary: null
red_team_focus:
specific: overfitting to held-out set
severity_cap: warning
l1_severity:
estimand_clarity: warning
hypothesis_falsifiability: warning
21 changes: 21 additions & 0 deletions src/autoskillit/recipes/experiment-types/exploratory.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
name: exploratory
classification_triggers:
- "Default — no prior rule fires, or hypothesis absent"
dimension_weights:
causal_structure: L
variance_protocol: L
statistical_corrections: S
ecological_validity: M
measurement_alignment: M
resource_proportionality: L
data_acquisition: M
agent_implementability: L
applicable_lenses:
primary: exp-lens-estimand-clarity
secondary: null
red_team_focus:
specific: HARKing vulnerability
severity_cap: info
l1_severity:
estimand_clarity: info
hypothesis_falsifiability: info
21 changes: 21 additions & 0 deletions src/autoskillit/recipes/experiment-types/robustness_audit.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
name: robustness_audit
classification_triggers:
- "Tests generalization/stability, deliberately varied conditions"
dimension_weights:
causal_structure: M
variance_protocol: M
statistical_corrections: S
ecological_validity: H
measurement_alignment: H
resource_proportionality: L
data_acquisition: H
agent_implementability: M
applicable_lenses:
primary: exp-lens-estimand-clarity
secondary: null
red_team_focus:
specific: unrealistic threat distribution
severity_cap: warning
l1_severity:
estimand_clarity: warning
hypothesis_falsifiability: warning
92 changes: 39 additions & 53 deletions src/autoskillit/skills_extended/review-design/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,19 @@ best available plan.
3. Read the plan file.
**Error handling:** If the file does not exist or is unreadable at the resolved path,
emit `verdict = STOP` with message "Plan file not found: {path}" and exit 0.
Parse YAML frontmatter using the **backward-compatible two-level fallback**:
4. **Load the experiment type registry:**
a. Locate bundled types dir: run
`python -c "from autoskillit.core import pkg_root; print(pkg_root() / 'recipes' / 'experiment-types')"`
to get the absolute bundled directory path.
b. Use Glob `*.yaml` in that directory, then Read each file. Parse YAML frontmatter to
extract `name`, `classification_triggers`, `dimension_weights`, `applicable_lenses`,
`red_team_focus`, and `l1_severity` fields from each.
c. Check `.autoskillit/experiment-types/` in the current working directory. If it exists,
read all `*.yaml` files there. A user-defined type with the same `name` as a bundled
type replaces the bundled entry entirely — do not merge fields.
d. The resulting registry is a mapping of type name → spec. The set of valid
`experiment_type` values for this run is the set of keys in the registry.
5. Parse YAML frontmatter using the **backward-compatible two-level fallback**:
- **Level 1 (frontmatter)**: Read YAML frontmatter between `---` delimiters directly
(zero LLM tokens). Return present fields and note which are missing.
Record `source: frontmatter` for each extracted field.
Expand All @@ -91,7 +103,7 @@ best available plan.

| Missing Field | Prose Target | Extraction Prompt |
|---|---|---|
| experiment_type | Full plan | "Classify: benchmark, configuration_study, causal_inference, robustness_audit, exploratory" |
| experiment_type | Full plan | "Classify using the loaded registry types: {', '.join(registry.keys())}" |
| hypothesis_h0/h1 | ## Hypothesis | "Extract the null/alternative hypothesis" |
| estimand | ## Hypothesis + ## Independent Variables | "Extract: treatment, outcome, population, contrast" |
| metrics | ## Dependent Variables table | "Extract each row as structured object" |
Expand All @@ -106,46 +118,36 @@ best available plan.
### Step 1: Triage Dispatcher

Launch one subagent. Receives full plan text plus parsed fields. Returns:
- `experiment_type`: one of `benchmark | configuration_study | causal_inference |
robustness_audit | exploratory`
- `experiment_type`: one of the type names in the loaded registry (from Step 0)
- `dimension_weights`: the complete weight matrix for this plan (H/M/L/S per dimension)
- `secondary_modifiers`: list of active modifiers with their effects on weights

**Schema validation:** After the subagent returns, verify that `experiment_type` is one of
the five enumerated values above. If the returned value is unrecognized, default to
`exploratory` and log a warning — do not silently pass an invalid type into the weight
**Schema validation:** After the subagent returns, verify that `experiment_type` is a key
in the loaded registry (from Step 0). If the returned value is not in the registry, default
to `exploratory` and log a warning — do not silently pass an invalid type into the weight
matrix lookup, as this would corrupt all subsequent spawning decisions.

**Triage classification rules (first-match):**

| Rule | Type | Trigger |
|---|---|---|
| 1 | benchmark | IVs are system/method names, DVs are performance metrics, multiple comparators |
| 2 | configuration_study | IVs are numeric parameters of one system, grid/sweep structure |
| 3 | causal_inference | Causal language ("causes", "effect of"), confounders in threats |
| 4 | robustness_audit | Tests generalization/stability, deliberately varied conditions |
| 5 | exploratory | Default — no prior rule fires, or hypothesis absent |
Use the `classification_triggers` list from each type in the loaded registry to classify
the experiment. Apply first-match: iterate types in registry insertion order (bundled types
sorted alphabetically, then user-defined types sorted alphabetically). The first type whose
trigger description matches the plan is selected. If no trigger matches, default to
`exploratory`.

**Secondary modifiers** (additive, increase dimension weights):
- `+causal`: mechanism claim in non-causal type → causal_structure weight +1 tier
- `+high_cost`: resources > 4 GPU-hours → resource_proportionality L→M
- `+deployment`: motivation references production/users → ecological_validity floor = M
- `+multi_metric`: ≥3 DVs → statistical_corrections weight +1 tier

**Full dimension-to-weight matrix** (W = weight per experiment type):
**Dimension weights:**

| Dimension | benchmark | config_study | causal_inf | robust_audit | exploratory |
|---|---|---|---|---|---|
| causal_structure | S | S | H | M | L |
| variance_protocol | H | H | L | M | L |
| statistical_corrections | M | H | H | S | S |
| ecological_validity | M | L | L | H | M |
| measurement_alignment | M | M | M | H | M |
| resource_proportionality | L | L | L | L | L |
| data_acquisition | M | M | M | H | M |
| agent_implementability | H | H | M | M | L |

Weight tiers: H (High), M (Medium), L (Low), S (SILENT — dimension not spawned, not mentioned).
Use the `dimension_weights` dict from the matched type's registry entry (loaded in Step 0).
Each key is a dimension name; each value is one of the weight tiers H (High), M (Medium),
L (Low), or S (SILENT — dimension not spawned, not mentioned in output). Pass the full
`dimension_weights` dict to the triage subagent so it can return the complete weight
matrix for this plan.

### Subagent Evaluation Scope (applies to ALL dimension subagents)

Expand Down Expand Up @@ -189,23 +191,17 @@ Each L1 subagent receives as explicit inputs:

**Severity calibration rubric for L1 dimensions:**

| Dimension | causal_inference | benchmark | configuration_study | robustness_audit | exploratory |
|---------------------------|-----------------|-----------|---------------------|------------------|-------------|
| estimand_clarity | critical | warning | warning | warning | info |
| hypothesis_falsifiability | critical | warning | warning | warning | info |
Use the `l1_severity` dict from the matched experiment type's registry entry (loaded in
Step 0). Keys are `estimand_clarity` and `hypothesis_falsifiability`; values are severity
levels (`critical`, `warning`, `info`). The per-type severity values live exclusively in
the registry YAML files — consult the bundled `recipes/experiment-types/*.yaml` specs (or
user-defined overrides) rather than any hardcoded per-type list in this document, which
would drift as types are added or modified.

- `estimand_clarity` agent: "Can the claim be written as a formal contrast (A vs B on Y in Z)?"
Reference the exp-lens-estimand-clarity philosophical mode as guidance (do NOT invoke
the skill — reference its lens question only in the subagent prompt).
Use the calibration rubric above to assign severity. For `causal_inference`: absent formal
estimand = `critical`. For `benchmark`/`configuration_study`/`robustness_audit`: absent
formal estimand = `warning` (informal contrast sufficient). For `exploratory`: absent
estimand = `info` (intentionally absent).
Use the `l1_severity.estimand_clarity` value from the registry to assign severity.
- `hypothesis_falsifiability` agent: "What result would cause the author to conclude H0?"
Use the calibration rubric above. For `causal_inference`: unfalsifiable hypothesis =
`critical`. For `benchmark`/`configuration_study`/`robustness_audit`: comparison goal
without formal H0 = `warning`. For `exploratory`: absent H0/H1 = `info`
(pre-registration not required).
Use the `l1_severity.hypothesis_falsifiability` value from the registry to assign severity.

Each subagent returns findings in the standard JSON structure (see Finding Format below).

Expand Down Expand Up @@ -265,12 +261,8 @@ Receives: full plan text and `experiment_type` (from Step 1 triage output)
3. **Asymmetric tuning** — proposed method tuned against eval while baselines use defaults
4. **Survivorship bias** — cherry-picking best run from multiple seeds
5. **Evaluation collision** — same infrastructure in both treatment and measurement
- Type-specific focus per experiment type:
- benchmark → asymmetric effort
- configuration_study → overfitting to held-out set
- causal_inference → unblocked backdoor path
- robustness_audit → unrealistic threat distribution
- exploratory → HARKing vulnerability
- Type-specific focus: use `red_team_focus.specific` from the matched type's registry
entry (loaded in Step 0).
- ALL red-team findings must set `"requires_decision": true` and `"dimension": "red_team"`

**Red-team severity calibration rubric:**
Expand Down Expand Up @@ -413,14 +405,8 @@ One synthesis pass (no subagent — orchestrator synthesizes directly):
the same issue from inflating finding counts and obscuring distinct problems.
4. **Apply red-team severity cap, then verdict logic**:
```python
# Red-team severity cap: downgrade findings above the type ceiling
RT_MAX_SEVERITY = {
"causal_inference": "critical",
"benchmark": "warning",
"configuration_study": "warning",
"robustness_audit": "warning",
"exploratory": "info",
}
# RT_MAX_SEVERITY is built from the registry loaded in Step 0 (dict-of-dicts from YAML parsing):
RT_MAX_SEVERITY = {name: spec["red_team_focus"]["severity_cap"] for name, spec in registry.items()}
SEVERITY_RANK = {"info": 0, "warning": 1, "critical": 2}
rt_cap = RT_MAX_SEVERITY[experiment_type]

Expand Down
2 changes: 1 addition & 1 deletion tests/arch/test_subpackage_isolation.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,7 @@ def test_no_subpackage_exceeds_10_files() -> None:
"""
EXEMPTIONS: dict[str, int] = {
"server": 18,
"recipe": 30,
"recipe": 31,
"execution": 26,
"core": 17,
"cli": 17,
Expand Down
5 changes: 3 additions & 2 deletions tests/execution/test_recording_sigterm.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,10 @@ def test_sigterm_writes_scenario_json(tmp_path):
# Poll stderr line-by-line for the "sigterm_handler_ready" token which
# serve() emits immediately after installing the SIGTERM handler. This
# guarantees the handler is active before we send SIGTERM, while still
# being responsive (no fixed sleep). Falls back after 5 s on slow CI.
# being responsive (no fixed sleep). Falls back after 15 s to tolerate
# xdist parallel load where subprocess startup can be slow.
stderr_lines: list[str] = []
deadline = time.monotonic() + 5.0
deadline = time.monotonic() + 15.0
while time.monotonic() < deadline:
remaining = deadline - time.monotonic()
readable, _, _ = select.select([proc.stderr], [], [], min(remaining, 0.2))
Expand Down
Loading
Loading