diff --git a/CLAUDE.md b/CLAUDE.md index 9f68a412..142fa503 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -175,6 +175,7 @@ generic_automation_mcp/ │ ├── loader.py # Path-based recipe metadata utilities │ ├── _api.py # Orchestration API │ ├── diagrams.py # Flow diagram generation + staleness detection +│ ├── experiment_type_registry.py # ExperimentTypeSpec, load_all_experiment_types │ ├── registry.py # RuleFinding, RuleSpec, semantic_rule decorator │ ├── repository.py │ ├── _analysis.py # Step graph building + dataflow analysis diff --git a/src/autoskillit/recipe/__init__.py b/src/autoskillit/recipe/__init__.py index 79212551..4e629b67 100644 --- a/src/autoskillit/recipe/__init__.py +++ b/src/autoskillit/recipe/__init__.py @@ -50,6 +50,10 @@ diagram_stale_to_suggestions, load_recipe_diagram, ) +from autoskillit.recipe.experiment_type_registry import ( # noqa: E402 + ExperimentTypeSpec, + load_all_experiment_types, +) from autoskillit.recipe.io import ( # noqa: E402 builtin_sub_recipes_dir, find_recipe_by_name, @@ -130,4 +134,6 @@ "diagram_stale_to_suggestions", "builtin_sub_recipes_dir", "find_sub_recipe_by_name", + "ExperimentTypeSpec", + "load_all_experiment_types", ] diff --git a/src/autoskillit/recipe/experiment_type_registry.py b/src/autoskillit/recipe/experiment_type_registry.py new file mode 100644 index 00000000..61649062 --- /dev/null +++ b/src/autoskillit/recipe/experiment_type_registry.py @@ -0,0 +1,79 @@ +"""Experiment type registry — load bundled and user-defined experiment type specs.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path + +from autoskillit.core import get_logger, load_yaml, pkg_root + +_log = get_logger(__name__) + +BUNDLED_EXPERIMENT_TYPES_DIR: Path = pkg_root() / "recipes" / "experiment-types" + + +@dataclass +class ExperimentTypeSpec: + """Specification for a single experiment type.""" + + name: str + classification_triggers: list[str] + dimension_weights: dict[str, str] + 
applicable_lenses: dict[str, str | None] + red_team_focus: dict[str, str] + l1_severity: dict[str, str] + + +def _parse_experiment_type(data: dict, source_path: Path) -> ExperimentTypeSpec: + if "name" not in data: + raise ValueError(f"Experiment type YAML missing 'name' field: {source_path}") + return ExperimentTypeSpec( + name=data["name"], + classification_triggers=list(data.get("classification_triggers", [])), + dimension_weights=dict(data.get("dimension_weights", {})), + applicable_lenses=dict(data.get("applicable_lenses", {})), + red_team_focus=dict(data.get("red_team_focus", {})), + l1_severity=dict(data.get("l1_severity", {})), + ) + + +def _load_types_from_dir(directory: Path) -> dict[str, ExperimentTypeSpec]: + if not directory.exists(): + return {} + result: dict[str, ExperimentTypeSpec] = {} + for path in sorted(directory.glob("*.yaml")): + try: + data = load_yaml(path) + if isinstance(data, dict): + spec = _parse_experiment_type(data, path) + result[spec.name] = spec + except Exception: + _log.warning("Skipping malformed experiment type file: %s", path, exc_info=True) + return result + + +def load_all_experiment_types( + project_dir: Path | None = None, +) -> dict[str, ExperimentTypeSpec]: + """Load experiment types: bundled types merged with user-defined overrides. + + User-defined types with the same name as a bundled type replace the bundled + type entirely — no field merging. User-defined types with a new name are added + alongside bundled types. + + Args: + project_dir: Project root containing optional user-defined overrides at + ``.autoskillit/experiment-types/``. When ``None``, only bundled types + are returned. + + Returns: + Mapping of experiment type name to ``ExperimentTypeSpec``. 
+ """ + types = _load_types_from_dir(BUNDLED_EXPERIMENT_TYPES_DIR) + + if project_dir is not None: + user_dir = Path(project_dir) / ".autoskillit" / "experiment-types" + user_types = _load_types_from_dir(user_dir) + types.update(user_types) + + return types diff --git a/src/autoskillit/recipes/experiment-types/benchmark.yaml b/src/autoskillit/recipes/experiment-types/benchmark.yaml new file mode 100644 index 00000000..f82897ca --- /dev/null +++ b/src/autoskillit/recipes/experiment-types/benchmark.yaml @@ -0,0 +1,21 @@ +name: benchmark +classification_triggers: + - "IVs are system/method names, DVs are performance metrics, multiple comparators" +dimension_weights: + causal_structure: S + variance_protocol: H + statistical_corrections: M + ecological_validity: M + measurement_alignment: M + resource_proportionality: L + data_acquisition: M + agent_implementability: H +applicable_lenses: + primary: exp-lens-estimand-clarity + secondary: null +red_team_focus: + specific: asymmetric effort + severity_cap: warning +l1_severity: + estimand_clarity: warning + hypothesis_falsifiability: warning diff --git a/src/autoskillit/recipes/experiment-types/causal_inference.yaml b/src/autoskillit/recipes/experiment-types/causal_inference.yaml new file mode 100644 index 00000000..0ff2da0e --- /dev/null +++ b/src/autoskillit/recipes/experiment-types/causal_inference.yaml @@ -0,0 +1,21 @@ +name: causal_inference +classification_triggers: + - "Causal language (\"causes\", \"effect of\"), confounders in threats" +dimension_weights: + causal_structure: H + variance_protocol: L + statistical_corrections: H + ecological_validity: L + measurement_alignment: M + resource_proportionality: L + data_acquisition: M + agent_implementability: M +applicable_lenses: + primary: exp-lens-estimand-clarity + secondary: null +red_team_focus: + specific: unblocked backdoor path + severity_cap: critical +l1_severity: + estimand_clarity: critical + hypothesis_falsifiability: critical diff --git 
a/src/autoskillit/recipes/experiment-types/configuration_study.yaml b/src/autoskillit/recipes/experiment-types/configuration_study.yaml new file mode 100644 index 00000000..901bddab --- /dev/null +++ b/src/autoskillit/recipes/experiment-types/configuration_study.yaml @@ -0,0 +1,21 @@ +name: configuration_study +classification_triggers: + - "IVs are numeric parameters of one system, grid/sweep structure" +dimension_weights: + causal_structure: S + variance_protocol: H + statistical_corrections: H + ecological_validity: L + measurement_alignment: M + resource_proportionality: L + data_acquisition: M + agent_implementability: H +applicable_lenses: + primary: exp-lens-estimand-clarity + secondary: null +red_team_focus: + specific: overfitting to held-out set + severity_cap: warning +l1_severity: + estimand_clarity: warning + hypothesis_falsifiability: warning diff --git a/src/autoskillit/recipes/experiment-types/exploratory.yaml b/src/autoskillit/recipes/experiment-types/exploratory.yaml new file mode 100644 index 00000000..a7dae8b6 --- /dev/null +++ b/src/autoskillit/recipes/experiment-types/exploratory.yaml @@ -0,0 +1,21 @@ +name: exploratory +classification_triggers: + - "Default — no prior rule fires, or hypothesis absent" +dimension_weights: + causal_structure: L + variance_protocol: L + statistical_corrections: S + ecological_validity: M + measurement_alignment: M + resource_proportionality: L + data_acquisition: M + agent_implementability: L +applicable_lenses: + primary: exp-lens-estimand-clarity + secondary: null +red_team_focus: + specific: HARKing vulnerability + severity_cap: info +l1_severity: + estimand_clarity: info + hypothesis_falsifiability: info diff --git a/src/autoskillit/recipes/experiment-types/robustness_audit.yaml b/src/autoskillit/recipes/experiment-types/robustness_audit.yaml new file mode 100644 index 00000000..ab4fe78f --- /dev/null +++ b/src/autoskillit/recipes/experiment-types/robustness_audit.yaml @@ -0,0 +1,21 @@ +name: robustness_audit 
+classification_triggers: + - "Tests generalization/stability, deliberately varied conditions" +dimension_weights: + causal_structure: M + variance_protocol: M + statistical_corrections: S + ecological_validity: H + measurement_alignment: H + resource_proportionality: L + data_acquisition: H + agent_implementability: M +applicable_lenses: + primary: exp-lens-estimand-clarity + secondary: null +red_team_focus: + specific: unrealistic threat distribution + severity_cap: warning +l1_severity: + estimand_clarity: warning + hypothesis_falsifiability: warning diff --git a/src/autoskillit/skills_extended/review-design/SKILL.md b/src/autoskillit/skills_extended/review-design/SKILL.md index 3d54c536..ae96597d 100644 --- a/src/autoskillit/skills_extended/review-design/SKILL.md +++ b/src/autoskillit/skills_extended/review-design/SKILL.md @@ -75,7 +75,19 @@ best available plan. 3. Read the plan file. **Error handling:** If the file does not exist or is unreadable at the resolved path, emit `verdict = STOP` with message "Plan file not found: {path}" and exit 0. - Parse YAML frontmatter using the **backward-compatible two-level fallback**: +4. **Load the experiment type registry:** + a. Locate bundled types dir: run + `python -c "from autoskillit.core import pkg_root; print(pkg_root() / 'recipes' / 'experiment-types')"` + to get the absolute bundled directory path. + b. Use Glob `*.yaml` in that directory, then Read each file. Parse each as plain YAML (no frontmatter) to + extract `name`, `classification_triggers`, `dimension_weights`, `applicable_lenses`, + `red_team_focus`, and `l1_severity` fields from each. + c. Check `.autoskillit/experiment-types/` in the current working directory. If it exists, + read all `*.yaml` files there. A user-defined type with the same `name` as a bundled + type replaces the bundled entry entirely — do not merge fields. + d. The resulting registry is a mapping of type name → spec. 
The set of valid + `experiment_type` values for this run is the set of keys in the registry. +5. Parse YAML frontmatter using the **backward-compatible two-level fallback**: - **Level 1 (frontmatter)**: Read YAML frontmatter between `---` delimiters directly (zero LLM tokens). Return present fields and note which are missing. Record `source: frontmatter` for each extracted field. @@ -91,7 +103,7 @@ best available plan. | Missing Field | Prose Target | Extraction Prompt | |---|---|---| - | experiment_type | Full plan | "Classify: benchmark, configuration_study, causal_inference, robustness_audit, exploratory" | + | experiment_type | Full plan | "Classify using the loaded registry types: {', '.join(registry.keys())}" | | hypothesis_h0/h1 | ## Hypothesis | "Extract the null/alternative hypothesis" | | estimand | ## Hypothesis + ## Independent Variables | "Extract: treatment, outcome, population, contrast" | | metrics | ## Dependent Variables table | "Extract each row as structured object" | @@ -106,25 +118,22 @@ best available plan. ### Step 1: Triage Dispatcher Launch one subagent. Receives full plan text plus parsed fields. Returns: -- `experiment_type`: one of `benchmark | configuration_study | causal_inference | - robustness_audit | exploratory` +- `experiment_type`: one of the type names in the loaded registry (from Step 0) - `dimension_weights`: the complete weight matrix for this plan (H/M/L/S per dimension) - `secondary_modifiers`: list of active modifiers with their effects on weights -**Schema validation:** After the subagent returns, verify that `experiment_type` is one of -the five enumerated values above. If the returned value is unrecognized, default to -`exploratory` and log a warning — do not silently pass an invalid type into the weight +**Schema validation:** After the subagent returns, verify that `experiment_type` is a key +in the loaded registry (from Step 0). 
If the returned value is not in the registry, default +to `exploratory` and log a warning — do not silently pass an invalid type into the weight matrix lookup, as this would corrupt all subsequent spawning decisions. **Triage classification rules (first-match):** -| Rule | Type | Trigger | -|---|---|---| -| 1 | benchmark | IVs are system/method names, DVs are performance metrics, multiple comparators | -| 2 | configuration_study | IVs are numeric parameters of one system, grid/sweep structure | -| 3 | causal_inference | Causal language ("causes", "effect of"), confounders in threats | -| 4 | robustness_audit | Tests generalization/stability, deliberately varied conditions | -| 5 | exploratory | Default — no prior rule fires, or hypothesis absent | +Use the `classification_triggers` list from each type in the loaded registry to classify +the experiment. Apply first-match: iterate types in registry insertion order (bundled types +sorted by filename, then new user-defined types sorted by filename), skipping `exploratory`. +The first type whose trigger description matches the plan is selected. If no trigger +matches, select `exploratory` — it is the catch-all default and must never shadow another type. **Secondary modifiers** (additive, increase dimension weights): - `+causal`: mechanism claim in non-causal type → causal_structure weight +1 tier @@ -132,20 +141,13 @@ matrix lookup, as this would corrupt all subsequent spawning decisions. 
- `+deployment`: motivation references production/users → ecological_validity floor = M - `+multi_metric`: ≥3 DVs → statistical_corrections weight +1 tier -**Full dimension-to-weight matrix** (W = weight per experiment type): +**Dimension weights:** -| Dimension | benchmark | config_study | causal_inf | robust_audit | exploratory | -|---|---|---|---|---|---| -| causal_structure | S | S | H | M | L | -| variance_protocol | H | H | L | M | L | -| statistical_corrections | M | H | H | S | S | -| ecological_validity | M | L | L | H | M | -| measurement_alignment | M | M | M | H | M | -| resource_proportionality | L | L | L | L | L | -| data_acquisition | M | M | M | H | M | -| agent_implementability | H | H | M | M | L | - -Weight tiers: H (High), M (Medium), L (Low), S (SILENT — dimension not spawned, not mentioned). +Use the `dimension_weights` dict from the matched type's registry entry (loaded in Step 0). +Each key is a dimension name; each value is one of weight=H (High), weight=M (Medium), weight=L (Low), +or weight=S (SILENT — dimension not spawned, not mentioned in output). Pass the full +`dimension_weights` dict to the triage subagent so it can return the complete weight +matrix for this plan. ### Subagent Evaluation Scope (applies to ALL dimension subagents) @@ -189,23 +191,17 @@ Each L1 subagent receives as explicit inputs: **Severity calibration rubric for L1 dimensions:** -| Dimension | causal_inference | benchmark | configuration_study | robustness_audit | exploratory | -|---------------------------|-----------------|-----------|---------------------|------------------|-------------| -| estimand_clarity | critical | warning | warning | warning | info | -| hypothesis_falsifiability | critical | warning | warning | warning | info | +Use the `l1_severity` dict from the matched experiment type's registry entry (loaded in +Step 0). Keys are `estimand_clarity` and `hypothesis_falsifiability`; values are severity +levels (`critical`, `warning`, `info`). 
Calibration anchors: `causal_inference` → critical; +`benchmark`, `configuration_study`, `robustness_audit` → warning; `exploratory` → info. - `estimand_clarity` agent: "Can the claim be written as a formal contrast (A vs B on Y in Z)?" Reference the exp-lens-estimand-clarity philosophical mode as guidance (do NOT invoke the skill — reference its lens question only in the subagent prompt). - Use the calibration rubric above to assign severity. For `causal_inference`: absent formal - estimand = `critical`. For `benchmark`/`configuration_study`/`robustness_audit`: absent - formal estimand = `warning` (informal contrast sufficient). For `exploratory`: absent - estimand = `info` (intentionally absent). + Use the `l1_severity.estimand_clarity` value from the registry to assign severity. - `hypothesis_falsifiability` agent: "What result would cause the author to conclude H0?" - Use the calibration rubric above. For `causal_inference`: unfalsifiable hypothesis = - `critical`. For `benchmark`/`configuration_study`/`robustness_audit`: comparison goal - without formal H0 = `warning`. For `exploratory`: absent H0/H1 = `info` - (pre-registration not required). + Use the `l1_severity.hypothesis_falsifiability` value from the registry to assign severity. Each subagent returns findings in the standard JSON structure (see Finding Format below). @@ -265,12 +261,8 @@ Receives: full plan text and `experiment_type` (from Step 1 triage output) 3. **Asymmetric tuning** — proposed method tuned against eval while baselines use defaults 4. **Survivorship bias** — cherry-picking best run from multiple seeds 5. 
**Evaluation collision** — same infrastructure in both treatment and measurement -- Type-specific focus per experiment type: - - benchmark → asymmetric effort - - configuration_study → overfitting to held-out set - - causal_inference → unblocked backdoor path - - robustness_audit → unrealistic threat distribution - - exploratory → HARKing vulnerability +- Type-specific focus: use `red_team_focus.specific` from the matched type's registry + entry (loaded in Step 0). - ALL red-team findings must set `"requires_decision": true` and `"dimension": "red_team"` **Red-team severity calibration rubric:** @@ -413,14 +405,8 @@ One synthesis pass (no subagent — orchestrator synthesizes directly): the same issue from inflating finding counts and obscuring distinct problems. 4. **Apply red-team severity cap, then verdict logic**: ```python - # Red-team severity cap: downgrade findings above the type ceiling - RT_MAX_SEVERITY = { - "causal_inference": "critical", - "benchmark": "warning", - "configuration_study": "warning", - "robustness_audit": "warning", - "exploratory": "info", - } + # RT_MAX_SEVERITY is built from the registry loaded in Step 0 (dict-of-dicts from YAML parsing): + RT_MAX_SEVERITY = {name: spec["red_team_focus"]["severity_cap"] for name, spec in registry.items()} SEVERITY_RANK = {"info": 0, "warning": 1, "critical": 2} rt_cap = RT_MAX_SEVERITY[experiment_type] diff --git a/tests/arch/test_subpackage_isolation.py b/tests/arch/test_subpackage_isolation.py index c31e10a1..8980f5c2 100644 --- a/tests/arch/test_subpackage_isolation.py +++ b/tests/arch/test_subpackage_isolation.py @@ -688,7 +688,7 @@ def test_no_subpackage_exceeds_10_files() -> None: """ EXEMPTIONS: dict[str, int] = { "server": 18, - "recipe": 30, + "recipe": 31, "execution": 26, "core": 17, "cli": 17, diff --git a/tests/execution/test_recording_sigterm.py b/tests/execution/test_recording_sigterm.py index 55908844..6b7d589a 100644 --- a/tests/execution/test_recording_sigterm.py +++ 
b/tests/execution/test_recording_sigterm.py @@ -41,9 +41,10 @@ def test_sigterm_writes_scenario_json(tmp_path): # Poll stderr line-by-line for the "sigterm_handler_ready" token which # serve() emits immediately after installing the SIGTERM handler. This # guarantees the handler is active before we send SIGTERM, while still - # being responsive (no fixed sleep). Falls back after 5 s on slow CI. + # being responsive (no fixed sleep). Falls back after 15 s to tolerate + # xdist parallel load where subprocess startup can be slow. stderr_lines: list[str] = [] - deadline = time.monotonic() + 5.0 + deadline = time.monotonic() + 15.0 while time.monotonic() < deadline: remaining = deadline - time.monotonic() readable, _, _ = select.select([proc.stderr], [], [], min(remaining, 0.2)) diff --git a/tests/recipe/test_experiment_type_registry.py b/tests/recipe/test_experiment_type_registry.py new file mode 100644 index 00000000..192828ac --- /dev/null +++ b/tests/recipe/test_experiment_type_registry.py @@ -0,0 +1,225 @@ +"""Tests for experiment type registry loader.""" + +from __future__ import annotations + +from pathlib import Path + +import yaml + +from autoskillit.recipe.experiment_type_registry import ( + ExperimentTypeSpec, + load_all_experiment_types, +) + +EXPECTED_TYPES = { + "benchmark", + "configuration_study", + "causal_inference", + "robustness_audit", + "exploratory", +} + +VALID_WEIGHT_VALUES = {"H", "M", "L", "S"} + +EXPECTED_DIMENSIONS = { + "causal_structure", + "variance_protocol", + "statistical_corrections", + "ecological_validity", + "measurement_alignment", + "resource_proportionality", + "data_acquisition", + "agent_implementability", +} + + +def test_all_bundled_types_present() -> None: + """All 5 bundled types load without error.""" + types = load_all_experiment_types() + assert set(types.keys()) == EXPECTED_TYPES + + +def test_each_type_has_required_fields() -> None: + """Each type spec has all required fields with correct structure.""" + types = 
load_all_experiment_types() + for name, spec in types.items(): + assert spec.name == name, f"{name}: spec.name mismatch" + assert isinstance(spec.classification_triggers, list), f"{name}: triggers not list" + assert len(spec.classification_triggers) >= 1, f"{name}: no triggers" + assert isinstance(spec.dimension_weights, dict), f"{name}: weights not dict" + assert isinstance(spec.applicable_lenses, dict), f"{name}: lenses not dict" + assert isinstance(spec.red_team_focus, dict), f"{name}: red_team_focus not dict" + assert isinstance(spec.l1_severity, dict), f"{name}: l1_severity not dict" + + +def test_all_weight_values_are_valid() -> None: + """All dimension_weights values are one of H, M, L, S.""" + types = load_all_experiment_types() + for name, spec in types.items(): + for dim, weight in spec.dimension_weights.items(): + assert weight in VALID_WEIGHT_VALUES, ( + f"{name}.dimension_weights[{dim!r}] = {weight!r} — not in {VALID_WEIGHT_VALUES}" + ) + + +def test_all_eight_dimensions_present() -> None: + """All 8 dimensions from the SKILL.md matrix are present in each bundled type.""" + types = load_all_experiment_types() + for name, spec in types.items(): + missing = EXPECTED_DIMENSIONS - set(spec.dimension_weights.keys()) + assert not missing, f"{name} missing dimensions: {missing}" + + +def test_dimension_weights_match_skill_matrix() -> None: + """Spot-check dimension weights against the values in SKILL.md.""" + types = load_all_experiment_types() + bench = types["benchmark"] + assert bench.dimension_weights["causal_structure"] == "S" + assert bench.dimension_weights["variance_protocol"] == "H" + assert bench.dimension_weights["agent_implementability"] == "H" + assert bench.dimension_weights["statistical_corrections"] == "M" + + causal = types["causal_inference"] + assert causal.dimension_weights["causal_structure"] == "H" + assert causal.dimension_weights["statistical_corrections"] == "H" + assert causal.dimension_weights["variance_protocol"] == "L" + + config 
= types["configuration_study"] + assert config.dimension_weights["causal_structure"] == "S" + assert config.dimension_weights["statistical_corrections"] == "H" + + robust = types["robustness_audit"] + assert robust.dimension_weights["ecological_validity"] == "H" + assert robust.dimension_weights["data_acquisition"] == "H" + + exploratory = types["exploratory"] + assert exploratory.dimension_weights["statistical_corrections"] == "S" + assert exploratory.dimension_weights["agent_implementability"] == "L" + + +def test_red_team_severity_caps_match_skill() -> None: + """Red-team severity caps match values in SKILL.md Step 7 RT_MAX_SEVERITY.""" + types = load_all_experiment_types() + assert types["causal_inference"].red_team_focus["severity_cap"] == "critical" + assert types["benchmark"].red_team_focus["severity_cap"] == "warning" + assert types["configuration_study"].red_team_focus["severity_cap"] == "warning" + assert types["robustness_audit"].red_team_focus["severity_cap"] == "warning" + assert types["exploratory"].red_team_focus["severity_cap"] == "info" + + +def test_red_team_type_specific_focus_present() -> None: + """Each type has a type-specific red_team_focus.specific value.""" + types = load_all_experiment_types() + assert "asymmetric effort" in types["benchmark"].red_team_focus["specific"] + assert "overfitting" in types["configuration_study"].red_team_focus["specific"] + assert "backdoor" in types["causal_inference"].red_team_focus["specific"] + assert "threat distribution" in types["robustness_audit"].red_team_focus["specific"] + assert "HARKing" in types["exploratory"].red_team_focus["specific"] + + +def test_l1_severity_values() -> None: + """l1_severity values are one of: critical, warning, info.""" + valid_severities = {"critical", "warning", "info"} + types = load_all_experiment_types() + for name, spec in types.items(): + for dim, sev in spec.l1_severity.items(): + assert sev in valid_severities, f"{name}.l1_severity[{dim!r}] = {sev!r}" + + +def 
test_l1_severity_causal_inference_is_critical() -> None: + """causal_inference has critical l1_severity for both L1 dimensions.""" + types = load_all_experiment_types() + causal = types["causal_inference"] + assert causal.l1_severity["estimand_clarity"] == "critical" + assert causal.l1_severity["hypothesis_falsifiability"] == "critical" + + +def test_l1_severity_exploratory_is_info() -> None: + """exploratory has info l1_severity for both L1 dimensions.""" + types = load_all_experiment_types() + exp = types["exploratory"] + assert exp.l1_severity["estimand_clarity"] == "info" + assert exp.l1_severity["hypothesis_falsifiability"] == "info" + + +def test_no_project_dir_returns_bundled_only() -> None: + """With project_dir=None, only bundled types are returned.""" + types = load_all_experiment_types(project_dir=None) + assert set(types.keys()) == EXPECTED_TYPES + + +def test_user_override_replaces_bundled_type(tmp_path: Path) -> None: + """User-defined type with same name fully replaces bundled type (no field merging).""" + user_dir = tmp_path / ".autoskillit" / "experiment-types" + user_dir.mkdir(parents=True) + (user_dir / "benchmark.yaml").write_text( + yaml.dump( + { + "name": "benchmark", + "classification_triggers": ["custom trigger only"], + "dimension_weights": {"causal_structure": "H"}, + "applicable_lenses": {"primary": "custom-lens", "secondary": None}, + "red_team_focus": {"specific": "custom focus", "severity_cap": "critical"}, + "l1_severity": { + "estimand_clarity": "critical", + "hypothesis_falsifiability": "critical", + }, + } + ) + ) + types = load_all_experiment_types(project_dir=tmp_path) + bench = types["benchmark"] + # Custom values take effect + assert bench.classification_triggers == ["custom trigger only"] + assert bench.dimension_weights == {"causal_structure": "H"} + # Bundled fields (variance_protocol, etc.) 
are NOT present — full replacement + assert "variance_protocol" not in bench.dimension_weights + # Other bundled types remain intact + assert "causal_inference" in types + assert len(types) == 5 + + +def test_user_new_type_is_added(tmp_path: Path) -> None: + """User-defined type with a new name is added alongside bundled types.""" + user_dir = tmp_path / ".autoskillit" / "experiment-types" + user_dir.mkdir(parents=True) + (user_dir / "network_analysis.yaml").write_text( + yaml.dump( + { + "name": "network_analysis", + "classification_triggers": ["IVs are graph topology parameters"], + "dimension_weights": { + "causal_structure": "M", + "variance_protocol": "M", + "statistical_corrections": "M", + "ecological_validity": "M", + "measurement_alignment": "H", + "resource_proportionality": "L", + "data_acquisition": "H", + "agent_implementability": "M", + }, + "applicable_lenses": {"primary": "exp-lens-estimand-clarity", "secondary": None}, + "red_team_focus": {"specific": "connectivity bias", "severity_cap": "warning"}, + "l1_severity": { + "estimand_clarity": "warning", + "hypothesis_falsifiability": "warning", + }, + } + ) + ) + types = load_all_experiment_types(project_dir=tmp_path) + assert "network_analysis" in types + assert len(types) == 6 # 5 bundled + 1 user + + +def test_missing_user_override_dir_is_silent(tmp_path: Path) -> None: + """A project_dir with no .autoskillit/experiment-types/ is fine — bundled only returned.""" + types = load_all_experiment_types(project_dir=tmp_path) + assert set(types.keys()) == EXPECTED_TYPES + + +def test_returns_dict_of_experiment_type_spec() -> None: + """load_all_experiment_types returns dict[str, ExperimentTypeSpec].""" + types = load_all_experiment_types() + for _name, spec in types.items(): + assert isinstance(spec, ExperimentTypeSpec) diff --git a/tests/skills/test_review_design_contracts.py b/tests/skills/test_review_design_contracts.py index 05bee867..81b4bfe0 100644 --- a/tests/skills/test_review_design_contracts.py 
+++ b/tests/skills/test_review_design_contracts.py @@ -557,32 +557,3 @@ def test_agent_implementability_l4_step5_placement(skill_text: str) -> None: assert "agent_implementability" in step5_text, ( "agent_implementability must be placed in Step 5 (Level 4 dimensions)" ) - - -def test_weight_matrix_has_eight_dimensions(skill_text: str) -> None: - """Weight matrix must have exactly 8 dimension rows after adding agent_implementability.""" - in_table = False - dim_count = 0 - known_dims = { - "causal_structure", - "variance_protocol", - "statistical_corrections", - "ecological_validity", - "measurement_alignment", - "resource_proportionality", - "data_acquisition", - "agent_implementability", - } - for line in skill_text.splitlines(): - if "| Dimension |" in line: - in_table = True - continue - if in_table and "|---" in line: - continue - if in_table and "|" in line: - cells = [c.strip() for c in line.split("|") if c.strip()] - if cells and cells[0] in known_dims: - dim_count += 1 - elif in_table and line.strip() and not line.strip().startswith("|"): - break - assert dim_count == 8, f"Weight matrix must have exactly 8 dimension rows, found {dim_count}" diff --git a/tests/skills/test_review_design_guards.py b/tests/skills/test_review_design_guards.py index e2f82501..5a1bffde 100644 --- a/tests/skills/test_review_design_guards.py +++ b/tests/skills/test_review_design_guards.py @@ -2,6 +2,8 @@ from pathlib import Path +from autoskillit.recipe.experiment_type_registry import load_all_experiment_types + SKILL_PATH = ( Path(__file__).resolve().parent.parent.parent / "src" @@ -18,16 +20,13 @@ def test_data_acquisition_dimension_exists() -> None: def test_data_acquisition_not_l_weight() -> None: - """data_acquisition must be M-weight minimum to influence verdict.""" - text = SKILL_PATH.read_text() - lines = text.split("\n") - for line in lines: - if "data_acquisition" in line and "|" in line: - assert "M" in line or "H" in line, ( - "data_acquisition must have M or H weight in 
at least one experiment type" - ) + """data_acquisition must be M-weight minimum to influence verdict in at least one type.""" + types = load_all_experiment_types() + for name, spec in types.items(): + weight = spec.dimension_weights.get("data_acquisition") + if weight in ("M", "H"): return - raise AssertionError("data_acquisition not found in weight table") + raise AssertionError("data_acquisition must have M or H weight in at least one bundled type") def test_agent_implementability_dimension_exists() -> None: @@ -36,16 +35,19 @@ def test_agent_implementability_dimension_exists() -> None: def test_agent_implementability_weight_row() -> None: - """agent_implementability must have H|H|M|M|L weights in the matrix.""" - text = SKILL_PATH.read_text() - for line in text.split("\n"): - if "agent_implementability" in line and "|" in line: - cells = [c.strip() for c in line.split("|") if c.strip()] - if len(cells) == 6: # dimension name + 5 weights - assert cells[1] == "H", f"benchmark weight should be H, got {cells[1]}" - assert cells[2] == "H", f"config_study weight should be H, got {cells[2]}" - assert cells[3] == "M", f"causal_inf weight should be M, got {cells[3]}" - assert cells[4] == "M", f"robust_audit weight should be M, got {cells[4]}" - assert cells[5] == "L", f"exploratory weight should be L, got {cells[5]}" - return - raise AssertionError("agent_implementability not found in weight table") + """agent_implementability must have H/H/M/M/L weights for the 5 bundled types.""" + types = load_all_experiment_types() + expected = { + "benchmark": "H", + "configuration_study": "H", + "causal_inference": "M", + "robustness_audit": "M", + "exploratory": "L", + } + for type_name, exp_weight in expected.items(): + spec = types.get(type_name) + assert spec is not None, f"Bundled type {type_name!r} not found" + actual = spec.dimension_weights.get("agent_implementability") + assert actual == exp_weight, ( + f"{type_name}.agent_implementability = {actual!r}, expected 
{exp_weight!r}" + )