-
Notifications
You must be signed in to change notification settings - Fork 0
feat: Experiment Type Registry — YAML-Driven, Extensible #795
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
63f032a
f7eaf22
85cca81
a559824
6a449ea
35ce6f5
bb3f931
e409c4b
8dbaace
2868bce
492ca24
8fb8d59
783483c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,79 @@ | ||
| """Experiment type registry — load bundled and user-defined experiment type specs.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from dataclasses import dataclass | ||
| from pathlib import Path | ||
|
|
||
| from autoskillit.core import get_logger, load_yaml, pkg_root | ||
|
|
||
| _log = get_logger(__name__) | ||
|
|
||
| BUNDLED_EXPERIMENT_TYPES_DIR: Path = pkg_root() / "recipes" / "experiment-types" | ||
|
|
||
|
|
||
@dataclass
class ExperimentTypeSpec:
    """Specification for a single experiment type.

    Fields mirror the top-level keys of each experiment-type YAML file.
    """

    # Unique experiment type name; used as the registry key (e.g. "benchmark").
    name: str
    # Natural-language trigger descriptions used for first-match classification.
    classification_triggers: list[str]
    # Dimension name -> weight tier ("H", "M", "L", or "S" for silent).
    dimension_weights: dict[str, str]
    # Lens slot ("primary"/"secondary") -> lens identifier, or None when unset.
    applicable_lenses: dict[str, str | None]
    # Red-team guidance, e.g. "specific" focus area and "severity_cap" ceiling.
    red_team_focus: dict[str, str]
    # L1 dimension name -> severity level ("critical", "warning", or "info").
    l1_severity: dict[str, str]
|
|
||
|
|
||
def _parse_experiment_type(data: dict[str, object], source_path: Path) -> ExperimentTypeSpec:
    """Build an ``ExperimentTypeSpec`` from parsed YAML data.

    Args:
        data: Mapping parsed from an experiment-type YAML file.
        source_path: File the data was loaded from; included in error messages.

    Returns:
        The populated spec. Missing optional fields default to empty containers.

    Raises:
        ValueError: If the required ``name`` field is absent.
    """
    if "name" not in data:
        raise ValueError(f"Experiment type YAML missing 'name' field: {source_path}")
    # Copy each container so later mutation of the spec cannot alias the raw YAML data.
    return ExperimentTypeSpec(
        name=data["name"],
        classification_triggers=list(data.get("classification_triggers", [])),
        dimension_weights=dict(data.get("dimension_weights", {})),
        applicable_lenses=dict(data.get("applicable_lenses", {})),
        red_team_focus=dict(data.get("red_team_focus", {})),
        l1_severity=dict(data.get("l1_severity", {})),
    )
|
|
||
|
|
||
def _load_types_from_dir(directory: Path) -> dict[str, ExperimentTypeSpec]:
    """Load every ``*.yaml`` experiment-type spec in *directory*, keyed by name.

    Malformed files are skipped with a logged warning; non-mapping YAML
    documents are ignored silently. A missing directory yields an empty map.
    """
    specs: dict[str, ExperimentTypeSpec] = {}
    if not directory.exists():
        return specs
    # Sorted glob gives a deterministic load order across filesystems.
    for yaml_path in sorted(directory.glob("*.yaml")):
        try:
            raw = load_yaml(yaml_path)
            if not isinstance(raw, dict):
                continue
            spec = _parse_experiment_type(raw, yaml_path)
        except Exception:
            # Best-effort: a single broken file must not break registry loading.
            _log.warning("Skipping malformed experiment type file: %s", yaml_path, exc_info=True)
            continue
        specs[spec.name] = spec
    return specs
|
|
||
|
|
||
def load_all_experiment_types(
    project_dir: Path | str | None = None,
) -> dict[str, ExperimentTypeSpec]:
    """Load experiment types: bundled types merged with user-defined overrides.

    User-defined types with the same name as a bundled type replace the bundled
    type entirely — no field merging. User-defined types with a new name are added
    alongside bundled types.

    Args:
        project_dir: Project root containing optional user-defined overrides at
            ``.autoskillit/experiment-types/``. Accepts a ``Path`` or a path
            string (normalized via ``Path()``). When ``None``, only bundled
            types are returned.

    Returns:
        Mapping of experiment type name to ``ExperimentTypeSpec``.
    """
    types = _load_types_from_dir(BUNDLED_EXPERIMENT_TYPES_DIR)
    if project_dir is not None:
        user_dir = Path(project_dir) / ".autoskillit" / "experiment-types"
        # dict.update implements the replace-by-name override semantics.
        types.update(_load_types_from_dir(user_dir))
    return types
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| name: benchmark | ||
| classification_triggers: | ||
| - "IVs are system/method names, DVs are performance metrics, multiple comparators" | ||
| dimension_weights: | ||
| causal_structure: S | ||
| variance_protocol: H | ||
| statistical_corrections: M | ||
| ecological_validity: M | ||
| measurement_alignment: M | ||
| resource_proportionality: L | ||
| data_acquisition: M | ||
| agent_implementability: H | ||
| applicable_lenses: | ||
| primary: exp-lens-estimand-clarity | ||
| secondary: null | ||
| red_team_focus: | ||
| specific: asymmetric effort | ||
| severity_cap: warning | ||
| l1_severity: | ||
| estimand_clarity: warning | ||
| hypothesis_falsifiability: warning |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| name: causal_inference | ||
| classification_triggers: | ||
| - "Causal language (\"causes\", \"effect of\"), confounders in threats" | ||
| dimension_weights: | ||
| causal_structure: H | ||
| variance_protocol: L | ||
| statistical_corrections: H | ||
| ecological_validity: L | ||
| measurement_alignment: M | ||
| resource_proportionality: L | ||
| data_acquisition: M | ||
| agent_implementability: M | ||
| applicable_lenses: | ||
| primary: exp-lens-estimand-clarity | ||
| secondary: null | ||
| red_team_focus: | ||
| specific: unblocked backdoor path | ||
| severity_cap: critical | ||
| l1_severity: | ||
| estimand_clarity: critical | ||
| hypothesis_falsifiability: critical |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| name: configuration_study | ||
| classification_triggers: | ||
| - "IVs are numeric parameters of one system, grid/sweep structure" | ||
| dimension_weights: | ||
| causal_structure: S | ||
| variance_protocol: H | ||
| statistical_corrections: H | ||
| ecological_validity: L | ||
| measurement_alignment: M | ||
| resource_proportionality: L | ||
| data_acquisition: M | ||
| agent_implementability: H | ||
| applicable_lenses: | ||
| primary: exp-lens-estimand-clarity | ||
| secondary: null | ||
| red_team_focus: | ||
| specific: overfitting to held-out set | ||
| severity_cap: warning | ||
| l1_severity: | ||
| estimand_clarity: warning | ||
| hypothesis_falsifiability: warning |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| name: exploratory | ||
| classification_triggers: | ||
| - "Default — no prior rule fires, or hypothesis absent" | ||
| dimension_weights: | ||
| causal_structure: L | ||
| variance_protocol: L | ||
| statistical_corrections: S | ||
| ecological_validity: M | ||
| measurement_alignment: M | ||
| resource_proportionality: L | ||
| data_acquisition: M | ||
| agent_implementability: L | ||
| applicable_lenses: | ||
| primary: exp-lens-estimand-clarity | ||
| secondary: null | ||
| red_team_focus: | ||
| specific: HARKing vulnerability | ||
| severity_cap: info | ||
| l1_severity: | ||
| estimand_clarity: info | ||
| hypothesis_falsifiability: info |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| name: robustness_audit | ||
| classification_triggers: | ||
| - "Tests generalization/stability, deliberately varied conditions" | ||
| dimension_weights: | ||
| causal_structure: M | ||
| variance_protocol: M | ||
| statistical_corrections: S | ||
| ecological_validity: H | ||
| measurement_alignment: H | ||
| resource_proportionality: L | ||
| data_acquisition: H | ||
| agent_implementability: M | ||
| applicable_lenses: | ||
| primary: exp-lens-estimand-clarity | ||
| secondary: null | ||
| red_team_focus: | ||
| specific: unrealistic threat distribution | ||
| severity_cap: warning | ||
| l1_severity: | ||
| estimand_clarity: warning | ||
| hypothesis_falsifiability: warning |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -75,7 +75,19 @@ best available plan. | |
| 3. Read the plan file. | ||
| **Error handling:** If the file does not exist or is unreadable at the resolved path, | ||
| emit `verdict = STOP` with message "Plan file not found: {path}" and exit 0. | ||
| Parse YAML frontmatter using the **backward-compatible two-level fallback**: | ||
| 4. **Load the experiment type registry:** | ||
| a. Locate bundled types dir: run | ||
| `python -c "from autoskillit.core import pkg_root; print(pkg_root() / 'recipes' / 'experiment-types')"` | ||
| to get the absolute bundled directory path. | ||
| b. Use Glob `*.yaml` in that directory, then Read each file. Parse YAML frontmatter to | ||
| extract `name`, `classification_triggers`, `dimension_weights`, `applicable_lenses`, | ||
| `red_team_focus`, and `l1_severity` fields from each. | ||
| c. Check `.autoskillit/experiment-types/` in the current working directory. If it exists, | ||
| read all `*.yaml` files there. A user-defined type with the same `name` as a bundled | ||
| type replaces the bundled entry entirely — do not merge fields. | ||
| d. The resulting registry is a mapping of type name → spec. The set of valid | ||
| `experiment_type` values for this run is the set of keys in the registry. | ||
| 5. Parse YAML frontmatter using the **backward-compatible two-level fallback**: | ||
| - **Level 1 (frontmatter)**: Read YAML frontmatter between `---` delimiters directly | ||
| (zero LLM tokens). Return present fields and note which are missing. | ||
| Record `source: frontmatter` for each extracted field. | ||
|
|
@@ -91,7 +103,7 @@ best available plan. | |
|
|
||
| | Missing Field | Prose Target | Extraction Prompt | | ||
| |---|---|---| | ||
| | experiment_type | Full plan | "Classify: benchmark, configuration_study, causal_inference, robustness_audit, exploratory" | | ||
| | experiment_type | Full plan | "Classify using the loaded registry types: {', '.join(registry.keys())}" | | ||
| | hypothesis_h0/h1 | ## Hypothesis | "Extract the null/alternative hypothesis" | | ||
| | estimand | ## Hypothesis + ## Independent Variables | "Extract: treatment, outcome, population, contrast" | | ||
| | metrics | ## Dependent Variables table | "Extract each row as structured object" | | ||
|
|
@@ -106,46 +118,36 @@ best available plan. | |
| ### Step 1: Triage Dispatcher | ||
|
|
||
| Launch one subagent. Receives full plan text plus parsed fields. Returns: | ||
| - `experiment_type`: one of `benchmark | configuration_study | causal_inference | | ||
| robustness_audit | exploratory` | ||
| - `experiment_type`: one of the type names in the loaded registry (from Step 0) | ||
| - `dimension_weights`: the complete weight matrix for this plan (H/M/L/S per dimension) | ||
| - `secondary_modifiers`: list of active modifiers with their effects on weights | ||
|
|
||
| **Schema validation:** After the subagent returns, verify that `experiment_type` is one of | ||
| the five enumerated values above. If the returned value is unrecognized, default to | ||
| `exploratory` and log a warning — do not silently pass an invalid type into the weight | ||
| **Schema validation:** After the subagent returns, verify that `experiment_type` is a key | ||
| in the loaded registry (from Step 0). If the returned value is not in the registry, default | ||
| to `exploratory` and log a warning — do not silently pass an invalid type into the weight | ||
| matrix lookup, as this would corrupt all subsequent spawning decisions. | ||
|
|
||
| **Triage classification rules (first-match):** | ||
|
|
||
| | Rule | Type | Trigger | | ||
| |---|---|---| | ||
| | 1 | benchmark | IVs are system/method names, DVs are performance metrics, multiple comparators | | ||
| | 2 | configuration_study | IVs are numeric parameters of one system, grid/sweep structure | | ||
| | 3 | causal_inference | Causal language ("causes", "effect of"), confounders in threats | | ||
| | 4 | robustness_audit | Tests generalization/stability, deliberately varied conditions | | ||
| | 5 | exploratory | Default — no prior rule fires, or hypothesis absent | | ||
| Use the `classification_triggers` list from each type in the loaded registry to classify | ||
| the experiment. Apply first-match: iterate types in registry insertion order (bundled types | ||
| sorted alphabetically, then user-defined types sorted alphabetically). The first type whose | ||
| trigger description matches the plan is selected. If no trigger matches, default to | ||
| `exploratory`. | ||
|
|
||
| **Secondary modifiers** (additive, increase dimension weights): | ||
| - `+causal`: mechanism claim in non-causal type → causal_structure weight +1 tier | ||
| - `+high_cost`: resources > 4 GPU-hours → resource_proportionality L→M | ||
| - `+deployment`: motivation references production/users → ecological_validity floor = M | ||
| - `+multi_metric`: ≥3 DVs → statistical_corrections weight +1 tier | ||
|
|
||
| **Full dimension-to-weight matrix** (W = weight per experiment type): | ||
| **Dimension weights:** | ||
|
|
||
| | Dimension | benchmark | config_study | causal_inf | robust_audit | exploratory | | ||
| |---|---|---|---|---|---| | ||
| | causal_structure | S | S | H | M | L | | ||
| | variance_protocol | H | H | L | M | L | | ||
| | statistical_corrections | M | H | H | S | S | | ||
| | ecological_validity | M | L | L | H | M | | ||
| | measurement_alignment | M | M | M | H | M | | ||
| | resource_proportionality | L | L | L | L | L | | ||
| | data_acquisition | M | M | M | H | M | | ||
| | agent_implementability | H | H | M | M | L | | ||
|
|
||
| Weight tiers: H (High), M (Medium), L (Low), S (SILENT — dimension not spawned, not mentioned). | ||
| Use the `dimension_weights` dict from the matched type's registry entry (loaded in Step 0). | ||
| Each key is a dimension name; each value is one of weight=H (High), weight=M (Medium), weight=L (Low), | ||
| or weight=S (SILENT — dimension not spawned, not mentioned in output). Pass the full | ||
| `dimension_weights` dict to the triage subagent so it can return the complete weight | ||
| matrix for this plan. | ||
|
|
||
| ### Subagent Evaluation Scope (applies to ALL dimension subagents) | ||
|
|
||
|
|
@@ -189,23 +191,17 @@ Each L1 subagent receives as explicit inputs: | |
|
|
||
| **Severity calibration rubric for L1 dimensions:** | ||
|
|
||
| | Dimension | causal_inference | benchmark | configuration_study | robustness_audit | exploratory | | ||
| |---------------------------|-----------------|-----------|---------------------|------------------|-------------| | ||
| | estimand_clarity | critical | warning | warning | warning | info | | ||
| | hypothesis_falsifiability | critical | warning | warning | warning | info | | ||
| Use the `l1_severity` dict from the matched experiment type's registry entry (loaded in | ||
| Step 0). Keys are `estimand_clarity` and `hypothesis_falsifiability`; values are severity | ||
| levels (`critical`, `warning`, `info`). The per-type severity anchors live only in the | ||
| bundled registry YAML files — consult those, not this document, for current values. | ||
|
|
||
| - `estimand_clarity` agent: "Can the claim be written as a formal contrast (A vs B on Y in Z)?" | ||
| Reference the exp-lens-estimand-clarity philosophical mode as guidance (do NOT invoke | ||
| the skill — reference its lens question only in the subagent prompt). | ||
| Use the calibration rubric above to assign severity. For `causal_inference`: absent formal | ||
| estimand = `critical`. For `benchmark`/`configuration_study`/`robustness_audit`: absent | ||
| formal estimand = `warning` (informal contrast sufficient). For `exploratory`: absent | ||
| estimand = `info` (intentionally absent). | ||
| Use the `l1_severity.estimand_clarity` value from the registry to assign severity. | ||
| - `hypothesis_falsifiability` agent: "What result would cause the author to conclude H0?" | ||
| Use the calibration rubric above. For `causal_inference`: unfalsifiable hypothesis = | ||
| `critical`. For `benchmark`/`configuration_study`/`robustness_audit`: comparison goal | ||
| without formal H0 = `warning`. For `exploratory`: absent H0/H1 = `info` | ||
| (pre-registration not required). | ||
| Use the `l1_severity.hypothesis_falsifiability` value from the registry to assign severity. | ||
|
|
||
| Each subagent returns findings in the standard JSON structure (see Finding Format below). | ||
|
|
||
|
|
@@ -265,12 +261,8 @@ Receives: full plan text and `experiment_type` (from Step 1 triage output) | |
| 3. **Asymmetric tuning** — proposed method tuned against eval while baselines use defaults | ||
| 4. **Survivorship bias** — cherry-picking best run from multiple seeds | ||
| 5. **Evaluation collision** — same infrastructure in both treatment and measurement | ||
| - Type-specific focus per experiment type: | ||
| - benchmark → asymmetric effort | ||
| - configuration_study → overfitting to held-out set | ||
| - causal_inference → unblocked backdoor path | ||
| - robustness_audit → unrealistic threat distribution | ||
| - exploratory → HARKing vulnerability | ||
| - Type-specific focus: use `red_team_focus.specific` from the matched type's registry | ||
| entry (loaded in Step 0). | ||
| - ALL red-team findings must set `"requires_decision": true` and `"dimension": "red_team"` | ||
|
|
||
| **Red-team severity calibration rubric:** | ||
|
|
@@ -413,14 +405,8 @@ One synthesis pass (no subagent — orchestrator synthesizes directly): | |
| the same issue from inflating finding counts and obscuring distinct problems. | ||
| 4. **Apply red-team severity cap, then verdict logic**: | ||
| ```python | ||
| # Red-team severity cap: downgrade findings above the type ceiling | ||
| RT_MAX_SEVERITY = { | ||
| "causal_inference": "critical", | ||
| "benchmark": "warning", | ||
| "configuration_study": "warning", | ||
| "robustness_audit": "warning", | ||
| "exploratory": "info", | ||
| } | ||
| # RT_MAX_SEVERITY is built from the registry loaded in Step 0 (dict-of-dicts from YAML parsing): | ||
| RT_MAX_SEVERITY = {name: spec["red_team_focus"]["severity_cap"] for name, spec in registry.items()} | ||
| SEVERITY_RANK = {"info": 0, "warning": 1, "critical": 2} | ||
| rt_cap = RT_MAX_SEVERITY[experiment_type] | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[warning] defense:
dataparameter is typed as baredictinstead ofdict[str, object]. Widen the type annotation so mypy can verify the signature correctly.