From 473ccb31cd367475b2d6a2f311d749171c892955 Mon Sep 17 00:00:00 2001 From: Eric Windmill Date: Mon, 30 Mar 2026 10:25:12 -0700 Subject: [PATCH 1/2] refactor: centralize dataset and task configuration logic into new shared `dataset_config_python.hydrate` module --- README.md | 1 + docs/contributing/packages/dash_evals.md | 7 +- docs/contributing/repository_structure.md | 2 +- docs/guides/about_the_framework.md | 24 +- packages/dash_evals/pyproject.toml | 1 + .../src/dash_evals/runner/json_runner.py | 68 +--- .../dash_evals/runner/tasks/task_helpers.py | 176 +-------- packages/dash_evals/tests/test_json_runner.py | 8 +- packages/dataset_config_dart/README.md | 6 +- packages/dataset_config_python/pyproject.toml | 1 + .../src/dataset_config_python/__init__.py | 13 +- .../src/dataset_config_python/hydrate.py | 273 ++++++++++++++ .../tests/test_hydrate.py | 344 ++++++++++++++++++ 13 files changed, 669 insertions(+), 255 deletions(-) create mode 100644 packages/dataset_config_python/src/dataset_config_python/hydrate.py create mode 100644 packages/dataset_config_python/tests/test_hydrate.py diff --git a/README.md b/README.md index 891058a..b66113f 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ This repo includes - **eval runner** — Python package for running LLM evaluations with configurable tasks, variants, and models - **config packages** — Dart and Python packages that resolve dataset YAML into EvalSet JSON for the runner + - **NB**: These packages largely overlap, and coexist for backwards compatibility purposes. In time, the Dart package will be deprecated. 
- **devals CLI** — Dart CLI for creating and managing dataset samples, tasks, and jobs - **Evaluation Explorer** — Dart/Flutter app for browsing and analyzing results diff --git a/docs/contributing/packages/dash_evals.md b/docs/contributing/packages/dash_evals.md index 3fd714a..aebdeed 100644 --- a/docs/contributing/packages/dash_evals.md +++ b/docs/contributing/packages/dash_evals.md @@ -41,9 +41,10 @@ src/dash_evals/ 1. **Configure**: The Dart `dataset_config_dart` package parses dataset YAML and resolves it into an `EvalSet` JSON manifest 2. **Load**: The Python runner reads the JSON manifest via `json_runner.py`, resolving task functions dynamically with `importlib` -3. **Execute**: Each task function receives its dataset and task definition, producing an `inspect_ai.Task` -4. **Score**: Scorers evaluate model outputs against targets -5. **Log**: Results written to the configured `log_dir` +3. **Hydrate**: Config dicts are converted to Inspect AI objects (datasets, MCP servers, skills) using shared helpers from `dataset_config_python.hydrate` +4. **Execute**: Each task function receives its dataset and task definition, producing an `inspect_ai.Task` +5. **Score**: Scorers evaluate model outputs against targets +6. **Log**: Results written to the configured `log_dir` Alternatively, the runner can be invoked directly with `--task` and `--model` arguments (via `args_runner.py`), bypassing the Dart config pipeline. 
diff --git a/docs/contributing/repository_structure.md b/docs/contributing/repository_structure.md index e4a0730..c1df438 100644 --- a/docs/contributing/repository_structure.md +++ b/docs/contributing/repository_structure.md @@ -10,7 +10,7 @@ evals/ │ ├── devals_cli/ # Dart CLI for managing dataset (devals) │ ├── dataset_config_dart/ # Dart library: YAML → EvalSet JSON │ ├── dash_evals/ # Python evaluation runner -│ ├── dataset_config_python/ # Python configuration models +│ ├── dataset_config_python/ # Python config: YAML → EvalSet JSON + config → Inspect AI objects │ └── eval_explorer/ # Dart/Flutter results viewer (Serverpod) ├── tool/ # Utility scripts ├── pubspec.yaml # Dart workspace configuration diff --git a/docs/guides/about_the_framework.md b/docs/guides/about_the_framework.md index 7eed13e..ff19a4f 100644 --- a/docs/guides/about_the_framework.md +++ b/docs/guides/about_the_framework.md @@ -18,6 +18,7 @@ YAML config → Dart resolver → JSON manifest → Python runner → Inspect AI |-------|---------|-------------| | **YAML config** | — | Your `task.yaml` and `job.yaml` files | | **Dart resolver** | `dataset_config_dart` | Parses YAML, resolves globs and references, produces a JSON manifest | +| **Hydration** | `dataset_config_python` | Converts config dicts into Inspect AI objects (datasets, MCP servers, skills) | | **Python runner** | `dash_evals` | Reads the manifest, builds Inspect AI `Task` objects, calls `eval_set()` | | **Inspect AI** | `inspect_ai` | Runs solver chains, sends prompts, collects responses, runs scorers | @@ -148,16 +149,19 @@ calling `submit()`. 
## Shared helpers -The `task_helpers.py` module contains functions used across all tasks: - -| Helper | What it does | -|--------|-------------| -| `append_context_injection(chain, config)` | Adds a `context_injector` solver if the variant has `files` | -| `append_model_interaction(chain, config)` | Adds `react()` (if tools exist) or `generate()` (if not) | -| `get_skill_tool(config)` | Creates a skill tool if the variant has `skills` configured | -| `build_task_metadata(config)` | Builds the metadata dict for the `Task` object | -| `create_mcp_servers(configs, sandbox_type)` | Creates MCP server objects from variant config | -| `validate_sandbox_tools(config, tool_names)` | Checks that sandbox-requiring tools aren't used on local | +The `task_helpers.py` module contains functions used across all tasks. Some of +these are re-exported from `dataset_config_python.hydrate` — the shared +config-interpretation layer that both `dash_evals` and external consumers (like +yardstick) use to ensure consistent hydration of config into Inspect AI objects. + +| Helper | Source | What it does | +|--------|--------|-------------| +| `create_mcp_servers(configs, sandbox_type)` | `dataset_config_python` | Creates MCP server objects from variant config | +| `get_skill_tool(config)` | `dataset_config_python` | Creates a skill tool if the variant has `skills` configured | +| `build_task_metadata(config)` | `dataset_config_python` | Builds the metadata dict for the `Task` object | +| `append_context_injection(chain, config)` | `dash_evals` | Adds a `context_injector` solver if the variant has `files` | +| `append_model_interaction(chain, config)` | `dash_evals` | Adds `react()` (if tools exist) or `generate()` (if not) | +| `validate_sandbox_tools(config, tool_names)` | `dash_evals` | Checks that sandbox-requiring tools aren't used on local | These helpers mean that most of the variant logic (context injection, MCP tools, skills) is handled **automatically**. 
You just need to define the core solver diff --git a/packages/dash_evals/pyproject.toml b/packages/dash_evals/pyproject.toml index ac2f9f2..b3a3e6a 100644 --- a/packages/dash_evals/pyproject.toml +++ b/packages/dash_evals/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "openai>=2.8.1,<3.0.0", "firebase-admin>=6.0.0,<8.0.0", "pydantic>=2.0.0,<3.0.0", + "dataset-config-python", ] [project.optional-dependencies] diff --git a/packages/dash_evals/src/dash_evals/runner/json_runner.py b/packages/dash_evals/src/dash_evals/runner/json_runner.py index 828d048..41817b3 100644 --- a/packages/dash_evals/src/dash_evals/runner/json_runner.py +++ b/packages/dash_evals/src/dash_evals/runner/json_runner.py @@ -11,7 +11,7 @@ from pathlib import Path import inspect_ai -from inspect_ai.dataset import MemoryDataset, Sample, csv_dataset, json_dataset +from dataset_config_python.hydrate import build_dataset as _build_dataset from dash_evals.utils.logging import capture_output, setup_logging @@ -94,74 +94,8 @@ def _resolve_task_func(name: str): return func -def _build_dataset(task_def: dict): - """Build an Inspect AI dataset from a task definition. - Dispatches on ``task_def["dataset"]["format"]``: - - ``"memory"`` (default): builds a ``MemoryDataset`` from inline samples. - - ``"json"``: delegates to ``inspect_ai.dataset.json_dataset(source, **args)``. - - ``"csv"``: delegates to ``inspect_ai.dataset.csv_dataset(source, **args)``. - - Args: - task_def: A task entry from the EvalSet JSON manifest. - - Returns: - An Inspect AI dataset object. - - Raises: - ValueError: If the dataset format is unrecognized or required fields - (e.g. ``source`` for json/csv) are missing. 
- """ - dataset_def = task_def.get("dataset") - task_name = task_def.get("name", "") - - if not dataset_def: - return MemoryDataset([], name=task_name) - - fmt = dataset_def.get("format", "memory") - extra_args: dict = dataset_def.get("args") or {} - - if fmt == "json": - source = dataset_def.get("source") - if not source: - raise ValueError( - f"Task '{task_name}': dataset format 'json' requires a 'source' field." - ) - return json_dataset(source, **extra_args) - - if fmt == "csv": - source = dataset_def.get("source") - if not source: - raise ValueError( - f"Task '{task_name}': dataset format 'csv' requires a 'source' field." - ) - return csv_dataset(source, **extra_args) - - if fmt == "memory": - raw_samples = dataset_def.get("samples", []) - samples = [] - for raw in raw_samples: - sample = Sample( - input=raw["input"], - target=raw.get("target", ""), - id=raw.get("id"), - metadata=raw.get("metadata"), - files=raw.get("files"), - setup=raw.get("setup"), - sandbox=raw.get("sandbox"), - ) - samples.append(sample) - - return MemoryDataset( - samples, - name=dataset_def.get("name", task_name), - ) - - raise ValueError( - f"Task '{task_name}': unknown dataset format '{fmt}'. " - f"Expected one of: 'memory', 'json', 'csv'." - ) def run_from_json(manifest_path: str | Path) -> bool: diff --git a/packages/dash_evals/src/dash_evals/runner/tasks/task_helpers.py b/packages/dash_evals/src/dash_evals/runner/tasks/task_helpers.py index bca2517..767a06e 100644 --- a/packages/dash_evals/src/dash_evals/runner/tasks/task_helpers.py +++ b/packages/dash_evals/src/dash_evals/runner/tasks/task_helpers.py @@ -11,18 +11,22 @@ from __future__ import annotations -import importlib -from typing import Any, cast - +from typing import cast + +# Re-export config-interpretation helpers from the shared package. +# These are the single source of truth for interpreting config dicts +# as Inspect AI objects; both dash_evals and yardstick use them. 
+from dataset_config_python.hydrate import ( # noqa: F401 + build_task_metadata, + create_mcp_servers, + get_skill_tool, +) from inspect_ai.agent import react from inspect_ai.solver import Solver, generate from inspect_ai.tool import ( MCPServer, Tool, - mcp_server_http, - mcp_server_sandbox, mcp_server_stdio, - skill, ) from dash_evals.runner.solvers import context_injector @@ -66,127 +70,6 @@ def validate_sandbox_tools(config: dict, tool_names: list[str]) -> None: ) -def _resolve_mcp_ref(ref: str) -> MCPServer: - """Resolve a Python import reference to an MCPServer object. - - Supports ``"module.path:variable_name"`` format. - - Args: - ref: Import reference (e.g. ``"my_package.mcp:staging_server"``). - - Returns: - The resolved MCPServer object. - """ - if ":" not in ref: - raise ValueError( - f"Invalid MCP server ref '{ref}'. Expected format: 'module.path:variable_name'" - ) - module_path, attr_name = ref.rsplit(":", 1) - try: - module = importlib.import_module(module_path) - except ImportError as e: - raise ImportError( - f"Could not import module '{module_path}' for MCP server ref '{ref}': {e}" - ) from e - try: - server = getattr(module, attr_name) - except AttributeError as e: - raise AttributeError( - f"Module '{module_path}' has no attribute '{attr_name}' " - f"(referenced by MCP server ref '{ref}')" - ) from e - return server - - -def create_mcp_servers( - mcp_configs: list[dict], - sandbox_type: str = "local", -) -> list[MCPServer]: - """Create MCP server objects from variant config. - - Supports three modes per entry: - - **Declarative stdio/sandbox**: dict with ``command``, ``args``, etc. - - **Declarative HTTP**: dict with ``url``, and optionally ``authorization``/``headers``. - - **Python ref**: dict with ``ref`` key pointing to a pre-built MCPServer. 
- - Transport is auto-selected when not explicit: - - If ``url`` is present → ``mcp_server_http`` - - If sandbox is non-local → ``mcp_server_sandbox`` - - Otherwise → ``mcp_server_stdio`` - - Args: - mcp_configs: List of MCP server config dicts from variant_config. - sandbox_type: The sandbox type for the current eval run. - - Returns: - List of MCPServer objects. - """ - servers: list[MCPServer] = [] - for cfg in mcp_configs: - # Ref mode — import a pre-built MCPServer from Python - if cfg.get("ref"): - servers.append(_resolve_mcp_ref(cfg["ref"])) - continue - - # HTTP mode — url-based server - url = cfg.get("url") - if url: - name = cfg.get("name", url) - authorization = cfg.get("authorization") or cfg.get("auth") - headers = cfg.get("headers") - servers.append( - mcp_server_http( - url=url, - name=name, - authorization=authorization, - headers=headers, - ) - ) - continue - - # Stdio / sandbox mode — command-based server - command = cfg.get("command") - if not command: - raise ValueError( - f"MCP server config missing 'command' or 'url' for server " - f"'{cfg.get('name', 'unknown')}': {cfg}" - ) - - name = cfg.get("name", command) - args = cfg.get("args", []) - env = cfg.get("env") - cwd = cfg.get("cwd") - - transport = cfg.get("transport") - if transport is None: - transport = "sandbox" if sandbox_type != "local" else "stdio" - - if transport == "stdio": - servers.append( - mcp_server_stdio( - name=name, - command=command, - args=args, - env=env, - cwd=cwd, - ) - ) - elif transport == "sandbox": - servers.append( - mcp_server_sandbox( - name=name, - command=command, - args=args, - env=env, - cwd=cwd, - ) - ) - else: - raise ValueError(f"Unknown MCP transport '{transport}' for server '{name}'") - - return servers - - # Backwards-compatible alias def create_mcp_server(config: dict | None = None): """Create the default Dart MCP server (backwards-compatible alias).""" @@ -202,28 +85,6 @@ def create_dart_mcp_server(): return create_mcp_server() -def 
build_task_metadata(config: dict) -> dict: - """Build task metadata dictionary from manifest config. - - Args: - config: Task manifest entry with 'variant', 'save_examples', etc. - - Returns: - Metadata dictionary for Task. - """ - metadata: dict[str, Any] = {} - variant = config.get("variant", {}) - if variant: - metadata["variant_config"] = variant - - if config.get("save_examples") and config.get("examples_dir"): - metadata["save_examples"] = True - metadata["examples_dir"] = config["examples_dir"] - metadata["task_variant"] = config.get("task_name", "unknown") - - return metadata - - def append_context_injection(solver_chain: list, config: dict) -> None: """Append context injection solver if the variant has context files. @@ -238,23 +99,6 @@ def append_context_injection(solver_chain: list, config: dict) -> None: solver_chain.append(context_injector(context_files)) -def get_skill_tool(config: dict) -> Tool | None: - """Create the skill tool if the variant has skills configured. - - Args: - config: Task manifest entry with 'variant' key. - - Returns: - The skill Tool, or None if no skills are configured. 
- """ - variant = config.get("variant", {}) - # Support both old "skill_paths" and new "skills" key - skill_paths = variant.get("skills") or variant.get("skill_paths", []) - if skill_paths: - return skill(skill_paths) - return None - - def append_model_interaction( solver_chain: list, config: dict, diff --git a/packages/dash_evals/tests/test_json_runner.py b/packages/dash_evals/tests/test_json_runner.py index 067f30c..9cf7b57 100644 --- a/packages/dash_evals/tests/test_json_runner.py +++ b/packages/dash_evals/tests/test_json_runner.py @@ -126,7 +126,7 @@ def test_json_format_calls_json_dataset(self): }, } mock_ds = MagicMock(name="json_dataset_result") - with patch("dash_evals.runner.json_runner.json_dataset", return_value=mock_ds) as mock_fn: + with patch("dataset_config_python.hydrate.json_dataset", return_value=mock_ds) as mock_fn: result = _build_dataset(task_def) mock_fn.assert_called_once_with("gs://bucket/data.jsonl") @@ -142,7 +142,7 @@ def test_json_format_passes_extra_args(self): "args": {"auto_id": True, "shuffle": True}, }, } - with patch("dash_evals.runner.json_runner.json_dataset") as mock_fn: + with patch("dataset_config_python.hydrate.json_dataset") as mock_fn: _build_dataset(task_def) mock_fn.assert_called_once_with("./data.jsonl", auto_id=True, shuffle=True) @@ -170,7 +170,7 @@ def test_csv_format_calls_csv_dataset(self): }, } mock_ds = MagicMock(name="csv_dataset_result") - with patch("dash_evals.runner.json_runner.csv_dataset", return_value=mock_ds) as mock_fn: + with patch("dataset_config_python.hydrate.csv_dataset", return_value=mock_ds) as mock_fn: result = _build_dataset(task_def) mock_fn.assert_called_once_with("./data.csv") @@ -186,7 +186,7 @@ def test_csv_format_passes_extra_args(self): "args": {"delimiter": "\t", "encoding": "utf-8"}, }, } - with patch("dash_evals.runner.json_runner.csv_dataset") as mock_fn: + with patch("dataset_config_python.hydrate.csv_dataset") as mock_fn: _build_dataset(task_def) 
mock_fn.assert_called_once_with("./data.csv", delimiter="\t", encoding="utf-8") diff --git a/packages/dataset_config_dart/README.md b/packages/dataset_config_dart/README.md index 3816eca..378b0ef 100644 --- a/packages/dataset_config_dart/README.md +++ b/packages/dataset_config_dart/README.md @@ -1,2 +1,4 @@ -A sample command-line application with an entrypoint in `bin/`, library code -in `lib/`, and example unit test in `test/`. +# Dataset Config management library - Dart implementation + +> [!CAUTION] +> This library will be deprecated eventually. It almost entirely overlaps with the Python implementation, and the Python library should be considered the canonical package. \ No newline at end of file diff --git a/packages/dataset_config_python/pyproject.toml b/packages/dataset_config_python/pyproject.toml index 553eb3f..6a00247 100644 --- a/packages/dataset_config_python/pyproject.toml +++ b/packages/dataset_config_python/pyproject.toml @@ -8,6 +8,7 @@ requires-python = ">=3.13,<4.0.0" dependencies = [ "pyyaml>=6.0.3,<7.0.0", "pydantic>=2.0.0,<3.0.0", + "inspect-ai>=0.3.142,<0.4.0", ] [project.optional-dependencies] diff --git a/packages/dataset_config_python/src/dataset_config_python/__init__.py b/packages/dataset_config_python/src/dataset_config_python/__init__.py index e6dd675..9f60116 100644 --- a/packages/dataset_config_python/src/dataset_config_python/__init__.py +++ b/packages/dataset_config_python/src/dataset_config_python/__init__.py @@ -2,10 +2,14 @@ Reads YAML config files (jobs, tasks, samples) and produces the EvalSet JSON that dash_evals consumes. - -No Dart SDK or Inspect AI dependency required. 
""" +from dataset_config_python.hydrate import ( + build_dataset, + build_task_metadata, + create_mcp_servers, + get_skill_tool, +) from dataset_config_python.parser import ParsedTask, find_job_file, parse_job, parse_tasks from dataset_config_python.resolver import ( DEFAULT_SANDBOX_REGISTRY, @@ -19,10 +23,15 @@ "DEFAULT_SANDBOX_REGISTRY", "ParsedTask", "SandboxConfig", + "build_dataset", + "build_task_metadata", + "create_mcp_servers", "find_job_file", + "get_skill_tool", "parse_job", "parse_tasks", "resolve", "resolve_from_parsed", "write_eval_sets", ] + diff --git a/packages/dataset_config_python/src/dataset_config_python/hydrate.py b/packages/dataset_config_python/src/dataset_config_python/hydrate.py new file mode 100644 index 0000000..0ca148a --- /dev/null +++ b/packages/dataset_config_python/src/dataset_config_python/hydrate.py @@ -0,0 +1,273 @@ +"""Hydrate — convert resolved config dicts into Inspect AI objects. + +This module is the single source of truth for interpreting config structures +(datasets, MCP servers, skills, metadata) as Inspect AI objects. Both +``dash_evals`` and external consumers (e.g. yardstick) should use these +helpers rather than re-implementing the same logic. + +No solver or task-execution logic lives here — only config → object conversion. +""" + +from __future__ import annotations + +import importlib +from typing import Any + +from inspect_ai.dataset import MemoryDataset, Sample, csv_dataset, json_dataset +from inspect_ai.tool import ( + MCPServer, + Tool, + mcp_server_http, + mcp_server_sandbox, + mcp_server_stdio, + skill, +) + +# --------------------------------------------------------------------------- +# Dataset hydration +# --------------------------------------------------------------------------- + + +def build_dataset(task_def: dict) -> Any: + """Build an Inspect AI dataset from a task definition dict. 
+ + Dispatches on ``task_def["dataset"]["format"]``: + + - ``"memory"`` (default): builds a ``MemoryDataset`` from inline samples. + - ``"json"``: delegates to ``inspect_ai.dataset.json_dataset(source, **args)``. + - ``"csv"``: delegates to ``inspect_ai.dataset.csv_dataset(source, **args)``. + + Args: + task_def: A task entry from the EvalSet JSON manifest. + + Returns: + An Inspect AI dataset object. + + Raises: + ValueError: If the dataset format is unrecognised or required fields + (e.g. ``source`` for json/csv) are missing. + """ + dataset_def = task_def.get("dataset") + task_name = task_def.get("name", "") + + if not dataset_def: + return MemoryDataset([], name=task_name) + + fmt = dataset_def.get("format", "memory") + extra_args: dict = dataset_def.get("args") or {} + + if fmt == "json": + source = dataset_def.get("source") + if not source: + raise ValueError( + f"Task '{task_name}': dataset format 'json' requires a 'source' field." + ) + return json_dataset(source, **extra_args) + + if fmt == "csv": + source = dataset_def.get("source") + if not source: + raise ValueError(f"Task '{task_name}': dataset format 'csv' requires a 'source' field.") + return csv_dataset(source, **extra_args) + + if fmt == "memory": + raw_samples = dataset_def.get("samples", []) + samples = [] + for raw in raw_samples: + sample = Sample( + input=raw["input"], + target=raw.get("target", ""), + id=raw.get("id"), + metadata=raw.get("metadata"), + files=raw.get("files"), + setup=raw.get("setup"), + sandbox=raw.get("sandbox"), + ) + samples.append(sample) + + return MemoryDataset( + samples, + name=dataset_def.get("name", task_name), + ) + + raise ValueError( + f"Task '{task_name}': unknown dataset format '{fmt}'. " + f"Expected one of: 'memory', 'json', 'csv'." 
+ ) + + +# --------------------------------------------------------------------------- +# MCP server hydration +# --------------------------------------------------------------------------- + + +def _resolve_mcp_ref(ref: str) -> MCPServer: + """Resolve a Python import reference to an MCPServer object. + + Supports ``"module.path:variable_name"`` format. + + Args: + ref: Import reference (e.g. ``"my_package.mcp:staging_server"``). + + Returns: + The resolved MCPServer object. + """ + if ":" not in ref: + raise ValueError( + f"Invalid MCP server ref '{ref}'. Expected format: 'module.path:variable_name'" + ) + module_path, attr_name = ref.rsplit(":", 1) + try: + module = importlib.import_module(module_path) + except ImportError as e: + raise ImportError( + f"Could not import module '{module_path}' for MCP server ref '{ref}': {e}" + ) from e + try: + server = getattr(module, attr_name) + except AttributeError as e: + raise AttributeError( + f"Module '{module_path}' has no attribute '{attr_name}' " + f"(referenced by MCP server ref '{ref}')" + ) from e + return server + + +def create_mcp_servers( + mcp_configs: list[dict], + sandbox_type: str = "local", +) -> list[MCPServer]: + """Create MCP server objects from variant config. + + Supports three modes per entry: + + - **Declarative stdio/sandbox**: dict with ``command``, ``args``, etc. + - **Declarative HTTP**: dict with ``url``, and optionally ``authorization``/``headers``. + - **Python ref**: dict with ``ref`` key pointing to a pre-built MCPServer. + + Transport is auto-selected when not explicit: + + - If ``url`` is present → ``mcp_server_http`` + - If sandbox is non-local → ``mcp_server_sandbox`` + - Otherwise → ``mcp_server_stdio`` + + Args: + mcp_configs: List of MCP server config dicts from variant_config. + sandbox_type: The sandbox type for the current eval run. + + Returns: + List of MCPServer objects. 
+ """ + servers: list[MCPServer] = [] + for cfg in mcp_configs: + # Ref mode — import a pre-built MCPServer from Python + if cfg.get("ref"): + servers.append(_resolve_mcp_ref(cfg["ref"])) + continue + + # HTTP mode — url-based server + url = cfg.get("url") + if url: + name = cfg.get("name", url) + authorization = cfg.get("authorization") or cfg.get("auth") + headers = cfg.get("headers") + servers.append( + mcp_server_http( + url=url, + name=name, + authorization=authorization, + headers=headers, + ) + ) + continue + + # Stdio / sandbox mode — command-based server + command = cfg.get("command") + if not command: + raise ValueError( + f"MCP server config missing 'command' or 'url' for server " + f"'{cfg.get('name', 'unknown')}': {cfg}" + ) + + name = cfg.get("name", command) + args = cfg.get("args", []) + env = cfg.get("env") + cwd = cfg.get("cwd") + + transport = cfg.get("transport") + if transport is None: + transport = "sandbox" if sandbox_type != "local" else "stdio" + + if transport == "stdio": + servers.append( + mcp_server_stdio( + name=name, + command=command, + args=args, + env=env, + cwd=cwd, + ) + ) + elif transport == "sandbox": + servers.append( + mcp_server_sandbox( + name=name, + command=command, + args=args, + env=env, + cwd=cwd, + ) + ) + else: + raise ValueError(f"Unknown MCP transport '{transport}' for server '{name}'") + + return servers + + +# --------------------------------------------------------------------------- +# Skill tool hydration +# --------------------------------------------------------------------------- + + +def get_skill_tool(config: dict) -> Tool | None: + """Create the skill tool if the variant has skills configured. + + Args: + config: Task manifest entry with 'variant' key. + + Returns: + The skill Tool, or None if no skills are configured. 
+ """ + variant = config.get("variant", {}) + # Support both old "skill_paths" and new "skills" key + skill_paths = variant.get("skills") or variant.get("skill_paths", []) + if skill_paths: + return skill(skill_paths) + return None + + +# --------------------------------------------------------------------------- +# Task metadata +# --------------------------------------------------------------------------- + + +def build_task_metadata(config: dict) -> dict: + """Build task metadata dictionary from manifest config. + + Args: + config: Task manifest entry with 'variant', 'save_examples', etc. + + Returns: + Metadata dictionary for Task. + """ + metadata: dict[str, Any] = {} + variant = config.get("variant", {}) + if variant: + metadata["variant_config"] = variant + + if config.get("save_examples") and config.get("examples_dir"): + metadata["save_examples"] = True + metadata["examples_dir"] = config["examples_dir"] + metadata["task_variant"] = config.get("task_name", "unknown") + + return metadata diff --git a/packages/dataset_config_python/tests/test_hydrate.py b/packages/dataset_config_python/tests/test_hydrate.py new file mode 100644 index 0000000..9b2559f --- /dev/null +++ b/packages/dataset_config_python/tests/test_hydrate.py @@ -0,0 +1,344 @@ +"""Tests for dataset_config_python.hydrate — config → Inspect AI object conversion.""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest +from inspect_ai.dataset import MemoryDataset + +from dataset_config_python.hydrate import ( + build_dataset, + build_task_metadata, + create_mcp_servers, + get_skill_tool, +) + +# =========================================================================== +# build_dataset +# =========================================================================== + + +class TestBuildDatasetMemoryFormat: + """Tests for inline MemoryDataset (format='memory').""" + + def test_no_dataset_returns_empty_memory_dataset(self): + """Tasks without a dataset 
key produce an empty MemoryDataset.""" + task_def = {"name": "my_task:baseline", "func": "question_answer"} + result = build_dataset(task_def) + assert isinstance(result, MemoryDataset) + assert len(result) == 0 + + def test_empty_dataset_dict_returns_empty_memory_dataset(self): + """An empty dataset dict produces an empty MemoryDataset.""" + task_def = {"name": "my_task:baseline", "dataset": {}} + result = build_dataset(task_def) + assert isinstance(result, MemoryDataset) + assert len(result) == 0 + + def test_memory_format_explicit(self): + """Explicit format='memory' builds a MemoryDataset from inline samples.""" + task_def = { + "name": "my_task:baseline", + "dataset": { + "format": "memory", + "samples": [ + {"id": "s1", "input": "What is Dart?", "target": "A language"}, + ], + }, + } + result = build_dataset(task_def) + assert isinstance(result, MemoryDataset) + assert len(result) == 1 + assert result[0].input == "What is Dart?" + assert result[0].target == "A language" + assert result[0].id == "s1" + + def test_memory_format_default_when_format_absent(self): + """Omitting 'format' defaults to memory format.""" + task_def = { + "name": "my_task:baseline", + "dataset": { + "samples": [ + {"id": "s1", "input": "q", "target": "a"}, + ], + }, + } + result = build_dataset(task_def) + assert isinstance(result, MemoryDataset) + assert len(result) == 1 + + def test_memory_format_preserves_optional_sample_fields(self): + """Optional sample fields (metadata, files, setup, sandbox) are passed through.""" + task_def = { + "name": "t:v", + "dataset": { + "samples": [ + { + "id": "s1", + "input": "q", + "target": "a", + "metadata": {"difficulty": "hard"}, + "files": {"/workspace": "./proj"}, + "setup": "dart pub get", + "sandbox": "docker", + } + ], + }, + } + result = build_dataset(task_def) + sample = result[0] + assert sample.metadata == {"difficulty": "hard"} + assert sample.files == {"/workspace": "./proj"} + assert sample.setup == "dart pub get" + sandbox = 
sample.sandbox + sandbox_type = sandbox.type if hasattr(sandbox, "type") else sandbox + assert sandbox_type == "docker" + + def test_memory_format_dataset_name(self): + """Dataset name falls back to task name when not set in dataset dict.""" + task_def = { + "name": "dart_qa:baseline", + "dataset": { + "samples": [], + }, + } + result = build_dataset(task_def) + assert isinstance(result, MemoryDataset) + assert result.name == "dart_qa:baseline" + + def test_memory_format_explicit_dataset_name_wins(self): + """Explicit dataset name takes precedence over task name.""" + task_def = { + "name": "dart_qa:baseline", + "dataset": { + "name": "custom_name", + "samples": [], + }, + } + result = build_dataset(task_def) + assert result.name == "custom_name" + + +class TestBuildDatasetJsonFormat: + """Tests for JSON file-backed dataset (format='json').""" + + def test_json_format_calls_json_dataset(self): + """format='json' calls inspect_ai.dataset.json_dataset(source).""" + task_def = { + "name": "my_task:baseline", + "dataset": { + "format": "json", + "source": "gs://bucket/data.jsonl", + }, + } + mock_ds = MagicMock(name="json_dataset_result") + with patch("dataset_config_python.hydrate.json_dataset", return_value=mock_ds) as mock_fn: + result = build_dataset(task_def) + + mock_fn.assert_called_once_with("gs://bucket/data.jsonl") + assert result is mock_ds + + def test_json_format_passes_extra_args(self): + """Extra args from dataset.args are passed as kwargs to json_dataset().""" + task_def = { + "name": "t:v", + "dataset": { + "format": "json", + "source": "./data.jsonl", + "args": {"auto_id": True, "shuffle": True}, + }, + } + with patch("dataset_config_python.hydrate.json_dataset") as mock_fn: + build_dataset(task_def) + + mock_fn.assert_called_once_with("./data.jsonl", auto_id=True, shuffle=True) + + def test_json_format_missing_source_raises(self): + """format='json' without a source raises ValueError.""" + task_def = { + "name": "my_task:baseline", + "dataset": 
{"format": "json"}, + } + with pytest.raises(ValueError, match="requires a 'source' field"): + build_dataset(task_def) + + +class TestBuildDatasetCsvFormat: + """Tests for CSV file-backed dataset (format='csv').""" + + def test_csv_format_calls_csv_dataset(self): + """format='csv' calls inspect_ai.dataset.csv_dataset(source).""" + task_def = { + "name": "my_task:baseline", + "dataset": { + "format": "csv", + "source": "./data.csv", + }, + } + mock_ds = MagicMock(name="csv_dataset_result") + with patch("dataset_config_python.hydrate.csv_dataset", return_value=mock_ds) as mock_fn: + result = build_dataset(task_def) + + mock_fn.assert_called_once_with("./data.csv") + assert result is mock_ds + + def test_csv_format_passes_extra_args(self): + """Extra args from dataset.args are passed as kwargs to csv_dataset().""" + task_def = { + "name": "t:v", + "dataset": { + "format": "csv", + "source": "./data.csv", + "args": {"delimiter": "\t", "encoding": "utf-8"}, + }, + } + with patch("dataset_config_python.hydrate.csv_dataset") as mock_fn: + build_dataset(task_def) + + mock_fn.assert_called_once_with("./data.csv", delimiter="\t", encoding="utf-8") + + def test_csv_format_missing_source_raises(self): + """format='csv' without a source raises ValueError.""" + task_def = { + "name": "my_task:baseline", + "dataset": {"format": "csv"}, + } + with pytest.raises(ValueError, match="requires a 'source' field"): + build_dataset(task_def) + + +class TestBuildDatasetUnknownFormat: + """Tests for unknown dataset formats.""" + + def test_unknown_format_raises(self): + """An unrecognised format string raises ValueError.""" + task_def = { + "name": "my_task:baseline", + "dataset": { + "format": "parquet", + "source": "./data.parquet", + }, + } + with pytest.raises(ValueError, match="unknown dataset format 'parquet'"): + build_dataset(task_def) + + +# =========================================================================== +# create_mcp_servers +# 
=========================================================================== + + +class TestCreateMcpServers: + """Tests for MCP server creation from config dicts.""" + + def test_empty_list_returns_empty(self): + result = create_mcp_servers([]) + assert result == [] + + def test_stdio_server_local(self): + """Local sandbox defaults to stdio transport.""" + cfg = [{"command": "dart", "args": ["mcp-server"], "name": "Dart"}] + servers = create_mcp_servers(cfg, sandbox_type="local") + assert len(servers) == 1 + + def test_sandbox_server_non_local(self): + """Non-local sandbox defaults to sandbox transport.""" + cfg = [{"command": "dart", "args": ["mcp-server"], "name": "Dart"}] + servers = create_mcp_servers(cfg, sandbox_type="podman") + assert len(servers) == 1 + + def test_http_server(self): + """URL-based config produces an HTTP server.""" + cfg = [{"url": "http://localhost:8080", "name": "test"}] + servers = create_mcp_servers(cfg) + assert len(servers) == 1 + + def test_ref_server(self): + """Ref mode imports a pre-built MCPServer.""" + mock_server = MagicMock() + with patch( + "dataset_config_python.hydrate._resolve_mcp_ref", + return_value=mock_server, + ): + servers = create_mcp_servers([{"ref": "my_pkg:my_server"}]) + assert len(servers) == 1 + assert servers[0] is mock_server + + def test_missing_command_and_url_raises(self): + """Config without command or url raises ValueError.""" + with pytest.raises(ValueError, match="missing 'command' or 'url'"): + create_mcp_servers([{"name": "broken"}]) + + def test_unknown_transport_raises(self): + """Unknown transport value raises ValueError.""" + with pytest.raises(ValueError, match="Unknown MCP transport"): + create_mcp_servers([{"command": "dart", "name": "test", "transport": "quantum"}]) + + +# =========================================================================== +# get_skill_tool +# =========================================================================== + + +class TestGetSkillTool: + """Tests for skill 
tool creation from config.""" + + def test_no_variant_returns_none(self): + assert get_skill_tool({}) is None + + def test_no_skills_returns_none(self): + assert get_skill_tool({"variant": {}}) is None + + def test_empty_skills_returns_none(self): + assert get_skill_tool({"variant": {"skills": []}}) is None + + def test_skills_returns_tool(self): + with patch("dataset_config_python.hydrate.skill") as mock_skill: + mock_skill.return_value = MagicMock() + result = get_skill_tool({"variant": {"skills": ["/path/to/skill"]}}) + assert result is not None + mock_skill.assert_called_once_with(["/path/to/skill"]) + + def test_old_skill_paths_key(self): + """Supports the legacy 'skill_paths' key.""" + with patch("dataset_config_python.hydrate.skill") as mock_skill: + mock_skill.return_value = MagicMock() + result = get_skill_tool({"variant": {"skill_paths": ["/path/to/skill"]}}) + assert result is not None + mock_skill.assert_called_once_with(["/path/to/skill"]) + + +# =========================================================================== +# build_task_metadata +# =========================================================================== + + +class TestBuildTaskMetadata: + """Tests for task metadata construction.""" + + def test_empty_config(self): + result = build_task_metadata({}) + assert result == {} + + def test_variant_included(self): + result = build_task_metadata({"variant": {"files": ["a.md"]}}) + assert "variant_config" in result + assert result["variant_config"] == {"files": ["a.md"]} + + def test_save_examples(self): + result = build_task_metadata( + { + "save_examples": True, + "examples_dir": "/logs/examples", + "task_name": "my_task:v1", + } + ) + assert result["save_examples"] is True + assert result["examples_dir"] == "/logs/examples" + assert result["task_variant"] == "my_task:v1" + + def test_save_examples_without_dir_omits(self): + """save_examples without examples_dir does not add metadata.""" + result = build_task_metadata({"save_examples": True}) 
+ assert "save_examples" not in result From 523d101ef9139cb8d14f672e5a4bd67ad389e697 Mon Sep 17 00:00:00 2001 From: Eric Windmill Date: Mon, 30 Mar 2026 10:30:28 -0700 Subject: [PATCH 2/2] feat: update dataset configuration schemas and add missing python package dependencies to CI workflows --- .github/workflows/dash_evals_module_tests.yml | 1 + .github/workflows/docs.yml | 1 + .../dataset_config_dart.md | 22 ++++++++++++++++--- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/.github/workflows/dash_evals_module_tests.yml b/.github/workflows/dash_evals_module_tests.yml index d8dd65c..3ddda56 100644 --- a/.github/workflows/dash_evals_module_tests.yml +++ b/.github/workflows/dash_evals_module_tests.yml @@ -35,6 +35,7 @@ jobs: run: | source .venv/bin/activate pip install --upgrade pip + pip install -e ../dataset_config_python pip install -e ".[dev]" - name: Run tests diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 97f8ba8..14e694f 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -32,6 +32,7 @@ jobs: run: | pip install --upgrade pip pip install -r docs/requirements.txt + pip install -e packages/dataset_config_python pip install -e packages/dash_evals - name: Install Dart dependencies diff --git a/docs/reference/dart_api/dataset_config_dart/dataset_config_dart.md b/docs/reference/dart_api/dataset_config_dart/dataset_config_dart.md index fc7e1e9..63e06a0 100644 --- a/docs/reference/dart_api/dataset_config_dart/dataset_config_dart.md +++ b/docs/reference/dart_api/dataset_config_dart/dataset_config_dart.md @@ -336,7 +336,7 @@ and [`MemoryDataset`](https://inspect.aisi.org.uk/reference/inspect_ai.dataset.h #### `Dataset` ```dart -Dataset({List samples, String? name, String? location, bool shuffled}) +Dataset({List samples, String? name, String? location, bool shuffled, String format, String? source, Map? 
args}) ``` #### `Dataset.fromJson` @@ -1007,7 +1007,7 @@ inspect_eval_arguments: #### `Job` ```dart -Job({String? description, required String logDir, int maxConnections, List? models, Map>? variants, List? taskPaths, Map? tasks, bool saveExamples, Map? sandbox, Map? inspectEvalArguments, TagFilter? taskFilters, TagFilter? sampleFilters}) +Job({String? description, required String logDir, int maxConnections, required List models, Map>? variants, List? taskPaths, Map? tasks, bool saveExamples, Map? sandbox, Map? inspectEvalArguments, TagFilter? taskFilters, TagFilter? sampleFilters}) ``` #### `Job.fromJson` @@ -1203,7 +1203,7 @@ former `TaskConfig` model-package class. #### `ParsedTask` ```dart -ParsedTask({required String id, required String func, required List samples, required Variant variant, String sandboxType, String? systemMessage, bool saveExamples, String? examplesDir, Map? sandboxParameters, Map? taskFiles, String? taskSetup, String? model, Map? config, Map? modelRoles, Object? sandbox, Object? approval, Object? epochs, Object? failOnError, bool? continueOnFail, int? messageLimit, int? tokenLimit, int? timeLimit, int? workingLimit, double? costLimit, Object? earlyStopping, String? displayName, Object? version, Map? metadata}) +ParsedTask({required String id, required String func, required List samples, required Variant variant, String sandboxType, String? systemMessage, bool saveExamples, String? examplesDir, Map? sandboxParameters, Map? taskFiles, String? taskSetup, String? model, Map? config, Map? modelRoles, Object? sandbox, Object? approval, Object? epochs, Object? failOnError, bool? continueOnFail, int? messageLimit, int? tokenLimit, int? timeLimit, int? workingLimit, double? costLimit, Object? earlyStopping, String? displayName, Object? version, Map? metadata, String datasetFormat, String? datasetSource, Map? 
datasetArgs}) ``` ### Properties @@ -1304,6 +1304,18 @@ ParsedTask({required String id, required String func, required List samp Additional metadata to associate with the task. +- **`datasetFormat`** → `String` *(final)* + + Dataset format: 'memory' (inline samples), 'json', or 'csv'. + +- **`datasetSource`** → `String?` *(final)* + + File path or URL for json/csv datasets. + +- **`datasetArgs`** → `Map?` *(final)* + + Extra kwargs passed to json_dataset() or csv_dataset(). + ### Methods #### `copyWith` @@ -1722,6 +1734,10 @@ Job createDefaultJob(String baseDir) Create a [Job] with default settings (when no job file is provided). +Note: The caller must specify models, as there are no defaults. +This method creates a job with an empty models list; the resolver +will raise an error if models is empty at resolution time. + **Parameters:** - `baseDir` (`String`) *(required)*