Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 88 additions & 3 deletions api/oss/src/core/evaluators/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@
#
EvaluatorRevision,
EvaluatorRevisionCreate,
EvaluatorRevisionData,
EvaluatorRevisionEdit,
EvaluatorRevisionCommit,
EvaluatorRevisionQuery,
)
from oss.src.core.evaluators.utils import build_evaluator_data
from oss.src.core.shared.dtos import Reference
from oss.src.utils.logging import get_module_logger

Expand Down Expand Up @@ -759,6 +759,83 @@ def __init__(
):
self.evaluators_service = evaluators_service

@staticmethod
def _extract_builtin_evaluator_key(
    simple_evaluator_data: Optional[SimpleEvaluatorData],
) -> Optional[str]:
    """Parse the builtin-evaluator key out of the data URI.

    Builtin evaluator URIs look like ``agenta:builtin:<key>:<...>``
    (at least four colon-separated segments). Returns ``<key>``, or
    ``None`` when the data/URI is absent or is not a builtin URI.
    """
    if not simple_evaluator_data:
        return None

    uri = simple_evaluator_data.uri
    if not uri:
        return None

    segments = uri.split(":")
    if len(segments) < 4:
        return None

    if (segments[0], segments[1]) != ("agenta", "builtin"):
        return None

    # An empty key segment is treated the same as "no key".
    return segments[2] or None

@staticmethod
def _has_outputs_schema(
    simple_evaluator_data: Optional[SimpleEvaluatorData],
) -> bool:
    """Report whether the data carries a non-empty ``outputs`` schema."""
    if not simple_evaluator_data:
        return False

    schemas = simple_evaluator_data.schemas
    if not isinstance(schemas, dict):
        return False

    # Falsy values ({}, None, "") count as "no outputs schema".
    return bool(schemas.get("outputs"))

def _ensure_builtin_evaluator_data(
    self,
    simple_evaluator_data: Optional[SimpleEvaluatorData],
) -> Optional[SimpleEvaluatorData]:
    """Backfill builtin-evaluator defaults, notably the outputs schema.

    When the data's URI identifies a builtin evaluator
    (``agenta:builtin:<key>:...``) and no ``outputs`` schema is
    present, hydrate the canonical evaluator data via
    ``build_evaluator_data`` and overlay the caller-provided fields on
    top of it. Non-builtin data, or data that already has an outputs
    schema, is returned unchanged.

    Args:
        simple_evaluator_data: the incoming (possibly partial) data,
            or ``None``.

    Returns:
        The hydrated data, or the input unchanged when hydration does
        not apply.
    """
    evaluator_key = self._extract_builtin_evaluator_key(simple_evaluator_data)

    if not evaluator_key:
        return simple_evaluator_data

    if self._has_outputs_schema(simple_evaluator_data):
        return simple_evaluator_data

    settings_values = (
        simple_evaluator_data.parameters
        if simple_evaluator_data
        and isinstance(simple_evaluator_data.parameters, dict)
        else None
    )

    hydrated_data = build_evaluator_data(
        evaluator_key=evaluator_key,
        settings_values=settings_values,
    )

    # exclude_none/exclude_unset so the overlay below only carries
    # fields the caller actually provided.
    hydrated_data_dict = hydrated_data.model_dump(
        mode="json",
        exclude_none=True,
        exclude_unset=True,
    )

    existing_data_dict = (
        simple_evaluator_data.model_dump(
            mode="json",
            exclude_none=True,
            exclude_unset=True,
        )
        if simple_evaluator_data
        else {}
    )

    merged = {**hydrated_data_dict, **existing_data_dict}

    # BUG FIX: the overlay above is shallow, so an existing `schemas`
    # dict that lacks an "outputs" entry (e.g. only "inputs") would
    # clobber the hydrated schemas and drop the outputs schema this
    # method exists to backfill. Merge `schemas` key-wise instead,
    # keeping the hydrated "outputs" when the caller did not provide
    # a non-empty one.
    hydrated_schemas = hydrated_data_dict.get("schemas")
    existing_schemas = existing_data_dict.get("schemas")
    if isinstance(hydrated_schemas, dict) and isinstance(existing_schemas, dict):
        merged_schemas = {**hydrated_schemas, **existing_schemas}
        if not merged_schemas.get("outputs") and hydrated_schemas.get("outputs"):
            merged_schemas["outputs"] = hydrated_schemas["outputs"]
        merged["schemas"] = merged_schemas

    return SimpleEvaluatorData(**merged)

# public -------------------------------------------------------------------

async def create(
Expand Down Expand Up @@ -849,6 +926,10 @@ async def create(

evaluator_revision_slug = uuid4().hex[-12:]

hydrated_simple_evaluator_data = self._ensure_builtin_evaluator_data(
simple_evaluator_create.data,
)

evaluator_revision_commit = EvaluatorRevisionCommit(
slug=evaluator_revision_slug,
#
Expand Down Expand Up @@ -890,7 +971,7 @@ async def create(
tags=evaluator_create.tags,
meta=evaluator_create.meta,
#
data=simple_evaluator_create.data,
data=hydrated_simple_evaluator_data,
#
evaluator_id=evaluator.id,
evaluator_variant_id=evaluator_variant.id,
Expand Down Expand Up @@ -1135,6 +1216,10 @@ async def edit(

evaluator_revision_slug = uuid4().hex[-12:]

hydrated_simple_evaluator_data = self._ensure_builtin_evaluator_data(
simple_evaluator_edit.data,
)

evaluator_revision_commit = EvaluatorRevisionCommit(
slug=evaluator_revision_slug,
#
Expand All @@ -1145,7 +1230,7 @@ async def edit(
tags=evaluator_edit.tags,
meta=evaluator_edit.meta,
#
data=simple_evaluator_edit.data,
data=hydrated_simple_evaluator_data,
#
evaluator_id=evaluator.id,
evaluator_variant_id=evaluator_variant.id,
Expand Down
3 changes: 1 addition & 2 deletions api/oss/src/models/api/evaluation_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,14 @@
from oss.src.utils import traces
from oss.src.models.api.api_models import Result

from oss.src.core.shared.dtos import Tags, Meta


class LegacyEvaluator(BaseModel):
name: str
key: str
direct_use: bool
settings_presets: Optional[list[dict]] = None
settings_template: dict
outputs_schema: Optional[Dict[str, Any]] = None
description: Optional[str] = None
oss: Optional[bool] = False
requires_llm_api_keys: Optional[bool] = False
Expand Down
73 changes: 73 additions & 0 deletions api/oss/src/resources/evaluators/evaluators.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from copy import deepcopy


rag_evaluator_settings_template = {
"question_key": {
"label": "Question Key",
Expand Down Expand Up @@ -832,6 +835,76 @@
]


# Output schema for builtin evaluators that only report pass/fail.
_SUCCESS_ONLY_OUTPUT_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "type": "object",
    "properties": {
        "success": {"type": "boolean"},
    },
    "required": ["success"],
    "additionalProperties": False,
}

# Output schema for builtin evaluators that may report a numeric score
# and/or a pass/fail flag; neither field is required.
_SCORE_AND_SUCCESS_OUTPUT_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "type": "object",
    "properties": {
        "score": {"type": "number"},
        "success": {"type": "boolean"},
    },
    "required": [],
    "additionalProperties": False,
}

# Static mapping of builtin-evaluator key -> fixed output schema.
# Evaluators not listed here (e.g. auto_ai_critique, whose schema is
# derived from its settings template) get no entry in this dict.
_FIXED_OUTPUT_SCHEMA_BY_KEY = {
    "auto_custom_code_run": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
    "field_match_test": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_json_diff": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
    "auto_semantic_similarity": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
    "auto_webhook_test": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
    "auto_exact_match": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_contains_json": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_similarity_match": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
    "auto_regex_test": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_starts_with": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_ends_with": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_contains": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_contains_any": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_contains_all": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_levenshtein_distance": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
}


def _extract_auto_ai_critique_default_outputs_schema():
    """Return a copy of auto_ai_critique's default output JSON schema.

    Finds the first evaluator whose key is ``auto_ai_critique`` and
    digs out ``settings_template["json_schema"]["default"]["schema"]``.
    Returns ``None`` when that evaluator is missing or the value is
    not a dict.
    """
    ai_critique = next(
        (item for item in evaluators if item.get("key") == "auto_ai_critique"),
        None,
    )
    if ai_critique is None:
        return None

    settings_template = ai_critique.get("settings_template") or {}
    json_schema_field = settings_template.get("json_schema") or {}
    default_value = json_schema_field.get("default") or {}
    schema = default_value.get("schema")

    # Deep-copy so later mutation of evaluator entries cannot alias
    # the template's default.
    return deepcopy(schema) if isinstance(schema, dict) else None


# Register auto_ai_critique's schema (derived from its settings
# template) alongside the fixed ones, when it could be extracted.
_auto_ai_critique_outputs_schema = _extract_auto_ai_critique_default_outputs_schema()
if _auto_ai_critique_outputs_schema is not None:
    _FIXED_OUTPUT_SCHEMA_BY_KEY["auto_ai_critique"] = _auto_ai_critique_outputs_schema

# Backfill each evaluator entry with its outputs schema at import
# time. deepcopy ensures entries do not share one mutable schema dict.
for evaluator in evaluators:
    evaluator_key = evaluator.get("key")
    outputs_schema = (
        _FIXED_OUTPUT_SCHEMA_BY_KEY.get(evaluator_key)
        if isinstance(evaluator_key, str)
        else None
    )
    if outputs_schema is not None:
        evaluator["outputs_schema"] = deepcopy(outputs_schema)


def get_all_evaluators():
"""
Returns a list of evaluators
Expand Down
84 changes: 84 additions & 0 deletions docs/design/migrate-evaluator-playground/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Migrate Evaluator Playground to New Evaluator Endpoints

## Overview

This planning workspace documents the migration of the Evaluator Playground frontend to use the new workflow-based evaluator endpoints. The backend team has migrated evaluators from the old `EvaluatorConfig` model to the new `SimpleEvaluator` (workflow-based) model.

## Migration Strategy

**Direct migration (no adapters)** split into two PRs:

| PR | Scope | Description |
|----|-------|-------------|
| **PR 1** | CRUD | Migrate to `/preview/simple/evaluators/*`, change internal types to `SimpleEvaluator` |
| **PR 2** | Run | Migrate to `/preview/workflows/invoke`, add workflow service types |

See [plan.md](./plan.md) for detailed implementation steps.

## Context

- **PR #3527**: Backend migration that introduces new evaluator endpoints
- **Goal**: Full migration to new endpoints, no legacy code remaining

## Documents

| File | Description |
|------|-------------|
| [context.md](./context.md) | Background, motivation, problem statement, goals, and non-goals |
| [current-system.md](./current-system.md) | Detailed map of current Evaluator Playground implementation |
| [new-endpoints.md](./new-endpoints.md) | New evaluator endpoint shapes and differences from legacy |
| [research.md](./research.md) | Deep dive into evaluator execution architecture and URI-based handlers |
| [migration-options.md](./migration-options.md) | Why we chose direct migration over adapters |
| [risk-analysis.md](./risk-analysis.md) | Coupling points and risk areas for the migration |
| [plan.md](./plan.md) | **Main plan** - PR 1 (CRUD) and PR 2 (Run) implementation details |
| [status.md](./status.md) | Living document for progress updates and decisions |

## Key Mapping Changes

| Legacy | New |
|--------|-----|
| `EvaluatorConfig` | `SimpleEvaluator` |
| `evaluator_key` | derived from `data.uri` |
| `settings_values` | `data.parameters` |
| `GET /evaluators/configs/` | `POST /preview/simple/evaluators/query` |
| `POST /evaluators/configs/` | `POST /preview/simple/evaluators/` |
| `PUT /evaluators/configs/{id}/` | `PUT /preview/simple/evaluators/{id}` |
| `DELETE /evaluators/configs/{id}/` | `POST /preview/simple/evaluators/{id}/archive` |
| `POST /evaluators/{key}/run/` | `POST /preview/workflows/invoke` |

## Files Affected

### PR 1: CRUD Migration

| Area | Files |
|------|-------|
| Types | `web/oss/src/lib/Types.ts` |
| Services | `web/oss/src/services/evaluators/index.ts` |
| State | `web/oss/src/state/evaluators/atoms.ts` |
| Playground State | `web/oss/src/components/.../ConfigureEvaluator/state/atoms.ts` |
| Playground UI | `web/oss/src/components/.../ConfigureEvaluator/index.tsx` |
| Registry | `web/oss/src/components/Evaluators/index.tsx` |
| Registry Hook | `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts` |
| Columns | `web/oss/src/components/Evaluators/assets/getColumns.tsx` |

### PR 2: Run Migration

| Area | Files |
|------|-------|
| Types | `web/oss/src/lib/Types.ts` (add workflow types) |
| Invoke Service | `web/oss/src/services/workflows/invoke.ts` (new) |
| Debug Section | `web/oss/src/components/.../ConfigureEvaluator/DebugSection.tsx` |

### Backend Reference (PR #3527)
- `api/oss/src/routers/evaluators_router.py` - Legacy endpoints (kept temporarily)
- `api/oss/src/apis/fastapi/evaluators/router.py` - New `SimpleEvaluators` router
- `api/oss/src/apis/fastapi/workflows/router.py` - Workflow invoke endpoint
- `api/oss/src/core/evaluators/dtos.py` - New data transfer objects

## Effort Estimate

| PR | Effort |
|----|--------|
| PR 1: CRUD | 4-5 days |
| PR 2: Run | 3-4 days |
| **Total** | **7-9 days** |
72 changes: 72 additions & 0 deletions docs/design/migrate-evaluator-playground/context.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Context: Migrate Evaluator Playground

## Background

The Agenta platform has undergone a significant architectural change where **evaluators are now workflows**. This means evaluators follow the same git-like versioning model as other workflows:
- **Artifact** (Evaluator) → **Variant** → **Revision**

Previously, evaluators were stored in a flat `EvaluatorConfigDB` table with simple key-value settings. The new model stores evaluators as `WorkflowArtifactDBE`, `WorkflowVariantDBE`, and `WorkflowRevisionDBE` records with richer metadata and versioning.

## Motivation

1. **Unified Architecture**: Evaluators, testsets, and apps now share the same git-like workflow model
2. **Better Versioning**: Evaluators can have multiple variants and revision history
3. **Richer Metadata**: New model supports URIs, schemas, scripts, and configuration in a structured way
4. **Future Extensibility**: Custom evaluators will be first-class citizens with the same capabilities as built-in ones

## Problem Statement

The Evaluator Playground frontend currently uses legacy endpoints:
- `GET /evaluators/` - List evaluator templates
- `GET/POST/PUT/DELETE /evaluators/configs/` - CRUD for evaluator configurations
- `POST /evaluators/{key}/run/` - Run evaluator in playground

The backend (PR #3527) has:
1. Migrated all evaluator configs to the new workflow-based model via DB migrations
2. Created new `SimpleEvaluators` endpoints at `/preview/simple/evaluators/`
3. Made native workflow execution available at `/preview/workflows/invoke`
4. Kept legacy endpoints as thin wrappers (to be deprecated)

**The frontend needs to migrate to use the new endpoints directly.**

## Goals

1. **Replace legacy evaluator config CRUD** with new `SimpleEvaluator` endpoints
2. **Replace legacy evaluator run** with native workflow invoke (`/preview/workflows/invoke`)
3. **Update data models** in frontend to match new `SimpleEvaluator` shape (no adapters)
4. **Preserve UX** - no user-facing changes to the Evaluator Playground functionality
5. **Remove all legacy endpoint usage** - clean migration, no dual-path code

## Non-Goals

1. **Not changing the Evaluator Playground UI** - Only the data layer changes
2. **Not migrating evaluation batch runs** - Those already use the new workflow system internally
3. **Not introducing new evaluator features** - This is a pure endpoint migration

## Success Criteria

1. Evaluator Playground can create, edit, delete evaluators using new `SimpleEvaluator` endpoints
2. Evaluator Playground can run evaluators using native workflow invoke
3. All existing evaluator configurations continue to work
4. No regression in evaluator testing functionality
5. No legacy endpoint calls remain in frontend code

## Constraints

1. Must not break existing evaluator configurations
2. Must coordinate with backend team on endpoint availability (PR #3527)
3. Split into two PRs for reviewability (CRUD first, then Run)

## Migration Approach

**Direct migration (no adapters):**

| PR | Scope | Endpoints |
|----|-------|-----------|
| PR 1 | CRUD | `/preview/simple/evaluators/*` |
| PR 2 | Run | `/preview/workflows/invoke` |

This approach:
- Avoids tech debt from adapter layers
- Aligns internal types with backend models
- Keeps changes reviewable by splitting into two PRs
Loading