Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 88 additions & 3 deletions api/oss/src/core/evaluators/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@
#
EvaluatorRevision,
EvaluatorRevisionCreate,
EvaluatorRevisionData,
EvaluatorRevisionEdit,
EvaluatorRevisionCommit,
EvaluatorRevisionQuery,
)
from oss.src.core.evaluators.utils import build_evaluator_data
from oss.src.core.shared.dtos import Reference
from oss.src.utils.logging import get_module_logger

Expand Down Expand Up @@ -759,6 +759,83 @@ def __init__(
):
self.evaluators_service = evaluators_service

@staticmethod
def _extract_builtin_evaluator_key(
    simple_evaluator_data: Optional[SimpleEvaluatorData],
) -> Optional[str]:
    """Parse the builtin-evaluator key out of the data URI.

    Builtin evaluator URIs look like ``agenta:builtin:<key>:<...>``
    (at least four colon-separated segments). Returns ``<key>``, or
    ``None`` when the data/URI is absent or is not a builtin URI.
    """
    if not simple_evaluator_data:
        return None

    uri = simple_evaluator_data.uri
    if not uri:
        return None

    segments = uri.split(":")
    if len(segments) < 4:
        return None

    if (segments[0], segments[1]) != ("agenta", "builtin"):
        return None

    # An empty key segment is treated the same as "no key".
    return segments[2] or None

@staticmethod
def _has_outputs_schema(
    simple_evaluator_data: Optional[SimpleEvaluatorData],
) -> bool:
    """Report whether the data carries a non-empty ``outputs`` schema."""
    if not simple_evaluator_data:
        return False

    schemas = simple_evaluator_data.schemas
    if not isinstance(schemas, dict):
        return False

    # Falsy values ({}, None, "") count as "no outputs schema".
    return bool(schemas.get("outputs"))

def _ensure_builtin_evaluator_data(
    self,
    simple_evaluator_data: Optional[SimpleEvaluatorData],
) -> Optional[SimpleEvaluatorData]:
    """Backfill builtin-evaluator defaults, notably the outputs schema.

    When the data's URI identifies a builtin evaluator
    (``agenta:builtin:<key>:...``) and no ``outputs`` schema is
    present, hydrate the canonical evaluator data via
    ``build_evaluator_data`` and overlay the caller-provided fields on
    top of it. Non-builtin data, or data that already has an outputs
    schema, is returned unchanged.

    Args:
        simple_evaluator_data: the incoming (possibly partial) data,
            or ``None``.

    Returns:
        The hydrated data, or the input unchanged when hydration does
        not apply.
    """
    evaluator_key = self._extract_builtin_evaluator_key(simple_evaluator_data)

    if not evaluator_key:
        return simple_evaluator_data

    if self._has_outputs_schema(simple_evaluator_data):
        return simple_evaluator_data

    settings_values = (
        simple_evaluator_data.parameters
        if simple_evaluator_data
        and isinstance(simple_evaluator_data.parameters, dict)
        else None
    )

    hydrated_data = build_evaluator_data(
        evaluator_key=evaluator_key,
        settings_values=settings_values,
    )

    # exclude_none/exclude_unset so the overlay below only carries
    # fields the caller actually provided.
    hydrated_data_dict = hydrated_data.model_dump(
        mode="json",
        exclude_none=True,
        exclude_unset=True,
    )

    existing_data_dict = (
        simple_evaluator_data.model_dump(
            mode="json",
            exclude_none=True,
            exclude_unset=True,
        )
        if simple_evaluator_data
        else {}
    )

    merged = {**hydrated_data_dict, **existing_data_dict}

    # BUG FIX: the overlay above is shallow, so an existing `schemas`
    # dict that lacks an "outputs" entry (e.g. only "inputs") would
    # clobber the hydrated schemas and drop the outputs schema this
    # method exists to backfill. Merge `schemas` key-wise instead,
    # keeping the hydrated "outputs" when the caller did not provide
    # a non-empty one.
    hydrated_schemas = hydrated_data_dict.get("schemas")
    existing_schemas = existing_data_dict.get("schemas")
    if isinstance(hydrated_schemas, dict) and isinstance(existing_schemas, dict):
        merged_schemas = {**hydrated_schemas, **existing_schemas}
        if not merged_schemas.get("outputs") and hydrated_schemas.get("outputs"):
            merged_schemas["outputs"] = hydrated_schemas["outputs"]
        merged["schemas"] = merged_schemas

    return SimpleEvaluatorData(**merged)

# public -------------------------------------------------------------------

async def create(
Expand Down Expand Up @@ -849,6 +926,10 @@ async def create(

evaluator_revision_slug = uuid4().hex[-12:]

hydrated_simple_evaluator_data = self._ensure_builtin_evaluator_data(
simple_evaluator_create.data,
)

evaluator_revision_commit = EvaluatorRevisionCommit(
slug=evaluator_revision_slug,
#
Expand Down Expand Up @@ -890,7 +971,7 @@ async def create(
tags=evaluator_create.tags,
meta=evaluator_create.meta,
#
data=simple_evaluator_create.data,
data=hydrated_simple_evaluator_data,
#
evaluator_id=evaluator.id,
evaluator_variant_id=evaluator_variant.id,
Expand Down Expand Up @@ -1135,6 +1216,10 @@ async def edit(

evaluator_revision_slug = uuid4().hex[-12:]

hydrated_simple_evaluator_data = self._ensure_builtin_evaluator_data(
simple_evaluator_edit.data,
)

evaluator_revision_commit = EvaluatorRevisionCommit(
slug=evaluator_revision_slug,
#
Expand All @@ -1145,7 +1230,7 @@ async def edit(
tags=evaluator_edit.tags,
meta=evaluator_edit.meta,
#
data=simple_evaluator_edit.data,
data=hydrated_simple_evaluator_data,
#
evaluator_id=evaluator.id,
evaluator_variant_id=evaluator_variant.id,
Expand Down
3 changes: 1 addition & 2 deletions api/oss/src/models/api/evaluation_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,14 @@
from oss.src.utils import traces
from oss.src.models.api.api_models import Result

from oss.src.core.shared.dtos import Tags, Meta


class LegacyEvaluator(BaseModel):
name: str
key: str
direct_use: bool
settings_presets: Optional[list[dict]] = None
settings_template: dict
outputs_schema: Optional[Dict[str, Any]] = None
description: Optional[str] = None
oss: Optional[bool] = False
requires_llm_api_keys: Optional[bool] = False
Expand Down
73 changes: 73 additions & 0 deletions api/oss/src/resources/evaluators/evaluators.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from copy import deepcopy


rag_evaluator_settings_template = {
"question_key": {
"label": "Question Key",
Expand Down Expand Up @@ -832,6 +835,76 @@
]


# Output schema for builtin evaluators that only report pass/fail.
_SUCCESS_ONLY_OUTPUT_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "type": "object",
    "properties": {
        "success": {"type": "boolean"},
    },
    "required": ["success"],
    "additionalProperties": False,
}

# Output schema for builtin evaluators that may report a numeric score
# and/or a pass/fail flag; neither field is required.
_SCORE_AND_SUCCESS_OUTPUT_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "type": "object",
    "properties": {
        "score": {"type": "number"},
        "success": {"type": "boolean"},
    },
    "required": [],
    "additionalProperties": False,
}

# Static mapping of builtin-evaluator key -> fixed output schema.
# Evaluators not listed here (e.g. auto_ai_critique, whose schema is
# derived from its settings template) get no entry in this dict.
_FIXED_OUTPUT_SCHEMA_BY_KEY = {
    "auto_custom_code_run": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
    "field_match_test": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_json_diff": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
    "auto_semantic_similarity": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
    "auto_webhook_test": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
    "auto_exact_match": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_contains_json": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_similarity_match": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
    "auto_regex_test": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_starts_with": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_ends_with": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_contains": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_contains_any": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_contains_all": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_levenshtein_distance": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
}


def _extract_auto_ai_critique_default_outputs_schema():
    """Return a copy of auto_ai_critique's default output JSON schema.

    Finds the first evaluator whose key is ``auto_ai_critique`` and
    digs out ``settings_template["json_schema"]["default"]["schema"]``.
    Returns ``None`` when that evaluator is missing or the value is
    not a dict.
    """
    ai_critique = next(
        (item for item in evaluators if item.get("key") == "auto_ai_critique"),
        None,
    )
    if ai_critique is None:
        return None

    settings_template = ai_critique.get("settings_template") or {}
    json_schema_field = settings_template.get("json_schema") or {}
    default_value = json_schema_field.get("default") or {}
    schema = default_value.get("schema")

    # Deep-copy so later mutation of evaluator entries cannot alias
    # the template's default.
    return deepcopy(schema) if isinstance(schema, dict) else None


# Register auto_ai_critique's schema (derived from its settings
# template) alongside the fixed ones, when it could be extracted.
_auto_ai_critique_outputs_schema = _extract_auto_ai_critique_default_outputs_schema()
if _auto_ai_critique_outputs_schema is not None:
    _FIXED_OUTPUT_SCHEMA_BY_KEY["auto_ai_critique"] = _auto_ai_critique_outputs_schema

# Backfill each evaluator entry with its outputs schema at import
# time. deepcopy ensures entries do not share one mutable schema dict.
for evaluator in evaluators:
    evaluator_key = evaluator.get("key")
    outputs_schema = (
        _FIXED_OUTPUT_SCHEMA_BY_KEY.get(evaluator_key)
        if isinstance(evaluator_key, str)
        else None
    )
    if outputs_schema is not None:
        evaluator["outputs_schema"] = deepcopy(outputs_schema)


def get_all_evaluators():
"""
Returns a list of evaluators
Expand Down
84 changes: 84 additions & 0 deletions docs/design/migrate-evaluator-playground/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Migrate Evaluator Playground to New Evaluator Endpoints

## Overview

This planning workspace documents the migration of the Evaluator Playground frontend to use the new workflow-based evaluator endpoints. The backend team has migrated evaluators from the old `EvaluatorConfig` model to the new `SimpleEvaluator` (workflow-based) model.

## Migration Strategy

**Direct migration (no adapters)** split into two PRs:

| PR | Scope | Description |
|----|-------|-------------|
| **PR 1** | CRUD | Migrate to `/preview/simple/evaluators/*`, change internal types to `SimpleEvaluator` |
| **PR 2** | Run | Migrate to `/preview/workflows/invoke`, add workflow service types |

See [plan.md](./plan.md) for detailed implementation steps.

## Context

- **PR #3527**: Backend migration that introduces new evaluator endpoints
- **Goal**: Full migration to new endpoints, no legacy code remaining

## Documents

| File | Description |
|------|-------------|
| [context.md](./context.md) | Background, motivation, problem statement, goals, and non-goals |
| [current-system.md](./current-system.md) | Detailed map of current Evaluator Playground implementation |
| [new-endpoints.md](./new-endpoints.md) | New evaluator endpoint shapes and differences from legacy |
| [research.md](./research.md) | Deep dive into evaluator execution architecture and URI-based handlers |
| [migration-options.md](./migration-options.md) | Why we chose direct migration over adapters |
| [risk-analysis.md](./risk-analysis.md) | Coupling points and risk areas for the migration |
| [plan.md](./plan.md) | **Main plan** - PR 1 (CRUD) and PR 2 (Run) implementation details |
| [status.md](./status.md) | Living document for progress updates and decisions |

## Key Mapping Changes

| Legacy | New |
|--------|-----|
| `EvaluatorConfig` | `SimpleEvaluator` |
| `evaluator_key` | derived from `data.uri` |
| `settings_values` | `data.parameters` |
| `GET /evaluators/configs/` | `POST /preview/simple/evaluators/query` |
| `POST /evaluators/configs/` | `POST /preview/simple/evaluators/` |
| `PUT /evaluators/configs/{id}/` | `PUT /preview/simple/evaluators/{id}` |
| `DELETE /evaluators/configs/{id}/` | `POST /preview/simple/evaluators/{id}/archive` |
| `POST /evaluators/{key}/run/` | `POST /preview/workflows/invoke` |

## Files Affected

### PR 1: CRUD Migration

| Area | Files |
|------|-------|
| Types | `web/oss/src/lib/Types.ts` |
| Services | `web/oss/src/services/evaluators/index.ts` |
| State | `web/oss/src/state/evaluators/atoms.ts` |
| Playground State | `web/oss/src/components/.../ConfigureEvaluator/state/atoms.ts` |
| Playground UI | `web/oss/src/components/.../ConfigureEvaluator/index.tsx` |
| Registry | `web/oss/src/components/Evaluators/index.tsx` |
| Registry Hook | `web/oss/src/components/Evaluators/hooks/useEvaluatorsRegistryData.ts` |
| Columns | `web/oss/src/components/Evaluators/assets/getColumns.tsx` |

### PR 2: Run Migration

| Area | Files |
|------|-------|
| Types | `web/oss/src/lib/Types.ts` (add workflow types) |
| Invoke Service | `web/oss/src/services/workflows/invoke.ts` (new) |
| Debug Section | `web/oss/src/components/.../ConfigureEvaluator/DebugSection.tsx` |

### Backend Reference (PR #3527)
- `api/oss/src/routers/evaluators_router.py` - Legacy endpoints (kept temporarily)
- `api/oss/src/apis/fastapi/evaluators/router.py` - New `SimpleEvaluators` router
- `api/oss/src/apis/fastapi/workflows/router.py` - Workflow invoke endpoint
- `api/oss/src/core/evaluators/dtos.py` - New data transfer objects

## Effort Estimate

| PR | Effort |
|----|--------|
| PR 1: CRUD | 4-5 days |
| PR 2: Run | 3-4 days |
| **Total** | **7-9 days** |
72 changes: 72 additions & 0 deletions docs/design/migrate-evaluator-playground/context.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Context: Migrate Evaluator Playground

## Background

The Agenta platform has undergone a significant architectural change where **evaluators are now workflows**. This means evaluators follow the same git-like versioning model as other workflows:
- **Artifact** (Evaluator) → **Variant** → **Revision**

Previously, evaluators were stored in a flat `EvaluatorConfigDB` table with simple key-value settings. The new model stores evaluators as `WorkflowArtifactDBE`, `WorkflowVariantDBE`, and `WorkflowRevisionDBE` records with richer metadata and versioning.

## Motivation

1. **Unified Architecture**: Evaluators, testsets, and apps now share the same git-like workflow model
2. **Better Versioning**: Evaluators can have multiple variants and revision history
3. **Richer Metadata**: New model supports URIs, schemas, scripts, and configuration in a structured way
4. **Future Extensibility**: Custom evaluators will be first-class citizens with the same capabilities as built-in ones

## Problem Statement

The Evaluator Playground frontend currently uses legacy endpoints:
- `GET /evaluators/` - List evaluator templates
- `GET/POST/PUT/DELETE /evaluators/configs/` - CRUD for evaluator configurations
- `POST /evaluators/{key}/run/` - Run evaluator in playground

The backend (PR #3527) has:
1. Migrated all evaluator configs to the new workflow-based model via DB migrations
2. Created new `SimpleEvaluators` endpoints at `/preview/simple/evaluators/`
3. Made native workflow execution available at `/preview/workflows/invoke`
4. Kept legacy endpoints as thin wrappers (to be deprecated)

**The frontend needs to migrate to use the new endpoints directly.**

## Goals

1. **Replace legacy evaluator config CRUD** with new `SimpleEvaluator` endpoints
2. **Replace legacy evaluator run** with native workflow invoke (`/preview/workflows/invoke`)
3. **Update data models** in frontend to match new `SimpleEvaluator` shape (no adapters)
4. **Preserve UX** - no user-facing changes to the Evaluator Playground functionality
5. **Remove all legacy endpoint usage** - clean migration, no dual-path code

## Non-Goals

1. **Not changing the Evaluator Playground UI** - Only the data layer changes
2. **Not migrating evaluation batch runs** - Those already use the new workflow system internally
3. **Not introducing new evaluator features** - This is a pure endpoint migration

## Success Criteria

1. Evaluator Playground can create, edit, delete evaluators using new `SimpleEvaluator` endpoints
2. Evaluator Playground can run evaluators using native workflow invoke
3. All existing evaluator configurations continue to work
4. No regression in evaluator testing functionality
5. No legacy endpoint calls remain in frontend code

## Constraints

1. Must not break existing evaluator configurations
2. Must coordinate with backend team on endpoint availability (PR #3527)
3. Split into two PRs for reviewability (CRUD first, then Run)

## Migration Approach

**Direct migration (no adapters):**

| PR | Scope | Endpoints |
|----|-------|-----------|
| PR 1 | CRUD | `/preview/simple/evaluators/*` |
| PR 2 | Run | `/preview/workflows/invoke` |

This approach:
- Avoids tech debt from adapter layers
- Aligns internal types with backend models
- Keeps changes reviewable by splitting into two PRs
Loading