Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 88 additions & 2 deletions api/oss/src/core/evaluators/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
EvaluatorRevisionCommit,
EvaluatorRevisionQuery,
)
from oss.src.core.evaluators.utils import build_evaluator_data
from oss.src.core.shared.dtos import Reference
from oss.src.utils.logging import get_module_logger

Expand Down Expand Up @@ -774,6 +775,83 @@ def __init__(
):
self.evaluators_service = evaluators_service

@staticmethod
def _extract_builtin_evaluator_key(
simple_evaluator_data: Optional[SimpleEvaluatorData],
) -> Optional[str]:
uri = simple_evaluator_data.uri if simple_evaluator_data else None

if not uri:
return None

parts = uri.split(":")

if len(parts) < 4:
return None

if parts[0] != "agenta" or parts[1] != "builtin":
return None

return parts[2] or None

@staticmethod
def _has_outputs_schema(
simple_evaluator_data: Optional[SimpleEvaluatorData],
) -> bool:
if not simple_evaluator_data or not isinstance(
simple_evaluator_data.schemas, dict
):
return False

return bool(simple_evaluator_data.schemas.get("outputs"))

def _ensure_builtin_evaluator_data(
    self,
    simple_evaluator_data: Optional[SimpleEvaluatorData],
) -> Optional[SimpleEvaluatorData]:
    """Backfill builtin-evaluator data, notably ``schemas.outputs``.

    When *simple_evaluator_data* points at a builtin evaluator
    (``agenta:builtin:<key>:...``) but does not yet declare an outputs
    schema, hydrate the canonical data for that evaluator key and layer
    the caller-provided fields on top. Non-builtin data — or data that
    already has an outputs schema — is returned untouched.

    Args:
        simple_evaluator_data: evaluator data supplied by the caller;
            may be ``None``.

    Returns:
        The (possibly hydrated) evaluator data, or the input unchanged
        when no hydration applies.
    """
    evaluator_key = self._extract_builtin_evaluator_key(simple_evaluator_data)

    if not evaluator_key:
        return simple_evaluator_data

    if self._has_outputs_schema(simple_evaluator_data):
        return simple_evaluator_data

    settings_values = (
        simple_evaluator_data.parameters
        if simple_evaluator_data
        and isinstance(simple_evaluator_data.parameters, dict)
        else None
    )

    hydrated_data = build_evaluator_data(
        evaluator_key=evaluator_key,
        settings_values=settings_values,
    )

    hydrated_data_dict = hydrated_data.model_dump(
        mode="json",
        exclude_none=True,
        exclude_unset=True,
    )

    existing_data_dict = (
        simple_evaluator_data.model_dump(
            mode="json",
            exclude_none=True,
            exclude_unset=True,
        )
        if simple_evaluator_data
        else {}
    )

    # Caller-provided fields take precedence over hydrated defaults ...
    merged_data_dict = {
        **hydrated_data_dict,
        **existing_data_dict,
    }

    # ... but "schemas" must be merged one level deeper: a shallow merge
    # would let an existing schemas dict WITHOUT an "outputs" key (e.g.
    # {} or {"inputs": ...}) replace the hydrated schemas wholesale and
    # silently drop the outputs schema this method exists to inject.
    hydrated_schemas = hydrated_data_dict.get("schemas")
    existing_schemas = existing_data_dict.get("schemas")
    if isinstance(hydrated_schemas, dict) and isinstance(existing_schemas, dict):
        merged_data_dict["schemas"] = {
            **hydrated_schemas,
            **existing_schemas,
        }

    return SimpleEvaluatorData(**merged_data_dict)
Comment on lines +848 to +853
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Shallow merge in _ensure_builtin_evaluator_data can silently discard hydrated output schemas

When the existing simple_evaluator_data has a schemas field set to a value that does not contain an outputs key (e.g., schemas: {} or schemas: {"inputs": {...}}), the shallow dict merge {**hydrated_data_dict, **existing_data_dict} completely replaces the hydrated schemas (which contains the computed outputs) with the existing incomplete schemas, thereby losing the output schema the function was supposed to inject.

Root Cause & Impact

The function's purpose is to fill in missing output schemas for builtin evaluators. At api/oss/src/core/evaluators/service.py:848-853, the merge is:

return SimpleEvaluatorData(
    **{
        **hydrated_data_dict,   # has schemas.outputs
        **existing_data_dict,   # may have schemas WITHOUT outputs
    }
)

Because Python dict unpacking is a shallow merge, the entire schemas key from existing_data_dict replaces the one from hydrated_data_dict. For example:

  • hydrated_data_dict["schemas"] = {"outputs": {"type": "object", ...}}
  • existing_data_dict["schemas"] = {} (or {"inputs": {...}})
  • Result: schemas = {} — the outputs schema is lost

The _has_outputs_schema guard at line 817 only checks if outputs already exist and returns early. It does not prevent the merge from discarding the hydrated outputs when schemas exists but lacks the outputs key.

Impact: Evaluator revisions can be persisted without the expected data.schemas.outputs, which may break downstream consumers that rely on knowing the evaluator's output shape (e.g., workflow invocation, result rendering).

Prompt for agents
In api/oss/src/core/evaluators/service.py, the _ensure_builtin_evaluator_data method (around line 848-853) performs a shallow dict merge that can lose the hydrated schemas.outputs. Replace the shallow merge with a deep merge that preserves nested dict keys. Specifically, when both hydrated_data_dict and existing_data_dict have a 'schemas' key, the merge should combine them so that existing schemas keys take precedence but missing keys (like 'outputs') are filled in from the hydrated dict. For example:

merged = {**hydrated_data_dict, **existing_data_dict}
if 'schemas' in hydrated_data_dict and 'schemas' in existing_data_dict:
    merged['schemas'] = {**hydrated_data_dict['schemas'], **existing_data_dict['schemas']}

Then construct SimpleEvaluatorData(**merged).
Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.


# public -------------------------------------------------------------------

async def create(
Expand Down Expand Up @@ -864,6 +942,10 @@ async def create(

evaluator_revision_slug = uuid4().hex[-12:]

hydrated_simple_evaluator_data = self._ensure_builtin_evaluator_data(
simple_evaluator_create.data,
)

evaluator_revision_commit = EvaluatorRevisionCommit(
slug=evaluator_revision_slug,
#
Expand Down Expand Up @@ -905,7 +987,7 @@ async def create(
tags=evaluator_create.tags,
meta=evaluator_create.meta,
#
data=simple_evaluator_create.data,
data=hydrated_simple_evaluator_data,
#
evaluator_id=evaluator.id,
evaluator_variant_id=evaluator_variant.id,
Expand Down Expand Up @@ -1150,6 +1232,10 @@ async def edit(

evaluator_revision_slug = uuid4().hex[-12:]

hydrated_simple_evaluator_data = self._ensure_builtin_evaluator_data(
simple_evaluator_edit.data,
)

evaluator_revision_commit = EvaluatorRevisionCommit(
slug=evaluator_revision_slug,
#
Expand All @@ -1160,7 +1246,7 @@ async def edit(
tags=evaluator_edit.tags,
meta=evaluator_edit.meta,
#
data=simple_evaluator_edit.data,
data=hydrated_simple_evaluator_data,
#
evaluator_id=evaluator.id,
evaluator_variant_id=evaluator_variant.id,
Expand Down
1 change: 1 addition & 0 deletions api/oss/src/models/api/evaluation_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class LegacyEvaluator(BaseModel):
direct_use: bool
settings_presets: Optional[list[dict]] = None
settings_template: dict
outputs_schema: Optional[Dict[str, Any]] = None
description: Optional[str] = None
oss: Optional[bool] = False
requires_llm_api_keys: Optional[bool] = False
Expand Down
73 changes: 73 additions & 0 deletions api/oss/src/resources/evaluators/evaluators.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from copy import deepcopy


rag_evaluator_settings_template = {
"question_key": {
"label": "Question Key",
Expand Down Expand Up @@ -832,6 +835,76 @@
]


# JSON Schema (draft 2020-12) for builtin evaluators that report only a
# boolean pass/fail result.
_SUCCESS_ONLY_OUTPUT_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "type": "object",
    "properties": {
        "success": {"type": "boolean"},
    },
    "required": ["success"],
    "additionalProperties": False,
}

# JSON Schema for builtin evaluators that may report a numeric score and/or
# a boolean success flag; "required" is deliberately empty — either field
# may be absent from a given result.
_SCORE_AND_SUCCESS_OUTPUT_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "type": "object",
    "properties": {
        "score": {"type": "number"},
        "success": {"type": "boolean"},
    },
    "required": [],
    "additionalProperties": False,
}

# Static mapping: builtin evaluator key -> the output schema it emits.
# Evaluators absent from this mapping (besides auto_ai_critique, which is
# handled dynamically below) get no "outputs_schema" injected.
_FIXED_OUTPUT_SCHEMA_BY_KEY = {
    "auto_custom_code_run": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
    "field_match_test": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_json_diff": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
    "auto_semantic_similarity": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
    "auto_webhook_test": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
    "auto_exact_match": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_contains_json": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_similarity_match": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
    "auto_regex_test": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_starts_with": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_ends_with": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_contains": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_contains_any": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_contains_all": _SUCCESS_ONLY_OUTPUT_SCHEMA,
    "auto_levenshtein_distance": _SCORE_AND_SUCCESS_OUTPUT_SCHEMA,
}


def _extract_auto_ai_critique_default_outputs_schema():
    """Return a copy of the default JSON schema configured on the
    ``auto_ai_critique`` evaluator template, or ``None`` when the template
    is missing or carries no dict-valued default schema.
    """
    template = next(
        (entry for entry in evaluators if entry.get("key") == "auto_ai_critique"),
        None,
    )
    if template is None:
        return None

    settings_template = template.get("settings_template") or {}
    json_schema_field = settings_template.get("json_schema") or {}
    default_value = json_schema_field.get("default") or {}
    schema = default_value.get("schema")

    # deepcopy so later mutations of the template cannot leak into callers.
    return deepcopy(schema) if isinstance(schema, dict) else None


# auto_ai_critique's output shape is user-configurable, so its outputs
# schema comes from the template's default json_schema rather than a fixed
# constant above; register it only when a default schema actually exists.
_auto_ai_critique_outputs_schema = _extract_auto_ai_critique_default_outputs_schema()
if _auto_ai_critique_outputs_schema is not None:
    _FIXED_OUTPUT_SCHEMA_BY_KEY["auto_ai_critique"] = _auto_ai_critique_outputs_schema

# At import time, inject the known output schema into each evaluator
# template entry. deepcopy prevents all templates mapped to the same
# schema constant from sharing (and mutating) one dict instance.
for evaluator in evaluators:
    evaluator_key = evaluator.get("key")
    outputs_schema = (
        _FIXED_OUTPUT_SCHEMA_BY_KEY.get(evaluator_key)
        if isinstance(evaluator_key, str)
        else None
    )
    if outputs_schema is not None:
        evaluator["outputs_schema"] = deepcopy(outputs_schema)


def get_all_evaluators():
"""
Returns a list of evaluators
Expand Down
80 changes: 42 additions & 38 deletions docs/design/migrate-evaluator-playground/current-system.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,27 +90,26 @@ evaluatorByKeyAtomFamily // Find evaluator by key
#### Evaluators Service (`/web/oss/src/services/evaluators/index.ts`)

```typescript
// Evaluator Templates (legacy)
// Evaluator Templates
fetchAllEvaluators() // GET /evaluators

// Evaluator Configs (legacy)
fetchAllEvaluatorConfigs() // GET /evaluators/configs
createEvaluatorConfig() // POST /evaluators/configs
updateEvaluatorConfig() // PUT /evaluators/configs/{id}
deleteEvaluatorConfig() // DELETE /evaluators/configs/{id}
// Evaluator Configs
fetchAllEvaluatorConfigs() // POST /preview/simple/evaluators/query
createEvaluatorConfig() // POST /preview/simple/evaluators/
updateEvaluatorConfig() // PUT /preview/simple/evaluators/{id}
deleteEvaluatorConfig() // POST /preview/simple/evaluators/{id}/archive

// Custom/Human Evaluators (new)
// Custom/Human Evaluators
createEvaluator() // POST /preview/simple/evaluators/
updateEvaluator() // PUT /preview/simple/evaluators/{id}
fetchEvaluatorById() // GET /preview/simple/evaluators/{id}
deleteHumanEvaluator() // POST /preview/simple/evaluators/{id}/archive
```

#### Evaluator Run Service (`/web/oss/src/services/evaluations/api_ee/index.ts`)
#### Evaluator Run Service (`/web/oss/src/services/workflows/invoke.ts`)

```typescript
createEvaluatorDataMapping() // POST /evaluators/map
createEvaluatorRunExecution() // POST /evaluators/{key}/run
invokeEvaluator() // POST /preview/workflows/invoke
```

## Data Flow
Expand All @@ -130,7 +129,7 @@ createEvaluatorRunExecution() // POST /evaluators/{key}/run
│ /evaluators → EvaluatorsRegistry │
│ ├─ Uses useEvaluatorsRegistryData() hook │
│ │ ├─ Calls fetchAllEvaluators() → GET /evaluators │
│ │ └─ Calls fetchAllEvaluatorConfigs() → GET /evaluators/configs
│ │ └─ Calls fetchAllEvaluatorConfigs() → POST /preview/simple/evaluators/query
│ │ │
│ ├─ "Create new" → SelectEvaluatorModal → /evaluators/configure/new │
│ └─ Click row → /evaluators/configure/{id} │
Expand All @@ -153,71 +152,76 @@ createEvaluatorRunExecution() // POST /evaluators/{key}/run
│ └─────────────────────────────┘ └─────────────────────────────┘ │
│ │
│ Commit Actions: │
│ - Create: POST /evaluators/configs → createEvaluatorConfig()
│ - Update: PUT /evaluators/configs/{id} → updateEvaluatorConfig()
│ - Create: POST /preview/simple/evaluators → createEvaluatorConfig() │
│ - Update: PUT /preview/simple/evaluators/{id} → updateEvaluatorConfig() │
│ │
│ Test Actions: │
│ - Run Variant: callVariant() → POST to variant URL │
│ - Run Evaluator: createEvaluatorRunExecution()
│ → POST /evaluators/{key}/run
│ - Run Evaluator: invokeEvaluator()
│ → POST /preview/workflows/invoke
└─────────────────────────────────────────────────────────────────────────────┘
```

## Current API Endpoints Used

### Legacy Endpoints (to be migrated)
### Evaluator Templates

| Endpoint | Method | Frontend Function | Purpose |
|----------|--------|-------------------|---------|
| `/evaluators/` | GET | `fetchAllEvaluators()` | List evaluator templates |
| `/evaluators/configs/` | GET | `fetchAllEvaluatorConfigs()` | List evaluator configs |
| `/evaluators/configs/` | POST | `createEvaluatorConfig()` | Create new config |
| `/evaluators/configs/{id}/` | PUT | `updateEvaluatorConfig()` | Update existing config |
| `/evaluators/configs/{id}/` | DELETE | `deleteEvaluatorConfig()` | Delete config |

### Endpoints That Remain Unchanged
### Evaluator CRUD

| Endpoint | Method | Frontend Function | Purpose |
|----------|--------|-------------------|---------|
| `/evaluators/map/` | POST | `createEvaluatorDataMapping()` | Map trace data for RAG evaluators |
| `/evaluators/{key}/run/` | POST | `createEvaluatorRunExecution()` | Run evaluator (test) |
| `/preview/simple/evaluators/query` | POST | `fetchAllEvaluatorConfigs()` | List evaluator configs |
| `/preview/simple/evaluators/` | POST | `createEvaluatorConfig()` | Create evaluator config |
| `/preview/simple/evaluators/{id}` | PUT | `updateEvaluatorConfig()` | Update evaluator config |
| `/preview/simple/evaluators/{id}/archive` | POST | `deleteEvaluatorConfig()` | Archive evaluator config |

### Already Using New Endpoints (for custom evaluators)
### Evaluator Run (Playground)

| Endpoint | Method | Frontend Function | Purpose |
|----------|--------|-------------------|---------|
| `/preview/simple/evaluators/` | POST | `createEvaluator()` | Create custom evaluator |
| `/preview/simple/evaluators/{id}` | PUT | `updateEvaluator()` | Update custom evaluator |
| `/preview/simple/evaluators/{id}` | GET | `fetchEvaluatorById()` | Fetch evaluator by ID |
| `/preview/simple/evaluators/{id}/archive` | POST | `deleteHumanEvaluator()` | Archive human evaluator |
| `/preview/workflows/invoke` | POST | `invokeEvaluator()` | Run evaluator using workflow invocation |

## Data Types

### Current EvaluatorConfig (Legacy)
### Current Evaluator Config

```typescript
interface EvaluatorConfig {
interface SimpleEvaluator {
id: string
evaluator_key: string
name: string
settings_values: Record<string, any>
slug: string
name?: string
description?: string
tags?: string[]
flags?: {
is_custom?: boolean
is_evaluator?: boolean
is_human?: boolean
}
data?: {
uri?: string
parameters?: Record<string, any>
schemas?: {
outputs?: Record<string, any>
}
}
created_at: string
updated_at: string
color?: string
tags?: string[]
// Frontend additions
icon_url?: string | StaticImageData
}
```

### Current Evaluator Template (Legacy)
### Current Evaluator Template

```typescript
interface Evaluator {
name: string
key: string
settings_presets?: SettingsPreset[]
settings_template: Record<string, EvaluationSettingsTemplate>
outputs_schema?: Record<string, any>
icon_url?: string | StaticImageData
color?: string
direct_use?: boolean
Expand Down
Loading